diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index c0d1934a703b66a8ab8a1eab0c1d0680d73b9e17..e0e179a0860e1d36f3b193c10f7bc414a3b57b08 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -125,3 +125,10 @@ if(NOT WIN32) cc_library(mmap_allocator SRCS mmap_allocator.cc DEPS allocator) cc_test(mmap_allocator_test SRCS mmap_allocator_test.cc DEPS mmap_allocator allocator) endif(NOT WIN32) + +if(WITH_GPU AND WITH_TESTING) + nv_test(base_ptr_test SRCS base_ptr_test.cu DEPS malloc gpu_info) + set_tests_properties(base_ptr_test PROPERTIES + ENVIRONMENT "FLAGS_allocator_strategy=auto_growth; + FLAGS_use_stream_safe_cuda_allocator=true;") +endif() diff --git a/paddle/fluid/memory/allocation/aligned_allocator.cc b/paddle/fluid/memory/allocation/aligned_allocator.cc index f0b7f1a4b0d9e704cbe3595eb14d42e68bb4006b..10380c0d6028d57422e17a7c1dff7845ad0390f1 100644 --- a/paddle/fluid/memory/allocation/aligned_allocator.cc +++ b/paddle/fluid/memory/allocation/aligned_allocator.cc @@ -26,6 +26,7 @@ class AlignedAllocation : public Allocation { AlignedAllocation(AllocationPtr underlying_allocation, size_t offset) : Allocation( reinterpret_cast<uint8_t*>(underlying_allocation->ptr()) + offset, + underlying_allocation->base_ptr(), underlying_allocation->size() - offset, underlying_allocation->place()), underlying_allocation_(std::move(underlying_allocation)) {} diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index b11c657b96b74cc078f81bcd6201218a5045f340..ee802462ddc943244fc9cbdbcd7cb8cdd52f8e47 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -23,6 +23,8 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" +DECLARE_string(allocator_strategy); + namespace paddle { namespace memory { namespace allocation { @@ -84,7 +86,10 @@ class
Allocator; class Allocation { public: inline Allocation(void* ptr, size_t size, platform::Place place) - : ptr_(ptr), size_(size), place_(place) {} + : ptr_(ptr), base_ptr_(ptr), size_(size), place_(place) {} + inline Allocation(void* ptr, void* base_ptr, size_t size, + platform::Place place) + : ptr_(ptr), base_ptr_(base_ptr), size_(size), place_(place) {} Allocation(const Allocation& o) = delete; Allocation& operator=(const Allocation& o) = delete; @@ -98,6 +103,15 @@ class Allocation { // method like `defragmentation` to change `ptr_`. inline void* ptr() const { return ptr_; } + inline void* base_ptr() const { + PADDLE_ENFORCE_EQ(FLAGS_allocator_strategy, "auto_growth", + paddle::platform::errors::Unimplemented( + "base_ptr() is only implemented for auto_growth " + "strategy, not support %s strategy", + FLAGS_allocator_strategy)); + return base_ptr_; + } + // Returns the size of this memory buffer, i.e., ptr() + size() - 1 is the // last valid element. // @@ -126,6 +140,7 @@ class Allocation { private: void* ptr_; + void* base_ptr_; // the point that directly requested from system size_t size_; platform::Place place_; diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index a2d198aba032308d2f5233e71b832f6515b54f6b..44cd915b168153343d9d1e3a7896cf6d2309b978 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -90,9 +90,9 @@ class CUDAGraphAllocator public: PrivateAllocation(CUDAGraphAllocator* allocator, AllocationPtr underlying_allocation) - : Allocation(underlying_allocation->ptr(), - underlying_allocation->size(), - underlying_allocation->place()), + : Allocation( + underlying_allocation->ptr(), underlying_allocation->base_ptr(), + underlying_allocation->size(), underlying_allocation->place()), allocator_(allocator->shared_from_this()), underlying_allocation_(std::move(underlying_allocation)) {} diff --git 
a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h index d1fa6cce0164f6bc6a4fc330bc0d25bdd087fd5a..2334a1b6d4d55285f49a08938d8625b818dddcc8 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h @@ -73,7 +73,8 @@ class AutoGrowthBestFitAllocator : public Allocator { struct BlockAllocation : public Allocation { explicit BlockAllocation(const List<Block>::iterator &it) - : Allocation(it->ptr_, it->size_, it->chunk_->allocation_->place()), + : Allocation(it->ptr_, it->chunk_->allocation_->base_ptr(), it->size_, + it->chunk_->allocation_->place()), block_it_(it) {} List<Block>::iterator block_it_; diff --git a/paddle/fluid/memory/allocation/base_ptr_test.cu b/paddle/fluid/memory/allocation/base_ptr_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..1b284c9899dbb021cb1fe98ccf3d8d15ff452646 --- /dev/null +++ b/paddle/fluid/memory/allocation/base_ptr_test.cu @@ -0,0 +1,118 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.
+ +#include <random> +#include "gtest/gtest.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/platform/device/gpu/gpu_info.h" + +namespace paddle { +namespace memory { +namespace allocation { + +class CUDAAllocatoionBasePtrTest : public ::testing::Test { + public: + void SetUp() override { + place_ = platform::CUDAPlace(); + alloc_times_ = 100; + batch_size_ = 10; + max_alloc_size_ = platform::GpuMaxAllocSize() / alloc_times_; + random_engine_ = std::default_random_engine(time(NULL)); + dis_ = std::uniform_int_distribution<size_t>(0, max_alloc_size_); + } + + void OneByOneAllocTest() { + for (size_t i = 0; i < alloc_times_; ++i) { + size_t size = dis_(random_engine_); + std::shared_ptr<Allocation> allocation = AllocShared(place_, size); + + void* base_ptr = allocation->base_ptr(); + void* system_ptr = + platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); + EXPECT_EQ(base_ptr, system_ptr); + } + + Release(place_); + } + + void BatchByBatchAllocTest() { + std::vector<std::shared_ptr<Allocation>> allocations; + allocations.reserve(batch_size_); + size_t batch_num = alloc_times_ / batch_size_; + + for (size_t i = 0; i < batch_num; ++i) { + for (size_t j = 0; j < batch_size_; ++j) { + size_t size = dis_(random_engine_); + std::shared_ptr<Allocation> allocation = AllocShared(place_, size); + + void* base_ptr = allocation->base_ptr(); + void* system_ptr = + platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); + EXPECT_EQ(base_ptr, system_ptr); + + allocations.emplace_back(allocation); + } + allocations.clear(); + } + + Release(place_); + } + + void ContinuousAllocTest() { + std::vector<std::shared_ptr<Allocation>> allocations; + allocations.reserve(alloc_times_); + + for (size_t i = 0; i < alloc_times_; ++i) { + size_t size = dis_(random_engine_); + std::shared_ptr<Allocation> allocation = AllocShared(place_, size); + + void* base_ptr = allocation->base_ptr(); + void* system_ptr = + platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); + EXPECT_EQ(base_ptr, system_ptr); + + allocations.emplace_back(allocation); + } + + 
allocations.clear(); + Release(place_); + } + + void ZeroSizeAllocTest() { + std::shared_ptr<Allocation> allocation = AllocShared(place_, 0); + void* base_ptr = allocation->base_ptr(); + void* system_ptr = + platform::GetGpuBasePtr(allocation->ptr(), place_.GetDeviceId()); + EXPECT_EQ(base_ptr, system_ptr); + } + + private: + platform::CUDAPlace place_; + size_t max_alloc_size_; + size_t alloc_times_; + size_t batch_size_; + std::default_random_engine random_engine_; + std::uniform_int_distribution<size_t> dis_; +}; + +TEST_F(CUDAAllocatoionBasePtrTest, base_ptr_test) { + OneByOneAllocTest(); + BatchByBatchAllocTest(); + ContinuousAllocTest(); + ZeroSizeAllocTest(); +} + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h index 9e04fd3f0619e3c7a06fc4453ee3f2f0b28a786e..33cf2fe05424778b88eae135f582d3d39405e55a 100644 --- a/paddle/fluid/memory/allocation/cuda_device_context_allocator.h +++ b/paddle/fluid/memory/allocation/cuda_device_context_allocator.h @@ -42,7 +42,8 @@ namespace allocation { class CUDADeviceContextAllocation : public Allocation { public: explicit CUDADeviceContextAllocation(AllocationPtr allocation) - : Allocation(allocation->ptr(), allocation->size(), allocation->place()), + : Allocation(allocation->ptr(), allocation->base_ptr(), + allocation->size(), allocation->place()), underlying_allocation_(std::move(allocation)) {} ~CUDADeviceContextAllocation() { diff --git a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc index 7b6b61d7a60ca8ab68388820811f7c684f65cc95..0d0318859c6262875fcea5b08ba60c7a65233b8f 100644 --- a/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc +++ b/paddle/fluid/memory/allocation/stream_safe_cuda_allocator.cc @@ -20,8 +20,9 @@ namespace allocation { StreamSafeCUDAAllocation::StreamSafeCUDAAllocation(
AllocationPtr underlying_allocation, gpuStream_t owning_stream) - : Allocation(underlying_allocation->ptr(), underlying_allocation->size(), - underlying_allocation->place()), + : Allocation(underlying_allocation->ptr(), + underlying_allocation->base_ptr(), + underlying_allocation->size(), underlying_allocation->place()), underlying_allocation_(std::move(underlying_allocation)), owning_stream_(std::move(owning_stream)) {} diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc index e68277cc37b381a80e33860543c241f5e137c5be..e09d07a6e39634169fb99aca087ddaf025b18c40 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.cc +++ b/paddle/fluid/platform/device/gpu/gpu_info.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include <cstdlib> #include <mutex> +#include <set> #include <vector> #include "gflags/gflags.h" @@ -197,6 +198,11 @@ class RecordedGpuMallocHelper { if (result == gpuSuccess) { cur_size_.fetch_add(size); STAT_INT_ADD("STAT_gpu" + std::to_string(dev_id_) + "_mem_size", size); + +#ifdef PADDLE_WITH_TESTING + gpu_ptrs.insert(*ptr); +#endif + return gpuSuccess; } else { RaiseNonOutOfMemoryError(&result); @@ -233,7 +239,22 @@ // cudaErrorCudartUnloading / // hipErrorDeinitialized } +#ifdef PADDLE_WITH_TESTING + gpu_ptrs.erase(ptr); +#endif + } + +#ifdef PADDLE_WITH_TESTING + void *GetBasePtr(void *ptr) { + auto it = gpu_ptrs.upper_bound(ptr); + + if (it == gpu_ptrs.begin()) { + return nullptr; + } + + return *(--it); } +#endif bool GetMemInfo(size_t *avail, size_t *total, size_t *actual_avail, size_t *actual_total) { @@ -301,7 +322,9 @@ class RecordedGpuMallocHelper { static std::once_flag once_flag_; static std::vector<std::unique_ptr<RecordedGpuMallocHelper>> instances_; -}; // NOLINT + + std::set<void *> gpu_ptrs; // just for testing +}; // NOLINT std::once_flag RecordedGpuMallocHelper::once_flag_; std::vector<std::unique_ptr<RecordedGpuMallocHelper>> RecordedGpuMallocHelper::instances_; @@ -352,5 +375,11 @@ void EmptyCache(void) { } } +#ifdef PADDLE_WITH_TESTING +void
*GetGpuBasePtr(void *ptr, int dev_id) { + return RecordedGpuMallocHelper::Instance(dev_id)->GetBasePtr(ptr); +} +#endif + } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device/gpu/gpu_info.h b/paddle/fluid/platform/device/gpu/gpu_info.h index 18e6ac83295f89108ea21555a35d2d7f66b3f3dd..9bc4d70bc457d009b771089f4b2acc4fcbc0ea07 100644 --- a/paddle/fluid/platform/device/gpu/gpu_info.h +++ b/paddle/fluid/platform/device/gpu/gpu_info.h @@ -145,6 +145,11 @@ bool IsGpuMallocRecorded(int dev_id); //! Empty idle cached memory held by the allocator. void EmptyCache(void); +//! Get the primitive pointer return from cudaMalloc, just for testing +#ifdef PADDLE_WITH_TESTING +void *GetGpuBasePtr(void *ptr, int dev_id); +#endif + } // namespace platform } // namespace paddle