diff --git a/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h b/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h
deleted file mode 100644
index 5d61d65be2d129c293c63a4ee62d6e26bd9201be..0000000000000000000000000000000000000000
--- a/paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h
+++ /dev/null
@@ -1,177 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include <functional>
-#include <memory>
-#include <mutex>
-#include <shared_mutex>
-#include <thread>
-#include <type_traits>
-#include <unordered_map>
-
-namespace paddle {
-namespace framework {
-
-template <typename T>
-class ThreadDataRegistry {
- public:
-  // Singleton
-  static ThreadDataRegistry& GetInstance() {
-    static ThreadDataRegistry instance;
-    return instance;
-  }
-
-  T* GetMutableCurrentThreadData() { return &CurrentThreadData(); }
-
-  const T& GetCurrentThreadData() { return CurrentThreadData(); }
-
-  template <typename Alias = T,
-            typename = std::enable_if_t<std::is_copy_assignable<Alias>::value>>
-  void SetCurrentThreadData(const T& val) {
-    CurrentThreadData() = val;
-  }
-
-  // Returns current snapshot of all threads. Make sure there is no thread
-  // create/destory when using it.
-  template <
-      typename Alias = T,
-      typename = std::enable_if_t<std::is_copy_constructible<Alias>::value>>
-  std::unordered_map<uint64_t, T> GetAllThreadDataByValue() {
-    return impl_->GetAllThreadDataByValue();
-  }
-
-  // Returns current snapshot of all threads. Make sure there is no thread
-  // create/destory when using it.
-  std::unordered_map<uint64_t, std::reference_wrapper<T>>
-  GetAllThreadDataByRef() {
-    return impl_->GetAllThreadDataByRef();
-  }
-
- private:
-// types
-// Lock types
-#if defined(__clang__) || defined(__GNUC__)  // CLANG or GCC
-#ifndef __APPLE__
-#if __cplusplus >= 201703L
-  using LockType = std::shared_mutex;
-  using SharedLockGuardType = std::shared_lock<std::shared_mutex>;
-#elif __cplusplus >= 201402L
-  using LockType = std::shared_timed_mutex;
-  using SharedLockGuardType = std::shared_lock<std::shared_timed_mutex>;
-#else
-  using LockType = std::mutex;
-  using SharedLockGuardType = std::lock_guard<std::mutex>;
-#endif
-// Special case : mac. https://github.com/facebook/react-native/issues/31250
-#else
-  using LockType = std::mutex;
-  using SharedLockGuardType = std::lock_guard<std::mutex>;
-#endif
-#elif defined(_MSC_VER)  // MSVC
-#if _MSVC_LANG >= 201703L
-  using LockType = std::shared_mutex;
-  using SharedLockGuardType = std::shared_lock<std::shared_mutex>;
-#elif _MSVC_LANG >= 201402L
-  using LockType = std::shared_timed_mutex;
-  using SharedLockGuardType = std::shared_lock<std::shared_timed_mutex>;
-#else
-  using LockType = std::mutex;
-  using SharedLockGuardType = std::lock_guard<std::mutex>;
-#endif
-#else  // other compilers
-  using LockType = std::mutex;
-  using SharedLockGuardType = std::lock_guard<std::mutex>;
-#endif
-
-  class ThreadDataHolder;
-  class ThreadDataRegistryImpl {
-   public:
-    void RegisterData(uint64_t tid, ThreadDataHolder* tls_obj) {
-      std::lock_guard<LockType> guard(lock_);
-      tid_map_[tid] = tls_obj;
-    }
-
-    void UnregisterData(uint64_t tid) {
-      std::lock_guard<LockType> guard(lock_);
-      tid_map_.erase(tid);
-    }
-
-    template <
-        typename Alias = T,
-        typename = std::enable_if_t<std::is_copy_constructible<Alias>::value>>
-    std::unordered_map<uint64_t, T> GetAllThreadDataByValue() {
-      std::unordered_map<uint64_t, T> data_copy;
-      SharedLockGuardType guard(lock_);
-      data_copy.reserve(tid_map_.size());
-      for (auto& kv : tid_map_) {
-        data_copy.emplace(kv.first, kv.second->GetData());
-      }
-      return data_copy;
-    }
-
-    std::unordered_map<uint64_t, std::reference_wrapper<T>>
-    GetAllThreadDataByRef() {
-      std::unordered_map<uint64_t, std::reference_wrapper<T>> data_ref;
-      SharedLockGuardType guard(lock_);
-      data_ref.reserve(tid_map_.size());
-      for (auto& kv : tid_map_) {
-        data_ref.emplace(kv.first, std::ref(kv.second->GetData()));
-      }
-      return data_ref;
-    }
-
-   private:
-    LockType lock_;
-    std::unordered_map<uint64_t, ThreadDataHolder*> tid_map_;  // not owned
-  };
-
-  class ThreadDataHolder {
-   public:
-    explicit ThreadDataHolder(
-        std::shared_ptr<ThreadDataRegistryImpl> registry) {
-      registry_ = std::move(registry);
-      tid_ = std::hash<std::thread::id>()(std::this_thread::get_id());
-      registry_->RegisterData(tid_, this);
-    }
-
-    ~ThreadDataHolder() { registry_->UnregisterData(tid_); }
-
-    T& GetData() { return data_; }
-
-   private:
-    std::shared_ptr<ThreadDataRegistryImpl> registry_;
-    uint64_t tid_;
-    T data_;
-  };
-
-  // methods
-  ThreadDataRegistry() { impl_ = std::make_shared<ThreadDataRegistryImpl>(); }
-
-  ThreadDataRegistry(const ThreadDataRegistry&) = delete;
-
-  ThreadDataRegistry& operator=(const ThreadDataRegistry&) = delete;
-
-  T& CurrentThreadData() {
-    static thread_local ThreadDataHolder thread_data(impl_);
-    return thread_data.GetData();
-  }
-
-  // data
-  std::shared_ptr<ThreadDataRegistryImpl> impl_;
-};
-
-}  // namespace framework
-}  // namespace paddle
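The header deleted above resurfaces at `paddle/phi/common/thread_data_registry.h` (see the `stats.h` hunk below). The design: a singleton hands each thread a `thread_local` holder that registers itself in a shared map under an exclusive lock, and snapshot queries walk that map under a shared lock. A minimal usage sketch, assuming the relocated `phi::ThreadDataRegistry` keeps the interface shown above; `Counter` and both helper functions are illustrative:

```cpp
// Minimal usage sketch; assumes phi::ThreadDataRegistry keeps the interface
// of the deleted header. "Counter" is an illustrative per-thread payload.
#include <cstdint>

#include "paddle/phi/common/thread_data_registry.h"

struct Counter {
  int64_t value{0};
};

void RecordEvent() {
  // Lazily creates this thread's Counter and registers it in the impl map.
  auto& registry = phi::ThreadDataRegistry<Counter>::GetInstance();
  registry.GetMutableCurrentThreadData()->value += 1;
}

int64_t TotalEvents() {
  // Snapshot of every live thread's data, keyed by hashed thread id.
  // Only safe while no thread is being created or destroyed.
  int64_t total = 0;
  auto snapshot =
      phi::ThreadDataRegistry<Counter>::GetInstance().GetAllThreadDataByValue();
  for (auto& kv : snapshot) total += kv.second.value;
  return total;
}
```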
diff --git a/paddle/fluid/memory/stats.h b/paddle/fluid/memory/stats.h
index 2399e41d93ade5f9467a227f70253f68d8b1cd0d..d8cb7b812ad27cc35d3eba9787d1826798b617b1 100644
--- a/paddle/fluid/memory/stats.h
+++ b/paddle/fluid/memory/stats.h
@@ -18,15 +18,15 @@ limitations under the License. */
 #include 
 #include 
 
-#include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/errors.h"
 #include "paddle/fluid/platform/macros.h"
+#include "paddle/phi/common/thread_data_registry.h"
 
 namespace paddle {
 namespace memory {
 
-using framework::ThreadDataRegistry;
+using phi::ThreadDataRegistry;
 
 struct ThreadLocalStatBase {
   int64_t current{0};
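This hunk only re-points `stats.h` at the relocated header and swaps the alias from `framework::ThreadDataRegistry` to `phi::ThreadDataRegistry`; the stats machinery itself is untouched. For illustration, a sketch of how a per-thread stat can be aggregated through that alias; only the `current` field is visible above, and the summing helper is modeled on, not copied from, the real `Stat` code:

```cpp
// Illustrative sketch of aggregating a per-thread stat via the relocated
// registry. Only the `current` field of ThreadLocalStatBase is visible in
// the hunk above; the stand-in struct and helper below are not stats.h code.
#include <cstdint>

#include "paddle/phi/common/thread_data_registry.h"

struct ThreadLocalStatBase {
  int64_t current{0};
};

int64_t CurrentValueAcrossThreads() {
  auto snapshot = phi::ThreadDataRegistry<ThreadLocalStatBase>::GetInstance()
                      .GetAllThreadDataByValue();
  int64_t sum = 0;
  for (auto& kv : snapshot) {
    sum += kv.second.current;
  }
  return sum;
}
```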
diff --git a/paddle/fluid/platform/device/gpu/gpu_info.cc b/paddle/fluid/platform/device/gpu/gpu_info.cc
index 8023403df078d4878eec6687825c42ad5cb17d6a..3373ef51836eaf988b2dd4272564c713ea2931b6 100644
--- a/paddle/fluid/platform/device/gpu/gpu_info.cc
+++ b/paddle/fluid/platform/device/gpu/gpu_info.cc
@@ -61,8 +61,6 @@ PADDLE_DEFINE_EXPORTED_bool(enable_gpu_memory_usage_log_mb,
                             "Whether to print the message of gpu memory usage "
                             "MB as a unit of measurement.");
 
-constexpr static float fraction_reserve_gpu_memory = 0.05f;
-
 USE_GPU_MEM_STAT;
 namespace paddle {
 namespace platform {
@@ -77,20 +75,7 @@ void GpuMemoryUsage(size_t *available, size_t *total) {
 }
 
 size_t GpuAvailableMemToAlloc() {
-  size_t total = 0;
-  size_t available = 0;
-  GpuMemoryUsage(&available, &total);
-  size_t reserving =
-      static_cast<size_t>(fraction_reserve_gpu_memory * available);
-  // If available size is less than minimum chunk size, no usable memory exists
-  size_t available_to_alloc = available - reserving;
-  size_t min_chunk_size = GpuMinChunkSize();
-  if (available_to_alloc < min_chunk_size) {
-    available_to_alloc = 0;
-  }
-  VLOG(10) << "GPU usage " << (available >> 20) << "M/" << (total >> 20)
-           << "M, " << (available_to_alloc >> 20) << "M available to allocate";
-  return available_to_alloc;
+  return phi::backends::gpu::GpuAvailableMemToAlloc();
 }
 
 size_t GpuMaxAllocSize() {
@@ -124,6 +109,8 @@ size_t GpuInitAllocSize() { return GpuAllocSize(/* realloc = */ false); }
 
 size_t GpuReallocSize() { return GpuAllocSize(/* realloc = */ true); }
 
+size_t GpuMinChunkSize() { return phi::backends::gpu::GpuMinChunkSize(); }
+
 size_t GpuMaxChunkSize() {
   size_t max_chunk_size = GpuMaxAllocSize();
   VLOG(10) << "Max chunk size " << (max_chunk_size >> 20) << "M";
diff --git a/paddle/fluid/platform/device/gpu/gpu_info.h b/paddle/fluid/platform/device/gpu/gpu_info.h
index bb876f5c526d5147be296d9337bcd98fc1498b0a..925a150b60efb01911c113f540b2349eece873e7 100644
--- a/paddle/fluid/platform/device/gpu/gpu_info.h
+++ b/paddle/fluid/platform/device/gpu/gpu_info.h
@@ -82,7 +82,8 @@ size_t GpuInitAllocSize();
 //! Get the re-allocation size of current GPU device.
 size_t GpuReallocSize();
 
-using phi::backends::gpu::GpuMinChunkSize;
+//! Get the minimum chunk size for GPU buddy allocator.
+size_t GpuMinChunkSize();
 
 //! Get the maximum chunk size for GPU buddy allocator.
 size_t GpuMaxChunkSize();
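Both fluid entry points become one-line wrappers, and the 5% reservation arithmetic moves wholesale into `phi::backends::gpu::GpuAvailableMemToAlloc` (see the phi hunk further down). A standalone sketch of that arithmetic, with a hypothetical 8 GiB of free device memory:

```cpp
// Standalone sketch of the reservation arithmetic being moved into
// phi::backends::gpu::GpuAvailableMemToAlloc. Input values are hypothetical.
#include <cstddef>
#include <iostream>

int main() {
  constexpr float fraction_reserve_gpu_memory = 0.05f;  // 5% head-room
  constexpr size_t min_chunk_size = 1 << 8;             // 256 bytes

  size_t available = 8ULL << 30;  // pretend the driver reports 8 GiB free
  size_t reserving =
      static_cast<size_t>(fraction_reserve_gpu_memory * available);
  size_t available_to_alloc = available - reserving;
  // Below the minimum chunk size, no usable memory exists.
  if (available_to_alloc < min_chunk_size) available_to_alloc = 0;

  // Prints 7782 (MiB): roughly 8 GiB minus the ~5% (about 410 MiB) reserve.
  std::cout << (available_to_alloc >> 20) << " MiB available to allocate\n";
  return 0;
}
```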
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc
index 7ceaeb8259a8d6c776f88863a358ffcee1a22573..cde64029d9c0a36a36f46f7bad33e0d05bbec0eb 100644
--- a/paddle/fluid/platform/init.cc
+++ b/paddle/fluid/platform/init.cc
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/phi/backends/cpu/cpu_info.h"
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
+#include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #endif
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/fluid/platform/dynload/cupti.h"
@@ -468,6 +469,9 @@ void InitMemoryMethod() {
     memory_method->copy = paddle::memory::Copy;
     memory_method->device_memory_stat_current_value =
        paddle::memory::DeviceMemoryStatCurrentValue;
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+    memory_method->gpu_memory_usage = paddle::platform::GpuMemoryUsage;
+#endif
     memory_utils.Init(std::move(memory_method));
   });
 }
diff --git a/paddle/fluid/platform/profiler/host_event_recorder.h b/paddle/fluid/platform/profiler/host_event_recorder.h
index 7b7152be7437847dcfe12280632a523182a85aed..b93457099bae4100c3c007a4c45c5018989b35b4 100644
--- a/paddle/fluid/platform/profiler/host_event_recorder.h
+++ b/paddle/fluid/platform/profiler/host_event_recorder.h
@@ -18,7 +18,6 @@
 #include 
 #include 
 
-#include "paddle/fluid/framework/new_executor/workqueue/thread_data_registry.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/os_info.h"
 #include "paddle/phi/api/profiler/host_event_recorder.h"
diff --git a/paddle/phi/backends/gpu/gpu_info.cc b/paddle/phi/backends/gpu/gpu_info.cc
index a2399554ba85311830d54a362e70173199c1a930..417ff4c72e86c907266e0b5d83c150cd64f87921 100644
--- a/paddle/phi/backends/gpu/gpu_info.cc
+++ b/paddle/phi/backends/gpu/gpu_info.cc
@@ -18,6 +18,9 @@ limitations under the License. */
 
 #include 
 
 #include "gflags/gflags.h"
+#include "glog/logging.h"
+
+#include "paddle/phi/common/memory_utils.h"
 
 DECLARE_string(selected_gpus);
@@ -56,6 +59,30 @@ std::vector<int> GetSelectedDevices() {
   return devices;
 }
 
+constexpr static float fraction_reserve_gpu_memory = 0.05f;
+
+size_t GpuAvailableMemToAlloc() {
+  size_t total = 0;
+  size_t available = 0;
+  memory_utils::GpuMemoryUsage(&available, &total);
+  size_t reserving =
+      static_cast<size_t>(fraction_reserve_gpu_memory * available);
+  // If available size is less than minimum chunk size, no usable memory exists
+  size_t available_to_alloc = available - reserving;
+  size_t min_chunk_size = GpuMinChunkSize();
+  if (available_to_alloc < min_chunk_size) {
+    available_to_alloc = 0;
+  }
+  VLOG(10) << "GPU usage " << (available >> 20) << "M/" << (total >> 20)
+           << "M, " << (available_to_alloc >> 20) << "M available to allocate";
+  return available_to_alloc;
+}
+
+size_t GpuMinChunkSize() {
+  // The minimum chunk size that can be allocated is 256 bytes.
+  return 1 << 8;
+}
+
 }  // namespace gpu
 }  // namespace backends
 }  // namespace phi
diff --git a/paddle/phi/backends/gpu/gpu_info.h b/paddle/phi/backends/gpu/gpu_info.h
index 6ba8863bc39e4f5145a47e799eae06917118bb58..ebf57bd06eb19d10b1c0b49d2c40365b56086609 100644
--- a/paddle/phi/backends/gpu/gpu_info.h
+++ b/paddle/phi/backends/gpu/gpu_info.h
@@ -70,11 +70,12 @@ const gpuDeviceProp &GetDeviceProperties(int id);
 //! Set the GPU device id for next execution.
 void SetDeviceId(int device_id);
 
+//! Get the memory available to allocate: the device's available memory
+//! minus the reserved fraction.
+size_t GpuAvailableMemToAlloc();
+
 //! Get the minimum chunk size for GPU buddy allocator.
-inline size_t GpuMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 256 bytes.
-  return 1 << 8;
-}
+size_t GpuMinChunkSize();
 
 //! Copy memory from address src to dst asynchronously.
 void GpuMemcpyAsync(void *dst,
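The `InitMemoryMethod` registration above is the other half of the move: phi cannot link against fluid, so fluid injects `paddle::platform::GpuMemoryUsage` into phi's `MemoryInterface` slot at startup, and the relocated `GpuAvailableMemToAlloc` reaches the driver only through that slot. A reduced, self-contained sketch of the function-pointer injection pattern; every name below is illustrative rather than Paddle's:

```cpp
// Reduced sketch of the dependency inversion used here: the lower layer
// declares a function-pointer slot, the upper layer fills it at startup.
// All names are illustrative, not Paddle's.
#include <cstddef>
#include <stdexcept>

// --- lower layer (plays the role of phi) ---
struct MemoryInterface {
  void (*gpu_memory_usage)(size_t* available, size_t* total) = nullptr;
};

MemoryInterface& Instance() {
  static MemoryInterface instance;
  return instance;
}

size_t AvailableMemToAlloc() {
  if (Instance().gpu_memory_usage == nullptr)
    throw std::runtime_error("gpu_memory_usage is not initialized yet");
  size_t available = 0, total = 0;
  Instance().gpu_memory_usage(&available, &total);
  return available - static_cast<size_t>(0.05f * available);
}

// --- upper layer (plays the role of fluid) ---
void DriverGpuMemoryUsage(size_t* available, size_t* total) {
  *available = 6ULL << 30;  // stand-in for a real driver query
  *total = 8ULL << 30;
}

void InitMemoryMethod() { Instance().gpu_memory_usage = DriverGpuMemoryUsage; }
```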
diff --git a/paddle/phi/common/memory_utils.cc b/paddle/phi/common/memory_utils.cc
index cc1e9d75b6f4a8c97807ab548d85938077b3b936..4a8c8f776b286be03987becc654a2de683b5665c 100644
--- a/paddle/phi/common/memory_utils.cc
+++ b/paddle/phi/common/memory_utils.cc
@@ -68,6 +68,13 @@ int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id) {
   return MemoryUtils::Instance().DeviceMemoryStatCurrentValue(stat_type,
                                                               dev_id);
 }
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+void GpuMemoryUsage(size_t* available, size_t* total) {
+  return MemoryUtils::Instance().GpuMemoryUsage(available, total);
+}
+#endif
+
 }  // namespace memory_utils
 
 }  // namespace phi
diff --git a/paddle/phi/common/memory_utils.h b/paddle/phi/common/memory_utils.h
index 6dbf3d5d07acd09b326c3ef5d036dec850c3adb9..180385c6c5255cc4dc6d1d9ffbb4a25d43eda98f 100644
--- a/paddle/phi/common/memory_utils.h
+++ b/paddle/phi/common/memory_utils.h
@@ -113,6 +113,16 @@ struct MemoryInterface {
    */
   int64_t (*device_memory_stat_current_value)(const std::string& stat_type,
                                               int dev_id);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  /**
+   * @brief get the memory usage of current GPU device.
+   *
+   * @param[out] available  device memory available to allocate
+   * @param[out] total      total device memory
+   */
+  void (*gpu_memory_usage)(size_t* available, size_t* total);
+#endif
 };
 
 class MemoryUtils {
@@ -234,6 +244,18 @@ class MemoryUtils {
     return memory_method_->device_memory_stat_current_value(stat_type, dev_id);
   }
 
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+  void GpuMemoryUsage(size_t* available, size_t* total) {
+    CheckMemoryMethod();
+    PADDLE_ENFORCE_NOT_NULL(
+        memory_method_->gpu_memory_usage,
+        phi::errors::Unavailable(
+            "gpu_memory_usage method in memory_method_ is not initialized "
+            "yet. You need to init it first."));
+    return memory_method_->gpu_memory_usage(available, total);
+  }
+#endif
+
   void CheckMemoryMethod() {
     PADDLE_ENFORCE_NE(
         memory_method_.get(),
@@ -288,7 +310,13 @@ void Copy(const Place& dst_place,
           const Place& src_place,
           const void* src,
           size_t num);
+
 int64_t DeviceMemoryStatCurrentValue(const std::string& stat_type, int dev_id);
+
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+void GpuMemoryUsage(size_t* available, size_t* total);
+#endif
+
 }  // namespace memory_utils
 }  // namespace phi
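With the slot declared and checked, phi-side code can query device memory without any fluid include. A usage sketch, assuming a CUDA/HIP build in which `InitMemoryMethod()` has already run (otherwise the `PADDLE_ENFORCE_NOT_NULL` above fires):

```cpp
// Usage sketch: assumes InitMemoryMethod() has already registered the
// fluid callback and the build defines PADDLE_WITH_CUDA or PADDLE_WITH_HIP.
#include <cstddef>

#include "paddle/phi/common/memory_utils.h"

size_t FreeDeviceMemoryMiB() {
  size_t available = 0;
  size_t total = 0;
  phi::memory_utils::GpuMemoryUsage(&available, &total);
  return available >> 20;  // bytes -> MiB
}
```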
diff --git a/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h b/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h
index 79da6d3b1b4a03b4508683c13c903486e8031973..6e75d40d45174900d7a0078e53b9af7474d2555a 100644
--- a/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h
+++ b/paddle/phi/kernels/gpudnn/conv_gpudnn_base.h
@@ -20,7 +20,6 @@ limitations under the License. */
 #include 
 #include 
 
-#include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/common/memory_utils.h"
 #include "paddle/phi/kernels/autotune/cache.h"
@@ -53,7 +52,7 @@ static size_t CalcWorkspaceLimitInBytes(bool use_fixed_workspace) {
         memory_utils::DeviceMemoryStatCurrentValue("Allocated", device_id);
     int64_t reserved =
         memory_utils::DeviceMemoryStatCurrentValue("Reserved", device_id);
-    int64_t availble = paddle::platform::GpuAvailableMemToAlloc();
+    int64_t availble = phi::backends::gpu::GpuAvailableMemToAlloc();
     VLOG(3) << "[memory] allocated=" << ToMegaBytes(allocated)
             << " MB, reserved=" << ToMegaBytes(reserved)
             << " MB, available_to_alloc=" << ToMegaBytes(availble) << " MB.";
diff --git a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
index 8570b4be5777813a2d9abc44a4abf084194ed3b7..d81c03ceabb5c663c309670258324b6f0758bcf5 100644
--- a/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
+++ b/paddle/phi/kernels/gpudnn/conv_grad_kernel.cu
@@ -627,7 +627,6 @@ void ConvCudnnGradKernel(const Context& ctx,
       compute_format == phi::backends::gpu::DataLayout::kNHWC
           ? phi::backends::gpu::DataLayout::kNHWC
           : phi::backends::gpu::DataLayout::kNCHW;
-  // TODO(phlrain): replace paddle::platform::DataLaytout to phi::DataLayout
   if (transformed_input.dims().size() == 5) {
     layout = compute_format == phi::backends::gpu::DataLayout::kNHWC
                  ? phi::backends::gpu::DataLayout::kNDHWC
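With the fluid header dropped, the conv autotuning path reads the available-memory figure directly from phi. As a closing illustration of how such a figure can bound a workspace request, a hypothetical helper (not the body of `CalcWorkspaceLimitInBytes`):

```cpp
// Hypothetical helper, not Paddle's CalcWorkspaceLimitInBytes: caps a
// cuDNN workspace request by the queryable available-to-alloc figure.
#include <algorithm>
#include <cstddef>

#include "paddle/phi/backends/gpu/gpu_info.h"

size_t ClampWorkspace(size_t requested_bytes) {
  size_t available = phi::backends::gpu::GpuAvailableMemToAlloc();
  // Never hand cuDNN more scratch space than the allocator could satisfy.
  return std::min(requested_bytes, available);
}
```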