Commit 24103cbb, authored Feb 08, 2022 by Wilber; committed via GitHub on Feb 08, 2022.
[PTEN] Update gpu_context. (#39359)
* gpu_context.
* update
* update
* update
Parent: 0fee0044

Showing 7 changed files with 280 additions and 237 deletions (+280, -237):
paddle/fluid/operators/conv_cudnn_helper.h (+0, -1)
paddle/fluid/operators/math/im2col.cu (+36, -20)
paddle/fluid/operators/math/vol2col.cu (+159, -157)
paddle/fluid/platform/device_context.cc (+15, -4)
paddle/fluid/platform/device_context.h (+2, -1)
paddle/pten/backends/gpu/gpu_context.cc (+18, -52)
paddle/pten/backends/gpu/gpu_context.h (+50, -2)
paddle/fluid/operators/conv_cudnn_helper.h

@@ -288,7 +288,6 @@ struct SearchAlgorithm<cudnnConvolutionFwdAlgoPerf_t> {
         ctx.template device_context<platform::CUDADeviceContext>();
     auto workspace_handle = dev_ctx.cudnn_workspace_handle();
-    auto& temp = ctx.cuda_device_context();
     AlgorithmsCache<algo_t>& algo_cache =
         *(framework::ConvSearchCache::Instance().GetForward());
paddle/fluid/operators/math/im2col.cu

@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/im2col.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/pten/backends/gpu/gpu_context.h"

 namespace paddle {
 namespace operators {
@@ -73,12 +74,12 @@ __global__ void im2col(const T* data_im, int num_outs, int im_height,
  * col =
  * [input_channels, filter_height, filter_width, output_height, output_width]
  */
-template <class T>
-class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
-                    platform::CUDADeviceContext, T> {
+template <class DeviceContext, class T>
+class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO, DeviceContext,
+                    T> {
  public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& im, const std::vector<int>& dilation,
+  void operator()(const DeviceContext& context, const framework::Tensor& im,
+                  const std::vector<int>& dilation,
                   const std::vector<int>& stride,
                   const std::vector<int>& padding, framework::Tensor* col,
                   const DataLayout data_layout) {
@@ -184,12 +185,11 @@ __global__ void col2im(int n, const T* data_col, int im_height, int im_width,
  * col =
  * [input_channels, filter_height, filter_width, output_height, output_width]
  */
-template <class T>
-class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
-                    platform::CUDADeviceContext, T> {
+template <class DeviceContext, class T>
+class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO, DeviceContext,
+                    T> {
  public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& col,
+  void operator()(const DeviceContext& context, const framework::Tensor& col,
                   const std::vector<int>& dilation,
                   const std::vector<int>& stride,
                   const std::vector<int>& padding, framework::Tensor* im,
@@ -257,10 +257,18 @@ template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
                              platform::CUDADeviceContext, float>;
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
                              platform::CUDADeviceContext, double>;
+template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
+                             pten::GPUContext, float>;
+template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
+                             pten::GPUContext, double>;
 template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
                              platform::CUDADeviceContext, float>;
 template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
                              platform::CUDADeviceContext, double>;
+template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
+                             pten::GPUContext, float>;
+template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
+                             pten::GPUContext, double>;

 template <class T>
 __global__ void im2colOCF(const T* im_data, int im_channels, int im_height,
@@ -299,12 +307,12 @@ __global__ void im2colOCF(const T* im_data, int im_channels, int im_height,
  * col =
  * [output_height, output_width, input_channels, filter_height, filter_width]
  */
-template <class T>
-class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
-                    platform::CUDADeviceContext, T> {
+template <class DeviceContext, class T>
+class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF, DeviceContext,
+                    T> {
  public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& im, const std::vector<int>& dilation,
+  void operator()(const DeviceContext& context, const framework::Tensor& im,
+                  const std::vector<int>& dilation,
                   const std::vector<int>& stride,
                   const std::vector<int>& padding, framework::Tensor* col,
                   const DataLayout data_layout) {
@@ -390,12 +398,11 @@ __global__ void col2imOCF(const T* col_data, int im_channels, int im_height,
  * col =
  * [output_height, output_width, input_channels, filter_height, filter_width]
  */
-template <class T>
-class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
-                    platform::CUDADeviceContext, T> {
+template <class DeviceContext, class T>
+class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF, DeviceContext,
+                    T> {
  public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& col,
+  void operator()(const DeviceContext& context, const framework::Tensor& col,
                   const std::vector<int>& dilation,
                   const std::vector<int>& stride,
                   const std::vector<int>& padding, framework::Tensor* im,
@@ -464,10 +471,19 @@ template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
                              platform::CUDADeviceContext, float>;
 template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
                              platform::CUDADeviceContext, double>;
+template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
+                             pten::GPUContext, float>;
+template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
+                             pten::GPUContext, double>;
 template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
                              platform::CUDADeviceContext, float>;
 template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
                              platform::CUDADeviceContext, double>;
+template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
+                             pten::GPUContext, float>;
+template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
+                             pten::GPUContext, double>;
 }  // namespace math
 }  // namespace operators
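After this hunk, the kCFO/kOCF functors are explicitly instantiated for both platform::CUDADeviceContext and pten::GPUContext. A minimal sketch of a hypothetical call site (the helper RunIm2Col and all argument values are illustrative, not part of this commit):

// Hypothetical helper: the same functor template now accepts either
// device-context type as its first parameter.
template <typename Context>
void RunIm2Col(const Context& dev_ctx, const framework::Tensor& im,
               framework::Tensor* col) {
  paddle::operators::math::Im2ColFunctor<
      paddle::operators::math::ColFormat::kCFO, Context, float>
      im2col;
  // dilation/stride/padding values and the layout name are placeholders.
  im2col(dev_ctx, im, /*dilation=*/{1, 1}, /*stride=*/{1, 1},
         /*padding=*/{0, 0, 0, 0}, col, DataLayout::kNCHW);
}
// Both of these would compile against the single template:
//   RunIm2Col<platform::CUDADeviceContext>(...);
//   RunIm2Col<pten::GPUContext>(...);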
paddle/fluid/operators/math/vol2col.cu

@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/math/vol2col.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/pten/backends/gpu/gpu_context.h"

 namespace paddle {
 namespace operators {
@@ -82,13 +83,13 @@ __global__ void vol2col(int num_kernels, const T* data_vol, int depth,
  * [input_channels, filter_depth, filter_height, filter_width,
  * output_depth, output_height, output_width]
  */
-template <class T>
-class Vol2ColFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& vol,
-                  const std::vector<int>& dilations,
-                  const std::vector<int>& strides,
+// template <class DeviceContext, class T>
+// class Vol2ColFunctor {
+//  public:
+template <class DeviceContext, class T>
+void Vol2ColFunctor<DeviceContext, T>::operator()(
+    const DeviceContext& context, const framework::Tensor& vol,
+    const std::vector<int>& dilations, const std::vector<int>& strides,
     const std::vector<int>& paddings, framework::Tensor* col,
     const DataLayout data_layout) const {
   PADDLE_ENFORCE_EQ(vol.dims().size(), 4,
@@ -126,8 +127,7 @@ class Vol2ColFunctor<platform::CUDADeviceContext, T> {
       ((dilations[0] * (filter_depth - 1) + 1))) /
           strides[0] +
       1;
-  PADDLE_ENFORCE_EQ(
-      input_depth_tmp, output_depth,
+  PADDLE_ENFORCE_EQ(input_depth_tmp, output_depth,
                     platform::errors::InvalidArgument(
                         "input_depth(%d) and output_depth(%d) are mismatching.",
                         input_depth_tmp, output_depth));
@@ -144,8 +144,7 @@ class Vol2ColFunctor<platform::CUDADeviceContext, T> {
       ((dilations[2] * (filter_width - 1) + 1))) /
           strides[2] +
       1;
-  PADDLE_ENFORCE_EQ(
-      input_width_tmp, output_width,
+  PADDLE_ENFORCE_EQ(input_width_tmp, output_width,
                     platform::errors::InvalidArgument(
                         "input_width(%d) and output_width(%d) are mismatching.",
                         input_width_tmp, output_width));
@@ -167,8 +166,8 @@ class Vol2ColFunctor<platform::CUDADeviceContext, T> {
                               filter_width, strides[0], strides[1], strides[2],
                               pad_d_forth, pad_h_up, pad_w_left, output_depth,
                               output_height, output_width, col->data<T>(),
                               data_layout);
-  }
-};
+}
+// };

 template <class T>
 __global__ void col2vol(int num_kernels, const T* data_col, int depth,
@@ -249,13 +248,13 @@ __global__ void col2vol(int num_kernels, const T* data_col, int depth,
  * [input_channels, filter_depth, filter_height, filter_width,
  * output_depth, output_height, output_width]
  */
-template <class T>
-class Col2VolFunctor<platform::CUDADeviceContext, T> {
- public:
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& col,
-                  const std::vector<int>& dilations,
-                  const std::vector<int>& strides,
+// template <class DeviceContext, class T>
+// class Col2VolFunctor<DeviceContext, T> {
+//  public:
+template <class DeviceContext, class T>
+void Col2VolFunctor<DeviceContext, T>::operator()(
+    const DeviceContext& context, const framework::Tensor& col,
+    const std::vector<int>& dilations, const std::vector<int>& strides,
     const std::vector<int>& paddings, framework::Tensor* vol,
     const DataLayout data_layout) const {
   PADDLE_ENFORCE_EQ(vol->dims().size(), 4,
@@ -294,8 +293,7 @@ class Col2VolFunctor<platform::CUDADeviceContext, T> {
       ((dilations[0] * (filter_depth - 1) + 1))) /
           strides[0] +
       1;
-  PADDLE_ENFORCE_EQ(
-      input_depth_tmp, output_depth,
+  PADDLE_ENFORCE_EQ(input_depth_tmp, output_depth,
                     platform::errors::InvalidArgument(
                         "input_depth(%d) and output_depth(%d) are mismatching.",
                         input_depth_tmp, output_depth));
@@ -312,8 +310,7 @@ class Col2VolFunctor<platform::CUDADeviceContext, T> {
       ((dilations[2] * (filter_width - 1) + 1))) /
           strides[2] +
       1;
-  PADDLE_ENFORCE_EQ(
-      input_width_tmp, output_width,
+  PADDLE_ENFORCE_EQ(input_width_tmp, output_width,
                     platform::errors::InvalidArgument(
                         "input_width(%d) and output_width(%d) are mismatching.",
                         input_width_tmp, output_width));
@@ -334,13 +331,18 @@ class Col2VolFunctor<platform::CUDADeviceContext, T> {
                               filter_width, strides[0], strides[1], strides[2],
                               pad_d_forth, pad_h_up, pad_w_left, output_depth,
                               output_height, output_width, vol->data<T>(),
                               data_layout);
-  }
-};
+}
+// };

 template class Vol2ColFunctor<platform::CUDADeviceContext, float>;
 template class Vol2ColFunctor<platform::CUDADeviceContext, double>;
+template class Vol2ColFunctor<pten::GPUContext, float>;
+template class Vol2ColFunctor<pten::GPUContext, double>;
 template class Col2VolFunctor<platform::CUDADeviceContext, float>;
 template class Col2VolFunctor<platform::CUDADeviceContext, double>;
+template class Col2VolFunctor<pten::GPUContext, float>;
+template class Col2VolFunctor<pten::GPUContext, double>;

 }  // namespace math
 }  // namespace operators
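Unlike im2col.cu, vol2col.cu drops the specialization entirely: operator() is now defined once on the primary template and explicitly instantiated per context. A self-contained sketch of that C++ pattern, using stand-in types rather than Paddle's:

#include <cstdio>

struct CtxA {};  // stand-in for platform::CUDADeviceContext
struct CtxB {};  // stand-in for pten::GPUContext

template <class DeviceContext, class T>
class Functor {
 public:
  void operator()(const DeviceContext& ctx, T value) const;
};

// One out-of-class definition serves every context type.
template <class DeviceContext, class T>
void Functor<DeviceContext, T>::operator()(const DeviceContext&,
                                           T value) const {
  std::printf("value = %f\n", static_cast<double>(value));
}

// Explicit instantiation in the .cu/.cc file compiles the definition once
// per (context, type) pair, mirroring the instantiation lists above.
template class Functor<CtxA, float>;
template class Functor<CtxB, float>;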
paddle/fluid/platform/device_context.cc

@@ -16,6 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/stream/cuda_stream.h"
 #include "paddle/pten/backends/gpu/gpu_context.h"
+#include "paddle/pten/core/allocator.h"

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
@@ -485,8 +486,11 @@ CUDAContext::~CUDAContext() {
 CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
     : pten::GPUContext(place) {
   pten::GPUContext::PartialInitWithoutAllocator();
-  cuda_stream_.reset(
-      new stream::CUDAStream(pten::GPUContext::stream(), this->GetPlace()));
+  cuda_stream_.reset(
+      new stream::CUDAStream(pten::GPUContext::stream(), place));
+  workspace_.reset(new pten::DnnWorkspaceHandle(
+      memory::allocation::AllocatorFacade::Instance()
+          .GetAllocator(place, pten::GPUContext::stream())
+          .get()));
 }

 CUDADeviceContext::~CUDADeviceContext() = default;
@@ -571,8 +575,15 @@ void CUDADeviceContext::WaitStreamCallback() const {
   pten::GPUContext::WaitStreamCallback();
 }

-CudnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
-  return CudnnWorkspaceHandle(*this, &cudnn_handle_mtx_);
+pten::DnnWorkspaceHandle CUDADeviceContext::cudnn_workspace_handle() const {
+  if (thread_ctx_.count(this)) {
+    // return workspace_.get();
+    return pten::DnnWorkspaceHandle(
+        memory::allocation::AllocatorFacade::Instance()
+            .GetAllocator(GetPlace(), pten::GPUContext::stream())
+            .get());
+  }
+  return pten::GPUContext::cudnn_workspace_handle();
 }

 gpuStream_t CUDADeviceContext::stream() const {
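cudnn_workspace_handle() now returns a pten::DnnWorkspaceHandle by value, backed by the allocator facade. A hedged sketch of typical use; dev_ctx and the 64 MB request are assumptions for illustration, not from this commit:

// Assuming a CUDADeviceContext& dev_ctx is in scope:
auto workspace = dev_ctx.cudnn_workspace_handle();
size_t required_bytes = static_cast<size_t>(64) << 20;  // placeholder: 64 MB
workspace.RunFunc(
    [&](void* ws_ptr) {
      // ws_ptr points at a scratch buffer of at least required_bytes
      // (nullptr when nothing has been allocated); pass it to cudnn here.
    },
    required_bytes);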
paddle/fluid/platform/device_context.h

@@ -566,7 +566,7 @@ class CUDADeviceContext : public pten::GPUContext {
    * workspace. Once the handle is destructed, the lock would be released.
    * CudnnWorkspaceHandle is an RAII object to implement thread-safe
    * sequential cudnn function calls. */
-  CudnnWorkspaceHandle cudnn_workspace_handle() const;
+  pten::DnnWorkspaceHandle cudnn_workspace_handle() const;

   /*! \brief Return cuda stream in the device context. */
   gpuStream_t stream() const;

@@ -607,6 +607,7 @@ class CUDADeviceContext : public pten::GPUContext {
   // NOTE: Just for compatibility with the past, please delete if there is an
   // elegant way.
   std::unique_ptr<stream::CUDAStream> cuda_stream_;
+  std::unique_ptr<pten::DnnWorkspaceHandle> workspace_{nullptr};

   DISABLE_COPY_AND_ASSIGN(CUDADeviceContext);
 };
paddle/pten/backends/gpu/gpu_context.cc

@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/pten/backends/gpu/gpu_context.h"
+#include <algorithm>
 #include <array>
 #include <functional>
 #include <future>
@@ -153,55 +154,14 @@ static void StreamCallbackFunc(gpuStream_t stream,
 }  // namespace internal

-class DnnWorkspaceHandle {
- public:
-  explicit inline DnnWorkspaceHandle(Allocator* allocator)
-      : allocator_(allocator) {}
-
-  inline void RunFunc(const std::function<void(void*)>& cudnn_func,
-                      size_t required_workspace_bytes) {
-    if (required_workspace_bytes > WorkspaceSize()) {
-      ReallocWorkspace(required_workspace_bytes);
-    }
-    VLOG(2) << "Cudnn workspace size at RunFunc: "
-            << static_cast<double>(WorkspaceSize()) / (1 << 20) << " MB";
-    {
-      std::lock_guard<std::mutex> guard(mtx_);
-      cudnn_func(allocation_ ? allocation_->ptr() : nullptr);
-    }
-  }
-
-  /*! \brief Thread which call RunFuncSync() would release gpu memory after
-   * running the function. Currently this function is only used when cudnn
-   * exhaustive searching and callers have to guarantee that the input function
-   * is host blocking */
-  inline void RunFuncSync(const std::function<void(void*)>& cudnn_func,
-                          size_t required_workspace_bytes) {
-    RunFunc(cudnn_func, required_workspace_bytes);
-    ResetWorkspace();
-  }
-
-  inline size_t WorkspaceSize() {
-    if (allocation_ == nullptr) {
-      return 0;
-    }
-    return allocation_->size();
-  }
-
-  void ResetWorkspace() { allocation_ = nullptr; }
-
-  void ReallocWorkspace(size_t required_workspace_bytes) {
-    if (required_workspace_bytes <= WorkspaceSize()) return;
-    // reset allocation first before re-allocate to save memory
-    allocation_.reset();
-    allocation_ = allocator_->Allocate(required_workspace_bytes);
-  }
-
- private:
-  Allocator::AllocationPtr allocation_{nullptr};
-  Allocator* allocator_{nullptr};
-  std::mutex mtx_;
-};
+void DnnWorkspaceHandle::ResetWorkspace() { allocation_ = nullptr; }
+
+void DnnWorkspaceHandle::ReallocWorkspace(size_t required_workspace_bytes) {
+  if (required_workspace_bytes <= WorkspaceSize()) return;
+  // reset allocation first before re-allocate to save memory
+  allocation_.reset();
+  allocation_ = allocator_->Allocate(required_workspace_bytes);
+}

 struct GPUContext::Impl {
   void Init() {
@@ -341,9 +301,15 @@ struct GPUContext::Impl {
     }
   }

-  DnnWorkspaceHandle* GetDnnWorkspace() {
-    PD_CHECK(workspace_ != nullptr, "the gpu cudnn workspace is nullptr.");
-    return workspace_;
-  }
+  // TODO(wilber): The return type is a pointer, to be modified later.
+  // DnnWorkspaceHandle* GetDnnWorkspace() {
+  //   PD_CHECK(workspace_ != nullptr, "the gpu cudnn workspace is nullptr.");
+  //   return workspace_;
+  // }
+  DnnWorkspaceHandle GetDnnWorkspace() {
+    PD_CHECK(allocator_ != nullptr,
+             "the device allocator for gpu context is nullptr.");
+    return DnnWorkspaceHandle(allocator_);
+  }

   void InitStream() {
@@ -797,7 +763,7 @@ Eigen::GpuDevice* GPUContext::eigen_device() const {
   return impl_->eigen_device();
 }

-DnnWorkspaceHandle* GPUContext::cudnn_workspace_handle() {
+DnnWorkspaceHandle GPUContext::cudnn_workspace_handle() const {
   return impl_->GetDnnWorkspace();
 }
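On the pten side, GetDnnWorkspace() now builds a fresh handle from the context allocator on every call instead of handing out a shared pointer. Per the comment on RunFuncSync above, the synchronous variant also drops the scratch buffer afterwards; a sketch, where the context variable and the 256 MB request are illustrative:

// Assuming a pten::GPUContext ctx is in scope:
auto workspace = ctx.cudnn_workspace_handle();  // fresh handle, no buffer yet
workspace.RunFuncSync(
    [](void* ws_ptr) {
      // host-blocking cudnn call using ws_ptr, e.g. exhaustive search
    },
    static_cast<size_t>(256) << 20);  // placeholder: 256 MB
// ResetWorkspace() has run: WorkspaceSize() is 0 and the memory is released.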
paddle/pten/backends/gpu/gpu_context.h

@@ -16,6 +16,7 @@ limitations under the License. */
 #include <array>
 #include <functional>
+#include <mutex>

 #include "paddle/pten/backends/gpu/forwards.h"
 #include "paddle/pten/backends/gpu/gpu_decls.h"
 #include "paddle/pten/backends/gpu/gpu_helper.h"
@@ -24,7 +25,53 @@ limitations under the License. */
 namespace pten {

-class DnnWorkspaceHandle;
+class DnnWorkspaceHandle {
+ public:
+  explicit inline DnnWorkspaceHandle(Allocator* allocator)
+      : allocator_(allocator) {
+    mtx_.reset(new std::mutex());
+  }
+
+  inline void RunFunc(const std::function<void(void*)>& cudnn_func,
+                      size_t required_workspace_bytes) {
+    if (required_workspace_bytes > WorkspaceSize()) {
+      ReallocWorkspace(required_workspace_bytes);
+    }
+    {
+      std::lock_guard<std::mutex> guard(*mtx_);
+      cudnn_func(allocation_ ? allocation_->ptr() : nullptr);
+    }
+  }
+
+  /*! \brief Thread which call RunFuncSync() would release gpu memory after
+   * running the function. Currently this function is only used when cudnn
+   * exhaustive searching and callers have to guarantee that the input function
+   * is host blocking */
+  inline void RunFuncSync(const std::function<void(void*)>& cudnn_func,
+                          size_t required_workspace_bytes) {
+    RunFunc(cudnn_func, required_workspace_bytes);
+    ResetWorkspace();
+  }
+
+  inline size_t WorkspaceSize() {
+    if (allocation_ == nullptr) {
+      return 0;
+    }
+    return allocation_->size();
+  }
+
+  void ResetWorkspace();
+
+  void ReallocWorkspace(size_t required_workspace_bytes);
+
+  DnnWorkspaceHandle(DnnWorkspaceHandle&&) = default;
+  DnnWorkspaceHandle& operator=(DnnWorkspaceHandle&&) = delete;
+
+ private:
+  Allocator::AllocationPtr allocation_{nullptr};
+  Allocator* allocator_{nullptr};
+  std::unique_ptr<std::mutex> mtx_;
+};

 class GPUContext : public DeviceContext {
  public:
@@ -85,7 +132,8 @@ class GPUContext : public DeviceContext {
    * would be acquired to prevent other threads from accessing the
    * workspace. Once the handle is destructed, the lock would be released.
    */
-  DnnWorkspaceHandle* cudnn_workspace_handle();
+  // TODO(wilber): The return type is a pointer, to be modified later.
+  DnnWorkspaceHandle cudnn_workspace_handle() const;

  public:
   /*! \brief Call cublas function safely. */
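One detail worth noting in the header version: the mutex is held through std::unique_ptr rather than by value. std::mutex is neither movable nor copyable, so a plain mutex member would make the class non-movable, which the new return-by-value cudnn_workspace_handle() relies on. A minimal illustration with hypothetical, simplified types:

#include <memory>
#include <mutex>

class MovableHandle {
 public:
  MovableHandle() : mtx_(new std::mutex()) {}
  MovableHandle(MovableHandle&&) = default;  // fine: unique_ptr is movable
  MovableHandle& operator=(MovableHandle&&) = delete;  // as in DnnWorkspaceHandle

 private:
  std::unique_ptr<std::mutex> mtx_;  // a plain std::mutex member would
                                     // suppress the defaulted move ctor
};

MovableHandle Make() { return MovableHandle(); }  // returned by value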