Merge pull request #15131 from panyx0718/clean

hide temp tensor allocation

Merge pull request #15131 from panyx0718/clean
hide temp tensor allocation
087af6a6 · Xin Pan · GitHub · adc96e06 · 9186451f · 087af6a6
5 changed files
--- a/paddle/fluid/framework/operator.h
+++ b/paddle/fluid/framework/operator.h
@@ -377,6 +377,30 @@ class ExecutionContext {
    return op_.Outputs(name);
  }
+  template <typename T, typename DevContext>
+  Tensor AllocateTmpTensor(const framework::DDim& dim,
+                           const DevContext& dev_ctx) const {
+    auto tmp_allocation_ptr = platform::DeviceTemporaryAllocator::Instance()
+                                  .Get<DevContext>(dev_ctx)
+                                  .Allocate(product(dim) * sizeof(T));
+    auto& deleter = tmp_allocation_ptr.get_deleter();
+    auto* allocation_ptr = tmp_allocation_ptr.release();
+    auto shared_allocation = std::shared_ptr<memory::allocation::Allocation>(
+        allocation_ptr, deleter);
+    PADDLE_ENFORCE(
+        dynamic_cast<platform::TemporaryAllocation*>(allocation_ptr) != nullptr,
+        "The AllocationPtr must be TemporaryAllocation.");
+    PADDLE_ENFORCE_EQ(allocation_ptr->size(),
+                      framework::product(dim) * sizeof(T));
+    paddle::framework::Tensor temp_tensor(
+        framework::ToDataType(std::type_index(typeid(T))));
+    temp_tensor.Resize(dim);
+    temp_tensor.ResetHolder(std::move(shared_allocation));
+    return temp_tensor;
+  }
 private:
  const OperatorBase& op_;
  const Scope& scope_;

--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -151,27 +151,5 @@ void TensorToVector(const Tensor& src, std::vector<T>* dst) {
  memory::Copy(dst_place, dst_ptr, boost::get<platform::CPUPlace>(src.place()),
               src_ptr, size);
 }
-template <typename T>
-paddle::framework::Tensor GetTensor(
-    memory::allocation::AllocationPtr temp_allocation_ptr,
-    const framework::DDim& dim) {
-  auto& deleter = temp_allocation_ptr.get_deleter();
-  auto* allocation_ptr = temp_allocation_ptr.release();
-  auto shared_allocation =
-      std::shared_ptr<memory::allocation::Allocation>(allocation_ptr, deleter);
-  PADDLE_ENFORCE(
-      dynamic_cast<platform::TemporaryAllocation*>(allocation_ptr) != nullptr,
-      "The AllocationPtr must be TemporaryAllocation.");
-  PADDLE_ENFORCE_EQ(allocation_ptr->size(),
-                    framework::product(dim) * sizeof(T));
-  paddle::framework::Tensor temp_tensor(
-      framework::ToDataType(std::type_index(typeid(T))));
-  temp_tensor.Resize(dim);
-  temp_tensor.ResetHolder(std::move(shared_allocation));
-  return temp_tensor;
-}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
@@ -18,7 +18,6 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/depthwise_conv.h"
 #include "paddle/fluid/operators/math/im2col.h"
@@ -158,10 +157,7 @@ class GemmConvKernel : public framework::OpKernel<T> {
    // to call the matrix multiplication interface.
    Tensor col_matrix;
    if (is_expand) {
-      auto tmp_allocation_ptr =
+      col = context.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
-          platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
-              framework::product(col_shape) * sizeof(T));
-      col = framework::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
      col_matrix.ShareDataWith(col);
      col_matrix.Resize(col_matrix_shape);
    }
@@ -293,10 +289,7 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
    // to call the matrix multiplication interface.
    Tensor col_matrix;
    if (is_expand) {
-      auto tmp_allocation_ptr =
+      col = context.AllocateTmpTensor<T, DeviceContext>(col_shape, dev_ctx);
-          platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
-              framework::product(col_shape) * sizeof(T));
-      col = framework::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
      col_matrix.ShareDataWith(col);
      col_matrix.Resize(col_matrix_shape);
    }

--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -100,7 +100,7 @@ ENDIF()
 nv_library(cuda_device_guard SRCS cuda_device_guard.cc DEPS gpu_info)
 if(WITH_GPU)
-    nv_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor)
+    nv_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor operator)
 else()
-    cc_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor)
+    cc_test(temporal_allocator_test SRCS temporary_allocator_test.cc DEPS temp_allocator tensor operator)
 endif()
--- a/paddle/fluid/platform/temporary_allocator_test.cc
+++ b/paddle/fluid/platform/temporary_allocator_test.cc
@@ -14,12 +14,27 @@
 #include "paddle/fluid/platform/temporary_allocator.h"
 #include <gtest/gtest.h>
+#include <string>
+#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/tensor_util.h"
 DECLARE_double(limit_of_temporary_allocation);
 namespace paddle {
 namespace platform {
+class DummyOp : public framework::OperatorBase {
+ public:
+  DummyOp(const std::string& type, const framework::VariableNameMap& inputs,
+          const framework::VariableNameMap& outputs,
+          const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+ protected:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& place) const override {}
+};
 TEST(temporary_allocator, temporary_allocator) {
  platform::CPUPlace cpu_place;
  TemporaryAllocator alloc(cpu_place);
@@ -68,96 +83,92 @@ TEST(temporary_allocator, add_callback) {
 }
 TEST(temporary_allocator, create_tensor_with_allocationptr) {
-  platform::CPUPlace cpu_place;
+  framework::VariableNameMap dummy_vars;
-  TemporaryAllocator cpu_alloc(cpu_place);
+  framework::AttributeMap dummy_attrs;
+  DummyOp op("dummy", dummy_vars, dummy_vars, dummy_attrs);
+  framework::Scope scope;
+  framework::VariableValueMap vars;
+  framework::RuntimeContext run_ctx(vars, vars);
+  size_t memory_size = 300;
  {
-    size_t memory_size = 200;
+    platform::CPUPlace cpu_place;
-    auto allocation = cpu_alloc.Allocate(memory_size);
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    void* address = allocation->ptr();
+    auto* dev_ctx =
+        static_cast<platform::CPUDeviceContext*>(pool.Get(cpu_place));
+    framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx);
    int numel = memory_size / sizeof(float);
-    framework::Tensor tensor = framework::GetTensor<float>(
+    framework::Tensor tensor =
-        std::move(allocation), framework::make_ddim({numel}));
+        ctx.AllocateTmpTensor<float, platform::CPUDeviceContext>(
-    PADDLE_ENFORCE_EQ(address, tensor.data<float>());
+            framework::make_ddim({numel}), *dev_ctx);
    PADDLE_ENFORCE_EQ(tensor.numel(), numel);
  }
 #ifdef PADDLE_WITH_CUDA
-  platform::CUDAPlace gpu_place(0);
-  TemporaryAllocator gpu_alloc(gpu_place);
  {
-    size_t memory_size = 300;
+    platform::CUDAPlace gpu_place(0);
-    auto allocation = gpu_alloc.Allocate(memory_size);
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    void* address = allocation->ptr();
+    auto* dev_ctx =
+        static_cast<platform::CUDADeviceContext*>(pool.Get(gpu_place));
+    framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx);
    int numel = memory_size / sizeof(float);
-    framework::Tensor tensor = framework::GetTensor<float>(
+    framework::Tensor tensor =
-        std::move(allocation), framework::make_ddim({numel}));
+        ctx.AllocateTmpTensor<float, platform::CUDADeviceContext>(
-    PADDLE_ENFORCE_EQ(address, tensor.data<float>());
+            framework::make_ddim({numel}), *dev_ctx);
    PADDLE_ENFORCE_EQ(tensor.numel(), numel);
  }
-  // The allocation is not holded now, it should be placed to
-  // TemporaryAllocationQueue.
-  PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1);
-  gpu_alloc.Release([]() {});
-  PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
 #endif
 }
 TEST(temporary_allocator, create_tensor_with_allocationptr2) {
-  platform::CPUPlace cpu_place;
+  framework::VariableNameMap dummy_vars;
-  TemporaryAllocator cpu_alloc(cpu_place);
+  framework::AttributeMap dummy_attrs;
+  DummyOp op("dummy", dummy_vars, dummy_vars, dummy_attrs);
+  framework::Scope scope;
+  framework::VariableValueMap vars;
+  framework::RuntimeContext run_ctx(vars, vars);
+  size_t memory_size = 400;
  {
-    size_t memory_size = 400;
+    platform::CPUPlace cpu_place;
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto* dev_ctx =
+        static_cast<platform::CPUDeviceContext*>(pool.Get(cpu_place));
+    framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx);
    int numel = memory_size / sizeof(float);
    framework::Tensor out_side_tensor;
-    void* address;
    {
-      auto allocation = cpu_alloc.Allocate(memory_size);
+      framework::Tensor tensor =
-      address = allocation->ptr();
+          ctx.AllocateTmpTensor<float, platform::CPUDeviceContext>(
-      framework::Tensor tensor = framework::GetTensor<float>(
+              framework::make_ddim({numel}), *dev_ctx);
-          std::move(allocation), framework::make_ddim({numel}));
-      PADDLE_ENFORCE_EQ(address, tensor.data<float>());
      PADDLE_ENFORCE_EQ(tensor.numel(), numel);
      out_side_tensor.ShareDataWith(tensor);
    }
-    PADDLE_ENFORCE_EQ(address, out_side_tensor.data<float>());
    PADDLE_ENFORCE_EQ(out_side_tensor.numel(), numel);
  }
 #ifdef PADDLE_WITH_CUDA
-  platform::CUDAPlace gpu_place(0);
-  TemporaryAllocator gpu_alloc(gpu_place);
  {
-    void* address;
+    platform::CUDAPlace gpu_place(0);
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto* dev_ctx =
+        static_cast<platform::CUDADeviceContext*>(pool.Get(gpu_place));
+    framework::ExecutionContext ctx(op, scope, *dev_ctx, run_ctx);
    size_t memory_size = 500;
    int numel = memory_size / sizeof(float);
    framework::Tensor out_side_tensor;
    {
-      auto allocation = gpu_alloc.Allocate(memory_size);
+      framework::Tensor tensor =
-      address = allocation->ptr();
+          ctx.AllocateTmpTensor<float, platform::CUDADeviceContext>(
-      framework::Tensor tensor = framework::GetTensor<float>(
+              framework::make_ddim({numel}), *dev_ctx);
-          std::move(allocation), framework::make_ddim({numel}));
-      PADDLE_ENFORCE_EQ(address, tensor.data<float>());
      PADDLE_ENFORCE_EQ(tensor.numel(), numel);
      out_side_tensor.ShareDataWith(tensor);
    }
-    PADDLE_ENFORCE_EQ(address, out_side_tensor.data<float>());
    PADDLE_ENFORCE_EQ(out_side_tensor.numel(), numel);
-    // The allocation is holded by out_side_tensor.
-    PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
-    gpu_alloc.Release([]() {});
-    PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
  }
-  // The allocation is not holded now, it should be placed to
-  // TemporaryAllocationQueue.
-  PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 1);
-  gpu_alloc.Release([]() {});
-  PADDLE_ENFORCE_EQ(gpu_alloc.TemporaryAllocationQueueSize(), 0);
 #endif
 }