From b9fb03cf54d4594269d7c78be521600f75cb6f49 Mon Sep 17 00:00:00 2001
From: chengduo
Date: Tue, 25 Dec 2018 07:17:23 -0600
Subject: [PATCH] Move GetTensor to tensor_util (#15011)

* refine tensor
test=develop

* refine tensor
test=develop

* fix device_context log
test=develop
---
 paddle/fluid/framework/CMakeLists.txt              |  4 +-
 paddle/fluid/framework/tensor.cc                   |  3 +-
 paddle/fluid/framework/tensor.h                    |  2 +-
 paddle/fluid/framework/tensor_util.h               | 22 ++++++++++
 paddle/fluid/operators/conv_op.h                   | 12 ++----
 .../fluid/operators/math/concat_and_split.cu       |  5 ++-
 .../create_tensor_with_allocationptr.h             | 42 -------------------
 paddle/fluid/platform/device_context.cc            |  7 ++--
 paddle/fluid/platform/device_context.h             | 23 +++++++++-
 paddle/fluid/platform/temporary_allocator.h        | 13 ++++++
 .../platform/temporary_allocator_test.cc           | 18 ++++----
 11 files changed, 80 insertions(+), 71 deletions(-)
 delete mode 100644 paddle/fluid/platform/create_tensor_with_allocationptr.h

diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
index 412bc9cbe8..867970717b 100644
--- a/paddle/fluid/framework/CMakeLists.txt
+++ b/paddle/fluid/framework/CMakeLists.txt
@@ -48,10 +48,10 @@ if(WITH_GPU)
     nv_library(tensor SRCS tensor.cc .tensor_util.cu DEPS place memory data_type device_context)
     add_dependencies(tensor tensor_util)
   else()
-    nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
+    nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context )
   endif(WIN32)
 else()
-  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context)
+  cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context )
 endif()
 
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc
index 5b09cad06c..ef096c2b81 100644
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -28,8 +28,7 @@ void Tensor::check_memory_size() const {
       "or maybe the required data-type mismatches the data already stored.");
 }
 
-Tensor::Tensor(std::type_index type)
-    : type_(framework::ToDataType(type)), offset_(0) {}
+Tensor::Tensor(const proto::VarType::Type& dtype) : type_(dtype), offset_(0) {}
 
 size_t Tensor::memory_size() const {
   return holder_ == nullptr ? 0UL : holder_->size() - offset_;
diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h
index 2e110133a3..40606d9b06 100644
--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -69,7 +69,7 @@ class Tensor {
  public:
   Tensor() : type_(proto::VarType::FP32), offset_(0) {}
 
-  explicit Tensor(std::type_index type);
+  explicit Tensor(const proto::VarType::Type&);
 
   /*! Return a pointer to mutable memory block. */
   template <typename T>
diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
index cab6d9b67e..871c7bd2a7 100644
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/fluid/platform/temporary_allocator.h"
 
 namespace paddle {
 namespace framework {
@@ -151,5 +152,26 @@ void TensorToVector(const Tensor& src, std::vector<T>* dst) {
            src_ptr, size);
 }
 
+template <typename T>
+paddle::framework::Tensor GetTensor(
+    memory::allocation::AllocationPtr temp_allocation_ptr,
+    const framework::DDim& dim) {
+  auto& deleter = temp_allocation_ptr.get_deleter();
+  auto* allocation_ptr = temp_allocation_ptr.release();
+  auto shared_allocation =
+      std::shared_ptr<memory::allocation::Allocation>(allocation_ptr, deleter);
+
+  PADDLE_ENFORCE(
+      dynamic_cast<platform::TemporaryAllocation*>(allocation_ptr) != nullptr,
+      "The AllocationPtr must be TemporaryAllocation.");
+  PADDLE_ENFORCE_EQ(allocation_ptr->size(),
+                    framework::product(dim) * sizeof(T));
+
+  paddle::framework::Tensor temp_tensor(
+      framework::ToDataType(std::type_index(typeid(T))));
+  temp_tensor.Resize(dim);
+  temp_tensor.ResetHolder(std::move(shared_allocation));
+  return temp_tensor;
+}
 } // namespace framework
 } // namespace paddle
diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h
index 4a7b31c7d4..2519f5e7ac 100644
--- a/paddle/fluid/operators/conv_op.h
+++ b/paddle/fluid/operators/conv_op.h
@@ -18,11 +18,11 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/depthwise_conv.h"
 #include "paddle/fluid/operators/math/im2col.h"
 #include "paddle/fluid/operators/math/vol2col.h"
-#include "paddle/fluid/platform/create_tensor_with_allocationptr.h"
 
 namespace paddle {
 namespace operators {
@@ -161,10 +161,7 @@ class GemmConvKernel : public framework::OpKernel<T> {
       auto tmp_allocation_ptr =
           platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
               framework::product(col_shape) * sizeof(T));
-      Tensor tep_tensor =
-          platform::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
-
-      col.ShareDataWith(tep_tensor);
+      col = framework::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
       col_matrix.ShareDataWith(col);
       col_matrix.Resize(col_matrix_shape);
     }
@@ -299,10 +296,7 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
       auto tmp_allocation_ptr =
           platform::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
               framework::product(col_shape) * sizeof(T));
-      Tensor tep_tensor =
-          platform::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
-
-      col.ShareDataWith(tep_tensor);
+      col = framework::GetTensor<T>(std::move(tmp_allocation_ptr), col_shape);
       col_matrix.ShareDataWith(col);
       col_matrix.Resize(col_matrix_shape);
     }
diff --git a/paddle/fluid/operators/math/concat_and_split.cu b/paddle/fluid/operators/math/concat_and_split.cu
index b10a19b658..e925e7bb59 100644
--- a/paddle/fluid/operators/math/concat_and_split.cu
+++ b/paddle/fluid/operators/math/concat_and_split.cu
@@ -131,8 +131,9 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
     int in_col = input[0].numel() / in_row;
     int out_row = in_row, out_col = 0;
-    std::vector<T*> inputs_data(in_num);
+    std::vector<const T*> inputs_data;
     std::vector<int> inputs_col(in_num + 1);
+    inputs_data.reserve(in_num);
     inputs_col[0] = 0;
     bool sameShape = true;
@@ -143,7 +144,7 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
       }
       out_col += t_cols;
       inputs_col[i + 1] = out_col;
-      inputs_data[i] = const_cast<T*>(input[i].data<T>());
+      inputs_data.emplace_back(input[i].data<T>());
     }
 
     // computation
diff --git a/paddle/fluid/platform/create_tensor_with_allocationptr.h b/paddle/fluid/platform/create_tensor_with_allocationptr.h
deleted file mode 100644
index 00fcc5f862..0000000000
--- a/paddle/fluid/platform/create_tensor_with_allocationptr.h
+++ /dev/null
@@ -1,42 +0,0 @@
-// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/temporary_allocator.h"
-namespace paddle {
-namespace platform {
-
-template <typename T>
-paddle::framework::Tensor GetTensor(
-    memory::allocation::AllocationPtr temp_allocation_ptr,
-    const framework::DDim &dim) {
-  auto &deleter = temp_allocation_ptr.get_deleter();
-  auto *allocation_ptr = temp_allocation_ptr.release();
-  auto shared_allocation =
-      std::shared_ptr<memory::allocation::Allocation>(allocation_ptr, deleter);
-
-  PADDLE_ENFORCE(dynamic_cast<TemporaryAllocation *>(allocation_ptr) != nullptr,
-                 "The AllocationPtr must be TemporaryAllocation.");
-  PADDLE_ENFORCE_EQ(allocation_ptr->size(),
-                    framework::product(dim) * sizeof(T));
-
-  paddle::framework::Tensor temp_tensor(std::type_index(typeid(T)));
-  temp_tensor.Resize(dim);
-  temp_tensor.ResetHolder(std::move(shared_allocation));
-  return temp_tensor;
-}
-
-} // namespace platform
-} // namespace paddle
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 81c443d758..022afb686b 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -256,10 +256,11 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
   LOG_FIRST_N(WARNING, 1) << "Please NOTE: device: " << place_.device
                           << ", CUDA Capability: " << compute_capability_
-                          << ", Driver Version: " << driver_version_ / 1000
+                          << ", Driver API Version: " << driver_version_ / 1000
                           << "." << (driver_version_ % 100) / 10
-                          << ", Runtime Version: " << runtime_version_ / 1000
-                          << "." << (runtime_version_ % 100) / 10;
+                          << ", Runtime API Version: "
+                          << runtime_version_ / 1000 << "."
+                          << (runtime_version_ % 100) / 10;
   size_t cudnn_dso_ver = dynload::cudnnGetVersion();
   LOG_FIRST_N(WARNING, 1) << "device: " << place_.device
                           << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "."
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index af9744dcb8..7e87580189 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -41,7 +41,28 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 
-/*! \brief device temporary allocator singleton */
+/*! \brief device temporary allocator singleton.
+ *
+ * Some operators need temporary memory during computation, for example,
+ * conv_gemm needs col to store the result of im2col. If that memory were
+ * taken from the stack and handed to a CUDA kernel, we would have to call
+ * ctx->Wait() before the computation returns, because CUDA execution is
+ * asynchronous; without ctx->Wait(), the temporary memory would be
+ * released before the CUDA kernel uses it.
+ *
+ * DeviceTemporaryAllocator is a singleton that holds one
+ * `TemporaryAllocator` for each place and stream. Each TemporaryAllocator
+ * keeps a temp_allocation_queue that stores the temporary allocations.
+ * An allocation returned by TemporaryAllocator is a unique_ptr; once it is
+ * no longer held by any variable, it is pushed into the
+ * temp_allocation_queue. There are two opportunities to free the
+ * allocations in temp_allocation_queue:
+ * - when the stream calls cudaStreamSynchronize;
+ * - when the total size of the queued allocations exceeds a certain
+ *   threshold (defined by FLAGS_limit_of_temporary_allocation).
+ */
 class DeviceTemporaryAllocator {
  public:
   static DeviceTemporaryAllocator& Instance() {
diff --git a/paddle/fluid/platform/temporary_allocator.h b/paddle/fluid/platform/temporary_allocator.h
index 4e32d2d695..812c4a3331 100644
--- a/paddle/fluid/platform/temporary_allocator.h
+++ b/paddle/fluid/platform/temporary_allocator.h
@@ -29,6 +29,19 @@ class TemporaryAllocation : public memory::allocation::Allocation {
   memory::allocation::AllocationPtr underlying_allocation_;
 };
 
+/*! \brief TemporaryAllocator is used to allocate the temporary memory
+ * used by CUDA's asynchronous operations.
+ *
+ * The TemporaryAllocator keeps a temp_allocation_queue that stores the
+ * temporary allocations. An allocation returned by TemporaryAllocator is a
+ * unique_ptr; once it is no longer held by any variable, it is pushed into
+ * the temp_allocation_queue.
+ *
+ * There is one opportunity to free the allocations in
+ * temp_allocation_queue:
+ * - when the total size of the queued allocations exceeds a certain
+ *   threshold (defined by FLAGS_limit_of_temporary_allocation).
+ */
 class TemporaryAllocator : public memory::allocation::Allocator {
  public:
   explicit TemporaryAllocator(platform::Place place);
diff --git a/paddle/fluid/platform/temporary_allocator_test.cc b/paddle/fluid/platform/temporary_allocator_test.cc
index 3b940b0e82..e4e5be5b89 100644
--- a/paddle/fluid/platform/temporary_allocator_test.cc
+++ b/paddle/fluid/platform/temporary_allocator_test.cc
@@ -14,8 +14,7 @@
 
 #include "paddle/fluid/platform/temporary_allocator.h"
 #include <gtest/gtest.h>
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/platform/create_tensor_with_allocationptr.h"
+#include "paddle/fluid/framework/tensor_util.h"
 DECLARE_double(limit_of_temporary_allocation);
 namespace paddle {
@@ -47,6 +46,7 @@ TEST(temporary_allocator, temporary_allocator) {
 
 TEST(temporary_allocator, add_callback) {
 #ifdef PADDLE_WITH_CUDA
+  const double limit = FLAGS_limit_of_temporary_allocation;
   FLAGS_limit_of_temporary_allocation = 10;
   platform::CUDAPlace gpu_place(0);
   TemporaryAllocator gpu_alloc(gpu_place);
@@ -63,7 +63,7 @@ TEST(temporary_allocator, add_callback) {
   });
   { gpu_alloc.Allocate(100); }
   PADDLE_ENFORCE(deleted);
-  FLAGS_limit_of_temporary_allocation = -1;
+  FLAGS_limit_of_temporary_allocation = limit;
 #endif
 }
 
@@ -75,8 +75,8 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) {
     auto allocation = cpu_alloc.Allocate(memory_size);
     void* address = allocation->ptr();
     int numel = memory_size / sizeof(float);
-    framework::Tensor tensor =
-        GetTensor<float>(std::move(allocation), framework::make_ddim({numel}));
+    framework::Tensor tensor = framework::GetTensor<float>(
+        std::move(allocation), framework::make_ddim({numel}));
     PADDLE_ENFORCE_EQ(address, tensor.data<float>());
     PADDLE_ENFORCE_EQ(tensor.numel(), numel);
   }
@@ -90,8 +90,8 @@ TEST(temporary_allocator, create_tensor_with_allocationptr) {
     auto allocation = gpu_alloc.Allocate(memory_size);
     void* address = allocation->ptr();
     int numel = memory_size / sizeof(float);
-    framework::Tensor tensor =
-        GetTensor<float>(std::move(allocation), framework::make_ddim({numel}));
+    framework::Tensor tensor = framework::GetTensor<float>(
+        std::move(allocation), framework::make_ddim({numel}));
     PADDLE_ENFORCE_EQ(address, tensor.data<float>());
     PADDLE_ENFORCE_EQ(tensor.numel(), numel);
   }
@@ -116,7 +116,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) {
   {
     auto allocation = cpu_alloc.Allocate(memory_size);
     address = allocation->ptr();
-    framework::Tensor tensor = GetTensor<float>(
+    framework::Tensor tensor = framework::GetTensor<float>(
         std::move(allocation), framework::make_ddim({numel}));
     PADDLE_ENFORCE_EQ(address, tensor.data<float>());
     PADDLE_ENFORCE_EQ(tensor.numel(), numel);
@@ -138,7 +138,7 @@ TEST(temporary_allocator, create_tensor_with_allocationptr2) {
   {
     auto allocation = gpu_alloc.Allocate(memory_size);
     address = allocation->ptr();
-    framework::Tensor tensor = GetTensor<float>(
+    framework::Tensor tensor = framework::GetTensor<float>(
        std::move(allocation), framework::make_ddim({numel}));
     PADDLE_ENFORCE_EQ(address, tensor.data<float>());
     PADDLE_ENFORCE_EQ(tensor.numel(), numel);
-- 
GitLab
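
Usage note: the sketch below is not part of the diff; it only illustrates how a kernel obtains a temporary buffer and wraps it in a Tensor after this change, mirroring the GemmConvKernel hunks above. MakeTemporaryTensor and DeviceContextT are illustrative names, not identifiers introduced by the patch; the allocator and GetTensor calls are the ones this patch moves into framework/tensor_util.h.

#include <utility>

#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/device_context.h"

// Build a temporary tensor of shape `shape` whose memory comes from the
// temporary allocator associated with `dev_ctx` (sketch only).
template <typename DeviceContextT, typename T>
paddle::framework::Tensor MakeTemporaryTensor(
    const DeviceContextT& dev_ctx, const paddle::framework::DDim& shape) {
  namespace fw = paddle::framework;
  namespace plat = paddle::platform;
  // Ask the per-device temporary allocator for product(shape) * sizeof(T)
  // bytes, exactly as the conv kernels do in this patch.
  auto tmp_allocation_ptr =
      plat::DeviceTemporaryAllocator::Instance().Get(dev_ctx).Allocate(
          fw::product(shape) * sizeof(T));
  // GetTensor (now in framework/tensor_util.h) takes ownership of the
  // allocation, checks that it is a TemporaryAllocation of the expected
  // size, and returns a Tensor of the requested shape backed by it.
  return fw::GetTensor<T>(std::move(tmp_allocation_ptr), shape);
}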