Add communication attr

31270e58 · Yu Yang · 8e3fdc6e · 31270e58 · 31270e58 · 31270e58
11 changed files
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -32,6 +32,7 @@ size_t Tensor::memory_size() const {
 }
 void* Tensor::mutable_data(platform::Place place, std::type_index type,
+                           memory::Allocator::Attr attr,
                           size_t requested_size) {
  type_ = type;
  PADDLE_ENFORCE_GE(numel(), 0,
@@ -46,17 +47,18 @@ void* Tensor::mutable_data(platform::Place place, std::type_index type,
  /* some versions of boost::variant don't have operator!= */
  if (holder_ == nullptr || !(holder_->place() == place) ||
      holder_->size() < size + offset_) {
-    holder_ = memory::AllocShared(place, size);
+    holder_ = memory::AllocShared(place, size, attr);
    offset_ = 0;
  }
  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                 offset_);
 }
-void* Tensor::mutable_data(platform::Place place, size_t requested_size) {
+void* Tensor::mutable_data(platform::Place place, memory::Allocator::Attr attr,
+                           size_t requested_size) {
  PADDLE_ENFORCE(this->holder_ != nullptr,
                 "Cannot invoke mutable data if current hold nothing.");
-  return mutable_data(place, type_, requested_size);
+  return mutable_data(place, type_, attr, requested_size);
 }
 Tensor& Tensor::ShareDataWith(const Tensor& src) {

--- a/paddle/fluid/framework/tensor.h
+++ b/paddle/fluid/framework/tensor.h
@@ -84,12 +84,17 @@ class Tensor {
   * @note    If not exist, then allocation.
   */
  template <typename T>
-  T* mutable_data(platform::Place place, size_t requested_size = 0);
+  T* mutable_data(platform::Place place,
+                  memory::Allocator::Attr attr = memory::Allocator::kDefault,
+                  size_t requested_size = 0);
  void* mutable_data(platform::Place place, std::type_index type,
+                     memory::Allocator::Attr attr = memory::Allocator::kDefault,
                     size_t requested_size = 0);
-  void* mutable_data(platform::Place place, size_t requested_size = 0);
+  void* mutable_data(platform::Place place,
+                     memory::Allocator::Attr attr = memory::Allocator::kDefault,
+                     size_t requested_size = 0);
  /**
   * @brief     Return a pointer to mutable memory block.
@@ -101,7 +106,9 @@ class Tensor {
   * @note      If not exist, then allocation.
   */
  template <typename T>
-  T* mutable_data(DDim dims, platform::Place place, size_t requested_size = 0);
+  T* mutable_data(DDim dims, platform::Place place,
+                  memory::Allocator::Attr attr = memory::Allocator::kDefault,
+                  size_t requested_size = 0);
  /*! Return the dimensions of the memory block. */
  const DDim& dims() const;

--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -47,16 +47,20 @@ inline T* Tensor::data() {
 template <typename T>
 inline T* Tensor::mutable_data(DDim dims, platform::Place place,
+                               memory::Allocator::Attr attr,
                               size_t requested_size) {
  static_assert(std::is_pod<T>::value, "T must be POD");
  Resize(dims);
-  return mutable_data<T>(place, requested_size);
+  return mutable_data<T>(place, attr, requested_size);
 }
 template <typename T>
-inline T* Tensor::mutable_data(platform::Place place, size_t requested_size) {
+inline T* Tensor::mutable_data(platform::Place place,
+                               memory::Allocator::Attr attr,
+                               size_t requested_size) {
  static_assert(std::is_pod<T>::value, "T must be POD");
-  return reinterpret_cast<T*>(mutable_data(place, typeid(T), requested_size));
+  return reinterpret_cast<T*>(
+      mutable_data(place, typeid(T), attr, requested_size));
 }
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {

--- a/paddle/fluid/memory/allocation/CMakeLists.txt
+++ b/paddle/fluid/memory/allocation/CMakeLists.txt
@@ -25,9 +25,9 @@ endif()
 cc_library(naive_managed_allocator SRCS naive_managed_allocator.cc DEPS allocator)
 cc_test(naive_managed_allocator_test SRCS naive_managed_allocator_test.cc DEPS naive_managed_allocator)
+nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
 if (WITH_GPU)
-    set(AllocatorFacadeDeps gpu_info cuda_allocator)
+    set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator)
 else ()
    set(AllocatorFacadeDeps)
 endif()

--- a/paddle/fluid/memory/allocation/allocator.h
+++ b/paddle/fluid/memory/allocation/allocator.h
@@ -60,7 +60,8 @@ class Allocator {
    kFixedHuge = 2,
    kFluxHuge = 3,
    kTmp = 4,
-    NumOfAttrs = 5
+    kCommunication = 5,
+    NumOfAttrs = 6
  };
  virtual ~Allocator();

--- a/paddle/fluid/memory/allocation/allocator_facade.cc
+++ b/paddle/fluid/memory/allocation/allocator_facade.cc
@@ -21,6 +21,7 @@
 #include "paddle/fluid/memory/allocation/cpu_allocator.h"
 #include "paddle/fluid/memory/allocation/locked_allocator.h"
 #include "paddle/fluid/memory/allocation/naive_managed_allocator.h"
+#include "paddle/fluid/memory/allocation/pinned_allocator.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #include "paddle/fluid/platform/gpu_info.h"
 #include "paddle/fluid/platform/place.h"
@@ -32,6 +33,35 @@ namespace paddle {
 namespace memory {
 namespace allocation {
+// Managed allocator that dispatches on the requested attribute: requests
+// tagged kCommunication are served by an allocator wrapping
+// CPUPinnedAllocator, every other attribute is served by an allocator
+// wrapping CPUAllocator.
+// NOTE(review): CPUPinnedAllocator is constructed unconditionally, while the
+// CMake change only adds pinned_allocator to AllocatorFacadeDeps when
+// WITH_GPU is set -- confirm that CPU-only builds still compile and link.
+// NOTE(review): IsAllocThreadSafe is not overridden in this class; verify
+// that the inherited answer is the one callers expect.
+class CPUManagedAllocator : public ManagedAllocator {
+ public:
+  // Builds both underlying allocators eagerly via NaiveManagedAllocator.
+  CPUManagedAllocator()
+      : normal_allocator_(NaiveManagedAllocator::Create(
+            std::unique_ptr<Allocator>(new CPUAllocator()))),
+        communication_allocator_(NaiveManagedAllocator::Create(
+            std::unique_ptr<Allocator>(new CPUPinnedAllocator()))) {}
+  // Returns a uniquely owned allocation of `size` bytes; `attr` selects the
+  // underlying allocator and is forwarded to it unchanged.
+  std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override {
+    if (attr == kCommunication) {
+      return communication_allocator_->Allocate(size, attr);
+    } else {
+      return normal_allocator_->Allocate(size, attr);
+    }
+  }
+  // Same dispatch as Allocate, but returns a shared allocation.
+  std::shared_ptr<Allocation> AllocateShared(size_t size, Attr attr) override {
+    if (attr == kCommunication) {
+      return communication_allocator_->AllocateShared(size, attr);
+    } else {
+      return normal_allocator_->AllocateShared(size, attr);
+    }
+  }
+ private:
+  // Serves every attribute other than kCommunication.
+  std::shared_ptr<ManagedAllocator> normal_allocator_;
+  // Serves kCommunication requests only.
+  std::shared_ptr<ManagedAllocator> communication_allocator_;
+};
 class AllocatorFacadePrivate {
 public:
  std::map<platform::Place, std::shared_ptr<ManagedAllocator>> allocators_;
@@ -52,10 +82,7 @@ class AllocatorFacadePrivate {
 private:
  void InitCPUAllocator() {
-    auto all = NaiveManagedAllocator::Create(
+    allocators_[platform::CPUPlace()] = std::make_shared<CPUManagedAllocator>();
-        std::unique_ptr<Allocator>(new CPUAllocator()));
-    allocators_[platform::CPUPlace()] = all;
  }
  void InitCUDAAllocator() {

--- a/paddle/fluid/memory/allocation/pinned_allocator.cc
+++ b/paddle/fluid/memory/allocation/pinned_allocator.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/memory/allocation/pinned_allocator.h"
+#include <cuda.h>
+#include <cuda_runtime.h>
+namespace paddle {
+namespace memory {
+namespace allocation {
+// Allocates `size` bytes of host memory with cudaMallocHost and returns the
+// pointer wrapped in a CPUPinnedAllocation.
+// Only the kCommunication attribute is accepted; any other value fails the
+// PADDLE_ENFORCE_EQ check before anything is allocated.
+std::unique_ptr<Allocation> CPUPinnedAllocator::Allocate(size_t size,
+                                                         Allocator::Attr attr) {
+  PADDLE_ENFORCE_EQ(
+      attr, kCommunication,
+      "CPUPinnedAllocator should be used for Cross-Device Communication");
+  // Left uninitialized on purpose: it is only read after cudaMallocHost has
+  // passed the PADDLE_ENFORCE check on the next line.
+  void* ptr;
+  PADDLE_ENFORCE(cudaMallocHost(&ptr, size));
+  return std::unique_ptr<CPUPinnedAllocation>(
+      new CPUPinnedAllocation(ptr, size));
+}
+// Releases the allocation's pointer with cudaFreeHost, after checking that
+// `allocation` really is a CPUPinnedAllocation (the dynamic_cast yields
+// nullptr otherwise and the enforce fails).
+void CPUPinnedAllocator::Free(Allocation* allocation) {
+  PADDLE_ENFORCE_NOT_NULL(dynamic_cast<CPUPinnedAllocation*>(allocation));
+  PADDLE_ENFORCE(cudaFreeHost(allocation->ptr()));
+}
+// Always reports true; this allocator keeps no state of its own.
+bool CPUPinnedAllocator::IsAllocThreadSafe() const { return true; }
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/fluid/memory/allocation/pinned_allocator.h
+++ b/paddle/fluid/memory/allocation/pinned_allocator.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include "paddle/fluid/memory/allocation/allocator.h"
+namespace paddle {
+namespace memory {
+namespace allocation {
+// Allocation record produced by CPUPinnedAllocator. It stores the raw
+// pointer and size, and always reports platform::CPUPlace() as its place.
+class CPUPinnedAllocation : public Allocation {
+ public:
+  CPUPinnedAllocation(void* ptr, size_t size)
+      : Allocation(ptr, size, platform::CPUPlace()) {}
+};
+// Unmanaged allocator for host memory obtained through the CUDA runtime;
+// the definitions in pinned_allocator.cc use cudaMallocHost / cudaFreeHost.
+class CPUPinnedAllocator : public UnmanagedAllocator {
+ public:
+  // Accepts only Attr kCommunication (enforced in the definition).
+  std::unique_ptr<Allocation> Allocate(size_t size, Attr attr) override;
+  // Expects an allocation that is a CPUPinnedAllocation.
+  void Free(Allocation* allocation) override;
+  bool IsAllocThreadSafe() const override;
+};
+}  // namespace allocation
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/fluid/operators/conv_mkldnn_op.cc
+++ b/paddle/fluid/operators/conv_mkldnn_op.cc
@@ -303,7 +303,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    bool fuse_eltwise = ctx.Attr<bool>("fuse_eltwise");
    int groups = ctx.Attr<int>("groups");
-    // TODO: add support for dilation
+    // TODO: add support for dilation  // NOLINT
    PADDLE_ENFORCE(
        dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
        "dilation in convolution is not implemented yet");
@@ -386,8 +386,9 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    auto user_weights_memory_p = handler.AcquireWeightsMemory(
        user_weights_md, to_void_cast<T>(filter_data));
-    T* output_data =
+    T* output_data = output->mutable_data<T>(
-        output->mutable_data<T>(ctx.GetPlace(), handler.GetDstMemorySize());
+        ctx.GetPlace(), paddle::memory::Allocator::kDefault,
+        handler.GetDstMemorySize());
    // create reorder primitive if the input format is not the preferred one
    auto src_memory_p =
        handler.AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
@@ -626,7 +627,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
              user_diff_dst_memory_p, pipeline);
      const size_t size = handler.GetDiffWeightsMemorySize();
-      filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace(), size);
+      filter_grad_data = filter_grad->mutable_data<T>(
+          ctx.GetPlace(), paddle::memory::Allocator::kDefault, size);
      auto diff_weights_memory_p =
          handler.AcquireDiffWeightsMemoryFromWeightsPrimitive(
@@ -651,7 +653,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
                                                        pipeline);
      const size_t size = handler.GetDiffSourceMemorySize();
-      input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace(), size);
+      input_grad_data = input_grad->mutable_data<T>(
+          ctx.GetPlace(), paddle::memory::Allocator::kDefault, size);
      auto diff_src_memory_p = handler.AcquireDiffSrcMemoryFromDataPrimitive(
          reinterpret_cast<void*>(input_grad_data));

--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -112,17 +112,16 @@ T TensorGetElement(const framework::Tensor &self, size_t offset) {
  }
 }
-// TODO(dzhwinter) : fix the redundent Tensor allocate and free
+// TODO(dzhwinter) : fix the redundant Tensor allocate and free
 template <typename T>
 void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
  if (platform::is_gpu_place(self->place())) {
-    std::shared_ptr<framework::Tensor> dst(new framework::Tensor);
+    framework::Tensor dst;
-    framework::TensorCopySync(*self, platform::CPUPlace(), dst.get());
+    framework::TensorCopySync(*self, platform::CPUPlace(), &dst);
-    dst->data<T>()[offset] = elem;
+    dst.mutable_data<T>(platform::CPUPlace())[offset] = elem;
-    framework::TensorCopySync(*dst.get(), self->place(), self);
+    framework::TensorCopySync(dst, self->place(), self);
  } else if (platform::is_cpu_place(self->place())) {
-    self->data<T>()[offset] = elem;
+    self->mutable_data<T>(self->place())[offset] = elem;
  }
 }

--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -113,7 +113,7 @@ class TestConv2dOp(OpTest):
            return
        place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace()
        self.check_grad_with_place(
-            place, set(['Input', 'Filter']), 'Output', max_relative_error=0.02)
+            place, {'Input', 'Filter'}, 'Output', max_relative_error=0.02)
    def test_check_grad_no_filter(self):
        if self.dtype == np.float16: