Merge remote-tracking branch 'wangkuiyi/memory_cpu_allocator' into cpu_mem

09d9794c · liaogang · f3294541 · e14e6873 · 09d9794c · 09d9794c
10 changed file
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
@@ -166,11 +166,21 @@ MultiGradientMachine::MultiGradientMachine(const ModelConfig& config,

  outArgStream_ = HPPL_STREAM_1;

+  start();
+}
+
+void MultiGradientMachine::start() {
  for (auto& thread : threads_) {
    thread->start();
  }
 }

+void MultiGradientMachine::finish() {
+  for (auto& thread : threads_) {
+    thread->stop();
+  }
+}
+
 std::vector<const std::vector<ParameterPtr>*>
 MultiGradientMachine::getSlaveParameters() {
  std::vector<const std::vector<ParameterPtr>*> vec;
@@ -326,12 +336,6 @@ void MultiGradientMachine::onPassEnd() {
  }
 }

-void MultiGradientMachine::finish() {
-  for (auto& thread : threads_) {
-    thread->stop();
-  }
-}
-
 Evaluator* MultiGradientMachine::makeEvaluator() const {
  return threads_[0]->getGradientMachine()->makeEvaluator();
 }
@@ -445,7 +449,7 @@ TrainerThread::TrainerThread(const ModelConfig& config,

  gradStream_ = HPPL_STREAM_2;
  valueStream_ = HPPL_STREAM_3;
-  stopping_ = false;
+  stopping_ = true;
  updateCounter_ = 0;
  parameterUpdated_ = false;
 }
@@ -453,6 +457,10 @@ TrainerThread::TrainerThread(const ModelConfig& config,
 TrainerThread::~TrainerThread() { stop(); }

 void TrainerThread::start() {
+  if (!stopping_) return;
+
+  stopping_ = false;
+
  gradientMachine_->start();

  computeThread_.reset(new std::thread([this]() { computeThread(); }));

--- a/paddle/gserver/gradientmachines/MultiGradientMachine.h
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.h
@@ -176,6 +176,10 @@ public:

  explicit MultiGradientMachine(const ModelConfig& config, bool useGpu);

+  virtual void start();
+
+  virtual void finish();
+
  virtual void prefetch(const std::vector<Argument>& inArgs);

  virtual void forward(const std::vector<Argument>& inArgs,
@@ -193,8 +197,6 @@ public:

  virtual void onPassEnd();

-  virtual void finish();
-
  virtual Evaluator* makeEvaluator() const;

  virtual void eval(Evaluator* evaluator) const;

--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
 add_subdirectory(detail)
+
+if(${WITH_GPU})
+  nv_library(memory SRCS memory.cc)
+else(${WITH_GPU})
+  cc_library(memory SRCS memroy.cc)
+endif(${WITH_GPU})
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
-cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc)
-nv_test(gpu_allocator_test SRCS gpu_allocator_test.cc)
+if(${WITH_GPU})
+  nv_test(system_allocator_test SRCS system_allocator_test.cc)
+else(${WITH_GPU})
+  cc_test(system_allocator_test SRCS system_allocator_test.cc)
+endif(${WITH_GPU})
--- a/paddle/memory/detail/cpu_allocator.h
+++ b/paddle/memory/detail/cpu_allocator.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stddef.h>  // for size_t
-#include <cstdlib>   // for malloc and free
-
-#ifndef _WIN32
-#include <sys/mman.h>  // for mlock and munlock
-#endif
-
-namespace paddle {
-namespace memory {
-namespace detail {
-
-// CPUAllocator<staging=true> calls mlock, which returns
-// pinned and locked memory as staging areas for data exchange
-// between host and device.  Allocates too much would reduce the
-// amount of memory available to the system for paging.  So, by
-// default, we should use CPUAllocator<staging=false>.
-template <bool staging>
-class CPUAllocator {
- public:
-  void* Alloc(size_t size);
-  void Free(void* p, size_t size);
-};
-
-template <>
-class CPUAllocator<false> {
- public:
-  void* Alloc(size_t size) { return std::malloc(size); }
-  void Free(void* p, size_t size) { std::free(p); }
-};
-
-template <>
-class CPUAllocator<true> {
- public:
-  void* Alloc(size_t size) {
-    void* p = std::malloc(size);
-    if (p == nullptr) {
-      return p;
-    }
-#ifndef _WIN32
-    mlock(p, size);
-#endif
-    return p;
-  }
-
-  void Free(void* p, size_t size) {
-#ifndef _WIN32
-    munlock(p, size);
-#endif
-    std::free(p);
-  }
-};
-
-}  // namespace detail
-}  // namespace memory
-}  // namespace paddle
--- a/paddle/memory/detail/cpu_allocator_test.cc
+++ b/paddle/memory/detail/cpu_allocator_test.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/memory/detail/cpu_allocator.h"
-#include "gtest/gtest.h"
-
-TEST(CPUAllocator, NonStaging) {
-  paddle::memory::detail::CPUAllocator<false> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p, 4096);
-}
-
-TEST(CPUAllocator, Staging) {
-  paddle::memory::detail::CPUAllocator<true> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p, 4096);
-}
--- a/paddle/memory/detail/gpu_allocator.h
+++ b/paddle/memory/detail/gpu_allocator.h
@@ -14,79 +14,116 @@ limitations under the License. */

 #pragma once

-#include <stddef.h>  // for size_t
+#include <stddef.h>    // for size_t
+#include <sys/mman.h>  // for mlock and munlock
+#include <cstdlib>     // for malloc and free

+#ifndef PADDLE_ONLY_CPU
 #include <thrust/system/cuda/error.h>
 #include <thrust/system_error.h>
+#endif  // PADDLE_ONLY_CPU
+
+#include "paddle/platform/assert.h"

 namespace paddle {
 namespace memory {
 namespace detail {

+class CPUDeleter {
+ public:
+  CPUDeleter(void* ptr, size_t size, bool locked)
+      : ptr_(ptr), size_(size), locked_(locked) {}
+
+  void* Ptr() { return ptr_; }
+
+  void operator()(void* ptr) {
+    PADDLE_ASSERT(ptr == ptr_);
+    if (ptr_ != nullptr && locked_) {
+      munlock(ptr_, size_);
+    }
+    std::free(ptr_);
+  }
+
+ private:
+  void* ptr_;
+  size_t size_;
+  bool locked_;
+};
+
+// CPUAllocator<lock_memory=true> calls mlock, which returns pinned
+// and locked memory as staging areas for data exchange between host
+// and device.  Allocates too much would reduce the amount of memory
+// available to the system for paging.  So, by default, we should use
+// CPUAllocator<staging=false>.
+template <bool lock_memory>
+class CPUAllocator {
+ public:
+  static CPUDeleter Alloc(size_t size) {
+    void* p = std::malloc(size);
+    if (p != nullptr && lock_memory) {
+      mlock(p, size);
+    }
+    return CPUDeleter(p, size, lock_memory);
+  }
+};
+
+#ifndef PADDLE_ONLY_CPU  // The following code are for CUDA.
+
+namespace {
 inline void throw_on_error(cudaError_t e, const char* message) {
  if (e) {
    throw thrust::system_error(e, thrust::cuda_category(), message);
  }
 }
+}  // namespace

-// GPUAllocator<staging=true> calls cudaHostMalloc, which returns
-// pinned and locked memory as staging areas for data exchange
-// between host and device.  Allocates too much would reduce the
-// amount of memory available to the system for paging.  So, by
-// default, we should use GPUAllocator<staging=false>.
-template <bool staging>
-class GPUAllocator {
+class GPUDeleter {
 public:
-  void* Alloc(size_t size);
-  void Free(void* p, size_t size);
-};
+  GPUDeleter(void* ptr, size_t size, bool staging)
+      : ptr_(ptr), size_(size), staging_(staging) {}

-template <>
-class GPUAllocator<false> {
- public:
-  void* Alloc(size_t size) {
-    void* p = 0;
-    cudaError_t result = cudaMalloc(&p, size);
-    if (result == cudaSuccess) {
-      return p;
-    }
-    // clear last error
-    cudaGetLastError();
-    return nullptr;
-  }
+  void* Ptr() { return ptr_; }

-  void Free(void* p, size_t size) {
+  void operator()(void* ptr) {
+    PADDLE_ASSERT(ptr == ptr_);
    // Purposefully allow cudaErrorCudartUnloading, because
    // that is returned if you ever call cudaFree after the
    // driver has already shutdown. This happens only if the
    // process is terminating, in which case we don't care if
    // cudaFree succeeds.
-    auto err = cudaFree(p);
+    cudaError_t err = staging_ ? cudaFreeHost(ptr) : cudaFree(ptr);
    if (err != cudaErrorCudartUnloading) {
-      throw_on_error(err, "cudaFree failed");
+      throw_on_error(err, "cudaFree{Host} failed");
    }
  }
+
+ private:
+  void* ptr_;
+  size_t size_;
+  bool staging_;
 };

-template <>
-class GPUAllocator<true> {
+// GPUAllocator<staging=true> calls cudaHostMalloc, which returns
+// pinned and locked memory as staging areas for data exchange
+// between host and device.  Allocates too much would reduce the
+// amount of memory available to the system for paging.  So, by
+// default, we should use GPUAllocator<staging=false>.
+template <bool staging>
+class GPUAllocator {
 public:
-  void* Alloc(size_t size) {
+  static GPUDeleter Alloc(size_t size) {
    void* p = 0;
-    cudaError_t result = cudaMallocHost(&p, size);
-    if (result == cudaSuccess) {
-      return p;
+    cudaError_t result =
+        staging ? cudaMallocHost(&p, size) : cudaMalloc(&p, size);
+    if (result != cudaSuccess) {
+      cudaGetLastError();  // clear error if there is any.
    }
-    // clear last error
-    cudaGetLastError();
-    return nullptr;
-  }
-
-  void Free(void* p, size_t size) {
-    throw_on_error(cudaFreeHost(p), "cudaFreeHost failed");
+    return GPUDeleter(result == cudaSuccess ? p : nullptr, size, staging);
  }
 };

+#endif  // PADDLE_ONLY_CPU
+
 }  // namespace detail
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/memory/detail/gpu_allocator_test.cc
+++ b/paddle/memory/detail/gpu_allocator_test.cc
@@ -12,19 +12,39 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/memory/detail/gpu_allocator.h"
+#include "paddle/memory/detail/system_allocator.h"
+
+#include <memory>
+#include <vector>
+
 #include "gtest/gtest.h"

-TEST(GPUAllocator, NonStaging) {
-  paddle::memory::detail::GPUAllocator<false> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p, 4096);
+template <typename Allocator>
+void TestAllocator() {
+  {
+    auto d = Allocator::Alloc(sizeof(int));
+    EXPECT_NE(d.Ptr(), nullptr);
+    std::unique_ptr<int> p(static_cast<int*>(d.Ptr()), d);
+  }
+  {
+    auto d = Allocator::Alloc(0);
+    EXPECT_EQ(d.Ptr(), nullptr);
+    std::unique_ptr<int> p(static_cast<int*>(d.Ptr()), d);
+  }
+}
+
+TEST(CPUAllocator, NoLockMem) {
+  TestAllocator<paddle::memory::detail::CPUAllocator<false>>();
+}
+TEST(CPUAllocator, LockMem) {
+  TestAllocator<paddle::memory::detail::CPUAllocator<true>>();
 }

+#ifndef PADDLE_ONLY_CPU
+TEST(GPUAllocator, NoStaging) {
+  TestAllocator<paddle::memory::detail::GPUAllocator<false>>();
+}
 TEST(GPUAllocator, Staging) {
-  paddle::memory::detail::GPUAllocator<true> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p, 4096);
+  TestAllocator<paddle::memory::detail::GPUAllocator<true>>();
 }
+#endif  // PADDLE_ONLY_CPU
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -14,48 +14,41 @@ limitations under the License. */

 #include "paddle/memory/memory.h"

+#include "paddle/memory/detail/cpu_allocator.h"
+#include "paddle/memory/detail/gpu_allocator.h"
+
 namespace paddle {
 namespace memory {

-template <>
-void* Alloc<CPUPlace>(CPUPlace, size_t size) {
-  return GetCPUBuddyAllocator(false /*non-staging*/)->Alloc(size);
-}
-
-void* AllocStaging(CPUPlace, size_t size) {
-  return GetCPUBuddyAllocator(true /*staging*/)->Alloc(size);
-}
-
-template <>
-void* Alloc<GPUPlace>(GPUPlace pl, size_t size) {
-  return GetGPUBuddyAllocator(pl.device)->Alloc(size);
-}
-
-template <>
-void Free<CPUPlace>(CPUPlace, void* p) {
-  return GetCPUBuddyAllocator(false /*non-staging*/)->Free(p);
-}
-
-void FreeStaging(CPUPlace, void* p) {
-  return GetCPUBuddyAllocator(false /*non-staging*/)->Free(p);
-}
-
-#ifdef PADDLE_WITH_GPU
-template <>
-void* Alloc<GPUPlace>(GPUPlace pl, void* p) {
-  return GetGPUBuddyAllocator(pl.device)->Free(p);
-}
-
-template <>
-size_t Used<CPUPlace>(CPUPlace) {
+void Alloc(paddle::platform::Place pl, size_t size) {
+#ifndef PADDLE_ONLY_CPU
+  if (paddle::platform::is_gpu_place(pl)) {
+    return GetGPUBuddyAllocator(pl.device)->Alloc(size);
+  }
+#endif  // PADDLE_ONLY_CPU
+  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
+  return GetCPUBuddyAllocator()->Alloc(size);
+}
+
+void Free(paddle::platform::Place pl, void* p) {
+#ifndef PADDLE_ONLY_CPU
+  if (paddle::platform::is_gpu_place(pl)) {
+    GetGPUBuddyAllocator(pl.device)->Free(p);
+  }
+#endif  // PADDLE_ONLY_CPU
+  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
+  GetCPUBuddyAllocator()->Free(p);
+}
+
+size_t Used(paddle::platform::Place pl) {
+#ifndef PADDLE_ONLY_CPU
+  if (paddle::platform::is_gpu_place(pl)) {
+    return GetGPUBuddyAllocator(pl.device)->Used();
+  }
+#endif  // PADDLE_ONLY_CPU
+  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
  return GetCPUBuddyAllocator()->Used();
 }

-template <>
-size_t Alloc<GPUPlace>(GPUPlace pl) {
-  return GetGPUBuddyAllocator(pl.device)->Used();
-}
-#endif  // PADDLE_WITH_GPU
-
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -19,19 +19,9 @@ limitations under the License. */
 namespace paddle {
 namespace memory {

-template <typename paddle::framework::Place>
-void* Alloc(Place, size_t);
-template <typename paddle::framework::Place>
-void Free(Place, void*);
-template <typename paddle::framework::Place>
-size_t Used(Place);
-
-// Staging memory means "pinned" host memory that can be mapped into
-// the CUDA memory space and accessed by the device rapidly.  Don't
-// allocate too much staging memory; otherwise system performance will
-// degrade because the OS cannot find enough swap memory space.
-void* AllocStaging(CPUPlace, size_t);
-void* FreeStaging(CPUPlace, size_t);
+void* Alloc(paddle::framework::Place, size_t);
+void Free(paddle::framework::Place, void*);
+size_t Used(paddle::framework::Place);

 }  // namespace memory
 }  // namespace paddle