Commit 09d9794c authored by liaogang

Merge remote-tracking branch 'wangkuiyi/memory_cpu_allocator' into cpu_mem

paddle/gserver/gradientmachines/MultiGradientMachine.cpp:

@@ -166,11 +166,21 @@ MultiGradientMachine::MultiGradientMachine(const ModelConfig& config,
   outArgStream_ = HPPL_STREAM_1;
 
+  start();
+}
+
+void MultiGradientMachine::start() {
   for (auto& thread : threads_) {
     thread->start();
   }
 }
 
+void MultiGradientMachine::finish() {
+  for (auto& thread : threads_) {
+    thread->stop();
+  }
+}
+
 std::vector<const std::vector<ParameterPtr>*>
 MultiGradientMachine::getSlaveParameters() {
   std::vector<const std::vector<ParameterPtr>*> vec;
@@ -326,12 +336,6 @@ void MultiGradientMachine::onPassEnd() {
   }
 }
 
-void MultiGradientMachine::finish() {
-  for (auto& thread : threads_) {
-    thread->stop();
-  }
-}
-
 Evaluator* MultiGradientMachine::makeEvaluator() const {
   return threads_[0]->getGradientMachine()->makeEvaluator();
 }
@@ -445,7 +449,7 @@ TrainerThread::TrainerThread(const ModelConfig& config,
   gradStream_ = HPPL_STREAM_2;
   valueStream_ = HPPL_STREAM_3;
-  stopping_ = false;
+  stopping_ = true;
   updateCounter_ = 0;
   parameterUpdated_ = false;
 }
@@ -453,6 +457,10 @@ TrainerThread::TrainerThread(const ModelConfig& config,
 TrainerThread::~TrainerThread() { stop(); }
 
 void TrainerThread::start() {
+  if (!stopping_) return;
+
+  stopping_ = false;
+
   gradientMachine_->start();
   computeThread_.reset(new std::thread([this]() { computeThread(); }));
...
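The hunks above move thread start-up out of the constructor's inline loop into an explicit start()/finish() pair guarded by stopping_, which is now initialized to true so that the first start() call proceeds and repeated calls are ignored. Below is a minimal standalone sketch of that guard pattern; it is not PaddlePaddle code, and the Workers class with its four-thread pool is made up for illustration.

#include <atomic>
#include <thread>
#include <vector>

class Workers {
 public:
  Workers() : stopping_(true) {}        // mirrors stopping_ = true in the diff

  void start() {
    if (!stopping_) return;             // already running: ignore repeated start()
    stopping_ = false;
    for (int i = 0; i < 4; ++i) {
      threads_.emplace_back([this] {
        while (!stopping_) { /* one unit of work per iteration */ }
      });
    }
  }

  void finish() {
    if (stopping_) return;              // nothing to stop
    stopping_ = true;
    for (auto& t : threads_) t.join();  // stand-in for thread->stop()
    threads_.clear();
  }

 private:
  std::atomic<bool> stopping_;          // the real class needs equivalent synchronization
  std::vector<std::thread> threads_;
};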
paddle/gserver/gradientmachines/MultiGradientMachine.h:

@@ -176,6 +176,10 @@ public:
   explicit MultiGradientMachine(const ModelConfig& config, bool useGpu);
 
+  virtual void start();
+
+  virtual void finish();
+
   virtual void prefetch(const std::vector<Argument>& inArgs);
 
   virtual void forward(const std::vector<Argument>& inArgs,
@@ -193,8 +197,6 @@ public:
   virtual void onPassEnd();
 
-  virtual void finish();
-
   virtual Evaluator* makeEvaluator() const;
 
   virtual void eval(Evaluator* evaluator) const;
...
CMake changes under paddle/memory:

 add_subdirectory(detail)
 
+if(${WITH_GPU})
+  nv_library(memory SRCS memory.cc)
+else(${WITH_GPU})
+  cc_library(memory SRCS memory.cc)
+endif(${WITH_GPU})
 
-cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc)
-nv_test(gpu_allocator_test SRCS gpu_allocator_test.cc)
+if(${WITH_GPU})
+  nv_test(system_allocator_test SRCS system_allocator_test.cc)
+else(${WITH_GPU})
+  cc_test(system_allocator_test SRCS system_allocator_test.cc)
+endif(${WITH_GPU})
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stddef.h> // for size_t
#include <cstdlib> // for malloc and free
#ifndef _WIN32
#include <sys/mman.h> // for mlock and munlock
#endif
namespace paddle {
namespace memory {
namespace detail {
// CPUAllocator<staging=true> calls mlock to pin and lock the allocated
// memory so it can serve as staging areas for data exchange between
// host and device. Allocating too much pinned memory would reduce the
// amount of memory available to the system for paging. So, by default,
// we should use CPUAllocator<staging=false>.
template <bool staging>
class CPUAllocator {
 public:
  void* Alloc(size_t size);
  void Free(void* p, size_t size);
};

template <>
class CPUAllocator<false> {
 public:
  void* Alloc(size_t size) { return std::malloc(size); }
  void Free(void* p, size_t size) { std::free(p); }
};

template <>
class CPUAllocator<true> {
 public:
  void* Alloc(size_t size) {
    void* p = std::malloc(size);
    if (p == nullptr) {
      return p;
    }
#ifndef _WIN32
    mlock(p, size);
#endif
    return p;
  }

  void Free(void* p, size_t size) {
#ifndef _WIN32
    munlock(p, size);
#endif
    std::free(p);
  }
};

}  // namespace detail
}  // namespace memory
}  // namespace paddle
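For reference, a hedged usage sketch of the two specializations above; it is not part of the commit and assumes nothing beyond the header just shown.

#include "paddle/memory/detail/cpu_allocator.h"

void Example() {
  paddle::memory::detail::CPUAllocator<false> pageable;  // plain malloc/free
  void* a = pageable.Alloc(1024);
  pageable.Free(a, 1024);

  paddle::memory::detail::CPUAllocator<true> pinned;  // mlock'ed staging memory
  void* b = pinned.Alloc(1024);
  pinned.Free(b, 1024);  // munlock, then free
}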
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/memory/detail/cpu_allocator.h"
#include "gtest/gtest.h"
TEST(CPUAllocator, NonStaging) {
  paddle::memory::detail::CPUAllocator<false> a;
  void* p = a.Alloc(4096);
  EXPECT_NE(p, nullptr);
  a.Free(p, 4096);
}

TEST(CPUAllocator, Staging) {
  paddle::memory::detail::CPUAllocator<true> a;
  void* p = a.Alloc(4096);
  EXPECT_NE(p, nullptr);
  a.Free(p, 4096);
}
paddle/memory/detail/system_allocator.h:

@@ -14,79 +14,116 @@ limitations under the License. */
 #pragma once
 
 #include <stddef.h>   // for size_t
+#include <sys/mman.h> // for mlock and munlock
+#include <cstdlib>    // for malloc and free
 
+#ifndef PADDLE_ONLY_CPU
 #include <thrust/system/cuda/error.h>
 #include <thrust/system_error.h>
+#endif  // PADDLE_ONLY_CPU
+
+#include "paddle/platform/assert.h"
 
 namespace paddle {
 namespace memory {
 namespace detail {
 
+class CPUDeleter {
+ public:
+  CPUDeleter(void* ptr, size_t size, bool locked)
+      : ptr_(ptr), size_(size), locked_(locked) {}
+
+  void* Ptr() { return ptr_; }
+
+  void operator()(void* ptr) {
+    PADDLE_ASSERT(ptr == ptr_);
+    if (ptr_ != nullptr && locked_) {
+      munlock(ptr_, size_);
+    }
+    std::free(ptr_);
+  }
+
+ private:
+  void* ptr_;
+  size_t size_;
+  bool locked_;
+};
+
+// CPUAllocator<lock_memory=true> calls mlock, which returns pinned
+// and locked memory as staging areas for data exchange between host
+// and device. Allocates too much would reduce the amount of memory
+// available to the system for paging. So, by default, we should use
+// CPUAllocator<staging=false>.
+template <bool lock_memory>
+class CPUAllocator {
+ public:
+  static CPUDeleter Alloc(size_t size) {
+    void* p = std::malloc(size);
+    if (p != nullptr && lock_memory) {
+      mlock(p, size);
+    }
+    return CPUDeleter(p, size, lock_memory);
+  }
+};
+
+#ifndef PADDLE_ONLY_CPU  // The following code are for CUDA.
+
+namespace {
 inline void throw_on_error(cudaError_t e, const char* message) {
   if (e) {
     throw thrust::system_error(e, thrust::cuda_category(), message);
   }
 }
+}  // namespace
 
-// GPUAllocator<staging=true> calls cudaHostMalloc, which returns
-// pinned and locked memory as staging areas for data exchange
-// between host and device. Allocates too much would reduce the
-// amount of memory available to the system for paging. So, by
-// default, we should use GPUAllocator<staging=false>.
-template <bool staging>
-class GPUAllocator {
+class GPUDeleter {
  public:
-  void* Alloc(size_t size);
-  void Free(void* p, size_t size);
-};
+  GPUDeleter(void* ptr, size_t size, bool staging)
+      : ptr_(ptr), size_(size), staging_(staging) {}
 
-template <>
-class GPUAllocator<false> {
- public:
-  void* Alloc(size_t size) {
-    void* p = 0;
-    cudaError_t result = cudaMalloc(&p, size);
-    if (result == cudaSuccess) {
-      return p;
-    }
-    // clear last error
-    cudaGetLastError();
-    return nullptr;
-  }
+  void* Ptr() { return ptr_; }
 
-  void Free(void* p, size_t size) {
+  void operator()(void* ptr) {
+    PADDLE_ASSERT(ptr == ptr_);
     // Purposefully allow cudaErrorCudartUnloading, because
     // that is returned if you ever call cudaFree after the
     // driver has already shutdown. This happens only if the
     // process is terminating, in which case we don't care if
     // cudaFree succeeds.
-    auto err = cudaFree(p);
+    cudaError_t err = staging_ ? cudaFreeHost(ptr) : cudaFree(ptr);
     if (err != cudaErrorCudartUnloading) {
-      throw_on_error(err, "cudaFree failed");
+      throw_on_error(err, "cudaFree{Host} failed");
     }
   }
+
+ private:
+  void* ptr_;
+  size_t size_;
+  bool staging_;
 };
 
-template <>
-class GPUAllocator<true> {
+// GPUAllocator<staging=true> calls cudaHostMalloc, which returns
+// pinned and locked memory as staging areas for data exchange
+// between host and device. Allocates too much would reduce the
+// amount of memory available to the system for paging. So, by
+// default, we should use GPUAllocator<staging=false>.
+template <bool staging>
+class GPUAllocator {
  public:
-  void* Alloc(size_t size) {
-    void* p = 0;
-    cudaError_t result = cudaMallocHost(&p, size);
-    if (result == cudaSuccess) {
-      return p;
-    }
-    // clear last error
-    cudaGetLastError();
-    return nullptr;
-  }
-
-  void Free(void* p, size_t size) {
-    throw_on_error(cudaFreeHost(p), "cudaFreeHost failed");
+  static GPUDeleter Alloc(size_t size) {
+    void* p = 0;
+    cudaError_t result =
+        staging ? cudaMallocHost(&p, size) : cudaMalloc(&p, size);
+    if (result != cudaSuccess) {
+      cudaGetLastError();  // clear error if there is any.
+    }
+    return GPUDeleter(result == cudaSuccess ? p : nullptr, size, staging);
   }
 };
+
+#endif  // PADDLE_ONLY_CPU
 
 }  // namespace detail
 }  // namespace memory
 }  // namespace paddle
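A hedged sketch, not taken from the commit, of how the deleter-returning Alloc above composes with std::unique_ptr. Unlike the test file that follows, it spells out the deleter type in the unique_ptr's template arguments, which is what lets the custom operator() run on destruction.

#include <memory>

#include "paddle/memory/detail/system_allocator.h"

void Example() {
  using paddle::memory::detail::CPUAllocator;
  using paddle::memory::detail::CPUDeleter;

  // Allocate a pinned (mlock'ed) 4 KB block; the deleter remembers ptr and size.
  CPUDeleter d = CPUAllocator<true>::Alloc(4096);

  // Hand ownership to unique_ptr; the deleter runs munlock + free on destruction.
  std::unique_ptr<void, CPUDeleter> p(d.Ptr(), d);
}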
paddle/memory/detail/system_allocator_test.cc:

@@ -12,19 +12,39 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/memory/detail/gpu_allocator.h"
+#include "paddle/memory/detail/system_allocator.h"
+
+#include <memory>
+#include <vector>
+
 #include "gtest/gtest.h"
 
-TEST(GPUAllocator, NonStaging) {
-  paddle::memory::detail::GPUAllocator<false> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p, 4096);
+template <typename Allocator>
+void TestAllocator() {
+  {
+    auto d = Allocator::Alloc(sizeof(int));
+    EXPECT_NE(d.Ptr(), nullptr);
+    std::unique_ptr<int> p(static_cast<int*>(d.Ptr()), d);
+  }
+  {
+    auto d = Allocator::Alloc(0);
+    EXPECT_EQ(d.Ptr(), nullptr);
+    std::unique_ptr<int> p(static_cast<int*>(d.Ptr()), d);
+  }
+}
+
+TEST(CPUAllocator, NoLockMem) {
+  TestAllocator<paddle::memory::detail::CPUAllocator<false>>();
+}
+
+TEST(CPUAllocator, LockMem) {
+  TestAllocator<paddle::memory::detail::CPUAllocator<true>>();
 }
 
+#ifndef PADDLE_ONLY_CPU
+TEST(GPUAllocator, NoStaging) {
+  TestAllocator<paddle::memory::detail::GPUAllocator<false>>();
+}
 TEST(GPUAllocator, Staging) {
-  paddle::memory::detail::GPUAllocator<true> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p, 4096);
+  TestAllocator<paddle::memory::detail::GPUAllocator<true>>();
 }
+#endif  // PADDLE_ONLY_CPU
paddle/memory/memory.cc:

@@ -14,48 +14,41 @@ limitations under the License. */
 #include "paddle/memory/memory.h"
-#include "paddle/memory/detail/cpu_allocator.h"
-#include "paddle/memory/detail/gpu_allocator.h"
 
 namespace paddle {
 namespace memory {
 
-template <>
-void* Alloc<CPUPlace>(CPUPlace, size_t size) {
-  return GetCPUBuddyAllocator(false /*non-staging*/)->Alloc(size);
-}
-
-void* AllocStaging(CPUPlace, size_t size) {
-  return GetCPUBuddyAllocator(true /*staging*/)->Alloc(size);
-}
-
-template <>
-void* Alloc<GPUPlace>(GPUPlace pl, size_t size) {
-  return GetGPUBuddyAllocator(pl.device)->Alloc(size);
-}
-
-template <>
-void Free<CPUPlace>(CPUPlace, void* p) {
-  return GetCPUBuddyAllocator(false /*non-staging*/)->Free(p);
-}
-
-void FreeStaging(CPUPlace, void* p) {
-  return GetCPUBuddyAllocator(false /*non-staging*/)->Free(p);
-}
-
-#ifdef PADDLE_WITH_GPU
-template <>
-void* Alloc<GPUPlace>(GPUPlace pl, void* p) {
-  return GetGPUBuddyAllocator(pl.device)->Free(p);
-}
-
-template <>
-size_t Used<CPUPlace>(CPUPlace) {
+void* Alloc(paddle::platform::Place pl, size_t size) {
+#ifndef PADDLE_ONLY_CPU
+  if (paddle::platform::is_gpu_place(pl)) {
+    return GetGPUBuddyAllocator(pl.device)->Alloc(size);
+  }
+#endif  // PADDLE_ONLY_CPU
+  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
+  return GetCPUBuddyAllocator()->Alloc(size);
+}
+
+void Free(paddle::platform::Place pl, void* p) {
+#ifndef PADDLE_ONLY_CPU
+  if (paddle::platform::is_gpu_place(pl)) {
+    GetGPUBuddyAllocator(pl.device)->Free(p);
+  }
+#endif  // PADDLE_ONLY_CPU
+  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
+  GetCPUBuddyAllocator()->Free(p);
+}
+
+size_t Used(paddle::platform::Place pl) {
+#ifndef PADDLE_ONLY_CPU
+  if (paddle::platform::is_gpu_place(pl)) {
+    return GetGPUBuddyAllocator(pl.device)->Used();
+  }
+#endif  // PADDLE_ONLY_CPU
+  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
   return GetCPUBuddyAllocator()->Used();
 }
-
-template <>
-size_t Alloc<GPUPlace>(GPUPlace pl) {
-  return GetGPUBuddyAllocator(pl.device)->Used();
-}
-#endif  // PADDLE_WITH_GPU
 
 }  // namespace memory
 }  // namespace paddle
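A hedged sketch of how a caller would use this place-based API. The place type itself is an assumption: the header below declares paddle::framework::Place while memory.cc tests paddle::platform places, so the CPUPlace spelling here stands for whichever variant the PR settles on.

#include "paddle/memory/memory.h"

void Example() {
  paddle::platform::CPUPlace cpu;              // assumed place type (see note above)
  void* p = paddle::memory::Alloc(cpu, 1024);  // routed to GetCPUBuddyAllocator()
  size_t in_use = paddle::memory::Used(cpu);   // bytes currently tracked for this place
  paddle::memory::Free(cpu, p);
  (void)in_use;
}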
paddle/memory/memory.h:

@@ -19,19 +19,9 @@ limitations under the License. */
 namespace paddle {
 namespace memory {
 
-template <typename paddle::framework::Place>
-void* Alloc(Place, size_t);
-
-template <typename paddle::framework::Place>
-void Free(Place, void*);
-
-template <typename paddle::framework::Place>
-size_t Used(Place);
-
-// Staging memory means "pinned" host memory that can be mapped into
-// the CUDA memory space and accessed by the device rapidly. Don't
-// allocate too much staging memory; otherwise system performance will
-// degrade because the OS cannot find enough swap memory space.
-void* AllocStaging(CPUPlace, size_t);
-void* FreeStaging(CPUPlace, size_t);
+void* Alloc(paddle::framework::Place, size_t);
+void Free(paddle::framework::Place, void*);
+size_t Used(paddle::framework::Place);
 
 }  // namespace memory
 }  // namespace paddle