diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
index 3159026e6b92355ba7480b09535388c969a504e2..8ef5e9d0c116dd088b5c5c318dfb47c245b471fa 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
@@ -166,11 +166,21 @@ MultiGradientMachine::MultiGradientMachine(const ModelConfig& config,
 
   outArgStream_ = HPPL_STREAM_1;
 
+  start();
+}
+
+void MultiGradientMachine::start() {
   for (auto& thread : threads_) {
     thread->start();
   }
 }
 
+void MultiGradientMachine::finish() {
+  for (auto& thread : threads_) {
+    thread->stop();
+  }
+}
+
 std::vector<const std::vector<ParameterPtr>*>
 MultiGradientMachine::getSlaveParameters() {
   std::vector<const std::vector<ParameterPtr>*> vec;
@@ -326,12 +336,6 @@ void MultiGradientMachine::onPassEnd() {
   }
 }
 
-void MultiGradientMachine::finish() {
-  for (auto& thread : threads_) {
-    thread->stop();
-  }
-}
-
 Evaluator* MultiGradientMachine::makeEvaluator() const {
   return threads_[0]->getGradientMachine()->makeEvaluator();
 }
@@ -445,7 +449,7 @@ TrainerThread::TrainerThread(const ModelConfig& config,
   gradStream_ = HPPL_STREAM_2;
   valueStream_ = HPPL_STREAM_3;
 
-  stopping_ = false;
+  stopping_ = true;
   updateCounter_ = 0;
   parameterUpdated_ = false;
 }
@@ -453,6 +457,10 @@ TrainerThread::TrainerThread(const ModelConfig& config,
 TrainerThread::~TrainerThread() { stop(); }
 
 void TrainerThread::start() {
+  if (!stopping_) return;
+
+  stopping_ = false;
+
   gradientMachine_->start();
 
   computeThread_.reset(new std::thread([this]() { computeThread(); }));
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.h b/paddle/gserver/gradientmachines/MultiGradientMachine.h
index 70203bbb97fe79d72fbc6bd2b5d427cb1de7b61f..5e7622f929fd57de6e38855528a752b5586c4cd1 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.h
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.h
@@ -176,6 +176,10 @@ public:
 
   explicit MultiGradientMachine(const ModelConfig& config, bool useGpu);
 
+  virtual void start();
+
+  virtual void finish();
+
   virtual void prefetch(const std::vector<Argument>& inArgs);
 
   virtual void forward(const std::vector<Argument>& inArgs,
@@ -193,8 +197,6 @@ public:
 
   virtual void onPassEnd();
 
-  virtual void finish();
-
   virtual Evaluator* makeEvaluator() const;
 
   virtual void eval(Evaluator* evaluator) const;
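The gradient-machine hunks above make the worker lifecycle explicit: TrainerThread now constructs in the stopped state (stopping_ = true) and start() returns early unless the thread really is stopped, so MultiGradientMachine can call start() from its constructor while still exposing start()/finish() to callers. A minimal standalone sketch of that guard pattern, using a hypothetical Worker class rather than the Paddle types:

```cpp
#include <atomic>
#include <thread>

// Hypothetical worker illustrating the start/stop guard used in the patch:
// Start() is a no-op unless the worker is currently stopped, so the
// constructor may call Start() and callers may call it again safely.
class Worker {
 public:
  Worker() : stopping_(true) { Start(); }
  ~Worker() { Stop(); }

  void Start() {
    if (!stopping_) return;  // already running
    stopping_ = false;
    thread_ = std::thread([this] { Run(); });
  }

  void Stop() {
    if (stopping_) return;  // already stopped
    stopping_ = true;
    if (thread_.joinable()) thread_.join();
  }

 private:
  void Run() {
    while (!stopping_) { /* do one unit of work */ }
  }

  std::atomic<bool> stopping_;
  std::thread thread_;
};

int main() {
  Worker w;   // started by the constructor, like MultiGradientMachine
  w.Start();  // second call is a no-op thanks to the guard
  w.Stop();
}
```

The flag alone keeps repeated start()/finish() calls idempotent without any extra synchronization.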
diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt
index 3943c3cfad31d13a00645aba6fc153d3d13da987..86625124967d7dfb392f5a8e74e591cb2955385f 100644
--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
@@ -1 +1,7 @@
 add_subdirectory(detail)
+
+if(${WITH_GPU})
+  nv_library(memory SRCS memory.cc)
+else(${WITH_GPU})
+  cc_library(memory SRCS memory.cc)
+endif(${WITH_GPU})
diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
index 81ca8a0bbf0ef2162a6efc63037a18b10bbcd563..c16dfadeb2180fd18b3d0da56abbe5c2d8ba9b1c 100644
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -1,2 +1,5 @@
-cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc)
-nv_test(gpu_allocator_test SRCS gpu_allocator_test.cc)
+if(${WITH_GPU})
+  nv_test(system_allocator_test SRCS system_allocator_test.cc)
+else(${WITH_GPU})
+  cc_test(system_allocator_test SRCS system_allocator_test.cc)
+endif(${WITH_GPU})
diff --git a/paddle/memory/detail/cpu_allocator.h b/paddle/memory/detail/cpu_allocator.h
deleted file mode 100644
index 17753ccef718f8b258e962384c3273317fc72bec..0000000000000000000000000000000000000000
--- a/paddle/memory/detail/cpu_allocator.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stddef.h>  // for size_t
-#include <cstdlib>   // for malloc and free
-
-#ifndef _WIN32
-#include <sys/mman.h>  // for mlock and munlock
-#endif
-
-namespace paddle {
-namespace memory {
-namespace detail {
-
-// CPUAllocator<staging=true> calls mlock, which returns
-// pinned and locked memory as staging areas for data exchange
-// between host and device.  Allocating too much would reduce the
-// amount of memory available to the system for paging.  So, by
-// default, we should use CPUAllocator<staging=false>.
-template <bool staging>
-class CPUAllocator {
- public:
-  void* Alloc(size_t size);
-  void Free(void* p, size_t size);
-};
-
-template <>
-class CPUAllocator<false> {
- public:
-  void* Alloc(size_t size) { return std::malloc(size); }
-  void Free(void* p, size_t size) { std::free(p); }
-};
-
-template <>
-class CPUAllocator<true> {
- public:
-  void* Alloc(size_t size) {
-    void* p = std::malloc(size);
-    if (p == nullptr) {
-      return p;
-    }
-#ifndef _WIN32
-    mlock(p, size);
-#endif
-    return p;
-  }
-
-  void Free(void* p, size_t size) {
-#ifndef _WIN32
-    munlock(p, size);
-#endif
-    std::free(p);
-  }
-};
-
-}  // namespace detail
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/memory/detail/cpu_allocator_test.cc b/paddle/memory/detail/cpu_allocator_test.cc
deleted file mode 100644
index 4e45266cd8ad8a2bfd8f2135d259691559bbcbf4..0000000000000000000000000000000000000000
--- a/paddle/memory/detail/cpu_allocator_test.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/memory/detail/cpu_allocator.h"
-#include "gtest/gtest.h"
-
-TEST(CPUAllocator, NonStaging) {
-  paddle::memory::detail::CPUAllocator<false> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p, 4096);
-}
-
-TEST(CPUAllocator, Staging) {
-  paddle::memory::detail::CPUAllocator<true> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p, 4096);
-}
diff --git a/paddle/memory/detail/gpu_allocator.h b/paddle/memory/detail/gpu_allocator.h
deleted file mode 100644
index 682afdf7d3349aee107ed393ce2dbefebe8ff82f..0000000000000000000000000000000000000000
--- a/paddle/memory/detail/gpu_allocator.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stddef.h>  // for size_t
-
-#include <thrust/system/cuda/error.h>
-#include <thrust/system_error.h>
-
-namespace paddle {
-namespace memory {
-namespace detail {
-
-inline void throw_on_error(cudaError_t e, const char* message) {
-  if (e) {
-    throw thrust::system_error(e, thrust::cuda_category(), message);
-  }
-}
-
-// GPUAllocator<staging=true> calls cudaMallocHost, which returns
-// pinned and locked memory as staging areas for data exchange
-// between host and device.  Allocating too much would reduce the
-// amount of memory available to the system for paging.  So, by
-// default, we should use GPUAllocator<staging=false>.
-template <bool staging>
-class GPUAllocator {
- public:
-  void* Alloc(size_t size);
-  void Free(void* p, size_t size);
-};
-
-template <>
-class GPUAllocator<false> {
- public:
-  void* Alloc(size_t size) {
-    void* p = 0;
-    cudaError_t result = cudaMalloc(&p, size);
-    if (result == cudaSuccess) {
-      return p;
-    }
-    // clear last error
-    cudaGetLastError();
-    return nullptr;
-  }
-
-  void Free(void* p, size_t size) {
-    // Purposefully allow cudaErrorCudartUnloading, because
-    // that is returned if you ever call cudaFree after the
-    // driver has already shutdown.  This happens only if the
-    // process is terminating, in which case we don't care if
-    // cudaFree succeeds.
-    auto err = cudaFree(p);
-    if (err != cudaErrorCudartUnloading) {
-      throw_on_error(err, "cudaFree failed");
-    }
-  }
-};
-
-template <>
-class GPUAllocator<true> {
- public:
-  void* Alloc(size_t size) {
-    void* p = 0;
-    cudaError_t result = cudaMallocHost(&p, size);
-    if (result == cudaSuccess) {
-      return p;
-    }
-    // clear last error
-    cudaGetLastError();
-    return nullptr;
-  }
-
-  void Free(void* p, size_t size) {
-    throw_on_error(cudaFreeHost(p), "cudaFreeHost failed");
-  }
-};
-
-}  // namespace detail
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/memory/detail/gpu_allocator_test.cc b/paddle/memory/detail/gpu_allocator_test.cc
deleted file mode 100644
index 18c1c9ab43084ae72185565cb7c04cf6d5c9937c..0000000000000000000000000000000000000000
--- a/paddle/memory/detail/gpu_allocator_test.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/memory/detail/gpu_allocator.h"
-#include "gtest/gtest.h"
-
-TEST(GPUAllocator, NonStaging) {
-  paddle::memory::detail::GPUAllocator<false> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p, 4096);
-}
-
-TEST(GPUAllocator, Staging) {
-  paddle::memory::detail::GPUAllocator<true> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p, 4096);
-}
diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..1768f9a0da6c96a391a10678fa3490cade7b6c86
--- /dev/null
+++ b/paddle/memory/detail/system_allocator.h
@@ -0,0 +1,129 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stddef.h>    // for size_t
+#include <sys/mman.h>  // for mlock and munlock
+#include <cstdlib>     // for malloc and free
+
+#ifndef PADDLE_ONLY_CPU
+#include <thrust/system/cuda/error.h>
+#include <thrust/system_error.h>
+#endif  // PADDLE_ONLY_CPU
+
+#include "paddle/platform/assert.h"
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+class CPUDeleter {
+ public:
+  CPUDeleter(void* ptr, size_t size, bool locked)
+      : ptr_(ptr), size_(size), locked_(locked) {}
+
+  void* Ptr() { return ptr_; }
+
+  void operator()(void* ptr) {
+    PADDLE_ASSERT(ptr == ptr_);
+    if (ptr_ != nullptr && locked_) {
+      munlock(ptr_, size_);
+    }
+    std::free(ptr_);
+  }
+
+ private:
+  void* ptr_;
+  size_t size_;
+  bool locked_;
+};
+
+// CPUAllocator<lock_memory=true> calls mlock, which returns pinned
+// and locked memory as staging areas for data exchange between host
+// and device.  Allocating too much would reduce the amount of memory
+// available to the system for paging.  So, by default, we should use
+// CPUAllocator<lock_memory=false>.
+template <bool lock_memory>
+class CPUAllocator {
+ public:
+  static CPUDeleter Alloc(size_t size) {
+    void* p = std::malloc(size);
+    if (p != nullptr && lock_memory) {
+      mlock(p, size);
+    }
+    return CPUDeleter(p, size, lock_memory);
+  }
+};
+
+#ifndef PADDLE_ONLY_CPU  // The following code is for CUDA.
+
+namespace {
+inline void throw_on_error(cudaError_t e, const char* message) {
+  if (e) {
+    throw thrust::system_error(e, thrust::cuda_category(), message);
+  }
+}
+}  // namespace
+
+class GPUDeleter {
+ public:
+  GPUDeleter(void* ptr, size_t size, bool staging)
+      : ptr_(ptr), size_(size), staging_(staging) {}
+
+  void* Ptr() { return ptr_; }
+
+  void operator()(void* ptr) {
+    PADDLE_ASSERT(ptr == ptr_);
+    // Purposefully allow cudaErrorCudartUnloading, because
+    // that is returned if you ever call cudaFree after the
+    // driver has already shutdown.  This happens only if the
+    // process is terminating, in which case we don't care if
+    // cudaFree succeeds.
+    cudaError_t err = staging_ ? cudaFreeHost(ptr) : cudaFree(ptr);
+    if (err != cudaErrorCudartUnloading) {
+      throw_on_error(err, "cudaFree{Host} failed");
+    }
+  }
+
+ private:
+  void* ptr_;
+  size_t size_;
+  bool staging_;
+};
+
+// GPUAllocator<staging=true> calls cudaMallocHost, which returns
+// pinned and locked memory as staging areas for data exchange
+// between host and device.  Allocating too much would reduce the
+// amount of memory available to the system for paging.  So, by
+// default, we should use GPUAllocator<staging=false>.
+template <bool staging>
+class GPUAllocator {
+ public:
+  static GPUDeleter Alloc(size_t size) {
+    void* p = 0;
+    cudaError_t result =
+        staging ? cudaMallocHost(&p, size) : cudaMalloc(&p, size);
+    if (result != cudaSuccess) {
+      cudaGetLastError();  // clear error if there is any.
+    }
+    return GPUDeleter(result == cudaSuccess ? p : nullptr, size, staging);
+  }
+};
+
+#endif  // PADDLE_ONLY_CPU
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
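Because Alloc() now hands back a deleter object rather than a raw pointer, a typical caller pairs it with std::unique_ptr, exactly as the new test below does. A small sketch of that usage (the 1 KB size and the char element type are arbitrary choices, not part of the patch):

```cpp
#include <memory>

#include "paddle/memory/detail/system_allocator.h"

int main() {
  using paddle::memory::detail::CPUAllocator;

  // Alloc() mlock()s the buffer because lock_memory=true, and returns a
  // CPUDeleter that remembers the pointer, the size, and the lock flag.
  auto d = CPUAllocator<true>::Alloc(1024);

  // The deleter doubles as the unique_ptr's deleter, so the buffer is
  // munlock()ed and freed automatically when `buf` goes out of scope.
  std::unique_ptr<char, decltype(d)> buf(static_cast<char*>(d.Ptr()), d);
  return 0;
}
```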
diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fec70a65b77d5f3698726970b3e1797c59261048
--- /dev/null
+++ b/paddle/memory/detail/system_allocator_test.cc
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/memory/detail/system_allocator.h"
+
+#include <memory>  // for unique_ptr
+#include <vector>
+
+#include "gtest/gtest.h"
+
+template <typename Allocator>
+void TestAllocator() {
+  {
+    auto d = Allocator::Alloc(sizeof(int));
+    EXPECT_NE(d.Ptr(), nullptr);
+    std::unique_ptr<int, decltype(d)> p(static_cast<int*>(d.Ptr()), d);
+  }
+  {
+    auto d = Allocator::Alloc(0);
+    EXPECT_EQ(d.Ptr(), nullptr);
+    std::unique_ptr<int, decltype(d)> p(static_cast<int*>(d.Ptr()), d);
+  }
+}
+
+TEST(CPUAllocator, NoLockMem) {
+  TestAllocator<paddle::memory::detail::CPUAllocator<false>>();
+}
+TEST(CPUAllocator, LockMem) {
+  TestAllocator<paddle::memory::detail::CPUAllocator<true>>();
+}
+
+#ifndef PADDLE_ONLY_CPU
+TEST(GPUAllocator, NoStaging) {
+  TestAllocator<paddle::memory::detail::GPUAllocator<false>>();
+}
+TEST(GPUAllocator, Staging) {
+  TestAllocator<paddle::memory::detail::GPUAllocator<true>>();
+}
+#endif  // PADDLE_ONLY_CPU
diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index b617923731a4d92e9765e2b73c55984a70a59264..ca3c01ebdb03598f760d19475b42930a137b3bba 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -14,48 +14,41 @@ limitations under the License. */
 
 #include "paddle/memory/memory.h"
 
+#include "paddle/memory/detail/system_allocator.h"
+
 namespace paddle {
 namespace memory {
 
-template <>
-void* Alloc(CPUPlace, size_t size) {
-  return GetCPUBuddyAllocator(false /*non-staging*/)->Alloc(size);
-}
-
-void* AllocStaging(CPUPlace, size_t size) {
-  return GetCPUBuddyAllocator(true /*staging*/)->Alloc(size);
-}
-
-template <>
-void* Alloc(GPUPlace pl, size_t size) {
-  return GetGPUBuddyAllocator(pl.device)->Alloc(size);
-}
-
-template <>
-void Free(CPUPlace, void* p) {
-  return GetCPUBuddyAllocator(false /*non-staging*/)->Free(p);
-}
-
-void FreeStaging(CPUPlace, void* p) {
-  return GetCPUBuddyAllocator(false /*non-staging*/)->Free(p);
-}
-
-#ifdef PADDLE_WITH_GPU
-template <>
-void* Alloc(GPUPlace pl, void* p) {
-  return GetGPUBuddyAllocator(pl.device)->Free(p);
-}
-
-template <>
-size_t Used(CPUPlace) {
+void* Alloc(paddle::platform::Place pl, size_t size) {
+#ifndef PADDLE_ONLY_CPU
+  if (paddle::platform::is_gpu_place(pl)) {
+    return GetGPUBuddyAllocator(pl.device)->Alloc(size);
+  }
+#endif  // PADDLE_ONLY_CPU
+  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
+  return GetCPUBuddyAllocator()->Alloc(size);
+}
+
+void Free(paddle::platform::Place pl, void* p) {
+#ifndef PADDLE_ONLY_CPU
+  if (paddle::platform::is_gpu_place(pl)) {
+    GetGPUBuddyAllocator(pl.device)->Free(p);
+    return;
+  }
+#endif  // PADDLE_ONLY_CPU
+  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
+  GetCPUBuddyAllocator()->Free(p);
+}
+
+size_t Used(paddle::platform::Place pl) {
+#ifndef PADDLE_ONLY_CPU
+  if (paddle::platform::is_gpu_place(pl)) {
+    return GetGPUBuddyAllocator(pl.device)->Used();
+  }
+#endif  // PADDLE_ONLY_CPU
+  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
   return GetCPUBuddyAllocator()->Used();
 }
 
-template <>
-size_t Alloc(GPUPlace pl) {
-  return GetGPUBuddyAllocator(pl.device)->Used();
-}
-#endif  // PADDLE_WITH_GPU
-
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
index 8c15a133bb4e9762d4264ee0d02ad96a3ed33e30..0bc609205eca2c53d85b1a2533f8f36d5b19595e 100644
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -19,19 +19,9 @@ limitations under the License. */
 namespace paddle {
 namespace memory {
 
-template <typename Place>
-void* Alloc(Place, size_t);
-template <typename Place>
-void Free(Place, void*);
-template <typename Place>
-size_t Used(Place);
-
-// Staging memory means "pinned" host memory that can be mapped into
-// the CUDA memory space and accessed by the device rapidly.  Don't
-// allocate too much staging memory; otherwise system performance will
-// degrade because the OS cannot find enough swap memory space.
-void* AllocStaging(CPUPlace, size_t);
-void* FreeStaging(CPUPlace, size_t);
+void* Alloc(paddle::platform::Place, size_t);
+void Free(paddle::platform::Place, void*);
+size_t Used(paddle::platform::Place);
 
 }  // namespace memory
 }  // namespace paddle
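For readers of the new interface, a minimal usage sketch (not part of the patch). It assumes the Place types live in paddle/platform/place.h and that a concrete CPU place is spelled CPUPlace in this revision; both names may differ:

```cpp
#include "paddle/memory/memory.h"
#include "paddle/platform/place.h"  // assumed header for the Place types

int main() {
  // Hypothetical concrete place type; the patch only requires that the
  // value passed in satisfies paddle::platform::is_cpu_place().
  paddle::platform::CPUPlace cpu;

  void* p = paddle::memory::Alloc(cpu, 4096);  // allocate 4 KB on the host
  size_t used = paddle::memory::Used(cpu);     // bytes tracked by the allocator
  paddle::memory::Free(cpu, p);

  (void)used;
  return 0;
}
```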