diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
index 3159026e6b92355ba7480b09535388c969a504e2..8ef5e9d0c116dd088b5c5c318dfb47c245b471fa 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
@@ -166,11 +166,21 @@ MultiGradientMachine::MultiGradientMachine(const ModelConfig& config,
 
   outArgStream_ = HPPL_STREAM_1;
 
+  start();
+}
+
+void MultiGradientMachine::start() {
   for (auto& thread : threads_) {
     thread->start();
   }
 }
 
+void MultiGradientMachine::finish() {
+  for (auto& thread : threads_) {
+    thread->stop();
+  }
+}
+
 std::vector<const std::vector<ParameterPtr>*>
 MultiGradientMachine::getSlaveParameters() {
   std::vector<const std::vector<ParameterPtr>*> vec;
@@ -326,12 +336,6 @@ void MultiGradientMachine::onPassEnd() {
   }
 }
 
-void MultiGradientMachine::finish() {
-  for (auto& thread : threads_) {
-    thread->stop();
-  }
-}
-
 Evaluator* MultiGradientMachine::makeEvaluator() const {
   return threads_[0]->getGradientMachine()->makeEvaluator();
 }
@@ -445,7 +449,7 @@ TrainerThread::TrainerThread(const ModelConfig& config,
   gradStream_ = HPPL_STREAM_2;
   valueStream_ = HPPL_STREAM_3;
 
-  stopping_ = false;
+  stopping_ = true;
   updateCounter_ = 0;
   parameterUpdated_ = false;
 }
@@ -453,6 +457,10 @@ TrainerThread::TrainerThread(const ModelConfig& config,
 TrainerThread::~TrainerThread() { stop(); }
 
 void TrainerThread::start() {
+  if (!stopping_) return;
+
+  stopping_ = false;
+
   gradientMachine_->start();
 
   computeThread_.reset(new std::thread([this]() { computeThread(); }));
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.h b/paddle/gserver/gradientmachines/MultiGradientMachine.h
index 70203bbb97fe79d72fbc6bd2b5d427cb1de7b61f..5e7622f929fd57de6e38855528a752b5586c4cd1 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.h
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.h
@@ -176,6 +176,10 @@ public:
 
   explicit MultiGradientMachine(const ModelConfig& config, bool useGpu);
 
+  virtual void start();
+
+  virtual void finish();
+
   virtual void prefetch(const std::vector<Argument>& inArgs);
 
   virtual void forward(const std::vector<Argument>& inArgs,
@@ -193,8 +197,6 @@ public:
 
   virtual void onPassEnd();
 
-  virtual void finish();
-
   virtual Evaluator* makeEvaluator() const;
 
   virtual void eval(Evaluator* evaluator) const;
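The gradient-machine hunks above make the worker lifecycle explicit: TrainerThread now constructs in the stopped state (stopping_ = true) and start() returns early unless the thread really is stopped, so MultiGradientMachine can call start() from its constructor while still exposing start()/finish() to callers. A minimal standalone sketch of that guard pattern, using a hypothetical Worker class rather than the Paddle types:

```cpp
#include <atomic>
#include <thread>

// Hypothetical worker illustrating the start/stop guard used in the patch:
// Start() is a no-op unless the worker is currently stopped, so the
// constructor may call Start() and callers may call it again safely.
class Worker {
 public:
  Worker() : stopping_(true) { Start(); }
  ~Worker() { Stop(); }

  void Start() {
    if (!stopping_) return;  // already running
    stopping_ = false;
    thread_ = std::thread([this] { Run(); });
  }

  void Stop() {
    if (stopping_) return;  // already stopped
    stopping_ = true;
    if (thread_.joinable()) thread_.join();
  }

 private:
  void Run() {
    while (!stopping_) { /* do one unit of work */ }
  }

  std::atomic<bool> stopping_;
  std::thread thread_;
};

int main() {
  Worker w;   // started by the constructor, like MultiGradientMachine
  w.Start();  // second call is a no-op thanks to the guard
  w.Stop();
}
```

The flag alone keeps repeated start()/finish() calls idempotent without any extra synchronization.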
diff --git a/paddle/memory/CMakeLists.txt b/paddle/memory/CMakeLists.txt
index 3943c3cfad31d13a00645aba6fc153d3d13da987..86625124967d7dfb392f5a8e74e591cb2955385f 100644
--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
@@ -1 +1,7 @@
 add_subdirectory(detail)
+
+if(${WITH_GPU})
+  nv_library(memory SRCS memory.cc)
+else(${WITH_GPU})
+  cc_library(memory SRCS memory.cc)
+endif(${WITH_GPU})
diff --git a/paddle/memory/detail/CMakeLists.txt b/paddle/memory/detail/CMakeLists.txt
index 81ca8a0bbf0ef2162a6efc63037a18b10bbcd563..c16dfadeb2180fd18b3d0da56abbe5c2d8ba9b1c 100644
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
@@ -1,2 +1,5 @@
-cc_test(cpu_allocator_test SRCS cpu_allocator_test.cc)
-nv_test(gpu_allocator_test SRCS gpu_allocator_test.cc)
+if(${WITH_GPU})
+  nv_test(system_allocator_test SRCS system_allocator_test.cc)
+else(${WITH_GPU})
+  cc_test(system_allocator_test SRCS system_allocator_test.cc)
+endif(${WITH_GPU})
diff --git a/paddle/memory/detail/cpu_allocator.h b/paddle/memory/detail/cpu_allocator.h
deleted file mode 100644
index 17753ccef718f8b258e962384c3273317fc72bec..0000000000000000000000000000000000000000
--- a/paddle/memory/detail/cpu_allocator.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stddef.h>  // for size_t
-#include <cstdlib>   // for malloc and free
-
-#ifndef _WIN32
-#include <sys/mman.h>  // for mlock and munlock
-#endif
-
-namespace paddle {
-namespace memory {
-namespace detail {
-
-// CPUAllocator<staging=true> calls mlock, which returns
-// pinned and locked memory as staging areas for data exchange
-// between host and device.  Allocating too much would reduce the
-// amount of memory available to the system for paging.  So, by
-// default, we should use CPUAllocator<staging=false>.
-template <bool staging>
-class CPUAllocator {
- public:
-  void* Alloc(size_t size);
-  void Free(void* p, size_t size);
-};
-
-template <>
-class CPUAllocator<false> {
- public:
-  void* Alloc(size_t size) { return std::malloc(size); }
-  void Free(void* p, size_t size) { std::free(p); }
-};
-
-template <>
-class CPUAllocator<true> {
- public:
-  void* Alloc(size_t size) {
-    void* p = std::malloc(size);
-    if (p == nullptr) {
-      return p;
-    }
-#ifndef _WIN32
-    mlock(p, size);
-#endif
-    return p;
-  }
-
-  void Free(void* p, size_t size) {
-#ifndef _WIN32
-    munlock(p, size);
-#endif
-    std::free(p);
-  }
-};
-
-}  // namespace detail
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/memory/detail/cpu_allocator_test.cc b/paddle/memory/detail/cpu_allocator_test.cc
deleted file mode 100644
index 4e45266cd8ad8a2bfd8f2135d259691559bbcbf4..0000000000000000000000000000000000000000
--- a/paddle/memory/detail/cpu_allocator_test.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/memory/detail/cpu_allocator.h"
-#include "gtest/gtest.h"
-
-TEST(CPUAllocator, NonStaging) {
-  paddle::memory::detail::CPUAllocator<false> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p, 4096);
-}
-
-TEST(CPUAllocator, Staging) {
-  paddle::memory::detail::CPUAllocator<true> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p, 4096);
-}
diff --git a/paddle/memory/detail/gpu_allocator.h b/paddle/memory/detail/gpu_allocator.h
deleted file mode 100644
index 682afdf7d3349aee107ed393ce2dbefebe8ff82f..0000000000000000000000000000000000000000
--- a/paddle/memory/detail/gpu_allocator.h
+++ /dev/null
@@ -1,92 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include <stddef.h>  // for size_t
-
-#include <thrust/system/cuda/error.h>
-#include <thrust/system_error.h>
-
-namespace paddle {
-namespace memory {
-namespace detail {
-
-inline void throw_on_error(cudaError_t e, const char* message) {
-  if (e) {
-    throw thrust::system_error(e, thrust::cuda_category(), message);
-  }
-}
-
-// GPUAllocator<staging=true> calls cudaMallocHost, which returns
-// pinned and locked memory as staging areas for data exchange
-// between host and device.  Allocating too much would reduce the
-// amount of memory available to the system for paging.  So, by
-// default, we should use GPUAllocator<staging=false>.
-template <bool staging>
-class GPUAllocator {
- public:
-  void* Alloc(size_t size);
-  void Free(void* p, size_t size);
-};
-
-template <>
-class GPUAllocator<false> {
- public:
-  void* Alloc(size_t size) {
-    void* p = 0;
-    cudaError_t result = cudaMalloc(&p, size);
-    if (result == cudaSuccess) {
-      return p;
-    }
-    // clear last error
-    cudaGetLastError();
-    return nullptr;
-  }
-
-  void Free(void* p, size_t size) {
-    // Purposefully allow cudaErrorCudartUnloading, because
-    // that is returned if you ever call cudaFree after the
-    // driver has already shutdown.  This happens only if the
-    // process is terminating, in which case we don't care if
-    // cudaFree succeeds.
-    auto err = cudaFree(p);
-    if (err != cudaErrorCudartUnloading) {
-      throw_on_error(err, "cudaFree failed");
-    }
-  }
-};
-
-template <>
-class GPUAllocator<true> {
- public:
-  void* Alloc(size_t size) {
-    void* p = 0;
-    cudaError_t result = cudaMallocHost(&p, size);
-    if (result == cudaSuccess) {
-      return p;
-    }
-    // clear last error
-    cudaGetLastError();
-    return nullptr;
-  }
-
-  void Free(void* p, size_t size) {
-    throw_on_error(cudaFreeHost(p), "cudaFreeHost failed");
-  }
-};
-
-}  // namespace detail
-}  // namespace memory
-}  // namespace paddle
diff --git a/paddle/memory/detail/gpu_allocator_test.cc b/paddle/memory/detail/gpu_allocator_test.cc
deleted file mode 100644
index 18c1c9ab43084ae72185565cb7c04cf6d5c9937c..0000000000000000000000000000000000000000
--- a/paddle/memory/detail/gpu_allocator_test.cc
+++ /dev/null
@@ -1,30 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/memory/detail/gpu_allocator.h"
-#include "gtest/gtest.h"
-
-TEST(GPUAllocator, NonStaging) {
-  paddle::memory::detail::GPUAllocator<false> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p, 4096);
-}
-
-TEST(GPUAllocator, Staging) {
-  paddle::memory::detail::GPUAllocator<true> a;
-  void* p = a.Alloc(4096);
-  EXPECT_NE(p, nullptr);
-  a.Free(p, 4096);
-}
diff --git a/paddle/memory/detail/system_allocator.h b/paddle/memory/detail/system_allocator.h
new file mode 100644
index 0000000000000000000000000000000000000000..1768f9a0da6c96a391a10678fa3490cade7b6c86
--- /dev/null
+++ b/paddle/memory/detail/system_allocator.h
@@ -0,0 +1,129 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stddef.h>    // for size_t
+#include <sys/mman.h>  // for mlock and munlock
+#include <cstdlib>     // for malloc and free
+
+#ifndef PADDLE_ONLY_CPU
+#include <thrust/system/cuda/error.h>
+#include <thrust/system_error.h>
+#endif  // PADDLE_ONLY_CPU
+
+#include "paddle/platform/assert.h"
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+class CPUDeleter {
+ public:
+  CPUDeleter(void* ptr, size_t size, bool locked)
+      : ptr_(ptr), size_(size), locked_(locked) {}
+
+  void* Ptr() { return ptr_; }
+
+  void operator()(void* ptr) {
+    PADDLE_ASSERT(ptr == ptr_);
+    if (ptr_ != nullptr && locked_) {
+      munlock(ptr_, size_);
+    }
+    std::free(ptr_);
+  }
+
+ private:
+  void* ptr_;
+  size_t size_;
+  bool locked_;
+};
+
+// CPUAllocator<lock_memory=true> calls mlock, which returns pinned
+// and locked memory as staging areas for data exchange between host
+// and device.  Allocating too much would reduce the amount of memory
+// available to the system for paging.  So, by default, we should use
+// CPUAllocator<lock_memory=false>.
+template <bool lock_memory>
+class CPUAllocator {
+ public:
+  static CPUDeleter Alloc(size_t size) {
+    void* p = std::malloc(size);
+    if (p != nullptr && lock_memory) {
+      mlock(p, size);
+    }
+    return CPUDeleter(p, size, lock_memory);
+  }
+};
+
+#ifndef PADDLE_ONLY_CPU  // The following code is for CUDA.
+
+namespace {
+inline void throw_on_error(cudaError_t e, const char* message) {
+  if (e) {
+    throw thrust::system_error(e, thrust::cuda_category(), message);
+  }
+}
+}  // namespace
+
+class GPUDeleter {
+ public:
+  GPUDeleter(void* ptr, size_t size, bool staging)
+      : ptr_(ptr), size_(size), staging_(staging) {}
+
+  void* Ptr() { return ptr_; }
+
+  void operator()(void* ptr) {
+    PADDLE_ASSERT(ptr == ptr_);
+    // Purposefully allow cudaErrorCudartUnloading, because
+    // that is returned if you ever call cudaFree after the
+    // driver has already shutdown.  This happens only if the
+    // process is terminating, in which case we don't care if
+    // cudaFree succeeds.
+    cudaError_t err = staging_ ? cudaFreeHost(ptr) : cudaFree(ptr);
+    if (err != cudaErrorCudartUnloading) {
+      throw_on_error(err, "cudaFree{Host} failed");
+    }
+  }
+
+ private:
+  void* ptr_;
+  size_t size_;
+  bool staging_;
+};
+
+// GPUAllocator<staging=true> calls cudaMallocHost, which returns
+// pinned and locked memory as staging areas for data exchange
+// between host and device.  Allocating too much would reduce the
+// amount of memory available to the system for paging.  So, by
+// default, we should use GPUAllocator<staging=false>.
+template <bool staging>
+class GPUAllocator {
+ public:
+  static GPUDeleter Alloc(size_t size) {
+    void* p = 0;
+    cudaError_t result =
+        staging ? cudaMallocHost(&p, size) : cudaMalloc(&p, size);
+    if (result != cudaSuccess) {
+      cudaGetLastError();  // clear error if there is any.
+    }
+    return GPUDeleter(result == cudaSuccess ? p : nullptr, size, staging);
+  }
+};
+
+#endif  // PADDLE_ONLY_CPU
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
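Because Alloc() now hands back a deleter object rather than a raw pointer, a typical caller pairs it with std::unique_ptr, exactly as the new test below does. A small sketch of that usage (the 1 KB size and the char element type are arbitrary choices, not part of the patch):

```cpp
#include <memory>

#include "paddle/memory/detail/system_allocator.h"

int main() {
  using paddle::memory::detail::CPUAllocator;

  // Alloc() mlock()s the buffer because lock_memory=true, and returns a
  // CPUDeleter that remembers the pointer, the size, and the lock flag.
  auto d = CPUAllocator<true>::Alloc(1024);

  // The deleter doubles as the unique_ptr's deleter, so the buffer is
  // munlock()ed and freed automatically when `buf` goes out of scope.
  std::unique_ptr<char, decltype(d)> buf(static_cast<char*>(d.Ptr()), d);
  return 0;
}
```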
diff --git a/paddle/memory/detail/system_allocator_test.cc b/paddle/memory/detail/system_allocator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..fec70a65b77d5f3698726970b3e1797c59261048
--- /dev/null
+++ b/paddle/memory/detail/system_allocator_test.cc
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/memory/detail/system_allocator.h"
+
+#include <memory>  // for unique_ptr
+#include <vector>
+
+#include "gtest/gtest.h"
+
+template <typename Allocator>
+void TestAllocator() {
+  {
+    auto d = Allocator::Alloc(sizeof(int));
+    EXPECT_NE(d.Ptr(), nullptr);
+    std::unique_ptr<int, decltype(d)> p(static_cast<int*>(d.Ptr()), d);
+  }
+  {
+    auto d = Allocator::Alloc(0);
+    EXPECT_EQ(d.Ptr(), nullptr);
+    std::unique_ptr<int, decltype(d)> p(static_cast<int*>(d.Ptr()), d);
+  }
+}
+
+TEST(CPUAllocator, NoLockMem) {
+  TestAllocator<paddle::memory::detail::CPUAllocator<false>>();
+}
+TEST(CPUAllocator, LockMem) {
+  TestAllocator<paddle::memory::detail::CPUAllocator<true>>();
+}
+
+#ifndef PADDLE_ONLY_CPU
+TEST(GPUAllocator, NoStaging) {
+  TestAllocator<paddle::memory::detail::GPUAllocator<false>>();
+}
+TEST(GPUAllocator, Staging) {
+  TestAllocator<paddle::memory::detail::GPUAllocator<true>>();
+}
+#endif  // PADDLE_ONLY_CPU
diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index b617923731a4d92e9765e2b73c55984a70a59264..ca3c01ebdb03598f760d19475b42930a137b3bba 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -14,48 +14,41 @@ limitations under the License. */
 
 #include "paddle/memory/memory.h"
 
+#include "paddle/memory/detail/system_allocator.h"
+
 namespace paddle {
 namespace memory {
 
-template <>
-void* Alloc(CPUPlace, size_t size) {
-  return GetCPUBuddyAllocator(false /*non-staging*/)->Alloc(size);
-}
-
-void* AllocStaging(CPUPlace, size_t size) {
-  return GetCPUBuddyAllocator(true /*staging*/)->Alloc(size);
-}
-
-template <>
-void* Alloc(GPUPlace pl, size_t size) {
-  return GetGPUBuddyAllocator(pl.device)->Alloc(size);
-}
-
-template <>
-void Free(CPUPlace, void* p) {
-  return GetCPUBuddyAllocator(false /*non-staging*/)->Free(p);
-}
-
-void FreeStaging(CPUPlace, void* p) {
-  return GetCPUBuddyAllocator(false /*non-staging*/)->Free(p);
-}
-
-#ifdef PADDLE_WITH_GPU
-template <>
-void* Alloc(GPUPlace pl, void* p) {
-  return GetGPUBuddyAllocator(pl.device)->Free(p);
-}
-
-template <>
-size_t Used(CPUPlace) {
+void* Alloc(paddle::platform::Place pl, size_t size) {
+#ifndef PADDLE_ONLY_CPU
+  if (paddle::platform::is_gpu_place(pl)) {
+    return GetGPUBuddyAllocator(pl.device)->Alloc(size);
+  }
+#endif  // PADDLE_ONLY_CPU
+  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
+  return GetCPUBuddyAllocator()->Alloc(size);
+}
+
+void Free(paddle::platform::Place pl, void* p) {
+#ifndef PADDLE_ONLY_CPU
+  if (paddle::platform::is_gpu_place(pl)) {
+    GetGPUBuddyAllocator(pl.device)->Free(p);
+    return;
+  }
+#endif  // PADDLE_ONLY_CPU
+  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
+  GetCPUBuddyAllocator()->Free(p);
+}
+
+size_t Used(paddle::platform::Place pl) {
+#ifndef PADDLE_ONLY_CPU
+  if (paddle::platform::is_gpu_place(pl)) {
+    return GetGPUBuddyAllocator(pl.device)->Used();
+  }
+#endif  // PADDLE_ONLY_CPU
+  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
   return GetCPUBuddyAllocator()->Used();
 }
 
-template <>
-size_t Alloc(GPUPlace pl) {
-  return GetGPUBuddyAllocator(pl.device)->Used();
-}
-#endif  // PADDLE_WITH_GPU
-
 }  // namespace memory
 }  // namespace paddle
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
index 8c15a133bb4e9762d4264ee0d02ad96a3ed33e30..0bc609205eca2c53d85b1a2533f8f36d5b19595e 100644
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -19,19 +19,9 @@ limitations under the License. */
 namespace paddle {
 namespace memory {
 
-template <typename Place>
-void* Alloc(Place, size_t);
-template <typename Place>
-void Free(Place, void*);
-template <typename Place>
-size_t Used(Place);
-
-// Staging memory means "pinned" host memory that can be mapped into
-// the CUDA memory space and accessed by the device rapidly.  Don't
-// allocate too much staging memory; otherwise system performance will
-// degrade because the OS cannot find enough swap memory space.
-void* AllocStaging(CPUPlace, size_t);
-void* FreeStaging(CPUPlace, size_t);
+void* Alloc(paddle::platform::Place, size_t);
+void Free(paddle::platform::Place, void*);
+size_t Used(paddle::platform::Place);
 
 }  // namespace memory
 }  // namespace paddle
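For readers of the new interface, a minimal usage sketch (not part of the patch). It assumes the Place types live in paddle/platform/place.h and that a concrete CPU place is spelled CPUPlace in this revision; both names may differ:

```cpp
#include "paddle/memory/memory.h"
#include "paddle/platform/place.h"  // assumed header for the Place types

int main() {
  // Hypothetical concrete place type; the patch only requires that the
  // value passed in satisfies paddle::platform::is_cpu_place().
  paddle::platform::CPUPlace cpu;

  void* p = paddle::memory::Alloc(cpu, 4096);  // allocate 4 KB on the host
  size_t used = paddle::memory::Used(cpu);     // bytes tracked by the allocator
  paddle::memory::Free(cpu, p);

  (void)used;
  return 0;
}
```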