Merge branch 'develop' into update_android_dockerfile

ac8c8bd5 · Liu Yiqun · 6c302757 · cdec5634 · ac8c8bd5 · ac8c8bd5
97 changed file
--- a/Dockerfile
+++ b/Dockerfile
@@ -25,7 +25,7 @@ COPY ./paddle/scripts/docker/root/ /root/
 RUN apt-get update && \
    apt-get install -y \
    git python-pip python-dev openssh-server bison  \
-    wget unzip tar xz-utils bzip2 gzip coreutils ntp \
+    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
    curl sed grep graphviz libjpeg-dev zlib1g-dev  \
    python-numpy python-matplotlib gcc g++ \
    automake locales clang-format-3.8 swig doxygen cmake  \

--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -102,12 +102,19 @@ if(WITH_GOLANG)
      message(FATAL_ERROR "no glide executeble found: $ENV{GOPATH}/bin/glide")
    endif()

-    add_custom_target(go_vendor)
-    add_custom_command(TARGET go_vendor
+    # this command will only run when the file it depends is missing
+    # or has changed, or the output is missing.
+    add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide
      COMMAND env GOPATH=${GOPATH} ${GLIDE} install
+      COMMAND touch ${CMAKE_BINARY_DIR}/glide
+      DEPENDS ${PROJ_ROOT}/go/glide.lock
      WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go"
-    )
-    add_dependencies(go_vendor go_path)
+      )
+
+    # depends on the custom command which outputs
+    # ${CMAKE_BINARY_DIR}/glide, the custom command does not need to
+    # run every time this target is built.
+    add_custom_target(go_vendor DEPENDS ${CMAKE_BINARY_DIR}/glide go_path)
  endif()

 endif(WITH_GOLANG)
--- a/cmake/cpplint.cmake
+++ b/cmake/cpplint.cmake
@@ -27,7 +27,8 @@ set(IGNORE_PATTERN
    .*cblas\\.h.*
    .*\\.pb\\.txt
    .*LtrDataProvider.*
-    .*MultiDataProvider.*)
+    .*MultiDataProvider.*
+    .*pb.*)

 # add_style_check_target
 #
@@ -52,14 +53,13 @@ macro(add_style_check_target TARGET_NAME)
                endif()
            endforeach()
            if(LINT MATCHES ON)
+                # cpplint code style
                get_filename_component(base_filename ${filename} NAME)
                set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint)
-                add_custom_command(OUTPUT ${CUR_GEN}
-                    PRE_BUILD
-                    COMMAND env ${py_env} "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
-                                "--filter=${STYLE_FILTER}"
-                                "--write-success=${CUR_GEN}" ${filename}
-                    DEPENDS ${filename}
+                add_custom_command(TARGET ${TARGET_NAME} PRE_BUILD
+                    COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
+                            "--filter=${STYLE_FILTER}"
+                            "--write-success=${CUR_GEN}" ${filename}
                    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
            endif()
        endforeach()

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -104,6 +104,7 @@ function(merge_static_libs TARGET_NAME)
  foreach(lib ${libs})
    list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
  endforeach()
+  list(REMOVE_DUPLICATES libs_deps)

  if(APPLE) # Use OSX's libtool to merge archives
    # To produce a library we need at least one source file.
@@ -127,7 +128,7 @@ function(merge_static_libs TARGET_NAME)
      # Get the file names of the libraries to be merged
      set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
    endforeach()
-		add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
      COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
      COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
  else() # general UNIX: use "ar" to extract objects and re-add to a common lib
@@ -145,11 +146,11 @@ function(merge_static_libs TARGET_NAME)
        DEPENDS ${lib} ${objdir}
        WORKING_DIRECTORY ${objdir})

-      # Empty dummy source file that goes into merged library
-      set(mergebase ${lib}.mergebase.c)
-      add_custom_command(OUTPUT ${mergebase}
-        COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}
-        DEPENDS ${objlistfile})
+      # Empty dummy source file that goes into merged library		
+      set(mergebase ${lib}.mergebase.c)		
+      add_custom_command(OUTPUT ${mergebase}		
+        COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}		
+        DEPENDS ${objlistfile})		

      list(APPEND mergebases "${mergebase}")
    endforeach()
@@ -184,6 +185,10 @@ function(cc_library TARGET_NAME)
      add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
    endif()
+    
+    # cpplint code style
+    add_style_check_target(${TARGET_NAME} ${cc_library_SRCS})
+
  else(cc_library_SRCS)
    if (cc_library_DEPS)
      merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
@@ -337,7 +342,7 @@ function(go_test TARGET_NAME)
  string(REPLACE "${PADDLE_GO_PATH}" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
  add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${go_test_DEPS})
  add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test
+    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test -race
    -c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
    ".${CMAKE_CURRENT_SOURCE_REL_DIR}"
    WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")

--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -474,6 +474,11 @@ prelu
 ..  autoclass:: paddle.v2.layer.prelu
    :noindex:

+gated_unit
+-----------
+..  autoclass:: paddle.v2.layer.gated_unit
+    :noindex:
+
 Detection output Layer
 ======================


--- a/go/cmd/master/master.go
+++ b/go/cmd/master/master.go
@@ -11,6 +11,7 @@ import (

 	"github.com/namsral/flag"
 	log "github.com/sirupsen/logrus"
+	"github.com/topicai/candy"

 	"github.com/PaddlePaddle/Paddle/go/master"
 	"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
@@ -20,11 +21,18 @@ func main() {
 	port := flag.Int("port", 8080, "port of the master server.")
 	ttlSec := flag.Int("ttl", 60, "etcd lease TTL in seconds.")
 	endpoints := flag.String("endpoints", "http://127.0.0.1:2379", "comma separated etcd endpoints. If empty, fault tolerance will not be enabled.")
-	taskTimeoutDur := flag.Duration("task_timout_dur", 20*time.Minute, "task timout duration.")
-	taskTimeoutMax := flag.Int("task_timeout_max", 3, "max timtout count for each task before it being declared failed task.")
-	chunkPerTask := flag.Int("chunk_per_task", 10, "chunk per task.")
+	taskTimeoutDur := flag.Duration("task-timout-dur", 20*time.Minute, "task timout duration.")
+	taskTimeoutMax := flag.Int("task-timeout-max", 3, "max timtout count for each task before it being declared failed task.")
+	chunkPerTask := flag.Int("chunk-per-task", 10, "chunk per task.")
+	logLevel := flag.String("log-level", "info",
+		"log level, possible values: debug, info, warning, error, fatal, panic")
 	flag.Parse()

+	level, e := log.ParseLevel(*logLevel)
+	candy.Must(e)
+
+	log.SetLevel(level)
+
 	if *endpoints == "" {
 		log.Warningln("-endpoints not set, fault tolerance not be enabled.")
 	}

--- a/go/cmd/pserver/pserver.go
+++ b/go/cmd/pserver/pserver.go
@@ -40,7 +40,7 @@ func main() {
 		idx = *index
 	} else {
 		e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, *etcdTimeout)
-		idx, err = e.Register()
+		idx, err = e.Register(*port)
 		candy.Must(err)

 		cp, err = pserver.NewCheckpointFromFile(*checkpointPath, idx, e)

--- a/go/master/client.go
+++ b/go/master/client.go
@@ -2,6 +2,7 @@ package master

 import (
 	"os"
+	"time"

 	"github.com/PaddlePaddle/Paddle/go/connection"
 	"github.com/PaddlePaddle/recordio"
@@ -36,9 +37,9 @@ func (c *Client) getRecords() {
 	for {
 		t, err := c.getTask()
 		if err != nil {
-			// TODO(helin): wait before move on with next
 			// getTask call.
-			log.Errorln(err)
+			log.Errorf("Get task failed, sleep 3 seconds and continue, %s", err)
+			time.Sleep(3 * time.Second)
 			continue
 		}


--- a/go/master/service.go
+++ b/go/master/service.go
@@ -215,6 +215,7 @@ func readChunks(globPaths []string) ([]Chunk, error) {
 		}

 		count := index.NumChunks()
+		log.Infof("readChunks: file %s has %d chunks", path, count)
 		for i := 0; i < count; i++ {
 			chunk := Chunk{
 				Path:  path,

--- a/go/pserver/client/c/test/test_train.py
+++ b/go/pserver/client/c/test/test_train.py
 import paddle.v2 as paddle
 import paddle.v2.dataset.uci_housing as uci_housing
+import paddle.v2.master as master
+import os
+import cPickle as pickle
+
+etcd_ip = os.getenv("MASTER_IP", "127.0.0.1")
+etcd_endpoint = "http://" + etcd_ip + ":2379"
+
+
+def cloud_reader():
+    print "connecting to master, etcd endpoints: ", etcd_endpoint
+    master_client = master.client(etcd_endpoint, 5, 64)
+    master_client.set_dataset(
+        ["/pfs/dlnel/public/dataset/uci_housing/uci_housing-*-of-*"])
+    while 1:
+        r, e = master_client.next_record()
+        if not r:
+            break
+        yield pickle.loads(r)


 def main():
@@ -22,13 +40,13 @@ def main():
    # create optimizer of new remote updater to pserver
    optimizer = paddle.optimizer.Momentum(momentum=0)

-    #TODO(zhihong) : replace optimizer with new OptimizerConfig
-
+    print "etcd endoint: ", etcd_endpoint
    trainer = paddle.trainer.SGD(cost=cost,
                                 parameters=parameters,
                                 update_equation=optimizer,
                                 is_local=False,
-                                 pserver_spec="localhost:3000")
+                                 pserver_spec=etcd_endpoint,
+                                 use_etcd=True)

    # event_handler to print training and testing info
    def event_handler(event):
@@ -47,11 +65,11 @@ def main():
                print "Test %d, %.2f" % (event.pass_id, result.cost)

    # training
+    # NOTE: use uci_housing.train() as reader for non-paddlecloud training
    trainer.train(
        reader=paddle.batch(
            paddle.reader.shuffle(
-                uci_housing.train(), buf_size=500),
-            batch_size=2),
+                cloud_reader, buf_size=500), batch_size=2),
        feeding={'x': 0,
                 'y': 1},
        event_handler=event_handler,

--- a/go/pserver/client/client_test.go
+++ b/go/pserver/client/client_test.go
@@ -164,7 +164,7 @@ func testClient(t *testing.T, c *client.Client) {

 		wg.Add(1)
 		go func(gs []pserver.Gradient) {
-			err = c.SendGrads(gs)
+			err := c.SendGrads(gs)
 			if err != nil {
 				t.Fatal(err)
 			}

--- a/go/pserver/client/etcd_client.go
+++ b/go/pserver/client/etcd_client.go
@@ -12,6 +12,7 @@ import (
 )

 const (
+	// DefaultEtcdTimeout is the default etcd timeout
 	DefaultEtcdTimeout time.Duration = 5 * time.Second
 )

@@ -66,12 +67,12 @@ func (p *EtcdClient) List() []Server {
 	for {
 		for i := 0; i < psDesired; i++ {
 			ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
-			cancel()
 			psKey := pserver.PsPath + strconv.Itoa(i)
 			log.Debugf("checking %s", psKey)
 			resp, err := p.client.Get(ctx, psKey)
+			cancel()
 			if err != nil {
-				log.Infof("Get psKey= %s error, %v", psKey, err)
+				log.Infof("Get psKey=%s error, %v", psKey, err)
 				time.Sleep(p.timeout)
 				continue
 			}

--- a/go/pserver/etcd_client.go
+++ b/go/pserver/etcd_client.go
@@ -49,7 +49,7 @@ func NewEtcdClient(endpoints string, numPservers int, timeout time.Duration) *Et
 // Register registers the pserver on etcd
 //
 // Register returns the index of the current pserver.
-func (e *EtcdClient) Register() (int, error) {
+func (e *EtcdClient) Register(port int) (int, error) {

 	var err error
 	e.externalIP, err = networkhelper.GetExternalIP()
@@ -116,7 +116,7 @@ func (e *EtcdClient) Register() (int, error) {
 	for {
 		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
 		var err error
-		pserverIdx, err = e.registerPserverEtcd(ctx)
+		pserverIdx, err = e.registerPserverEtcd(ctx, port)
 		cancel()
 		if err != nil {
 			log.Warn(err)
@@ -140,7 +140,7 @@ func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) (
 }

 // registerPserverEtcd registers pserver node on etcd using transaction.
-func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) {
+func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, error) {
 	var idx int
 	_, err := concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error {
 		registered := false
@@ -156,8 +156,9 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) {
 					log.Fatal(err)
 				}
 				// find the first id and write info
-				c.Put(psKey, e.externalIP, clientv3.WithLease(resp.ID))
-				log.Debugf("set pserver node %s with value %s", psKey, e.externalIP)
+				pserverAddr := e.externalIP + ":" + strconv.Itoa(port)
+				c.Put(psKey, pserverAddr, clientv3.WithLease(resp.ID))
+				log.Debugf("set pserver node %s with value %s", psKey, pserverAddr)
 				ch, kaerr := e.etcdClient.KeepAlive(context.TODO(), resp.ID)
 				if kaerr != nil {
 					log.Errorf("keepalive etcd node error: %v", kaerr)

--- a/paddle/api/ConfigParser.cpp
+++ b/paddle/api/ConfigParser.cpp
@@ -64,11 +64,7 @@ ModelConfig* TrainerConfig::getModelConfig() const {

 ParameterConfig::ParameterConfig() : m(new ParameterConfigPrivate()) {}

-ParameterConfig::~ParameterConfig() {
-  if (m) {
-    delete m;
-  }
-}
+ParameterConfig::~ParameterConfig() { delete m; }

 ParameterConfig* ParameterConfig::createParameterConfigFromParameterSharedPtr(
    void* ptr) {
@@ -98,11 +94,7 @@ void* ParameterConfig::getRawPtr() { return m->getConfigPtr(); }

 OptimizationConfig::OptimizationConfig() : m(new OptimizationConfigPrivate()) {}

-OptimizationConfig::~OptimizationConfig() {
-  if (m) {
-    delete m;
-  }
-}
+OptimizationConfig::~OptimizationConfig() { delete m; }

 std::string OptimizationConfig::toProtoString() {
  return m->getConfig().SerializeAsString();

--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -843,7 +843,8 @@ public:
                                               bool useSparseUpdater);
  static ParameterUpdater* createNewRemoteUpdater(
      OptimizationConfig* config,
-      const std::string pserverSpec) throw(UnsupportError);
+      const std::string pserverSpec,
+      const bool useEtcd) throw(UnsupportError);
  ~ParameterUpdater();

  /**

--- a/paddle/api/ParameterOptimizer.cpp
+++ b/paddle/api/ParameterOptimizer.cpp
@@ -53,11 +53,7 @@ struct ParameterTraverseCallbackPrivate {

 ParameterOptimizer::ParameterOptimizer() : m(new ParameterOptimizerPrivate()) {}

-ParameterOptimizer::~ParameterOptimizer() {
-  if (m) {
-    delete m;
-  }
-}
+ParameterOptimizer::~ParameterOptimizer() { delete m; }

 ParameterOptimizer* ParameterOptimizer::create(OptimizationConfig* config) {
  CHECK(config != nullptr);
@@ -104,11 +100,7 @@ std::vector<int> ParameterOptimizer::getParameterTypes() const {
 ParameterTraverseCallback::ParameterTraverseCallback()
    : m(new ParameterTraverseCallbackPrivate()) {}

-ParameterTraverseCallback::~ParameterTraverseCallback() {
-  if (m) {
-    delete m;
-  }
-}
+ParameterTraverseCallback::~ParameterTraverseCallback() { delete m; }

 void ParameterTraverseCallback::apply(const std::vector<Vector*>& vecs,
                                      const ParameterConfig& conf,

--- a/paddle/api/ParameterUpdater.cpp
+++ b/paddle/api/ParameterUpdater.cpp
@@ -33,11 +33,12 @@ ParameterUpdater *ParameterUpdater::createLocalUpdater(

 ParameterUpdater *ParameterUpdater::createNewRemoteUpdater(
    OptimizationConfig *config,
-    const std::string pserverSpec) throw(UnsupportError) {
+    const std::string pserverSpec,
+    const bool useEtcd) throw(UnsupportError) {
 #ifndef PADDLE_WITHOUT_GOLANG
  auto updater = new ParameterUpdater();
  updater->m->updater.reset(new paddle::NewRemoteParameterUpdater(
-      config->m->getConfig(), pserverSpec));
+      config->m->getConfig(), pserverSpec, useEtcd));
  return updater;
 #else
  throw UnsupportError();

--- a/paddle/api/Vector.cpp
+++ b/paddle/api/Vector.cpp
@@ -171,11 +171,7 @@ struct VectorPrivate {

 Vector::Vector() : m(new VectorPrivate()) {}

-Vector::~Vector() {
-  if (m) {
-    delete m;
-  }
-}
+Vector::~Vector() { delete m; }

 Vector* Vector::createZero(size_t sz, bool useGpu) {
  auto retVec = new Vector();

--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -2,23 +2,28 @@
 cc_library(ddim SRCS ddim.cc)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
-cc_test(tensor_test SRCS tensor_test.cc DEPS ddim)
+cc_test(tensor_test SRCS tensor_test.cc DEPS ddim glog gflags)
 cc_test(variable_test SRCS variable_test.cc)
 cc_test(scope_test SRCS scope_test.cc)
-cc_test(enforce_test SRCS enforce_test.cc)
+cc_library(enforce SRCS enforce.cc DEPS glog gflags)
+cc_test(enforce_test SRCS enforce_test.cc DEPS enforce)
 proto_library(attr_type SRCS attr_type.proto)
 proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
 cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
 proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
 cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
-cc_library(operator SRCS operator.cc DEPS op_desc device_context)
+
+cc_library(operator SRCS operator.cc DEPS op_desc device_context enforce)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
-cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc)
+
+cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc enforce)
 cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry operator)
+
 py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto)
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)

 proto_library(net_proto SRCS net_proto.proto DEPS op_proto)
-cc_library(net SRCS net.cc DEPS net_proto)
+cc_library(net SRCS net.cc DEPS operator net_proto op_registry)
+cc_test(net_op_test SRCS net_op_test.cc DEPS net)
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #include "paddle/framework/ddim.h"

 namespace paddle {
 namespace framework {

-///@cond HIDDEN
+/// @cond HIDDEN

 template <int i>
 Dim<i> make_dim(const int* d) {
@@ -50,7 +64,7 @@ void make_ddim(DDim& ddim, const int* dims, int n) {
  }
 }

-///@endcond
+/// @endcond

 DDim make_ddim(std::initializer_list<int> dims) {
  DDim result(make_dim(0));
@@ -64,11 +78,11 @@ DDim make_ddim(const std::vector<int>& dims) {
  return result;
 }

-///@cond HIDDEN
+/// @cond HIDDEN
 // XXX For some reason, putting this in an anonymous namespace causes errors
 class DynamicMutableIndexer : public boost::static_visitor<int&> {
 public:
-  DynamicMutableIndexer(int idx) : idx_(idx) {}
+  explicit DynamicMutableIndexer(int idx) : idx_(idx) {}

  template <int D>
  int& operator()(Dim<D>& dim) const {
@@ -81,7 +95,7 @@ class DynamicMutableIndexer : public boost::static_visitor<int&> {

 class DynamicConstIndexer : public boost::static_visitor<int> {
 public:
-  DynamicConstIndexer(int idx) : idx_(idx) {}
+  explicit DynamicConstIndexer(int idx) : idx_(idx) {}

  template <int D>
  int operator()(const Dim<D>& dim) const {
@@ -92,7 +106,7 @@ class DynamicConstIndexer : public boost::static_visitor<int> {
  int idx_;
 };

-///@endcond
+/// @endcond

 int& DDim::operator[](int idx) {
  return boost::apply_visitor(DynamicMutableIndexer(idx), var);
@@ -155,11 +169,11 @@ int get(const DDim& ddim, int idx) { return ddim[idx]; }

 void set(DDim& ddim, int idx, int value) { ddim[idx] = value; }

-///@cond HIDDEN
+/// @cond HIDDEN
 struct VectorizeVisitor : public boost::static_visitor<> {
  std::vector<int>& vector;

-  VectorizeVisitor(std::vector<int>& v) : vector(v) {}
+  explicit VectorizeVisitor(std::vector<int>& v) : vector(v) {}

  template <typename T>
  void operator()(const T& t) {
@@ -169,7 +183,7 @@ struct VectorizeVisitor : public boost::static_visitor<> {

  void operator()(const Dim<1>& t) { vector.push_back(t.head); }
 };
-///@endcond
+/// @endcond

 std::vector<int> vectorize(const DDim& ddim) {
  std::vector<int> result;
@@ -187,7 +201,7 @@ ssize_t product(const DDim& ddim) {
  return result;
 }

-///\cond HIDDEN
+/// \cond HIDDEN

 struct ArityVisitor : boost::static_visitor<int> {
  template <int D>
@@ -196,15 +210,15 @@ struct ArityVisitor : boost::static_visitor<int> {
  }
 };

-///\endcond
+/// \endcond

 int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); }

-///\cond HIDDEN
+/// \cond HIDDEN

 struct DDimPrinter : boost::static_visitor<void> {
  std::ostream& os;
-  DDimPrinter(std::ostream& os_) : os(os_) {}
+  explicit DDimPrinter(std::ostream& os_) : os(os_) {}

  template <typename T>
  void operator()(const T& t) {
@@ -212,7 +226,7 @@ struct DDimPrinter : boost::static_visitor<void> {
  }
 };

-///\endcond
+/// \endcond

 std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
  DDimPrinter printer(os);

--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
@@ -27,7 +27,7 @@ struct DDim {
  DDim() : var(Dim<1>()) {}

  template <int D>
-  DDim(const Dim<D>& in) : var(in) {}
+  explicit DDim(const Dim<D>& in) : var(in) {}

  template <int D>
  DDim& operator=(const Dim<D>& in) {

--- a/paddle/framework/enforce.cc
+++ b/paddle/framework/enforce.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/enforce.h"
--- a/paddle/framework/enforce.h
+++ b/paddle/framework/enforce.h
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
+#include <glog/logging.h>
 #include <paddle/string/printf.h>
 #include <exception>
 #include <sstream>
@@ -58,12 +59,17 @@ class EnforceNotMet : public std::exception {
 /**
 * @brief Enforce a condition, otherwise throw an EnforceNotMet
 */
+#ifdef NDEBUG
 #define PADDLE_ENFORCE(condition, ...) \
  do {                                 \
    if (UNLIKELY(!(condition))) {      \
      PADDLE_THROW(__VA_ARGS__);       \
    }                                  \
  } while (0)
+#else
+#define PADDLE_ENFORCE(condition, ...) \
+  CHECK(condition) << ::paddle::string::Sprintf(__VA_ARGS__);
+#endif

 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/net.cc
+++ b/paddle/framework/net.cc
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
 #include "paddle/framework/net.h"

 namespace paddle {
 namespace framework {

-PlainNet::PlainNet(const NetDesc& def) {}
-
-void PlainNet::InferShape(const ScopePtr& scope) const {
+void PlainNet::CompleteAddOp() {
+  std::unordered_set<std::string> input_set;
+  std::unordered_set<std::string> output_set;
+  std::unordered_set<std::string> temp_output;
  for (auto& op : ops_) {
-    op.InferShape();
+    for (auto& ipt : op->inputs_) {
+      if (!Contains(output_set, ipt)) {  // Not other op's output
+        input_set.insert(ipt);
+      } else {
+        temp_output.insert(ipt);
+      }
+    }
+
+    for (auto& opt : op->outputs_) {
+      output_set.insert(opt);
+    }
  }
-}
+  inputs_.reserve(input_set.size());
+  std::copy(input_set.begin(), input_set.end(), std::back_inserter(inputs_));

-void PlainNet::Run(const ScopePtr& scope, const DeviceContext& ctx) const {
-  for (auto& op : ops_) {
-    op.Run(ctx);
+  outputs_.reserve(output_set.size());
+  std::vector<int> tmp_index;
+  tmp_index.reserve(temp_output.size());
+  int idx = 0;
+  for (auto& opt : output_set) {
+    if (Contains(temp_output, opt)) {
+      tmp_index.push_back(idx);
+    }
+    outputs_.push_back(opt);
+    ++idx;
  }
+
+  attrs_["temporary_index"] = tmp_index;
+  add_op_done_ = true;
 }
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/net.h
+++ b/paddle/framework/net.h
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at

-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0

-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */

 #pragma once

+#include <paddle/framework/op_desc.pb.h>
+#include <paddle/framework/operator.h>
 #include "paddle/framework/net_proto.pb.h"
 #include "paddle/framework/op_proto.pb.h"
+#include "paddle/framework/op_registry.h"
 #include "paddle/framework/scope.h"
 #include "paddle/platform/device_context.h"

 namespace paddle {
 namespace framework {
-using namespace paddle::platform;
-
-// operator's index stored in a network.
-typedef int OpIndex;
-/**
- * NOTE following codes are some definitions of unimplemented concepts.
- * We write some basic implementation to make Net compilable. These APIs will
- * keep updating if the concepts related are implemented.
- */
-
-struct OpDesc;
-struct OpAttrs {};
-
-class Operator {
- public:
-  Operator(const OpDesc &def) {}
-  void InferShape() const {}
-  void Run(const DeviceContext &ctx) const {}
-};
-
 /**
- * @brief Network that manage the operators it has.
+ * @brief Network is also a type of Operator
+ *
+ * It will manage the operators it has.
 *
- * Network is the container and controller of a set of operators, user can build
- * a real network from a NetDesc which is a protobuf message and use
- * Network.Run() * to run all the operators in the network.
+ * Network is the container and controller of a set of operators.

 * A network object knows all Operators belonging to this network. Variables,
 * which are inputs and outputs of these operators, are created and managed by a
 * hierarchy of Scope objects.
 *
- * This is the base class of network, all the networks should implement the apis
+ * This is the base class of network, all the networks should implement the APIs
 * it defines.
 */
-class Net {
+class Net : public OperatorBase {
 public:
-  /**
-   * @brief Infer shapes of all inputs and outputs of operators.
-   */
-  virtual void InferShape(const ScopePtr &scope) const = 0;
-  /**
-   * @brief Run the network.
-   *
-   * Run all the operators and return success(true) or not, with all the
-   * variables are located in `scope`. `context` describes the detail execution
-   * environment for ops. `begin` and `end` specify the scope of `ops_` to run,
-   * If no positive indexes are provided, all operators in `ops_` will run.
-   */
-  virtual void Run(const ScopePtr &scope, const DeviceContext &ctx) const = 0;
-
-  /**
-   * @brief Add an Operator according to `def`.
-   */
-  virtual OpIndex AddOp(const OpProto &def) = 0;
-
-  /**
-   * @brief Add optimizer operators acctording to `attrs`.
-   */
-  virtual void AddOptimizerOps(const OpAttrs &attrs) = 0;
-
-  /**
-   * @brief Add backward operators.
-   */
-  virtual void AddBackwardOps() = 0;
-
-  /**
-   * @brief Create a network.
-   */
-  static std::unique_ptr<Net> Create(const NetDesc &def = NetDesc());
-
-  virtual ~Net() {}
+  virtual void AddOp(const OperatorPtr& op) = 0;
+  virtual void CompleteAddOp() = 0;
 };

+using NetPtr = std::shared_ptr<Net>;
+
 /**
 * @brief a basic implementation of Net.
 *
@@ -103,18 +55,14 @@ class Net {
 class PlainNet : public Net {
 public:
  /**
-   * @brief Initialize a PlainNet.
-   *
-   * Initialize from  a network describe by `def`. NetDesc is the definition of
-   * a network.
-   */
-  PlainNet(const NetDesc &def);
-
-  /**
-   * Infer all the operators' input and output varialbes' shapes, will be called
+   * Infer all the operators' input and output variables' shapes, will be called
   * before every mini-batch
   */
-  virtual void InferShape(const ScopePtr &scope) const override;
+  void InferShape(const ScopePtr& scope) const override {
+    for (auto& op : ops_) {
+      op->InferShape(scope);
+    }
+  }

  /**
   * @brief Run the network.
@@ -123,49 +71,32 @@ class PlainNet : public Net {
   * scope will be used instead. If no OpContext is provicded, default context
   * will be used.
   */
-  virtual void Run(const ScopePtr &scope,
-                   const DeviceContext &ctx) const override;
+  void Run(const ScopePtr& scope,
+           const platform::DeviceContext& dev_ctx) const override {
+    for (auto& op : ops_) {
+      op->Run(scope, dev_ctx);
+    }
+  }

  /**
-   * @brief Add an operator to this network.
+   * @brief Add an operator by ptr
   */
-  virtual OpIndex AddOp(const OpProto &def) override;
+  void AddOp(const OperatorPtr& op) override {
+    PADDLE_ENFORCE(!add_op_done_, "Cannot AddOp when this network is sealed");
+    ops_.push_back(op);
+  }

-  /**
-   * @brief Add all optimizer operators related into the network.
-   */
-  virtual void AddOptimizerOps(const OpAttrs &attrs) override;
+  void CompleteAddOp() override;

-  /**
-   * @brief Add all backward operators related into the network.
-   */
-  virtual void AddBackwardOps() override;
-
-  virtual ~PlainNet() override {}
-
- protected:
-  /**
-   * @brief Build the network.
-   *
-   * Create operators accordding to `def`, will be called by the constructor.
-   */
-  void BuildNet(const NetDesc &def);
-
-  /**
-   * @brief Add an operator into this network.
-   *
-   * Add a operator which is identified as `type` and has attributes described
-   * in `attrs`, the `inputs` are the keys of readonly input variables,
-   * `outputs` are keys of mutable output variables. An `OpIndex` will be
-   * returned to indicate the offset of the new operator in `ops_`.
-   */
-  OpIndex AddOp(const std::string &type, const std::vector<std::string> &inputs,
-                const std::vector<std::string> &outputs,
-                const OpAttrs &attrs = OpAttrs());
+  std::vector<OperatorPtr> ops_;

 private:
-  // the operators owned by `Network`.
-  std::vector<Operator> ops_;
+  bool add_op_done_{false};
+
+  template <typename T, typename KeyType>
+  static bool Contains(T container, KeyType key) {
+    return container.find(key) != container.end();
+  }
 };

 }  // namespace framework

--- a/paddle/framework/net_op_test.cc
+++ b/paddle/framework/net_op_test.cc
+#include <gtest/gtest.h>
+#include <paddle/framework/net.h>
+#include <paddle/framework/op_registry.h>
+#include <paddle/framework/operator.h>
+
+namespace pd = paddle::framework;
+
+static int infer_shape_cnt = 0;
+static int run_cnt = 0;
+
+class TestOp : public pd::OperatorBase {
+ public:
+  void InferShape(const paddle::framework::ScopePtr& scope) const override {
+    ++infer_shape_cnt;
+  }
+  void Run(const paddle::framework::ScopePtr& scope,
+           const paddle::platform::DeviceContext& dev_ctx) const override {
+    ++run_cnt;
+  }
+};
+
+template <typename T>
+void AssertSameVectorWithoutOrder(const std::vector<T>& expected,
+                                  const std::vector<T>& actual) {
+  ASSERT_EQ(expected.size(), actual.size());
+  std::unordered_set<T> expected_set;
+  for (auto& tmp : expected) {
+    expected_set.insert(tmp);
+  }
+  for (auto& act : actual) {
+    ASSERT_NE(expected_set.end(), expected_set.find(act));
+  }
+}
+
+TEST(OpKernel, all) {
+  auto net = std::make_shared<paddle::framework::PlainNet>();
+  ASSERT_NE(net, nullptr);
+
+  auto op1 = std::make_shared<TestOp>();
+  op1->inputs_ = {"x", "w1", "b1"};
+  op1->outputs_ = {"y"};
+  net->AddOp(op1);
+
+  auto op2 = std::make_shared<TestOp>();
+  op2->inputs_ = {"y", "w2", "b2"};
+  op2->outputs_ = {"z"};
+  net->AddOp(op2);
+
+  net->CompleteAddOp();
+  AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"}, net->inputs_);
+  AssertSameVectorWithoutOrder({"y", "z"}, net->outputs_);
+  auto tmp_idx_iter = net->attrs_.find("temporary_index");
+  ASSERT_NE(net->attrs_.end(), tmp_idx_iter);
+  auto& tmp_idx = boost::get<std::vector<int>>(tmp_idx_iter->second);
+  ASSERT_EQ(1UL, tmp_idx.size());
+  ASSERT_EQ("y", net->outputs_[tmp_idx[0]]);
+
+  auto scope = std::make_shared<pd::Scope>();
+  paddle::platform::CPUDeviceContext dev_ctx;
+
+  net->InferShape(scope);
+  net->Run(scope, dev_ctx);
+  ASSERT_EQ(2, infer_shape_cnt);
+  ASSERT_EQ(2, run_cnt);
+
+  ASSERT_THROW(net->AddOp(op2), paddle::framework::EnforceNotMet);
+}
--- a/paddle/framework/op_registry.cc
+++ b/paddle/framework/op_registry.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
 #include <paddle/framework/op_registry.h>

 namespace paddle {
@@ -33,4 +47,4 @@ void AttrTypeHelper::SetAttrType<std::vector<std::string>>(AttrProto* attr) {
  attr->set_type(paddle::framework::AttrType::STRINGS);
 }
 }  // namespace framework
-}  // namespace paddle
\ No newline at end of file
+}  // namespace paddle
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -201,7 +201,7 @@ class OpRegistry {
  static OperatorPtr CreateOp(const OpDesc& op_desc) {
    std::string op_type = op_desc.type();
    OperatorPtr op(creators().at(op_type)());
-    op->desc_ = op_desc;
+    op->type_ = op_desc.type();
    op->inputs_.reserve((size_t)op_desc.inputs_size());
    std::copy(op_desc.inputs().begin(), op_desc.inputs().end(),
              std::back_inserter(op->inputs_));
@@ -216,17 +216,17 @@ class OpRegistry {
    return op;
  }

+  static std::unordered_map<std::string, OpProto>& protos() {
+    static std::unordered_map<std::string, OpProto> protos_;
+    return protos_;
+  };
+
 private:
  static std::unordered_map<std::string, OpCreator>& creators() {
    static std::unordered_map<std::string, OpCreator> creators_;
    return creators_;
  }

-  static std::unordered_map<std::string, OpProto>& protos() {
-    static std::unordered_map<std::string, OpProto> protos_;
-    return protos_;
-  };
-
  static std::unordered_map<std::string, OpAttrChecker>& op_checkers() {
    static std::unordered_map<std::string, OpAttrChecker> op_checkers_;
    return op_checkers_;
@@ -241,12 +241,18 @@ class OpRegisterHelper {
  }
 };

+/**
+ * check if MACRO is used in GLOBAL NAMESPACE.
+ */
 #define STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg)                        \
  struct __test_global_namespace_##uniq_name##__ {};                          \
  static_assert(std::is_same<::__test_global_namespace_##uniq_name##__,       \
                             __test_global_namespace_##uniq_name##__>::value, \
                msg)

+/**
+ * Macro to Register Operator.
+ */
 #define REGISTER_OP(__op_type, __op_class, __op_maker_class)                 \
  STATIC_ASSERT_GLOBAL_NAMESPACE(__reg_op__##__op_type,                      \
                                 "REGISTER_OP must be in global namespace"); \
@@ -254,9 +260,12 @@ class OpRegisterHelper {
      __op_register_##__op_type##__(#__op_type);                             \
  int __op_register_##__op_type##_handle__() { return 0; }

-#define REGISTER_OP_KERNEL(type, GPU_OR_CPU, PlaceType, KernelType)       \
+/**
+ * Macro to Register OperatorKernel.
+ */
+#define REGISTER_OP_KERNEL(type, DEVICE_TYPE, PlaceType, KernelType)      \
  STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
-      __reg_op_kernel_##type##_##GPU_OR_CPU##__,                          \
+      __reg_op_kernel_##type##_##DEVICE_TYPE##__,                         \
      "REGISTER_OP_KERNEL must be in global namespace");                  \
  struct __op_kernel_register__##type##__ {                               \
    __op_kernel_register__##type##__() {                                  \
@@ -267,7 +276,7 @@ class OpRegisterHelper {
    }                                                                     \
  };                                                                      \
  static __op_kernel_register__##type##__ __reg_kernel_##type##__;        \
-  int __op_kernel_register_##type##_handle_##GPU_OR_CPU##__() { return 0; }
+  int __op_kernel_register_##type##_handle_##DEVICE_TYPE##__() { return 0; }

 #define REGISTER_OP_GPU_KERNEL(type, KernelType) \
  REGISTER_OP_KERNEL(type, GPU, ::paddle::platform::GPUPlace, KernelType)
@@ -275,6 +284,10 @@ class OpRegisterHelper {
 #define REGISTER_OP_CPU_KERNEL(type, KernelType) \
  REGISTER_OP_KERNEL(type, CPU, ::paddle::platform::CPUPlace, KernelType)

+/**
+ * Macro to mark what Operator and Kernel we will use and tell the compiler to
+ * link them into target.
+ */
 #define USE_OP_WITHOUT_KERNEL(op_type)                      \
  STATIC_ASSERT_GLOBAL_NAMESPACE(                           \
      __use_op_without_kernel_##op_type,                    \
@@ -292,15 +305,16 @@ class OpRegisterHelper {
      __attribute__((unused)) =                                           \
          __op_kernel_register_##op_type##_handle_##DEVICE_TYPE##__()

-#ifdef PADDLE_ONLY_CPU
-#define USE_OP(op_type)           \
+// use Operator with only cpu kernel.
+#define USE_OP_CPU(op_type)       \
  USE_OP_WITHOUT_KERNEL(op_type); \
-  USE_OP_KERNEL(op_type, CPU);
+  USE_OP_KERNEL(op_type, CPU)

+#ifdef PADDLE_ONLY_CPU
+#define USE_OP(op_type) USE_OP_CPU(op_type)
 #else
-#define USE_OP(op_type)           \
-  USE_OP_WITHOUT_KERNEL(op_type); \
-  USE_OP_KERNEL(op_type, CPU);    \
+#define USE_OP(op_type) \
+  USE_OP_CPU(op_type);  \
  USE_OP_KERNEL(op_type, GPU)
 #endif


--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -20,7 +20,7 @@ namespace framework {
 std::string OperatorBase::DebugString() const {
  std::stringstream ss;
  ss << "=================\n";
-  ss << "type = " << desc_.type() << "\n";
+  ss << "type = " << type_ << "\n";
  ss << "inputs = [";
  for (auto& ipt : inputs_) {
    ss << ipt << ", ";
@@ -40,4 +40,4 @@ std::string OperatorBase::DebugString() const {
 }

 }  // namespace framework
-}  // namespace paddle
\ No newline at end of file
+}  // namespace paddle
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -62,11 +62,8 @@ class OperatorBase {
  virtual void Run(const ScopePtr& scope,
                   const platform::DeviceContext& dev_ctx) const = 0;

- protected:
-  std::string Type() const { return desc_.type(); }
-
 public:
-  OpDesc desc_;
+  std::string type_;
  std::vector<std::string> inputs_;
  std::vector<std::string> outputs_;
  AttributeMap attrs_;
@@ -142,7 +139,7 @@ class OperatorWithKernel : public OperatorBase {

  void Run(const ScopePtr& scope,
           const platform::DeviceContext& dev_ctx) const final {
-    auto& opKernel = AllOpKernels().at(Type()).at(OpKernelKey(dev_ctx));
+    auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
    opKernel->Compute(OpKernel::KernelContext(this, scope, dev_ctx));
  }


--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -19,14 +19,18 @@ limitations under the License. */
 namespace paddle {
 namespace framework {

-class OperatorTest : public OperatorBase {
+static int op_run_num = 0;
+
+class OpWithoutKernelTest : public OperatorBase {
 public:
  void Init() override { x = 1; }
  void InferShape(const ScopePtr& scope) const override {}
  void Run(const ScopePtr& scope,
           const platform::DeviceContext& dev_ctx) const override {
-    float scale = GetAttr<float>("scale");
-    ASSERT_NEAR(scale, 3.14, 1e-5);
+    op_run_num++;
+    ASSERT_EQ((int)inputs_.size(), 1);
+    ASSERT_EQ((int)outputs_.size(), 1);
+    ASSERT_NEAR(GetAttr<float>("scale"), 3.14, 1e-5);
    ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr);
    ASSERT_EQ(x, 1);
    ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr);
@@ -36,15 +40,14 @@ class OperatorTest : public OperatorBase {
  float x = 0;
 };

-class OperatorTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+class OpeWithoutKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 public:
-  OperatorTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+  OpeWithoutKernelTestProtoAndCheckerMaker(OpProto* proto,
+                                           OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("input", "input of test op");
    AddOutput("output", "output of test op");
-    AddAttr<float>("scale", "scale of cosine op")
-        .SetDefault(1.0)
-        .LargerThan(0.0);
+    AddAttr<float>("scale", "scale of cosine op");
    AddComment("This is test op");
  }
 };
@@ -52,8 +55,8 @@ class OperatorTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 }  // namespace framework
 }  // namespace paddle

-REGISTER_OP(test_operator, paddle::framework::OperatorTest,
-            paddle::framework::OperatorTestProtoAndCheckerMaker);
+REGISTER_OP(test_operator, paddle::framework::OpWithoutKernelTest,
+            paddle::framework::OpeWithoutKernelTestProtoAndCheckerMaker);

 TEST(OperatorBase, all) {
  paddle::framework::OpDesc op_desc;
@@ -63,18 +66,17 @@ TEST(OperatorBase, all) {
  auto attr = op_desc.mutable_attrs()->Add();
  attr->set_name("scale");
  attr->set_type(paddle::framework::AttrType::FLOAT);
-  float scale = 3.14;
-  attr->set_f(scale);
+  attr->set_f(3.14);

  paddle::platform::CPUDeviceContext device_context;
  auto scope = std::make_shared<paddle::framework::Scope>();

  paddle::framework::OperatorPtr op =
      paddle::framework::OpRegistry::CreateOp(op_desc);
-  ASSERT_EQ(op->GetAttr<float>("scale"), scale);
  scope->CreateVariable("OUT1");
+  ASSERT_EQ(paddle::framework::op_run_num, 0);
  op->Run(scope, device_context);
-  std::cout << op->DebugString() << std::endl;
+  ASSERT_EQ(paddle::framework::op_run_num, 1);
 }

 namespace paddle {
@@ -86,13 +88,13 @@ class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("input", "input of test op");
    AddOutput("output", "output of test op");
-    AddAttr<float>("scale", "scale of cosine op")
-        .SetDefault(1.0)
-        .LargerThan(0.0);
+    AddAttr<float>("scale", "scale of cosine op");
    AddComment("This is test op");
  }
 };

+static int cpu_kernel_run_num = 0;
+
 class OpWithKernelTest : public OperatorWithKernel {
 protected:
  void InferShape(const std::vector<const Tensor*>& inputs,
@@ -102,10 +104,10 @@ class OpWithKernelTest : public OperatorWithKernel {
 class CPUKernelTest : public OpKernel {
 public:
  void Compute(const KernelContext& context) const {
-    float scale = context.op_.GetAttr<float>("scale");
-    ASSERT_NEAR(scale, 3.14, 1e-5);
-    std::cout << "this is cpu kernel" << std::endl;
-    std::cout << context.op_.DebugString() << std::endl;
+    cpu_kernel_run_num++;
+    ASSERT_EQ((int)context.op_.inputs_.size(), 1);
+    ASSERT_EQ((int)context.op_.outputs_.size(), 1);
+    ASSERT_NEAR(context.op_.GetAttr<float>("scale"), 3.14, 1e-5);
  }
 };

@@ -131,5 +133,7 @@ TEST(OpKernel, all) {

  paddle::framework::OperatorPtr op =
      paddle::framework::OpRegistry::CreateOp(op_desc);
+  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0);
  op->Run(scope, cpu_device_context);
+  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
 }
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once

 #include <cstdint>
+#include <cstring>
 #include <memory>
-#include <type_traits>
 #include "paddle/framework/ddim.h"
 #include "paddle/framework/enforce.h"
 #include "paddle/memory/memory.h"
@@ -27,45 +27,63 @@ namespace framework {

 class Tensor {
 public:
-  Tensor() : offset_(0) {}
+  Tensor() : numel_(0), offset_(0) {}

-  explicit Tensor(const DDim& dims) : dims_(dims), offset_(0) {}
+  Tensor& operator=(const Tensor& src) = delete;

  template <typename T>
  const T* data() const {
-    PADDLE_ENFORCE(
-        holder_ != nullptr,
-        "Tenosr has not been initialized. Call Tensor::mutable_data first.");
+    CheckDims<T>();
    return reinterpret_cast<const T*>(
-        reinterpret_cast<uintptr_t>(holder_->Ptr()) + offset_);
+        reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
  }

-  template <typename T,  // must be POD types
-            typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
+  template <typename T>
  T* mutable_data(DDim dims, paddle::platform::Place place) {
-    dims_ = dims;
+    set_dims(dims);
+    return mutable_data<T>(place);
+  }
+
+  template <typename T>
+  T* mutable_data(paddle::platform::Place place) {
+    PADDLE_ENFORCE(numel_ > 0,
+                   "Tensor::numel_ must be larger than zero to call "
+                   "Tensor::mutable_data. Call Tensor::set_dim first.");
    if (holder_ == nullptr ||
-        !(holder_->Place() ==
+        !(holder_->place() ==
          place) /* some versions of boost::variant don't have operator!= */
-        || holder_->Size() < product(dims) * sizeof(T) + offset_) {
-      holder_.reset(new PlaceholderImpl<T>(place, product(dims) * sizeof(T)));
+        || holder_->size() < numel_ * sizeof(T) + offset_) {
+      holder_.reset(new PlaceholderImpl<T>(place, numel_ * sizeof(T)));
      offset_ = 0;
    }
-    return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->Ptr()) +
+    return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                offset_);
  }

+  template <typename T>
  void ShareDataFrom(const Tensor& src) {
-    PADDLE_ENFORCE(src.holder_ != nullptr,
-                   "Can not share data from an uninitialized tensor.");
+    src.CheckDims<T>();
    holder_ = src.holder_;
-    dims_ = src.dims_;
+    set_dims(src.dims());
    offset_ = src.offset_;
  }

+  template <typename T>
+  void CopyFrom(const Tensor& src, paddle::platform::Place dst_place) {
+    PADDLE_ENFORCE(platform::is_cpu_place(src.holder_->place()) &&
+                       platform::is_cpu_place(dst_place),
+                   "Tensor::CopyFrom only support CPU now.");
+    src.CheckDims<T>();
+    size_t size = src.numel_ * sizeof(T);
+    set_dims(src.dims());
+    const void* src_ptr = static_cast<const void*>(src.data<T>());
+    void* dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
+    memcpy(dst_ptr, src_ptr, size);
+  }
+
+  template <typename T>
  Tensor Slice(const int& begin_idx, const int& end_idx) const {
-    PADDLE_ENFORCE(holder_ != nullptr,
-                   "The sliced tenosr has not been initialized.");
+    CheckDims<T>();
    PADDLE_ENFORCE(begin_idx >= 0 && end_idx <= dims_[0],
                   "Slice index is less than zero or out of bound.");
    PADDLE_ENFORCE(begin_idx < end_idx,
@@ -78,12 +96,21 @@ class Tensor {
    }
    Tensor dst;
    dst.holder_ = holder_;
-    dst.dims_ = dims_;
-    dst.dims_[0] = end_idx - begin_idx;
-    dst.offset_ = offset_ + begin_idx * base * holder_->TypeSize();
+    DDim dst_dims = dims_;
+    dst_dims[0] = end_idx - begin_idx;
+    dst.set_dims(dst_dims);
+    dst.offset_ = offset_ + begin_idx * base * sizeof(T);
    return dst;
  }

+  void set_dims(const DDim& dims) {
+    if (dims == dims_) {
+      return;
+    }
+    dims_ = dims;
+    numel_ = product(dims_);
+  }
+
  DDim dims() const { return dims_; }

 private:
@@ -91,10 +118,9 @@ class Tensor {
  // parameter of Variable.
  struct Placeholder {
    virtual ~Placeholder() {}
-    virtual void* Ptr() const = 0;
-    virtual paddle::platform::Place Place() const = 0;
-    virtual size_t Size() const = 0;
-    virtual size_t TypeSize() const = 0;
+    virtual void* ptr() const = 0;
+    virtual paddle::platform::Place place() const = 0;
+    virtual size_t size() const = 0;
  };

  template <typename T>
@@ -118,18 +144,27 @@ class Tensor {
          place_(place),
          size_(size) {}

-    virtual void* Ptr() const { return static_cast<void*>(ptr_.get()); }
-    virtual size_t Size() const { return size_; }
-    virtual paddle::platform::Place Place() const { return place_; }
-    virtual size_t TypeSize() const { return sizeof(T); }
+    virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
+    virtual size_t size() const { return size_; }
+    virtual paddle::platform::Place place() const { return place_; }

    std::unique_ptr<T, Deleter> ptr_;
    paddle::platform::Place place_;  // record the place of ptr_.
    size_t size_;                    // size of the memory block.
  };

+  template <typename T>
+  inline void CheckDims() const {
+    PADDLE_ENFORCE(holder_ != nullptr,
+                   "Tenosr holds no memory. Call Tensor::mutable_data first.");
+    PADDLE_ENFORCE(holder_->size() >= numel_ * sizeof(T) + offset_,
+                   "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
+                   "first to re-allocate memory.");
+  }
+
  std::shared_ptr<Placeholder> holder_;  // holds the memory block if allocated.
  DDim dims_;
+  size_t numel_;   // cache of `product(dims_)`
  size_t offset_;  // marks the begin of tensor data area.
 };


--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -18,7 +18,8 @@
 TEST(Tensor, Dims) {
  using namespace paddle::framework;
  using namespace paddle::platform;
-  Tensor tt(make_ddim({2, 3, 4}));
+  Tensor tt;
+  tt.set_dims(make_ddim({2, 3, 4}));
  DDim dims = tt.dims();
  ASSERT_EQ(arity(dims), 3);
  for (int i = 0; i < 3; ++i) {
@@ -35,7 +36,7 @@ TEST(Tensor, DataAssert) {
  } catch (paddle::framework::EnforceNotMet err) {
    caught = true;
    std::string msg =
-        "Tenosr has not been initialized. Call Tensor::mutable_data first.";
+        "Tenosr holds no memory. Call Tensor::mutable_data first.";
    const char* what = err.what();
    for (size_t i = 0; i < msg.length(); ++i) {
      ASSERT_EQ(what[i], msg[i]);
@@ -104,19 +105,18 @@ TEST(Tensor, ShareDataFrom) {
    // Try to share data form uninitialized tensor
    bool caught = false;
    try {
-      dst_tensor.ShareDataFrom(src_tensor);
+      dst_tensor.ShareDataFrom<float>(src_tensor);
    } catch (EnforceNotMet err) {
      caught = true;
-      std::string msg = "Can not share data from an uninitialized tensor.";
-      const char* what = err.what();
-      for (size_t i = 0; i < msg.length(); ++i) {
-        ASSERT_EQ(what[i], msg[i]);
+      std::string msg = "Tenosr holds no memory. Call Tensor::mutable_data
+first."; const char* what = err.what(); for (size_t i = 0; i < msg.length();
++i) { ASSERT_EQ(what[i], msg[i]);
      }
    }
    ASSERT_TRUE(caught);

    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CPUPlace());
-    dst_tensor.ShareDataFrom(src_tensor);
+    dst_tensor.ShareDataFrom<int>(src_tensor);
    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
  }

@@ -124,7 +124,7 @@ TEST(Tensor, ShareDataFrom) {
    Tensor src_tensor;
    Tensor dst_tensor;
    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace());
-    dst_tensor.ShareDataFrom(src_tensor);
+    dst_tensor.ShareDataFrom<int>(src_tensor);
    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
  }
 }
@@ -135,7 +135,7 @@ TEST(Tensor, Slice) {
  {
    Tensor src_tensor;
    src_tensor.mutable_data<int>(make_ddim({5, 3, 4}), CPUPlace());
-    Tensor slice_tensor = src_tensor.Slice(1, 3);
+    Tensor slice_tensor = src_tensor.Slice<int>(1, 3);
    DDim slice_dims = slice_tensor.dims();
    ASSERT_EQ(arity(slice_dims), 3);
    EXPECT_EQ(slice_dims[0], 2);
@@ -158,7 +158,7 @@ TEST(Tensor, Slice) {
  {
    Tensor src_tensor;
    src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
-    Tensor slice_tensor = src_tensor.Slice(2, 6);
+    Tensor slice_tensor = src_tensor.Slice<double>(2, 6);
    DDim slice_dims = slice_tensor.dims();
    ASSERT_EQ(arity(slice_dims), 2);
    EXPECT_EQ(slice_dims[0], 4);
@@ -178,4 +178,29 @@ TEST(Tensor, Slice) {
  }
 }

+TEST(Tensor, CopyFrom) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+
+  Tensor src_tensor;
+  int* src_ptr = src_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace());
+  int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+  memcpy(src_ptr, arr, 9 * sizeof(int));
+  Tensor dst_tensor;
+  dst_tensor.CopyFrom<int>(src_tensor, CPUPlace());
+  const int* dst_ptr = dst_tensor.data<int>();
+  ASSERT_NE(src_ptr, dst_ptr);
+  for (size_t i = 0; i < 9; ++i) {
+    EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+  }
+
+  Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
+  dst_tensor.CopyFrom<int>(slice_tensor, CPUPlace());
+  const int* slice_ptr = slice_tensor.data<int>();
+  dst_ptr = dst_tensor.data<int>();
+  ASSERT_NE(dst_ptr, slice_ptr);
+  for (size_t i = 0; i < 3; ++i) {
+    EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
+  }
+}
 */
\ No newline at end of file
--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -117,8 +117,7 @@ public:
    ConvFunctionBase::init(config);
  }

-  virtual void check(const BufferArgs& inputs,
-                     const BufferArgs& outputs) override {
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
    const TensorShape& input = inputs[0].shape();
    const TensorShape& filter = inputs[1].shape();
    const TensorShape& output = outputs[0].shape();
@@ -217,8 +216,7 @@ public:
    ConvFunctionBase::init(config);
  }

-  virtual void check(const BufferArgs& inputs,
-                     const BufferArgs& outputs) override {
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
    const TensorShape& output = inputs[0].shape();
    const TensorShape& filter = inputs[1].shape();
    const TensorShape& input = outputs[0].shape();
@@ -311,8 +309,7 @@ public:
    ConvFunctionBase::init(config);
  }

-  virtual void check(const BufferArgs& inputs,
-                     const BufferArgs& outputs) override {
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
    const TensorShape& output = inputs[0].shape();
    const TensorShape& input = inputs[1].shape();
    const TensorShape& filter = outputs[0].shape();

--- a/paddle/function/NaiveConvOp.cpp
+++ b/paddle/function/NaiveConvOp.cpp
@@ -90,8 +90,7 @@ public:
    ConvFunctionBase::init(config);
  }

-  virtual void check(const BufferArgs& inputs,
-                     const BufferArgs& outputs) override {
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
    const TensorShape& input = inputs[0].shape();
    const TensorShape& filter = inputs[1].shape();
    const TensorShape& output = outputs[0].shape();

--- a/paddle/function/RowConvOpGpu.cu
+++ b/paddle/function/RowConvOpGpu.cu
@@ -32,7 +32,7 @@ __global__ void KeRowConv(real* y, const real* x,  const real* w,
  for (int i = tidy; i < context; i += blky) {
    sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0;
  }
-  
+
  __syncthreads();

  for (int i = 0; i < numSeq; ++i) {
@@ -144,12 +144,15 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
      int yoff = start + j;

      // transpose
-      sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
-      sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0;
+      sh_x[tidx][tidy] = (xoff < width && yoff < end) ?
+      x[yoff * width + xoff] : 0.0;
+      sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ?
+      dy[yoff * width + xoff] : 0.0;
      __syncthreads();
      if (tidy < (context - 1)) {
        yoff = yoff - context + 1;
-        sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0;
+        sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ?
+        dy[yoff * width + xoff] : 0.0;
      }
      __syncthreads();

@@ -199,11 +202,13 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
      int yoff = start + j;

      // transpose
-      sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
+      sh_x[tidx][tidy] = (xoff < width && yoff < end) ?
+      x[yoff * width + xoff] : 0.0;
      __syncthreads();

      for (int t = 0; t < context; t++) {
-        sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start && yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0;
+        sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start &&
+        yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0;
        __syncthreads();

        real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx];
@@ -239,7 +244,7 @@ __global__ void KeRowConvBwData(real* dx, const real* w, const real* dy,
  for (int i = tidy; i < context; i += blky) {
    sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0;
  }
-  
+
  __syncthreads();

  for (int i = 0; i < numSeq; ++i) {
@@ -312,7 +317,7 @@ void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
    dim3 dimBlock(32, 32);
    dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
    real* dw = filterG.getData();
-    if (contextLength <= 32) { 
+    if (contextLength <= 32) {
      KeRowConvBwWeight<32, 32, 32>
        <<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
        (dw, x, dy, starts, height, width, numSeq, contextLength);

--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
@@ -205,10 +205,8 @@ public:
      hl_destroy_event(hlEvent_);
      hlEvent_ = NULL;
    }
-    if (batchData_) {
-      delete batchData_;
-      batchData_ = NULL;
-    }
+    delete batchData_;
+    batchData_ = NULL;
  }

  void setDataBatch(DataBatch* batchData) { batchData_ = batchData; }

--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -403,7 +403,7 @@ public:
      : layerName_(layerName) {
    addEvaluator(std::move(evaluator));
  }
-  virtual void eval(const NeuralNetwork& nn) override {
+  void eval(const NeuralNetwork& nn) override {
    const LayerPtr& layer = nn.getLayer(layerName_);
    CHECK(layer) << "Nonexisted layer: " << layerName_ << " in submodel "
                 << nn.getName();

--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -636,7 +636,7 @@ void lenToStarts(std::vector<int>& starts) {
  }
  starts.back() = pos;
 }
-}
+}  // namespace

 void RecurrentGradientMachine::calcSequenceStartPositions() {
  std::vector<int> starts(commonSeqInfo_.size() + 1);

--- a/paddle/gserver/layers/AgentLayer.cpp
+++ b/paddle/gserver/layers/AgentLayer.cpp
@@ -124,7 +124,7 @@ void copyElements(const IVector& srcVec,
    dest[index[i]] = src[i];
  }
 }
-}
+}  // namespace

 void GatherAgentLayer::forwardIds(PassType passType) {
  IVectorPtr realId = realLayers_[0]->getOutputLabel();

--- a/paddle/math/Storage.cpp
+++ b/paddle/math/Storage.cpp
@@ -32,9 +32,7 @@ static InitFunction __init_storage_engine([]() { StorageEngine::singleton(); },
 StorageEngine::StorageEngine() : cpuAllocator_(nullptr) {}

 StorageEngine::~StorageEngine() {
-  if (cpuAllocator_) {
-    delete cpuAllocator_;
-  }
+  delete cpuAllocator_;
  for (auto it : gpuAllocator_) {
    delete it;
  }

--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
 add_subdirectory(detail)
+
+cc_library(memory SRCS memory.cc)
+
+cc_library(paddle_memory
+    DEPS
+    memory meta_data
+    meta_cache memory_block
+    buddy_allocator system_allocator)
+
+cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory)
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
 if(${WITH_GPU})
-  nv_library(system_allocator SRCS system_allocator.cc DEPS gflags)
-  nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
+  nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info)
 else(${WITH_GPU})
-  cc_library(system_allocator SRCS system_allocator.cc DEPS gflags)
-  cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
+  cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info)
 endif(${WITH_GPU})
+
+cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
+
+cc_library(meta_data SRCS meta_data.cc)
+
+cc_library(meta_cache SRCS meta_cache.cc)
+
+cc_library(memory_block SRCS memory_block.cc)
+
+cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS glog)
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -12,22 +12,317 @@
   See the License for the specific language governing permissions and
   limitations under the License. */

-#pragma once
-
 #include "paddle/memory/detail/buddy_allocator.h"
+#include "glog/logging.h"

 namespace paddle {
 namespace memory {
 namespace detail {

-BuddyAllocator::BuddyAllocator(size_t pool_size, size_t max_pools,
-                               SystemAllocator* system_allocator)
-    : pool_size_(pool_size),
-      max_pools_(max_pools),
-      system_allocator_(system_allocator) {
-  PADDLE_ASSERT(pool_size > 0);
-  PADDLE_ASSERT(max_pools > 0);
-  PADDLE_ASSERT(system_allocator != nullptr);
+BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator,
+                               size_t min_chunk_size, size_t max_chunk_size)
+    : min_chunk_size_(min_chunk_size),
+      max_chunk_size_(max_chunk_size),
+      cache_(system_allocator->UseGpu()),
+      system_allocator_(std::move(system_allocator)) {}
+
+BuddyAllocator::~BuddyAllocator() {
+  DLOG(INFO) << "BuddyAllocator Disconstructor makes sure that all of these "
+                "have actually been freed";
+  while (!pool_.empty()) {
+    auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
+    DLOG(INFO) << "Free from block (" << block << ", " << max_chunk_size_
+               << ")";
+
+    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    cache_.invalidate(block);
+    pool_.erase(pool_.begin());
+  }
+}
+
+inline size_t align(size_t size, size_t alignment) {
+  size_t remaining = size % alignment;
+  return remaining == 0 ? size : size + (alignment - remaining);
+}
+
+void* BuddyAllocator::Alloc(size_t unaligned_size) {
+  // adjust allocation alignment
+  size_t size = align(unaligned_size + sizeof(Metadata), min_chunk_size_);
+
+  // acquire the allocator lock
+  std::lock_guard<std::mutex> lock(mutex_);
+
+  DLOG(INFO) << "Allocate " << unaligned_size << " bytes from chunk size "
+             << size;
+
+  // if the allocation is huge, send directly to the system allocator
+  if (size > max_chunk_size_) {
+    DLOG(INFO) << "Allocate from system allocator.";
+    return SystemAlloc(size);
+  }
+
+  // query and allocate from the existing chunk
+  auto it = FindExistChunk(size);
+
+  // refill the pool if failure
+  if (it == pool_.end()) {
+    it = RefillPool();
+    // if still failure, fail fatally
+    if (it == pool_.end()) {
+      return nullptr;
+    }
+  } else {
+    DLOG(INFO) << "Allocation from existing memory block " << std::get<2>(*it)
+               << " at address "
+               << reinterpret_cast<MemoryBlock*>(std::get<2>(*it))->data();
+  }
+
+  total_used_ += size;
+  total_free_ -= size;
+
+  // split the allocation and return data for use
+  return reinterpret_cast<MemoryBlock*>(SplitToAlloc(it, size))->data();
+}
+
+void BuddyAllocator::Free(void* p) {
+  // Point back to metadata
+  auto block = static_cast<MemoryBlock*>(p)->metadata();
+
+  // Acquire the allocator lock
+  std::lock_guard<std::mutex> lock(mutex_);
+
+  DLOG(INFO) << "Free from address " << block;
+
+  if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) {
+    DLOG(INFO) << "Free directly from system allocator";
+    system_allocator_->Free(block, block->total_size(cache_),
+                            block->index(cache_));
+
+    // Invalidate GPU allocation from cache
+    cache_.invalidate(block);
+
+    return;
+  }
+
+  block->mark_as_free(cache_);
+
+  total_used_ -= block->total_size(cache_);
+  total_free_ += block->total_size(cache_);
+
+  // Trying to merge the right buddy
+  if (block->has_right_buddy(cache_)) {
+    DLOG(INFO) << "Merging this block " << block << " with its right buddy "
+               << block->right_buddy(cache_);
+
+    auto right_buddy = block->right_buddy(cache_);
+
+    if (right_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) {
+      // Take away right buddy from pool
+      pool_.erase(IndexSizeAddress(right_buddy->index(cache_),
+                                   right_buddy->total_size(cache_),
+                                   right_buddy));
+
+      // merge its right buddy to the block
+      block->merge(cache_, right_buddy);
+    }
+  }
+
+  // Trying to merge the left buddy
+  if (block->has_left_buddy(cache_)) {
+    DLOG(INFO) << "Merging this block " << block << " with its left buddy "
+               << block->left_buddy(cache_);
+
+    auto left_buddy = block->left_buddy(cache_);
+
+    if (left_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) {
+      // Take away right buddy from pool
+      pool_.erase(IndexSizeAddress(left_buddy->index(cache_),
+                                   left_buddy->total_size(cache_), left_buddy));
+
+      // merge the block to its left buddy
+      left_buddy->merge(cache_, block);
+      block = left_buddy;
+    }
+  }
+
+  // Dumping this block into pool
+  DLOG(INFO) << "Inserting free block (" << block << ", "
+             << block->total_size(cache_) << ")";
+  pool_.insert(
+      IndexSizeAddress(block->index(cache_), block->total_size(cache_), block));
+
+  // Clean up if existing too much free memory
+
+  // Prefer freeing fallback allocation first
+  CleanIdleFallBackAlloc();
+
+  // Free normal allocation
+  CleanIdleNormalAlloc();
+}
+
+size_t BuddyAllocator::Used() { return total_used_; }
+
+void* BuddyAllocator::SystemAlloc(size_t size) {
+  size_t index = 0;
+  void* p = system_allocator_->Alloc(index, size);
+
+  DLOG(INFO) << "Allocated " << p << " from system allocator.";
+
+  if (p == nullptr) return nullptr;
+
+  static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::HUGE_CHUNK, index,
+                                     size, nullptr, nullptr);
+
+  return static_cast<MemoryBlock*>(p)->data();
+}
+
+BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
+#ifndef PADDLE_ONLY_CPU
+  if (system_allocator_->UseGpu()) {
+    if ((total_used_ + total_free_) == 0) {
+      // Compute the maximum allocation size for the first allocation.
+      max_chunk_size_ = platform::GpuMaxChunkSize();
+    }
+  }
+#endif  // PADDLE_ONLY_CPU
+
+  // Allocate a new maximum sized block
+  size_t index = 0;
+  void* p = system_allocator_->Alloc(index, max_chunk_size_);
+
+  if (p == nullptr) return pool_.end();
+
+  DLOG(INFO) << "Creating and inserting new block " << p
+             << " from system allocator";
+
+  static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::FREE_CHUNK, index,
+                                     max_chunk_size_, nullptr, nullptr);
+
+  // gpu fallback allocation
+  if (system_allocator_->UseGpu() &&
+      static_cast<MemoryBlock*>(p)->index(cache_) == 1) {
+    fallback_alloc_count_++;
+  }
+
+  total_free_ += max_chunk_size_;
+
+  // dump the block into pool
+  return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first;
+}
+
+BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
+  size_t index = 0;
+
+  while (1) {
+    auto it = pool_.lower_bound(IndexSizeAddress(index, size, nullptr));
+
+    // no match chunk memory
+    if (it == pool_.end()) return it;
+
+    if (std::get<0>(*it) > index) {
+      // find suitable one
+      if (std::get<1>(*it) >= size) {
+        return it;
+      }
+      // update and continue
+      index = std::get<0>(*it);
+      continue;
+    }
+    return it;
+  }
+}
+
+void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
+                                   size_t size) {
+  auto block = static_cast<MemoryBlock*>(std::get<2>(*it));
+  pool_.erase(it);
+
+  DLOG(INFO) << "Split block (" << block << ", " << block->total_size(cache_)
+             << ") into";
+  block->split(cache_, size);
+
+  DLOG(INFO) << "Left block (" << block << ", " << block->total_size(cache_)
+             << ")";
+  block->set_type(cache_, MemoryBlock::ARENA_CHUNK);
+
+  // the rest of memory if exist
+  if (block->has_right_buddy(cache_)) {
+    if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) {
+      DLOG(INFO) << "Insert right block (" << block->right_buddy(cache_) << ", "
+                 << block->right_buddy(cache_)->total_size(cache_) << ")";
+
+      pool_.insert(
+          IndexSizeAddress(block->right_buddy(cache_)->index(cache_),
+                           block->right_buddy(cache_)->total_size(cache_),
+                           block->right_buddy(cache_)));
+    }
+  }
+
+  return block;
+}
+
+void BuddyAllocator::CleanIdleFallBackAlloc() {
+  // If fallback allocation does not exist, return directly
+  if (!fallback_alloc_count_) return;
+
+  for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
+    // If free memory block less than max_chunk_size_, return directly
+    if (std::get<1>(*pool) < max_chunk_size_) return;
+
+    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
+
+    // If no GPU fallback allocator, return
+    if (!system_allocator_->UseGpu() || block->index(cache_) == 0) {
+      return;
+    }
+
+    DLOG(INFO) << "Return block " << block << " to fallback allocator.";
+
+    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    cache_.invalidate(block);
+
+    pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
+
+    total_free_ -= max_chunk_size_;
+    fallback_alloc_count_--;
+
+    // If no fall allocation exists, return directly
+    if (!fallback_alloc_count_) return;
+  }
+}
+
+void BuddyAllocator::CleanIdleNormalAlloc() {
+  auto shall_free_alloc = [&]() -> bool {
+    // free all fallback allocations
+    if (fallback_alloc_count_ > 0) {
+      return true;
+    }
+    // keep 2x overhead if we haven't fallen back
+    if ((total_used_ + max_chunk_size_) * 2 < total_free_) {
+      return true;
+    }
+    return false;
+  };
+
+  if (!shall_free_alloc()) return;
+
+  for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
+    // If free memory block less than max_chunk_size_, return directly
+    if (std::get<1>(*pool) < max_chunk_size_) return;
+
+    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
+
+    DLOG(INFO) << "Return block " << block << " to base allocator.";
+
+    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    cache_.invalidate(block);
+
+    pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
+
+    total_free_ -= max_chunk_size_;
+
+    if (!shall_free_alloc()) return;
+  }
 }

 }  // namespace detail

--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -14,9 +14,16 @@

 #pragma once

+#include "paddle/memory/detail/meta_cache.h"
+#include "paddle/memory/detail/meta_data.h"
 #include "paddle/memory/detail/system_allocator.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/cpu_info.h"
+#include "paddle/platform/gpu_info.h"

 #include <mutex>
+#include <set>
+#include <unordered_map>
 #include <vector>

 namespace paddle {
@@ -25,61 +32,80 @@ namespace detail {

 class BuddyAllocator {
 public:
-  BuddyAllocator(size_t pool_size, size_t max_pools,
-                 SystemAllocator* system_allocator);
+  BuddyAllocator(SystemAllocator* system_allocator, size_t min_chunk_size,
+                 size_t max_chunk_size);
+
  ~BuddyAllocator();

-  void* Alloc(size_t size);
+ public:
+  void* Alloc(size_t unaligned_size);
  void Free(void*);
  size_t Used();

+ public:
+  // Disable copy and assignment
+  BuddyAllocator(const BuddyAllocator&) = delete;
+  BuddyAllocator& operator=(const BuddyAllocator&) = delete;
+
 private:
-  struct Block {
-    size_t size_;
-    Block* left_;   // left buddy
-    Block* right_;  // right buddy
-  };
+  // Tuple (allocator index, memory size, memory address)
+  using IndexSizeAddress = std::tuple<size_t, size_t, void*>;
+  // Each element in PoolSet is a free allocation
+  using PoolSet = std::set<IndexSizeAddress>;

-  // Initially, there is only one pool.  If a Alloc founds not enough
-  // memory from that pool, and there has not been max_num_pools_,
-  // create a new pool by calling system_allocator_.Alloc(pool_size_).
-  std::vector<void*> pools_;
+  /*! \brief Allocate fixed-size memory from system */
+  void* SystemAlloc(size_t size);

-  size_t pool_size_;      // the size of each pool;
-  size_t max_num_pools_;  // the size of all pools;
+  /*! \brief If existing chunks are not suitable, refill pool */
+  PoolSet::iterator RefillPool();

-  SystemAllocator* system_allocator_;
+  /**
+   *  \brief   Find the suitable chunk from existing pool and split
+   *           it to left and right buddies
+   *
+   *  \param   it     the iterator of pool list
+   *  \param   size   the size of allocation
+   *
+   *  \return  the left buddy address
+   */
+  void* SplitToAlloc(PoolSet::iterator it, size_t size);

-  std::mutex mutex_;
+  /*! \brief Find the existing chunk which used to allocation */
+  PoolSet::iterator FindExistChunk(size_t size);

-  // Disable copy and assignment.
-  BuddyAllocator(const BuddyAllocator&) = delete;
-  BuddyAllocator& operator=(const BuddyAllocator&) = delete;
-};
+  /*! \brief Clean idle fallback allocation */
+  void CleanIdleFallBackAlloc();
+
+  /*! \brief Clean idle normal allocation */
+  void CleanIdleNormalAlloc();

-BuddyAllocator<CPUAllocator>* GetCPUBuddyAllocator() {
-  static BuddyAllocator<CPUAllocator>* a = nullptr;
-  if (a == nullptr) {
-    a = new BuddyAllocator<CPUAllocator>();
-  }
-  return a;
-}
-
-#ifndef PADDLE_ONLY_CPU  // The following code are for CUDA.
-
-BuddyAllocator<GPUAllocator>* GetGPUBuddyAllocator(int gpu_id) {
-  static BuddyAllocator<GPUAllocator>** as = NULL;
-  if (as == NULL) {
-    int gpu_num = platform::GetDeviceCount();
-    as = new BuddyAllocator<GPUAllocator>*[gpu_num];
-    for (int gpu = 0; gpu < gpu_num; gpu++) {
-      as[gpu] = new BuddyAllocator<GPUAllocator>();
-    }
-  }
-  return as[gpu_id];
-}
-
-#endif  // PADDLE_ONLY_CPU
+ private:
+  size_t total_used_ = 0;  // the total size of used memory
+  size_t total_free_ = 0;  // the total size of free memory
+
+  size_t min_chunk_size_;  // the minimum size of each chunk
+  size_t max_chunk_size_;  // the maximum size of each chunk
+
+ private:
+  /**
+   * \brief A list of free allocation
+   *
+   * \note  Only store free chunk memory in pool
+   */
+  PoolSet pool_;
+
+  /*! Record fallback allocation count for auto-scaling */
+  size_t fallback_alloc_count_ = 0;
+
+ private:
+  /*! Unify the metadata format between GPU and CPU allocations */
+  MetadataCache cache_;
+
+ private:
+  /*! Allocate CPU/GPU memory from system */
+  SystemAllocator* system_allocator_;
+  std::mutex mutex_;
+};

 }  // namespace detail
 }  // namespace memory

--- a/paddle/memory/detail/memory_block.cc
+++ b/paddle/memory/detail/memory_block.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/memory/detail/memory_block.h"
+#include "paddle/memory/detail/meta_cache.h"
+#include "paddle/memory/detail/meta_data.h"
+#include "paddle/platform/assert.h"
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+void MemoryBlock::init(MetadataCache& cache, Type t, size_t index, size_t size,
+                       void* left_buddy, void* right_buddy) {
+  cache.store(this, Metadata(t, index, size - sizeof(Metadata), size,
+                             static_cast<MemoryBlock*>(left_buddy),
+                             static_cast<MemoryBlock*>(right_buddy)));
+}
+
+MemoryBlock::Type MemoryBlock::type(MetadataCache& cache) const {
+  return cache.load(this).type;
+}
+
+size_t MemoryBlock::size(MetadataCache& cache) const {
+  return cache.load(this).size;
+}
+
+size_t MemoryBlock::total_size(MetadataCache& cache) const {
+  return cache.load(this).total_size;
+}
+
+MemoryBlock* MemoryBlock::left_buddy(MetadataCache& cache) const {
+  return cache.load(this).left_buddy;
+}
+
+MemoryBlock* MemoryBlock::right_buddy(MetadataCache& cache) const {
+  return cache.load(this).right_buddy;
+}
+
+void MemoryBlock::split(MetadataCache& cache, size_t size) {
+  // make sure the split fits
+  PADDLE_ASSERT(total_size(cache) >= size);
+
+  // bail out if there is no room for another partition
+  if (total_size(cache) - size <= sizeof(Metadata)) {
+    return;
+  }
+
+  // find the position of the split
+  void* right_partition = reinterpret_cast<uint8_t*>(this) + size;
+
+  size_t remaining_size = total_size(cache) - size;
+
+  // Add the new block as a buddy
+  auto metadata = cache.load(this);
+
+  // Write the metadata for the new block
+  auto new_block_right_buddy = metadata.right_buddy;
+
+  cache.store(
+      static_cast<MemoryBlock*>(right_partition),
+      Metadata(FREE_CHUNK, index(cache), remaining_size - sizeof(Metadata),
+               remaining_size, this, new_block_right_buddy));
+
+  metadata.right_buddy = static_cast<MemoryBlock*>(right_partition);
+  metadata.size = size - sizeof(Metadata);
+  metadata.total_size = size;
+
+  cache.store(this, metadata);
+
+  // Write metadata for the new block's right buddy
+  if (new_block_right_buddy != nullptr) {
+    auto buddy_metadata = cache.load(new_block_right_buddy);
+
+    buddy_metadata.left_buddy = static_cast<MemoryBlock*>(right_partition);
+
+    cache.store(new_block_right_buddy, buddy_metadata);
+  }
+}
+
+void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) {
+  // only free blocks can be merged
+  PADDLE_ASSERT(type(cache) == FREE_CHUNK);
+  PADDLE_ASSERT(right_buddy->type(cache) == FREE_CHUNK);
+
+  auto metadata = cache.load(this);
+
+  // link this->buddy's buddy
+  metadata.right_buddy = right_buddy->right_buddy(cache);
+
+  // link buddy's buddy -> this
+  if (metadata.right_buddy != nullptr) {
+    auto buddy_metadata = cache.load(metadata.right_buddy);
+
+    buddy_metadata.left_buddy = this;
+
+    cache.store(metadata.right_buddy, buddy_metadata);
+  }
+
+  metadata.size += right_buddy->total_size(cache);
+  metadata.total_size += right_buddy->total_size(cache);
+
+  cache.store(this, metadata);
+  cache.store(right_buddy, Metadata(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr));
+}
+
+void MemoryBlock::mark_as_free(MetadataCache& cache) {
+  // check for double free or corruption
+  PADDLE_ASSERT(type(cache) != FREE_CHUNK);
+  PADDLE_ASSERT(type(cache) != INVALID_CHUNK);
+
+  set_type(cache, FREE_CHUNK);
+}
+
+void MemoryBlock::set_type(MetadataCache& cache, Type t) {
+  auto metadata = cache.load(this);
+
+  metadata.type = t;
+
+  cache.store(this, metadata);
+}
+
+bool MemoryBlock::has_left_buddy(MetadataCache& cache) const {
+  return left_buddy(cache) != nullptr;
+}
+
+bool MemoryBlock::has_right_buddy(MetadataCache& cache) const {
+  return right_buddy(cache) != nullptr;
+}
+
+size_t MemoryBlock::index(MetadataCache& cache) const {
+  return cache.load(this).index;
+}
+
+void* MemoryBlock::data() const {
+  return const_cast<Metadata*>(reinterpret_cast<const Metadata*>(this)) + 1;
+}
+
+MemoryBlock* MemoryBlock::metadata() const {
+  return const_cast<MemoryBlock*>(reinterpret_cast<const MemoryBlock*>(
+      reinterpret_cast<const Metadata*>(this) - 1));
+}
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/memory/detail/memory_block.h
+++ b/paddle/memory/detail/memory_block.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <cstddef>
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+// Forward Declarations
+class MetadataCache;
+
+/*! \brief A class used to interpret the contents of a memory block */
+class MemoryBlock {
+ public:
+  enum Type {
+    FREE_CHUNK,    // memory is free and idle
+    ARENA_CHUNK,   // memory is being occupied
+    HUGE_CHUNK,    // memory is out of management
+    INVALID_CHUNK  // memory is invalid
+  };
+
+ public:
+  void init(MetadataCache& cache, Type t, size_t index, size_t size,
+            void* left_buddy, void* right_buddy);
+
+ public:
+  /*! \brief The type of the allocation */
+  Type type(MetadataCache& cache) const;
+
+  /*! \brief The size of the data region */
+  size_t size(MetadataCache& cache) const;
+
+  /*! \brief An index to track the allocator */
+  size_t index(MetadataCache& cache) const;
+
+  /*! \brief The total size of the block */
+  size_t total_size(MetadataCache& cache) const;
+
+  /*! \brief Check the left buddy of the block */
+  bool has_left_buddy(MetadataCache& cache) const;
+
+  /*! \brief Check the right buddy of the block */
+  bool has_right_buddy(MetadataCache& cache) const;
+
+  /*! \brief Get the left buddy */
+  MemoryBlock* left_buddy(MetadataCache& cache) const;
+
+  /*! \brief Get the right buddy */
+  MemoryBlock* right_buddy(MetadataCache& cache) const;
+
+ public:
+  /*! \brief Split the allocation into left/right blocks */
+  void split(MetadataCache& cache, size_t size);
+
+  /*! \brief Merge left and right blocks together */
+  void merge(MetadataCache& cache, MemoryBlock* right_buddy);
+
+  /*! \brief Mark the allocation as free */
+  void mark_as_free(MetadataCache& cache);
+
+  /*! \brief Change the type of the allocation */
+  void set_type(MetadataCache& cache, Type t);
+
+ public:
+  /*! \brief Get a pointer to the memory block's data */
+  void* data() const;
+
+  /*! \brief Get a pointer to the memory block's metadata */
+  MemoryBlock* metadata() const;
+
+ public:
+  static size_t overhead();
+};
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/memory/detail/meta_cache.cc
+++ b/paddle/memory/detail/meta_cache.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/memory/detail/meta_cache.h"
+#include "paddle/memory/detail/memory_block.h"
+#include "paddle/platform/assert.h"
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {}
+
+Metadata MetadataCache::load(const MemoryBlock* block) {
+  if (uses_gpu_) {
+    auto existing_metadata = cache_.find(block);
+    PADDLE_ASSERT(existing_metadata->second.check_guards());
+    return existing_metadata->second;
+  } else {
+    PADDLE_ASSERT(reinterpret_cast<const Metadata*>(block)->check_guards());
+    return *reinterpret_cast<const Metadata*>(block);
+  }
+}
+
+void MetadataCache::store(MemoryBlock* block,
+                          const Metadata& original_metadata) {
+  auto metadata = original_metadata;
+
+  metadata.update_guards();
+
+  if (uses_gpu_) {
+    cache_[block] = metadata;
+  } else {
+    *reinterpret_cast<Metadata*>(block) = metadata;
+  }
+}
+
+void MetadataCache::invalidate(MemoryBlock* block) {
+  if (uses_gpu_) {
+    cache_.erase(block);
+  }
+}
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/memory/detail/meta_cache.h
+++ b/paddle/memory/detail/meta_cache.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/memory/detail/memory_block.h"
+#include "paddle/memory/detail/meta_data.h"
+
+#include <unordered_map>
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+/**
+ *  \brief A cache for accessing memory block meta-data that may be expensive
+ *         to access directly.
+ *
+ *  \note  This class exists to unify the metadata format between GPU and CPU
+ *         allocations. It should be removed when the CPU can access all GPU
+ *         allocations directly via UVM.
+ */
+class MetadataCache {
+ public:
+  MetadataCache(bool uses_gpu);
+
+ public:
+  /*! \brief Load the associated metadata for the specified memory block. */
+  Metadata load(const MemoryBlock*);
+
+  /*! \brief Store the associated metadata for the specified memory block. */
+  void store(MemoryBlock*, const Metadata&);
+
+  /*! \brief Indicate that the specified metadata will no longer be used. */
+  void invalidate(MemoryBlock*);
+
+ public:
+  MetadataCache(const MetadataCache&) = delete;
+  MetadataCache& operator=(const MetadataCache&) = delete;
+
+ private:
+  bool uses_gpu_;
+
+ private:
+  typedef std::unordered_map<const MemoryBlock*, Metadata> MetadataMap;
+
+ private:
+  MetadataMap cache_;
+};
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/memory/detail/meta_data.cc
+++ b/paddle/memory/detail/meta_data.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/memory/detail/meta_data.h"
+
+#include <functional>
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts,
+                   MemoryBlock* l, MemoryBlock* r)
+    : type(t),
+      index(i),
+      size(s),
+      total_size(ts),
+      left_buddy(l),
+      right_buddy(r) {}
+
+Metadata::Metadata()
+    : type(MemoryBlock::INVALID_CHUNK),
+      index(0),
+      size(0),
+      total_size(0),
+      left_buddy(nullptr),
+      right_buddy(nullptr) {}
+
+template <class T>
+inline void hash_combine(std::size_t& seed, const T& v) {
+  std::hash<T> hasher;
+  seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+}
+
+inline size_t hash(const Metadata* metadata, size_t initial_seed) {
+  size_t seed = initial_seed;
+
+  hash_combine(seed, (size_t)metadata->type);
+  hash_combine(seed, metadata->index);
+  hash_combine(seed, metadata->size);
+  hash_combine(seed, metadata->total_size);
+  hash_combine(seed, metadata->left_buddy);
+  hash_combine(seed, metadata->right_buddy);
+
+  return seed;
+}
+
+void Metadata::update_guards() {
+  guard_begin = hash(this, 1);
+  guard_end = hash(this, 2);
+}
+
+bool Metadata::check_guards() const {
+  return guard_begin == hash(this, 1) && guard_end == hash(this, 2);
+}
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/memory/detail/meta_data.h
+++ b/paddle/memory/detail/meta_data.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/memory/detail/memory_block.h"
+
+#include <stddef.h>
+
+namespace paddle {
+namespace memory {
+namespace detail {
+
+class Metadata {
+ public:
+  Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l,
+           MemoryBlock* r);
+  Metadata();
+
+ public:
+  /*! \brief Update the guards when metadata is changed */
+  void update_guards();
+
+  /*! \brief Check consistency to previous modification */
+  bool check_guards() const;
+
+ public:
+  // TODO(gangliao): compress this
+  // clang-format off
+  size_t            guard_begin = 0;
+  MemoryBlock::Type type        = MemoryBlock::INVALID_CHUNK;
+  size_t            index       = 0;
+  size_t            size        = 0;
+  size_t            total_size  = 0;
+  MemoryBlock*      left_buddy  = nullptr;
+  MemoryBlock*      right_buddy = nullptr;
+  size_t            guard_end   = 0;
+  // clang-format on
+};
+
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
@@ -13,76 +13,128 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/memory/detail/system_allocator.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/error.h"
+#include "paddle/platform/gpu_info.h"

 #include <stdlib.h>    // for malloc and free
 #include <sys/mman.h>  // for mlock and munlock

 #include "gflags/gflags.h"
-#include "paddle/platform/assert.h"
-#include "paddle/platform/cuda.h"

 // If use_pinned_memory is true, CPUAllocator calls mlock, which
 // returns pinned and locked memory as staging areas for data exchange
 // between host and device.  Allocates too much would reduce the amount
 // of memory available to the system for paging.  So, by default, we
 // should set false to use_pinned_memory.
-DEFINE_bool(use_pinned_memory, false,
-            "If set, allocate cpu/gpu pinned memory.");
+DEFINE_bool(use_pinned_memory, false, "If set, allocate cpu pinned memory.");

 namespace paddle {
 namespace memory {
 namespace detail {

-void* CPUAllocator::Alloc(size_t size) {
+void* CPUAllocator::Alloc(size_t& index, size_t size) {
  // According to http://www.cplusplus.com/reference/cstdlib/malloc/,
  // malloc might not return nullptr if size is zero, but the returned
  // pointer shall not be dereferenced -- so we make it nullptr.
  if (size <= 0) return nullptr;

+  index = 0;  // unlock memory
+
  void* p = malloc(size);
-  if (p != nullptr && FLAGS_use_pinned_memory) {
-    mlock(p, size);
+
+  if (p != nullptr) {
+    if (FLAGS_use_pinned_memory) {
+      index = 1;
+      mlock(p, size);  // lock memory
+    }
  }
+
  return p;
 }

-void CPUAllocator::Free(void* p, size_t size) {
-  if (p != nullptr && FLAGS_use_pinned_memory) {
+void CPUAllocator::Free(void* p, size_t size, size_t index) {
+  if (p != nullptr && index == 1) {
    munlock(p, size);
  }
  free(p);
 }

+bool CPUAllocator::UseGpu() const { return false; }
+
 #ifndef PADDLE_ONLY_CPU

-void* GPUAllocator::Alloc(size_t size) {
+void* GPUAllocator::Alloc(size_t& index, size_t size) {
  // CUDA documentation doesn't explain if cudaMalloc returns nullptr
  // if size is 0.  We just make sure it does.
-  if (size <= 0) {
-    return nullptr;
-  }
+  if (size <= 0) return nullptr;

+  size_t available = 0;
+  size_t capacity = 0;
+  paddle::platform::GpuMemoryUsage(available, capacity);
+
+  // Reserve memory for page tables, etc.
+  size_t reserving = capacity - paddle::platform::GpuMaxAllocSize();
+  size_t usable = available > reserving ? available - reserving : 0;
+
+  // If remaining size no less than expected size, using general
+  // cudaMalloc to allocate GPU memory.
  void* p = 0;
-  cudaError_t result =
-      FLAGS_use_pinned_memory ? cudaMallocHost(&p, size) : cudaMalloc(&p, size);
-  if (result != cudaSuccess) {
-    cudaGetLastError();  // clear error if there is any.
+  if (size <= usable) {
+    cudaError_t result = cudaMalloc(&p, size);
+    if (result == cudaSuccess) {
+      index = 0;
+      gpu_alloc_size_ += size;
+      return p;
+    }
+  }
+
+  // If remaining size less than expected size or cudaMalloc failed,
+  // cudaMallocHost will be considered as a fallback allocator.
+  //
+  // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size
+  // of host fallback allocation. Allocates too much would reduce
+  // the amount of memory available to the underlying system for paging.
+  usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_;
+
+  if (size > usable) return nullptr;
+
+  cudaError_t result = cudaMallocHost(&p, size);
+  if (result == cudaSuccess) {
+    index = 1;
+    fallback_alloc_size_ += size;
+    return p;
  }
-  return result == cudaSuccess ? p : nullptr;
+
+  return nullptr;
 }

-void GPUAllocator::Free(void* p, size_t size) {
+void GPUAllocator::Free(void* p, size_t size, size_t index) {
+  cudaError_t err;
+
+  if (index == 0) {
+    PADDLE_ASSERT(gpu_alloc_size_ >= size);
+    gpu_alloc_size_ -= size;
+    err = cudaFree(p);
+  } else {
+    PADDLE_ASSERT(fallback_alloc_size_ >= size);
+    fallback_alloc_size_ -= size;
+    err = cudaFreeHost(p);
+  }
+
  // Purposefully allow cudaErrorCudartUnloading, because
  // that is returned if you ever call cudaFree after the
  // driver has already shutdown. This happens only if the
  // process is terminating, in which case we don't care if
  // cudaFree succeeds.
-  cudaError_t err = FLAGS_use_pinned_memory ? cudaFreeHost(p) : cudaFree(p);
  if (err != cudaErrorCudartUnloading) {
-    platform::throw_on_error(err, "cudaFree{Host} failed");
+    platform::throw_on_error(err,
+                             "cudaFree{Host} failed in GPUAllocator::Free.");
  }
 }

+bool GPUAllocator::UseGpu() const { return true; }
+
 #endif  // PADDLE_ONLY_CPU

 }  // namespace detail

--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
@@ -20,31 +20,36 @@ namespace paddle {
 namespace memory {
 namespace detail {

-// SystemAllocator is the parent class of CPUAllocator and
-// GPUAllocator.  A BuddyAllocator object uses a SystemAllocator*
-// pointing to the underlying system allocator.  An alternative to
-// this class hierarchy is to pass a system allocator class to
-// BuddyAllocator as a template parameter.  This approach makes
-// BuddyAllocator a class template, and it's very complicated
-// algorithm would make the buddy_allocator.h messy.
+/**
+ * \brief SystemAllocator is the parent class of CPUAllocator and GPUAllocator.
+ *        A BuddyAllocator object uses a SystemAllocator* pointing to the
+ *        underlying system allocator.
+ */
 class SystemAllocator {
 public:
  virtual ~SystemAllocator() {}
-  virtual void* Alloc(size_t size) = 0;
-  virtual void Free(void* p, size_t size) = 0;
+  virtual void* Alloc(size_t& index, size_t size) = 0;
+  virtual void Free(void* p, size_t size, size_t index) = 0;
+  virtual bool UseGpu() const = 0;
 };

 class CPUAllocator : public SystemAllocator {
 public:
-  virtual void* Alloc(size_t size);
-  virtual void Free(void* p, size_t size);
+  virtual void* Alloc(size_t& index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
 };

 #ifndef PADDLE_ONLY_CPU
 class GPUAllocator : public SystemAllocator {
 public:
-  virtual void* Alloc(size_t size);
-  virtual void Free(void* p, size_t size);
+  virtual void* Alloc(size_t& index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
+
+ private:
+  size_t gpu_alloc_size_ = 0;
+  size_t fallback_alloc_size_ = 0;
 };
 #endif  // PADDLE_ONLY_CPU


--- a/paddle/memory/detail/system_allocator_test.cc
+++ b/paddle/memory/detail/system_allocator_test.cc
@@ -25,7 +25,8 @@ DECLARE_bool(use_pinned_memory);
 void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
  bool freed = false;
  {
-    void* p = a.Alloc(size);
+    size_t index;
+    void* p = a.Alloc(index, size);
    if (size > 0) {
      EXPECT_NE(p, nullptr);
    } else {
@@ -35,7 +36,7 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
    int* i = static_cast<int*>(p);
    std::shared_ptr<int> ptr(i, [&](void* p) {
      freed = true;
-      a.Free(p, size);
+      a.Free(p, size, index);
    });
  }
  EXPECT_TRUE(freed);
@@ -56,14 +57,7 @@ TEST(CPUAllocator, LockMem) {
 }

 #ifndef PADDLE_ONLY_CPU
-TEST(GPUAllocator, NoStaging) {
-  FLAGS_use_pinned_memory = false;
-  paddle::memory::detail::GPUAllocator a;
-  TestAllocator(a, 2048);
-  TestAllocator(a, 0);
-}
-TEST(GPUAllocator, Staging) {
-  FLAGS_use_pinned_memory = true;
+TEST(GPUAllocator, Alloc) {
  paddle::memory::detail::GPUAllocator a;
  TestAllocator(a, 2048);
  TestAllocator(a, 0);

--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -17,43 +17,67 @@ limitations under the License. */
 #include "paddle/memory/detail/system_allocator.h"
 #include "paddle/platform/assert.h"

-#include <boost/variant.hpp>
-
 namespace paddle {
 namespace memory {

-void* Alloc(platform::Place pl, size_t size) {
-#ifndef PADDLE_ONLY_CPU
-  if (paddle::platform::is_gpu_place(pl)) {
-    size_t gpu_id = boost::get<platform::GPUPlace>(pl).device;
-    return detail::GetGPUBuddyAllocator(gpu_id)->Alloc(size);
+detail::BuddyAllocator* GetCPUBuddyAllocator() {
+  static detail::BuddyAllocator* a = nullptr;
+  if (a == nullptr) {
+    a = new detail::BuddyAllocator(new detail::CPUAllocator,
+                                   platform::CpuMinChunkSize(),
+                                   platform::CpuMaxChunkSize());
  }
-#endif  // PADDLE_ONLY_CPU
-  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
-  return detail::GetCPUBuddyAllocator()->Alloc(size);
+  return a;
 }

-void Free(paddle::platform::Place pl, void* p) {
-#ifndef PADDLE_ONLY_CPU
-  if (paddle::platform::is_gpu_place(pl)) {
-    size_t gpu_id = boost::get<platform::GPUPlace>(pl).device;
-    detail::GetGPUBuddyAllocator(gpu_id)->Free(p);
-  }
-#endif  // PADDLE_ONLY_CPU
-  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
-  detail::GetCPUBuddyAllocator()->Free(p);
+template <>
+void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
+  return GetCPUBuddyAllocator()->Alloc(size);
+}
+
+template <>
+void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
+  GetCPUBuddyAllocator()->Free(p);
+}
+
+template <>
+size_t Used<platform::CPUPlace>(platform::CPUPlace place) {
+  return GetCPUBuddyAllocator()->Used();
 }

-size_t Used(paddle::platform::Place pl) {
 #ifndef PADDLE_ONLY_CPU
-  if (paddle::platform::is_gpu_place(pl)) {
-    size_t gpu_id = boost::get<platform::GPUPlace>(pl).device;
-    return detail::GetGPUBuddyAllocator(gpu_id)->Used();
+
+detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
+  static detail::BuddyAllocator** as = NULL;
+  if (as == NULL) {
+    int gpu_num = platform::GetDeviceCount();
+    as = new detail::BuddyAllocator*[gpu_num];
+    for (int gpu = 0; gpu < gpu_num; gpu++) {
+      platform::SetDeviceId(gpu);
+      as[gpu] = new detail::BuddyAllocator(new detail::GPUAllocator,
+                                           platform::GpuMinChunkSize(),
+                                           platform::GpuMaxChunkSize());
+    }
  }
-#endif  // PADDLE_ONLY_CPU
-  PADDLE_ASSERT(paddle::platform::is_cpu_place(pl));
-  return detail::GetCPUBuddyAllocator()->Used();
+  return as[gpu_id];
+}
+
+template <>
+void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) {
+  return GetGPUBuddyAllocator(place.device)->Alloc(size);
+}
+
+template <>
+void Free<platform::GPUPlace>(platform::GPUPlace place, void* p) {
+  GetGPUBuddyAllocator(place.device)->Free(p);
+}
+
+template <>
+size_t Used<platform::GPUPlace>(platform::GPUPlace place) {
+  return GetGPUBuddyAllocator(place.device)->Used();
 }

+#endif  // PADDLE_ONLY_CPU
+
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -19,9 +19,14 @@ limitations under the License. */
 namespace paddle {
 namespace memory {

-void* Alloc(paddle::platform::Place, size_t);
-void Free(paddle::platform::Place, void*);
-size_t Used(paddle::platform::Place);
+template <class Place>
+void* Alloc(Place, size_t);
+
+template <class Place>
+void Free(Place, void*);
+
+template <class Place>
+size_t Used(Place);

 }  // namespace memory
 }  // namespace paddle
--- a/paddle/memory/memory_test.cc
+++ b/paddle/memory/memory_test.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/memory/memory.h"
+#include "paddle/memory/detail/memory_block.h"
+#include "paddle/memory/detail/meta_data.h"
+
+#include "paddle/platform/cpu_info.h"
+#include "paddle/platform/gpu_info.h"
+#include "paddle/platform/place.h"
+
+#include <gtest/gtest.h>
+#include <unordered_map>
+
+inline bool is_aligned(void const *p) {
+  return 0 == (reinterpret_cast<uintptr_t>(p) & 0x3);
+}
+
+size_t align(size_t size, paddle::platform::CPUPlace place) {
+  size += sizeof(paddle::memory::detail::Metadata);
+  size_t alignment = paddle::platform::CpuMinChunkSize();
+  size_t remaining = size % alignment;
+  return remaining == 0 ? size : size + (alignment - remaining);
+}
+
+TEST(BuddyAllocator, CPUAllocation) {
+  void *p = nullptr;
+
+  EXPECT_EQ(p, nullptr);
+
+  paddle::platform::CPUPlace cpu;
+  p = paddle::memory::Alloc(cpu, 4096);
+
+  EXPECT_NE(p, nullptr);
+
+  paddle::memory::Free(cpu, p);
+}
+
+TEST(BuddyAllocator, CPUMultAlloc) {
+  paddle::platform::CPUPlace cpu;
+
+  std::unordered_map<void *, size_t> ps;
+
+  size_t total_size = paddle::memory::Used(cpu);
+  EXPECT_EQ(total_size, 0UL);
+
+  for (auto size :
+       {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
+    ps[paddle::memory::Alloc(cpu, size)] = size;
+
+    // Buddy Allocator doesn't manage too large memory chunk
+    if (paddle::memory::Used(cpu) == total_size) continue;
+
+    size_t aligned_size = align(size, cpu);
+    total_size += aligned_size;
+    EXPECT_EQ(total_size, paddle::memory::Used(cpu));
+  }
+
+  for (auto p : ps) {
+    EXPECT_EQ(is_aligned(p.first), true);
+    paddle::memory::Free(cpu, p.first);
+
+    // Buddy Allocator doesn't manage too large memory chunk
+    if (paddle::memory::Used(cpu) == total_size) continue;
+
+    size_t aligned_size = align(p.second, cpu);
+    total_size -= aligned_size;
+    EXPECT_EQ(total_size, paddle::memory::Used(cpu));
+  }
+}
+
+#ifndef PADDLE_ONLY_CPU
+
+size_t align(size_t size, paddle::platform::GPUPlace place) {
+  size += sizeof(paddle::memory::detail::Metadata);
+  size_t alignment = paddle::platform::GpuMinChunkSize();
+  size_t remaining = size % alignment;
+  return remaining == 0 ? size : size + (alignment - remaining);
+}
+
+TEST(BuddyAllocator, GPUAllocation) {
+  void *p = nullptr;
+
+  EXPECT_EQ(p, nullptr);
+
+  paddle::platform::GPUPlace gpu(0);
+  p = paddle::memory::Alloc(gpu, 4096);
+
+  EXPECT_NE(p, nullptr);
+
+  paddle::memory::Free(gpu, p);
+}
+
+TEST(BuddyAllocator, GPUMultAlloc) {
+  paddle::platform::GPUPlace gpu;
+
+  std::unordered_map<void *, size_t> ps;
+
+  size_t total_size = paddle::memory::Used(gpu);
+  EXPECT_EQ(total_size, 0UL);
+
+  for (auto size :
+       {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
+    ps[paddle::memory::Alloc(gpu, size)] = size;
+
+    // Buddy Allocator doesn't manage too large memory chunk
+    if (paddle::memory::Used(gpu) == total_size) continue;
+
+    size_t aligned_size = align(size, gpu);
+    total_size += aligned_size;
+    EXPECT_EQ(total_size, paddle::memory::Used(gpu));
+  }
+
+  for (auto p : ps) {
+    EXPECT_EQ(is_aligned(p.first), true);
+    paddle::memory::Free(gpu, p.first);
+
+    // Buddy Allocator doesn't manage too large memory chunk
+    if (paddle::memory::Used(gpu) == total_size) continue;
+
+    size_t aligned_size = align(p.second, gpu);
+    total_size -= aligned_size;
+    EXPECT_EQ(total_size, paddle::memory::Used(gpu));
+  }
+}
+
+#endif  // PADDLE_ONLY_CPU
--- a/paddle/operators/add_op.cc
+++ b/paddle/operators/add_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #include <paddle/framework/op_registry.h>
 #include <paddle/framework/tensor.h>
 #include <paddle/operators/add_op.h>
@@ -36,9 +50,9 @@ The equation is: Out = X + Y
 )DOC");
  }
 };
-}  // namespace op
+}  // namespace operators
 }  // namespace paddle

 REGISTER_OP(add_two, paddle::operators::AddOp, paddle::operators::AddOpMaker);
 REGISTER_OP_CPU_KERNEL(
-    add_two, ::paddle::operators::AddKernel<::paddle::platform::CPUPlace>);
\ No newline at end of file
+    add_two, ::paddle::operators::AddKernel<::paddle::platform::CPUPlace>);
--- a/paddle/optimizer/parameter_optimizer_test.cpp
+++ b/paddle/optimizer/parameter_optimizer_test.cpp
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
 #include "parameter_optimizer.h"
 #include <cmath>
 #include <map>
@@ -5,21 +21,18 @@
 #include "gtest/gtest.h"
 #include "lr_policy.h"

-using namespace paddle;
-using namespace paddle::optimizer;
-
-Tensor* FillTensor(size_t size) {
-  Tensor* param = new Tensor(size);
-  Tensor& p = *param;
+paddle::optimizer::Tensor* FillTensor(size_t size) {
+  paddle::optimizer::Tensor* param = new paddle::optimizer::Tensor(size);
+  paddle::optimizer::Tensor& p = *param;
  for (size_t i = 0; i < p.size(); ++i) {
    p[i] = (float)rand() / (float)RAND_MAX;
  }
  return param;
 }

-Tensor* FixedTensor(size_t size) {
-  Tensor* param = new Tensor(size);
-  Tensor& p = *param;
+paddle::optimizer::Tensor* FixedTensor(size_t size) {
+  paddle::optimizer::Tensor* param = new paddle::optimizer::Tensor(size);
+  paddle::optimizer::Tensor& p = *param;
  for (size_t i = 0; i < p.size(); ++i) {
    p[i] = i;
  }
@@ -28,7 +41,8 @@ Tensor* FixedTensor(size_t size) {

 class OptimizerTest : public testing::Test {
 public:
-  // init tensor shape
+  virtual ~OptimizerTest() {}
+  // init paddle::optimizer::Tensor shape
  const size_t kSize = 5;

  virtual void SetUp() {
@@ -38,34 +52,36 @@ public:
  virtual void TearDown() {}

  void CreateSGD() {
-    Tensor* parameter = FixedTensor(kSize);
-    config_.set_optimizer(OptimizerConfig::SGD);
+    paddle::optimizer::Tensor* parameter = FixedTensor(kSize);
+    config_.set_optimizer(paddle::OptimizerConfig::SGD);
    config_.mutable_sgd()->set_momentum(0.0);
    config_.mutable_sgd()->set_decay(0.0);
    config_.mutable_sgd()->set_nesterov(false);
-    config_.set_lr_policy(OptimizerConfig::Const);
+    config_.set_lr_policy(paddle::OptimizerConfig::Const);
    config_.mutable_const_lr()->set_learning_rate(0.1);
    std::string str = config_.SerializeAsString();
-    ParameterOptimizer* opt = ParameterOptimizer::Create(str, parameter);
+    paddle::optimizer::ParameterOptimizer* opt =
+        paddle::optimizer::ParameterOptimizer::Create(str, parameter);
    opts_.push_back(opt);
  }

  void CreateAdam() {
-    Tensor* parameter = FixedTensor(kSize);
-    config_.set_optimizer(OptimizerConfig::Adam);
+    paddle::optimizer::Tensor* parameter = FixedTensor(kSize);
+    config_.set_optimizer(paddle::OptimizerConfig::Adam);
    config_.mutable_adam()->set_beta_1(0.9);
    config_.mutable_adam()->set_beta_2(0.1);
    config_.mutable_adam()->set_epsilon(1e-3);
    config_.mutable_adam()->set_decay(0.0);
-    config_.set_lr_policy(OptimizerConfig::Const);
+    config_.set_lr_policy(paddle::OptimizerConfig::Const);
    config_.mutable_const_lr()->set_learning_rate(0.1);
    std::string str = config_.SerializeAsString();
-    ParameterOptimizer* opt = ParameterOptimizer::Create(str, parameter);
+    paddle::optimizer::ParameterOptimizer* opt =
+        paddle::optimizer::ParameterOptimizer::Create(str, parameter);
    opts_.push_back(opt);
  }

  void TestGetWeight() {
-    Tensor* p = FixedTensor(kSize);
+    paddle::optimizer::Tensor* p = FixedTensor(kSize);
    for (size_t i = 0; i < opts_.size(); ++i) {
      int s = 0;
      float* newp = (float*)opts_[i]->get_weight(&s);
@@ -76,7 +92,7 @@ public:
  }

  void TestUpdate() {
-    Tensor* g = FixedTensor(kSize);
+    paddle::optimizer::Tensor* g = FixedTensor(kSize);
    for (size_t i = 0; i < opts_.size(); ++i) {
      opts_[i]->Update(g);
    }
@@ -91,8 +107,8 @@ public:
  }

 private:
-  std::vector<ParameterOptimizer*> opts_;
-  OptimizerConfig config_;
+  std::vector<paddle::optimizer::ParameterOptimizer*> opts_;
+  paddle::OptimizerConfig config_;
 };

 TEST_F(OptimizerTest, TestGetWeight) { TestGetWeight(); }

--- a/paddle/optimizer/serialization_test.cpp
+++ b/paddle/optimizer/serialization_test.cpp
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
 #include "serialization.h"
 #include "gtest/gtest.h"

-using namespace paddle;
-using namespace paddle::optimizer;
-
 TEST(TensorToProto, Case1) {
-  Tensor t(3), t1(3);
+  paddle::optimizer::Tensor t(3), t1(3);
  for (size_t i = 0; i < t.size(); ++i) {
    t[i] = i;
    t1[i] = 0;
  }

-  TensorProto proto;
-  TensorToProto(t, &proto);
-  ProtoToTensor(proto, &t1);
+  paddle::TensorProto proto;
+  paddle::optimizer::TensorToProto(t, &proto);
+  paddle::optimizer::ProtoToTensor(proto, &t1);
  for (size_t i = 0; i < t1.size(); ++i) {
    EXPECT_EQ(t1[i], t[i]);
  }

--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
-add_subdirectory(dynload)
+cc_library(cpu_info SRCS cpu_info.cc DEPS gflags glog)
+cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)

-nv_test(cuda_test SRCS cuda_test.cu)
+nv_library(gpu_info SRCS gpu_info.cc DEPS gflags)

 cc_library(place SRCS place.cc)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)

+add_subdirectory(dynload)
+
 IF(WITH_GPU)
    set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
 ELSE()
@@ -12,4 +15,4 @@ ELSE()
 ENDIF()

 cc_library(device_context SRCS device_context.cc DEPS place eigen3 ${GPU_CTX_DEPS})
-nv_test(device_context_test SRCS device_context_test.cc DEPS device_context glog gflags)
+nv_test(device_context_test SRCS device_context_test.cc DEPS device_context gpu_info)
--- a/paddle/platform/cpu_info.cc
+++ b/paddle/platform/cpu_info.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/cpu_info.h"
+
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+#include <sys/types.h>
+#else
+#include <unistd.h>
+#endif
+
+#include "gflags/gflags.h"
+#include "paddle/platform/error.h"
+
+DEFINE_double(fraction_of_cpu_memory_to_use, 1,
+              "Default use 100% of CPU memory for PaddlePaddle,"
+              "reserve the rest for page tables, etc");
+
+namespace paddle {
+namespace platform {
+
+inline size_t CpuTotalPhysicalMemory() {
+#ifdef __APPLE__
+  int mib[2];
+  mib[0] = CTL_HW;
+  mib[1] = HW_MEMSIZE;
+  int64_t size = 0;
+  size_t len = sizeof(size);
+  if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size;
+  return 0L;
+#else
+  int64_t pages = sysconf(_SC_PHYS_PAGES);
+  int64_t page_size = sysconf(_SC_PAGE_SIZE);
+  return pages * page_size;
+#endif
+}
+
+size_t CpuMaxAllocSize() {
+  // For distributed systems, it requires configuring and limiting
+  // the fraction of memory to use.
+  return FLAGS_fraction_of_cpu_memory_to_use * CpuTotalPhysicalMemory();
+}
+
+size_t CpuMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 4 KB.
+  return 1 << 12;
+}
+
+size_t CpuMaxChunkSize() {
+  // Allow to allocate the maximum chunk size is roughly 3% of CPU memory.
+  return CpuMaxAllocSize() / 32;
+}
+
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/platform/cpu_info.h
+++ b/paddle/platform/cpu_info.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <stddef.h>
+
+namespace paddle {
+namespace platform {
+
+//! Get the maximum allocation size for a machine.
+size_t CpuMaxAllocSize();
+
+//! Get the minimum chunk size for buddy allocator.
+size_t CpuMinChunkSize();
+
+//! Get the maximum chunk size for buddy allocator.
+size_t CpuMaxChunkSize();
+
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/platform/cpu_info_test.cc
+++ b/paddle/platform/cpu_info_test.cc
+#include "paddle/platform/cpu_info.h"
+#include "paddle/string/printf.h"
+
+#include <ostream>
+#include <sstream>
+
+#include "gflags/gflags.h"
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+DECLARE_double(fraction_of_cpu_memory_to_use);
+
+TEST(CpuMemoryUsage, Print) {
+  std::stringstream ss;
+  size_t memory_size = paddle::platform::CpuMaxAllocSize() / 1024 / 1024 / 1024;
+  float use_percent = FLAGS_fraction_of_cpu_memory_to_use * 100;
+
+  std::cout << paddle::string::Sprintf("\n%.2f %% of CPU Memory Usage: %d GB\n",
+                                       use_percent, memory_size)
+            << std::endl;
+}
--- a/paddle/platform/cuda_test.cu
+++ b/paddle/platform/cuda_test.cu
-#include <cuda_runtime.h>
-#include <stdio.h>
-#include "gtest/gtest.h"
-
-#define CHECK_ERR(x)                 \
-  if (x != cudaSuccess) {            \
-    fprintf(stderr,                  \
-            "%s in %s at line %d\n", \
-            cudaGetErrorString(err), \
-            __FILE__,                \
-            __LINE__);               \
-    exit(-1);                        \
-  }
-
-__global__ void vecAdd(float *d_A, float *d_B, float *d_C, int n) {
-  int i = blockDim.x * blockIdx.x + threadIdx.x;
-  if (i < n) {
-    d_C[i] = d_A[i] + d_B[i];
-  }
-}
-
-TEST(Cuda, Equality) {
-  int n = 10;
-  // Memory allocation for h_A, h_B and h_C (in the host)
-  float h_A[10] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 0.0};
-  float h_B[10] = {0.0, 9.0, 8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0};
-  float h_C[10];
-  float *d_A, *d_B, *d_C;
-  cudaError_t err;
-  // Memory allocation for d_A, d_B and d_C (in the device)
-  err = cudaMalloc((void **)&d_A, sizeof(float) * n);
-  CHECK_ERR(err);
-
-  err = cudaMalloc((void **)&d_B, sizeof(float) * n);
-  CHECK_ERR(err);
-
-  err = cudaMalloc((void **)&d_C, sizeof(float) * n);
-  CHECK_ERR(err);
-
-  // Copying memory to device
-  err = cudaMemcpy(d_A, h_A, sizeof(float) * n, cudaMemcpyHostToDevice);
-  CHECK_ERR(err);
-
-  err = cudaMemcpy(d_B, h_B, sizeof(float) * n, cudaMemcpyHostToDevice);
-  CHECK_ERR(err);
-
-  // Calling the kernel
-  vecAdd<<<ceil(n / 256.0), 256>>>(d_A, d_B, d_C, n);
-
-  // Copying results back to host
-  err = cudaMemcpy(h_C, d_C, sizeof(float) * n, cudaMemcpyDeviceToHost);
-  CHECK_ERR(err);
-
-  EXPECT_EQ(h_C[0], 1.0);
-  for (int i = 1; i < n - 1; ++i) {
-    EXPECT_EQ(h_C[i], 11.0);
-  }
-  EXPECT_EQ(h_C[9], 1.0);
-}
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -13,10 +13,11 @@ limitations under the License. */

 #include "paddle/framework/enforce.h"
 #ifndef PADDLE_ONLY_CPU
-#include "paddle/platform/cuda.h"
 #include "paddle/platform/dynload/cublas.h"
 #include "paddle/platform/dynload/cudnn.h"
 #include "paddle/platform/dynload/curand.h"
+#include "paddle/platform/error.h"
+#include "paddle/platform/gpu_info.h"
 #define EIGEN_USE_GPU
 #endif
 #include <paddle/platform/place.h>

--- a/paddle/platform/error.h
+++ b/paddle/platform/error.h
+#pragma once
+
+#include <sstream>
+#include <stdexcept>
+#include <string>
+
+#ifndef PADDLE_ONLY_CPU
+
+#include <cublas_v2.h>
+#include <cudnn.h>
+#include <curand.h>
+#include <thrust/system/cuda/error.h>
+#include <thrust/system_error.h>
+
+#endif  // PADDLE_ONLY_CPU
+
+namespace paddle {
+namespace platform {
+
+#ifndef PADDLE_ONLY_CPU
+
+inline void throw_on_error(cudaError_t e, const char* message) {
+  if (e) {
+    throw thrust::system_error(e, thrust::cuda_category(), message);
+  }
+}
+
+inline void throw_on_error(curandStatus_t stat, const char* message) {
+  if (stat != CURAND_STATUS_SUCCESS) {
+    throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(),
+                               message);
+  }
+}
+
+inline void throw_on_error(cudnnStatus_t stat, const char* message) {
+  std::stringstream ss;
+  if (stat == CUDNN_STATUS_SUCCESS) {
+    return;
+  } else {
+    ss << cudnnGetErrorString(stat);
+    ss << ", " << message;
+    throw std::runtime_error(ss.str());
+  }
+}
+
+inline void throw_on_error(cublasStatus_t stat, const char* message) {
+  std::stringstream ss;
+  if (stat == CUBLAS_STATUS_SUCCESS) {
+    return;
+  } else if (stat == CUBLAS_STATUS_NOT_INITIALIZED) {
+    ss << "CUBLAS: not initialized";
+  } else if (stat == CUBLAS_STATUS_ALLOC_FAILED) {
+    ss << "CUBLAS: alloc failed";
+  } else if (stat == CUBLAS_STATUS_INVALID_VALUE) {
+    ss << "CUBLAS: invalid value";
+  } else if (stat == CUBLAS_STATUS_ARCH_MISMATCH) {
+    ss << "CUBLAS: arch mismatch";
+  } else if (stat == CUBLAS_STATUS_MAPPING_ERROR) {
+    ss << "CUBLAS: mapping error";
+  } else if (stat == CUBLAS_STATUS_EXECUTION_FAILED) {
+    ss << "CUBLAS: execution failed";
+  } else if (stat == CUBLAS_STATUS_INTERNAL_ERROR) {
+    ss << "CUBLAS: internal error";
+  } else if (stat == CUBLAS_STATUS_NOT_SUPPORTED) {
+    ss << "CUBLAS: not supported";
+  } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) {
+    ss << "CUBLAS: license error";
+  }
+  ss << ", " << message;
+  throw std::runtime_error(ss.str());
+}
+
+inline void throw_on_error(cublasStatus_t stat) {
+  const char* message = "";
+  throw_on_error(stat, message);
+}
+
+#endif  // PADDLE_ONLY_CPU
+
+inline void throw_on_error(int stat, const char* message) {
+  if (stat) {
+    throw std::runtime_error(message + (", stat = " + std::to_string(stat)));
+  }
+}
+
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/gpu_info.h"
+#include "gflags/gflags.h"
+#include "paddle/platform/error.h"
+
+DEFINE_double(fraction_of_gpu_memory_to_use, 0.95,
+              "Default use 95% of GPU memory for PaddlePaddle,"
+              "reserve the rest for page tables, etc");
+
+namespace paddle {
+namespace platform {
+
+int GetDeviceCount() {
+  int count;
+  throw_on_error(
+      cudaGetDeviceCount(&count),
+      "cudaGetDeviceCount failed in paddle::platform::GetDeviceCount");
+  return count;
+}
+
+int GetCurrentDeviceId() {
+  int device_id;
+  throw_on_error(
+      cudaGetDevice(&device_id),
+      "cudaGetDevice failed in paddle::platform::GetCurrentDeviceId");
+  return device_id;
+}
+
+void SetDeviceId(int id) {
+  throw_on_error(cudaSetDevice(id),
+                 "cudaSetDevice failed in paddle::platform::SetDeviceId");
+}
+
+void GpuMemoryUsage(size_t& available, size_t& total) {
+  throw_on_error(cudaMemGetInfo(&available, &total),
+                 "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage");
+}
+
+size_t GpuMaxAllocSize() {
+  size_t total = 0;
+  size_t available = 0;
+
+  GpuMemoryUsage(available, total);
+
+  // Reserve the rest for page tables, etc.
+  return static_cast<size_t>(total * FLAGS_fraction_of_gpu_memory_to_use);
+}
+
+size_t GpuMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 256 bytes.
+  return 1 << 8;
+}
+
+size_t GpuMaxChunkSize() {
+  size_t total = 0;
+  size_t available = 0;
+
+  GpuMemoryUsage(available, total);
+
+  // Reserving the rest memory for page tables, etc.
+  size_t reserving = (1 - FLAGS_fraction_of_gpu_memory_to_use) * total;
+
+  // If available less than minimum chunk size, no usable memory exists.
+  available = std::max(available, GpuMinChunkSize()) - GpuMinChunkSize();
+
+  // If available less than reserving, no usable memory exists.
+  size_t usable = std::max(available, reserving) - reserving;
+
+  return usable;
+}
+
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/platform/cuda.h
+++ b/paddle/platform/cuda.h
@@ -16,33 +16,31 @@ limitations under the License. */

 #ifndef PADDLE_ONLY_CPU

-#include <thrust/system/cuda/error.h>
-#include <thrust/system_error.h>
+#include <stddef.h>

 namespace paddle {
 namespace platform {

-inline void throw_on_error(cudaError_t e, const char* message) {
-  if (e) {
-    throw thrust::system_error(e, thrust::cuda_category(), message);
-  }
-}
-
-inline int GetDeviceCount(void) {
-  int count;
-  throw_on_error(cudaGetDeviceCount(&count), "cudaGetDeviceCount failed");
-  return count;
-}
-
-inline int GetCurrentDeviceId(void) {
-  int device_id;
-  throw_on_error(cudaGetDevice(&device_id), "cudaGetDevice failed");
-  return device_id;
-}
-
-inline void SetDeviceId(int device_id) {
-  throw_on_error(cudaSetDevice(device_id), "cudaSetDevice failed");
-}
+//! Get the total number of GPU devices in system.
+int GetDeviceCount();
+
+//! Get the current GPU device id in system.
+int GetCurrentDeviceId();
+
+//! Set the GPU device id for next execution.
+void SetDeviceId(int device_id);
+
+//！Get the memory usage of current GPU device.
+void GpuMemoryUsage(size_t& available, size_t& total);
+
+//! Get the maximum allocation size of current GPU device.
+size_t GpuMaxAllocSize();
+
+//! Get the minimum chunk size for GPU buddy allocator.
+size_t GpuMinChunkSize();
+
+//! Get the maximum chunk size for GPU buddy allocator.
+size_t GpuMaxChunkSize();

 }  // namespace platform
 }  // namespace paddle

--- a/paddle/platform/place.cc
+++ b/paddle/platform/place.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #include "paddle/platform/place.h"

 namespace paddle {
@@ -7,7 +21,7 @@ namespace detail {

 class PlacePrinter : public boost::static_visitor<> {
 public:
-  PlacePrinter(std::ostream &os) : os_(os) {}
+  explicit PlacePrinter(std::ostream &os) : os_(os) {}
  void operator()(const CPUPlace &) { os_ << "CPUPlace"; }
  void operator()(const GPUPlace &p) { os_ << "GPUPlace(" << p.device << ")"; }


--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
-cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python)
+cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python add_op)
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -13,12 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include <Python.h>
+#include <paddle/framework/op_registry.h>
 #include <paddle/framework/scope.h>
 #include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <fstream>
+#include <vector>

 namespace py = pybind11;
 namespace pd = paddle::framework;

+USE_OP(add_two);
+
 PYBIND11_PLUGIN(core) {
  py::module m("core", "C++ core of Paddle Paddle");

@@ -43,5 +49,20 @@ All parameter, weight, gradient are variables in Paddle.
           &pd::Scope::CreateVariable,
           py::return_value_policy::reference);

+  //! @note: Be careful! PyBind will return std::string as an unicode, not
+  //! Python str. If you want a str object, you should cast them in Python.
+  m.def("get_all_op_protos", []() -> std::vector<std::string> {
+    auto& protos = pd::OpRegistry::protos();
+    std::vector<std::string> ret_values;
+    for (auto it = protos.begin(); it != protos.end(); ++it) {
+      PADDLE_ENFORCE(it->second.IsInitialized(),
+                     "OpProto must all be initialized");
+      ret_values.emplace_back();
+      PADDLE_ENFORCE(it->second.SerializeToString(&ret_values.back()),
+                     "Serialize OpProto Error. This could be a bug of Paddle.");
+    }
+    return ret_values;
+  });
+
  return m.ptr();
 }
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -155,7 +155,8 @@ RUN apt-get update &&\
    paddle version
 ${DOCKERFILE_CUDNN_DSO}
 ${DOCKERFILE_GPU_ENV}
-
+ADD go/cmd/pserver/pserver /usr/bin/
+ADD go/cmd/master/master /usr/bin/
 # default command shows the paddle version and exit
 CMD ["paddle", "version"]
 EOF
--- a/paddle/string/piece.h
+++ b/paddle/string/piece.h
@@ -35,7 +35,7 @@ public:

  // We provide non-explicit singleton constructors so users can
  // pass in a "const char*" or a "string" wherever a "Piece"
-  // is expected.  These contructors ensure that if data_ is NULL,
+  // is expected.  These constructors ensure that if data_ is NULL,
  // size_ is 0.
  Piece();
  Piece(const char* d, size_t n);

--- a/paddle/trainer/NewRemoteParameterUpdater.cpp
+++ b/paddle/trainer/NewRemoteParameterUpdater.cpp
@@ -28,6 +28,17 @@ NewRemoteParameterUpdater::NewRemoteParameterUpdater(
      newGradients_(nullptr),
      pserverSpec_(pserverSpec) {}

+NewRemoteParameterUpdater::NewRemoteParameterUpdater(
+    const OptimizationConfig &config,
+    const std::string pserverSpec,
+    const bool useEtcd)
+    : trainerConfig_(config),
+      parameterClient_(-1),
+      newParameters_(nullptr),
+      newGradients_(nullptr),
+      pserverSpec_(pserverSpec),
+      useEtcd_(useEtcd) {}
+
 void NewRemoteParameterUpdater::init(
    const std::vector<ParameterPtr> &parameters) {
  ParameterUpdater::init(parameters);
@@ -38,8 +49,13 @@ void NewRemoteParameterUpdater::init(
  }

  // create parameter server client.
-  parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(),
-                                               FLAGS_trainer_id == 0);
+  if (useEtcd_) {
+    parameterClient_ = paddle_new_etcd_pserver_client(
+        (char *)pserverSpec_.c_str(), FLAGS_trainer_id == 0);
+  } else {
+    parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(),
+                                                 FLAGS_trainer_id == 0);
+  }

  // init new parameter and gradient.
  newParameters_ = initNewParameter(PARAMETER_VALUE);

--- a/paddle/trainer/NewRemoteParameterUpdater.h
+++ b/paddle/trainer/NewRemoteParameterUpdater.h
@@ -32,6 +32,9 @@ class NewRemoteParameterUpdater : public ParameterUpdater {
 public:
  NewRemoteParameterUpdater(const OptimizationConfig& config,
                            const std::string pserverSpec);
+  NewRemoteParameterUpdater(const OptimizationConfig& config,
+                            const std::string pserverSpec,
+                            const bool useEtcd);
  ~NewRemoteParameterUpdater() {
    releaseNewParameter(newParameters_);
    releaseNewParameter(newGradients_);
@@ -111,6 +114,8 @@ protected:
  paddle_parameter** newGradients_;
  /// the specification of parameter server "host1:port,host1:port"
  std::string pserverSpec_;
+  /// true if pserverSpec_ is etcd endpoint, else pserverSpec_ is pserver addr
+  bool useEtcd_;
 };

 }  // namespace paddle
--- a/paddle/trainer/TrainerConfigHelper.cpp
+++ b/paddle/trainer/TrainerConfigHelper.cpp
@@ -62,11 +62,7 @@ TrainerConfigHelper::TrainerConfigHelper(const TrainerConfig &config)
  m->conf = config;
 }

-TrainerConfigHelper::~TrainerConfigHelper() {
-  if (m) {
-    delete m;
-  }
-}
+TrainerConfigHelper::~TrainerConfigHelper() { delete m; }

 const TrainerConfig &TrainerConfigHelper::getConfig() const { return m->conf; }


--- a/paddle/utils/DynamicLoader.h
+++ b/paddle/utils/DynamicLoader.h
@@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#ifndef DYNAMIC_LOAD_H_
-#define DYNAMIC_LOAD_H_
+#pragma once

 #include <dlfcn.h>
 #include <memory>
@@ -59,5 +58,3 @@ void GetWarpCTCDsoHandle(void** dso_handle);
 *
 */
 void GetLapackDsoHandle(void** dso_handle);
-
-#endif  // DYNAMIC_LOAD_H_
--- a/paddle/utils/ThreadLocal.h
+++ b/paddle/utils/ThreadLocal.h
@@ -51,7 +51,7 @@ template <class T>
 class ThreadLocal {
 public:
  ThreadLocal() {
-    CHECK(pthread_key_create(&threadSpecificKey_, dataDestructor) == 0);
+    CHECK_EQ(pthread_key_create(&threadSpecificKey_, dataDestructor), 0);
  }
  ~ThreadLocal() { pthread_key_delete(threadSpecificKey_); }

@@ -65,7 +65,7 @@ public:
    if (!p && createLocal) {
      p = new T();
      int ret = pthread_setspecific(threadSpecificKey_, p);
-      CHECK(ret == 0);
+      CHECK_EQ(ret, 0);
    }
    return p;
  }
@@ -79,7 +79,7 @@ public:
    if (T* q = get(false)) {
      dataDestructor(q);
    }
-    CHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
+    CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
  }

  /**
@@ -112,7 +112,7 @@ private:
 template <class T>
 class ThreadLocalD {
 public:
-  ThreadLocalD() { CHECK(pthread_key_create(&threadSpecificKey_, NULL) == 0); }
+  ThreadLocalD() { CHECK_EQ(pthread_key_create(&threadSpecificKey_, NULL), 0); }
  ~ThreadLocalD() {
    pthread_key_delete(threadSpecificKey_);
    for (auto t : threadMap_) {
@@ -127,7 +127,7 @@ public:
    T* p = (T*)pthread_getspecific(threadSpecificKey_);
    if (!p) {
      p = new T();
-      CHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
+      CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
      updateMap(p);
    }
    return p;
@@ -141,7 +141,7 @@ public:
    if (T* q = (T*)pthread_getspecific(threadSpecificKey_)) {
      dataDestructor(q);
    }
-    CHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
+    CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
    updateMap(p);
  }


--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -126,6 +126,7 @@ __all__ = [
    'row_conv_layer',
    'dropout_layer',
    'prelu_layer',
+    'gated_unit_layer',
 ]


@@ -5862,7 +5863,7 @@ def prelu_layer(input,
    :rtype: LayerOutput
    """

-    assert isinstance(input, LayerOutput), 'prelu_layer only accepts one input'
+    assert isinstance(input, LayerOutput), 'prelu_layer accepts only one input.'
    assert isinstance(param_attr, ParameterAttribute)

    l = Layer(
@@ -5876,3 +5877,96 @@ def prelu_layer(input,
        layer_type=LayerType.PRELU,
        parents=input,
        size=l.config.size)
+
+
+@wrap_name_default()
+@layer_support(ERROR_CLIPPING, DROPOUT)
+@wrap_act_default(act=LinearActivation())
+def gated_unit_layer(input,
+                     size,
+                     act=None,
+                     name=None,
+                     gate_attr=None,
+                     gate_param_attr=None,
+                     gate_bias_attr=True,
+                     inproj_attr=None,
+                     inproj_param_attr=None,
+                     inproj_bias_attr=True,
+                     layer_attr=None):
+    """
+    The gated unit layer implements a simple gating mechanism over the input.
+    The input :math:`X` is first projected into a new space :math:`X'`, and
+    it is also used to produce a gate weight :math:`\sigma`. Element-wise
+    prodict between :match:`X'` and :math:`\sigma` is finally returned.
+
+    Reference:
+        Language Modeling with Gated Convolutional Networks
+        https://arxiv.org/abs/1612.08083
+
+    .. math::
+       y=\\text{act}(X \cdot W + b)\otimes \sigma(X \cdot V + c)
+
+    The example usage is:
+
+    .. code-block:: python
+        gated_unit = gated_unit_layer(size=128, input=input_layer))
+
+    :param input: input for this layer.
+    :type input: LayerOutput
+    :param size: output size of the gated unit.
+    :type size: int
+    :param act: activation type of the projected input.
+    :type act: BaseActivation
+    :param name: name of this layer.
+    :type name: basestring
+    :param gate_attr: Attributes to tune the gate output, for example, error
+        clipping threshold, dropout and so on. See ExtraLayerAttribute for
+        more details.
+    :type gate_attr: ExtraLayerAttribute|None
+    :param gate_param_attr: Attributes to tune the learnable projected matrix
+        parameter of the gate.
+    :type gate_param_attr: ParameterAttribute|None
+    :param gate_bias_attr: Attributes to tune the learnable bias of the gate.
+    :type gate_bias_attr: ParameterAttribute|None
+    :param inproj_attr: Attributes to the tune the projected input, for
+        example, error clipping threshold, dropout and so on. See
+        ExtraLayerAttribute for more details.
+    :type inproj_attr: ExtraLayerAttribute|None
+    :param inproj_param_attr: Attributes to tune the learnable parameter of
+        the projection of input.
+    :type inproj_param_attr: ParameterAttribute|None
+    :param inproj_bias_attr: Attributes to tune the learnable bias of
+        projection of the input.
+    :type inproj_bias_attr: ParameterAttribute|None
+    :param layer_attr: Attributes to tune the final output of the gated unit,
+        for example, error clipping threshold, dropout and so on. See
+        ExtraLayerAttribute for more details.
+    :type layer_attr: ExtraLayerAttribute|None
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    assert isinstance(
+        input, LayerOutput), 'The gated linear unit accepts only one input.'
+
+    input_proj = fc_layer(
+        input=input,
+        name="%s_input_proj" % name,
+        size=size,
+        act=act,
+        layer_attr=inproj_attr,
+        param_attr=inproj_param_attr,
+        bias_attr=inproj_bias_attr)
+
+    gate = fc_layer(
+        size=size,
+        name="%s_gate" % name,
+        act=SigmoidActivation(),
+        input=input,
+        layer_attr=gate_attr,
+        param_attr=gate_param_attr,
+        bias_attr=gate_bias_attr)
+    return mixed_layer(
+        name="%s_gated_act" % name,
+        input=dotmul_operator(input_proj, gate),
+        layer_attr=layer_attr)
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -7,6 +7,6 @@ test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
 test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
 test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
 test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer
-test_recursive_topology)
+test_recursive_topology test_gated_unit_layer)

 export whole_configs=(test_split_datasource)
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr
+type: "nn"
+layers {
+  name: "input"
+  type: "data"
+  size: 256
+  active_type: ""
+}
+layers {
+  name: "__gated_unit_layer_0___input_proj"
+  type: "fc"
+  size: 512
+  active_type: "tanh"
+  inputs {
+    input_layer_name: "input"
+    input_parameter_name: "___gated_unit_layer_0___input_proj.w0"
+  }
+  bias_parameter_name: "___gated_unit_layer_0___input_proj.wbias"
+  error_clipping_threshold: 100.0
+}
+layers {
+  name: "__gated_unit_layer_0___gate"
+  type: "fc"
+  size: 512
+  active_type: "sigmoid"
+  inputs {
+    input_layer_name: "input"
+    input_parameter_name: "___gated_unit_layer_0___gate.w0"
+  }
+  bias_parameter_name: "___gated_unit_layer_0___gate.wbias"
+  error_clipping_threshold: 100.0
+}
+layers {
+  name: "__gated_unit_layer_0___gated_act"
+  type: "mixed"
+  size: 512
+  active_type: ""
+  inputs {
+    input_layer_name: "__gated_unit_layer_0___input_proj"
+  }
+  inputs {
+    input_layer_name: "__gated_unit_layer_0___gate"
+  }
+  error_clipping_threshold: 100.0
+  operator_confs {
+    type: "dot_mul"
+    input_indices: 0
+    input_indices: 1
+    input_sizes: 512
+    input_sizes: 512
+    output_size: 512
+    dotmul_scale: 1
+  }
+}
+parameters {
+  name: "___gated_unit_layer_0___input_proj.w0"
+  size: 131072
+  initial_mean: 0.0
+  initial_std: 0.0001
+  dims: 256
+  dims: 512
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___gated_unit_layer_0___input_proj.wbias"
+  size: 512
+  initial_mean: 0.0
+  initial_std: 1
+  dims: 1
+  dims: 512
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___gated_unit_layer_0___gate.w0"
+  size: 131072
+  initial_mean: 0.0
+  initial_std: 0.0001
+  dims: 256
+  dims: 512
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___gated_unit_layer_0___gate.wbias"
+  size: 512
+  initial_mean: 0.0
+  initial_std: 1
+  dims: 1
+  dims: 512
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "input"
+output_layer_names: "__gated_unit_layer_0___gated_act"
+sub_models {
+  name: "root"
+  layer_names: "input"
+  layer_names: "__gated_unit_layer_0___input_proj"
+  layer_names: "__gated_unit_layer_0___gate"
+  layer_names: "__gated_unit_layer_0___gated_act"
+  input_layer_names: "input"
+  output_layer_names: "__gated_unit_layer_0___gated_act"
+  is_recurrent_layer_group: false
+}
+
--- a/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='input', size=256)
+glu = gated_unit_layer(
+    size=512,
+    input=data,
+    act=TanhActivation(),
+    gate_attr=ExtraLayerAttribute(error_clipping_threshold=100.0),
+    gate_param_attr=ParamAttr(initial_std=1e-4),
+    gate_bias_attr=ParamAttr(initial_std=1),
+    inproj_attr=ExtraLayerAttribute(error_clipping_threshold=100.0),
+    inproj_param_attr=ParamAttr(initial_std=1e-4),
+    inproj_bias_attr=ParamAttr(initial_std=1),
+    layer_attr=ExtraLayerAttribute(error_clipping_threshold=100.0))
+
+outputs(glu)
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
@@ -20,7 +20,6 @@ import trainer
 import event
 import data_type
 import topology
-import data_feeder
 import networks
 import evaluator
 from . import dataset
@@ -31,7 +30,6 @@ import op
 import pooling
 import inference
 import networks
-import py_paddle.swig_paddle as api
 import minibatch
 import plot
 import image
@@ -47,7 +45,6 @@ __all__ = [
    'data_type',
    'attr',
    'pooling',
-    'data_feeder',
    'dataset',
    'reader',
    'topology',
@@ -61,6 +58,7 @@ __all__ = [


 def init(**kwargs):
+    import py_paddle.swig_paddle as api
    args = []
    args_dict = {}
    # NOTE: append arguments if they are in ENV

--- a/python/paddle/v2/data_feeder.py
+++ b/python/paddle/v2/data_feeder.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 from py_paddle import DataProviderConverter
 import collections
 import paddle.trainer.PyDataProvider2 as pydp2

--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
@@ -22,6 +22,8 @@ import importlib
 import paddle.v2.dataset
 import cPickle
 import glob
+import cPickle as pickle
+import random

 __all__ = [
    'DATA_HOME', 'download', 'md5file', 'split', 'cluster_files_reader',
@@ -170,8 +172,6 @@ def convert(output_path,
            name_prefix,
            max_lines_to_shuffle=1000):
    import recordio
-    import cPickle as pickle
-    import random
    """
    Convert data from reader to recordio format files.

@@ -201,8 +201,10 @@ def convert(output_path,
    def write_data(w, lines):
        random.shuffle(lines)
        for i, d in enumerate(lines):
-            d = pickle.dumps(d, pickle.HIGHEST_PROTOCOL)
-            w[i % num_shards].write(d)
+            # FIXME(Yancey1989):
+            # dumps with protocol: pickle.HIGHEST_PROTOCOL
+            o = pickle.dumps(d)
+            w[i % num_shards].write(o)

    w = open_writers()
    lines = []

--- a/python/paddle/v2/dataset/mq2007.py
+++ b/python/paddle/v2/dataset/mq2007.py
@@ -212,19 +212,19 @@ def gen_pair(querylist, partial_order="full"):
        for j in range(i + 1, len(querylist)):
            query_right = querylist[j]
            if query_left.relevance_score > query_right.relevance_score:
-                labels.append(1)
+                labels.append([1])
                docpairs.append([
                    np.array(query_left.feature_vector),
                    np.array(query_right.feature_vector)
                ])
            elif query_left.relevance_score < query_right.relevance_score:
-                labels.append(1)
+                labels.append([1])
                docpairs.append([
                    np.array(query_right.feature_vector),
                    np.array(query_left.feature_vector)
                ])
    for label, pair in zip(labels, docpairs):
-        yield label, pair[0], pair[1]
+        yield np.array(label), pair[0], pair[1]


 def gen_list(querylist):

--- a/python/paddle/v2/event.py
+++ b/python/paddle/v2/event.py
@@ -9,8 +9,6 @@ There are:
 * BeginPass
 * EndPass
 """
-import py_paddle.swig_paddle as api
-
 __all__ = [
    'EndIteration', 'BeginIteration', 'BeginPass', 'EndPass', 'TestResult'
 ]
@@ -18,6 +16,7 @@ __all__ = [

 class WithMetric(object):
    def __init__(self, evaluator):
+        import py_paddle.swig_paddle as api
        if not isinstance(evaluator, api.Evaluator):
            raise TypeError("Evaluator should be api.Evaluator type")
        self.__evaluator__ = evaluator

--- a/python/paddle/v2/framework/create_op_creation_methods.py
+++ b/python/paddle/v2/framework/create_op_creation_methods.py
+import paddle.v2.framework.core as core
+import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2
+
+
+def get_all_op_protos():
+    protostrs = core.get_all_op_protos()
+    ret_values = []
+    for pbstr in protostrs:
+        op_proto = op_proto_pb2.OpProto.FromString(str(pbstr))
+        ret_values.append(op_proto)
+    return ret_values
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
 add_python_test(test_framework test_protobuf.py test_scope.py
-    test_default_scope_funcs.py)
+    test_default_scope_funcs.py test_op_creation_methods.py)
--- a/python/paddle/v2/framework/tests/test_op_creation_methods.py
+++ b/python/paddle/v2/framework/tests/test_op_creation_methods.py
+import unittest
+import paddle.v2.framework.create_op_creation_methods as creation
+
+
+class TestOpCreationsMethods(unittest.TestCase):
+    def test_all_protos(self):
+        all_protos = creation.get_all_op_protos()
+        self.assertNotEqual(0, len(all_protos))
+
+        for each in all_protos:
+            self.assertTrue(each.IsInitialized())
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py
 import numpy
-import py_paddle.swig_paddle as api
 import collections
 import topology
 import minibatch
-from data_feeder import DataFeeder

 __all__ = ['infer', 'Inference']

@@ -28,6 +26,7 @@ class Inference(object):
    """

    def __init__(self, output_layer, parameters):
+        import py_paddle.swig_paddle as api
        topo = topology.Topology(output_layer)
        gm = api.GradientMachine.createFromConfigProto(
            topo.proto(), api.CREATE_MODE_TESTING, [api.PARAMETER_VALUE])
@@ -40,6 +39,7 @@ class Inference(object):
        self.__data_types__ = topo.data_type()

    def iter_infer(self, input, feeding=None):
+        from data_feeder import DataFeeder
        feeder = DataFeeder(self.__data_types__, feeding)
        batch_size = len(input)


--- a/python/paddle/v2/master/client.py
+++ b/python/paddle/v2/master/client.py
@@ -10,8 +10,9 @@ class client(object):
    client is a client to the master server.
    """

-    def __init__(self, addr, buf_size):
-        self.c = lib.paddle_new_master_client(addr, buf_size)
+    def __init__(self, etcd_endpoints, timeout, buf_size):
+        self.c = lib.paddle_new_etcd_master_client(etcd_endpoints, timeout,
+                                                   buf_size)

    def close(self):
        lib.paddle_release_master_client(self.c)

--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
 import py_paddle.swig_paddle as swig_api
-
 import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils
 import paddle.trainer_config_helpers.optimizers as v1_optimizers
 """
@@ -47,12 +46,12 @@ class Optimizer(object):
        return swig_api.ParameterUpdater.createRemoteUpdater(
            self.__opt_conf__, pass_num, use_sparse_updater)

-    def __create_new_remote_updater__(self, pserver_spec):
+    def __create_new_remote_updater__(self, pserver_spec, use_etcd):
        return swig_api.ParameterUpdater.createNewRemoteUpdater(
-            self.__opt_conf__, pserver_spec)
+            self.__opt_conf__, pserver_spec, use_etcd)

    def create_updater(self, is_local, num_passes, use_sparse_updater,
-                       pserver_spec):
+                       pserver_spec, use_etcd):
        """
        create proper parameter_updater by configuration.
        :param is_local: create local or remote parameter updater
@@ -78,7 +77,7 @@ class Optimizer(object):
                    num_passes, use_sparse_updater)
            else:
                parameter_updater = self.__create_new_remote_updater__(
-                    pserver_spec)
+                    pserver_spec, use_etcd)
        return parameter_updater


@@ -268,6 +267,7 @@ ModelAverage = v1_optimizers.ModelAverage
 L2Regularization = v1_optimizers.L2Regularization

 if __name__ == '__main__':
+    import py_paddle.swig_paddle as swig_api
    swig_api.initPaddle('--use_gpu=false')
    for opt in [
            Momentum(), Adam(), Adamax(), AdaGrad(), DecayedAdaGrad(),

--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
 import numpy as np
-import py_paddle.swig_paddle as api
 from paddle.proto.ParameterConfig_pb2 import ParameterConfig
 import paddle.trainer.config_parser as cp
 import struct
@@ -124,6 +123,7 @@ class Parameters(object):
        :return: parameter value
        :rtype: np.ndarray
        """
+        import py_paddle.swig_paddle as api
        shape = self.get_shape(key)

        if len(self.__gradient_machines__) == 0:
@@ -223,7 +223,7 @@ class Parameters(object):
        :type gradient_machine: api.GradientMachine
        :return:
        """
-
+        import py_paddle.swig_paddle as api
        if not isinstance(gradient_machine, api.GradientMachine):
            raise ValueError("gradient_machine should be api.GradientMachine")

@@ -359,6 +359,7 @@ def __copy_parameter_to_gradient_machine__(gradient_machine, name, arr):
    :return:
    :rtype: api.Parameter
    """
+    import py_paddle.swig_paddle as api
    param = __get_parameter_in_gradient_machine__(gradient_machine, name)
    vec = param.getBuf(api.PARAMETER_VALUE)
    assert isinstance(vec, api.Vector)

--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -2,12 +2,6 @@
 Module Trainer
 """
 import collections
-import gzip
-import os
-
-import py_paddle.swig_paddle as api
-
-from data_feeder import DataFeeder
 from topology import Topology
 from . import event as v2_event
 from . import optimizer as v2_optimizer
@@ -51,7 +45,8 @@ class SGD(object):
                 update_equation,
                 extra_layers=None,
                 is_local=True,
-                 pserver_spec=None):
+                 pserver_spec=None,
+                 use_etcd=True):

        if not isinstance(parameters, v2_parameters.Parameters):
            raise TypeError('parameters should be parameters')
@@ -59,6 +54,7 @@ class SGD(object):
        if not isinstance(update_equation, v2_optimizer.Optimizer):
            raise TypeError("update equation parameter must be "
                            "paddle.v2.optimizer.Optimizer")
+        import py_paddle.swig_paddle as api
        topology = Topology(cost, extra_layers=extra_layers)
        self.__optimizer__ = update_equation
        self.__topology__ = topology
@@ -66,6 +62,7 @@ class SGD(object):
        self.__topology_in_proto__ = topology.proto()
        self.__is_local__ = is_local
        self.__pserver_spec__ = pserver_spec
+        self.__use_etcd__ = use_etcd

        self.__use_sparse_updater__ = self.__topology__.use_sparse_updater()
        # # In local mode, disable sparse_remote_update.
@@ -124,13 +121,15 @@ class SGD(object):
        :type feeding: dict|list
        :return:
        """
+        import py_paddle.swig_paddle as api
+        from data_feeder import DataFeeder
        if event_handler is None:
            event_handler = default_event_handler
        __check_train_args__(**locals())

        self.__parameter_updater__ = self.__optimizer__.create_updater(
            self.__is_local__, num_passes, self.__use_sparse_updater__,
-            self.__pserver_spec__)
+            self.__pserver_spec__, self.__use_etcd__)
        self.__parameter_updater__.init(self.__gradient_machine__)

        self.__gradient_machine__.start()
@@ -187,6 +186,8 @@ class SGD(object):
        :type feeding: dict
        :return:
        """
+        import py_paddle.swig_paddle as api
+        from data_feeder import DataFeeder
        feeder = DataFeeder(self.__data_types__, feeding)
        evaluator = self.__gradient_machine__.makeEvaluator()
        out_args = api.Arguments.createArguments(0)

--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -19,7 +19,8 @@ setup_requires=["requests",
                "recordio",
                "matplotlib",
                "rarfile",
-                "scipy>=0.19.0"]
+                "scipy>=0.19.0",
+                "nltk"]

 if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
    setup_requires+=["opencv-python"]