Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into mobilenet_gpu

44927bf7 · xzl · d43fbbae · ad728419 · 44927bf7 · 44927bf7
144 changed file
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -21,10 +21,10 @@
    sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
    hooks:
    -   id: clang-formater
-   repo: https://github.com/dnephin/pre-commit-golang
+-   repo: https://github.com/PaddlePaddle/pre-commit-golang
-    sha: e4693a4c282b4fc878eda172a929f7a6508e7d16
+    sha: 16398aeccf263adaf53b2495eed0406347d76281
    hooks:
      -   id: go-fmt
-          files: (.*\.go)
+          types: [go]
-      -   id: go-lint
+      -   id: gometalinter
-          files: (.*\.go)
+          types: [go]
--- a/.travis.yml
+++ b/.travis.yml
@@ -41,6 +41,8 @@ before_install:
  - pip install rarfile
  - curl https://glide.sh/get | bash
  - eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
+  - go get -u github.com/alecthomas/gometalinter
+  - gometalinter --install
  - |
    function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -137,7 +137,8 @@ if(WITH_GPU)
 endif(WITH_GPU)
 if(USE_NNPACK)
-  list(APPEND EXTERNAL_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB} "rt")
+    include(external/nnpack)
+    list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS})
 endif(USE_NNPACK)
 add_subdirectory(proto)

--- a/Dockerfile
+++ b/Dockerfile
@@ -25,7 +25,7 @@ COPY ./paddle/scripts/docker/root/ /root/
 RUN apt-get update && \
    apt-get install -y \
    git python-pip python-dev openssh-server bison  \
-    wget unzip tar xz-utils bzip2 gzip coreutils ntp \
+    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
    curl sed grep graphviz libjpeg-dev zlib1g-dev  \
    python-numpy python-matplotlib gcc g++ \
    automake locales clang-format-3.8 swig doxygen cmake  \

--- a/Dockerfile.android
+++ b/Dockerfile.android
@@ -14,6 +14,17 @@ RUN apt-get update && \
    wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \
    apt-get clean -y
+# Install Go and glide
+RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
+    tar -C /usr/local -xzf go.tgz && \
+    mkdir /root/gopath && \
+    mkdir /root/gopath/bin && \
+    mkdir /root/gopath/src && \
+    rm go.tgz
+ENV GOROOT=/usr/local/go GOPATH=/root/gopath
+# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
+ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
 # git credential to skip password typing
 RUN git config --global credential.helper store

--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -102,12 +102,19 @@ if(WITH_GOLANG)
      message(FATAL_ERROR "no glide executeble found: $ENV{GOPATH}/bin/glide")
    endif()
-    add_custom_target(go_vendor)
+    # this command will only run when the file it depends is missing
-    add_custom_command(TARGET go_vendor
+    # or has changed, or the output is missing.
+    add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide
      COMMAND env GOPATH=${GOPATH} ${GLIDE} install
+      COMMAND touch ${CMAKE_BINARY_DIR}/glide
+      DEPENDS ${PROJ_ROOT}/go/glide.lock
      WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go"
-    )
+      )
-    add_dependencies(go_vendor go_path)
+    # depends on the custom command which outputs
+    # ${CMAKE_BINARY_DIR}/glide, the custom command does not need to
+    # run every time this target is built.
+    add_custom_target(go_vendor DEPENDS ${CMAKE_BINARY_DIR}/glide go_path)
  endif()
 endif(WITH_GOLANG)
--- a/cmake/cpplint.cmake
+++ b/cmake/cpplint.cmake
@@ -27,7 +27,8 @@ set(IGNORE_PATTERN
    .*cblas\\.h.*
    .*\\.pb\\.txt
    .*LtrDataProvider.*
-    .*MultiDataProvider.*)
+    .*MultiDataProvider.*
+    .*pb.*)
 # add_style_check_target
 #
@@ -52,14 +53,13 @@ macro(add_style_check_target TARGET_NAME)
                endif()
            endforeach()
            if(LINT MATCHES ON)
+                # cpplint code style
                get_filename_component(base_filename ${filename} NAME)
                set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint)
-                add_custom_command(OUTPUT ${CUR_GEN}
+                add_custom_command(TARGET ${TARGET_NAME} PRE_BUILD
-                    PRE_BUILD
+                    COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
-                    COMMAND env ${py_env} "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
+                            "--filter=${STYLE_FILTER}"
-                                "--filter=${STYLE_FILTER}"
+                            "--write-success=${CUR_GEN}" ${filename}
-                                "--write-success=${CUR_GEN}" ${filename}
-                    DEPENDS ${filename}
                    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
            endif()
        endforeach()

--- a/cmake/cross_compiling/android.cmake
+++ b/cmake/cross_compiling/android.cmake
@@ -108,6 +108,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
        ENDIF()
        IF(ANDROID_ABI STREQUAL "arm64-v8a")
            SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android)
+            SET(CMAKE_SYSTEM_PROCESSOR aarch64)
        ENDIF()
        SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
    ENDIF()
@@ -166,7 +167,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
    ENDIF()
    IF(ANDROID_ABI STREQUAL "arm64-v8a")
-      LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a)
+        LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a)
    ENDIF()
    STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}")
@@ -193,6 +194,10 @@ ELSE()
        SET(CMAKE_ANDROID_STANDALONE_TOOLCHAIN ${ANDROID_STANDALONE_TOOLCHAIN})
    ENDIF()
    SET(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ABI})
-    SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE})
+    IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
-    SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON})
+        SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE})
+        IF(ANDROID_ABI STREQUAL "armeabi-v7a")
+            SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON})
+        ENDIF()
+    ENDIF()
 ENDIF()
--- a/paddle/function/nnpack/nnpack.cmake
+++ b/paddle/function/nnpack/nnpack.cmake
@@ -7,10 +7,24 @@ set(NNPACK_ROOT $ENV{NNPACK_ROOT} CACHE PATH "Folder contains NNPACK")
 find_path(NNPACK_INC_DIR nnpack.h PATHS ${NNPACK_ROOT}/include)
 find_library(NNPACK_LIB NAMES nnpack PATHS ${NNPACK_ROOT}/lib)
 find_library(PTHREADPOOL_LIB NAMES pthreadpool PATHS ${NNPACK_ROOT}/lib)
+find_library(NNPACK_UKERNELS_LIB NAMES nnpack_ukernels PATHS ${NNPACK_ROOT}/lib)
+find_library(NNPACK_CPUFEATURES_LIB NAMES cpufeatures PATHS ${NNPACK_ROOT}/lib)
 if(NNPACK_INC_DIR AND NNPACK_LIB AND PTHREADPOOL_LIB)
  set(NNPACK_FOUND ON)
  INCLUDE_DIRECTORIES(${NNPACK_INC_DIR})
+  set(NNPACK_LIBS)
+  list(APPEND NNPACK_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB})
+  if (NNPACK_UKERNELS_LIB)
+    list(APPEND NNPACK_LIBS ${NNPACK_UKERNELS_LIB})
+  endif()
+  if (NNPACK_CPUFEATURES_LIB)
+    list(APPEND NNPACK_LIBS ${NNPACK_CPUFEATURES_LIB})
+  endif()
+  if(NOT ANDROID)
+    list(APPEND NNPACK_LIBS "rt")
+  endif()
 else()
  message(FATAL_ERROR "Cannot find NNPACK in (${NNPACK_ROOT})")
 endif()
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -104,6 +104,7 @@ function(merge_static_libs TARGET_NAME)
  foreach(lib ${libs})
    list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
  endforeach()
+  list(REMOVE_DUPLICATES libs_deps)
  if(APPLE) # Use OSX's libtool to merge archives
    # To produce a library we need at least one source file.
@@ -127,7 +128,7 @@ function(merge_static_libs TARGET_NAME)
      # Get the file names of the libraries to be merged
      set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
    endforeach()
-		add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
+    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
      COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
      COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
  else() # general UNIX: use "ar" to extract objects and re-add to a common lib
@@ -145,11 +146,11 @@ function(merge_static_libs TARGET_NAME)
        DEPENDS ${lib} ${objdir}
        WORKING_DIRECTORY ${objdir})
-      # Empty dummy source file that goes into merged library
+      # Empty dummy source file that goes into merged library		
-      set(mergebase ${lib}.mergebase.c)
+      set(mergebase ${lib}.mergebase.c)		
-      add_custom_command(OUTPUT ${mergebase}
+      add_custom_command(OUTPUT ${mergebase}		
-        COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}
+        COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}		
-        DEPENDS ${objlistfile})
+        DEPENDS ${objlistfile})		
      list(APPEND mergebases "${mergebase}")
    endforeach()
@@ -184,6 +185,10 @@ function(cc_library TARGET_NAME)
      add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
    endif()
+    # cpplint code style
+    add_style_check_target(${TARGET_NAME} ${cc_library_SRCS})
  else(cc_library_SRCS)
    if (cc_library_DEPS)
      merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
@@ -337,7 +342,7 @@ function(go_test TARGET_NAME)
  string(REPLACE "${PADDLE_GO_PATH}" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
  add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${go_test_DEPS})
  add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test
+    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test -race
    -c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
    ".${CMAKE_CURRENT_SOURCE_REL_DIR}"
    WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")

--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -474,6 +474,11 @@ prelu
 ..  autoclass:: paddle.v2.layer.prelu
    :noindex:
+gated_unit
+-----------
+..  autoclass:: paddle.v2.layer.gated_unit
+    :noindex:
 Detection output Layer
 ======================

--- a/go/cmd/master/master.go
+++ b/go/cmd/master/master.go
@@ -11,6 +11,7 @@ import (
 	"github.com/namsral/flag"
 	log "github.com/sirupsen/logrus"
+	"github.com/topicai/candy"
 	"github.com/PaddlePaddle/Paddle/go/master"
 	"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
@@ -20,11 +21,18 @@ func main() {
 	port := flag.Int("port", 8080, "port of the master server.")
 	ttlSec := flag.Int("ttl", 60, "etcd lease TTL in seconds.")
 	endpoints := flag.String("endpoints", "http://127.0.0.1:2379", "comma separated etcd endpoints. If empty, fault tolerance will not be enabled.")
-	taskTimeoutDur := flag.Duration("task_timout_dur", 20*time.Minute, "task timout duration.")
+	taskTimeoutDur := flag.Duration("task-timout-dur", 20*time.Minute, "task timout duration.")
-	taskTimeoutMax := flag.Int("task_timeout_max", 3, "max timtout count for each task before it being declared failed task.")
+	taskTimeoutMax := flag.Int("task-timeout-max", 3, "max timtout count for each task before it being declared failed task.")
-	chunkPerTask := flag.Int("chunk_per_task", 10, "chunk per task.")
+	chunkPerTask := flag.Int("chunk-per-task", 10, "chunk per task.")
+	logLevel := flag.String("log-level", "info",
+		"log level, possible values: debug, info, warning, error, fatal, panic")
 	flag.Parse()
+	level, e := log.ParseLevel(*logLevel)
+	candy.Must(e)
+	log.SetLevel(level)
 	if *endpoints == "" {
 		log.Warningln("-endpoints not set, fault tolerance not be enabled.")
 	}

--- a/go/cmd/pserver/pserver.go
+++ b/go/cmd/pserver/pserver.go
@@ -40,7 +40,7 @@ func main() {
 		idx = *index
 	} else {
 		e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, *etcdTimeout)
-		idx, err = e.Register()
+		idx, err = e.Register(*port)
 		candy.Must(err)
 		cp, err = pserver.NewCheckpointFromFile(*checkpointPath, idx, e)

--- a/go/master/c/client.go
+++ b/go/master/c/client.go
@@ -23,7 +23,6 @@ import (
 	log "github.com/sirupsen/logrus"
 )
-var nullPtr = unsafe.Pointer(uintptr(0))
 var mu sync.Mutex
 var handleMap = make(map[C.paddle_master_client]*master.Client)
 var curHandle C.paddle_master_client
@@ -114,13 +113,13 @@ func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int {
 	if err != nil {
 		// Error
 		// TODO: return the type of error?
-		*record = (*C.uchar)(nullPtr)
+		*record = (*C.uchar)(nil)
 		return -1
 	}
 	if len(r) == 0 {
 		// Empty record
-		*record = (*C.uchar)(nullPtr)
+		*record = (*C.uchar)(nil)
 		return 0
 	}

--- a/go/master/client.go
+++ b/go/master/client.go
@@ -2,6 +2,7 @@ package master
 import (
 	"os"
+	"time"
 	"github.com/PaddlePaddle/Paddle/go/connection"
 	"github.com/PaddlePaddle/recordio"
@@ -36,9 +37,9 @@ func (c *Client) getRecords() {
 	for {
 		t, err := c.getTask()
 		if err != nil {
-			// TODO(helin): wait before move on with next
 			// getTask call.
-			log.Errorln(err)
+			log.Errorf("Get task failed, sleep 3 seconds and continue, %s", err)
+			time.Sleep(3 * time.Second)
 			continue
 		}
@@ -68,7 +69,10 @@ func (c *Client) getRecords() {
 		// We treat a task as finished whenever the last data
 		// instance of the task is read. This is not exactly
 		// correct, but a reasonable approximation.
-		c.taskFinished(t.Meta.ID)
+		err = c.taskFinished(t.Meta.ID)
+		if err != nil {
+			log.Errorln(err)
+		}
 	}
 }

--- a/go/master/client_internal_test.go
+++ b/go/master/client_internal_test.go
@@ -66,11 +66,21 @@ func TestGetFinishTask(t *testing.T) {
 	for i := 0; i < totalTask*chunkPerTask; i++ {
 		w := recordio.NewWriter(f, -1, -1)
-		w.Write(nil)
+		_, err = w.Write(nil)
+		if err != nil {
+			panic(err)
+		}
 		// call Close to force RecordIO writing a chunk.
-		w.Close()
+		err = w.Close()
+		if err != nil {
+			panic(err)
+		}
+	}
+	err = f.Close()
+	if err != nil {
+		panic(err)
 	}
-	f.Close()
 	// Manually intialize client to avoid calling c.getRecords()
 	c := &Client{}
@@ -79,7 +89,11 @@ func TestGetFinishTask(t *testing.T) {
 	ch := make(chan string, 1)
 	ch <- addr
 	go c.monitorMaster(ch)
-	c.SetDataset([]string{path})
+	err = c.SetDataset([]string{path})
+	if err != nil {
+		panic(err)
+	}
 	checkOnePass := func(i int) {
 		var tasks []Task
 		for idx := 0; idx < totalTask; idx++ {

--- a/go/master/client_test.go
+++ b/go/master/client_test.go
@@ -57,14 +57,30 @@ func TestNextRecord(t *testing.T) {
 	w := recordio.NewWriter(f, -1, -1)
 	for i := 0; i < total; i++ {
-		w.Write([]byte{byte(i)})
+		_, err = w.Write([]byte{byte(i)})
+		if err != nil {
+			panic(err)
+		}
+	}
+	err = w.Close()
+	if err != nil {
+		panic(err)
+	}
+	err = f.Close()
+	if err != nil {
+		panic(err)
 	}
-	w.Close()
-	f.Close()
 	curAddr := make(chan string, 1)
 	curAddr <- fmt.Sprintf(":%d", p)
 	c := master.NewClient(curAddr, 10)
-	c.SetDataset([]string{path})
+	err = c.SetDataset([]string{path})
+	if err != nil {
+		panic(err)
+	}
 	for pass := 0; pass < 50; pass++ {
 		received := make(map[byte]bool)
 		for i := 0; i < total; i++ {

--- a/go/master/etcd_client.go
+++ b/go/master/etcd_client.go
@@ -30,7 +30,7 @@ type EtcdClient struct {
 // NewEtcdClient creates a new EtcdClient.
 func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePath string, ttlSec int) (*EtcdClient, error) {
 	log.Debugf("Connecting to etcd at %v", endpoints)
-	// TODO(helin): gracefully shutdown etcd store. Becuase etcd
+	// TODO(helin): gracefully shutdown etcd store. Because etcd
 	// store holds a etcd lock, even though the lock will expire
 	// when the lease timeout, we need to implement graceful
 	// shutdown to release the lock.
@@ -60,7 +60,7 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat
 	}
 	log.Debugf("Successfully acquired lock at %s.", lockPath)
-	put := clientv3.OpPut(addrPath, string(addr))
+	put := clientv3.OpPut(addrPath, addr)
 	resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit()
 	if err != nil {
 		return nil, err

--- a/go/master/inmem_store.go
+++ b/go/master/inmem_store.go
@@ -4,7 +4,7 @@ import "sync"
 // InMemStore is an in memory implementation of Store interface.
 //
-// It does not tolerate the fault that casues the program to crash.
+// It does not tolerate the fault that causes the program to crash.
 type InMemStore struct {
 	mu  sync.Mutex
 	buf []byte

--- a/go/master/service.go
+++ b/go/master/service.go
@@ -160,7 +160,7 @@ func (s *Service) recover() (bool, error) {
 // snapshot *must* be called with s.mu being held.
 func (s *Service) snapshot() error {
-	// TOOD(helin): etcd request has a size limit, so the snapshot
+	// TODO(helin): etcd request has a size limit, so the snapshot
 	// size is limited by the max request size. We should either
 	// divide the snapshot into smaller chunks and save under
 	// different keys, or configure the request size to be big
@@ -215,6 +215,7 @@ func readChunks(globPaths []string) ([]Chunk, error) {
 		}
 		count := index.NumChunks()
+		log.Infof("readChunks: file %s has %d chunks", path, count)
 		for i := 0; i < count; i++ {
 			chunk := Chunk{
 				Path:  path,
@@ -288,7 +289,6 @@ func (s *Service) processFailedTask(t taskEntry, epoch int) {
 	log.Warningf("Task %v failed %d times, discard.", t.Task, t.NumFailure)
 	s.taskQueues.Todo = append(s.taskQueues.Todo, t)
-	return
 }
 func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {

--- a/go/pserver/client/c/cclient.go
+++ b/go/pserver/client/c/cclient.go
@@ -34,7 +34,6 @@ import (
 	log "github.com/sirupsen/logrus"
 )
-var nullPtr = unsafe.Pointer(uintptr(0))
 var mu sync.Mutex
 var handleMap = make(map[C.paddle_pserver_client]*client.Client)
 var curHandle C.paddle_pserver_client
@@ -63,7 +62,7 @@ func remove(client C.paddle_pserver_client) *client.Client {
 }
 func cArrayToSlice(p unsafe.Pointer, len int) []byte {
-	if p == nullPtr {
+	if p == nil {
 		return nil
 	}
@@ -101,11 +100,11 @@ func paddle_new_pserver_client(addrs *C.char, selected int) C.paddle_pserver_cli
 }
 //export paddle_new_etcd_pserver_client
-func paddle_new_etcd_pserver_client(etcd_endpoints *C.char, selected int) C.paddle_pserver_client {
+func paddle_new_etcd_pserver_client(etcdEndpoints *C.char, selected int) C.paddle_pserver_client {
 	// TODO(Longfei: use etcd lock to decide which trainer to initialize the parameters)
-	addr := C.GoString(etcd_endpoints)
+	addr := C.GoString(etcdEndpoints)
-	etcd_client := client.NewEtcd(addr)
+	etcdClient := client.NewEtcd(addr)
-	c := client.NewClient(etcd_client, etcd_client.Desired(), selector(selected != 0))
+	c := client.NewClient(etcdClient, etcdClient.Desired(), selector(selected != 0))
 	return add(c)
 }
@@ -124,20 +123,20 @@ func paddle_begin_init_params(client C.paddle_pserver_client) C.int {
 }
 //export paddle_init_param
-func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, param_config unsafe.Pointer, config_len C.int) C.int {
+func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, paramConfig unsafe.Pointer, configLen C.int) C.int {
 	et := pserver.ElementType(param.element_type)
 	name := C.GoString(param.name)
 	content := cArrayToSlice(unsafe.Pointer(param.content), int(param.content_len))
 	pc := pserver.ParameterWithConfig{
 		Param:  pserver.Parameter{Name: name, ElementType: et, Content: content},
-		Config: cArrayToSlice(param_config, int(config_len)),
+		Config: cArrayToSlice(paramConfig, int(configLen)),
 	}
 	c := get(client)
 	err := c.InitParam(pc)
 	if err != nil {
 		if err.Error() == pserver.AlreadyInitialized {
-			log.Warningf("parameter %s already initialized, treat paddle_init_param as sucessful.", name)
+			log.Warningf("parameter %s already initialized, treat paddle_init_param as successful.", name)
 			return C.PSERVER_OK
 		}
 		log.Errorln(err)
@@ -153,7 +152,7 @@ func paddle_finish_init_params(client C.paddle_pserver_client) C.int {
 	err := c.FinishInitParams()
 	if err != nil {
 		if err.Error() == pserver.AlreadyInitialized {
-			log.Warningln("parameters already initialized, treat paddle_finish_init_params as sucessful.")
+			log.Warningln("parameters already initialized, treat paddle_finish_init_params as successful.")
 			return C.PSERVER_OK
 		}
@@ -223,12 +222,12 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter,
 		p := ps[i]
 		param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))
-		if unsafe.Pointer(param) == nullPtr {
+		if unsafe.Pointer(param) == nil {
 			log.Errorln("must pre-allocate parameter.")
 			return C.PSERVER_ERROR
 		}
-		if unsafe.Pointer(param.content) != nullPtr {
+		if unsafe.Pointer(param.content) != nil {
 			if int(param.content_len) != len(p.Content) {
 				log.Errorf("the pre-allocated content len does not match parameter content len. Pre-allocated len: %d, returned len: %d", param.content_len, len(p.Content))
 				return C.PSERVER_ERROR

--- a/go/pserver/client/c/test/test_train.py
+++ b/go/pserver/client/c/test/test_train.py
 import paddle.v2 as paddle
 import paddle.v2.dataset.uci_housing as uci_housing
+import paddle.v2.master as master
+import os
+import cPickle as pickle
+etcd_ip = os.getenv("MASTER_IP", "127.0.0.1")
+etcd_endpoint = "http://" + etcd_ip + ":2379"
+def cloud_reader():
+    print "connecting to master, etcd endpoints: ", etcd_endpoint
+    master_client = master.client(etcd_endpoint, 5, 64)
+    master_client.set_dataset(
+        ["/pfs/dlnel/public/dataset/uci_housing/uci_housing-*-of-*"])
+    while 1:
+        r, e = master_client.next_record()
+        if not r:
+            break
+        yield pickle.loads(r)
 def main():
@@ -22,13 +40,13 @@ def main():
    # create optimizer of new remote updater to pserver
    optimizer = paddle.optimizer.Momentum(momentum=0)
-    #TODO(zhihong) : replace optimizer with new OptimizerConfig
+    print "etcd endoint: ", etcd_endpoint
    trainer = paddle.trainer.SGD(cost=cost,
                                 parameters=parameters,
                                 update_equation=optimizer,
                                 is_local=False,
-                                 pserver_spec="localhost:3000")
+                                 pserver_spec=etcd_endpoint,
+                                 use_etcd=True)
    # event_handler to print training and testing info
    def event_handler(event):
@@ -47,11 +65,11 @@ def main():
                print "Test %d, %.2f" % (event.pass_id, result.cost)
    # training
+    # NOTE: use uci_housing.train() as reader for non-paddlecloud training
    trainer.train(
        reader=paddle.batch(
            paddle.reader.shuffle(
-                uci_housing.train(), buf_size=500),
+                cloud_reader, buf_size=500), batch_size=2),
-            batch_size=2),
        feeding={'x': 0,
                 'y': 1},
        event_handler=event_handler,

--- a/go/pserver/client/client.go
+++ b/go/pserver/client/client.go
@@ -233,7 +233,7 @@ func (c *Client) Save(path string) error {
 func strHash(s string) uint32 {
 	h := fnv.New32a()
-	h.Write([]byte(s))
+	_, _ = h.Write([]byte(s))
 	return h.Sum32()
 }

--- a/go/pserver/client/client_test.go
+++ b/go/pserver/client/client_test.go
@@ -3,11 +3,13 @@ package client_test
 import (
 	"context"
 	"io/ioutil"
+	"math/rand"
 	"net"
 	"net/http"
 	"net/rpc"
 	"strconv"
 	"strings"
+	"sync"
 	"testing"
 	"time"
@@ -77,15 +79,33 @@ func initEtcdClient() {
 		log.Errorf("err %v", err)
 	}
 	ctx, cancel := context.WithTimeout(context.Background(), timeout)
-	client.Delete(ctx, pserver.PsDesired)
+	_, err = client.Delete(ctx, pserver.PsDesired)
-	client.Delete(ctx, pserver.PsPath)
+	if err != nil {
-	client.Put(ctx, pserver.PsDesired, strconv.Itoa(numPserver))
+		panic(err)
+	}
+	_, err = client.Delete(ctx, pserver.PsPath)
+	if err != nil {
+		panic(err)
+	}
+	_, err = client.Put(ctx, pserver.PsDesired, strconv.Itoa(numPserver))
+	if err != nil {
+		panic(err)
+	}
 	ports := initClient()
 	for i := 0; i < numPserver; i++ {
-		client.Put(ctx, pserver.PsPath+strconv.Itoa(i), ":"+strconv.Itoa(ports[i]))
+		_, err = client.Put(ctx, pserver.PsPath+strconv.Itoa(i), ":"+strconv.Itoa(ports[i]))
+		if err != nil {
+			panic(err)
+		}
 	}
 	cancel()
-	client.Close()
+	err = client.Close()
+	if err != nil {
+		panic(err)
+	}
 }
 type selector bool
@@ -100,27 +120,34 @@ func (l lister) List() []client.Server {
 	return l
 }
-func ClientTest(t *testing.T, c *client.Client) {
+func testClient(t *testing.T, c *client.Client) {
 	selected := c.BeginInitParams()
 	if !selected {
 		t.Fatal("should be selected.")
 	}
-	const numParameter = 100
+	const numParameter = 1000
 	config, err := ioutil.ReadFile("./c/test/testdata/optimizer.pb")
 	if err != nil {
 		t.Fatalf("read optimizer proto failed")
 	}
+	var wg sync.WaitGroup
 	for i := 0; i < numParameter; i++ {
-		var p pserver.Parameter
+		wg.Add(1)
-		p.Name = "p_" + strconv.Itoa(i)
+		go func(i int) {
-		p.ElementType = pserver.Float32
+			var p pserver.Parameter
-		p.Content = make([]byte, (i+1)*100)
+			p.Name = "p_" + strconv.Itoa(i)
-		err := c.InitParam(pserver.ParameterWithConfig{Param: p, Config: config})
+			p.ElementType = pserver.Float32
-		if err != nil {
+			p.Content = make([]byte, (i+1)*100)
-			t.Fatal(err)
+			err := c.InitParam(pserver.ParameterWithConfig{Param: p, Config: config})
-		}
+			if err != nil {
+				t.Fatal(err)
+			}
+			wg.Done()
+		}(i)
 	}
+	wg.Wait()
 	err = c.FinishInitParams()
 	if err != nil {
@@ -128,7 +155,7 @@ func ClientTest(t *testing.T, c *client.Client) {
 	}
 	var grads []pserver.Gradient
-	for i := 0; i < numParameter/2; i++ {
+	for i := 0; i < numParameter; i++ {
 		var g pserver.Gradient
 		g.Name = "p_" + strconv.Itoa(i)
 		g.ElementType = pserver.Float32
@@ -136,9 +163,31 @@ func ClientTest(t *testing.T, c *client.Client) {
 		grads = append(grads, g)
 	}
-	err = c.SendGrads(grads)
+	const paramPerGroup = 10
-	if err != nil {
+	const numGroups = numParameter / paramPerGroup
-		t.Fatal(err)
+	// shuffle send grads order
+	for i := range grads {
+		j := rand.Intn(i + 1)
+		grads[i], grads[j] = grads[j], grads[i]
+	}
+	for i := 0; i < numGroups; i++ {
+		var gs []pserver.Gradient
+		if i == numGroups-1 {
+			gs = grads[i*paramPerGroup:]
+		} else {
+			gs = grads[i*paramPerGroup : (i+1)*paramPerGroup]
+		}
+		wg.Add(1)
+		go func(gs []pserver.Gradient) {
+			err := c.SendGrads(gs)
+			if err != nil {
+				t.Fatal(err)
+			}
+			wg.Done()
+		}(gs)
 	}
 	names := make([]string, numParameter)
@@ -146,20 +195,35 @@ func ClientTest(t *testing.T, c *client.Client) {
 		names[i] = "p_" + strconv.Itoa(i)
 	}
-	params, err := c.GetParams(names)
+	for i := 0; i < numGroups; i++ {
-	if err != nil {
+		var ns []string
-		t.Fatal(err)
+		if i == numGroups-1 {
-	}
+			ns = names[i*paramPerGroup:]
+		} else {
+			ns = names[i*paramPerGroup : (i+1)*paramPerGroup]
+		}
-	if len(names) != len(params) {
+		wg.Add(1)
-		t.Fatalf("parameter size not match, need: %d, have: %d", len(names), len(params))
+		go func(ns []string) {
-	}
+			params, err := c.GetParams(ns)
+			if err != nil {
+				t.Fatal(err)
+			}
-	for i := range params {
+			if len(ns) != len(params) {
-		if names[i] != params[i].Name {
+				t.Fatalf("parameter size not match, need: %d, have: %d", len(names), len(params))
-			t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", names[i], params[i].Name)
+			}
-		}
+			for i := range params {
+				if ns[i] != params[i].Name {
+					t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", ns[i], params[i].Name)
+				}
+			}
+			wg.Done()
+		}(ns)
 	}
+	wg.Wait()
 }
 func TestNativeClient(t *testing.T) {
@@ -169,13 +233,14 @@ func TestNativeClient(t *testing.T) {
 		servers[i] = client.Server{Index: i, Addr: ":" + strconv.Itoa(pserverClientPorts[i])}
 	}
 	c1 := client.NewClient(lister(servers), len(servers), selector(true))
-	ClientTest(t, c1)
+	testClient(t, c1)
 }
-// TODO: tmperary disable etcdClient test for dependency of etcd)
+// EtcdClient is a disabled test, since we have not embedded etcd into
+// our test.
 func EtcdClient(t *testing.T) {
 	initEtcdClient()
 	etcdClient := client.NewEtcd(etcdEndpoints)
 	c2 := client.NewClient(etcdClient, etcdClient.Desired(), selector(true))
-	ClientTest(t, c2)
+	testClient(t, c2)
 }
--- a/go/pserver/client/etcd_client.go
+++ b/go/pserver/client/etcd_client.go
@@ -12,7 +12,7 @@ import (
 )
 const (
-	DefaultEtcdTimeout time.Duration = 5 * time.Second
+	defaultEtcdTimeout time.Duration = 5 * time.Second
 )
 // EtcdClient is used by pserver client that is a part of trainer process.
@@ -47,7 +47,7 @@ func (p *EtcdClient) Desired() int {
 		psDesired, err = strconv.Atoi(string(resp.Kvs[0].Value))
 		if err != nil {
-			log.Errorf("psDesired %s invalid %v", psDesired, err)
+			log.Errorf("psDesired %d invalid %v", psDesired, err)
 			time.Sleep(p.timeout)
 			continue
 		}
@@ -106,11 +106,11 @@ func NewEtcd(endpoints string) *EtcdClient {
 	for {
 		cli, err = clientv3.New(clientv3.Config{
 			Endpoints:   ep,
-			DialTimeout: DefaultEtcdTimeout,
+			DialTimeout: defaultEtcdTimeout,
 		})
 		if err != nil {
 			log.Errorf("Init etcd connection failed: %v", err)
-			time.Sleep(DefaultEtcdTimeout)
+			time.Sleep(defaultEtcdTimeout)
 			continue
 		}
 		break
@@ -118,7 +118,7 @@ func NewEtcd(endpoints string) *EtcdClient {
 	log.Infof("Connected to etcd: %s\n", endpoints)
 	client := &EtcdClient{
 		client:    cli,
-		timeout:   DefaultEtcdTimeout,
+		timeout:   defaultEtcdTimeout,
 		endpoints: ep,
 	}
 	return client

--- a/go/pserver/etcd_client.go
+++ b/go/pserver/etcd_client.go
@@ -49,7 +49,7 @@ func NewEtcdClient(endpoints string, numPservers int, timeout time.Duration) *Et
 // Register registers the pserver on etcd
 //
 // Register returns the index of the current pserver.
-func (e *EtcdClient) Register() (int, error) {
+func (e *EtcdClient) Register(port int) (int, error) {
 	var err error
 	e.externalIP, err = networkhelper.GetExternalIP()
@@ -116,7 +116,7 @@ func (e *EtcdClient) Register() (int, error) {
 	for {
 		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
 		var err error
-		pserverIdx, err = e.registerPserverEtcd(ctx)
+		pserverIdx, err = e.registerPserverEtcd(ctx, port)
 		cancel()
 		if err != nil {
 			log.Warn(err)
@@ -140,7 +140,7 @@ func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) (
 }
 // registerPserverEtcd registers pserver node on etcd using transaction.
-func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) {
+func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, error) {
 	var idx int
 	_, err := concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error {
 		registered := false
@@ -156,8 +156,9 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) {
 					log.Fatal(err)
 				}
 				// find the first id and write info
-				c.Put(psKey, e.externalIP, clientv3.WithLease(resp.ID))
+				pserverAddr := e.externalIP + ":" + strconv.Itoa(port)
-				log.Debugf("set pserver node %s with value %s", psKey, e.externalIP)
+				c.Put(psKey, pserverAddr, clientv3.WithLease(resp.ID))
+				log.Debugf("set pserver node %s with value %s", psKey, pserverAddr)
 				ch, kaerr := e.etcdClient.KeepAlive(context.TODO(), resp.ID)
 				if kaerr != nil {
 					log.Errorf("keepalive etcd node error: %v", kaerr)
@@ -176,10 +177,10 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) {
 				break
 			}
 		}
-		if registered == true {
+		if registered {
 			return nil
 		}
-		return errors.New("not registerd, may due to already have enough pservers")
+		return errors.New("not registered, may due to already have enough pservers")
 	}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))
 	if err != nil {
@@ -210,8 +211,5 @@ func (e *EtcdClient) PutKey(key string, value []byte, timeout time.Duration) err
 	ctx, cancel := context.WithTimeout(context.Background(), timeout)
 	_, err := e.etcdClient.Put(ctx, key, string(value))
 	cancel()
-	if err != nil {
+	return err
-		return err
-	}
-	return nil
 }
--- a/go/pserver/optimizer.go
+++ b/go/pserver/optimizer.go
@@ -14,15 +14,14 @@ import (
 	log "github.com/sirupsen/logrus"
 )
-var nullPtr = unsafe.Pointer(uintptr(0))
 type optimizer struct {
 	opt         *C.struct_paddle_optimizer
 	elementType ElementType
+	contentLen  int
 }
 func cArrayToSlice(p unsafe.Pointer, len int) []byte {
-	if p == nullPtr {
+	if p == nil {
 		return nil
 	}
@@ -37,10 +36,11 @@ func cArrayToSlice(p unsafe.Pointer, len int) []byte {
 func newOptimizer(paramWithConfigs ParameterWithConfig, State []byte) *optimizer {
 	o := &optimizer{}
 	o.elementType = paramWithConfigs.Param.ElementType
+	o.contentLen = len(paramWithConfigs.Param.Content)
 	p := paramWithConfigs.Param
 	c := paramWithConfigs.Config
 	s := State
-	paramBufferSize := C.size_t(len(p.Content) / C.sizeof_float)
+	paramBufferSize := C.size_t(len(p.Content))
 	log.WithFields(log.Fields{
 		"ElementType": p.ElementType,
 		"ParamSize":   paramBufferSize,
@@ -78,7 +78,11 @@ func (o *optimizer) UpdateParameter(g Gradient) error {
 		return fmt.Errorf("Name: %s, parameter and gradient element type not match, parameter: %v, gradient: %v", g.Name, o.elementType, g.ElementType)
 	}
-	r := C.paddle_update_parameter(o.opt, C.paddle_element_type(g.ElementType), unsafe.Pointer(&g.Content[0]), C.int(len(g.Content))/C.sizeof_float)
+	if o.contentLen != len(g.Content) {
+		return fmt.Errorf("Name: %s, parameter and gradient does not have same content len, parameter: %d, gradient: %d", g.Name, o.contentLen, len(g.Content))
+	}
+	r := C.paddle_update_parameter(o.opt, C.paddle_element_type(g.ElementType), unsafe.Pointer(&g.Content[0]), C.int(len(g.Content)))
 	if r != 0 {
 		return fmt.Errorf("optimizer update returned error code: %d", r)
 	}
@@ -86,8 +90,8 @@ func (o *optimizer) UpdateParameter(g Gradient) error {
 }
 func (o *optimizer) Cleanup() {
-	if unsafe.Pointer(o.opt) != nullPtr {
+	if unsafe.Pointer(o.opt) != nil {
 		C.paddle_release_optimizer(o.opt)
-		o.opt = (*C.struct_paddle_optimizer)(nullPtr)
+		o.opt = (*C.struct_paddle_optimizer)(nil)
 	}
 }
--- a/go/pserver/service.go
+++ b/go/pserver/service.go
@@ -211,7 +211,7 @@ func (s *Service) GetParam(name string, parameter *Parameter) error {
 	// learning optimization methods are stochastic in
 	// nature. This race condition is allowed deliberately
 	// to save the program from making a copy of the
-	// paramter content.
+	// parameter content.
 	parameter.Name = name
 	parameter.ElementType = opt.elementType
 	parameter.Content = opt.GetWeights()
@@ -219,7 +219,7 @@ func (s *Service) GetParam(name string, parameter *Parameter) error {
 }
 // pserver save checkpoint
-func (s *Service) doCheckpoint() error {
+func (s *Service) doCheckpoint() (err error) {
 	<-s.initialized
 	s.mu.Lock()
 	defer s.mu.Unlock()
@@ -237,9 +237,9 @@ func (s *Service) doCheckpoint() error {
 	}
 	var buf bytes.Buffer
 	encoder := gob.NewEncoder(&buf)
-	err := encoder.Encode(cp)
+	err = encoder.Encode(cp)
 	if err != nil {
-		return err
+		return
 	}
 	cpMeta := checkpointMeta{}
@@ -248,10 +248,14 @@ func (s *Service) doCheckpoint() error {
 	h := md5.New()
 	cpMeta.MD5 = hex.EncodeToString(h.Sum(buf.Bytes()))
-	cpMetajson, _ := json.Marshal(cpMeta)
+	cpMetajson, err := json.Marshal(cpMeta)
+	if err != nil {
+		return
+	}
 	err = s.client.PutKey(filepath.Join(PsCheckpoint, strconv.Itoa(s.idx)), cpMetajson, 3*time.Second)
 	if err != nil {
-		return err
+		return
 	}
 	if _, err = os.Stat(cpMeta.UUID); os.IsNotExist(err) {
 		log.Info("checkpoint does not exists.")
@@ -264,15 +268,32 @@ func (s *Service) doCheckpoint() error {
 		}
 	}
 	f, err := os.Create(cpMeta.UUID)
-	defer f.Close()
 	if err != nil {
-		return err
+		return
 	}
+	defer func() {
+		closeErr := f.Close()
+		if closeErr != nil {
+			if err != nil {
+				log.Errorln(closeErr)
+			} else {
+				// Set closeErr as return value.
+				err = closeErr
+			}
+		}
+	}()
 	writer := bufio.NewWriter(f)
 	_, err = writer.Write(buf.Bytes())
-	writer.Flush()
 	if err != nil {
-		return err
+		return
 	}
-	return nil
+	err = writer.Flush()
+	if err != nil {
+		return
+	}
+	return
 }
--- a/go/pserver/service_test.go
+++ b/go/pserver/service_test.go
@@ -31,7 +31,7 @@ func TestServiceFull(t *testing.T) {
 	err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 	var p1 pserver.Parameter
@@ -40,40 +40,40 @@ func TestServiceFull(t *testing.T) {
 	p1.ElementType = pserver.Float32
 	err = s.InitParam(pserver.ParameterWithConfig{Param: p1, Config: config}, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 	err = s.FinishInitParams(0, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 	var param pserver.Parameter
 	err = s.GetParam("param_b", &param)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 	if !reflect.DeepEqual(param, p1) {
-		t.FailNow()
+		t.Fatal("not equal:", param, p1)
 	}
 	g1, g2 := pserver.Gradient(p1), pserver.Gradient(p)
 	err = s.SendGrad(g1, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 	err = s.SendGrad(g2, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 	var param1 pserver.Parameter
 	err = s.GetParam("param_a", &param1)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 	// don't compare content, since it's already changed by
@@ -82,7 +82,7 @@ func TestServiceFull(t *testing.T) {
 	p.Content = nil
 	if !reflect.DeepEqual(param1, p) {
-		t.FailNow()
+		t.Fatal("not equal:", param1, p)
 	}
 }
@@ -90,16 +90,16 @@ func TestMultipleInit(t *testing.T) {
 	var cp pserver.Checkpoint
 	s, err := pserver.NewService(0, 1, "", nil, cp)
 	if err != nil {
-		t.Error(err)
+		t.Fatal(err)
 	}
 	err = s.FinishInitParams(0, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 	err = s.FinishInitParams(0, nil)
 	if err.Error() != pserver.AlreadyInitialized {
-		t.FailNow()
+		t.Fatal(err)
 	}
 }
@@ -108,7 +108,7 @@ func TestUninitialized(t *testing.T) {
 	s, err := pserver.NewService(0, 1, "", nil, cp)
 	err = s.SendGrad(pserver.Gradient{}, nil)
 	if err.Error() != pserver.Uninitialized {
-		t.FailNow()
+		t.Fatal(err)
 	}
 }
@@ -154,12 +154,12 @@ func TestBlockUntilInitialized(t *testing.T) {
 	err = s.InitParam(pserver.ParameterWithConfig{Param: p, Config: config}, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 	err = s.FinishInitParams(0, nil)
 	if err != nil {
-		t.FailNow()
+		t.Fatal(err)
 	}
 	wg.Wait()

--- a/paddle/api/ConfigParser.cpp
+++ b/paddle/api/ConfigParser.cpp
@@ -64,11 +64,7 @@ ModelConfig* TrainerConfig::getModelConfig() const {
 ParameterConfig::ParameterConfig() : m(new ParameterConfigPrivate()) {}
-ParameterConfig::~ParameterConfig() {
+ParameterConfig::~ParameterConfig() { delete m; }
-  if (m) {
-    delete m;
-  }
-}
 ParameterConfig* ParameterConfig::createParameterConfigFromParameterSharedPtr(
    void* ptr) {
@@ -98,11 +94,7 @@ void* ParameterConfig::getRawPtr() { return m->getConfigPtr(); }
 OptimizationConfig::OptimizationConfig() : m(new OptimizationConfigPrivate()) {}
-OptimizationConfig::~OptimizationConfig() {
+OptimizationConfig::~OptimizationConfig() { delete m; }
-  if (m) {
-    delete m;
-  }
-}
 std::string OptimizationConfig::toProtoString() {
  return m->getConfig().SerializeAsString();

--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -843,7 +843,8 @@ public:
                                               bool useSparseUpdater);
  static ParameterUpdater* createNewRemoteUpdater(
      OptimizationConfig* config,
-      const std::string pserverSpec) throw(UnsupportError);
+      const std::string pserverSpec,
+      const bool useEtcd) throw(UnsupportError);
  ~ParameterUpdater();
  /**

--- a/paddle/api/ParameterOptimizer.cpp
+++ b/paddle/api/ParameterOptimizer.cpp
@@ -53,11 +53,7 @@ struct ParameterTraverseCallbackPrivate {
 ParameterOptimizer::ParameterOptimizer() : m(new ParameterOptimizerPrivate()) {}
-ParameterOptimizer::~ParameterOptimizer() {
+ParameterOptimizer::~ParameterOptimizer() { delete m; }
-  if (m) {
-    delete m;
-  }
-}
 ParameterOptimizer* ParameterOptimizer::create(OptimizationConfig* config) {
  CHECK(config != nullptr);
@@ -104,11 +100,7 @@ std::vector<int> ParameterOptimizer::getParameterTypes() const {
 ParameterTraverseCallback::ParameterTraverseCallback()
    : m(new ParameterTraverseCallbackPrivate()) {}
-ParameterTraverseCallback::~ParameterTraverseCallback() {
+ParameterTraverseCallback::~ParameterTraverseCallback() { delete m; }
-  if (m) {
-    delete m;
-  }
-}
 void ParameterTraverseCallback::apply(const std::vector<Vector*>& vecs,
                                      const ParameterConfig& conf,

--- a/paddle/api/ParameterUpdater.cpp
+++ b/paddle/api/ParameterUpdater.cpp
@@ -33,11 +33,12 @@ ParameterUpdater *ParameterUpdater::createLocalUpdater(
 ParameterUpdater *ParameterUpdater::createNewRemoteUpdater(
    OptimizationConfig *config,
-    const std::string pserverSpec) throw(UnsupportError) {
+    const std::string pserverSpec,
+    const bool useEtcd) throw(UnsupportError) {
 #ifndef PADDLE_WITHOUT_GOLANG
  auto updater = new ParameterUpdater();
  updater->m->updater.reset(new paddle::NewRemoteParameterUpdater(
-      config->m->getConfig(), pserverSpec));
+      config->m->getConfig(), pserverSpec, useEtcd));
  return updater;
 #else
  throw UnsupportError();

--- a/paddle/api/Vector.cpp
+++ b/paddle/api/Vector.cpp
@@ -171,11 +171,7 @@ struct VectorPrivate {
 Vector::Vector() : m(new VectorPrivate()) {}
-Vector::~Vector() {
+Vector::~Vector() { delete m; }
-  if (m) {
-    delete m;
-  }
-}
 Vector* Vector::createZero(size_t sz, bool useGpu) {
  auto retVec = new Vector();

--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
 # ddim lib
-cc_library(ddim SRCS ddim.cc)
+cc_library(enforce SRCS enforce.cc DEPS glog)
+cc_test(enforce_test SRCS enforce_test.cc DEPS enforce)
+cc_library(ddim SRCS ddim.cc DEPS eigen3)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
-cc_test(tensor_test SRCS tensor_test.cc DEPS ddim)
+cc_library(tensor SRCS tensor.cc DEPS ddim place enforce paddle_memory)
+cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
 cc_test(variable_test SRCS variable_test.cc)
 cc_test(scope_test SRCS scope_test.cc)
-cc_test(enforce_test SRCS enforce_test.cc)
 proto_library(attr_type SRCS attr_type.proto)
 proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
 cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
 proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
 cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
-cc_library(operator SRCS operator.cc DEPS op_desc device_context)
+cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
-cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc)
+cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc enforce)
 cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry operator)
 py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto)
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)
 proto_library(net_proto SRCS net_proto.proto DEPS op_proto)
-cc_library(net SRCS net.cc DEPS net_proto)
+cc_library(net SRCS net.cc DEPS operator net_proto op_registry)
+cc_test(net_op_test SRCS net_op_test.cc DEPS net)
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #include "paddle/framework/ddim.h"
+#include "paddle/framework/enforce.h"
 namespace paddle {
 namespace framework {
-///@cond HIDDEN
+/// @cond HIDDEN
 template <int i>
 Dim<i> make_dim(const int* d) {
@@ -50,7 +65,7 @@ void make_ddim(DDim& ddim, const int* dims, int n) {
  }
 }
-///@endcond
+/// @endcond
 DDim make_ddim(std::initializer_list<int> dims) {
  DDim result(make_dim(0));
@@ -64,11 +79,11 @@ DDim make_ddim(const std::vector<int>& dims) {
  return result;
 }
-///@cond HIDDEN
+/// @cond HIDDEN
 // XXX For some reason, putting this in an anonymous namespace causes errors
 class DynamicMutableIndexer : public boost::static_visitor<int&> {
 public:
-  DynamicMutableIndexer(int idx) : idx_(idx) {}
+  explicit DynamicMutableIndexer(int idx) : idx_(idx) {}
  template <int D>
  int& operator()(Dim<D>& dim) const {
@@ -81,7 +96,7 @@ class DynamicMutableIndexer : public boost::static_visitor<int&> {
 class DynamicConstIndexer : public boost::static_visitor<int> {
 public:
-  DynamicConstIndexer(int idx) : idx_(idx) {}
+  explicit DynamicConstIndexer(int idx) : idx_(idx) {}
  template <int D>
  int operator()(const Dim<D>& dim) const {
@@ -92,7 +107,7 @@ class DynamicConstIndexer : public boost::static_visitor<int> {
  int idx_;
 };
-///@endcond
+/// @endcond
 int& DDim::operator[](int idx) {
  return boost::apply_visitor(DynamicMutableIndexer(idx), var);
@@ -102,6 +117,8 @@ int DDim::operator[](int idx) const {
  return boost::apply_visitor(DynamicConstIndexer(idx), var);
 }
+ssize_t DDim::size() const { return arity(*this); }
 bool DDim::operator==(DDim d) const {
  if (var.which() != d.getVar().which()) {
    return false;
@@ -155,11 +172,11 @@ int get(const DDim& ddim, int idx) { return ddim[idx]; }
 void set(DDim& ddim, int idx, int value) { ddim[idx] = value; }
-///@cond HIDDEN
+/// @cond HIDDEN
 struct VectorizeVisitor : public boost::static_visitor<> {
  std::vector<int>& vector;
-  VectorizeVisitor(std::vector<int>& v) : vector(v) {}
+  explicit VectorizeVisitor(std::vector<int>& v) : vector(v) {}
  template <typename T>
  void operator()(const T& t) {
@@ -169,7 +186,7 @@ struct VectorizeVisitor : public boost::static_visitor<> {
  void operator()(const Dim<1>& t) { vector.push_back(t.head); }
 };
-///@endcond
+/// @endcond
 std::vector<int> vectorize(const DDim& ddim) {
  std::vector<int> result;
@@ -178,16 +195,59 @@ std::vector<int> vectorize(const DDim& ddim) {
  return result;
 }
+struct ProductVisitor : public boost::static_visitor<ssize_t> {
+  template <int D>
+  ssize_t operator()(const Dim<D>& dim) {
+    return product(dim);
+  }
+};
 ssize_t product(const DDim& ddim) {
-  ssize_t result = 1;
+  ProductVisitor visitor;
-  std::vector<int> v = vectorize(ddim);
+  return boost::apply_visitor(visitor, ddim);
-  for (auto i : v) {
+}
-    result *= i;
+struct SliceVectorizeVisitor : public boost::static_visitor<> {
+  std::vector<int>& vector;
+  int begin;
+  int end;
+  SliceVectorizeVisitor(std::vector<int>& v, int b, int e)
+      : vector(v), begin(b), end(e) {
+    PADDLE_ENFORCE(begin < end,
+                   "Begin index must be less than end index in ddim slice.");
+    PADDLE_ENFORCE(begin >= 0,
+                   "Begin index can't be less than zero in ddim slice.");
  }
-  return result;
+  template <int S>
+  void operator()(const Dim<S>& dim) {
+    if (begin == 0) {
+      vector.push_back(dim.head);
+    } else {
+      --begin;
+    }
+    --end;
+    if (end > 0) {
+      this->operator()(dim.tail);
+    }
+  }
+  void operator()(const Dim<1>& dim) {
+    PADDLE_ENFORCE(end == 1, "End index in ddim slice is out of bound.");
+    vector.push_back(dim.head);
+  }
+};
+DDim slice_ddim(const DDim& dim, int begin, int end) {
+  std::vector<int> vec;
+  vec.reserve(end - begin);
+  SliceVectorizeVisitor visitor(vec, begin, end);
+  boost::apply_visitor(visitor, dim);
+  return make_ddim(vec);
 }
-///\cond HIDDEN
+/// \cond HIDDEN
 struct ArityVisitor : boost::static_visitor<int> {
  template <int D>
@@ -196,15 +256,15 @@ struct ArityVisitor : boost::static_visitor<int> {
  }
 };
-///\endcond
+/// \endcond
 int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); }
-///\cond HIDDEN
+/// \cond HIDDEN
 struct DDimPrinter : boost::static_visitor<void> {
  std::ostream& os;
-  DDimPrinter(std::ostream& os_) : os(os_) {}
+  explicit DDimPrinter(std::ostream& os_) : os(os_) {}
  template <typename T>
  void operator()(const T& t) {
@@ -212,7 +272,7 @@ struct DDimPrinter : boost::static_visitor<void> {
  }
 };
-///\endcond
+/// \endcond
 std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
  DDimPrinter printer(os);
@@ -220,5 +280,9 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
  return os;
 }
+DDim::DDim(std::initializer_list<int> init_list) {
+  *this = make_ddim(init_list);
+}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #pragma once
 #include <boost/variant.hpp>
 #include <initializer_list>
 #include <stdexcept>
 #include <vector>
 #include "paddle/framework/dim.h"
+#include "paddle/framework/enforce.h"
+#include "unsupported/Eigen/CXX11/Tensor"
 namespace paddle {
 namespace framework {
@@ -27,7 +42,9 @@ struct DDim {
  DDim() : var(Dim<1>()) {}
  template <int D>
-  DDim(const Dim<D>& in) : var(in) {}
+  explicit DDim(const Dim<D>& in) : var(in) {}
+  /*implicit*/ DDim(std::initializer_list<int> init_list);
  template <int D>
  DDim& operator=(const Dim<D>& in) {
@@ -57,6 +74,8 @@ struct DDim {
  DDim operator+(DDim d) const;
  DDim operator*(DDim d) const;
+  ssize_t size() const;
 };
 /**
@@ -81,6 +100,15 @@ std::vector<int> vectorize(const DDim& ddim);
 ssize_t product(const DDim& ddim);
+/**
+ * \brief Slice a ddim
+ *
+ * Slice dim with [begin, end).
+ * e.g.  DDim d = make_ddim({1,2,3,4,5});
+ *       slice_ddim(d, 1, 3); ====> {2,3}
+ */
+DDim slice_ddim(const DDim& dim, int begin, int end);
 /**
 * \brief What is the length of this dimension?
 *
@@ -91,6 +119,17 @@ int arity(const DDim& ddim);
 std::ostream& operator<<(std::ostream&, const DDim&);
+template <int NDIMS>
+Eigen::DSizes<Eigen::DenseIndex, NDIMS> ToEigenDSizes(const DDim& dims) {
+  int rank = arity(dims);
+  PADDLE_ENFORCE(rank == NDIMS, "DDim and NDIMS must be same");
+  Eigen::DSizes<Eigen::DenseIndex, NDIMS> dsizes;
+  for (int d = 0; d < rank; d++) {
+    dsizes[d] = dims[d];
+  }
+  return dsizes;
+}
 }  // namespace framework
 }  // namespace paddle

--- a/paddle/framework/ddim_test.cc
+++ b/paddle/framework/ddim_test.cc
@@ -49,9 +49,30 @@ TEST(DDim, Equality) {
  // arity of a DDim
  EXPECT_EQ(paddle::framework::arity(ddim), 3);
+  EXPECT_EQ(ddim.size(), 3);
  // product of a DDim
  EXPECT_EQ(paddle::framework::product(vddim), 45);
+  EXPECT_EQ(
+      paddle::framework::product(paddle::framework::make_ddim({3, 2, 5, 3})),
+      90);
+  // slice a DDim
+  paddle::framework::DDim ddim2 =
+      paddle::framework::make_ddim({1, 2, 3, 4, 5, 6});
+  paddle::framework::DDim ss = paddle::framework::slice_ddim(ddim2, 2, 5);
+  EXPECT_EQ(arity(ss), 3);
+  EXPECT_EQ(ss[0], 3);
+  EXPECT_EQ(ss[1], 4);
+  EXPECT_EQ(ss[2], 5);
+  paddle::framework::DDim ss2 = paddle::framework::slice_ddim(ddim2, 0, 6);
+  EXPECT_EQ(arity(ss2), 6);
+  EXPECT_EQ(ss2[0], 1);
+  EXPECT_EQ(ss2[1], 2);
+  EXPECT_EQ(ss2[2], 3);
+  EXPECT_EQ(ss2[3], 4);
+  EXPECT_EQ(ss2[4], 5);
+  EXPECT_EQ(ss2[5], 6);
 }
 TEST(DDim, Print) {

--- a/paddle/framework/dim_test.cu
+++ b/paddle/framework/dim_test.cu
 #include <thrust/device_vector.h>
 #include <sstream>
-#include "paddle/framework/dim.h"
 #include "gtest/gtest.h"
+#include "paddle/framework/dim.h"
 __global__ void test(paddle::framework::Dim<2>* o) {
-    o[0] = paddle::framework::make_dim(5, 6);
+  o[0] = paddle::framework::make_dim(5, 6);
 }
 __global__ void dyn_idx_gpu(int* o) {
-    auto d = paddle::framework::make_dim(5, 6);
+  auto d = paddle::framework::make_dim(5, 6);
-    o[0] = d[1];
+  o[0] = d[1];
 }
 TEST(Dim, Equality) {
-    // construct a Dim on the CPU
+  // construct a Dim on the CPU
-    auto a = paddle::framework::make_dim(3, 4);
+  auto a = paddle::framework::make_dim(3, 4);
-    EXPECT_EQ(paddle::framework::get<0>(a), 3);
+  EXPECT_EQ(paddle::framework::get<0>(a), 3);
-    EXPECT_EQ(paddle::framework::get<1>(a), 4);
+  EXPECT_EQ(paddle::framework::get<1>(a), 4);
-    // construct a Dim on the GPU
+  // construct a Dim on the GPU
-    thrust::device_vector<paddle::framework::Dim<2>> t(2);
+  thrust::device_vector<paddle::framework::Dim<2>> t(2);
-    test<<<1,1>>>(thrust::raw_pointer_cast(t.data()));
+  test<<<1, 1>>>(thrust::raw_pointer_cast(t.data()));
-    a = t[0];
+  a = t[0];
-    EXPECT_EQ(paddle::framework::get<0>(a), 5);
+  EXPECT_EQ(paddle::framework::get<0>(a), 5);
-    EXPECT_EQ(paddle::framework::get<1>(a), 6);
+  EXPECT_EQ(paddle::framework::get<1>(a), 6);
-    // linearization
+  // linearization
-    auto b = paddle::framework::make_dim(7, 8);
+  auto b = paddle::framework::make_dim(7, 8);
-    EXPECT_EQ(paddle::framework::linearize(a, b), 83);
+  EXPECT_EQ(paddle::framework::linearize(a, b), 83);
-    // product
+  // product
-    EXPECT_EQ(paddle::framework::product(a), 30);
+  EXPECT_EQ(paddle::framework::product(a), 30);
-    // mutate a Dim
+  // mutate a Dim
-    paddle::framework::get<1>(b) = 10;
+  paddle::framework::get<1>(b) = 10;
-    EXPECT_EQ(paddle::framework::get<0>(b), 7);
+  EXPECT_EQ(paddle::framework::get<0>(b), 7);
-    EXPECT_EQ(paddle::framework::get<1>(b), 10);
+  EXPECT_EQ(paddle::framework::get<1>(b), 10);
-    // dynamic access
+  // dynamic access
-    paddle::framework::get(b, 0) = 8;
+  paddle::framework::get(b, 0) = 8;
-    b[1] = 11;
+  b[1] = 11;
-    EXPECT_EQ(paddle::framework::get<0>(b), 8);
+  EXPECT_EQ(paddle::framework::get<0>(b), 8);
-    EXPECT_EQ(paddle::framework::get<1>(b), 11);
+  EXPECT_EQ(paddle::framework::get<1>(b), 11);
-    EXPECT_EQ(paddle::framework::get(b, 0), 8);
+  EXPECT_EQ(paddle::framework::get(b, 0), 8);
-    EXPECT_EQ(b[1], 11);
+  EXPECT_EQ(b[1], 11);
-    // dynamic access on GPU
+  // dynamic access on GPU
-    thrust::device_vector<int> r(1);
+  thrust::device_vector<int> r(1);
-    dyn_idx_gpu<<<1,1>>>(thrust::raw_pointer_cast(r.data()));
+  dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data()));
-    int res = r[0];
+  int res = r[0];
-    EXPECT_EQ(res, 6);
+  EXPECT_EQ(res, 6);
-    // ex_prefix_mul
+  // ex_prefix_mul
-    paddle::framework::Dim<3> c = paddle::framework::ex_prefix_mul(paddle::framework::Dim<3>(3, 4, 5));
+  paddle::framework::Dim<3> c =
-    EXPECT_EQ(paddle::framework::get<0>(c), 1);
+      paddle::framework::ex_prefix_mul(paddle::framework::Dim<3>(3, 4, 5));
-    EXPECT_EQ(paddle::framework::get<1>(c), 3);
+  EXPECT_EQ(paddle::framework::get<0>(c), 1);
-    EXPECT_EQ(paddle::framework::get<2>(c), 12);
+  EXPECT_EQ(paddle::framework::get<1>(c), 3);
+  EXPECT_EQ(paddle::framework::get<2>(c), 12);
-    // generate from an index
-    auto size = paddle::framework::make_dim(4, 5, 2);
+  // generate from an index
-    c = paddle::framework::Dim<3>(14, size);
+  auto size = paddle::framework::make_dim(4, 5, 2);
-    EXPECT_EQ(paddle::framework::get<0>(c), 2);
+  c = paddle::framework::Dim<3>(14, size);
-    EXPECT_EQ(paddle::framework::get<1>(c), 3);
+  EXPECT_EQ(paddle::framework::get<0>(c), 2);
-    EXPECT_EQ(paddle::framework::get<2>(c), 0);
+  EXPECT_EQ(paddle::framework::get<1>(c), 3);
-    c = paddle::framework::Dim<3>(25, size);
+  EXPECT_EQ(paddle::framework::get<2>(c), 0);
-    EXPECT_EQ(paddle::framework::get<0>(c), 1);
+  c = paddle::framework::Dim<3>(25, size);
-    EXPECT_EQ(paddle::framework::get<1>(c), 1);
+  EXPECT_EQ(paddle::framework::get<0>(c), 1);
-    EXPECT_EQ(paddle::framework::get<2>(c), 1);
+  EXPECT_EQ(paddle::framework::get<1>(c), 1);
+  EXPECT_EQ(paddle::framework::get<2>(c), 1);
 }
 TEST(Dim, Bool) {
-    auto a = paddle::framework::make_dim(3, 4);
+  auto a = paddle::framework::make_dim(3, 4);
-    auto b = paddle::framework::make_dim(5, 6);
+  auto b = paddle::framework::make_dim(5, 6);
-    auto c = paddle::framework::make_dim(3, 4);
+  auto c = paddle::framework::make_dim(3, 4);
-    // in_bounds check
+  // in_bounds check
-    EXPECT_TRUE(paddle::framework::contained(a, b));
+  EXPECT_TRUE(paddle::framework::contained(a, b));
-    EXPECT_FALSE(paddle::framework::contained(b, a));
+  EXPECT_FALSE(paddle::framework::contained(b, a));
-    // comparison
+  // comparison
-    EXPECT_TRUE(a == a);
+  EXPECT_TRUE(a == a);
-    EXPECT_FALSE(a == b);
+  EXPECT_FALSE(a == b);
-    EXPECT_TRUE(a == c);
+  EXPECT_TRUE(a == c);
 }
 TEST(Dim, Print) {
-    {
+  {
-        std::stringstream ss;
+    std::stringstream ss;
-        auto a = paddle::framework::make_dim(2, 3);
+    auto a = paddle::framework::make_dim(2, 3);
-        ss << a;
+    ss << a;
-        EXPECT_EQ(ss.str(), "2, 3");
+    EXPECT_EQ(ss.str(), "2, 3");
-    }
+  }
-    {
+  {
-        std::stringstream ss;
+    std::stringstream ss;
-        ss << paddle::framework::make_dim(8);
+    ss << paddle::framework::make_dim(8);
-        EXPECT_EQ(ss.str(), "8");
+    EXPECT_EQ(ss.str(), "8");
-    }
+  }
 }
--- a/paddle/framework/enforce.cc
+++ b/paddle/framework/enforce.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/framework/enforce.h"
--- a/paddle/framework/enforce.h
+++ b/paddle/framework/enforce.h
@@ -10,6 +10,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <glog/logging.h>
 #include <paddle/string/printf.h>
 #include <exception>
 #include <sstream>
@@ -58,12 +59,17 @@ class EnforceNotMet : public std::exception {
 /**
 * @brief Enforce a condition, otherwise throw an EnforceNotMet
 */
+#ifdef NDEBUG
 #define PADDLE_ENFORCE(condition, ...) \
  do {                                 \
    if (UNLIKELY(!(condition))) {      \
      PADDLE_THROW(__VA_ARGS__);       \
    }                                  \
  } while (0)
+#else
+#define PADDLE_ENFORCE(condition, ...) \
+  CHECK(condition) << ::paddle::string::Sprintf(__VA_ARGS__);
+#endif
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/net.cc
+++ b/paddle/framework/net.cc
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
 #include "paddle/framework/net.h"
 namespace paddle {
 namespace framework {
-PlainNet::PlainNet(const NetDesc& def) {}
+void PlainNet::CompleteAddOp() {
+  std::unordered_set<std::string> input_set;
-void PlainNet::InferShape(Scope* scope) {
+  std::unordered_set<std::string> output_set;
+  std::unordered_set<std::string> temp_output;
  for (auto& op : ops_) {
-    op.InferShape();
+    for (auto& ipt : op->inputs_) {
+      if (!Contains(output_set, ipt)) {  // Not other op's output
+        input_set.insert(ipt);
+      } else {
+        temp_output.insert(ipt);
+      }
+    }
+    for (auto& opt : op->outputs_) {
+      output_set.insert(opt);
+    }
+  }
+  inputs_.reserve(input_set.size());
+  std::copy(input_set.begin(), input_set.end(), std::back_inserter(inputs_));
+  outputs_.reserve(output_set.size());
+  std::vector<int> tmp_index;
+  tmp_index.reserve(temp_output.size());
+  int idx = 0;
+  for (auto& opt : output_set) {
+    if (Contains(temp_output, opt)) {
+      tmp_index.push_back(idx);
+    }
+    outputs_.push_back(opt);
+    ++idx;
  }
+  attrs_["temporary_index"] = tmp_index;
+  add_op_done_ = true;
 }
-void PlainNet::Run(std::shared_ptr<Scope> scope, DeviceContext* ctx) {
+std::string PlainNet::DebugString() const {
+  std::ostringstream os;
+  os << this->type_ << ":" << std::endl;
  for (auto& op : ops_) {
-    op.Run(ctx);
+    os << "\t" << op->DebugString() << std::endl;
  }
+  return os.str();
 }
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/net.h
+++ b/paddle/framework/net.h
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
+Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
+you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+You may obtain a copy of the License at
-   http://www.apache.org/licenses/LICENSE-2.0
+    http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
+Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
+distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
+See the License for the specific language governing permissions and
-   limitations under the License. */
+limitations under the License. */
 #pragma once
+#include <paddle/framework/op_desc.pb.h>
+#include <paddle/framework/operator.h>
 #include "paddle/framework/net_proto.pb.h"
 #include "paddle/framework/op_proto.pb.h"
+#include "paddle/framework/op_registry.h"
 #include "paddle/framework/scope.h"
 #include "paddle/platform/device_context.h"
 namespace paddle {
 namespace framework {
-using namespace paddle::platform;
-// operator's index stored in a network.
-typedef int OpIndex;
-/**
- * NOTE following codes are some definitions of unimplemented concepts.
- * We write some basic implementation to make Net compilable. These APIs will
- * keep updating if the concepts related are implemented.
- */
-struct OpDesc;
-struct OpAttrs {};
-class Operator {
- public:
-  Operator(const OpDesc &def) {}
-  void InferShape() {}
-  void Run(DeviceContext *ctx) {}
-};
 /**
- * @brief Network that manage the operators it has.
+ * @brief Network is also a type of Operator
+ *
+ * It will manage the operators it has.
 *
- * Network is the container and controller of a set of operators, user can build
+ * Network is the container and controller of a set of operators.
- * a real network from a NetDesc which is a protobuf message and use
- * Network.Run() * to run all the operators in the network.
 * A network object knows all Operators belonging to this network. Variables,
 * which are inputs and outputs of these operators, are created and managed by a
 * hierarchy of Scope objects.
 *
- * This is the base class of network, all the networks should implement the apis
+ * This is the base class of network, all the networks should implement the APIs
 * it defines.
 */
-class Net {
+class Net : public OperatorBase {
 public:
-  /**
+  virtual void AddOp(const OperatorPtr& op) = 0;
-   * @brief Infer shapes of all inputs and outputs of operators.
+  virtual void CompleteAddOp() = 0;
-   */
-  virtual void InferShape(Scope *scope) = 0;
-  /**
-   * @brief Run the network.
-   *
-   * Run all the operators and return success(true) or not, with all the
-   * variables are located in `scope`. `context` describes the detail execution
-   * environment for ops. `begin` and `end` specify the scope of `ops_` to run,
-   * If no positive indexes are provided, all operators in `ops_` will run.
-   */
-  virtual void Run(std::shared_ptr<Scope> scope, DeviceContext *ctx) = 0;
-  /**
-   * @brief Add an Operator according to `def`.
-   */
-  virtual OpIndex AddOp(const OpProto &def) = 0;
-  /**
-   * @brief Add optimizer operators acctording to `attrs`.
-   */
-  virtual void AddOptimizerOps(const OpAttrs &attrs) = 0;
-  /**
-   * @brief Add backward operators.
-   */
-  virtual void AddBackwardOps() = 0;
-  /**
-   * @brief Create a network.
-   */
-  static std::unique_ptr<Net> Create(const NetDesc &def = NetDesc());
-  virtual ~Net() {}
 };
+using NetPtr = std::shared_ptr<Net>;
 /**
 * @brief a basic implementation of Net.
 *
@@ -103,18 +55,14 @@ class Net {
 class PlainNet : public Net {
 public:
  /**
-   * @brief Initialize a PlainNet.
+   * Infer all the operators' input and output variables' shapes, will be called
-   *
-   * Initialize from  a network describe by `def`. NetDesc is the definition of
-   * a network.
-   */
-  PlainNet(const NetDesc &def);
-  /**
-   * Infer all the operators' input and output varialbes' shapes, will be called
   * before every mini-batch
   */
-  virtual void InferShape(Scope *scope) override;
+  void InferShape(const ScopePtr& scope) const override {
+    for (auto& op : ops_) {
+      op->InferShape(scope);
+    }
+  }
  /**
   * @brief Run the network.
@@ -123,48 +71,34 @@ class PlainNet : public Net {
   * scope will be used instead. If no OpContext is provicded, default context
   * will be used.
   */
-  virtual void Run(std::shared_ptr<Scope> scope, DeviceContext *ctx) override;
+  void Run(const ScopePtr& scope,
+           const platform::DeviceContext& dev_ctx) const override {
+    for (auto& op : ops_) {
+      op->Run(scope, dev_ctx);
+    }
+  }
  /**
-   * @brief Add an operator to this network.
+   * @brief Add an operator by ptr
   */
-  virtual OpIndex AddOp(const OpProto &def) override;
+  void AddOp(const OperatorPtr& op) override {
+    PADDLE_ENFORCE(!add_op_done_, "Cannot AddOp when this network is sealed");
+    ops_.push_back(op);
+  }
-  /**
+  void CompleteAddOp() override;
-   * @brief Add all optimizer operators related into the network.
-   */
-  virtual void AddOptimizerOps(const OpAttrs &attrs) override;
-  /**
+  std::string DebugString() const override;
-   * @brief Add all backward operators related into the network.
-   */
-  virtual void AddBackwardOps() override;
-  virtual ~PlainNet() override {}
- protected:
+  std::vector<OperatorPtr> ops_;
-  /**
-   * @brief Build the network.
-   *
-   * Create operators accordding to `def`, will be called by the constructor.
-   */
-  void BuildNet(const NetDesc &def);
-  /**
-   * @brief Add an operator into this network.
-   *
-   * Add a operator which is identified as `type` and has attributes described
-   * in `attrs`, the `inputs` are the keys of readonly input variables,
-   * `outputs` are keys of mutable output variables. An `OpIndex` will be
-   * returned to indicate the offset of the new operator in `ops_`.
-   */
-  OpIndex AddOp(const std::string &type, const std::vector<std::string> &inputs,
-                const std::vector<std::string> &outputs,
-                const OpAttrs &attrs = OpAttrs());
 private:
-  // the operators owned by `Network`.
+  bool add_op_done_{false};
-  std::vector<Operator> ops_;
+  template <typename T, typename KeyType>
+  static bool Contains(T container, KeyType key) {
+    return container.find(key) != container.end();
+  }
 };
 }  // namespace framework

--- a/paddle/framework/net_op_test.cc
+++ b/paddle/framework/net_op_test.cc
+#include <gtest/gtest.h>
+#include <paddle/framework/net.h>
+#include <paddle/framework/op_registry.h>
+#include <paddle/framework/operator.h>
+namespace pd = paddle::framework;
+static int infer_shape_cnt = 0;
+static int run_cnt = 0;
+class TestOp : public pd::OperatorBase {
+ public:
+  void InferShape(const paddle::framework::ScopePtr& scope) const override {
+    ++infer_shape_cnt;
+  }
+  void Run(const paddle::framework::ScopePtr& scope,
+           const paddle::platform::DeviceContext& dev_ctx) const override {
+    ++run_cnt;
+  }
+};
+template <typename T>
+void AssertSameVectorWithoutOrder(const std::vector<T>& expected,
+                                  const std::vector<T>& actual) {
+  ASSERT_EQ(expected.size(), actual.size());
+  std::unordered_set<T> expected_set;
+  for (auto& tmp : expected) {
+    expected_set.insert(tmp);
+  }
+  for (auto& act : actual) {
+    ASSERT_NE(expected_set.end(), expected_set.find(act));
+  }
+}
+TEST(OpKernel, all) {
+  auto net = std::make_shared<paddle::framework::PlainNet>();
+  ASSERT_NE(net, nullptr);
+  auto op1 = std::make_shared<TestOp>();
+  op1->inputs_ = {"x", "w1", "b1"};
+  op1->outputs_ = {"y"};
+  net->AddOp(op1);
+  auto op2 = std::make_shared<TestOp>();
+  op2->inputs_ = {"y", "w2", "b2"};
+  op2->outputs_ = {"z"};
+  net->AddOp(op2);
+  net->CompleteAddOp();
+  AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"}, net->inputs_);
+  AssertSameVectorWithoutOrder({"y", "z"}, net->outputs_);
+  auto tmp_idx_iter = net->attrs_.find("temporary_index");
+  ASSERT_NE(net->attrs_.end(), tmp_idx_iter);
+  auto& tmp_idx = boost::get<std::vector<int>>(tmp_idx_iter->second);
+  ASSERT_EQ(1UL, tmp_idx.size());
+  ASSERT_EQ("y", net->outputs_[tmp_idx[0]]);
+  auto scope = std::make_shared<pd::Scope>();
+  paddle::platform::CPUDeviceContext dev_ctx;
+  net->InferShape(scope);
+  net->Run(scope, dev_ctx);
+  ASSERT_EQ(2, infer_shape_cnt);
+  ASSERT_EQ(2, run_cnt);
+  ASSERT_THROW(net->AddOp(op2), paddle::framework::EnforceNotMet);
+}
--- a/paddle/framework/op_proto.proto
+++ b/paddle/framework/op_proto.proto
@@ -34,6 +34,11 @@ message AttrProto {
    // Supported attribute comments. It helps 3rd-party language generate doc-string.
    required string comment = 3;
+    // If that attribute is generated, it means the Paddle third language
+    // binding has responsibility to fill that attribute. End-User should
+    // not set that attribute.
+    optional bool generated = 4 [default=false];
 }
 // Input or output message for 3rd-party language binding.
@@ -45,6 +50,40 @@ message VarProto {
    // The comment for that input. It helps 3rd-party language generate doc-string.
    required string comment = 2;
+    // Is that input/output could be a list or not.
+    // If so, that Op should write a attributed named `input_format` or
+    // `output_format`.
+    //
+    // e.g.
+    //   If the op is a fc op, the inputs are `X`, `W`, `b`. The `X` and `W`
+    //   could be multiple, so the multiple of `X` and `W` is True, and OpDesc
+    //   will hold a attribute of them.
+    //
+    //   The Op desc of same fc could be
+    //   {
+    //      "type": "fc",
+    //      "input": ["X1", "X2", "W1", "W2", "b"],
+    //      "output": "fc.out",
+    //      "attrs" : {
+    //        "input_format": [0, 2, 4, 5]
+    //      }
+    //   }
+    //
+    optional bool multiple = 3 [default=false];
+    // It marks that output is a temporary output. That output is not used by
+    // user, but used by other op internally as input. If other op is not use
+    // that output, it could be optimized early.
+    //
+    // Attribute temporary_index will be set in OpDesc if there is some
+    // outputs are temporary.
+    //
+    // output = [ "xxx.out1", "xxx.tmp", "xxx.out2"],
+    // attrs = {
+    //   "temporary_index": [1]
+    // }
+    optional bool temporary = 4 [default=false];
 }
 // Op protocol message for 3rd-party language binding.

--- a/paddle/framework/op_registry.cc
+++ b/paddle/framework/op_registry.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #include <paddle/framework/op_registry.h>
 namespace paddle {
@@ -33,4 +47,4 @@ void AttrTypeHelper::SetAttrType<std::vector<std::string>>(AttrProto* attr) {
  attr->set_type(paddle::framework::AttrType::STRINGS);
 }
 }  // namespace framework
 }  // namespace paddle
\ No newline at end of file
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
 #pragma once
 #include <algorithm>
+#include <atomic>
 #include <type_traits>
+#include <unordered_map>
+#include <unordered_set>
 #include "paddle/framework/attr_checker.h"
 #include "paddle/framework/op_desc.pb.h"
 #include "paddle/framework/op_proto.pb.h"
@@ -59,25 +62,59 @@ class OpProtoAndCheckerMaker {
  OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
      : proto_(proto), op_checker_(op_checker) {}
+  ~OpProtoAndCheckerMaker() {
+    PADDLE_ENFORCE(validated_, "should call Validate after build");
+  }
+  void Validate() {
+    validated_ = true;
+    CheckNoDuplicatedInOutAttrs();
+  }
 protected:
-  void AddInput(const std::string& name, const std::string& comment) {
+  void AddInput(const std::string& name, const std::string& comment,
+                bool multiple = false) {
    auto input = proto_->mutable_inputs()->Add();
    *input->mutable_name() = name;
    *input->mutable_comment() = comment;
+    input->set_multiple(multiple);
+    if (multiple) {
+      SetHasMultipleInput();
+    }
  }
-  void AddOutput(const std::string& name, const std::string& comment) {
+  void AddInputs(const std::string& name, const std::string& comment) {
+    AddInput(name, comment, true);
+  }
+  void AddOutput(const std::string& name, const std::string& comment,
+                 bool temporary = false, bool multiple = false) {
    auto output = proto_->mutable_outputs()->Add();
    *output->mutable_name() = name;
    *output->mutable_comment() = comment;
+    output->set_multiple(multiple);
+    if (multiple) {
+      SetHasMultipleOutput();
+    }
+    output->set_temporary(temporary);
+    if (temporary) {
+      SetHasTemporaryOutput();
+    }
+  }
+  void AddOutputs(const std::string& name, const std::string& comment,
+                  bool temporary = false) {
+    AddOutput(name, comment, temporary, true);
  }
  template <typename T>
  TypedAttrChecker<T>& AddAttr(const std::string& name,
-                               const std::string& comment) {
+                               const std::string& comment,
+                               bool generated = false) {
    auto attr = proto_->mutable_attrs()->Add();
    *attr->mutable_name() = name;
    *attr->mutable_comment() = comment;
+    attr->set_generated(generated);
    AttrTypeHelper::SetAttrType<T>(attr);
    return op_checker_->AddAttrChecker<T>(name);
  }
@@ -86,12 +123,83 @@ class OpProtoAndCheckerMaker {
    *(proto_->mutable_comment()) = comment;
  }
+ private:
+  void SetHasMultiple(const std::string& in_out, bool* flag) {
+    if (!*flag) {
+      AddAttr<std::vector<int>>(in_out + "_format",
+                                "The multiple index of " + in_out +
+                                    "\n"
+                                    R"DOC(
+This attribute is used by Paddle core framework. Paddle's Op support each input
+or output could be a list of variable. This attribute is used to show how that
+list organized.
+e.g.
+  input = ["a", "b", "c", "d", "e", "f"]
+  input_format = [0, 4, 5, 6]
+means
+  The number of all input variables this op is six, and they are segmented into
+  three inputs.
+  The first input is input[0:4], second is input[4:5], third is input[5:6].
+)DOC",
+                                /*generated*/ true);
+      *flag = true;
+    }
+  }
+  void SetHasMultipleInput() { SetHasMultiple("input", &has_multiple_input_); }
+  void SetHasMultipleOutput() {
+    SetHasMultiple("output", &has_multiple_output_);
+  }
+  void SetHasTemporaryOutput() {
+    if (!has_temporary_output_) {
+      AddAttr<std::vector<int>>("temporary_index",
+                                R"DOC(The temporary index of output.
+Not all output of Paddle Op is used by user. For faster computation, each op
+could output some its internal state to other op, other op could take that
+output to make compute faster.
+Add a mark to which output is temporary is helpful for future optimization.
+)DOC",
+                                /*generated*/ true)
+          .SetDefault(std::vector<int>());
+      has_temporary_output_ = true;
+    }
+  }
+  void CheckNoDuplicatedInOutAttrs() {
+    std::unordered_set<std::string> names;
+    auto checker = [&](const std::string& name) {
+      PADDLE_ENFORCE(!names.count(name), "[%s] is duplicated", name);
+      names.insert(name);
+    };
+    for (auto& attr : proto_->attrs()) {
+      checker(attr.name());
+    }
+    for (auto& input : proto_->inputs()) {
+      checker(input.name());
+    }
+    for (auto& output : proto_->outputs()) {
+      checker(output.name());
+    }
+  }
  OpProto* proto_;
  OpAttrChecker* op_checker_;
+  bool validated_{false};
+  bool has_multiple_input_{false};
+  bool has_multiple_output_{false};
+  bool has_temporary_output_{false};
 };
 class OpRegistry {
  using OpCreator = std::function<OperatorBase*()>;
+  using VarIndexMap = std::unordered_map<std::string, int>;
+  using VarNameList = std::vector<std::string>;
 public:
  template <typename OpType, typename ProtoMakerType>
@@ -99,36 +207,71 @@ class OpRegistry {
    creators()[op_type] = [] { return new OpType; };
    OpProto& op_proto = protos()[op_type];
    OpAttrChecker& op_checker = op_checkers()[op_type];
-    ProtoMakerType(&op_proto, &op_checker);
+    auto maker = ProtoMakerType(&op_proto, &op_checker);
+    maker.Validate();
    *op_proto.mutable_type() = op_type;
    PADDLE_ENFORCE(
        op_proto.IsInitialized(),
        "Fail to initialize %s's OpProto, because %s is not initialized",
        op_type, op_proto.InitializationErrorString());
+    VarIndexMaps()[op_type].reset(new VarIndexMap());
+    auto& varmap = *VarIndexMaps()[op_type];
+    int idx = 0;
+    for (auto& var : op_proto.inputs()) {
+      varmap[var.name()] = idx++;
+    }
+    idx = 0;
+    for (auto& var : op_proto.outputs()) {
+      varmap[var.name()] = idx++;
+    }
  }
-  static OperatorBase* CreateOp(const OpDesc& op_desc) {
+  static OperatorPtr CreateOp(const std::string& type,
-    std::string op_type = op_desc.type();
+                              const VarNameList& inputs,
-    OperatorBase* op = creators().at(op_type)();
+                              const VarNameList& outputs,
-    op->desc_ = op_desc;
+                              const AttributeMap& attrs) {
-    op->inputs_.reserve((size_t)op_desc.inputs_size());
+    auto op_create_it = creators().find(type);
+    PADDLE_ENFORCE(op_create_it != creators().end(),
+                   "Operator %s cannot be found", type);
+    auto op = op_create_it->second();
+    op->type_ = type;
+    op->inputs_ = inputs;
+    op->outputs_ = outputs;
+    op->attrs_ = attrs;
+    op_checkers().at(type).Check(op->attrs_);
+    GenerateTempVariableName(op);
+    {
+      auto var_index_it = VarIndexMaps().find(type);
+      if (var_index_it != VarIndexMaps().end()) {
+        op->in_out_idxs_ = var_index_it->second;
+      }
+    }
+    op->Init();
+    return OperatorPtr(op);
+  }
+  static OperatorPtr CreateOp(const OpDesc& op_desc) {
+    std::vector<std::string> inputs;
+    inputs.reserve((size_t)op_desc.inputs_size());
    std::copy(op_desc.inputs().begin(), op_desc.inputs().end(),
-              std::back_inserter(op->inputs_));
+              std::back_inserter(inputs));
-    op->outputs_.reserve((size_t)op_desc.outputs_size());
+    std::vector<std::string> outputs;
+    outputs.reserve((size_t)op_desc.outputs_size());
    std::copy(op_desc.outputs().begin(), op_desc.outputs().end(),
-              std::back_inserter(op->outputs_));
+              std::back_inserter(outputs));
+    AttributeMap attrs;
    for (auto& attr : op_desc.attrs()) {
-      op->attrs_[attr.name()] = AttrTypeHelper::GetAttrValue(attr);
+      attrs[attr.name()] = AttrTypeHelper::GetAttrValue(attr);
    }
-    op_checkers().at(op_type).Check(op->attrs_);
-    op->Init();
-    return op;
-  }
- private:
+    return CreateOp(op_desc.type(), inputs, outputs, attrs);
-  static std::unordered_map<std::string, OpCreator>& creators() {
-    static std::unordered_map<std::string, OpCreator> creators_;
-    return creators_;
  }
  static std::unordered_map<std::string, OpProto>& protos() {
@@ -136,6 +279,29 @@ class OpRegistry {
    return protos_;
  };
+ private:
+  static std::unordered_map<std::string, std::shared_ptr<VarIndexMap>>&
+  VarIndexMaps() {
+    static std::unordered_map<std::string, std::shared_ptr<VarIndexMap>> maps_;
+    return maps_;
+  }
+  static void GenerateTempVariableName(OperatorBase* op) {
+    static std::atomic<size_t> gUniqId(0UL);
+    for (auto& outname : op->outputs_) {
+      if (outname == OperatorBase::TMP_VAR_NAME()) {
+        outname += op->type_;
+        outname += "@";
+        outname += std::to_string(gUniqId.fetch_add(1));
+      }
+    }
+  }
+  static std::unordered_map<std::string, OpCreator>& creators() {
+    static std::unordered_map<std::string, OpCreator> creators_;
+    return creators_;
+  }
  static std::unordered_map<std::string, OpAttrChecker>& op_checkers() {
    static std::unordered_map<std::string, OpAttrChecker> op_checkers_;
    return op_checkers_;
@@ -150,12 +316,18 @@ class OpRegisterHelper {
  }
 };
+/**
+ * check if MACRO is used in GLOBAL NAMESPACE.
+ */
 #define STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg)                        \
  struct __test_global_namespace_##uniq_name##__ {};                          \
  static_assert(std::is_same<::__test_global_namespace_##uniq_name##__,       \
                             __test_global_namespace_##uniq_name##__>::value, \
                msg)
+/**
+ * Macro to Register Operator.
+ */
 #define REGISTER_OP(__op_type, __op_class, __op_maker_class)                 \
  STATIC_ASSERT_GLOBAL_NAMESPACE(__reg_op__##__op_type,                      \
                                 "REGISTER_OP must be in global namespace"); \
@@ -163,27 +335,36 @@ class OpRegisterHelper {
      __op_register_##__op_type##__(#__op_type);                             \
  int __op_register_##__op_type##_handle__() { return 0; }
-#define REGISTER_OP_KERNEL(type, GPU_OR_CPU, PlaceType, KernelType)       \
+/**
+ * Macro to Register OperatorKernel.
+ */
+#define REGISTER_OP_KERNEL(type, DEVICE_TYPE, PlaceType, ...)             \
  STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
-      __reg_op_kernel_##type##_##GPU_OR_CPU##__,                          \
+      __reg_op_kernel_##type##_##DEVICE_TYPE##__,                         \
      "REGISTER_OP_KERNEL must be in global namespace");                  \
  struct __op_kernel_register__##type##__ {                               \
    __op_kernel_register__##type##__() {                                  \
      ::paddle::framework::OperatorWithKernel::OpKernelKey key;           \
      key.place_ = PlaceType();                                           \
      ::paddle::framework::OperatorWithKernel::AllOpKernels()[#type][key] \
-          .reset(new KernelType());                                       \
+          .reset(new __VA_ARGS__());                                      \
    }                                                                     \
  };                                                                      \
  static __op_kernel_register__##type##__ __reg_kernel_##type##__;        \
-  int __op_kernel_register_##type##_handle_##GPU_OR_CPU##__() { return 0; }
+  int __op_kernel_register_##type##_handle_##DEVICE_TYPE##__() { return 0; }
-#define REGISTER_OP_GPU_KERNEL(type, KernelType) \
+// (type, KernelType)
-  REGISTER_OP_KERNEL(type, GPU, ::paddle::platform::GPUPlace, KernelType)
+#define REGISTER_OP_GPU_KERNEL(type, ...) \
+  REGISTER_OP_KERNEL(type, GPU, ::paddle::platform::GPUPlace, __VA_ARGS__)
-#define REGISTER_OP_CPU_KERNEL(type, KernelType) \
+// (type, KernelType)
-  REGISTER_OP_KERNEL(type, CPU, ::paddle::platform::CPUPlace, KernelType)
+#define REGISTER_OP_CPU_KERNEL(type, ...) \
+  REGISTER_OP_KERNEL(type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
+/**
+ * Macro to mark what Operator and Kernel we will use and tell the compiler to
+ * link them into target.
+ */
 #define USE_OP_WITHOUT_KERNEL(op_type)                      \
  STATIC_ASSERT_GLOBAL_NAMESPACE(                           \
      __use_op_without_kernel_##op_type,                    \
@@ -201,15 +382,16 @@ class OpRegisterHelper {
      __attribute__((unused)) =                                           \
          __op_kernel_register_##op_type##_handle_##DEVICE_TYPE##__()
-#ifdef PADDLE_ONLY_CPU
+// use Operator with only cpu kernel.
-#define USE_OP(op_type)           \
+#define USE_OP_CPU(op_type)       \
  USE_OP_WITHOUT_KERNEL(op_type); \
-  USE_OP_KERNEL(op_type, CPU);
+  USE_OP_KERNEL(op_type, CPU)
+#ifdef PADDLE_ONLY_CPU
+#define USE_OP(op_type) USE_OP_CPU(op_type)
 #else
-#define USE_OP(op_type)           \
+#define USE_OP(op_type) \
-  USE_OP_WITHOUT_KERNEL(op_type); \
+  USE_OP_CPU(op_type);  \
-  USE_OP_KERNEL(op_type, CPU);    \
  USE_OP_KERNEL(op_type, GPU)
 #endif

--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
 #include "paddle/framework/op_registry.h"
 #include <gtest/gtest.h>
+namespace pd = paddle::framework;
 namespace paddle {
 namespace framework {
 class CosineOp : public OperatorBase {
 public:
-  void Run(const std::shared_ptr<Scope>& scope,
+  void Run(const ScopePtr& scope,
           const platform::DeviceContext& dev_ctx) const override {}
-  void InferShape(const std::shared_ptr<Scope>& scope) const override {}
+  void InferShape(const ScopePtr& scope) const override {}
 };
 class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
@@ -25,19 +27,18 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 class MyTestOp : public OperatorBase {
 public:
-  void InferShape(const std::shared_ptr<Scope>& scope) const override {}
+  void InferShape(const ScopePtr& scope) const override {}
-  void Run(const std::shared_ptr<Scope>& scope,
+  void Run(const ScopePtr& scope,
           const platform::DeviceContext& dev_ctx) const override {}
- public:
 };
 class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 public:
  MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("input", "input of cosine op");
+    AddInputs("input", "input of cosine op");
-    AddOutput("output", "output of cosine op");
+    AddOutput("output", "output of cosine op",
+              /*temporary*/ true);
    auto my_checker = [](int i) {
      PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!");
    };
@@ -66,7 +67,7 @@ TEST(OpRegistry, CreateOp) {
  attr->set_type(paddle::framework::AttrType::FLOAT);
  attr->set_f(scale);
-  paddle::framework::OperatorBase* op =
+  paddle::framework::OperatorPtr op =
      paddle::framework::OpRegistry::CreateOp(op_desc);
  auto scope = std::make_shared<paddle::framework::Scope>();
  paddle::platform::CPUDeviceContext dev_ctx;
@@ -88,7 +89,7 @@ TEST(OpRegistry, IllegalAttr) {
  bool caught = false;
  try {
-    paddle::framework::OperatorBase* op __attribute__((unused)) =
+    paddle::framework::OperatorPtr op __attribute__((unused)) =
        paddle::framework::OpRegistry::CreateOp(op_desc);
  } catch (paddle::framework::EnforceNotMet err) {
    caught = true;
@@ -109,7 +110,7 @@ TEST(OpRegistry, DefaultValue) {
  ASSERT_TRUE(op_desc.IsInitialized());
-  paddle::framework::OperatorBase* op =
+  paddle::framework::OperatorPtr op =
      paddle::framework::OpRegistry::CreateOp(op_desc);
  auto scope = std::make_shared<paddle::framework::Scope>();
  paddle::platform::CPUDeviceContext dev_ctx;
@@ -117,16 +118,25 @@ TEST(OpRegistry, DefaultValue) {
  ASSERT_EQ(op->GetAttr<float>("scale"), 1.0);
 }
+static void SetInputFormat(paddle::framework::OpDesc* desc) {
+  auto attr = desc->add_attrs();
+  attr->set_name("input_format");
+  attr->set_type(paddle::framework::INTS);
+  attr->mutable_ints()->Add(0);
+  attr->mutable_ints()->Add(1);
+}
 TEST(OpRegistry, CustomChecker) {
  paddle::framework::OpDesc op_desc;
  op_desc.set_type("my_test_op");
  op_desc.add_inputs("ii");
  op_desc.add_outputs("oo");
+  SetInputFormat(&op_desc);
  // attr 'test_attr' is not set
  bool caught = false;
  try {
-    paddle::framework::OperatorBase* op __attribute__((unused)) =
+    paddle::framework::OperatorPtr op __attribute__((unused)) =
        paddle::framework::OpRegistry::CreateOp(op_desc);
  } catch (paddle::framework::EnforceNotMet err) {
    caught = true;
@@ -145,7 +155,7 @@ TEST(OpRegistry, CustomChecker) {
  attr->set_i(3);
  caught = false;
  try {
-    paddle::framework::OperatorBase* op __attribute__((unused)) =
+    paddle::framework::OperatorPtr op __attribute__((unused)) =
        paddle::framework::OpRegistry::CreateOp(op_desc);
  } catch (paddle::framework::EnforceNotMet err) {
    caught = true;
@@ -163,7 +173,8 @@ TEST(OpRegistry, CustomChecker) {
  attr->set_name("test_attr");
  attr->set_type(paddle::framework::AttrType::INT);
  attr->set_i(4);
-  paddle::framework::OperatorBase* op =
+  SetInputFormat(&op_desc);
+  paddle::framework::OperatorPtr op =
      paddle::framework::OpRegistry::CreateOp(op_desc);
  paddle::platform::CPUDeviceContext dev_ctx;
  auto scope = std::make_shared<paddle::framework::Scope>();
@@ -171,3 +182,35 @@ TEST(OpRegistry, CustomChecker) {
  int test_attr = op->GetAttr<int>("test_attr");
  ASSERT_EQ(test_attr, 4);
 }
+class TestAttrProtoMaker : public pd::OpProtoAndCheckerMaker {
+ public:
+  TestAttrProtoMaker(pd::OpProto* proto, pd::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddAttr<float>("scale", "scale of test op");
+    AddAttr<float>("scale", "scale of test op");
+  }
+};
+TEST(ProtoMaker, DuplicatedAttr) {
+  pd::OpProto op_proto;
+  pd::OpAttrChecker op_checker;
+  auto proto_maker = TestAttrProtoMaker(&op_proto, &op_checker);
+  ASSERT_THROW(proto_maker.Validate(), paddle::framework::EnforceNotMet);
+}
+class TestInOutProtoMaker : public pd::OpProtoAndCheckerMaker {
+ public:
+  TestInOutProtoMaker(pd::OpProto* proto, pd::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("input", "input of test op");
+    AddInput("input", "input of test op");
+  }
+};
+TEST(ProtoMaker, DuplicatedInOut) {
+  pd::OpProto op_proto;
+  pd::OpAttrChecker op_checker;
+  auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker);
+  ASSERT_THROW(proto_maker.Validate(), paddle::framework::EnforceNotMet);
+}
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -12,32 +12,92 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <algorithm>
 #include "paddle/framework/operator.h"
 namespace paddle {
 namespace framework {
+template <>
+Eigen::DefaultDevice* KernelContext::GetEigenDevice<
+    platform::CPUPlace, Eigen::DefaultDevice>() const {
+  return device_context_.get_eigen_device<Eigen::DefaultDevice>();
+}
+#ifndef PADDLE_ONLY_CPU
+template <>
+Eigen::GpuDevice*
+KernelContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
+  return device_context_.get_eigen_device<Eigen::GpuDevice>();
+}
+#endif
+const std::string& OperatorBase::Input(const std::string& name) const {
+  auto it = in_out_idxs_->find(name);
+  PADDLE_ENFORCE(it != in_out_idxs_->end(), "no key [%s] in in_out_idxs_",
+                 name);
+  if (attrs_.count("input_format") == 0) {
+    return inputs_[it->second];
+  } else {
+    const auto& input_format = GetAttr<std::vector<int>>("input_format");
+    int idx = input_format[it->second];
+    return inputs_.at(idx);
+  }
+}
+std::vector<std::string> OperatorBase::Inputs(const std::string& name) const {
+  auto input_format = GetAttr<std::vector<int>>("input_format");
+  auto offset = in_out_idxs_->at(name);
+  return std::vector<std::string>{
+      inputs_.begin() + input_format.at(offset),
+      inputs_.begin() + input_format.at(offset + 1)};
+}
+const std::string& OperatorBase::Output(const std::string& name) const {
+  auto it = in_out_idxs_->find(name);
+  PADDLE_ENFORCE(it != in_out_idxs_->end(), "no key [%s] in in_out_idxs_",
+                 name);
+  if (attrs_.count("output_format") == 0) {
+    return outputs_[it->second];
+  } else {
+    const auto& output_format = GetAttr<std::vector<int>>("output_format");
+    int idx = output_format[it->second];
+    return outputs_.at(idx);
+  }
+}
+std::vector<std::string> OperatorBase::Outputs(const std::string& name) const {
+  auto output_format = GetAttr<std::vector<int>>("output_format");
+  auto offset = in_out_idxs_->at(name);
+  return std::vector<std::string>{
+      outputs_.begin() + output_format.at(offset),
+      outputs_.begin() + output_format.at(offset + 1)};
+}
 std::string OperatorBase::DebugString() const {
  std::stringstream ss;
-  ss << "=================\n";
+  ss << "Op(" << type_ << "), inputs:(";
-  ss << "type = " << desc_.type() << "\n";
+  for (size_t i = 0; i < inputs_.size(); ++i) {
-  ss << "inputs = [";
+    ss << inputs_[i];
-  for (auto& ipt : inputs_) {
+    if (i != inputs_.size() - 1) {
-    ss << ipt << ", ";
+      ss << ", ";
-  }
+    }
-  ss << "]\n";
-  ss << "outputs = [";
-  for (auto& opt : outputs_) {
-    ss << opt << ", ";
  }
-  ss << "]\n";
+  ss << "), outputs:(";
-  ss << "attr_keys = [";
+  for (size_t i = 0; i < outputs_.size(); ++i) {
-  for (auto& attr : attrs_) {
+    ss << outputs_[i];
-    ss << attr.first << ", ";
+    if (i != outputs_.size() - 1) {
+      ss << ", ";
+    }
  }
-  ss << "]\n";
+  ss << ").";
  return ss.str();
 }
 }  // namespace framework
 }  // namespace paddle
\ No newline at end of file
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -14,23 +14,40 @@ limitations under the License. */
 #pragma once
-#include <paddle/framework/attr_checker.h>
-#include <paddle/framework/op_desc.pb.h>
-#include <paddle/framework/scope.h>
-#include <paddle/framework/tensor.h>
-#include <paddle/platform/device_context.h>
-#include <paddle/platform/place.h>
-#include <paddle/utils/Error.h>
 #include <boost/variant.hpp>
 #include <string>
 #include <unordered_map>
 #include <vector>
+#include "paddle/framework/attr_checker.h"
+#include "paddle/framework/op_desc.pb.h"
+#include "paddle/framework/op_proto.pb.h"
+#include "paddle/framework/scope.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/place.h"
+#include "paddle/utils/Error.h"
 namespace paddle {
 namespace framework {
-class OperatorBase;
+template <typename T>
+struct EigenDeviceConverter;
+template <>
+struct EigenDeviceConverter<platform::CPUPlace> {
+  using EigenDeviceType = Eigen::DefaultDevice;
+};
+#ifndef PADDLE_ONLY_CPU
+template <>
+struct EigenDeviceConverter<platform::GPUPlace> {
+  using EigenDeviceType = Eigen::GpuDevice;
+};
+#endif
+class OperatorBase;
+using OperatorPtr = std::shared_ptr<OperatorBase>;
 /**
 * OperatorBase has the basic element that Net will call to do computation.
 * Only CreateOperator from OpRegistry will new Operator directly. User
@@ -39,6 +56,13 @@ class OperatorBase;
 */
 class OperatorBase {
 public:
+  /// If a variable is a empty variable, that name will be used.
+  static std::string EMPTY_VAR_NAME() { return "@EMPTY@"; }
+  /// If a variable is a temporary variable, that name will be set in Python,
+  /// but it will be convert to a unique name in scope after OpCreator.
+  static std::string TMP_VAR_NAME() { return "@TEMP@"; }
  virtual ~OperatorBase() {}
  template <typename T>
@@ -48,7 +72,7 @@ class OperatorBase {
    return boost::get<T>(attrs_.at(name));
  }
-  std::string DebugString() const;
+  virtual std::string DebugString() const;
  /// Init will be called after CreateOperator, you can put some initialization
  /// logic here.
@@ -56,20 +80,82 @@ class OperatorBase {
  /// InferShape infer the size of Variables used by this Operator with
  /// information inside scope
-  virtual void InferShape(const std::shared_ptr<Scope>& scope) const = 0;
+  virtual void InferShape(const ScopePtr& scope) const = 0;
  /// Net will call this function to Run an op.
-  virtual void Run(const std::shared_ptr<Scope>& scope,
+  virtual void Run(const ScopePtr& scope,
                   const platform::DeviceContext& dev_ctx) const = 0;
- protected:
+  // Get a input with argument's name described in `op_proto`
-  std::string Type() const { return desc_.type(); }
+  const std::string& Input(const std::string& name) const;
+  // Get a input which has multiple variables.
+  // TODO add a vector_view to prevent memory copy.
+  std::vector<std::string> Inputs(const std::string& name) const;
+  // Get a output with argument's name described in `op_proto`
+  const std::string& Output(const std::string& name) const;
+  // Get an output which has multiple variables.
+  // TODO add a vector_view to prevent memory copy.
+  std::vector<std::string> Outputs(const std::string& name) const;
 public:
-  OpDesc desc_;
+  std::string type_;
  std::vector<std::string> inputs_;
  std::vector<std::string> outputs_;
  AttributeMap attrs_;
+  // store the arguments' offset described in op_desc.
+  std::shared_ptr<std::unordered_map<std::string, int>> in_out_idxs_;
+};
+class KernelContext {
+ public:
+  KernelContext(const OperatorBase* op, const std::shared_ptr<Scope>& scope,
+                const platform::DeviceContext& device_context)
+      : op_(*op), scope_(scope), device_context_(device_context) {}
+  const Variable* Input(int index) const {
+    return scope_->GetVariable(op_.inputs_[index]);
+  }
+  Variable* Output(int index) const {
+    return scope_->GetVariable(op_.outputs_[index]);
+  }
+  const Variable* Input(const std::string& name) const {
+    return scope_->GetVariable(op_.Input(name));
+  }
+  const Variable* Output(const std::string& name) const {
+    return scope_->GetVariable(op_.Output(name));
+  }
+  const std::vector<const Variable*> Inputs(const std::string& name) const {
+    auto names = op_.Inputs(name);
+    std::vector<const Variable*> res;
+    std::transform(
+        names.begin(), names.end(), res.begin(),
+        [this](const std::string& name) { return scope_->GetVariable(name); });
+    return res;
+  }
+  const std::vector<const Variable*> Outputs(const std::string& name) const {
+    auto names = op_.Outputs(name);
+    std::vector<const Variable*> res;
+    std::transform(
+        names.begin(), names.end(), res.begin(),
+        [this](const std::string& name) { return scope_->GetVariable(name); });
+    return res;
+  }
+  template <typename PlaceType,
+            typename DeviceType =
+                typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
+  DeviceType* GetEigenDevice() const;
+  platform::Place GetPlace() const { return device_context_.GetPlace(); }
+  const OperatorBase& op_;
+  const std::shared_ptr<Scope>& scope_;
+  const platform::DeviceContext& device_context_;
 };
 class OpKernel {
@@ -80,24 +166,6 @@ class OpKernel {
   * device resource such as CUDA stream, cublas handle, etc. from
   * KernelContext. User should construct it before run the Operator.
   */
-  class KernelContext {
-   public:
-    KernelContext(const OperatorBase* op, const std::shared_ptr<Scope>& scope,
-                  const platform::DeviceContext& device_context)
-        : op_(*op), scope_(scope), device_context_(device_context) {}
-    const Variable* Input(int index) const {
-      return scope_->GetVariable(op_.inputs_[index]);
-    }
-    Variable* Output(int index) const {
-      return scope_->GetVariable(op_.outputs_[index]);
-    }
-    const OperatorBase& op_;
-    const std::shared_ptr<Scope>& scope_;
-    const platform::DeviceContext& device_context_;
-  };
  virtual void Compute(const KernelContext& context) const = 0;
@@ -140,10 +208,10 @@ class OperatorWithKernel : public OperatorBase {
  using OpKernelMap =
      std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;
-  void Run(const std::shared_ptr<Scope>& scope,
+  void Run(const ScopePtr& scope,
           const platform::DeviceContext& dev_ctx) const final {
-    auto& opKernel = AllOpKernels().at(Type()).at(OpKernelKey(dev_ctx));
+    auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
-    opKernel->Compute(OpKernel::KernelContext(this, scope, dev_ctx));
+    opKernel->Compute(KernelContext(this, scope, dev_ctx));
  }
  static std::unordered_map<std::string /* op_type */, OpKernelMap>&
@@ -151,6 +219,7 @@ class OperatorWithKernel : public OperatorBase {
    static std::unordered_map<std::string, OpKernelMap> g_all_op_kernels;
    return g_all_op_kernels;
  }
  void InferShape(const std::shared_ptr<Scope>& scope) const final {
    std::vector<const Tensor*> ins;
    VarNamesToTensors(scope, inputs_, &ins);

--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -19,14 +19,17 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-class OperatorTest : public OperatorBase {
+static int op_run_num = 0;
+class OpWithoutKernelTest : public OperatorBase {
 public:
  void Init() override { x = 1; }
-  void InferShape(const std::shared_ptr<Scope>& scope) const override {}
+  void InferShape(const ScopePtr& scope) const override {}
-  void Run(const std::shared_ptr<Scope>& scope,
+  void Run(const ScopePtr& scope,
           const platform::DeviceContext& dev_ctx) const override {
-    float scale = GetAttr<float>("scale");
+    op_run_num++;
-    ASSERT_NEAR(scale, 3.14, 1e-5);
+    ASSERT_EQ((int)inputs_.size(), 1);
+    ASSERT_EQ((int)outputs_.size(), 1);
    ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr);
    ASSERT_EQ(x, 1);
    ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr);
@@ -36,12 +39,54 @@ class OperatorTest : public OperatorBase {
  float x = 0;
 };
-class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+class OpeWithoutKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 public:
-  OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+  OpeWithoutKernelTestProtoAndCheckerMaker(OpProto* proto,
+                                           OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("input", "input of test op");
    AddOutput("output", "output of test op");
+    AddAttr<float>("scale", "scale of cosine op");
+    AddComment("This is test op");
+  }
+};
+}  // namespace framework
+}  // namespace paddle
+REGISTER_OP(test_operator, paddle::framework::OpWithoutKernelTest,
+            paddle::framework::OpeWithoutKernelTestProtoAndCheckerMaker);
+TEST(OperatorBase, all) {
+  paddle::framework::OpDesc op_desc;
+  op_desc.set_type("test_operator");
+  *op_desc.mutable_inputs()->Add() = "IN1";
+  *op_desc.mutable_outputs()->Add() = "OUT1";
+  auto attr = op_desc.mutable_attrs()->Add();
+  attr->set_name("scale");
+  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_f(3.14);
+  paddle::platform::CPUDeviceContext device_context;
+  auto scope = std::make_shared<paddle::framework::Scope>();
+  paddle::framework::OperatorPtr op =
+      paddle::framework::OpRegistry::CreateOp(op_desc);
+  scope->CreateVariable("OUT1");
+  ASSERT_EQ(paddle::framework::op_run_num, 0);
+  op->Run(scope, device_context);
+  ASSERT_EQ(paddle::framework::op_run_num, 1);
+}
+namespace paddle {
+namespace framework {
+class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+ public:
+  OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("x", "input of test op");
+    AddOutput("y", "output of test op");
    AddAttr<float>("scale", "scale of cosine op")
        .SetDefault(1.0)
        .LargerThan(0.0);
@@ -49,19 +94,76 @@ class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
  }
 };
+static int cpu_kernel_run_num = 0;
 class OpWithKernelTest : public OperatorWithKernel {
 protected:
  void InferShape(const std::vector<const Tensor*>& inputs,
                  const std::vector<Tensor*>& outputs) const override {}
 };
+template <typename T1, typename T2>
 class CPUKernelTest : public OpKernel {
 public:
-  void Compute(const KernelContext& context) const {
+  void Compute(const KernelContext& ctx) const {
-    float scale = context.op_.GetAttr<float>("scale");
-    ASSERT_NEAR(scale, 3.14, 1e-5);
    std::cout << "this is cpu kernel" << std::endl;
-    std::cout << context.op_.DebugString() << std::endl;
+    std::cout << ctx.op_.DebugString() << std::endl;
+    cpu_kernel_run_num++;
+    ASSERT_EQ(ctx.op_.Input("x"), "IN1");
+    ASSERT_EQ(ctx.op_.Output("y"), "OUT1");
+  }
+};
+// multiple inputs test
+class OperatorMultiInputsTest : public OperatorBase {
+ public:
+  void Init() override { x = 1; }
+  void InferShape(const std::shared_ptr<Scope>& scope) const override {}
+  void Run(const std::shared_ptr<Scope>& scope,
+           const platform::DeviceContext& dev_ctx) const override {
+    ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr);
+    ASSERT_EQ(x, 1);
+    ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr);
+    ASSERT_EQ(Input("x"), "IN1");
+    ASSERT_EQ(Input("y"), "OUT1");
+  }
+ public:
+  float x = 0;
+};
+class OpKernelTestMultiInputsProtoAndCheckerMaker
+    : public OpProtoAndCheckerMaker {
+ public:
+  OpKernelTestMultiInputsProtoAndCheckerMaker(OpProto* proto,
+                                              OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInputs("xs", "inputs of test op");
+    AddInput("k", "input of test op");
+    AddOutputs("ys", "outputs of test op");
+    AddAttr<float>("scale", "scale of cosine op")
+        .SetDefault(1.0)
+        .LargerThan(0.0);
+    AddComment("This is test op");
+  }
+};
+class CPUKernalMultiInputsTest : public OpKernel {
+ public:
+  void Compute(const KernelContext& ctx) const {
+    auto xs = ctx.op_.Inputs("xs");
+    ASSERT_EQ(xs.size(), 3UL);
+    ASSERT_EQ(xs[0], "x0");
+    ASSERT_EQ(xs[1], "x1");
+    ASSERT_EQ(xs[2], "x2");
+    auto k = ctx.op_.Input("k");
+    ASSERT_EQ(k, "k0");
+    auto ys = ctx.op_.Outputs("ys");
+    ASSERT_EQ(ys.size(), 2UL);
+    ASSERT_EQ(ys[0], "y0");
+    ASSERT_EQ(ys[1], "y1");
  }
 };
@@ -70,12 +172,12 @@ class CPUKernelTest : public OpKernel {
 REGISTER_OP(op_with_kernel, paddle::framework::OpWithKernelTest,
            paddle::framework::OpKernelTestProtoAndCheckerMaker);
-REGISTER_OP_CPU_KERNEL(op_with_kernel, paddle::framework::CPUKernelTest);
+REGISTER_OP_CPU_KERNEL(op_with_kernel,
+                       paddle::framework::CPUKernelTest<float, float>);
+// test with single input
 TEST(OpKernel, all) {
-  using namespace paddle::framework;
+  paddle::framework::OpDesc op_desc;
-  OpDesc op_desc;
  op_desc.set_type("op_with_kernel");
  *op_desc.mutable_inputs()->Add() = "IN1";
  *op_desc.mutable_outputs()->Add() = "OUT1";
@@ -85,10 +187,55 @@ TEST(OpKernel, all) {
  attr->set_f(3.14);
  paddle::platform::CPUDeviceContext cpu_device_context;
-  auto scope = std::make_shared<Scope>();
+  auto scope = std::make_shared<paddle::framework::Scope>();
-  OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc);
+  paddle::framework::OperatorPtr op =
+      paddle::framework::OpRegistry::CreateOp(op_desc);
+  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0);
  op->Run(scope, cpu_device_context);
+  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
+}
-  delete op;
+REGISTER_OP(op_multi_inputs_with_kernel, paddle::framework::OpWithKernelTest,
+            paddle::framework::OpKernelTestMultiInputsProtoAndCheckerMaker);
+REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
+                       paddle::framework::CPUKernalMultiInputsTest);
+// test with multi inputs
+TEST(OpKernel, multi_inputs) {
+  using namespace paddle::framework;
+  OpDesc op_desc;
+  op_desc.set_type("op_multi_inputs_with_kernel");
+  *op_desc.mutable_inputs()->Add() = "x0";
+  *op_desc.mutable_inputs()->Add() = "x1";
+  *op_desc.mutable_inputs()->Add() = "x2";
+  *op_desc.mutable_inputs()->Add() = "k0";
+  *op_desc.mutable_outputs()->Add() = "y0";
+  *op_desc.mutable_outputs()->Add() = "y1";
+  auto attr = op_desc.mutable_attrs()->Add();
+  attr->set_name("scale");
+  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_f(3.14);
+  auto attr0 = op_desc.mutable_attrs()->Add();
+  attr0->set_name("input_format");
+  attr0->set_type(paddle::framework::AttrType::INTS);
+  auto input_format = attr0->mutable_ints();
+  input_format->Add(0);  // x0
+  input_format->Add(3);  // k
+  input_format->Add(4);  // end
+  auto attr1 = op_desc.mutable_attrs()->Add();
+  attr1->set_name("output_format");
+  attr1->set_type(paddle::framework::AttrType::INTS);
+  auto output_format = attr1->mutable_ints();
+  output_format->Add(0);  // y0
+  output_format->Add(2);  // y1
+  paddle::platform::CPUDeviceContext cpu_device_context;
+  auto scope = std::make_shared<Scope>();
+  OperatorPtr op(paddle::framework::OpRegistry::CreateOp(op_desc));
+  op->Run(scope, cpu_device_context);
 }
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -23,6 +23,9 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
+class Scope;
+using ScopePtr = std::shared_ptr<Scope>;
 /**
 * @brief Scope that manage all variables.
 *
@@ -41,7 +44,7 @@ class Scope {
  /**
   * @brief Initialize a Scope with parent.
   */
-  explicit Scope(const std::shared_ptr<Scope>& parent) : parent_(parent) {}
+  explicit Scope(const ScopePtr& parent) : parent_(parent) {}
  /**
   * @brief Create Variable
@@ -88,7 +91,7 @@ class Scope {
 private:
  std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
-  std::shared_ptr<Scope> parent_{nullptr};
+  ScopePtr parent_{nullptr};
 };
 }  // namespace framework

--- a/paddle/framework/tensor.cc
+++ b/paddle/framework/tensor.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <paddle/framework/tensor.h>
+namespace paddle {
+namespace framework {}
+}  // namespace paddle
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -15,57 +15,161 @@ limitations under the License. */
 #pragma once
 #include <cstdint>
+#include <cstring>
 #include <memory>
-#include <type_traits>
+#include <typeindex>
 #include "paddle/framework/ddim.h"
 #include "paddle/framework/enforce.h"
+#include "paddle/framework/tensor_types.h"
 #include "paddle/memory/memory.h"
 #include "paddle/platform/place.h"
+#include "unsupported/Eigen/CXX11/Tensor"
 namespace paddle {
+namespace pybind {
+namespace details {  // forward declare
+template <bool less, size_t i, typename... args>
+struct CastToPyBufferImpl;
+}  // namespace details
+}  // namespace pybind
 namespace framework {
 class Tensor {
 public:
  Tensor() : offset_(0) {}
-  explicit Tensor(const DDim& dims) : dims_(dims), offset_(0) {}
  template <typename T>
  const T* data() const {
-    PADDLE_ENFORCE(
+    CheckDims<T>();
-        holder_ != nullptr,
-        "Tenosr has not been initialized. Call Tensor::mutable_data first.");
    return reinterpret_cast<const T*>(
-        reinterpret_cast<uintptr_t>(holder_->Ptr()) + offset_);
+        reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
  }
-  template <typename T,  // must be POD types
+  template <typename T>
-            typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
+  T* raw_data() const {
-  T* mutable_data(DDim dims, paddle::platform::Place place) {
+    CheckDims<T>();
-    dims_ = dims;
+    return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
+                                offset_);
+  }
+  template <typename T>
+  T* mutable_data(DDim dims, platform::Place place) {
+    set_dims(dims);
+    return mutable_data<T>(place);
+  }
+  template <typename T>
+  T* mutable_data(platform::Place place) {
+    PADDLE_ENFORCE(product(dims_) > 0,
+                   "Tensor's numel must be larger than zero to call "
+                   "Tensor::mutable_data. Call Tensor::set_dim first.");
    if (holder_ == nullptr ||
-        !(holder_->Place() ==
+        !(holder_->place() ==
          place) /* some versions of boost::variant don't have operator!= */
-        || holder_->Size() < product(dims) * sizeof(T) + offset_) {
+        || holder_->size() < product(dims_) * sizeof(T) + offset_) {
-      holder_.reset(new PlaceholderImpl<T>(place, product(dims) * sizeof(T)));
+      if (platform::is_cpu_place(place)) {
+        holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
+            boost::get<platform::CPUPlace>(place), product(dims_) * sizeof(T)));
+      } else if (platform::is_gpu_place(place)) {
+#ifdef __CUDACC__
+        holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
+            boost::get<platform::GPUPlace>(place), product(dims_) * sizeof(T)));
+#else
+        PADDLE_ENFORCE(true, "'GPUPlace' is not supported in CPU only device.");
+#endif
+      } else {
+        PADDLE_ENFORCE(true, "Unknown 'place'.");
+      }
      offset_ = 0;
    }
-    return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->Ptr()) +
+    return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                offset_);
  }
+  template <typename T, size_t NDIMS>
+  typename TTypes<T, NDIMS>::Tensor shaped(DDim new_dims) {
+    Eigen::array<Eigen::DenseIndex, NDIMS> dims =
+        paddle::framework::ToEigenDSizes<NDIMS>(new_dims);
+    return typename TTypes<T, NDIMS>::Tensor(raw_data<T>(), dims);
+  }
+  template <typename T, size_t NDIMS>
+  typename TTypes<T, NDIMS>::Tensor tensor() {
+    return typename TTypes<T, NDIMS>::Tensor(
+        raw_data<T>(), paddle::framework::ToEigenDSizes<NDIMS>(dims_));
+  }
+  // flat to rank = 1
+  template <typename T>
+  typename TTypes<T>::Flat flat() {
+    return shaped<T, 1>(make_ddim({static_cast<int>(product(dims_))}));
+  }
+  // to TensorType Vec
+  template <typename T>
+  typename TTypes<T>::Vec vec() {
+    return tensor<T, 1>();
+  }
+  // to TensorType Matrix
+  template <typename T>
+  typename TTypes<T>::Matrix matrix() {
+    return tensor<T, 2>();
+  }
+  // const versions of all the methods above.
+  template <typename T, size_t NDIMS>
+  typename TTypes<T, NDIMS>::Tensor shaped(DDim new_dims) const {
+    Eigen::array<Eigen::DenseIndex, NDIMS> dims =
+        paddle::framework::ToEigenDSizes<NDIMS>(new_dims);
+    return typename TTypes<T, NDIMS>::Tensor(data<T>(), dims);
+  }
+  template <typename T, size_t NDIMS>
+  typename TTypes<T, NDIMS>::ConstantTensor tensor() const {
+    return typename TTypes<T, NDIMS>::Tensor(
+        data<T>(), paddle::framework::ToEigenDSizes<NDIMS>(dims_));
+  }
+  template <typename T>
+  typename TTypes<T>::ConstFlat flat() const {
+    return shaped<T, 1>(make_ddim({static_cast<int>(product(dims_))}));
+  }
+  template <typename T>
+  typename TTypes<T>::ConstVec vec() const {
+    return tensor<T, 1>();
+  }
+  template <typename T>
+  typename TTypes<T>::ConstMatrix matrix() const {
+    return tensor<T, 2>();
+  }
+  template <typename T>
  void ShareDataFrom(const Tensor& src) {
-    PADDLE_ENFORCE(src.holder_ != nullptr,
+    src.CheckDims<T>();
-                   "Can not share data from an uninitialized tensor.");
    holder_ = src.holder_;
-    dims_ = src.dims_;
+    set_dims(src.dims());
    offset_ = src.offset_;
  }
+  template <typename T>
+  void CopyFrom(const Tensor& src, platform::Place dst_place) {
+    PADDLE_ENFORCE(platform::is_cpu_place(src.holder_->place()) &&
+                       platform::is_cpu_place(dst_place),
+                   "Tensor::CopyFrom only support CPU now.");
+    src.CheckDims<T>();
+    size_t size = product(src.dims_) * sizeof(T);
+    set_dims(src.dims());
+    const void* src_ptr = static_cast<const void*>(src.data<T>());
+    void* dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
+    memcpy(dst_ptr, src_ptr, size);
+  }
+  template <typename T>
  Tensor Slice(const int& begin_idx, const int& end_idx) const {
-    PADDLE_ENFORCE(holder_ != nullptr,
+    CheckDims<T>();
-                   "The sliced tenosr has not been initialized.");
    PADDLE_ENFORCE(begin_idx >= 0 && end_idx <= dims_[0],
                   "Slice index is less than zero or out of bound.");
    PADDLE_ENFORCE(begin_idx < end_idx,
@@ -78,12 +182,20 @@ class Tensor {
    }
    Tensor dst;
    dst.holder_ = holder_;
-    dst.dims_ = dims_;
+    DDim dst_dims = dims_;
-    dst.dims_[0] = end_idx - begin_idx;
+    dst_dims[0] = end_idx - begin_idx;
-    dst.offset_ = offset_ + begin_idx * base * holder_->TypeSize();
+    dst.set_dims(dst_dims);
+    dst.offset_ = offset_ + begin_idx * base * sizeof(T);
    return dst;
  }
+  void set_dims(const DDim& dims) {
+    if (dims == dims_) {
+      return;
+    }
+    dims_ = dims;
+  }
  DDim dims() const { return dims_; }
 private:
@@ -91,46 +203,56 @@ class Tensor {
  // parameter of Variable.
  struct Placeholder {
    virtual ~Placeholder() {}
-    virtual void* Ptr() const = 0;
+    virtual void* ptr() const = 0;
-    virtual paddle::platform::Place Place() const = 0;
+    virtual platform::Place place() const = 0;
-    virtual size_t Size() const = 0;
+    virtual size_t size() const = 0;
-    virtual size_t TypeSize() const = 0;
+    virtual std::type_index type() const = 0;
  };
-  template <typename T>
+  template <typename T, typename PlaceType>
  struct PlaceholderImpl : public Placeholder {
   private:
+    template <typename PType>
    class Deleter {
     public:
-      Deleter(platform::Place place) : place_(place) {}
+      Deleter(PType place) : place_(place) {}
-      void operator()(T* ptr) {
+      void operator()(T* ptr) { memory::Free(place_, static_cast<void*>(ptr)); }
-        paddle::memory::Free(place_, static_cast<void*>(ptr));
-      }
     private:
-      paddle::platform::Place place_;
+      PType place_;
    };
   public:
-    PlaceholderImpl(paddle::platform::Place place, size_t size)
+    PlaceholderImpl(PlaceType place, size_t size)
-        : ptr_(static_cast<T*>(paddle::memory::Alloc(place, size)),
+        : ptr_(static_cast<T*>(memory::Alloc(place, size)),
-               Deleter(place)),
+               Deleter<PlaceType>(place)),
          place_(place),
          size_(size) {}
-    virtual void* Ptr() const { return static_cast<void*>(ptr_.get()); }
+    virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
-    virtual size_t Size() const { return size_; }
+    virtual size_t size() const { return size_; }
-    virtual paddle::platform::Place Place() const { return place_; }
+    virtual paddle::platform::Place place() const { return place_; }
-    virtual size_t TypeSize() const { return sizeof(T); }
+    virtual std::type_index type() const { return std::type_index(typeid(T)); }
-    std::unique_ptr<T, Deleter> ptr_;
+    std::unique_ptr<T, Deleter<PlaceType>> ptr_;
-    paddle::platform::Place place_;  // record the place of ptr_.
+    platform::Place place_;  // record the place of ptr_.
-    size_t size_;                    // size of the memory block.
+    size_t size_;            // size of the memory block.
  };
+  template <typename T>
+  inline void CheckDims() const {
+    PADDLE_ENFORCE(holder_ != nullptr,
+                   "Tenosr holds no memory. Call Tensor::mutable_data first.");
+    PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_,
+                   "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
+                   "first to re-allocate memory.");
+  }
  std::shared_ptr<Placeholder> holder_;  // holds the memory block if allocated.
  DDim dims_;
  size_t offset_;  // marks the begin of tensor data area.
+  template <bool less, size_t i, typename... args>
+  friend struct paddle::pybind::details::CastToPyBufferImpl;
 };
 }  // namespace framework

--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -18,7 +18,8 @@
 TEST(Tensor, Dims) {
  using namespace paddle::framework;
  using namespace paddle::platform;
-  Tensor tt(make_ddim({2, 3, 4}));
+  Tensor tt;
+  tt.set_dims(make_ddim({2, 3, 4}));
  DDim dims = tt.dims();
  ASSERT_EQ(arity(dims), 3);
  for (int i = 0; i < 3; ++i) {
@@ -35,7 +36,7 @@ TEST(Tensor, DataAssert) {
  } catch (paddle::framework::EnforceNotMet err) {
    caught = true;
    std::string msg =
-        "Tenosr has not been initialized. Call Tensor::mutable_data first.";
+        "Tenosr holds no memory. Call Tensor::mutable_data first.";
    const char* what = err.what();
    for (size_t i = 0; i < msg.length(); ++i) {
      ASSERT_EQ(what[i], msg[i]);
@@ -46,7 +47,7 @@ TEST(Tensor, DataAssert) {
 /* following tests are not available at present
   because Memory::Alloc() and Memory::Free() have not been ready.
+*/
 TEST(Tensor, MutableData) {
  using namespace paddle::framework;
  using namespace paddle::platform;
@@ -71,7 +72,7 @@ TEST(Tensor, MutableData) {
    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CPUPlace());
    EXPECT_EQ(p1, p2);
  }
+#ifdef __CUDACC__
  {
    Tensor src_tensor;
    float* p1 = nullptr;
@@ -93,6 +94,7 @@ TEST(Tensor, MutableData) {
    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), GPUPlace());
    EXPECT_EQ(p1, p2);
  }
+#endif
 }
 TEST(Tensor, ShareDataFrom) {
@@ -104,10 +106,11 @@ TEST(Tensor, ShareDataFrom) {
    // Try to share data form uninitialized tensor
    bool caught = false;
    try {
-      dst_tensor.ShareDataFrom(src_tensor);
+      dst_tensor.ShareDataFrom<float>(src_tensor);
    } catch (EnforceNotMet err) {
      caught = true;
-      std::string msg = "Can not share data from an uninitialized tensor.";
+      std::string msg =
+          "Tenosr holds no memory. Call Tensor::mutable_data first.";
      const char* what = err.what();
      for (size_t i = 0; i < msg.length(); ++i) {
        ASSERT_EQ(what[i], msg[i]);
@@ -116,17 +119,19 @@ TEST(Tensor, ShareDataFrom) {
    ASSERT_TRUE(caught);
    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CPUPlace());
-    dst_tensor.ShareDataFrom(src_tensor);
+    dst_tensor.ShareDataFrom<int>(src_tensor);
    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
  }
+#ifdef __CUDACC__
  {
    Tensor src_tensor;
    Tensor dst_tensor;
    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace());
-    dst_tensor.ShareDataFrom(src_tensor);
+    dst_tensor.ShareDataFrom<int>(src_tensor);
    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
  }
+#endif
 }
 TEST(Tensor, Slice) {
@@ -135,7 +140,7 @@ TEST(Tensor, Slice) {
  {
    Tensor src_tensor;
    src_tensor.mutable_data<int>(make_ddim({5, 3, 4}), CPUPlace());
-    Tensor slice_tensor = src_tensor.Slice(1, 3);
+    Tensor slice_tensor = src_tensor.Slice<int>(1, 3);
    DDim slice_dims = slice_tensor.dims();
    ASSERT_EQ(arity(slice_dims), 3);
    EXPECT_EQ(slice_dims[0], 2);
@@ -155,10 +160,11 @@ TEST(Tensor, Slice) {
    EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
  }
+#ifdef __CUDACC__
  {
    Tensor src_tensor;
    src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
-    Tensor slice_tensor = src_tensor.Slice(2, 6);
+    Tensor slice_tensor = src_tensor.Slice<double>(2, 6);
    DDim slice_dims = slice_tensor.dims();
    ASSERT_EQ(arity(slice_dims), 2);
    EXPECT_EQ(slice_dims[0], 4);
@@ -176,6 +182,31 @@ TEST(Tensor, Slice) {
    EXPECT_EQ(slice_data_address, slice_mutable_data_address);
    EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
  }
+#endif
 }
-*/
+TEST(Tensor, CopyFrom) {
\ No newline at end of file
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  Tensor src_tensor;
+  int* src_ptr = src_tensor.mutable_data<int>(make_ddim({3, 3}), CPUPlace());
+  int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+  memcpy(src_ptr, arr, 9 * sizeof(int));
+  Tensor dst_tensor;
+  dst_tensor.CopyFrom<int>(src_tensor, CPUPlace());
+  const int* dst_ptr = dst_tensor.data<int>();
+  ASSERT_NE(src_ptr, dst_ptr);
+  for (size_t i = 0; i < 9; ++i) {
+    EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+  }
+  Tensor slice_tensor = src_tensor.Slice<int>(1, 2);
+  dst_tensor.CopyFrom<int>(slice_tensor, CPUPlace());
+  const int* slice_ptr = slice_tensor.data<int>();
+  dst_ptr = dst_tensor.data<int>();
+  ASSERT_NE(dst_ptr, slice_ptr);
+  for (size_t i = 0; i < 3; ++i) {
+    EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
+  }
+}
--- a/paddle/framework/tensor_types.h
+++ b/paddle/framework/tensor_types.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "unsupported/Eigen/CXX11/Tensor"
+namespace paddle {
+namespace framework {
+// Helper to define Tensor types given that the scalar is of type T.
+template <typename T, int NDIMS = 1, typename IndexType = Eigen::DenseIndex>
+struct TTypes {
+  // Rank-<NDIMS> tensor of scalar type T.
+  typedef Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType>,
+                           Eigen::Aligned>
+      Tensor;
+  typedef Eigen::TensorMap<
+      Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType>, Eigen::Aligned>
+      ConstTensor;
+  // Scalar tensor (implemented as a rank-0 tensor) of scalar type T.
+  typedef Eigen::TensorMap<
+      Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>,
+      Eigen::Aligned>
+      Scalar;
+  typedef Eigen::TensorMap<Eigen::TensorFixedSize<const T, Eigen::Sizes<>,
+                                                  Eigen::RowMajor, IndexType>,
+                           Eigen::Aligned>
+      ConstScalar;
+  // Rank-1 tensor (vector) of scalar type T.
+  typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>,
+                           Eigen::Aligned>
+      Flat;
+  typedef Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned>
+      ConstFlat;
+  typedef Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>,
+                           Eigen::Aligned>
+      Vec;
+  typedef Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned>
+      ConstVec;
+  // Rank-2 tensor (matrix) of scalar type T.
+  typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType>,
+                           Eigen::Aligned>
+      Matrix;
+  typedef Eigen::TensorMap<
+      Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType>, Eigen::Aligned>
+      ConstMatrix;
+};
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -11,7 +11,6 @@ if(WITH_GPU)
 endif()
 if(USE_NNPACK)
-  include(nnpack/nnpack.cmake)
  list(APPEND cpp_files nnpack/NNPACKConvOp.cpp)
  if(WITH_TESTING)
    add_unittest(NNPACKConvOpTest nnpack/NNPACKConvOpTest.cpp)

--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -117,8 +117,7 @@ public:
    ConvFunctionBase::init(config);
  }
-  virtual void check(const BufferArgs& inputs,
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-                     const BufferArgs& outputs) override {
    const TensorShape& input = inputs[0].shape();
    const TensorShape& filter = inputs[1].shape();
    const TensorShape& output = outputs[0].shape();
@@ -217,8 +216,7 @@ public:
    ConvFunctionBase::init(config);
  }
-  virtual void check(const BufferArgs& inputs,
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-                     const BufferArgs& outputs) override {
    const TensorShape& output = inputs[0].shape();
    const TensorShape& filter = inputs[1].shape();
    const TensorShape& input = outputs[0].shape();
@@ -311,8 +309,7 @@ public:
    ConvFunctionBase::init(config);
  }
-  virtual void check(const BufferArgs& inputs,
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-                     const BufferArgs& outputs) override {
    const TensorShape& output = inputs[0].shape();
    const TensorShape& input = inputs[1].shape();
    const TensorShape& filter = outputs[0].shape();

--- a/paddle/function/NaiveConvOp.cpp
+++ b/paddle/function/NaiveConvOp.cpp
@@ -90,8 +90,7 @@ public:
    ConvFunctionBase::init(config);
  }
-  virtual void check(const BufferArgs& inputs,
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-                     const BufferArgs& outputs) override {
    const TensorShape& input = inputs[0].shape();
    const TensorShape& filter = inputs[1].shape();
    const TensorShape& output = outputs[0].shape();

--- a/paddle/function/RowConvOpGpu.cu
+++ b/paddle/function/RowConvOpGpu.cu
@@ -32,7 +32,7 @@ __global__ void KeRowConv(real* y, const real* x,  const real* w,
  for (int i = tidy; i < context; i += blky) {
    sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0;
  }
  __syncthreads();
  for (int i = 0; i < numSeq; ++i) {
@@ -144,12 +144,15 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
      int yoff = start + j;
      // transpose
-      sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
+      sh_x[tidx][tidy] = (xoff < width && yoff < end) ?
-      sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0;
+      x[yoff * width + xoff] : 0.0;
+      sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ?
+      dy[yoff * width + xoff] : 0.0;
      __syncthreads();
      if (tidy < (context - 1)) {
        yoff = yoff - context + 1;
-        sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0;
+        sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ?
+        dy[yoff * width + xoff] : 0.0;
      }
      __syncthreads();
@@ -199,11 +202,13 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
      int yoff = start + j;
      // transpose
-      sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
+      sh_x[tidx][tidy] = (xoff < width && yoff < end) ?
+      x[yoff * width + xoff] : 0.0;
      __syncthreads();
      for (int t = 0; t < context; t++) {
-        sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start && yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0;
+        sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start &&
+        yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0;
        __syncthreads();
        real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx];
@@ -239,7 +244,7 @@ __global__ void KeRowConvBwData(real* dx, const real* w, const real* dy,
  for (int i = tidy; i < context; i += blky) {
    sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0;
  }
  __syncthreads();
  for (int i = 0; i < numSeq; ++i) {
@@ -312,7 +317,7 @@ void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
    dim3 dimBlock(32, 32);
    dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
    real* dw = filterG.getData();
-    if (contextLength <= 32) { 
+    if (contextLength <= 32) {
      KeRowConvBwWeight<32, 32, 32>
        <<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
        (dw, x, dy, starts, height, width, numSeq, contextLength);

--- a/paddle/function/nnpack/NNPACKConvOp.cpp
+++ b/paddle/function/nnpack/NNPACKConvOp.cpp
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/function/ConvOp.h"
 DEFINE_bool(nnpack_allocate_outside,
-            false,
+            true,
            "Allocate and free workspace memory outside the NNPACK interface.");
 DEFINE_int32(nnpack_num_threads,
             0,
@@ -58,18 +58,10 @@ public:
    workspaceBuffer_ = nullptr;
    workspaceSize_ = 0;
-    threadpool_ = nullptr;
+    create_nnpack_threadpool();
-    if (FLAGS_nnpack_num_threads) {
-      threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads);
-      VLOG(3) << "Number of threads "
-              << pthreadpool_get_threads_count(threadpool_);
-    }
  }
  ~NNPACKConvFunction() {
-    if (threadpool_) {
-      pthreadpool_destroy(threadpool_);
-    }
    if (workspaceBuffer_) {
      free(workspaceBuffer_);
    }
@@ -225,14 +217,25 @@ public:
    }
  }
+  static void create_nnpack_threadpool() {
+    if (FLAGS_nnpack_num_threads && threadpool_ == nullptr) {
+      threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads);
+      VLOG(3) << "Number of threads "
+              << pthreadpool_get_threads_count(threadpool_);
+    }
+  }
 private:
  nnp_convolution_algorithm algorithm_;
  nnp_convolution_transform_strategy transform_strategy_;
  void* workspaceBuffer_;
  size_t workspaceSize_;
-  pthreadpool_t threadpool_;
+  static pthreadpool_t threadpool_;
 };
+template <DeviceType Device>
+pthreadpool_t NNPACKConvFunction<Device>::threadpool_ = nullptr;
 REGISTER_TYPED_FUNC(NNPACKConv, CPU, NNPACKConvFunction);
 }  // namespace paddle
--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
@@ -205,10 +205,8 @@ public:
      hl_destroy_event(hlEvent_);
      hlEvent_ = NULL;
    }
-    if (batchData_) {
+    delete batchData_;
-      delete batchData_;
+    batchData_ = NULL;
-      batchData_ = NULL;
-    }
  }
  void setDataBatch(DataBatch* batchData) { batchData_ = batchData; }

--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -403,7 +403,7 @@ public:
      : layerName_(layerName) {
    addEvaluator(std::move(evaluator));
  }
-  virtual void eval(const NeuralNetwork& nn) override {
+  void eval(const NeuralNetwork& nn) override {
    const LayerPtr& layer = nn.getLayer(layerName_);
    CHECK(layer) << "Nonexisted layer: " << layerName_ << " in submodel "
                 << nn.getName();

--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -636,7 +636,7 @@ void lenToStarts(std::vector<int>& starts) {
  }
  starts.back() = pos;
 }
-}
+}  // namespace
 void RecurrentGradientMachine::calcSequenceStartPositions() {
  std::vector<int> starts(commonSeqInfo_.size() + 1);

--- a/paddle/gserver/layers/AgentLayer.cpp
+++ b/paddle/gserver/layers/AgentLayer.cpp
@@ -124,7 +124,7 @@ void copyElements(const IVector& srcVec,
    dest[index[i]] = src[i];
  }
 }
-}
+}  // namespace
 void GatherAgentLayer::forwardIds(PassType passType) {
  IVectorPtr realId = realLayers_[0]->getOutputLabel();

--- a/paddle/math/Storage.cpp
+++ b/paddle/math/Storage.cpp
@@ -32,9 +32,7 @@ static InitFunction __init_storage_engine([]() { StorageEngine::singleton(); },
 StorageEngine::StorageEngine() : cpuAllocator_(nullptr) {}
 StorageEngine::~StorageEngine() {
-  if (cpuAllocator_) {
+  delete cpuAllocator_;
-    delete cpuAllocator_;
-  }
  for (auto it : gpuAllocator_) {
    delete it;
  }

--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
 add_subdirectory(detail)
+cc_library(memory SRCS memory.cc)
+cc_library(paddle_memory
+    DEPS
+    memory meta_data
+    meta_cache memory_block
+    buddy_allocator system_allocator)
+cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory)
--- a/paddle/memory/detail/CMakeLists.txt
+++ b/paddle/memory/detail/CMakeLists.txt
 if(${WITH_GPU})
-  nv_library(system_allocator SRCS system_allocator.cc DEPS gflags)
+  nv_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info gpu_info)
-  nv_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
 else(${WITH_GPU})
-  cc_library(system_allocator SRCS system_allocator.cc DEPS gflags)
+  cc_library(system_allocator SRCS system_allocator.cc DEPS gflags cpu_info)
-  cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator gflags)
 endif(${WITH_GPU})
+cc_test(system_allocator_test SRCS system_allocator_test.cc DEPS system_allocator)
+cc_library(meta_data SRCS meta_data.cc)
+cc_library(meta_cache SRCS meta_cache.cc)
+cc_library(memory_block SRCS memory_block.cc)
+cc_library(buddy_allocator SRCS buddy_allocator.cc DEPS glog)
--- a/paddle/memory/detail/buddy_allocator.cc
+++ b/paddle/memory/detail/buddy_allocator.cc
@@ -12,22 +12,317 @@
   See the License for the specific language governing permissions and
   limitations under the License. */
-#pragma once
 #include "paddle/memory/detail/buddy_allocator.h"
+#include "glog/logging.h"
 namespace paddle {
 namespace memory {
 namespace detail {
-BuddyAllocator::BuddyAllocator(size_t pool_size, size_t max_pools,
+BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator,
-                               SystemAllocator* system_allocator)
+                               size_t min_chunk_size, size_t max_chunk_size)
-    : pool_size_(pool_size),
+    : min_chunk_size_(min_chunk_size),
-      max_pools_(max_pools),
+      max_chunk_size_(max_chunk_size),
-      system_allocator_(system_allocator) {
+      cache_(system_allocator->UseGpu()),
-  PADDLE_ASSERT(pool_size > 0);
+      system_allocator_(std::move(system_allocator)) {}
-  PADDLE_ASSERT(max_pools > 0);
-  PADDLE_ASSERT(system_allocator != nullptr);
+BuddyAllocator::~BuddyAllocator() {
+  DLOG(INFO) << "BuddyAllocator Disconstructor makes sure that all of these "
+                "have actually been freed";
+  while (!pool_.empty()) {
+    auto block = static_cast<MemoryBlock*>(std::get<2>(*pool_.begin()));
+    DLOG(INFO) << "Free from block (" << block << ", " << max_chunk_size_
+               << ")";
+    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    cache_.invalidate(block);
+    pool_.erase(pool_.begin());
+  }
+}
+inline size_t align(size_t size, size_t alignment) {
+  size_t remaining = size % alignment;
+  return remaining == 0 ? size : size + (alignment - remaining);
+}
+void* BuddyAllocator::Alloc(size_t unaligned_size) {
+  // adjust allocation alignment
+  size_t size = align(unaligned_size + sizeof(Metadata), min_chunk_size_);
+  // acquire the allocator lock
+  std::lock_guard<std::mutex> lock(mutex_);
+  DLOG(INFO) << "Allocate " << unaligned_size << " bytes from chunk size "
+             << size;
+  // if the allocation is huge, send directly to the system allocator
+  if (size > max_chunk_size_) {
+    DLOG(INFO) << "Allocate from system allocator.";
+    return SystemAlloc(size);
+  }
+  // query and allocate from the existing chunk
+  auto it = FindExistChunk(size);
+  // refill the pool if failure
+  if (it == pool_.end()) {
+    it = RefillPool();
+    // if still failure, fail fatally
+    if (it == pool_.end()) {
+      return nullptr;
+    }
+  } else {
+    DLOG(INFO) << "Allocation from existing memory block " << std::get<2>(*it)
+               << " at address "
+               << reinterpret_cast<MemoryBlock*>(std::get<2>(*it))->data();
+  }
+  total_used_ += size;
+  total_free_ -= size;
+  // split the allocation and return data for use
+  return reinterpret_cast<MemoryBlock*>(SplitToAlloc(it, size))->data();
+}
+void BuddyAllocator::Free(void* p) {
+  // Point back to metadata
+  auto block = static_cast<MemoryBlock*>(p)->metadata();
+  // Acquire the allocator lock
+  std::lock_guard<std::mutex> lock(mutex_);
+  DLOG(INFO) << "Free from address " << block;
+  if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) {
+    DLOG(INFO) << "Free directly from system allocator";
+    system_allocator_->Free(block, block->total_size(cache_),
+                            block->index(cache_));
+    // Invalidate GPU allocation from cache
+    cache_.invalidate(block);
+    return;
+  }
+  block->mark_as_free(cache_);
+  total_used_ -= block->total_size(cache_);
+  total_free_ += block->total_size(cache_);
+  // Trying to merge the right buddy
+  if (block->has_right_buddy(cache_)) {
+    DLOG(INFO) << "Merging this block " << block << " with its right buddy "
+               << block->right_buddy(cache_);
+    auto right_buddy = block->right_buddy(cache_);
+    if (right_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) {
+      // Take away right buddy from pool
+      pool_.erase(IndexSizeAddress(right_buddy->index(cache_),
+                                   right_buddy->total_size(cache_),
+                                   right_buddy));
+      // merge its right buddy to the block
+      block->merge(cache_, right_buddy);
+    }
+  }
+  // Trying to merge the left buddy
+  if (block->has_left_buddy(cache_)) {
+    DLOG(INFO) << "Merging this block " << block << " with its left buddy "
+               << block->left_buddy(cache_);
+    auto left_buddy = block->left_buddy(cache_);
+    if (left_buddy->type(cache_) == MemoryBlock::FREE_CHUNK) {
+      // Take away right buddy from pool
+      pool_.erase(IndexSizeAddress(left_buddy->index(cache_),
+                                   left_buddy->total_size(cache_), left_buddy));
+      // merge the block to its left buddy
+      left_buddy->merge(cache_, block);
+      block = left_buddy;
+    }
+  }
+  // Dumping this block into pool
+  DLOG(INFO) << "Inserting free block (" << block << ", "
+             << block->total_size(cache_) << ")";
+  pool_.insert(
+      IndexSizeAddress(block->index(cache_), block->total_size(cache_), block));
+  // Clean up if existing too much free memory
+  // Prefer freeing fallback allocation first
+  CleanIdleFallBackAlloc();
+  // Free normal allocation
+  CleanIdleNormalAlloc();
+}
+size_t BuddyAllocator::Used() { return total_used_; }
+void* BuddyAllocator::SystemAlloc(size_t size) {
+  size_t index = 0;
+  void* p = system_allocator_->Alloc(index, size);
+  DLOG(INFO) << "Allocated " << p << " from system allocator.";
+  if (p == nullptr) return nullptr;
+  static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::HUGE_CHUNK, index,
+                                     size, nullptr, nullptr);
+  return static_cast<MemoryBlock*>(p)->data();
+}
+BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() {
+#ifndef PADDLE_ONLY_CPU
+  if (system_allocator_->UseGpu()) {
+    if ((total_used_ + total_free_) == 0) {
+      // Compute the maximum allocation size for the first allocation.
+      max_chunk_size_ = platform::GpuMaxChunkSize();
+    }
+  }
+#endif  // PADDLE_ONLY_CPU
+  // Allocate a new maximum sized block
+  size_t index = 0;
+  void* p = system_allocator_->Alloc(index, max_chunk_size_);
+  if (p == nullptr) return pool_.end();
+  DLOG(INFO) << "Creating and inserting new block " << p
+             << " from system allocator";
+  static_cast<MemoryBlock*>(p)->init(cache_, MemoryBlock::FREE_CHUNK, index,
+                                     max_chunk_size_, nullptr, nullptr);
+  // gpu fallback allocation
+  if (system_allocator_->UseGpu() &&
+      static_cast<MemoryBlock*>(p)->index(cache_) == 1) {
+    fallback_alloc_count_++;
+  }
+  total_free_ += max_chunk_size_;
+  // dump the block into pool
+  return pool_.insert(IndexSizeAddress(index, max_chunk_size_, p)).first;
+}
+BuddyAllocator::PoolSet::iterator BuddyAllocator::FindExistChunk(size_t size) {
+  size_t index = 0;
+  while (1) {
+    auto it = pool_.lower_bound(IndexSizeAddress(index, size, nullptr));
+    // no match chunk memory
+    if (it == pool_.end()) return it;
+    if (std::get<0>(*it) > index) {
+      // find suitable one
+      if (std::get<1>(*it) >= size) {
+        return it;
+      }
+      // update and continue
+      index = std::get<0>(*it);
+      continue;
+    }
+    return it;
+  }
+}
+void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it,
+                                   size_t size) {
+  auto block = static_cast<MemoryBlock*>(std::get<2>(*it));
+  pool_.erase(it);
+  DLOG(INFO) << "Split block (" << block << ", " << block->total_size(cache_)
+             << ") into";
+  block->split(cache_, size);
+  DLOG(INFO) << "Left block (" << block << ", " << block->total_size(cache_)
+             << ")";
+  block->set_type(cache_, MemoryBlock::ARENA_CHUNK);
+  // the rest of memory if exist
+  if (block->has_right_buddy(cache_)) {
+    if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) {
+      DLOG(INFO) << "Insert right block (" << block->right_buddy(cache_) << ", "
+                 << block->right_buddy(cache_)->total_size(cache_) << ")";
+      pool_.insert(
+          IndexSizeAddress(block->right_buddy(cache_)->index(cache_),
+                           block->right_buddy(cache_)->total_size(cache_),
+                           block->right_buddy(cache_)));
+    }
+  }
+  return block;
+}
+void BuddyAllocator::CleanIdleFallBackAlloc() {
+  // If fallback allocation does not exist, return directly
+  if (!fallback_alloc_count_) return;
+  for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
+    // If free memory block less than max_chunk_size_, return directly
+    if (std::get<1>(*pool) < max_chunk_size_) return;
+    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
+    // If no GPU fallback allocator, return
+    if (!system_allocator_->UseGpu() || block->index(cache_) == 0) {
+      return;
+    }
+    DLOG(INFO) << "Return block " << block << " to fallback allocator.";
+    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    cache_.invalidate(block);
+    pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
+    total_free_ -= max_chunk_size_;
+    fallback_alloc_count_--;
+    // If no fall allocation exists, return directly
+    if (!fallback_alloc_count_) return;
+  }
+}
+void BuddyAllocator::CleanIdleNormalAlloc() {
+  auto shall_free_alloc = [&]() -> bool {
+    // free all fallback allocations
+    if (fallback_alloc_count_ > 0) {
+      return true;
+    }
+    // keep 2x overhead if we haven't fallen back
+    if ((total_used_ + max_chunk_size_) * 2 < total_free_) {
+      return true;
+    }
+    return false;
+  };
+  if (!shall_free_alloc()) return;
+  for (auto pool = pool_.rbegin(); pool != pool_.rend();) {
+    // If free memory block less than max_chunk_size_, return directly
+    if (std::get<1>(*pool) < max_chunk_size_) return;
+    MemoryBlock* block = static_cast<MemoryBlock*>(std::get<2>(*pool));
+    DLOG(INFO) << "Return block " << block << " to base allocator.";
+    system_allocator_->Free(block, max_chunk_size_, block->index(cache_));
+    cache_.invalidate(block);
+    pool = PoolSet::reverse_iterator(pool_.erase(std::next(pool).base()));
+    total_free_ -= max_chunk_size_;
+    if (!shall_free_alloc()) return;
+  }
 }
 }  // namespace detail

--- a/paddle/memory/detail/buddy_allocator.h
+++ b/paddle/memory/detail/buddy_allocator.h
@@ -14,9 +14,16 @@
 #pragma once
+#include "paddle/memory/detail/meta_cache.h"
+#include "paddle/memory/detail/meta_data.h"
 #include "paddle/memory/detail/system_allocator.h"
+#include "paddle/platform/assert.h"
+#include "paddle/platform/cpu_info.h"
+#include "paddle/platform/gpu_info.h"
 #include <mutex>
+#include <set>
+#include <unordered_map>
 #include <vector>
 namespace paddle {
@@ -25,61 +32,80 @@ namespace detail {
 class BuddyAllocator {
 public:
-  BuddyAllocator(size_t pool_size, size_t max_pools,
+  BuddyAllocator(SystemAllocator* system_allocator, size_t min_chunk_size,
-                 SystemAllocator* system_allocator);
+                 size_t max_chunk_size);
  ~BuddyAllocator();
-  void* Alloc(size_t size);
+ public:
+  void* Alloc(size_t unaligned_size);
  void Free(void*);
  size_t Used();
+ public:
+  // Disable copy and assignment
+  BuddyAllocator(const BuddyAllocator&) = delete;
+  BuddyAllocator& operator=(const BuddyAllocator&) = delete;
 private:
-  struct Block {
+  // Tuple (allocator index, memory size, memory address)
-    size_t size_;
+  using IndexSizeAddress = std::tuple<size_t, size_t, void*>;
-    Block* left_;   // left buddy
+  // Each element in PoolSet is a free allocation
-    Block* right_;  // right buddy
+  using PoolSet = std::set<IndexSizeAddress>;
-  };
-  // Initially, there is only one pool.  If a Alloc founds not enough
+  /*! \brief Allocate fixed-size memory from system */
-  // memory from that pool, and there has not been max_num_pools_,
+  void* SystemAlloc(size_t size);
-  // create a new pool by calling system_allocator_.Alloc(pool_size_).
-  std::vector<void*> pools_;
-  size_t pool_size_;      // the size of each pool;
+  /*! \brief If existing chunks are not suitable, refill pool */
-  size_t max_num_pools_;  // the size of all pools;
+  PoolSet::iterator RefillPool();
-  SystemAllocator* system_allocator_;
+  /**
+   *  \brief   Find the suitable chunk from existing pool and split
+   *           it to left and right buddies
+   *
+   *  \param   it     the iterator of pool list
+   *  \param   size   the size of allocation
+   *
+   *  \return  the left buddy address
+   */
+  void* SplitToAlloc(PoolSet::iterator it, size_t size);
-  std::mutex mutex_;
+  /*! \brief Find the existing chunk which used to allocation */
+  PoolSet::iterator FindExistChunk(size_t size);
-  // Disable copy and assignment.
+  /*! \brief Clean idle fallback allocation */
-  BuddyAllocator(const BuddyAllocator&) = delete;
+  void CleanIdleFallBackAlloc();
-  BuddyAllocator& operator=(const BuddyAllocator&) = delete;
-};
+  /*! \brief Clean idle normal allocation */
+  void CleanIdleNormalAlloc();
-BuddyAllocator<CPUAllocator>* GetCPUBuddyAllocator() {
+ private:
-  static BuddyAllocator<CPUAllocator>* a = nullptr;
+  size_t total_used_ = 0;  // the total size of used memory
-  if (a == nullptr) {
+  size_t total_free_ = 0;  // the total size of free memory
-    a = new BuddyAllocator<CPUAllocator>();
-  }
+  size_t min_chunk_size_;  // the minimum size of each chunk
-  return a;
+  size_t max_chunk_size_;  // the maximum size of each chunk
-}
+ private:
-#ifndef PADDLE_ONLY_CPU  // The following code are for CUDA.
+  /**
+   * \brief A list of free allocation
-BuddyAllocator<GPUAllocator>* GetGPUBuddyAllocator(int gpu_id) {
+   *
-  static BuddyAllocator<GPUAllocator>** as = NULL;
+   * \note  Only store free chunk memory in pool
-  if (as == NULL) {
+   */
-    int gpu_num = platform::GetDeviceCount();
+  PoolSet pool_;
-    as = new BuddyAllocator<GPUAllocator>*[gpu_num];
-    for (int gpu = 0; gpu < gpu_num; gpu++) {
+  /*! Record fallback allocation count for auto-scaling */
-      as[gpu] = new BuddyAllocator<GPUAllocator>();
+  size_t fallback_alloc_count_ = 0;
-    }
-  }
+ private:
-  return as[gpu_id];
+  /*! Unify the metadata format between GPU and CPU allocations */
-}
+  MetadataCache cache_;
-#endif  // PADDLE_ONLY_CPU
+ private:
+  /*! Allocate CPU/GPU memory from system */
+  SystemAllocator* system_allocator_;
+  std::mutex mutex_;
+};
 }  // namespace detail
 }  // namespace memory

--- a/paddle/memory/detail/memory_block.cc
+++ b/paddle/memory/detail/memory_block.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/memory/detail/memory_block.h"
+#include "paddle/memory/detail/meta_cache.h"
+#include "paddle/memory/detail/meta_data.h"
+#include "paddle/platform/assert.h"
+namespace paddle {
+namespace memory {
+namespace detail {
+void MemoryBlock::init(MetadataCache& cache, Type t, size_t index, size_t size,
+                       void* left_buddy, void* right_buddy) {
+  cache.store(this, Metadata(t, index, size - sizeof(Metadata), size,
+                             static_cast<MemoryBlock*>(left_buddy),
+                             static_cast<MemoryBlock*>(right_buddy)));
+}
+MemoryBlock::Type MemoryBlock::type(MetadataCache& cache) const {
+  return cache.load(this).type;
+}
+size_t MemoryBlock::size(MetadataCache& cache) const {
+  return cache.load(this).size;
+}
+size_t MemoryBlock::total_size(MetadataCache& cache) const {
+  return cache.load(this).total_size;
+}
+MemoryBlock* MemoryBlock::left_buddy(MetadataCache& cache) const {
+  return cache.load(this).left_buddy;
+}
+MemoryBlock* MemoryBlock::right_buddy(MetadataCache& cache) const {
+  return cache.load(this).right_buddy;
+}
+void MemoryBlock::split(MetadataCache& cache, size_t size) {
+  // make sure the split fits
+  PADDLE_ASSERT(total_size(cache) >= size);
+  // bail out if there is no room for another partition
+  if (total_size(cache) - size <= sizeof(Metadata)) {
+    return;
+  }
+  // find the position of the split
+  void* right_partition = reinterpret_cast<uint8_t*>(this) + size;
+  size_t remaining_size = total_size(cache) - size;
+  // Add the new block as a buddy
+  auto metadata = cache.load(this);
+  // Write the metadata for the new block
+  auto new_block_right_buddy = metadata.right_buddy;
+  cache.store(
+      static_cast<MemoryBlock*>(right_partition),
+      Metadata(FREE_CHUNK, index(cache), remaining_size - sizeof(Metadata),
+               remaining_size, this, new_block_right_buddy));
+  metadata.right_buddy = static_cast<MemoryBlock*>(right_partition);
+  metadata.size = size - sizeof(Metadata);
+  metadata.total_size = size;
+  cache.store(this, metadata);
+  // Write metadata for the new block's right buddy
+  if (new_block_right_buddy != nullptr) {
+    auto buddy_metadata = cache.load(new_block_right_buddy);
+    buddy_metadata.left_buddy = static_cast<MemoryBlock*>(right_partition);
+    cache.store(new_block_right_buddy, buddy_metadata);
+  }
+}
+void MemoryBlock::merge(MetadataCache& cache, MemoryBlock* right_buddy) {
+  // only free blocks can be merged
+  PADDLE_ASSERT(type(cache) == FREE_CHUNK);
+  PADDLE_ASSERT(right_buddy->type(cache) == FREE_CHUNK);
+  auto metadata = cache.load(this);
+  // link this->buddy's buddy
+  metadata.right_buddy = right_buddy->right_buddy(cache);
+  // link buddy's buddy -> this
+  if (metadata.right_buddy != nullptr) {
+    auto buddy_metadata = cache.load(metadata.right_buddy);
+    buddy_metadata.left_buddy = this;
+    cache.store(metadata.right_buddy, buddy_metadata);
+  }
+  metadata.size += right_buddy->total_size(cache);
+  metadata.total_size += right_buddy->total_size(cache);
+  cache.store(this, metadata);
+  cache.store(right_buddy, Metadata(INVALID_CHUNK, 0, 0, 0, nullptr, nullptr));
+}
+void MemoryBlock::mark_as_free(MetadataCache& cache) {
+  // check for double free or corruption
+  PADDLE_ASSERT(type(cache) != FREE_CHUNK);
+  PADDLE_ASSERT(type(cache) != INVALID_CHUNK);
+  set_type(cache, FREE_CHUNK);
+}
+void MemoryBlock::set_type(MetadataCache& cache, Type t) {
+  auto metadata = cache.load(this);
+  metadata.type = t;
+  cache.store(this, metadata);
+}
+bool MemoryBlock::has_left_buddy(MetadataCache& cache) const {
+  return left_buddy(cache) != nullptr;
+}
+bool MemoryBlock::has_right_buddy(MetadataCache& cache) const {
+  return right_buddy(cache) != nullptr;
+}
+size_t MemoryBlock::index(MetadataCache& cache) const {
+  return cache.load(this).index;
+}
+void* MemoryBlock::data() const {
+  return const_cast<Metadata*>(reinterpret_cast<const Metadata*>(this)) + 1;
+}
+MemoryBlock* MemoryBlock::metadata() const {
+  return const_cast<MemoryBlock*>(reinterpret_cast<const MemoryBlock*>(
+      reinterpret_cast<const Metadata*>(this) - 1));
+}
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/memory/detail/memory_block.h
+++ b/paddle/memory/detail/memory_block.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <cstddef>
+namespace paddle {
+namespace memory {
+namespace detail {
+// Forward Declarations
+class MetadataCache;
+/*! \brief A class used to interpret the contents of a memory block */
+class MemoryBlock {
+ public:
+  enum Type {
+    FREE_CHUNK,    // memory is free and idle
+    ARENA_CHUNK,   // memory is being occupied
+    HUGE_CHUNK,    // memory is out of management
+    INVALID_CHUNK  // memory is invalid
+  };
+ public:
+  void init(MetadataCache& cache, Type t, size_t index, size_t size,
+            void* left_buddy, void* right_buddy);
+ public:
+  /*! \brief The type of the allocation */
+  Type type(MetadataCache& cache) const;
+  /*! \brief The size of the data region */
+  size_t size(MetadataCache& cache) const;
+  /*! \brief An index to track the allocator */
+  size_t index(MetadataCache& cache) const;
+  /*! \brief The total size of the block */
+  size_t total_size(MetadataCache& cache) const;
+  /*! \brief Check the left buddy of the block */
+  bool has_left_buddy(MetadataCache& cache) const;
+  /*! \brief Check the right buddy of the block */
+  bool has_right_buddy(MetadataCache& cache) const;
+  /*! \brief Get the left buddy */
+  MemoryBlock* left_buddy(MetadataCache& cache) const;
+  /*! \brief Get the right buddy */
+  MemoryBlock* right_buddy(MetadataCache& cache) const;
+ public:
+  /*! \brief Split the allocation into left/right blocks */
+  void split(MetadataCache& cache, size_t size);
+  /*! \brief Merge left and right blocks together */
+  void merge(MetadataCache& cache, MemoryBlock* right_buddy);
+  /*! \brief Mark the allocation as free */
+  void mark_as_free(MetadataCache& cache);
+  /*! \brief Change the type of the allocation */
+  void set_type(MetadataCache& cache, Type t);
+ public:
+  /*! \brief Get a pointer to the memory block's data */
+  void* data() const;
+  /*! \brief Get a pointer to the memory block's metadata */
+  MemoryBlock* metadata() const;
+ public:
+  static size_t overhead();
+};
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/memory/detail/meta_cache.cc
+++ b/paddle/memory/detail/meta_cache.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/memory/detail/meta_cache.h"
+#include "paddle/memory/detail/memory_block.h"
+#include "paddle/platform/assert.h"
+namespace paddle {
+namespace memory {
+namespace detail {
+MetadataCache::MetadataCache(bool uses_gpu) : uses_gpu_(uses_gpu) {}
+Metadata MetadataCache::load(const MemoryBlock* block) {
+  if (uses_gpu_) {
+    auto existing_metadata = cache_.find(block);
+    PADDLE_ASSERT(existing_metadata->second.check_guards());
+    return existing_metadata->second;
+  } else {
+    PADDLE_ASSERT(reinterpret_cast<const Metadata*>(block)->check_guards());
+    return *reinterpret_cast<const Metadata*>(block);
+  }
+}
+void MetadataCache::store(MemoryBlock* block,
+                          const Metadata& original_metadata) {
+  auto metadata = original_metadata;
+  metadata.update_guards();
+  if (uses_gpu_) {
+    cache_[block] = metadata;
+  } else {
+    *reinterpret_cast<Metadata*>(block) = metadata;
+  }
+}
+void MetadataCache::invalidate(MemoryBlock* block) {
+  if (uses_gpu_) {
+    cache_.erase(block);
+  }
+}
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/memory/detail/meta_cache.h
+++ b/paddle/memory/detail/meta_cache.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include "paddle/memory/detail/memory_block.h"
+#include "paddle/memory/detail/meta_data.h"
+#include <unordered_map>
+namespace paddle {
+namespace memory {
+namespace detail {
+/**
+ *  \brief A cache for accessing memory block meta-data that may be expensive
+ *         to access directly.
+ *
+ *  \note  This class exists to unify the metadata format between GPU and CPU
+ *         allocations. It should be removed when the CPU can access all GPU
+ *         allocations directly via UVM.
+ */
+class MetadataCache {
+ public:
+  MetadataCache(bool uses_gpu);
+ public:
+  /*! \brief Load the associated metadata for the specified memory block. */
+  Metadata load(const MemoryBlock*);
+  /*! \brief Store the associated metadata for the specified memory block. */
+  void store(MemoryBlock*, const Metadata&);
+  /*! \brief Indicate that the specified metadata will no longer be used. */
+  void invalidate(MemoryBlock*);
+ public:
+  MetadataCache(const MetadataCache&) = delete;
+  MetadataCache& operator=(const MetadataCache&) = delete;
+ private:
+  bool uses_gpu_;
+ private:
+  typedef std::unordered_map<const MemoryBlock*, Metadata> MetadataMap;
+ private:
+  MetadataMap cache_;
+};
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/memory/detail/meta_data.cc
+++ b/paddle/memory/detail/meta_data.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/memory/detail/meta_data.h"
+#include <functional>
+namespace paddle {
+namespace memory {
+namespace detail {
+Metadata::Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts,
+                   MemoryBlock* l, MemoryBlock* r)
+    : type(t),
+      index(i),
+      size(s),
+      total_size(ts),
+      left_buddy(l),
+      right_buddy(r) {}
+Metadata::Metadata()
+    : type(MemoryBlock::INVALID_CHUNK),
+      index(0),
+      size(0),
+      total_size(0),
+      left_buddy(nullptr),
+      right_buddy(nullptr) {}
+template <class T>
+inline void hash_combine(std::size_t& seed, const T& v) {
+  std::hash<T> hasher;
+  seed ^= hasher(v) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+}
+inline size_t hash(const Metadata* metadata, size_t initial_seed) {
+  size_t seed = initial_seed;
+  hash_combine(seed, (size_t)metadata->type);
+  hash_combine(seed, metadata->index);
+  hash_combine(seed, metadata->size);
+  hash_combine(seed, metadata->total_size);
+  hash_combine(seed, metadata->left_buddy);
+  hash_combine(seed, metadata->right_buddy);
+  return seed;
+}
+void Metadata::update_guards() {
+  guard_begin = hash(this, 1);
+  guard_end = hash(this, 2);
+}
+bool Metadata::check_guards() const {
+  return guard_begin == hash(this, 1) && guard_end == hash(this, 2);
+}
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/memory/detail/meta_data.h
+++ b/paddle/memory/detail/meta_data.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include "paddle/memory/detail/memory_block.h"
+#include <stddef.h>
+namespace paddle {
+namespace memory {
+namespace detail {
+class Metadata {
+ public:
+  Metadata(MemoryBlock::Type t, size_t i, size_t s, size_t ts, MemoryBlock* l,
+           MemoryBlock* r);
+  Metadata();
+ public:
+  /*! \brief Update the guards when metadata is changed */
+  void update_guards();
+  /*! \brief Check consistency to previous modification */
+  bool check_guards() const;
+ public:
+  // TODO(gangliao): compress this
+  // clang-format off
+  size_t            guard_begin = 0;
+  MemoryBlock::Type type        = MemoryBlock::INVALID_CHUNK;
+  size_t            index       = 0;
+  size_t            size        = 0;
+  size_t            total_size  = 0;
+  MemoryBlock*      left_buddy  = nullptr;
+  MemoryBlock*      right_buddy = nullptr;
+  size_t            guard_end   = 0;
+  // clang-format on
+};
+}  // namespace detail
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/memory/detail/system_allocator.cc
+++ b/paddle/memory/detail/system_allocator.cc
--- a/paddle/memory/detail/system_allocator.h
+++ b/paddle/memory/detail/system_allocator.h
@@ -20,31 +20,36 @@ namespace paddle {
 namespace memory {
 namespace detail {
-// SystemAllocator is the parent class of CPUAllocator and
+/**
-// GPUAllocator.  A BuddyAllocator object uses a SystemAllocator*
+ * \brief SystemAllocator is the parent class of CPUAllocator and GPUAllocator.
-// pointing to the underlying system allocator.  An alternative to
+ *        A BuddyAllocator object uses a SystemAllocator* pointing to the
-// this class hierarchy is to pass a system allocator class to
+ *        underlying system allocator.
-// BuddyAllocator as a template parameter.  This approach makes
+ */
-// BuddyAllocator a class template, and it's very complicated
-// algorithm would make the buddy_allocator.h messy.
 class SystemAllocator {
 public:
  virtual ~SystemAllocator() {}
-  virtual void* Alloc(size_t size) = 0;
+  virtual void* Alloc(size_t& index, size_t size) = 0;
-  virtual void Free(void* p, size_t size) = 0;
+  virtual void Free(void* p, size_t size, size_t index) = 0;
+  virtual bool UseGpu() const = 0;
 };
 class CPUAllocator : public SystemAllocator {
 public:
-  virtual void* Alloc(size_t size);
+  virtual void* Alloc(size_t& index, size_t size);
-  virtual void Free(void* p, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
 };
 #ifndef PADDLE_ONLY_CPU
 class GPUAllocator : public SystemAllocator {
 public:
-  virtual void* Alloc(size_t size);
+  virtual void* Alloc(size_t& index, size_t size);
-  virtual void Free(void* p, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
+ private:
+  size_t gpu_alloc_size_ = 0;
+  size_t fallback_alloc_size_ = 0;
 };
 #endif  // PADDLE_ONLY_CPU

--- a/paddle/memory/detail/system_allocator_test.cc
+++ b/paddle/memory/detail/system_allocator_test.cc
@@ -25,7 +25,8 @@ DECLARE_bool(use_pinned_memory);
 void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
  bool freed = false;
  {
-    void* p = a.Alloc(size);
+    size_t index;
+    void* p = a.Alloc(index, size);
    if (size > 0) {
      EXPECT_NE(p, nullptr);
    } else {
@@ -35,7 +36,7 @@ void TestAllocator(paddle::memory::detail::SystemAllocator& a, size_t size) {
    int* i = static_cast<int*>(p);
    std::shared_ptr<int> ptr(i, [&](void* p) {
      freed = true;
-      a.Free(p, size);
+      a.Free(p, size, index);
    });
  }
  EXPECT_TRUE(freed);
@@ -56,14 +57,7 @@ TEST(CPUAllocator, LockMem) {
 }
 #ifndef PADDLE_ONLY_CPU
-TEST(GPUAllocator, NoStaging) {
+TEST(GPUAllocator, Alloc) {
-  FLAGS_use_pinned_memory = false;
-  paddle::memory::detail::GPUAllocator a;
-  TestAllocator(a, 2048);
-  TestAllocator(a, 0);
-}
-TEST(GPUAllocator, Staging) {
-  FLAGS_use_pinned_memory = true;
  paddle::memory::detail::GPUAllocator a;
  TestAllocator(a, 2048);
  TestAllocator(a, 0);

--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -19,9 +19,14 @@ limitations under the License. */
 namespace paddle {
 namespace memory {
-void* Alloc(paddle::platform::Place, size_t);
+template <class Place>
-void Free(paddle::platform::Place, void*);
+void* Alloc(Place, size_t);
-size_t Used(paddle::platform::Place);
+template <class Place>
+void Free(Place, void*);
+template <class Place>
+size_t Used(Place);
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/memory/memory_test.cc
+++ b/paddle/memory/memory_test.cc
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
--- a/paddle/operators/add_op.cc
+++ b/paddle/operators/add_op.cc
--- a/paddle/operators/add_op.cu
+++ b/paddle/operators/add_op.cu
--- a/paddle/operators/add_op.h
+++ b/paddle/operators/add_op.h
--- a/paddle/operators/add_op_test.cc
+++ b/paddle/operators/add_op_test.cc
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
--- a/paddle/operators/mul_op.cu
+++ b/paddle/operators/mul_op.cu
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
--- a/paddle/operators/sigmoid_op.cc
+++ b/paddle/operators/sigmoid_op.cc
--- a/paddle/operators/sigmoid_op.cu
+++ b/paddle/operators/sigmoid_op.cu
--- a/paddle/operators/sigmoid_op.h
+++ b/paddle/operators/sigmoid_op.h
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
--- a/paddle/operators/softmax_op.cu
+++ b/paddle/operators/softmax_op.cu
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
--- a/paddle/optimizer/optimizer.cc
+++ b/paddle/optimizer/optimizer.cc
--- a/paddle/optimizer/parameter_optimizer_test.cpp
+++ b/paddle/optimizer/parameter_optimizer_test.cpp
--- a/paddle/optimizer/serialization_test.cpp
+++ b/paddle/optimizer/serialization_test.cpp
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
--- a/paddle/platform/cpu_info.cc
+++ b/paddle/platform/cpu_info.cc
--- a/paddle/platform/cpu_info.h
+++ b/paddle/platform/cpu_info.h
--- a/paddle/platform/cpu_info_test.cc
+++ b/paddle/platform/cpu_info_test.cc
--- a/paddle/platform/cuda_test.cu
+++ b/paddle/platform/cuda_test.cu
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
--- a/paddle/platform/error.h
+++ b/paddle/platform/error.h
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
--- a/paddle/platform/cuda.h
+++ b/paddle/platform/cuda.h
--- a/paddle/platform/place.cc
+++ b/paddle/platform/place.cc
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
--- a/paddle/pybind/tensor_bind.h
+++ b/paddle/pybind/tensor_bind.h
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
--- a/paddle/scripts/docker/build_android.sh
+++ b/paddle/scripts/docker/build_android.sh
--- a/paddle/scripts/travis/check_style.sh
+++ b/paddle/scripts/travis/check_style.sh
--- a/paddle/string/piece.h
+++ b/paddle/string/piece.h
--- a/paddle/trainer/NewRemoteParameterUpdater.cpp
+++ b/paddle/trainer/NewRemoteParameterUpdater.cpp
--- a/paddle/trainer/NewRemoteParameterUpdater.h
+++ b/paddle/trainer/NewRemoteParameterUpdater.h
--- a/paddle/trainer/TrainerConfigHelper.cpp
+++ b/paddle/trainer/TrainerConfigHelper.cpp
--- a/paddle/utils/DynamicLoader.h
+++ b/paddle/utils/DynamicLoader.h
--- a/paddle/utils/ThreadLocal.h
+++ b/paddle/utils/ThreadLocal.h
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_gated_unit_layer.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_gated_unit_layer.py
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
--- a/python/paddle/v2/data_feeder.py
+++ b/python/paddle/v2/data_feeder.py
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
--- a/python/paddle/v2/dataset/mq2007.py
+++ b/python/paddle/v2/dataset/mq2007.py
--- a/python/paddle/v2/event.py
+++ b/python/paddle/v2/event.py
--- a/python/paddle/v2/framework/create_op_creation_methods.py
+++ b/python/paddle/v2/framework/create_op_creation_methods.py
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
--- a/python/paddle/v2/framework/tests/test_op_creation_methods.py
+++ b/python/paddle/v2/framework/tests/test_op_creation_methods.py
--- a/python/paddle/v2/framework/tests/test_tensor.py
+++ b/python/paddle/v2/framework/tests/test_tensor.py
--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py
--- a/python/paddle/v2/master/client.py
+++ b/python/paddle/v2/master/client.py
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
--- a/python/setup.py.in
+++ b/python/setup.py.in