Follow comments

941eccac · liaogang · 7010a5da · b90780c3 · 941eccac · 941eccac
112 changed files
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -21,10 +21,10 @@
    sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
    hooks:
    -   id: clang-formater
-   repo: https://github.com/dnephin/pre-commit-golang
+-   repo: https://github.com/PaddlePaddle/pre-commit-golang
-    sha: e4693a4c282b4fc878eda172a929f7a6508e7d16
+    sha: 16398aeccf263adaf53b2495eed0406347d76281
    hooks:
      -   id: go-fmt
-          files: (.*\.go)
+          types: [go]
-      -   id: go-lint
+      -   id: gometalinter
-          files: (.*\.go)
+          types: [go]
--- a/.travis.yml
+++ b/.travis.yml
@@ -41,6 +41,8 @@ before_install:
  - pip install rarfile
  - curl https://glide.sh/get | bash
  - eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
+  - go get -u github.com/alecthomas/gometalinter
+  - gometalinter --install
  - |
    function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -137,7 +137,8 @@ if(WITH_GPU)
 endif(WITH_GPU)
 if(USE_NNPACK)
-  list(APPEND EXTERNAL_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB} "rt")
+    include(external/nnpack)
+    list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS})
 endif(USE_NNPACK)
 add_subdirectory(proto)

--- a/Dockerfile
+++ b/Dockerfile
@@ -25,7 +25,7 @@ COPY ./paddle/scripts/docker/root/ /root/
 RUN apt-get update && \
    apt-get install -y \
    git python-pip python-dev openssh-server bison  \
-    wget unzip tar xz-utils bzip2 gzip coreutils ntp \
+    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
    curl sed grep graphviz libjpeg-dev zlib1g-dev  \
    python-numpy python-matplotlib gcc g++ \
    automake locales clang-format-3.8 swig doxygen cmake  \

--- a/Dockerfile.android
+++ b/Dockerfile.android
@@ -14,6 +14,17 @@ RUN apt-get update && \
    wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \
    apt-get clean -y
+# Install Go and glide
+RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
+    tar -C /usr/local -xzf go.tgz && \
+    mkdir /root/gopath && \
+    mkdir /root/gopath/bin && \
+    mkdir /root/gopath/src && \
+    rm go.tgz
+ENV GOROOT=/usr/local/go GOPATH=/root/gopath
+# PATH must not be set on the same line as the GOROOT definition; otherwise docker build cannot find GOROOT.
+ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
 # git credential to skip password typing
 RUN git config --global credential.helper store

--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -102,12 +102,19 @@ if(WITH_GOLANG)
      message(FATAL_ERROR "no glide executeble found: $ENV{GOPATH}/bin/glide")
    endif()
-    add_custom_target(go_vendor)
+    # this command will only run when the file it depends on is missing
-    add_custom_command(TARGET go_vendor
+    # or has changed, or the output is missing.
+    add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide
      COMMAND env GOPATH=${GOPATH} ${GLIDE} install
+      COMMAND touch ${CMAKE_BINARY_DIR}/glide
+      DEPENDS ${PROJ_ROOT}/go/glide.lock
      WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go"
      )
-    add_dependencies(go_vendor go_path)
+    # go_vendor depends on the custom command that outputs
+    # ${CMAKE_BINARY_DIR}/glide, so that command does not need to
+    # run every time this target is built.
+    add_custom_target(go_vendor DEPENDS ${CMAKE_BINARY_DIR}/glide go_path)
  endif()
 endif(WITH_GOLANG)
--- a/cmake/cpplint.cmake
+++ b/cmake/cpplint.cmake
@@ -27,7 +27,8 @@ set(IGNORE_PATTERN
    .*cblas\\.h.*
    .*\\.pb\\.txt
    .*LtrDataProvider.*
-    .*MultiDataProvider.*)
+    .*MultiDataProvider.*
+    .*pb.*)
 # add_style_check_target
 #
@@ -52,14 +53,13 @@ macro(add_style_check_target TARGET_NAME)
                endif()
            endforeach()
            if(LINT MATCHES ON)
+                # cpplint code style
                get_filename_component(base_filename ${filename} NAME)
                set(CUR_GEN ${CMAKE_CURRENT_BINARY_DIR}/${base_filename}.cpplint)
-                add_custom_command(OUTPUT ${CUR_GEN}
+                add_custom_command(TARGET ${TARGET_NAME} PRE_BUILD
-                    PRE_BUILD
+                    COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
-                    COMMAND env ${py_env} "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
                            "--filter=${STYLE_FILTER}"
                            "--write-success=${CUR_GEN}" ${filename}
-                    DEPENDS ${filename}
                    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
            endif()
        endforeach()

--- a/cmake/cross_compiling/android.cmake
+++ b/cmake/cross_compiling/android.cmake
@@ -108,6 +108,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
        ENDIF()
        IF(ANDROID_ABI STREQUAL "arm64-v8a")
            SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android)
+            SET(CMAKE_SYSTEM_PROCESSOR aarch64)
        ENDIF()
        SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
    ENDIF()
@@ -193,6 +194,10 @@ ELSE()
        SET(CMAKE_ANDROID_STANDALONE_TOOLCHAIN ${ANDROID_STANDALONE_TOOLCHAIN})
    ENDIF()
    SET(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ABI})
+    IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
        SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE})
+        IF(ANDROID_ABI STREQUAL "armeabi-v7a")
            SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON})
+        ENDIF()
+    ENDIF()
 ENDIF()
--- a/paddle/function/nnpack/nnpack.cmake
+++ b/paddle/function/nnpack/nnpack.cmake
@@ -7,10 +7,24 @@ set(NNPACK_ROOT $ENV{NNPACK_ROOT} CACHE PATH "Folder contains NNPACK")
 find_path(NNPACK_INC_DIR nnpack.h PATHS ${NNPACK_ROOT}/include)
 find_library(NNPACK_LIB NAMES nnpack PATHS ${NNPACK_ROOT}/lib)
 find_library(PTHREADPOOL_LIB NAMES pthreadpool PATHS ${NNPACK_ROOT}/lib)
+find_library(NNPACK_UKERNELS_LIB NAMES nnpack_ukernels PATHS ${NNPACK_ROOT}/lib)
+find_library(NNPACK_CPUFEATURES_LIB NAMES cpufeatures PATHS ${NNPACK_ROOT}/lib)
 if(NNPACK_INC_DIR AND NNPACK_LIB AND PTHREADPOOL_LIB)
  set(NNPACK_FOUND ON)
  INCLUDE_DIRECTORIES(${NNPACK_INC_DIR})
+  set(NNPACK_LIBS)
+  list(APPEND NNPACK_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB})
+  if (NNPACK_UKERNELS_LIB)
+    list(APPEND NNPACK_LIBS ${NNPACK_UKERNELS_LIB})
+  endif()
+  if (NNPACK_CPUFEATURES_LIB)
+    list(APPEND NNPACK_LIBS ${NNPACK_CPUFEATURES_LIB})
+  endif()
+  if(NOT ANDROID)
+    list(APPEND NNPACK_LIBS "rt")
+  endif()
 else()
  message(FATAL_ERROR "Cannot find NNPACK in (${NNPACK_ROOT})")
 endif()
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -185,6 +185,10 @@ function(cc_library TARGET_NAME)
      add_dependencies(${TARGET_NAME} ${cc_library_DEPS})
      target_link_libraries(${TARGET_NAME} ${cc_library_DEPS})
    endif()
+    # cpplint code style
+    add_style_check_target(${TARGET_NAME} ${cc_library_SRCS})
  else(cc_library_SRCS)
    if (cc_library_DEPS)
      merge_static_libs(${TARGET_NAME} ${cc_library_DEPS})
@@ -338,7 +342,7 @@ function(go_test TARGET_NAME)
  string(REPLACE "${PADDLE_GO_PATH}" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
  add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${go_test_DEPS})
  add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test
+    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test -race
    -c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
    ".${CMAKE_CURRENT_SOURCE_REL_DIR}"
    WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")

--- a/go/cmd/master/master.go
+++ b/go/cmd/master/master.go
@@ -11,6 +11,7 @@ import (
 	"github.com/namsral/flag"
 	log "github.com/sirupsen/logrus"
+	"github.com/topicai/candy"
 	"github.com/PaddlePaddle/Paddle/go/master"
 	"github.com/PaddlePaddle/Paddle/go/utils/networkhelper"
@@ -20,11 +21,18 @@ func main() {
 	port := flag.Int("port", 8080, "port of the master server.")
 	ttlSec := flag.Int("ttl", 60, "etcd lease TTL in seconds.")
 	endpoints := flag.String("endpoints", "http://127.0.0.1:2379", "comma separated etcd endpoints. If empty, fault tolerance will not be enabled.")
-	taskTimeoutDur := flag.Duration("task_timout_dur", 20*time.Minute, "task timout duration.")
+	taskTimeoutDur := flag.Duration("task-timout-dur", 20*time.Minute, "task timout duration.")
-	taskTimeoutMax := flag.Int("task_timeout_max", 3, "max timtout count for each task before it being declared failed task.")
+	taskTimeoutMax := flag.Int("task-timeout-max", 3, "max timtout count for each task before it being declared failed task.")
-	chunkPerTask := flag.Int("chunk_per_task", 10, "chunk per task.")
+	chunkPerTask := flag.Int("chunk-per-task", 10, "chunk per task.")
+	logLevel := flag.String("log-level", "info",
+		"log level, possible values: debug, info, warning, error, fatal, panic")
 	flag.Parse()
+	level, e := log.ParseLevel(*logLevel)
+	candy.Must(e)
+	log.SetLevel(level)
 	if *endpoints == "" {
 		log.Warningln("-endpoints not set, fault tolerance not be enabled.")
 	}

--- a/go/cmd/pserver/pserver.go
+++ b/go/cmd/pserver/pserver.go
@@ -40,7 +40,7 @@ func main() {
 		idx = *index
 	} else {
 		e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, *etcdTimeout)
-		idx, err = e.Register()
+		idx, err = e.Register(*port)
 		candy.Must(err)
 		cp, err = pserver.NewCheckpointFromFile(*checkpointPath, idx, e)

--- a/go/master/c/client.go
+++ b/go/master/c/client.go
@@ -23,7 +23,6 @@ import (
 	log "github.com/sirupsen/logrus"
 )
-var nullPtr = unsafe.Pointer(uintptr(0))
 var mu sync.Mutex
 var handleMap = make(map[C.paddle_master_client]*master.Client)
 var curHandle C.paddle_master_client
@@ -114,13 +113,13 @@ func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int {
 	if err != nil {
 		// Error
 		// TODO: return the type of error?
-		*record = (*C.uchar)(nullPtr)
+		*record = (*C.uchar)(nil)
 		return -1
 	}
 	if len(r) == 0 {
 		// Empty record
-		*record = (*C.uchar)(nullPtr)
+		*record = (*C.uchar)(nil)
 		return 0
 	}

--- a/go/master/client.go
+++ b/go/master/client.go
@@ -2,6 +2,7 @@ package master
 import (
 	"os"
+	"time"
 	"github.com/PaddlePaddle/Paddle/go/connection"
 	"github.com/PaddlePaddle/recordio"
@@ -36,9 +37,9 @@ func (c *Client) getRecords() {
 	for {
 		t, err := c.getTask()
 		if err != nil {
-			// TODO(helin): wait before move on with next
 			// getTask call.
-			log.Errorln(err)
+			log.Errorf("Get task failed, sleep 3 seconds and continue, %s", err)
+			time.Sleep(3 * time.Second)
 			continue
 		}
@@ -68,7 +69,10 @@ func (c *Client) getRecords() {
 		// We treat a task as finished whenever the last data
 		// instance of the task is read. This is not exactly
 		// correct, but a reasonable approximation.
-		c.taskFinished(t.Meta.ID)
+		err = c.taskFinished(t.Meta.ID)
+		if err != nil {
+			log.Errorln(err)
+		}
 	}
 }

--- a/go/master/client_internal_test.go
+++ b/go/master/client_internal_test.go
@@ -66,11 +66,21 @@ func TestGetFinishTask(t *testing.T) {
 	for i := 0; i < totalTask*chunkPerTask; i++ {
 		w := recordio.NewWriter(f, -1, -1)
-		w.Write(nil)
+		_, err = w.Write(nil)
+		if err != nil {
+			panic(err)
+		}
 		// call Close to force RecordIO writing a chunk.
-		w.Close()
+		err = w.Close()
+		if err != nil {
+			panic(err)
+		}
+	}
+	err = f.Close()
+	if err != nil {
+		panic(err)
 	}
-	f.Close()
 	// Manually initialize client to avoid calling c.getRecords()
 	c := &Client{}
@@ -79,7 +89,11 @@ func TestGetFinishTask(t *testing.T) {
 	ch := make(chan string, 1)
 	ch <- addr
 	go c.monitorMaster(ch)
-	c.SetDataset([]string{path})
+	err = c.SetDataset([]string{path})
+	if err != nil {
+		panic(err)
+	}
 	checkOnePass := func(i int) {
 		var tasks []Task
 		for idx := 0; idx < totalTask; idx++ {

--- a/go/master/client_test.go
+++ b/go/master/client_test.go
@@ -57,14 +57,30 @@ func TestNextRecord(t *testing.T) {
 	w := recordio.NewWriter(f, -1, -1)
 	for i := 0; i < total; i++ {
-		w.Write([]byte{byte(i)})
+		_, err = w.Write([]byte{byte(i)})
+		if err != nil {
+			panic(err)
+		}
+	}
+	err = w.Close()
+	if err != nil {
+		panic(err)
+	}
+	err = f.Close()
+	if err != nil {
+		panic(err)
 	}
-	w.Close()
-	f.Close()
 	curAddr := make(chan string, 1)
 	curAddr <- fmt.Sprintf(":%d", p)
 	c := master.NewClient(curAddr, 10)
-	c.SetDataset([]string{path})
+	err = c.SetDataset([]string{path})
+	if err != nil {
+		panic(err)
+	}
 	for pass := 0; pass < 50; pass++ {
 		received := make(map[byte]bool)
 		for i := 0; i < total; i++ {

--- a/go/master/etcd_client.go
+++ b/go/master/etcd_client.go
@@ -30,7 +30,7 @@ type EtcdClient struct {
 // NewEtcdClient creates a new EtcdClient.
 func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePath string, ttlSec int) (*EtcdClient, error) {
 	log.Debugf("Connecting to etcd at %v", endpoints)
-	// TODO(helin): gracefully shutdown etcd store. Becuase etcd
+	// TODO(helin): gracefully shutdown etcd store. Because etcd
 	// store holds an etcd lock, even though the lock will expire
 	// when the lease times out, we need to implement graceful
 	// shutdown to release the lock.
@@ -60,7 +60,7 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat
 	}
 	log.Debugf("Successfully acquired lock at %s.", lockPath)
-	put := clientv3.OpPut(addrPath, string(addr))
+	put := clientv3.OpPut(addrPath, addr)
 	resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit()
 	if err != nil {
 		return nil, err

--- a/go/master/inmem_store.go
+++ b/go/master/inmem_store.go
@@ -4,7 +4,7 @@ import "sync"
 // InMemStore is an in memory implementation of Store interface.
 //
-// It does not tolerate the fault that casues the program to crash.
+// It does not tolerate the fault that causes the program to crash.
 type InMemStore struct {
 	mu  sync.Mutex
 	buf []byte

--- a/go/master/service.go
+++ b/go/master/service.go
@@ -160,7 +160,7 @@ func (s *Service) recover() (bool, error) {
 // snapshot *must* be called with s.mu being held.
 func (s *Service) snapshot() error {
-	// TOOD(helin): etcd request has a size limit, so the snapshot
+	// TODO(helin): etcd request has a size limit, so the snapshot
 	// size is limited by the max request size. We should either
 	// divide the snapshot into smaller chunks and save under
 	// different keys, or configure the request size to be big
@@ -215,6 +215,7 @@ func readChunks(globPaths []string) ([]Chunk, error) {
 		}
 		count := index.NumChunks()
+		log.Infof("readChunks: file %s has %d chunks", path, count)
 		for i := 0; i < count; i++ {
 			chunk := Chunk{
 				Path:  path,
@@ -288,7 +289,6 @@ func (s *Service) processFailedTask(t taskEntry, epoch int) {
 	log.Warningf("Task %v failed %d times, discard.", t.Task, t.NumFailure)
 	s.taskQueues.Todo = append(s.taskQueues.Todo, t)
-	return
 }
 func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {

--- a/go/pserver/client/c/cclient.go
+++ b/go/pserver/client/c/cclient.go
@@ -34,7 +34,6 @@ import (
 	log "github.com/sirupsen/logrus"
 )
-var nullPtr = unsafe.Pointer(uintptr(0))
 var mu sync.Mutex
 var handleMap = make(map[C.paddle_pserver_client]*client.Client)
 var curHandle C.paddle_pserver_client
@@ -63,7 +62,7 @@ func remove(client C.paddle_pserver_client) *client.Client {
 }
 func cArrayToSlice(p unsafe.Pointer, len int) []byte {
-	if p == nullPtr {
+	if p == nil {
 		return nil
 	}
@@ -101,11 +100,11 @@ func paddle_new_pserver_client(addrs *C.char, selected int) C.paddle_pserver_cli
 }
 //export paddle_new_etcd_pserver_client
-func paddle_new_etcd_pserver_client(etcd_endpoints *C.char, selected int) C.paddle_pserver_client {
+func paddle_new_etcd_pserver_client(etcdEndpoints *C.char, selected int) C.paddle_pserver_client {
 	// TODO(Longfei): use etcd lock to decide which trainer to initialize the parameters
-	addr := C.GoString(etcd_endpoints)
+	addr := C.GoString(etcdEndpoints)
-	etcd_client := client.NewEtcd(addr)
+	etcdClient := client.NewEtcd(addr)
-	c := client.NewClient(etcd_client, etcd_client.Desired(), selector(selected != 0))
+	c := client.NewClient(etcdClient, etcdClient.Desired(), selector(selected != 0))
 	return add(c)
 }
@@ -124,20 +123,20 @@ func paddle_begin_init_params(client C.paddle_pserver_client) C.int {
 }
 //export paddle_init_param
-func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, param_config unsafe.Pointer, config_len C.int) C.int {
+func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, paramConfig unsafe.Pointer, configLen C.int) C.int {
 	et := pserver.ElementType(param.element_type)
 	name := C.GoString(param.name)
 	content := cArrayToSlice(unsafe.Pointer(param.content), int(param.content_len))
 	pc := pserver.ParameterWithConfig{
 		Param:  pserver.Parameter{Name: name, ElementType: et, Content: content},
-		Config: cArrayToSlice(param_config, int(config_len)),
+		Config: cArrayToSlice(paramConfig, int(configLen)),
 	}
 	c := get(client)
 	err := c.InitParam(pc)
 	if err != nil {
 		if err.Error() == pserver.AlreadyInitialized {
-			log.Warningf("parameter %s already initialized, treat paddle_init_param as sucessful.", name)
+			log.Warningf("parameter %s already initialized, treat paddle_init_param as successful.", name)
 			return C.PSERVER_OK
 		}
 		log.Errorln(err)
@@ -153,7 +152,7 @@ func paddle_finish_init_params(client C.paddle_pserver_client) C.int {
 	err := c.FinishInitParams()
 	if err != nil {
 		if err.Error() == pserver.AlreadyInitialized {
-			log.Warningln("parameters already initialized, treat paddle_finish_init_params as sucessful.")
+			log.Warningln("parameters already initialized, treat paddle_finish_init_params as successful.")
 			return C.PSERVER_OK
 		}
@@ -223,12 +222,12 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter,
 		p := ps[i]
 		param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))
-		if unsafe.Pointer(param) == nullPtr {
+		if unsafe.Pointer(param) == nil {
 			log.Errorln("must pre-allocate parameter.")
 			return C.PSERVER_ERROR
 		}
-		if unsafe.Pointer(param.content) != nullPtr {
+		if unsafe.Pointer(param.content) != nil {
 			if int(param.content_len) != len(p.Content) {
 				log.Errorf("the pre-allocated content len does not match parameter content len. Pre-allocated len: %d, returned len: %d", param.content_len, len(p.Content))
 				return C.PSERVER_ERROR

--- a/go/pserver/client/c/test/test_train.py
+++ b/go/pserver/client/c/test/test_train.py
 import paddle.v2 as paddle
 import paddle.v2.dataset.uci_housing as uci_housing
+import paddle.v2.master as master
+import os
+import cPickle as pickle
+etcd_ip = os.getenv("MASTER_IP", "127.0.0.1")
+etcd_endpoint = "http://" + etcd_ip + ":2379"
+def cloud_reader():
+    print "connecting to master, etcd endpoints: ", etcd_endpoint
+    master_client = master.client(etcd_endpoint, 5, 64)
+    master_client.set_dataset(
+        ["/pfs/dlnel/public/dataset/uci_housing/uci_housing-*-of-*"])
+    while 1:
+        r, e = master_client.next_record()
+        if not r:
+            break
+        yield pickle.loads(r)
 def main():
@@ -22,13 +40,13 @@ def main():
    # create optimizer of new remote updater to pserver
    optimizer = paddle.optimizer.Momentum(momentum=0)
-    #TODO(zhihong) : replace optimizer with new OptimizerConfig
+    print "etcd endoint: ", etcd_endpoint
    trainer = paddle.trainer.SGD(cost=cost,
                                 parameters=parameters,
                                 update_equation=optimizer,
                                 is_local=False,
-                                 pserver_spec="localhost:3000")
+                                 pserver_spec=etcd_endpoint,
+                                 use_etcd=True)
    # event_handler to print training and testing info
    def event_handler(event):
@@ -47,11 +65,11 @@ def main():
                print "Test %d, %.2f" % (event.pass_id, result.cost)
    # training
+    # NOTE: use uci_housing.train() as reader for non-paddlecloud training
    trainer.train(
        reader=paddle.batch(
            paddle.reader.shuffle(
-                uci_housing.train(), buf_size=500),
+                cloud_reader, buf_size=500), batch_size=2),
-            batch_size=2),
        feeding={'x': 0,
                 'y': 1},
        event_handler=event_handler,

--- a/go/pserver/client/client.go
+++ b/go/pserver/client/client.go
@@ -233,7 +233,7 @@ func (c *Client) Save(path string) error {
 func strHash(s string) uint32 {
 	h := fnv.New32a()
-	h.Write([]byte(s))
+	_, _ = h.Write([]byte(s))
 	return h.Sum32()
 }

--- a/go/pserver/client/client_test.go
+++ b/go/pserver/client/client_test.go
@@ -79,15 +79,33 @@ func initEtcdClient() {
 		log.Errorf("err %v", err)
 	}
 	ctx, cancel := context.WithTimeout(context.Background(), timeout)
-	client.Delete(ctx, pserver.PsDesired)
+	_, err = client.Delete(ctx, pserver.PsDesired)
-	client.Delete(ctx, pserver.PsPath)
+	if err != nil {
-	client.Put(ctx, pserver.PsDesired, strconv.Itoa(numPserver))
+		panic(err)
+	}
+	_, err = client.Delete(ctx, pserver.PsPath)
+	if err != nil {
+		panic(err)
+	}
+	_, err = client.Put(ctx, pserver.PsDesired, strconv.Itoa(numPserver))
+	if err != nil {
+		panic(err)
+	}
 	ports := initClient()
 	for i := 0; i < numPserver; i++ {
-		client.Put(ctx, pserver.PsPath+strconv.Itoa(i), ":"+strconv.Itoa(ports[i]))
+		_, err = client.Put(ctx, pserver.PsPath+strconv.Itoa(i), ":"+strconv.Itoa(ports[i]))
+		if err != nil {
+			panic(err)
+		}
 	}
 	cancel()
-	client.Close()
+	err = client.Close()
+	if err != nil {
+		panic(err)
+	}
 }
 type selector bool
@@ -164,7 +182,7 @@ func testClient(t *testing.T, c *client.Client) {
 		wg.Add(1)
 		go func(gs []pserver.Gradient) {
-			err = c.SendGrads(gs)
+			err := c.SendGrads(gs)
 			if err != nil {
 				t.Fatal(err)
 			}

--- a/go/pserver/client/etcd_client.go
+++ b/go/pserver/client/etcd_client.go
@@ -12,7 +12,7 @@ import (
 )
 const (
-	DefaultEtcdTimeout time.Duration = 5 * time.Second
+	defaultEtcdTimeout time.Duration = 5 * time.Second
 )
 // EtcdClient is used by pserver client that is a part of trainer process.
@@ -47,7 +47,7 @@ func (p *EtcdClient) Desired() int {
 		psDesired, err = strconv.Atoi(string(resp.Kvs[0].Value))
 		if err != nil {
-			log.Errorf("psDesired %s invalid %v", psDesired, err)
+			log.Errorf("psDesired %d invalid %v", psDesired, err)
 			time.Sleep(p.timeout)
 			continue
 		}
@@ -106,11 +106,11 @@ func NewEtcd(endpoints string) *EtcdClient {
 	for {
 		cli, err = clientv3.New(clientv3.Config{
 			Endpoints:   ep,
-			DialTimeout: DefaultEtcdTimeout,
+			DialTimeout: defaultEtcdTimeout,
 		})
 		if err != nil {
 			log.Errorf("Init etcd connection failed: %v", err)
-			time.Sleep(DefaultEtcdTimeout)
+			time.Sleep(defaultEtcdTimeout)
 			continue
 		}
 		break
@@ -118,7 +118,7 @@ func NewEtcd(endpoints string) *EtcdClient {
 	log.Infof("Connected to etcd: %s\n", endpoints)
 	client := &EtcdClient{
 		client:    cli,
-		timeout:   DefaultEtcdTimeout,
+		timeout:   defaultEtcdTimeout,
 		endpoints: ep,
 	}
 	return client

--- a/go/pserver/etcd_client.go
+++ b/go/pserver/etcd_client.go
@@ -49,7 +49,7 @@ func NewEtcdClient(endpoints string, numPservers int, timeout time.Duration) *Et
 // Register registers the pserver on etcd
 //
 // Register returns the index of the current pserver.
-func (e *EtcdClient) Register() (int, error) {
+func (e *EtcdClient) Register(port int) (int, error) {
 	var err error
 	e.externalIP, err = networkhelper.GetExternalIP()
@@ -116,7 +116,7 @@ func (e *EtcdClient) Register() (int, error) {
 	for {
 		ctx, cancel := context.WithTimeout(context.Background(), time.Second)
 		var err error
-		pserverIdx, err = e.registerPserverEtcd(ctx)
+		pserverIdx, err = e.registerPserverEtcd(ctx, port)
 		cancel()
 		if err != nil {
 			log.Warn(err)
@@ -140,7 +140,7 @@ func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) (
 }
 // registerPserverEtcd registers pserver node on etcd using transaction.
-func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) {
+func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, error) {
 	var idx int
 	_, err := concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error {
 		registered := false
@@ -156,8 +156,9 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) {
 					log.Fatal(err)
 				}
 				// find the first id and write info
-				c.Put(psKey, e.externalIP, clientv3.WithLease(resp.ID))
+				pserverAddr := e.externalIP + ":" + strconv.Itoa(port)
-				log.Debugf("set pserver node %s with value %s", psKey, e.externalIP)
+				c.Put(psKey, pserverAddr, clientv3.WithLease(resp.ID))
+				log.Debugf("set pserver node %s with value %s", psKey, pserverAddr)
 				ch, kaerr := e.etcdClient.KeepAlive(context.TODO(), resp.ID)
 				if kaerr != nil {
 					log.Errorf("keepalive etcd node error: %v", kaerr)
@@ -176,10 +177,10 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) {
 				break
 			}
 		}
-		if registered == true {
+		if registered {
 			return nil
 		}
-		return errors.New("not registerd, may due to already have enough pservers")
+		return errors.New("not registered, may due to already have enough pservers")
 	}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))
 	if err != nil {
@@ -210,8 +211,5 @@ func (e *EtcdClient) PutKey(key string, value []byte, timeout time.Duration) err
 	ctx, cancel := context.WithTimeout(context.Background(), timeout)
 	_, err := e.etcdClient.Put(ctx, key, string(value))
 	cancel()
-	if err != nil {
 	return err
-	}
-	return nil
 }
--- a/go/pserver/optimizer.go
+++ b/go/pserver/optimizer.go
@@ -14,8 +14,6 @@ import (
 	log "github.com/sirupsen/logrus"
 )
-var nullPtr = unsafe.Pointer(uintptr(0))
 type optimizer struct {
 	opt         *C.struct_paddle_optimizer
 	elementType ElementType
@@ -23,7 +21,7 @@ type optimizer struct {
 }
 func cArrayToSlice(p unsafe.Pointer, len int) []byte {
-	if p == nullPtr {
+	if p == nil {
 		return nil
 	}
@@ -92,8 +90,8 @@ func (o *optimizer) UpdateParameter(g Gradient) error {
 }
 func (o *optimizer) Cleanup() {
-	if unsafe.Pointer(o.opt) != nullPtr {
+	if unsafe.Pointer(o.opt) != nil {
 		C.paddle_release_optimizer(o.opt)
-		o.opt = (*C.struct_paddle_optimizer)(nullPtr)
+		o.opt = (*C.struct_paddle_optimizer)(nil)
 	}
 }
--- a/go/pserver/service.go
+++ b/go/pserver/service.go
@@ -211,7 +211,7 @@ func (s *Service) GetParam(name string, parameter *Parameter) error {
 	// learning optimization methods are stochastic in
 	// nature. This race condition is allowed deliberately
 	// to save the program from making a copy of the
-	// paramter content.
+	// parameter content.
 	parameter.Name = name
 	parameter.ElementType = opt.elementType
 	parameter.Content = opt.GetWeights()
@@ -219,7 +219,7 @@ func (s *Service) GetParam(name string, parameter *Parameter) error {
 }
 // pserver save checkpoint
-func (s *Service) doCheckpoint() error {
+func (s *Service) doCheckpoint() (err error) {
 	<-s.initialized
 	s.mu.Lock()
 	defer s.mu.Unlock()
@@ -237,9 +237,9 @@ func (s *Service) doCheckpoint() error {
 	}
 	var buf bytes.Buffer
 	encoder := gob.NewEncoder(&buf)
-	err := encoder.Encode(cp)
+	err = encoder.Encode(cp)
 	if err != nil {
-		return err
+		return
 	}
 	cpMeta := checkpointMeta{}
@@ -248,10 +248,14 @@ func (s *Service) doCheckpoint() error {
 	h := md5.New()
 	cpMeta.MD5 = hex.EncodeToString(h.Sum(buf.Bytes()))
-	cpMetajson, _ := json.Marshal(cpMeta)
+	cpMetajson, err := json.Marshal(cpMeta)
+	if err != nil {
+		return
+	}
 	err = s.client.PutKey(filepath.Join(PsCheckpoint, strconv.Itoa(s.idx)), cpMetajson, 3*time.Second)
 	if err != nil {
-		return err
+		return
 	}
 	if _, err = os.Stat(cpMeta.UUID); os.IsNotExist(err) {
 		log.Info("checkpoint does not exists.")
@@ -264,15 +268,32 @@ func (s *Service) doCheckpoint() error {
 		}
 	}
 	f, err := os.Create(cpMeta.UUID)
-	defer f.Close()
 	if err != nil {
-		return err
+		return
+	}
+	defer func() {
+		closeErr := f.Close()
+		if closeErr != nil {
+			if err != nil {
+				log.Errorln(closeErr)
+			} else {
+				// Set closeErr as return value.
+				err = closeErr
 			}
+		}
+	}()
 	writer := bufio.NewWriter(f)
 	_, err = writer.Write(buf.Bytes())
-	writer.Flush()
 	if err != nil {
-		return err
+		return
 	}
-	return nil
+	err = writer.Flush()
+	if err != nil {
+		return
+	}
+	return
 }
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -843,7 +843,8 @@ public:
                                               bool useSparseUpdater);
  static ParameterUpdater* createNewRemoteUpdater(
      OptimizationConfig* config,
-      const std::string pserverSpec) throw(UnsupportError);
+      const std::string pserverSpec,
+      const bool useEtcd) throw(UnsupportError);
  ~ParameterUpdater();
  /**

--- a/paddle/api/ParameterUpdater.cpp
+++ b/paddle/api/ParameterUpdater.cpp
@@ -33,11 +33,12 @@ ParameterUpdater *ParameterUpdater::createLocalUpdater(
 ParameterUpdater *ParameterUpdater::createNewRemoteUpdater(
    OptimizationConfig *config,
-    const std::string pserverSpec) throw(UnsupportError) {
+    const std::string pserverSpec,
+    const bool useEtcd) throw(UnsupportError) {
 #ifndef PADDLE_WITHOUT_GOLANG
  auto updater = new ParameterUpdater();
  updater->m->updater.reset(new paddle::NewRemoteParameterUpdater(
-      config->m->getConfig(), pserverSpec));
+      config->m->getConfig(), pserverSpec, useEtcd));
  return updater;
 #else
  throw UnsupportError();

--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
 # ddim lib
-cc_library(ddim SRCS ddim.cc)
+cc_library(ddim SRCS ddim.cc DEPS eigen3)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
-cc_test(tensor_test SRCS tensor_test.cc DEPS ddim)
+cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory)
+cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
 cc_test(variable_test SRCS variable_test.cc)
 cc_test(scope_test SRCS scope_test.cc)
 proto_library(attr_type SRCS attr_type.proto)
 proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
-cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
 proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
+cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
 cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
-cc_library(operator SRCS operator.cc DEPS op_desc device_context)
+cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc)
 cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry operator)
 py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto)
 # Generate an empty __init__.py to make framework_py_proto a valid Python module.
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)
 proto_library(net_proto SRCS net_proto.proto DEPS op_proto)
-cc_library(net SRCS net.cc DEPS net_proto)
+cc_library(net SRCS net.cc DEPS operator net_proto op_registry)
+cc_test(net_op_test SRCS net_op_test.cc DEPS net)
--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #include "paddle/framework/ddim.h"
+#include "paddle/platform/enforce.h"
 namespace paddle {
 namespace framework {
-///@cond HIDDEN
+/// @cond HIDDEN
 template <int i>
 Dim<i> make_dim(const int* d) {
@@ -50,7 +65,7 @@ void make_ddim(DDim& ddim, const int* dims, int n) {
  }
 }
-///@endcond
+/// @endcond
 DDim make_ddim(std::initializer_list<int> dims) {
  DDim result(make_dim(0));
@@ -64,11 +79,11 @@ DDim make_ddim(const std::vector<int>& dims) {
  return result;
 }
-///@cond HIDDEN
+/// @cond HIDDEN
 // XXX For some reason, putting this in an anonymous namespace causes errors
 class DynamicMutableIndexer : public boost::static_visitor<int&> {
 public:
-  DynamicMutableIndexer(int idx) : idx_(idx) {}
+  explicit DynamicMutableIndexer(int idx) : idx_(idx) {}
  template <int D>
  int& operator()(Dim<D>& dim) const {
@@ -81,7 +96,7 @@ class DynamicMutableIndexer : public boost::static_visitor<int&> {
 class DynamicConstIndexer : public boost::static_visitor<int> {
 public:
-  DynamicConstIndexer(int idx) : idx_(idx) {}
+  explicit DynamicConstIndexer(int idx) : idx_(idx) {}
  template <int D>
  int operator()(const Dim<D>& dim) const {
@@ -92,7 +107,7 @@ class DynamicConstIndexer : public boost::static_visitor<int> {
  int idx_;
 };
-///@endcond
+/// @endcond
 int& DDim::operator[](int idx) {
  return boost::apply_visitor(DynamicMutableIndexer(idx), var);
@@ -102,6 +117,8 @@ int DDim::operator[](int idx) const {
  return boost::apply_visitor(DynamicConstIndexer(idx), var);
 }
+ssize_t DDim::size() const { return arity(*this); }
 bool DDim::operator==(DDim d) const {
  if (var.which() != d.getVar().which()) {
    return false;
@@ -155,11 +172,11 @@ int get(const DDim& ddim, int idx) { return ddim[idx]; }
 void set(DDim& ddim, int idx, int value) { ddim[idx] = value; }
-///@cond HIDDEN
+/// @cond HIDDEN
 struct VectorizeVisitor : public boost::static_visitor<> {
  std::vector<int>& vector;
-  VectorizeVisitor(std::vector<int>& v) : vector(v) {}
+  explicit VectorizeVisitor(std::vector<int>& v) : vector(v) {}
  template <typename T>
  void operator()(const T& t) {
@@ -169,7 +186,7 @@ struct VectorizeVisitor : public boost::static_visitor<> {
  void operator()(const Dim<1>& t) { vector.push_back(t.head); }
 };
-///@endcond
+/// @endcond
 std::vector<int> vectorize(const DDim& ddim) {
  std::vector<int> result;
@@ -178,16 +195,59 @@ std::vector<int> vectorize(const DDim& ddim) {
  return result;
 }
+struct ProductVisitor : public boost::static_visitor<ssize_t> {
+  template <int D>
+  ssize_t operator()(const Dim<D>& dim) {
+    return product(dim);
+  }
+};
 ssize_t product(const DDim& ddim) {
-  ssize_t result = 1;
+  ProductVisitor visitor;
-  std::vector<int> v = vectorize(ddim);
+  return boost::apply_visitor(visitor, ddim);
-  for (auto i : v) {
+}
-    result *= i;
+// Visitor that appends the extents of a Dim lying in the half-open index
+// range [begin, end) to `vector`. `begin` and `end` are counted down while
+// the visitor walks the Dim from its head towards its tail.
+struct SliceVectorizeVisitor : public boost::static_visitor<> {
+  std::vector<int>& vector;
+  int begin;
+  int end;
+  // Stores the output vector and the range, enforcing 0 <= begin < end.
+  SliceVectorizeVisitor(std::vector<int>& v, int b, int e)
+      : vector(v), begin(b), end(e) {
+    PADDLE_ENFORCE(begin < end,
+                   "Begin index must be less than end index in ddim slice.");
+    PADDLE_ENFORCE(begin >= 0,
+                   "Begin index can't be less than zero in ddim slice.");
  }
-  return result;
+  // Handles one leading dimension, then recurses into the tail.
+  template <int S>
+  void operator()(const Dim<S>& dim) {
+    // While begin > 0 the current head lies before the slice and is skipped;
+    // once begin has reached 0 every visited head is appended.
+    if (begin == 0) {
+      vector.push_back(dim.head);
+    } else {
+      --begin;
+    }
+    // end counts how many more dimensions may be visited; stop at zero.
+    --end;
+    if (end > 0) {
+      this->operator()(dim.tail);
+    }
+  }
+  // Last dimension: reaching it is only valid when exactly one index remains,
+  // otherwise `end` pointed past the Dim.
+  void operator()(const Dim<1>& dim) {
+    PADDLE_ENFORCE(end == 1, "End index in ddim slice is out of bound.");
+    vector.push_back(dim.head);
+  }
+};
+// Returns a new DDim holding the extents of `dim` in the half-open index
+// range [begin, end).
+//
+// The range is validated by SliceVectorizeVisitor: its constructor enforces
+// 0 <= begin < end and its Dim<1> overload enforces that `end` is in bounds.
+DDim slice_ddim(const DDim& dim, int begin, int end) {
+  std::vector<int> vec;
+  // Build the visitor first so an invalid range is reported by
+  // PADDLE_ENFORCE; reserving up front would turn a negative `end - begin`
+  // into a huge unsigned request that fails inside std::vector instead.
+  SliceVectorizeVisitor visitor(vec, begin, end);
+  vec.reserve(end - begin);
+  boost::apply_visitor(visitor, dim);
+  return make_ddim(vec);
 }
-///\cond HIDDEN
+/// \cond HIDDEN
 struct ArityVisitor : boost::static_visitor<int> {
  template <int D>
@@ -196,15 +256,15 @@ struct ArityVisitor : boost::static_visitor<int> {
  }
 };
-///\endcond
+/// \endcond
 int arity(const DDim& d) { return boost::apply_visitor(ArityVisitor(), d); }
-///\cond HIDDEN
+/// \cond HIDDEN
 struct DDimPrinter : boost::static_visitor<void> {
  std::ostream& os;
-  DDimPrinter(std::ostream& os_) : os(os_) {}
+  explicit DDimPrinter(std::ostream& os_) : os(os_) {}
  template <typename T>
  void operator()(const T& t) {
@@ -212,7 +272,7 @@ struct DDimPrinter : boost::static_visitor<void> {
  }
 };
-///\endcond
+/// \endcond
 std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
  DDimPrinter printer(os);
@@ -220,5 +280,9 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
  return os;
 }
+DDim::DDim(std::initializer_list<int> init_list) {
+  *this = make_ddim(init_list);
+}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #pragma once
 #include <boost/variant.hpp>
 #include <initializer_list>
 #include <stdexcept>
 #include <vector>
 #include "paddle/framework/dim.h"
+#include "paddle/platform/enforce.h"
+#include "unsupported/Eigen/CXX11/Tensor"
 namespace paddle {
 namespace framework {
@@ -27,7 +42,9 @@ struct DDim {
  DDim() : var(Dim<1>()) {}
  template <int D>
-  DDim(const Dim<D>& in) : var(in) {}
+  explicit DDim(const Dim<D>& in) : var(in) {}
+  /*implicit*/ DDim(std::initializer_list<int> init_list);
  template <int D>
  DDim& operator=(const Dim<D>& in) {
@@ -57,6 +74,8 @@ struct DDim {
  DDim operator+(DDim d) const;
  DDim operator*(DDim d) const;
+  ssize_t size() const;
 };
 /**
@@ -81,6 +100,15 @@ std::vector<int> vectorize(const DDim& ddim);
 ssize_t product(const DDim& ddim);
+/**
+ * \brief Slice a ddim
+ *
+ * Slice dim with [begin, end).
+ * e.g.  DDim d = make_ddim({1,2,3,4,5});
+ *       slice_ddim(d, 1, 3); ====> {2,3}
+ */
+DDim slice_ddim(const DDim& dim, int begin, int end);
 /**
 * \brief What is the length of this dimension?
 *
@@ -91,6 +119,17 @@ int arity(const DDim& ddim);
 std::ostream& operator<<(std::ostream&, const DDim&);
+/**
+ * \brief Convert a DDim into an Eigen::DSizes of compile-time rank NDIMS.
+ *
+ * Enforces that the arity of `dims` equals NDIMS, then copies every extent
+ * in order.
+ */
+template <int NDIMS>
+Eigen::DSizes<Eigen::DenseIndex, NDIMS> ToEigenDSizes(const DDim& dims) {
+  const int rank = arity(dims);
+  PADDLE_ENFORCE(rank == NDIMS, "DDim and NDIMS must be same");
+  Eigen::DSizes<Eigen::DenseIndex, NDIMS> dsizes;
+  int axis = 0;
+  while (axis < rank) {
+    dsizes[axis] = dims[axis];
+    ++axis;
+  }
+  return dsizes;
+}
 }  // namespace framework
 }  // namespace paddle

--- a/paddle/framework/ddim_test.cc
+++ b/paddle/framework/ddim_test.cc
@@ -49,9 +49,30 @@ TEST(DDim, Equality) {
  // arity of a DDim
  EXPECT_EQ(paddle::framework::arity(ddim), 3);
+  EXPECT_EQ(ddim.size(), 3);
  // product of a DDim
  EXPECT_EQ(paddle::framework::product(vddim), 45);
+  EXPECT_EQ(
+      paddle::framework::product(paddle::framework::make_ddim({3, 2, 5, 3})),
+      90);
+  // slice a DDim
+  paddle::framework::DDim ddim2 =
+      paddle::framework::make_ddim({1, 2, 3, 4, 5, 6});
+  paddle::framework::DDim ss = paddle::framework::slice_ddim(ddim2, 2, 5);
+  EXPECT_EQ(arity(ss), 3);
+  EXPECT_EQ(ss[0], 3);
+  EXPECT_EQ(ss[1], 4);
+  EXPECT_EQ(ss[2], 5);
+  paddle::framework::DDim ss2 = paddle::framework::slice_ddim(ddim2, 0, 6);
+  EXPECT_EQ(arity(ss2), 6);
+  EXPECT_EQ(ss2[0], 1);
+  EXPECT_EQ(ss2[1], 2);
+  EXPECT_EQ(ss2[2], 3);
+  EXPECT_EQ(ss2[3], 4);
+  EXPECT_EQ(ss2[4], 5);
+  EXPECT_EQ(ss2[5], 6);
 }
 TEST(DDim, Print) {

--- a/paddle/framework/dim_test.cu
+++ b/paddle/framework/dim_test.cu
 #include <thrust/device_vector.h>
 #include <sstream>
-#include "paddle/framework/dim.h"
 #include "gtest/gtest.h"
+#include "paddle/framework/dim.h"
 __global__ void test(paddle::framework::Dim<2>* o) {
  o[0] = paddle::framework::make_dim(5, 6);
@@ -21,7 +21,7 @@ TEST(Dim, Equality) {
  // construct a Dim on the GPU
  thrust::device_vector<paddle::framework::Dim<2>> t(2);
-    test<<<1,1>>>(thrust::raw_pointer_cast(t.data()));
+  test<<<1, 1>>>(thrust::raw_pointer_cast(t.data()));
  a = t[0];
  EXPECT_EQ(paddle::framework::get<0>(a), 5);
  EXPECT_EQ(paddle::framework::get<1>(a), 6);
@@ -48,12 +48,13 @@ TEST(Dim, Equality) {
  // dynamic access on GPU
  thrust::device_vector<int> r(1);
-    dyn_idx_gpu<<<1,1>>>(thrust::raw_pointer_cast(r.data()));
+  dyn_idx_gpu<<<1, 1>>>(thrust::raw_pointer_cast(r.data()));
  int res = r[0];
  EXPECT_EQ(res, 6);
  // ex_prefix_mul
-    paddle::framework::Dim<3> c = paddle::framework::ex_prefix_mul(paddle::framework::Dim<3>(3, 4, 5));
+  paddle::framework::Dim<3> c =
+      paddle::framework::ex_prefix_mul(paddle::framework::Dim<3>(3, 4, 5));
  EXPECT_EQ(paddle::framework::get<0>(c), 1);
  EXPECT_EQ(paddle::framework::get<1>(c), 3);
  EXPECT_EQ(paddle::framework::get<2>(c), 12);

--- a/paddle/framework/net.cc
+++ b/paddle/framework/net.cc
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
 #include "paddle/framework/net.h"
 namespace paddle {
 namespace framework {
-PlainNet::PlainNet(const NetDesc& def) {}
+// Seals the network and derives its own inputs_/outputs_ from the operators
+// added so far:
+//   - an operator input that no earlier operator produced becomes a net input;
+//   - every operator output becomes a net output;
+//   - an output that a later operator consumes is additionally recorded, by
+//     its position in outputs_, under the "temporary_index" attribute.
+// NOTE(review): inputs_/outputs_ are not cleared first, so calling this twice
+// would presumably append duplicates; confirm callers invoke it only once.
+void PlainNet::CompleteAddOp() {
+  std::unordered_set<std::string> input_set;
-void PlainNet::InferShape(const ScopePtr& scope) const {
+  std::unordered_set<std::string> output_set;
+  std::unordered_set<std::string> temp_output;
  for (auto& op : ops_) {
-    op.InferShape();
+    // output_set holds only outputs of operators visited before this one,
+    // because the current operator's outputs are inserted after its inputs
+    // have been classified.
+    for (auto& ipt : op->inputs_) {
+      if (!Contains(output_set, ipt)) {  // Not other op's output
+        input_set.insert(ipt);
+      } else {
+        temp_output.insert(ipt);
+      }
+    }
+    for (auto& opt : op->outputs_) {
+      output_set.insert(opt);
+    }
+  }
+  inputs_.reserve(input_set.size());
+  std::copy(input_set.begin(), input_set.end(), std::back_inserter(inputs_));
+  outputs_.reserve(output_set.size());
+  std::vector<int> tmp_index;
+  tmp_index.reserve(temp_output.size());
+  // outputs_ follows the iteration order of the unordered set; idx tracks the
+  // position each name receives so temporaries can be referenced by index.
+  int idx = 0;
+  for (auto& opt : output_set) {
+    if (Contains(temp_output, opt)) {
+      tmp_index.push_back(idx);
    }
+    outputs_.push_back(opt);
+    ++idx;
+  }
+  attrs_["temporary_index"] = tmp_index;
+  // From here on AddOp() is rejected by its enforce.
+  add_op_done_ = true;
 }
-void PlainNet::Run(const ScopePtr& scope, const DeviceContext& ctx) const {
+std::string PlainNet::DebugString() const {
+  std::ostringstream os;
+  os << this->type_ << ":" << std::endl;
  for (auto& op : ops_) {
-    op.Run(ctx);
+    os << "\t" << op->DebugString() << std::endl;
  }
+  return os.str();
 }
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/net.h
+++ b/paddle/framework/net.h
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-   Licensed under the Apache License, Version 2.0 (the "License");
+Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
+you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
+You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
-   Unless required by applicable law or agreed to in writing, software
+Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
+distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
+See the License for the specific language governing permissions and
-   limitations under the License. */
+limitations under the License. */
 #pragma once
+#include <paddle/framework/op_desc.pb.h>
+#include <paddle/framework/operator.h>
 #include "paddle/framework/net_proto.pb.h"
 #include "paddle/framework/op_proto.pb.h"
+#include "paddle/framework/op_registry.h"
 #include "paddle/framework/scope.h"
 #include "paddle/platform/device_context.h"
 namespace paddle {
 namespace framework {
-using namespace paddle::platform;
-// operator's index stored in a network.
-typedef int OpIndex;
-/**
- * NOTE following codes are some definitions of unimplemented concepts.
- * We write some basic implementation to make Net compilable. These APIs will
- * keep updating if the concepts related are implemented.
- */
-struct OpDesc;
-struct OpAttrs {};
-class Operator {
- public:
-  Operator(const OpDesc &def) {}
-  void InferShape() const {}
-  void Run(const DeviceContext &ctx) const {}
-};
 /**
- * @brief Network that manage the operators it has.
+ * @brief Network is also a type of Operator
+ *
+ * It will manage the operators it has.
 *
- * Network is the container and controller of a set of operators, user can build
+ * Network is the container and controller of a set of operators.
- * a real network from a NetDesc which is a protobuf message and use
- * Network.Run() * to run all the operators in the network.
 * A network object knows all Operators belonging to this network. Variables,
 * which are inputs and outputs of these operators, are created and managed by a
 * hierarchy of Scope objects.
 *
- * This is the base class of network, all the networks should implement the apis
+ * This is the base class of network, all the networks should implement the APIs
 * it defines.
 */
-class Net {
+class Net : public OperatorBase {
 public:
-  /**
+  virtual void AddOp(const OperatorPtr& op) = 0;
-   * @brief Infer shapes of all inputs and outputs of operators.
+  virtual void CompleteAddOp() = 0;
-   */
-  virtual void InferShape(const ScopePtr &scope) const = 0;
-  /**
-   * @brief Run the network.
-   *
-   * Run all the operators and return success(true) or not, with all the
-   * variables are located in `scope`. `context` describes the detail execution
-   * environment for ops. `begin` and `end` specify the scope of `ops_` to run,
-   * If no positive indexes are provided, all operators in `ops_` will run.
-   */
-  virtual void Run(const ScopePtr &scope, const DeviceContext &ctx) const = 0;
-  /**
-   * @brief Add an Operator according to `def`.
-   */
-  virtual OpIndex AddOp(const OpProto &def) = 0;
-  /**
-   * @brief Add optimizer operators acctording to `attrs`.
-   */
-  virtual void AddOptimizerOps(const OpAttrs &attrs) = 0;
-  /**
-   * @brief Add backward operators.
-   */
-  virtual void AddBackwardOps() = 0;
-  /**
-   * @brief Create a network.
-   */
-  static std::unique_ptr<Net> Create(const NetDesc &def = NetDesc());
-  virtual ~Net() {}
 };
+using NetPtr = std::shared_ptr<Net>;
 /**
 * @brief a basic implementation of Net.
 *
@@ -103,18 +55,14 @@ class Net {
 class PlainNet : public Net {
 public:
  /**
-   * @brief Initialize a PlainNet.
+   * Infer all the operators' input and output variables' shapes, will be called
-   *
-   * Initialize from  a network describe by `def`. NetDesc is the definition of
-   * a network.
-   */
-  PlainNet(const NetDesc &def);
-  /**
-   * Infer all the operators' input and output varialbes' shapes, will be called
   * before every mini-batch
   */
-  virtual void InferShape(const ScopePtr &scope) const override;
+  void InferShape(const ScopePtr& scope) const override {
+    for (auto& op : ops_) {
+      op->InferShape(scope);
+    }
+  }
  /**
   * @brief Run the network.
@@ -123,49 +71,34 @@ class PlainNet : public Net {
   * scope will be used instead. If no OpContext is provided, default context
   * will be used.
   */
-  virtual void Run(const ScopePtr &scope,
+  void Run(const ScopePtr& scope,
-                   const DeviceContext &ctx) const override;
+           const platform::DeviceContext& dev_ctx) const override {
+    for (auto& op : ops_) {
+      op->Run(scope, dev_ctx);
+    }
+  }
  /**
-   * @brief Add an operator to this network.
+   * @brief Add an operator by ptr
   */
-  virtual OpIndex AddOp(const OpProto &def) override;
+  void AddOp(const OperatorPtr& op) override {
+    PADDLE_ENFORCE(!add_op_done_, "Cannot AddOp when this network is sealed");
+    ops_.push_back(op);
+  }
-  /**
+  void CompleteAddOp() override;
-   * @brief Add all optimizer operators related into the network.
-   */
-  virtual void AddOptimizerOps(const OpAttrs &attrs) override;
-  /**
+  std::string DebugString() const override;
-   * @brief Add all backward operators related into the network.
-   */
-  virtual void AddBackwardOps() override;
-  virtual ~PlainNet() override {}
- protected:
+  std::vector<OperatorPtr> ops_;
-  /**
-   * @brief Build the network.
-   *
-   * Create operators accordding to `def`, will be called by the constructor.
-   */
-  void BuildNet(const NetDesc &def);
-  /**
-   * @brief Add an operator into this network.
-   *
-   * Add a operator which is identified as `type` and has attributes described
-   * in `attrs`, the `inputs` are the keys of readonly input variables,
-   * `outputs` are keys of mutable output variables. An `OpIndex` will be
-   * returned to indicate the offset of the new operator in `ops_`.
-   */
-  OpIndex AddOp(const std::string &type, const std::vector<std::string> &inputs,
-                const std::vector<std::string> &outputs,
-                const OpAttrs &attrs = OpAttrs());
 private:
-  // the operators owned by `Network`.
+  bool add_op_done_{false};
-  std::vector<Operator> ops_;
+  // Returns true when `key` is present in `container` (any type offering
+  // find() and end()). Both arguments are taken by const reference so that a
+  // lookup does not copy the whole container.
+  template <typename T, typename KeyType>
+  static bool Contains(const T& container, const KeyType& key) {
+    return container.find(key) != container.end();
+  }
 };
 }  // namespace framework

--- a/paddle/framework/net_op_test.cc
+++ b/paddle/framework/net_op_test.cc
+#include <gtest/gtest.h>
+#include <paddle/framework/net.h>
+#include <paddle/framework/op_registry.h>
+#include <paddle/framework/operator.h>
+namespace pd = paddle::framework;
+static int infer_shape_cnt = 0;
+static int run_cnt = 0;
+class TestOp : public pd::OperatorBase {
+ public:
+  void InferShape(const paddle::framework::ScopePtr& scope) const override {
+    ++infer_shape_cnt;
+  }
+  void Run(const paddle::framework::ScopePtr& scope,
+           const paddle::platform::DeviceContext& dev_ctx) const override {
+    ++run_cnt;
+  }
+};
+template <typename T>
+void AssertSameVectorWithoutOrder(const std::vector<T>& expected,
+                                  const std::vector<T>& actual) {
+  ASSERT_EQ(expected.size(), actual.size());
+  std::unordered_set<T> expected_set;
+  for (auto& tmp : expected) {
+    expected_set.insert(tmp);
+  }
+  for (auto& act : actual) {
+    ASSERT_NE(expected_set.end(), expected_set.find(act));
+  }
+}
+TEST(OpKernel, all) {
+  auto net = std::make_shared<paddle::framework::PlainNet>();
+  ASSERT_NE(net, nullptr);
+  auto op1 = std::make_shared<TestOp>();
+  op1->inputs_ = {"x", "w1", "b1"};
+  op1->outputs_ = {"y"};
+  net->AddOp(op1);
+  auto op2 = std::make_shared<TestOp>();
+  op2->inputs_ = {"y", "w2", "b2"};
+  op2->outputs_ = {"z"};
+  net->AddOp(op2);
+  net->CompleteAddOp();
+  AssertSameVectorWithoutOrder({"x", "w1", "b1", "w2", "b2"}, net->inputs_);
+  AssertSameVectorWithoutOrder({"y", "z"}, net->outputs_);
+  auto tmp_idx_iter = net->attrs_.find("temporary_index");
+  ASSERT_NE(net->attrs_.end(), tmp_idx_iter);
+  auto& tmp_idx = boost::get<std::vector<int>>(tmp_idx_iter->second);
+  ASSERT_EQ(1UL, tmp_idx.size());
+  ASSERT_EQ("y", net->outputs_[tmp_idx[0]]);
+  auto scope = std::make_shared<pd::Scope>();
+  paddle::platform::CPUDeviceContext dev_ctx;
+  net->InferShape(scope);
+  net->Run(scope, dev_ctx);
+  ASSERT_EQ(2, infer_shape_cnt);
+  ASSERT_EQ(2, run_cnt);
+  ASSERT_THROW(net->AddOp(op2), std::runtime_error);
+}
--- a/paddle/framework/op_registry.cc
+++ b/paddle/framework/op_registry.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #include <paddle/framework/op_registry.h>
 namespace paddle {

--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
 #pragma once
 #include <algorithm>
+#include <atomic>
 #include <type_traits>
 #include <unordered_map>
 #include <unordered_set>
@@ -61,7 +62,14 @@ class OpProtoAndCheckerMaker {
  OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
      : proto_(proto), op_checker_(op_checker) {}
-  ~OpProtoAndCheckerMaker() { CheckNoDuplicatedAttrs(); }
+  // Checks that Validate() was called on this maker before it is destroyed.
+  // NOTE(review): if a failed PADDLE_ENFORCE throws, doing so from a
+  // destructor presumably terminates the program (destructors are noexcept by
+  // default since C++11) instead of propagating the error; confirm intended.
+  ~OpProtoAndCheckerMaker() {
+    PADDLE_ENFORCE(validated_, "should call Validate after build");
+  }
+  // Marks the maker as validated, then checks that attribute, input and
+  // output names are all distinct. The flag is set before the check runs, so
+  // the destructor's enforce passes even when the check itself fails.
+  void Validate() {
+    validated_ = true;
+    CheckNoDuplicatedInOutAttrs();
+  }
 protected:
  void AddInput(const std::string& name, const std::string& comment,
@@ -163,19 +171,26 @@ Add a mark to which output is temporary is helpful for future optimization.
    }
  }
-  void CheckNoDuplicatedAttrs() {
+  // Enforces that no name is shared among the proto's attributes, inputs and
+  // outputs. All three groups go through one name set, so a name reused
+  // across groups is rejected as well as one repeated within a group.
+  void CheckNoDuplicatedInOutAttrs() {
    std::unordered_set<std::string> names;
-    size_t cnt = 0;
+    // Fails on the first name already seen, otherwise remembers it.
+    auto checker = [&](const std::string& name) {
+      PADDLE_ENFORCE(!names.count(name), "[%s] is duplicated", name);
+      names.insert(name);
+    };
    for (auto& attr : proto_->attrs()) {
-      names.insert(attr.name());
+      checker(attr.name());
-      ++cnt;
+    }
+    for (auto& input : proto_->inputs()) {
+      checker(input.name());
+    }
+    for (auto& output : proto_->outputs()) {
+      checker(output.name());
    }
-    PADDLE_ENFORCE(names.size() == cnt,
-                   "Cannot register two attribute in same name!");
  }
  OpProto* proto_;
  OpAttrChecker* op_checker_;
+  bool validated_{false};
  bool has_multiple_input_{false};
  bool has_multiple_output_{false};
  bool has_temporary_output_{false};
@@ -183,6 +198,8 @@ Add a mark to which output is temporary is helpful for future optimization.
 class OpRegistry {
  using OpCreator = std::function<OperatorBase*()>;
+  using VarIndexMap = std::unordered_map<std::string, int>;
+  using VarNameList = std::vector<std::string>;
 public:
  template <typename OpType, typename ProtoMakerType>
@@ -190,36 +207,71 @@ class OpRegistry {
    creators()[op_type] = [] { return new OpType; };
    OpProto& op_proto = protos()[op_type];
    OpAttrChecker& op_checker = op_checkers()[op_type];
-    ProtoMakerType(&op_proto, &op_checker);
+    auto maker = ProtoMakerType(&op_proto, &op_checker);
+    maker.Validate();
    *op_proto.mutable_type() = op_type;
    PADDLE_ENFORCE(
        op_proto.IsInitialized(),
        "Fail to initialize %s's OpProto, because %s is not initialized",
        op_type, op_proto.InitializationErrorString());
+    VarIndexMaps()[op_type].reset(new VarIndexMap());
+    auto& varmap = *VarIndexMaps()[op_type];
+    int idx = 0;
+    for (auto& var : op_proto.inputs()) {
+      varmap[var.name()] = idx++;
+    }
+    idx = 0;
+    for (auto& var : op_proto.outputs()) {
+      varmap[var.name()] = idx++;
+    }
+  }
+  /**
+   * @brief Create an operator of the registered `type`.
+   *
+   * Copies `inputs`, `outputs` and `attrs` into the new operator, runs the
+   * attribute checker registered for `type`, gives temporary outputs unique
+   * names, attaches the variable index map recorded for `type` (if any) and
+   * finally calls Init().
+   */
+  static OperatorPtr CreateOp(const std::string& type,
+                              const VarNameList& inputs,
+                              const VarNameList& outputs,
+                              const AttributeMap& attrs) {
+    auto op_create_it = creators().find(type);
+    PADDLE_ENFORCE(op_create_it != creators().end(),
+                   "Operator %s cannot be found", type);
+    // Take ownership immediately: the attribute check and Init() below may
+    // throw, and a raw pointer would leak the operator in that case.
+    OperatorPtr op(op_create_it->second());
+    op->type_ = type;
+    op->inputs_ = inputs;
+    op->outputs_ = outputs;
+    op->attrs_ = attrs;
+    op_checkers().at(type).Check(op->attrs_);
+    GenerateTempVariableName(op.get());
+    {
+      // Share the per-type index map built at registration time, if present.
+      auto var_index_it = VarIndexMaps().find(type);
+      if (var_index_it != VarIndexMaps().end()) {
+        op->in_out_idxs_ = var_index_it->second;
+      }
+    }
+    op->Init();
+    return op;
  }
  static OperatorPtr CreateOp(const OpDesc& op_desc) {
-    std::string op_type = op_desc.type();
+    std::vector<std::string> inputs;
-    OperatorPtr op(creators().at(op_type)());
+    inputs.reserve((size_t)op_desc.inputs_size());
-    op->desc_ = op_desc;
-    op->inputs_.reserve((size_t)op_desc.inputs_size());
    std::copy(op_desc.inputs().begin(), op_desc.inputs().end(),
-              std::back_inserter(op->inputs_));
+              std::back_inserter(inputs));
-    op->outputs_.reserve((size_t)op_desc.outputs_size());
+    std::vector<std::string> outputs;
+    outputs.reserve((size_t)op_desc.outputs_size());
    std::copy(op_desc.outputs().begin(), op_desc.outputs().end(),
-              std::back_inserter(op->outputs_));
+              std::back_inserter(outputs));
+    AttributeMap attrs;
    for (auto& attr : op_desc.attrs()) {
-      op->attrs_[attr.name()] = AttrTypeHelper::GetAttrValue(attr);
+      attrs[attr.name()] = AttrTypeHelper::GetAttrValue(attr);
-    }
-    op_checkers().at(op_type).Check(op->attrs_);
-    op->Init();
-    return op;
    }
- private:
+    return CreateOp(op_desc.type(), inputs, outputs, attrs);
-  static std::unordered_map<std::string, OpCreator>& creators() {
-    static std::unordered_map<std::string, OpCreator> creators_;
-    return creators_;
  }
  static std::unordered_map<std::string, OpProto>& protos() {
@@ -227,6 +279,29 @@ class OpRegistry {
    return protos_;
  };
+ private:
+  static std::unordered_map<std::string, std::shared_ptr<VarIndexMap>>&
+  VarIndexMaps() {
+    static std::unordered_map<std::string, std::shared_ptr<VarIndexMap>> maps_;
+    return maps_;
+  }
+  // Renames every output that is still equal to OperatorBase::TMP_VAR_NAME()
+  // by appending the operator type, an "@" and the next value of a shared
+  // atomic counter. Outputs with any other name are left untouched.
+  static void GenerateTempVariableName(OperatorBase* op) {
+    static std::atomic<size_t> gUniqId(0UL);
+    for (auto& name : op->outputs_) {
+      if (name != OperatorBase::TMP_VAR_NAME()) {
+        continue;
+      }
+      name += op->type_;
+      name += "@";
+      const size_t serial = gUniqId.fetch_add(1);
+      name += std::to_string(serial);
+    }
+  }
+  static std::unordered_map<std::string, OpCreator>& creators() {
+    static std::unordered_map<std::string, OpCreator> creators_;
+    return creators_;
+  }
  static std::unordered_map<std::string, OpAttrChecker>& op_checkers() {
    static std::unordered_map<std::string, OpAttrChecker> op_checkers_;
    return op_checkers_;
@@ -241,12 +316,18 @@ class OpRegisterHelper {
  }
 };
+/**
+ * check if MACRO is used in GLOBAL NAMESPACE.
+ */
 #define STATIC_ASSERT_GLOBAL_NAMESPACE(uniq_name, msg)                        \
  struct __test_global_namespace_##uniq_name##__ {};                          \
  static_assert(std::is_same<::__test_global_namespace_##uniq_name##__,       \
                             __test_global_namespace_##uniq_name##__>::value, \
                msg)
+/**
+ * Macro to Register Operator.
+ */
 #define REGISTER_OP(__op_type, __op_class, __op_maker_class)                 \
  STATIC_ASSERT_GLOBAL_NAMESPACE(__reg_op__##__op_type,                      \
                                 "REGISTER_OP must be in global namespace"); \
@@ -254,27 +335,36 @@ class OpRegisterHelper {
      __op_register_##__op_type##__(#__op_type);                             \
  int __op_register_##__op_type##_handle__() { return 0; }
-#define REGISTER_OP_KERNEL(type, GPU_OR_CPU, PlaceType, KernelType)       \
+/**
+ * Macro to Register OperatorKernel.
+ */
+#define REGISTER_OP_KERNEL(type, DEVICE_TYPE, PlaceType, ...)             \
  STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
-      __reg_op_kernel_##type##_##GPU_OR_CPU##__,                          \
+      __reg_op_kernel_##type##_##DEVICE_TYPE##__,                         \
      "REGISTER_OP_KERNEL must be in global namespace");                  \
  struct __op_kernel_register__##type##__ {                               \
    __op_kernel_register__##type##__() {                                  \
      ::paddle::framework::OperatorWithKernel::OpKernelKey key;           \
      key.place_ = PlaceType();                                           \
      ::paddle::framework::OperatorWithKernel::AllOpKernels()[#type][key] \
-          .reset(new KernelType());                                       \
+          .reset(new __VA_ARGS__());                                      \
    }                                                                     \
  };                                                                      \
  static __op_kernel_register__##type##__ __reg_kernel_##type##__;        \
-  int __op_kernel_register_##type##_handle_##GPU_OR_CPU##__() { return 0; }
+  int __op_kernel_register_##type##_handle_##DEVICE_TYPE##__() { return 0; }
-#define REGISTER_OP_GPU_KERNEL(type, KernelType) \
+// (type, KernelType)
-  REGISTER_OP_KERNEL(type, GPU, ::paddle::platform::GPUPlace, KernelType)
+#define REGISTER_OP_GPU_KERNEL(type, ...) \
+  REGISTER_OP_KERNEL(type, GPU, ::paddle::platform::GPUPlace, __VA_ARGS__)
-#define REGISTER_OP_CPU_KERNEL(type, KernelType) \
+// (type, KernelType)
-  REGISTER_OP_KERNEL(type, CPU, ::paddle::platform::CPUPlace, KernelType)
+#define REGISTER_OP_CPU_KERNEL(type, ...) \
+  REGISTER_OP_KERNEL(type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
+/**
+ * Macro to mark what Operator and Kernel we will use and tell the compiler to
+ * link them into target.
+ */
 #define USE_OP_WITHOUT_KERNEL(op_type)                      \
  STATIC_ASSERT_GLOBAL_NAMESPACE(                           \
      __use_op_without_kernel_##op_type,                    \
@@ -292,15 +382,16 @@ class OpRegisterHelper {
      __attribute__((unused)) =                                           \
          __op_kernel_register_##op_type##_handle_##DEVICE_TYPE##__()
-#ifdef PADDLE_ONLY_CPU
+// use Operator with only cpu kernel.
-#define USE_OP(op_type)           \
+#define USE_OP_CPU(op_type)       \
  USE_OP_WITHOUT_KERNEL(op_type); \
-  USE_OP_KERNEL(op_type, CPU);
+  USE_OP_KERNEL(op_type, CPU)
+#ifdef PADDLE_ONLY_CPU
+#define USE_OP(op_type) USE_OP_CPU(op_type)
 #else
 #define USE_OP(op_type) \
-  USE_OP_WITHOUT_KERNEL(op_type); \
+  USE_OP_CPU(op_type);  \
-  USE_OP_KERNEL(op_type, CPU);    \
  USE_OP_KERNEL(op_type, GPU)
 #endif

--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
 #include "paddle/framework/op_registry.h"
 #include <gtest/gtest.h>
+namespace pd = paddle::framework;
 namespace paddle {
 namespace framework {
 class CosineOp : public OperatorBase {
@@ -28,8 +30,6 @@ class MyTestOp : public OperatorBase {
  void InferShape(const ScopePtr& scope) const override {}
  void Run(const ScopePtr& scope,
           const platform::DeviceContext& dev_ctx) const override {}
- public:
 };
 class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
@@ -91,7 +91,7 @@ TEST(OpRegistry, IllegalAttr) {
  try {
    paddle::framework::OperatorPtr op __attribute__((unused)) =
        paddle::framework::OpRegistry::CreateOp(op_desc);
-  } catch (paddle::platform::EnforceNotMet err) {
+  } catch (std::runtime_error& err) {
    caught = true;
    std::string msg = "larger_than check fail";
    const char* err_msg = err.what();
@@ -138,7 +138,7 @@ TEST(OpRegistry, CustomChecker) {
  try {
    paddle::framework::OperatorPtr op __attribute__((unused)) =
        paddle::framework::OpRegistry::CreateOp(op_desc);
-  } catch (paddle::platform::EnforceNotMet err) {
+  } catch (std::runtime_error& err) {
    caught = true;
    std::string msg = "Attribute 'test_attr' is required!";
    const char* err_msg = err.what();
@@ -157,7 +157,7 @@ TEST(OpRegistry, CustomChecker) {
  try {
    paddle::framework::OperatorPtr op __attribute__((unused)) =
        paddle::framework::OpRegistry::CreateOp(op_desc);
-  } catch (paddle::platform::EnforceNotMet err) {
+  } catch (std::runtime_error& err) {
    caught = true;
    std::string msg = "'test_attr' must be even!";
    const char* err_msg = err.what();
@@ -182,3 +182,35 @@ TEST(OpRegistry, CustomChecker) {
  int test_attr = op->GetAttr<int>("test_attr");
  ASSERT_EQ(test_attr, 4);
 }
+// Proto maker that registers the attribute "scale" twice; the
+// ProtoMaker.DuplicatedAttr test expects Validate() to throw for it.
+class TestAttrProtoMaker : public pd::OpProtoAndCheckerMaker {
+ public:
+  TestAttrProtoMaker(pd::OpProto* proto, pd::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    // The second call repeats the first one on purpose.
+    AddAttr<float>("scale", "scale of test op");
+    AddAttr<float>("scale", "scale of test op");
+  }
+};
+// Validate() must throw when the same attribute name was added twice.
+TEST(ProtoMaker, DuplicatedAttr) {
+  pd::OpProto proto;
+  pd::OpAttrChecker attr_checker;
+  TestAttrProtoMaker maker(&proto, &attr_checker);
+  ASSERT_THROW(maker.Validate(), std::runtime_error);
+}
+// Proto maker that registers the input "input" twice; the
+// ProtoMaker.DuplicatedInOut test expects Validate() to throw for it.
+class TestInOutProtoMaker : public pd::OpProtoAndCheckerMaker {
+ public:
+  TestInOutProtoMaker(pd::OpProto* proto, pd::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    // The second call repeats the first one on purpose.
+    AddInput("input", "input of test op");
+    AddInput("input", "input of test op");
+  }
+};
+// Validate() must throw when the same input name was added twice.
+TEST(ProtoMaker, DuplicatedInOut) {
+  pd::OpProto proto;
+  pd::OpAttrChecker attr_checker;
+  TestInOutProtoMaker maker(&proto, &attr_checker);
+  ASSERT_THROW(maker.Validate(), std::runtime_error);
+}
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -12,30 +12,90 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <algorithm>
 #include "paddle/framework/operator.h"
 namespace paddle {
 namespace framework {
+// CPUPlace specialization: returns the Eigen::DefaultDevice obtained from the
+// device context this KernelContext refers to.
+template <>
+Eigen::DefaultDevice* KernelContext::GetEigenDevice<
+    platform::CPUPlace, Eigen::DefaultDevice>() const {
+  return device_context_.get_eigen_device<Eigen::DefaultDevice>();
+}
+// GPUPlace specialization: returns the Eigen::GpuDevice obtained from the
+// device context. Compiled only when PADDLE_ONLY_CPU is not defined.
+#ifndef PADDLE_ONLY_CPU
+template <>
+Eigen::GpuDevice*
+KernelContext::GetEigenDevice<platform::GPUPlace, Eigen::GpuDevice>() const {
+  return device_context_.get_eigen_device<Eigen::GpuDevice>();
+}
+#endif
+// Returns the variable name bound to the single-variable input argument
+// `name`. Fails the enforce check when `name` is not a known argument.
+const std::string& OperatorBase::Input(const std::string& name) const {
+  auto pos = in_out_idxs_->find(name);
+  PADDLE_ENFORCE(pos != in_out_idxs_->end(), "no key [%s] in in_out_idxs_",
+                 name);
+  if (attrs_.count("input_format") != 0) {
+    // With an "input_format" attribute the argument slot is first translated
+    // into an offset into inputs_.
+    const auto& format = GetAttr<std::vector<int>>("input_format");
+    int offset = format[pos->second];
+    return inputs_.at(offset);
+  }
+  // Without the attribute the argument slot indexes inputs_ directly.
+  return inputs_[pos->second];
+}
+// Returns the names of all variables bound to the multi-variable input
+// argument `name`.
+//
+// The "input_format" attribute holds, for each argument slot, the offset into
+// inputs_ where that argument's variables begin; the following entry is where
+// they end. std::out_of_range is thrown by the at() calls when `name` is
+// unknown or the slot has no following entry in the format.
+std::vector<std::string> OperatorBase::Inputs(const std::string& name) const {
+  // Bind by const reference, as Input() does, instead of copying the whole
+  // format vector on every call.
+  const auto& input_format = GetAttr<std::vector<int>>("input_format");
+  auto offset = in_out_idxs_->at(name);
+  return std::vector<std::string>{
+      inputs_.begin() + input_format.at(offset),
+      inputs_.begin() + input_format.at(offset + 1)};
+}
+// Returns the variable name bound to the single-variable output argument
+// `name`. Fails the enforce check when `name` is not a known argument.
+const std::string& OperatorBase::Output(const std::string& name) const {
+  auto pos = in_out_idxs_->find(name);
+  PADDLE_ENFORCE(pos != in_out_idxs_->end(), "no key [%s] in in_out_idxs_",
+                 name);
+  if (attrs_.count("output_format") != 0) {
+    // With an "output_format" attribute the argument slot is first translated
+    // into an offset into outputs_.
+    const auto& format = GetAttr<std::vector<int>>("output_format");
+    int offset = format[pos->second];
+    return outputs_.at(offset);
+  }
+  // Without the attribute the argument slot indexes outputs_ directly.
+  return outputs_[pos->second];
+}
+// Returns the names of all variables bound to the multi-variable output
+// argument `name`.
+//
+// The "output_format" attribute holds, for each argument slot, the offset into
+// outputs_ where that argument's variables begin; the following entry is where
+// they end. std::out_of_range is thrown by the at() calls when `name` is
+// unknown or the slot has no following entry in the format.
+std::vector<std::string> OperatorBase::Outputs(const std::string& name) const {
+  // Bind by const reference, as Output() does, instead of copying the whole
+  // format vector on every call.
+  const auto& output_format = GetAttr<std::vector<int>>("output_format");
+  auto offset = in_out_idxs_->at(name);
+  return std::vector<std::string>{
+      outputs_.begin() + output_format.at(offset),
+      outputs_.begin() + output_format.at(offset + 1)};
+}
 std::string OperatorBase::DebugString() const {
  std::stringstream ss;
-  ss << "=================\n";
+  ss << "Op(" << type_ << "), inputs:(";
-  ss << "type = " << desc_.type() << "\n";
+  for (size_t i = 0; i < inputs_.size(); ++i) {
-  ss << "inputs = [";
+    ss << inputs_[i];
-  for (auto& ipt : inputs_) {
+    if (i != inputs_.size() - 1) {
-    ss << ipt << ", ";
+      ss << ", ";
+    }
  }
-  ss << "]\n";
+  ss << "), outputs:(";
-  ss << "outputs = [";
+  for (size_t i = 0; i < outputs_.size(); ++i) {
-  for (auto& opt : outputs_) {
+    ss << outputs_[i];
-    ss << opt << ", ";
+    if (i != outputs_.size() - 1) {
+      ss << ", ";
    }
-  ss << "]\n";
-  ss << "attr_keys = [";
-  for (auto& attr : attrs_) {
-    ss << attr.first << ", ";
  }
-  ss << "]\n";
+  ss << ").";
  return ss.str();
 }

--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -14,21 +14,38 @@ limitations under the License. */
 #pragma once
-#include <paddle/framework/attr_checker.h>
-#include <paddle/framework/op_desc.pb.h>
-#include <paddle/framework/scope.h>
-#include <paddle/framework/tensor.h>
-#include <paddle/platform/device_context.h>
-#include <paddle/platform/place.h>
-#include <paddle/utils/Error.h>
 #include <boost/variant.hpp>
 #include <string>
 #include <unordered_map>
 #include <vector>
+#include "paddle/framework/attr_checker.h"
+#include "paddle/framework/op_desc.pb.h"
+#include "paddle/framework/op_proto.pb.h"
+#include "paddle/framework/scope.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/place.h"
+#include "paddle/utils/Error.h"
 namespace paddle {
 namespace framework {
+// Maps a platform place type to its Eigen device type, exposed as the nested
+// alias EigenDeviceType. Only the specializations below are defined.
+template <typename T>
+struct EigenDeviceConverter;
+// CPUPlace -> Eigen::DefaultDevice.
+template <>
+struct EigenDeviceConverter<platform::CPUPlace> {
+  using EigenDeviceType = Eigen::DefaultDevice;
+};
+// GPUPlace -> Eigen::GpuDevice; present only when PADDLE_ONLY_CPU is not
+// defined.
+#ifndef PADDLE_ONLY_CPU
+template <>
+struct EigenDeviceConverter<platform::GPUPlace> {
+  using EigenDeviceType = Eigen::GpuDevice;
+};
+#endif
 class OperatorBase;
 using OperatorPtr = std::shared_ptr<OperatorBase>;
 /**
@@ -39,6 +56,13 @@ using OperatorPtr = std::shared_ptr<OperatorBase>;
 */
 class OperatorBase {
 public:
+  /// If a variable is an empty variable, that name will be used.
+  static std::string EMPTY_VAR_NAME() { return "@EMPTY@"; }
+  /// If a variable is a temporary variable, that name will be set in Python,
+  /// but it will be converted to a unique name in scope after OpCreator.
+  static std::string TMP_VAR_NAME() { return "@TEMP@"; }
  virtual ~OperatorBase() {}
  template <typename T>
@@ -48,7 +72,7 @@ class OperatorBase {
    return boost::get<T>(attrs_.at(name));
  }
-  std::string DebugString() const;
+  virtual std::string DebugString() const;
  /// Init will be called after CreateOperator, you can put some initialization
  /// logic here.
@@ -62,27 +86,29 @@ class OperatorBase {
  virtual void Run(const ScopePtr& scope,
                   const platform::DeviceContext& dev_ctx) const = 0;
- protected:
+  // Get an input with argument's name described in `op_proto`
-  std::string Type() const { return desc_.type(); }
+  const std::string& Input(const std::string& name) const;
+  // Get a input which has multiple variables.
+  // TODO add a vector_view to prevent memory copy.
+  std::vector<std::string> Inputs(const std::string& name) const;
+  // Get an output with argument's name described in `op_proto`
+  const std::string& Output(const std::string& name) const;
+  // Get an output which has multiple variables.
+  // TODO add a vector_view to prevent memory copy.
+  std::vector<std::string> Outputs(const std::string& name) const;
 public:
-  OpDesc desc_;
+  std::string type_;
  std::vector<std::string> inputs_;
  std::vector<std::string> outputs_;
  AttributeMap attrs_;
+  // store the arguments' offset described in op_desc.
+  std::shared_ptr<std::unordered_map<std::string, int>> in_out_idxs_;
 };
-class OpKernel {
+class KernelContext {
- public:
-  /**
-   * KernelContext is the only parameter of Kernel Run function.
-   * Run will get input/output variables, state such as momentum and
-   * device resource such as CUDA stream, cublas handle, etc. from
-   * KernelContext. User should construct it before run the Operator.
-   */
-  class KernelContext {
 public:
-    KernelContext(const OperatorBase* op, const ScopePtr& scope,
+  KernelContext(const OperatorBase* op, const std::shared_ptr<Scope>& scope,
                const platform::DeviceContext& device_context)
      : op_(*op), scope_(scope), device_context_(device_context) {}
@@ -94,10 +120,52 @@ class OpKernel {
    return scope_->GetVariable(op_.outputs_[index]);
  }
+  // Resolves the input argument `name` to a variable name through the
+  // operator and fetches that variable from the scope.
+  const Variable* Input(const std::string& name) const {
+    return scope_->GetVariable(op_.Input(name));
+  }
+  // Resolves the output argument `name` to a variable name through the
+  // operator and fetches that variable from the scope.
+  const Variable* Output(const std::string& name) const {
+    return scope_->GetVariable(op_.Output(name));
+  }
+  // Fetches from the scope every variable bound to the multi-variable input
+  // argument `name`, in the order the operator lists their names.
+  const std::vector<const Variable*> Inputs(const std::string& name) const {
+    auto names = op_.Inputs(name);
+    std::vector<const Variable*> res;
+    // `res` starts empty, so results are appended; writing through
+    // res.begin() would run past the end of the vector's storage.
+    res.reserve(names.size());
+    for (const auto& var_name : names) {
+      res.push_back(scope_->GetVariable(var_name));
+    }
+    return res;
+  }
+  // Fetches from the scope every variable bound to the multi-variable output
+  // argument `name`, in the order the operator lists their names.
+  const std::vector<const Variable*> Outputs(const std::string& name) const {
+    auto names = op_.Outputs(name);
+    std::vector<const Variable*> res;
+    // `res` starts empty, so results are appended; writing through
+    // res.begin() would run past the end of the vector's storage.
+    res.reserve(names.size());
+    for (const auto& var_name : names) {
+      res.push_back(scope_->GetVariable(var_name));
+    }
+    return res;
+  }
+  template <typename PlaceType,
+            typename DeviceType =
+                typename EigenDeviceConverter<PlaceType>::EigenDeviceType>
+  DeviceType* GetEigenDevice() const;
+  platform::Place GetPlace() const { return device_context_.GetPlace(); }
  const OperatorBase& op_;
-    const ScopePtr& scope_;
+  const std::shared_ptr<Scope>& scope_;
  const platform::DeviceContext& device_context_;
-  };
+};
+class OpKernel {
+ public:
+  /**
+   * KernelContext is the only parameter of Kernel Run function.
+   * Run will get input/output variables, state such as momentum and
+   * device resource such as CUDA stream, cublas handle, etc. from
+   * KernelContext. User should construct it before run the Operator.
+   */
  virtual void Compute(const KernelContext& context) const = 0;
@@ -142,8 +210,8 @@ class OperatorWithKernel : public OperatorBase {
  void Run(const ScopePtr& scope,
           const platform::DeviceContext& dev_ctx) const final {
-    auto& opKernel = AllOpKernels().at(Type()).at(OpKernelKey(dev_ctx));
+    auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
-    opKernel->Compute(OpKernel::KernelContext(this, scope, dev_ctx));
+    opKernel->Compute(KernelContext(this, scope, dev_ctx));
  }
  static std::unordered_map<std::string /* op_type */, OpKernelMap>&
@@ -151,6 +219,7 @@ class OperatorWithKernel : public OperatorBase {
    static std::unordered_map<std::string, OpKernelMap> g_all_op_kernels;
    return g_all_op_kernels;
  }
  void InferShape(const std::shared_ptr<Scope>& scope) const final {
    std::vector<const Tensor*> ins;
    VarNamesToTensors(scope, inputs_, &ins);

--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -19,14 +19,17 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
-class OperatorTest : public OperatorBase {
+static int op_run_num = 0;
+class OpWithoutKernelTest : public OperatorBase {
 public:
  void Init() override { x = 1; }
  void InferShape(const ScopePtr& scope) const override {}
  void Run(const ScopePtr& scope,
           const platform::DeviceContext& dev_ctx) const override {
-    float scale = GetAttr<float>("scale");
+    op_run_num++;
-    ASSERT_NEAR(scale, 3.14, 1e-5);
+    ASSERT_EQ((int)inputs_.size(), 1);
+    ASSERT_EQ((int)outputs_.size(), 1);
    ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr);
    ASSERT_EQ(x, 1);
    ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr);
@@ -36,15 +39,14 @@ class OperatorTest : public OperatorBase {
  float x = 0;
 };
-class OperatorTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+class OpeWithoutKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 public:
-  OperatorTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+  OpeWithoutKernelTestProtoAndCheckerMaker(OpProto* proto,
+                                           OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("input", "input of test op");
    AddOutput("output", "output of test op");
-    AddAttr<float>("scale", "scale of cosine op")
+    AddAttr<float>("scale", "scale of cosine op");
-        .SetDefault(1.0)
-        .LargerThan(0.0);
    AddComment("This is test op");
  }
 };
@@ -52,8 +54,8 @@ class OperatorTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 }  // namespace framework
 }  // namespace paddle
-REGISTER_OP(test_operator, paddle::framework::OperatorTest,
+REGISTER_OP(test_operator, paddle::framework::OpWithoutKernelTest,
-            paddle::framework::OperatorTestProtoAndCheckerMaker);
+            paddle::framework::OpeWithoutKernelTestProtoAndCheckerMaker);
 TEST(OperatorBase, all) {
  paddle::framework::OpDesc op_desc;
@@ -63,18 +65,17 @@ TEST(OperatorBase, all) {
  auto attr = op_desc.mutable_attrs()->Add();
  attr->set_name("scale");
  attr->set_type(paddle::framework::AttrType::FLOAT);
-  float scale = 3.14;
+  attr->set_f(3.14);
-  attr->set_f(scale);
  paddle::platform::CPUDeviceContext device_context;
  auto scope = std::make_shared<paddle::framework::Scope>();
  paddle::framework::OperatorPtr op =
      paddle::framework::OpRegistry::CreateOp(op_desc);
-  ASSERT_EQ(op->GetAttr<float>("scale"), scale);
  scope->CreateVariable("OUT1");
+  ASSERT_EQ(paddle::framework::op_run_num, 0);
  op->Run(scope, device_context);
-  std::cout << op->DebugString() << std::endl;
+  ASSERT_EQ(paddle::framework::op_run_num, 1);
 }
 namespace paddle {
@@ -84,8 +85,8 @@ class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 public:
  OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("input", "input of test op");
+    AddInput("x", "input of test op");
-    AddOutput("output", "output of test op");
+    AddOutput("y", "output of test op");
    AddAttr<float>("scale", "scale of cosine op")
        .SetDefault(1.0)
        .LargerThan(0.0);
@@ -93,19 +94,76 @@ class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
  }
 };
+static int cpu_kernel_run_num = 0;
 class OpWithKernelTest : public OperatorWithKernel {
 protected:
  void InferShape(const std::vector<const Tensor*>& inputs,
                  const std::vector<Tensor*>& outputs) const override {}
 };
+template <typename T1, typename T2>
 class CPUKernelTest : public OpKernel {
 public:
-  void Compute(const KernelContext& context) const {
+  void Compute(const KernelContext& ctx) const {
-    float scale = context.op_.GetAttr<float>("scale");
-    ASSERT_NEAR(scale, 3.14, 1e-5);
    std::cout << "this is cpu kernel" << std::endl;
-    std::cout << context.op_.DebugString() << std::endl;
+    std::cout << ctx.op_.DebugString() << std::endl;
+    cpu_kernel_run_num++;
+    ASSERT_EQ(ctx.op_.Input("x"), "IN1");
+    ASSERT_EQ(ctx.op_.Output("y"), "OUT1");
+  }
+};
+// multiple inputs test
+// Operator without a kernel whose Run() asserts on the scope contents and on
+// the name lookups done through Input().
+class OperatorMultiInputsTest : public OperatorBase {
+ public:
+  // Sets the marker that Run() later checks.
+  void Init() override { x = 1; }
+  void InferShape(const std::shared_ptr<Scope>& scope) const override {}
+  void Run(const std::shared_ptr<Scope>& scope,
+           const platform::DeviceContext& dev_ctx) const override {
+    // The first input is expected to be absent from the scope, the first
+    // output to be present.
+    ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr);
+    ASSERT_EQ(x, 1);
+    ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr);
+    ASSERT_EQ(Input("x"), "IN1");
+    // NOTE(review): "y" is looked up with Input() but compared to "OUT1";
+    // possibly Output("y") was intended -- confirm.
+    ASSERT_EQ(Input("y"), "OUT1");
+  }
+ public:
+  float x = 0;
+};
+// Proto maker for the multi-input test op. It declares a multi-variable input
+// "xs", a single input "k", a multi-variable output "ys" and a float
+// attribute "scale" (default 1.0, checked with LargerThan(0.0)).
+class OpKernelTestMultiInputsProtoAndCheckerMaker
+    : public OpProtoAndCheckerMaker {
+ public:
+  OpKernelTestMultiInputsProtoAndCheckerMaker(OpProto* proto,
+                                              OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInputs("xs", "inputs of test op");
+    AddInput("k", "input of test op");
+    AddOutputs("ys", "outputs of test op");
+    AddAttr<float>("scale", "scale of cosine op")
+        .SetDefault(1.0)
+        .LargerThan(0.0);
+    AddComment("This is test op");
+  }
+};
+// Kernel that checks the operator's argument-name lookups against the
+// variable names the multi_inputs test puts into the OpDesc.
+class CPUKernalMultiInputsTest : public OpKernel {
+ public:
+  void Compute(const KernelContext& ctx) const {
+    // "xs" must resolve to the three variable names x0, x1, x2.
+    auto xs = ctx.op_.Inputs("xs");
+    ASSERT_EQ(xs.size(), 3UL);
+    ASSERT_EQ(xs[0], "x0");
+    ASSERT_EQ(xs[1], "x1");
+    ASSERT_EQ(xs[2], "x2");
+    // "k" is a single input and must resolve to k0.
+    auto k = ctx.op_.Input("k");
+    ASSERT_EQ(k, "k0");
+    // "ys" must resolve to the two variable names y0, y1.
+    auto ys = ctx.op_.Outputs("ys");
+    ASSERT_EQ(ys.size(), 2UL);
+    ASSERT_EQ(ys[0], "y0");
+    ASSERT_EQ(ys[1], "y1");
  }
 };
@@ -114,8 +172,10 @@ class CPUKernelTest : public OpKernel {
 REGISTER_OP(op_with_kernel, paddle::framework::OpWithKernelTest,
            paddle::framework::OpKernelTestProtoAndCheckerMaker);
-REGISTER_OP_CPU_KERNEL(op_with_kernel, paddle::framework::CPUKernelTest);
+REGISTER_OP_CPU_KERNEL(op_with_kernel,
+                       paddle::framework::CPUKernelTest<float, float>);
+// test with single input
 TEST(OpKernel, all) {
  paddle::framework::OpDesc op_desc;
  op_desc.set_type("op_with_kernel");
@@ -131,5 +191,51 @@ TEST(OpKernel, all) {
  paddle::framework::OperatorPtr op =
      paddle::framework::OpRegistry::CreateOp(op_desc);
+  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0);
+  op->Run(scope, cpu_device_context);
+  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
+}
+REGISTER_OP(op_multi_inputs_with_kernel, paddle::framework::OpWithKernelTest,
+            paddle::framework::OpKernelTestMultiInputsProtoAndCheckerMaker);
+REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
+                       paddle::framework::CPUKernalMultiInputsTest);
+// test with multi inputs
+// Builds an OpDesc with four inputs and two outputs plus the
+// "input_format"/"output_format" offset attributes, then runs the op so that
+// the registered kernel can check the name lookups.
+TEST(OpKernel, multi_inputs) {
+  using namespace paddle::framework;
+  OpDesc op_desc;
+  op_desc.set_type("op_multi_inputs_with_kernel");
+  *op_desc.mutable_inputs()->Add() = "x0";
+  *op_desc.mutable_inputs()->Add() = "x1";
+  *op_desc.mutable_inputs()->Add() = "x2";
+  *op_desc.mutable_inputs()->Add() = "k0";
+  *op_desc.mutable_outputs()->Add() = "y0";
+  *op_desc.mutable_outputs()->Add() = "y1";
+  auto attr = op_desc.mutable_attrs()->Add();
+  attr->set_name("scale");
+  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_f(3.14);
+  auto attr0 = op_desc.mutable_attrs()->Add();
+  attr0->set_name("input_format");
+  attr0->set_type(paddle::framework::AttrType::INTS);
+  auto input_format = attr0->mutable_ints();
+  input_format->Add(0);  // xs begins at inputs[0]
+  input_format->Add(3);  // k begins at inputs[3]
+  input_format->Add(4);  // end
+  auto attr1 = op_desc.mutable_attrs()->Add();
+  attr1->set_name("output_format");
+  attr1->set_type(paddle::framework::AttrType::INTS);
+  auto output_format = attr1->mutable_ints();
+  output_format->Add(0);  // ys begins at outputs[0]
+  output_format->Add(2);  // end
+  paddle::platform::CPUDeviceContext cpu_device_context;
+  auto scope = std::make_shared<Scope>();
+  OperatorPtr op(paddle::framework::OpRegistry::CreateOp(op_desc));
  op->Run(scope, cpu_device_context);
 }
--- a/paddle/framework/tensor.cc
+++ b/paddle/framework/tensor.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <paddle/framework/tensor.h>
+namespace paddle {
+namespace framework {}
+}  // namespace paddle
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -17,19 +17,26 @@ limitations under the License. */
 #include <cstdint>
 #include <cstring>
 #include <memory>
+#include <typeindex>
 #include "paddle/framework/ddim.h"
+#include "paddle/framework/tensor_types.h"
 #include "paddle/memory/memory.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
+#include "unsupported/Eigen/CXX11/Tensor"
 namespace paddle {
+namespace pybind {
+namespace details {  // forward declare
+template <bool less, size_t i, typename... args>
+struct CastToPyBufferImpl;
+}  // namespace details
+}  // namespace pybind
 namespace framework {
 class Tensor {
 public:
-  Tensor() : numel_(0), offset_(0) {}
+  Tensor() : offset_(0) {}
-  Tensor& operator=(const Tensor& src) = delete;
  template <typename T>
  const T* data() const {
@@ -39,27 +46,106 @@ class Tensor {
  }
  template <typename T>
-  T* mutable_data(DDim dims, paddle::platform::Place place) {
+  // Returns a mutable pointer to the tensor's data: the holder's base address
+  // advanced by offset_ bytes. CheckDims<T>() runs first to verify that
+  // memory is held and is large enough.
+  T* raw_data() const {
+    CheckDims<T>();
+    const uintptr_t base = reinterpret_cast<uintptr_t>(holder_->ptr());
+    return reinterpret_cast<T*>(base + offset_);
+  }
+  template <typename T>
+  T* mutable_data(DDim dims, platform::Place place) {
    set_dims(dims);
    return mutable_data<T>(place);
  }
  template <typename T>
-  T* mutable_data(paddle::platform::Place place) {
+  T* mutable_data(platform::Place place) {
-    PADDLE_ENFORCE(numel_ > 0,
+    PADDLE_ENFORCE(product(dims_) > 0,
-                   "Tensor::numel_ must be larger than zero to call "
+                   "Tensor's numel must be larger than zero to call "
                   "Tensor::mutable_data. Call Tensor::set_dim first.");
    if (holder_ == nullptr ||
        !(holder_->place() ==
          place) /* some versions of boost::variant don't have operator!= */
-        || holder_->size() < numel_ * sizeof(T) + offset_) {
+        || holder_->size() < product(dims_) * sizeof(T) + offset_) {
-      holder_.reset(new PlaceholderImpl<T>(place, numel_ * sizeof(T)));
+      if (platform::is_cpu_place(place)) {
+        holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
+            boost::get<platform::CPUPlace>(place), product(dims_) * sizeof(T)));
+      } else if (platform::is_gpu_place(place)) {
+#ifdef PADDLE_ONLY_CPU
+        PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
+#else
+        holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
+            boost::get<platform::GPUPlace>(place), product(dims_) * sizeof(T)));
+#endif
+      } else {
+        PADDLE_THROW("Unknown 'place'.");
+      }
      offset_ = 0;
    }
    return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                offset_);
  }
+  // Returns an Eigen tensor map of rank NDIMS over this tensor's memory, using
+  // `new_dims` (converted with ToEigenDSizes) as the map's dimensions.
+  template <typename T, size_t NDIMS>
+  typename TTypes<T, NDIMS>::Tensor shaped(DDim new_dims) {
+    Eigen::array<Eigen::DenseIndex, NDIMS> dims =
+        paddle::framework::ToEigenDSizes<NDIMS>(new_dims);
+    return typename TTypes<T, NDIMS>::Tensor(raw_data<T>(), dims);
+  }
+  // Returns an Eigen tensor map of rank NDIMS over this tensor's memory, using
+  // the tensor's own dims_ (converted with ToEigenDSizes) as dimensions.
+  template <typename T, size_t NDIMS>
+  typename TTypes<T, NDIMS>::Tensor tensor() {
+    return typename TTypes<T, NDIMS>::Tensor(
+        raw_data<T>(), paddle::framework::ToEigenDSizes<NDIMS>(dims_));
+  }
+  // flat to rank = 1
+  // Views the whole tensor as a rank-1 map whose single dimension is
+  // product(dims_).
+  template <typename T>
+  typename TTypes<T>::Flat flat() {
+    return shaped<T, 1>(make_ddim({static_cast<int>(product(dims_))}));
+  }
+  // to TensorType Vec
+  // Forwards to tensor<T, 1>(), i.e. uses dims_ as-is with rank 1.
+  template <typename T>
+  typename TTypes<T>::Vec vec() {
+    return tensor<T, 1>();
+  }
+  // to TensorType Matrix
+  // Forwards to tensor<T, 2>(), i.e. uses dims_ as-is with rank 2.
+  template <typename T>
+  typename TTypes<T>::Matrix matrix() {
+    return tensor<T, 2>();
+  }
+  // const versions of all the methods above.
+  // Const overload of shaped(): returns a read-only Eigen tensor map of rank
+  // NDIMS over this tensor's memory with dimensions `new_dims`.
+  //
+  // data<T>() yields a `const T*`, so the result must be the ConstTensor map;
+  // the mutable Tensor map cannot be built from a pointer-to-const.
+  template <typename T, size_t NDIMS>
+  typename TTypes<T, NDIMS>::ConstTensor shaped(DDim new_dims) const {
+    Eigen::array<Eigen::DenseIndex, NDIMS> dims =
+        paddle::framework::ToEigenDSizes<NDIMS>(new_dims);
+    return typename TTypes<T, NDIMS>::ConstTensor(data<T>(), dims);
+  }
+  // Const overload of tensor(): returns a read-only Eigen tensor map of rank
+  // NDIMS over this tensor's memory, using dims_ as dimensions.
+  //
+  // The read-only map type declared by TTypes is `ConstTensor`; it is used for
+  // both the return type and the constructed value because data<T>() yields a
+  // `const T*`.
+  template <typename T, size_t NDIMS>
+  typename TTypes<T, NDIMS>::ConstTensor tensor() const {
+    return typename TTypes<T, NDIMS>::ConstTensor(
+        data<T>(), paddle::framework::ToEigenDSizes<NDIMS>(dims_));
+  }
+  // Const flat(): rank-1 view whose single dimension is product(dims_),
+  // obtained through the const shaped() overload.
+  template <typename T>
+  typename TTypes<T>::ConstFlat flat() const {
+    return shaped<T, 1>(make_ddim({static_cast<int>(product(dims_))}));
+  }
+  // Const vec(): forwards to the const tensor<T, 1>().
+  template <typename T>
+  typename TTypes<T>::ConstVec vec() const {
+    return tensor<T, 1>();
+  }
+  // Const matrix(): forwards to the const tensor<T, 2>().
+  template <typename T>
+  typename TTypes<T>::ConstMatrix matrix() const {
+    return tensor<T, 2>();
+  }
  template <typename T>
  void ShareDataFrom(const Tensor& src) {
    src.CheckDims<T>();
@@ -69,12 +155,12 @@ class Tensor {
  }
  template <typename T>
-  void CopyFrom(const Tensor& src, paddle::platform::Place dst_place) {
+  void CopyFrom(const Tensor& src, platform::Place dst_place) {
    PADDLE_ENFORCE(platform::is_cpu_place(src.holder_->place()) &&
                       platform::is_cpu_place(dst_place),
                   "Tensor::CopyFrom only support CPU now.");
    src.CheckDims<T>();
-    size_t size = src.numel_ * sizeof(T);
+    size_t size = product(src.dims_) * sizeof(T);
    set_dims(src.dims());
    const void* src_ptr = static_cast<const void*>(src.data<T>());
    void* dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
@@ -108,7 +194,6 @@ class Tensor {
      return;
    }
    dims_ = dims;
-    numel_ = product(dims_);
  }
  DDim dims() const { return dims_; }
@@ -119,37 +204,38 @@ class Tensor {
  struct Placeholder {
    virtual ~Placeholder() {}
    virtual void* ptr() const = 0;
-    virtual paddle::platform::Place place() const = 0;
+    virtual platform::Place place() const = 0;
    virtual size_t size() const = 0;
+    virtual std::type_index type() const = 0;
  };
-  template <typename T>
+  template <typename T, typename PlaceType>
  struct PlaceholderImpl : public Placeholder {
   private:
+    template <typename PType>
    class Deleter {
     public:
-      Deleter(platform::Place place) : place_(place) {}
+      Deleter(PType place) : place_(place) {}
-      void operator()(T* ptr) {
+      void operator()(T* ptr) { memory::Free(place_, static_cast<void*>(ptr)); }
-        paddle::memory::Free(place_, static_cast<void*>(ptr));
-      }
     private:
-      paddle::platform::Place place_;
+      PType place_;
    };
   public:
-    PlaceholderImpl(paddle::platform::Place place, size_t size)
+    PlaceholderImpl(PlaceType place, size_t size)
-        : ptr_(static_cast<T*>(paddle::memory::Alloc(place, size)),
+        : ptr_(static_cast<T*>(memory::Alloc(place, size)),
-               Deleter(place)),
+               Deleter<PlaceType>(place)),
          place_(place),
          size_(size) {}
    virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
    virtual size_t size() const { return size_; }
    virtual paddle::platform::Place place() const { return place_; }
+    virtual std::type_index type() const { return std::type_index(typeid(T)); }
-    std::unique_ptr<T, Deleter> ptr_;
+    std::unique_ptr<T, Deleter<PlaceType>> ptr_;
-    paddle::platform::Place place_;  // record the place of ptr_.
+    platform::Place place_;  // record the place of ptr_.
    size_t size_;            // size of the memory block.
  };
@@ -157,15 +243,16 @@ class Tensor {
  inline void CheckDims() const {
    PADDLE_ENFORCE(holder_ != nullptr,
                   "Tenosr holds no memory. Call Tensor::mutable_data first.");
-    PADDLE_ENFORCE(holder_->size() >= numel_ * sizeof(T) + offset_,
+    PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_,
                   "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
                   "first to re-allocate memory.");
  }
  std::shared_ptr<Placeholder> holder_;  // holds the memory block if allocated.
  DDim dims_;
-  size_t numel_;   // cache of `product(dims_)`
  size_t offset_;  // marks the begin of tensor data area.
+  template <bool less, size_t i, typename... args>
+  friend struct paddle::pybind::details::CastToPyBufferImpl;
 };
 }  // namespace framework

--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -33,7 +33,7 @@ TEST(Tensor, DataAssert) {
  bool caught = false;
  try {
    src_tensor.data<double>();
-  } catch (paddle::platform::EnforceNotMet err) {
+  } catch (std::runtime_error& err) {
    caught = true;
    std::string msg =
        "Tenosr holds no memory. Call Tensor::mutable_data first.";
@@ -47,7 +47,7 @@ TEST(Tensor, DataAssert) {
 /* following tests are not available at present
   because Memory::Alloc() and Memory::Free() have not been ready.
+*/
 TEST(Tensor, MutableData) {
  using namespace paddle::framework;
  using namespace paddle::platform;
@@ -72,7 +72,7 @@ TEST(Tensor, MutableData) {
    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CPUPlace());
    EXPECT_EQ(p1, p2);
  }
+#ifdef __CUDACC__
  {
    Tensor src_tensor;
    float* p1 = nullptr;
@@ -94,6 +94,7 @@ TEST(Tensor, MutableData) {
    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), GPUPlace());
    EXPECT_EQ(p1, p2);
  }
+#endif
 }
 TEST(Tensor, ShareDataFrom) {
@@ -106,11 +107,13 @@ TEST(Tensor, ShareDataFrom) {
    bool caught = false;
    try {
      dst_tensor.ShareDataFrom<float>(src_tensor);
-    } catch (EnforceNotMet err) {
+    } catch (std::runtime_error& err) {
      caught = true;
-      std::string msg = "Tenosr holds no memory. Call Tensor::mutable_data
+      std::string msg =
-first."; const char* what = err.what(); for (size_t i = 0; i < msg.length();
+          "Tenosr holds no memory. Call Tensor::mutable_data first.";
-++i) { ASSERT_EQ(what[i], msg[i]);
+      const char* what = err.what();
+      for (size_t i = 0; i < msg.length(); ++i) {
+        ASSERT_EQ(what[i], msg[i]);
      }
    }
    ASSERT_TRUE(caught);
@@ -120,6 +123,7 @@ first."; const char* what = err.what(); for (size_t i = 0; i < msg.length();
    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
  }
+#ifdef __CUDACC__
  {
    Tensor src_tensor;
    Tensor dst_tensor;
@@ -127,6 +131,7 @@ first."; const char* what = err.what(); for (size_t i = 0; i < msg.length();
    dst_tensor.ShareDataFrom<int>(src_tensor);
    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
  }
+#endif
 }
 TEST(Tensor, Slice) {
@@ -155,6 +160,7 @@ TEST(Tensor, Slice) {
    EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
  }
+#ifdef __CUDACC__
  {
    Tensor src_tensor;
    src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
@@ -176,6 +182,7 @@ TEST(Tensor, Slice) {
    EXPECT_EQ(slice_data_address, slice_mutable_data_address);
    EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
  }
+#endif
 }
 TEST(Tensor, CopyFrom) {
@@ -203,4 +210,3 @@ TEST(Tensor, CopyFrom) {
    EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
  }
 }
-*/
\ No newline at end of file
--- a/paddle/framework/tensor_types.h
+++ b/paddle/framework/tensor_types.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "unsupported/Eigen/CXX11/Tensor"
+namespace paddle {
+namespace framework {
+// Collection of Eigen::TensorMap aliases for tensors whose scalar type is T.
+//
+// Every alias is row-major, indexed by IndexType and declared
+// Eigen::Aligned. Each mutable alias has a Const* twin that maps `const T`.
+template <typename T, int NDIMS = 1, typename IndexType = Eigen::DenseIndex>
+struct TTypes {
+  // Rank-<NDIMS> tensor of scalar type T.
+  using Tensor =
+      Eigen::TensorMap<Eigen::Tensor<T, NDIMS, Eigen::RowMajor, IndexType>,
+                       Eigen::Aligned>;
+  using ConstTensor = Eigen::TensorMap<
+      Eigen::Tensor<const T, NDIMS, Eigen::RowMajor, IndexType>,
+      Eigen::Aligned>;
+  // Scalar tensor (a fixed-size tensor with empty Eigen::Sizes<>) of type T.
+  using Scalar = Eigen::TensorMap<
+      Eigen::TensorFixedSize<T, Eigen::Sizes<>, Eigen::RowMajor, IndexType>,
+      Eigen::Aligned>;
+  using ConstScalar = Eigen::TensorMap<
+      Eigen::TensorFixedSize<const T, Eigen::Sizes<>, Eigen::RowMajor,
+                             IndexType>,
+      Eigen::Aligned>;
+  // Rank-1 tensor of scalar type T.
+  using Flat =
+      Eigen::TensorMap<Eigen::Tensor<T, 1, Eigen::RowMajor, IndexType>,
+                       Eigen::Aligned>;
+  using ConstFlat = Eigen::TensorMap<
+      Eigen::Tensor<const T, 1, Eigen::RowMajor, IndexType>, Eigen::Aligned>;
+  // Vec and ConstVec name exactly the same rank-1 types as Flat and ConstFlat.
+  using Vec = Flat;
+  using ConstVec = ConstFlat;
+  // Rank-2 tensor (matrix) of scalar type T.
+  using Matrix =
+      Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, IndexType>,
+                       Eigen::Aligned>;
+  using ConstMatrix = Eigen::TensorMap<
+      Eigen::Tensor<const T, 2, Eigen::RowMajor, IndexType>, Eigen::Aligned>;
+};
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -11,7 +11,6 @@ if(WITH_GPU)
 endif()
 if(USE_NNPACK)
-  include(nnpack/nnpack.cmake)
  list(APPEND cpp_files nnpack/NNPACKConvOp.cpp)
  if(WITH_TESTING)
    add_unittest(NNPACKConvOpTest nnpack/NNPACKConvOpTest.cpp)

--- a/paddle/function/GemmConvOp.cpp
+++ b/paddle/function/GemmConvOp.cpp
@@ -117,8 +117,7 @@ public:
    ConvFunctionBase::init(config);
  }
-  virtual void check(const BufferArgs& inputs,
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-                     const BufferArgs& outputs) override {
    const TensorShape& input = inputs[0].shape();
    const TensorShape& filter = inputs[1].shape();
    const TensorShape& output = outputs[0].shape();
@@ -217,8 +216,7 @@ public:
    ConvFunctionBase::init(config);
  }
-  virtual void check(const BufferArgs& inputs,
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-                     const BufferArgs& outputs) override {
    const TensorShape& output = inputs[0].shape();
    const TensorShape& filter = inputs[1].shape();
    const TensorShape& input = outputs[0].shape();
@@ -311,8 +309,7 @@ public:
    ConvFunctionBase::init(config);
  }
-  virtual void check(const BufferArgs& inputs,
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-                     const BufferArgs& outputs) override {
    const TensorShape& output = inputs[0].shape();
    const TensorShape& input = inputs[1].shape();
    const TensorShape& filter = outputs[0].shape();

--- a/paddle/function/NaiveConvOp.cpp
+++ b/paddle/function/NaiveConvOp.cpp
@@ -90,8 +90,7 @@ public:
    ConvFunctionBase::init(config);
  }
-  virtual void check(const BufferArgs& inputs,
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
-                     const BufferArgs& outputs) override {
    const TensorShape& input = inputs[0].shape();
    const TensorShape& filter = inputs[1].shape();
    const TensorShape& output = outputs[0].shape();

--- a/paddle/function/RowConvOpGpu.cu
+++ b/paddle/function/RowConvOpGpu.cu
@@ -144,12 +144,15 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
      int yoff = start + j;
      // transpose
-      sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
+      sh_x[tidx][tidy] = (xoff < width && yoff < end) ?
-      sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0;
+      x[yoff * width + xoff] : 0.0;
+      sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ?
+      dy[yoff * width + xoff] : 0.0;
      __syncthreads();
      if (tidy < (context - 1)) {
        yoff = yoff - context + 1;
-        sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0;
+        sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ?
+        dy[yoff * width + xoff] : 0.0;
      }
      __syncthreads();
@@ -199,11 +202,13 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
      int yoff = start + j;
      // transpose
-      sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
+      sh_x[tidx][tidy] = (xoff < width && yoff < end) ?
+      x[yoff * width + xoff] : 0.0;
      __syncthreads();
      for (int t = 0; t < context; t++) {
-        sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start && yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0;
+        sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start &&
+        yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0;
        __syncthreads();
        real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx];

--- a/paddle/function/nnpack/NNPACKConvOp.cpp
+++ b/paddle/function/nnpack/NNPACKConvOp.cpp
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/function/ConvOp.h"
 DEFINE_bool(nnpack_allocate_outside,
-            false,
+            true,
            "Allocate and free workspace memory outside the NNPACK interface.");
 DEFINE_int32(nnpack_num_threads,
             0,
@@ -58,18 +58,10 @@ public:
    workspaceBuffer_ = nullptr;
    workspaceSize_ = 0;
-    threadpool_ = nullptr;
+    create_nnpack_threadpool();
-    if (FLAGS_nnpack_num_threads) {
-      threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads);
-      VLOG(3) << "Number of threads "
-              << pthreadpool_get_threads_count(threadpool_);
-    }
  }
  ~NNPACKConvFunction() {
-    if (threadpool_) {
-      pthreadpool_destroy(threadpool_);
-    }
    if (workspaceBuffer_) {
      free(workspaceBuffer_);
    }
@@ -225,14 +217,25 @@ public:
    }
  }
+  // Creates the thread pool stored in the static member threadpool_.
+  //
+  // Returns without doing anything when FLAGS_nnpack_num_threads is zero or
+  // when threadpool_ is already non-null. Otherwise a pool sized by the flag
+  // is created and its thread count is logged at VLOG(3).
+  static void create_nnpack_threadpool() {
+    const bool pool_requested = FLAGS_nnpack_num_threads != 0;
+    if (!pool_requested || threadpool_ != nullptr) {
+      return;
+    }
+    threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads);
+    VLOG(3) << "Number of threads "
+            << pthreadpool_get_threads_count(threadpool_);
+  }
 private:
  nnp_convolution_algorithm algorithm_;
  nnp_convolution_transform_strategy transform_strategy_;
  void* workspaceBuffer_;
  size_t workspaceSize_;
-  pthreadpool_t threadpool_;
+  static pthreadpool_t threadpool_;
 };
+template <DeviceType Device>
+pthreadpool_t NNPACKConvFunction<Device>::threadpool_ = nullptr;
 REGISTER_TYPED_FUNC(NNPACKConv, CPU, NNPACKConvFunction);
 }  // namespace paddle
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -403,7 +403,7 @@ public:
      : layerName_(layerName) {
    addEvaluator(std::move(evaluator));
  }
-  virtual void eval(const NeuralNetwork& nn) override {
+  void eval(const NeuralNetwork& nn) override {
    const LayerPtr& layer = nn.getLayer(layerName_);
    CHECK(layer) << "Nonexisted layer: " << layerName_ << " in submodel "
                 << nn.getName();

--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -636,7 +636,7 @@ void lenToStarts(std::vector<int>& starts) {
  }
  starts.back() = pos;
 }
-}
+}  // namespace
 void RecurrentGradientMachine::calcSequenceStartPositions() {
  std::vector<int> starts(commonSeqInfo_.size() + 1);

--- a/paddle/gserver/layers/AgentLayer.cpp
+++ b/paddle/gserver/layers/AgentLayer.cpp
@@ -124,7 +124,7 @@ void copyElements(const IVector& srcVec,
    dest[index[i]] = src[i];
  }
 }
-}
+}  // namespace
 void GatherAgentLayer::forwardIds(PassType passType) {
  IVectorPtr realId = realLayers_[0]->getOutputLabel();

--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -359,12 +359,11 @@ void Layer::backwardActivation() {
  /* Do error clipping */
  if (config_.error_clipping_threshold() > 0.0f) {
    if (FLAGS_log_error_clipping) {
-      CpuVector outGradVec(0, nullptr);
+      VectorPtr outGradVec = Vector::create(
-      outGradVec.subVecFrom(
+          output_.grad->getData(), output_.grad->getElementCnt(), useGpu_);
-          output_.grad->getData(), 0, output_.grad->getElementCnt());
+      real maxAbsGrad = outGradVec->getAbsMax();
-      real maxAbsGrad = outGradVec.getAbsMax();
      if (maxAbsGrad > config_.error_clipping_threshold()) {
-        real avgAbsGrad = outGradVec.getAbsSum() / outGradVec.getSize();
+        real avgAbsGrad = outGradVec->getAbsSum() / outGradVec->getSize();
        LOG(INFO) << " layer=" << config_.name() << " need clipping,"
                  << " max error=" << maxAbsGrad << " avg error=" << avgAbsGrad;
      }

--- a/paddle/memory/detail/memory_block.cc
+++ b/paddle/memory/detail/memory_block.cc
@@ -152,6 +152,6 @@ MemoryBlock* MemoryBlock::metadata() const {
      reinterpret_cast<const Metadata*>(this) - 1));
 }
-}  // detail
+}  // namespace detail
-}  // memory
+}  // namespace memory
-}  // paddle
+}  // namespace paddle
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -17,8 +17,6 @@ limitations under the License. */
 #include "paddle/memory/detail/system_allocator.h"
 #include "paddle/platform/assert.h"
-#include <boost/variant.hpp>
 namespace paddle {
 namespace memory {

--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
-if(WITH_GPU)
+function(op_library TARGET)
-    nv_library(add_op SRCS add_op.cc add_op.cu DEPS operator op_registry glog ddim)
+    # op_library creates an operator library. Its interface is the same as
-else()
+    # cc_library's, but it splits SRCS into CPU (.cc) and GPU (.cu) sources
-    cc_library(add_op SRCS add_op.cc DEPS operator op_registry glog ddim)
+    # and always appends the dependencies common to every op.
-endif()
+    #
+    # Arguments:
+    #   TARGET  name of the library to create
+    #   SRCS    source files; each must end in .cc or .cu, at least one .cc
+    #   DEPS    extra dependencies, listed in front of the common ones
+    #
+    # Example: op_library(add_op SRCS add_op.cc add_op.cu)
+    set(cc_srcs)
+    set(cu_srcs)
+    set(op_common_deps operator op_registry)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(op_library "${options}" "${oneValueArgs}"
+            "${multiValueArgs}" ${ARGN})
+    # Sort every source into the CPU or the GPU list by its extension.
+    foreach(src ${op_library_SRCS})
+        # The expansion is quoted so that if() matches the file name itself
+        # instead of possibly treating it as the name of another variable
+        # (quoted arguments are not dereferenced under policy CMP0054 NEW).
+        if ("${src}" MATCHES ".*\\.cu$")
+            list(APPEND cu_srcs ${src})
+        elseif("${src}" MATCHES ".*\\.cc$")
+            list(APPEND cc_srcs ${src})
+        else()
+            message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu")
+        endif()
+    endforeach()
+    # A CPU implementation is mandatory.
+    list(LENGTH cc_srcs cc_srcs_len)
+    if (cc_srcs_len EQUAL 0)
+        message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
+    endif()
+    # A missing GPU implementation is only reported, not rejected.
+    list(LENGTH cu_srcs cu_srcs_len)
+    if (cu_srcs_len EQUAL 0)
+        message(WARNING "The op library ${TARGET} not support GPU!")
+    endif()
+    # The .cu sources are only handed to the build when WITH_GPU is set.
+    if (WITH_GPU)
+        nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
+                ${op_common_deps})
+    else()
+        cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS}
+                ${op_common_deps})
+    endif()
+endfunction()
+op_library(add_op SRCS add_op.cc add_op.cu)
 cc_test(add_op_test SRCS add_op_test.cc DEPS add_op)
+op_library(mul_op SRCS mul_op.cc mul_op.cu)
+op_library(rowwise_add_op SRCS rowwise_add_op.cu rowwise_add_op.cc)
+op_library(sigmoid_op SRCS sigmoid_op.cu sigmoid_op.cc)
+op_library(softmax_op SRCS softmax_op.cc softmax_op.cu)
--- a/paddle/operators/add_op.cc
+++ b/paddle/operators/add_op.cc
-#include <paddle/framework/op_registry.h>
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-#include <paddle/framework/tensor.h>
-#include <paddle/operators/add_op.h>
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/add_op.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/tensor.h"
 namespace paddle {
 namespace operators {
@@ -17,8 +31,7 @@ protected:
        "Inputs/Outputs of AddOp must all be set");
    PADDLE_ENFORCE(inputs[0]->dims() == inputs[1]->dims(),
                   "Two input of Add Op's dimension must be same.");
-    // Need set dims in Tensor
+    outputs[0]->set_dims(inputs[0]->dims());
-    // outputs[0]->set_dims(inputs[0]->dims())
  }
 };
@@ -36,9 +49,10 @@ The equation is: Out = X + Y
 )DOC");
  }
 };
-}  // namespace op
+}  // namespace operators
 }  // namespace paddle
 REGISTER_OP(add_two, paddle::operators::AddOp, paddle::operators::AddOpMaker);
-REGISTER_OP_CPU_KERNEL(
+typedef paddle::operators::AddKernel<::paddle::platform::CPUPlace, float>
-    add_two, ::paddle::operators::AddKernel<::paddle::platform::CPUPlace>);
+    AddKernel_CPU_float;
\ No newline at end of file
+REGISTER_OP_CPU_KERNEL(add_two, AddKernel_CPU_float);
--- a/paddle/operators/add_op.cu
+++ b/paddle/operators/add_op.cu
-#include <paddle/operators/add_op.h>
+#include "paddle/operators/add_op.h"
-#include <paddle/framework/op_registry.h>
+#include "paddle/framework/op_registry.h"
+typedef paddle::operators::AddKernel<::paddle::platform::GPUPlace, float> AddKernel_GPU_float;
 REGISTER_OP_GPU_KERNEL(add_two,
-                       paddle::operators::AddKernel<paddle::platform::GPUPlace>);
+                       AddKernel_GPU_float);
\ No newline at end of file
--- a/paddle/operators/add_op.h
+++ b/paddle/operators/add_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #pragma once
-#include <glog/logging.h>
+#include "glog/logging.h"
-#include <paddle/framework/operator.h>
+#include "paddle/framework/operator.h"
 namespace paddle {
 namespace operators {
+// Kernel that computes output 0 = input 0 + input 1 on tensors.
+// Place selects the Eigen device fetched from the context; T is the element
+// type used for mutable_data<T> and for the flat<T>() views.
-template <typename Place>
+template <typename Place, typename T>
 class AddKernel : public framework::OpKernel {
 public:
-  void Compute(const KernelContext &context) const override {
+  void Compute(const framework::KernelContext& context) const override {
-    LOG(INFO) << "Add kernel in " << typeid(Place).name();
+    auto input0 = context.Input(0)->Get<framework::Tensor>();
+    auto input1 = context.Input(1)->Get<framework::Tensor>();
+    auto* output = context.Output(0)->GetMutable<framework::Tensor>();
+    // Called for its side effect only; presumably this makes sure the output
+    // holds memory for T on the context's place -- TODO confirm.
+    output->mutable_data<T>(context.GetPlace());
+    // The sum of the two flat<T>() views is assigned through the Eigen device
+    // that the context returns for Place.
+    output->flat<T>().device(*(context.GetEigenDevice<Place>())) =
+        input0.flat<T>() + input1.flat<T>();
  }
 };
-}  // namespace op
+}  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/add_op_test.cc
+++ b/paddle/operators/add_op_test.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
 #include <gtest/gtest.h>
 #define private public
 #include <paddle/framework/op_registry.h>

--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <paddle/framework/op_registry.h>
+#include <paddle/framework/tensor.h>
+#include <paddle/operators/mul_op.h>
+namespace paddle {
+namespace operators {
+// Operator class of the mul op; it only provides shape inference.
+class MulOp : public framework::OperatorWithKernel {
+protected:
+  // Requires exactly two rank-2 inputs where the second dimension of
+  // inputs[0] equals the first dimension of inputs[1], and exactly one
+  // output. The output dims are set to
+  // {first dim of inputs[0], second dim of inputs[1]}.
+  void InferShape(
+      const std::vector<const framework::Tensor *> &inputs,
+      const std::vector<framework::Tensor *> &outputs) const override {
+    PADDLE_ENFORCE(inputs.size() == 2, "The mul op must take two inputs");
+    auto lhs_dims = inputs[0]->dims();
+    auto rhs_dims = inputs[1]->dims();
+    PADDLE_ENFORCE(lhs_dims.size() == 2 && rhs_dims.size() == 2,
+                   "The input of mul op must be matrix");
+    PADDLE_ENFORCE(
+        lhs_dims[1] == rhs_dims[0],
+        "First matrix's width must be equal with second matrix's height.");
+    PADDLE_ENFORCE(outputs.size() == 1, "The mul op must take one output");
+    outputs[0]->set_dims({lhs_dims[0], rhs_dims[1]});
+  }
+};
+// Describes the mul op: inputs "X" and "Y", output "Out", plus the operator
+// comment.
+// NOTE(review): the comment text says "Two Element Mul", while
+// MulOp::InferShape in this file enforces matrix-product shapes
+// (dim0[1] == dim1[0]) -- confirm which wording is intended.
+class MulOpMaker : public framework::OpProtoAndCheckerMaker {
+public:
+  MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The first input of mul op");
+    AddInput("Y", "The second input of mul op");
+    AddOutput("Out", "The output of mul op");
+    AddComment(R"DOC(
+Two Element Mul Operator.
+The equation is: Out = X * Y
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+REGISTER_OP(mul, paddle::operators::MulOp, paddle::operators::MulOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    mul, paddle::operators::MulKernel<paddle::platform::CPUPlace>);
--- a/paddle/operators/mul_op.cu
+++ b/paddle/operators/mul_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <paddle/operators/mul_op.h>
+#include <paddle/framework/op_registry.h>
+REGISTER_OP_GPU_KERNEL(mul,
+                       paddle::operators::MulKernel<paddle::platform
+                       ::GPUPlace>);
\ No newline at end of file
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <glog/logging.h>
+#include <paddle/framework/operator.h>
+namespace paddle {
+namespace operators {
+// Kernel class template parameterized on the device type Place.
+// NOTE(review): Compute is a stub -- it only logs which Place it was
+// instantiated for and neither reads inputs nor writes outputs of the
+// context.
+template <typename Place>
+class MulKernel : public framework::OpKernel {
+public:
+  void Compute(const framework::KernelContext &context) const override {
+    // typeid(Place).name() yields an implementation-defined type name.
+    LOG(INFO) << "Mul kernel in " << typeid(Place).name();
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <paddle/framework/op_registry.h>
+#include <paddle/operators/rowwise_add_op.h>
+namespace paddle {
+namespace operators {
+// Operator class of the row-wise add op; it only provides shape inference.
+class RowWiseAddOp : public framework::OperatorWithKernel {
+protected:
+  // Requires two inputs -- a rank-2 tensor and a rank-1 tensor whose length
+  // equals the second dimension of the first input -- and exactly one output.
+  // The output receives the dims of inputs[0].
+  void InferShape(
+      const std::vector<const framework::Tensor *> &inputs,
+      const std::vector<framework::Tensor *> &outputs) const override {
+    PADDLE_ENFORCE(inputs.size() == 2UL, "Two inputs is needed by rowwise add");
+    auto matrix_dims = inputs[0]->dims();
+    auto bias_dims = inputs[1]->dims();
+    PADDLE_ENFORCE(matrix_dims.size() == 2, "Input 0 must be matrix");
+    PADDLE_ENFORCE(bias_dims.size() == 1, "The second input must be vector");
+    PADDLE_ENFORCE(matrix_dims[1] == bias_dims[0],
+                   "The width of two input must be same");
+    PADDLE_ENFORCE(outputs.size() == 1, "The output size must be 1");
+    outputs[0]->set_dims(matrix_dims);
+  }
+};
+// Describes the row-wise add op: matrix input "X", vector input "b" and
+// output "Out", plus the operator comment.
+class RowWiseAddOpMaker : public framework::OpProtoAndCheckerMaker {
+public:
+  RowWiseAddOpMaker(framework::OpProto *proto,
+                    framework::OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The left input of row-wise add op, must be matrix");
+    AddInput("b", "The right input of row-wise add op, must be vector");
+    AddOutput("Out", "The output of row-wise add op");
+    // The pseudo-code assigns to Out[i]: the output has the same dims as X
+    // (see RowWiseAddOp::InferShape), so every row of X produces one row of
+    // Out rather than overwriting Out as a whole.
+    AddComment(R"DOC(Row-wise Add operator
+for i in xrange(X.shape[0]):
+  Out[i] = X[i] + b
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+REGISTER_OP(rowwise_add,
+            paddle::operators::RowWiseAddOp,
+            paddle::operators::RowWiseAddOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    rowwise_add,
+    paddle::operators::RowWiseAddKernel<paddle::platform::CPUPlace>);
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
+#include <paddle/framework/op_registry.h>
+#include <paddle/operators/rowwise_add_op.h>
+REGISTER_OP_GPU_KERNEL(
+    rowwise_add,
+    paddle::operators::RowWiseAddKernel<paddle::platform ::GPUPlace>);
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <glog/logging.h>
+#include <paddle/framework/operator.h>
+namespace paddle {
+namespace operators {
+// Kernel class template parameterized on the device type Place.
+// NOTE(review): Compute is a stub -- it only logs which Place it was
+// instantiated for and neither reads inputs nor writes outputs of the
+// context.
+template <typename Place>
+class RowWiseAddKernel : public framework::OpKernel {
+public:
+  void Compute(const framework::KernelContext &context) const override {
+    // typeid(Place).name() yields an implementation-defined type name.
+    LOG(INFO) << "RowWiseAdd kernel in " << typeid(Place).name();
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/sigmoid_op.cc
+++ b/paddle/operators/sigmoid_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <paddle/framework/op_registry.h>
+#include <paddle/operators/sigmoid_op.h>
+namespace paddle {
+namespace operators {
+// Operator class of the sigmoid op; it only provides shape inference.
+class SigmoidOp : public framework::OperatorWithKernel {
+protected:
+  // Requires exactly one input and one output and gives the output the dims
+  // of the input.
+  void InferShape(
+      const std::vector<const framework::Tensor *> &inputs,
+      const std::vector<framework::Tensor *> &outputs) const override {
+    PADDLE_ENFORCE(inputs.size() == 1, "Sigmoid Op only have one input");
+    PADDLE_ENFORCE(outputs.size() == 1, "Sigmoid Op only have one output");
+    const framework::Tensor *source = inputs[0];
+    framework::Tensor *target = outputs[0];
+    target->set_dims(source->dims());
+  }
+};
+// Describes the sigmoid op: one input "X", one output "Y", plus the operator
+// comment.
+class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
+public:
+  SigmoidOpMaker(framework::OpProto *proto,
+                 framework::OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "sigmoid input");
+    // "Y" is the result of the op, so it has to be declared with AddOutput.
+    // Declaring it with AddInput would describe an op with two inputs and no
+    // output, which contradicts SigmoidOp::InferShape (one input, one
+    // output).
+    AddOutput("Y", "sigmoid output");
+    AddComment("Sigmoid function");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+REGISTER_OP(sigmoid,
+            paddle::operators::SigmoidOp,
+            paddle::operators::SigmoidOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    sigmoid, paddle::operators::SigmoidKernel<paddle::platform::CPUPlace>);
--- a/paddle/operators/sigmoid_op.cu
+++ b/paddle/operators/sigmoid_op.cu
+#include <paddle/operators/sigmoid_op.h>
+#include <paddle/framework/op_registry.h>
+REGISTER_OP_GPU_KERNEL(
+    sigmoid, paddle::operators::SigmoidKernel<paddle::platform::GPUPlace>);
--- a/paddle/operators/sigmoid_op.h
+++ b/paddle/operators/sigmoid_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <glog/logging.h>
+#include <paddle/framework/operator.h>
+namespace paddle {
+namespace operators {
+// Kernel class template parameterized on the device type Place.
+// NOTE(review): Compute is a stub -- it only logs which Place it was
+// instantiated for and neither reads inputs nor writes outputs of the
+// context.
+template <typename Place>
+class SigmoidKernel : public framework::OpKernel {
+public:
+  void Compute(const framework::KernelContext &context) const override {
+    // typeid(Place).name() yields an implementation-defined type name.
+    LOG(INFO) << "Sigmoid kernel in " << typeid(Place).name();
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <paddle/framework/op_registry.h>
+#include <paddle/operators/softmax_op.h>
+namespace paddle {
+namespace operators {
+// Operator class of the softmax op; it only provides shape inference.
+class SoftmaxOp : public framework::OperatorWithKernel {
+protected:
+  // Requires exactly one input and one output and gives the output the dims
+  // of the input.
+  void InferShape(
+      const std::vector<const framework::Tensor *> &inputs,
+      const std::vector<framework::Tensor *> &outputs) const override {
+    PADDLE_ENFORCE(inputs.size() == 1, "Only one input is need for softmax");
+    PADDLE_ENFORCE(outputs.size() == 1, "Only one output is need for softmax");
+    const framework::Tensor *source = inputs[0];
+    framework::Tensor *target = outputs[0];
+    target->set_dims(source->dims());
+  }
+};
+// Describes the softmax op: one input "X", one output "Y", plus the operator
+// comment.
+class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
+public:
+  SoftmaxOpMaker(framework::OpProto *proto,
+                 framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "input of softmax");
+    AddOutput("Y", "output of softmax");
+    AddComment("Softmax Op");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker);
+REGISTER_OP_CPU_KERNEL(softmax, ops::SoftmaxKernel<paddle::platform::CPUPlace>);
--- a/paddle/operators/softmax_op.cu
+++ b/paddle/operators/softmax_op.cu
+#include <paddle/framework/op_registry.h>
+#include <paddle/operators/softmax_op.h>
+REGISTER_OP_GPU_KERNEL(
+    softmax, paddle::operators::SoftmaxKernel<paddle::platform::GPUPlace>);
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <glog/logging.h>
+#include <paddle/framework/operator.h>
+namespace paddle {
+namespace operators {
+// Kernel class template parameterized on the device type Place.
+// NOTE(review): Compute is a stub -- it only logs which Place it was
+// instantiated for and neither reads inputs nor writes outputs of the
+// context.
+template <typename Place>
+class SoftmaxKernel : public framework::OpKernel {
+public:
+  void Compute(const framework::KernelContext &context) const override {
+    // typeid(Place).name() yields an implementation-defined type name.
+    LOG(INFO) << "Softmax kernel in " << typeid(Place).name();
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/optimizer/parameter_optimizer_test.cpp
+++ b/paddle/optimizer/parameter_optimizer_test.cpp
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
 #include "parameter_optimizer.h"
 #include <cmath>
 #include <map>
@@ -5,21 +21,18 @@
 #include "gtest/gtest.h"
 #include "lr_policy.h"
-using namespace paddle;
+paddle::optimizer::Tensor* FillTensor(size_t size) {
-using namespace paddle::optimizer;
+  paddle::optimizer::Tensor* param = new paddle::optimizer::Tensor(size);
+  paddle::optimizer::Tensor& p = *param;
-Tensor* FillTensor(size_t size) {
-  Tensor* param = new Tensor(size);
-  Tensor& p = *param;
  for (size_t i = 0; i < p.size(); ++i) {
    p[i] = (float)rand() / (float)RAND_MAX;
  }
  return param;
 }
-Tensor* FixedTensor(size_t size) {
+paddle::optimizer::Tensor* FixedTensor(size_t size) {
-  Tensor* param = new Tensor(size);
+  paddle::optimizer::Tensor* param = new paddle::optimizer::Tensor(size);
-  Tensor& p = *param;
+  paddle::optimizer::Tensor& p = *param;
  for (size_t i = 0; i < p.size(); ++i) {
    p[i] = i;
  }
@@ -28,7 +41,8 @@ Tensor* FixedTensor(size_t size) {
 class OptimizerTest : public testing::Test {
 public:
-  // init tensor shape
+  virtual ~OptimizerTest() {}
+  // init paddle::optimizer::Tensor shape
  const size_t kSize = 5;
  virtual void SetUp() {
@@ -38,34 +52,36 @@ public:
  virtual void TearDown() {}
  void CreateSGD() {
-    Tensor* parameter = FixedTensor(kSize);
+    paddle::optimizer::Tensor* parameter = FixedTensor(kSize);
-    config_.set_optimizer(OptimizerConfig::SGD);
+    config_.set_optimizer(paddle::OptimizerConfig::SGD);
    config_.mutable_sgd()->set_momentum(0.0);
    config_.mutable_sgd()->set_decay(0.0);
    config_.mutable_sgd()->set_nesterov(false);
-    config_.set_lr_policy(OptimizerConfig::Const);
+    config_.set_lr_policy(paddle::OptimizerConfig::Const);
    config_.mutable_const_lr()->set_learning_rate(0.1);
    std::string str = config_.SerializeAsString();
-    ParameterOptimizer* opt = ParameterOptimizer::Create(str, parameter);
+    paddle::optimizer::ParameterOptimizer* opt =
+        paddle::optimizer::ParameterOptimizer::Create(str, parameter);
    opts_.push_back(opt);
  }
  void CreateAdam() {
-    Tensor* parameter = FixedTensor(kSize);
+    paddle::optimizer::Tensor* parameter = FixedTensor(kSize);
-    config_.set_optimizer(OptimizerConfig::Adam);
+    config_.set_optimizer(paddle::OptimizerConfig::Adam);
    config_.mutable_adam()->set_beta_1(0.9);
    config_.mutable_adam()->set_beta_2(0.1);
    config_.mutable_adam()->set_epsilon(1e-3);
    config_.mutable_adam()->set_decay(0.0);
-    config_.set_lr_policy(OptimizerConfig::Const);
+    config_.set_lr_policy(paddle::OptimizerConfig::Const);
    config_.mutable_const_lr()->set_learning_rate(0.1);
    std::string str = config_.SerializeAsString();
-    ParameterOptimizer* opt = ParameterOptimizer::Create(str, parameter);
+    paddle::optimizer::ParameterOptimizer* opt =
+        paddle::optimizer::ParameterOptimizer::Create(str, parameter);
    opts_.push_back(opt);
  }
  void TestGetWeight() {
-    Tensor* p = FixedTensor(kSize);
+    paddle::optimizer::Tensor* p = FixedTensor(kSize);
    for (size_t i = 0; i < opts_.size(); ++i) {
      int s = 0;
      float* newp = (float*)opts_[i]->get_weight(&s);
@@ -76,7 +92,7 @@ public:
  }
  void TestUpdate() {
-    Tensor* g = FixedTensor(kSize);
+    paddle::optimizer::Tensor* g = FixedTensor(kSize);
    for (size_t i = 0; i < opts_.size(); ++i) {
      opts_[i]->Update(g);
    }
@@ -91,8 +107,8 @@ public:
  }
 private:
-  std::vector<ParameterOptimizer*> opts_;
+  std::vector<paddle::optimizer::ParameterOptimizer*> opts_;
-  OptimizerConfig config_;
+  paddle::OptimizerConfig config_;
 };
 TEST_F(OptimizerTest, TestGetWeight) { TestGetWeight(); }

--- a/paddle/optimizer/serialization_test.cpp
+++ b/paddle/optimizer/serialization_test.cpp
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
 #include "serialization.h"
 #include "gtest/gtest.h"
-using namespace paddle;
-using namespace paddle::optimizer;
 TEST(TensorToProto, Case1) {
-  Tensor t(3), t1(3);
+  paddle::optimizer::Tensor t(3), t1(3);
  for (size_t i = 0; i < t.size(); ++i) {
    t[i] = i;
    t1[i] = 0;
  }
-  TensorProto proto;
+  paddle::TensorProto proto;
-  TensorToProto(t, &proto);
+  paddle::optimizer::TensorToProto(t, &proto);
-  ProtoToTensor(proto, &t1);
+  paddle::optimizer::ProtoToTensor(proto, &t1);
  for (size_t i = 0; i < t1.size(); ++i) {
    EXPECT_EQ(t1[i], t[i]);
  }

--- a/paddle/platform/cpu_info.cc
+++ b/paddle/platform/cpu_info.cc
@@ -40,8 +40,8 @@ inline size_t CpuTotalPhysicalMemory() {
  if (sysctl(mib, 2, &size, &len, NULL, 0) == 0) return (size_t)size;
  return 0L;
 #else
-  long pages = sysconf(_SC_PHYS_PAGES);
+  int64_t pages = sysconf(_SC_PHYS_PAGES);
-  long page_size = sysconf(_SC_PAGE_SIZE);
+  int64_t page_size = sysconf(_SC_PAGE_SIZE);
  return pages * page_size;
 #endif
 }

--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -15,14 +15,15 @@ namespace paddle {
 namespace platform {
 template <>
-Eigen::DefaultDevice* DeviceContext::get_eigen_device<Eigen::DefaultDevice>() {
-  return reinterpret_cast<CPUDeviceContext*>(this)->eigen_device();
+Eigen::DefaultDevice* DeviceContext::get_eigen_device<Eigen::DefaultDevice>()
+    const {
+  // Returns the Eigen device of this context viewed as a CPUDeviceContext.
+  // Assumes *this really is a CPUDeviceContext; that is not checked here.
+  // static_cast is the well-defined cast for a base-to-derived conversion.
+  return static_cast<const CPUDeviceContext*>(this)->eigen_device();
 }
 #ifndef PADDLE_ONLY_CPU
 template <>
-Eigen::GpuDevice* DeviceContext::get_eigen_device<Eigen::GpuDevice>() {
-  return reinterpret_cast<CUDADeviceContext*>(this)->eigen_device();
+Eigen::GpuDevice* DeviceContext::get_eigen_device<Eigen::GpuDevice>() const {
+  // Returns the Eigen device of this context viewed as a CUDADeviceContext.
+  // Assumes *this really is a CUDADeviceContext; that is not checked here.
+  // static_cast is the well-defined cast for a base-to-derived conversion.
+  return static_cast<const CUDADeviceContext*>(this)->eigen_device();
 }
 #endif

--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -21,9 +21,9 @@ limitations under the License. */
 #include "paddle/platform/gpu_info.h"
 #define EIGEN_USE_GPU
 #endif
 #include <memory>
-#include <unsupported/Eigen/CXX11/Tensor>
+#include "paddle/platform/place.h"
+#include "unsupported/Eigen/CXX11/Tensor"
 namespace paddle {
 namespace platform {
@@ -34,17 +34,14 @@ class DeviceContext {
  virtual Place GetPlace() const = 0;
  template <typename DeviceType>
-  DeviceType* get_eigen_device();
+  DeviceType* get_eigen_device() const;
 };
 class CPUDeviceContext : public DeviceContext {
 public:
-  Eigen::DefaultDevice* eigen_device() {
+  CPUDeviceContext() { eigen_device_.reset(new Eigen::DefaultDevice()); }
-    if (!eigen_device_) {
-      eigen_device_.reset(new Eigen::DefaultDevice());
+  Eigen::DefaultDevice* eigen_device() const { return eigen_device_.get(); }
-    }
-    return eigen_device_.get();
-  }
  Place GetPlace() const override {
    Place retv = CPUPlace();
@@ -92,7 +89,7 @@ class CUDADeviceContext : public DeviceContext {
  cudaStream_t stream() { return stream_; }
-  Eigen::GpuDevice* eigen_device() { return eigen_device_.get(); }
+  Eigen::GpuDevice* eigen_device() const { return eigen_device_.get(); }
  cublasHandle_t cublas_handle() {
    if (!blas_handle_) {

--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -36,64 +36,36 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
-/**
+// Because most enforce conditions would evaluate to true, we can use
- * @brief Enforce exception. Inherits std::exception
+// __builtin_expect to instruct the C++ compiler to generate code that
- *
+// always forces branch prediction of true.
- * All enforce condition not met, will throw an EnforceNotMet exception.
+// This generates faster binary code. __builtin_expect is a GCC/Clang builtin.
- */
+// For more details, please check https://stackoverflow.com/a/43870188/724872.
-class EnforceNotMet : public std::exception {
- public:
-  EnforceNotMet(const std::string& msg, const char* file, int fileline) {
-    std::ostringstream sout;
-    sout << msg << " at [" << file << ":" << fileline << "];";
-    all_msg_ = sout.str();
-  }
-  const char* what() const noexcept override { return all_msg_.c_str(); }
- private:
-  std::string all_msg_;
-};
-// From https://stackoverflow.com/questions/30130930/
-// __buildin_expect is in C++ 11 standard. Since the condition which enforced
-// should be true in most situation, it will make the compiler generate faster
-// code by adding `UNLIKELY` macro.
 #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
-/**
- * @brief Throw a EnforceNotMet exception, automatically filled __FILE__ &
- * __LINE__
- *
- * This macro take __VA_ARGS__, user can pass any type if that type can
- * serialize to std::ostream
- */
-#define PADDLE_THROW(...)                                            \
-  do {                                                               \
-    throw ::paddle::platform::EnforceNotMet(                         \
-        ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__); \
-  } while (0)
 #ifndef PADDLE_ONLY_CPU
 template <typename... Args>
 inline void throw_on_error(cudaError_t e, const Args&... args) {
-  if (e) {
+  if (UNLIKELY(e)) {
-    std::stringstream ss;
+    // clang-format off
-    ss << ::paddle::string::Sprintf(args...);
+    throw thrust::system_error(
-    ss << ::paddle::string::Sprintf(" at [%s:%s];", __FILE__, __LINE__);
+        e, thrust::cuda_category(),
-    throw thrust::system_error(e, thrust::cuda_category(), ss.str());
+        string::Sprintf(args...) +
+        string::Sprintf(" at [%s:%s];", __FILE__, __LINE__));
+    // clang-format on
  }
 }
 template <typename... Args>
 inline void throw_on_error(curandStatus_t stat, const Args&... args) {
  if (stat != CURAND_STATUS_SUCCESS) {
-    std::stringstream ss;
+    // clang-format off
-    ss << ::paddle::string::Sprintf(args...);
+    throw thrust::system_error(
-    ss << ::paddle::string::Sprintf(" at [%s:%s];", __FILE__, __LINE__);
+        cudaErrorLaunchFailure, thrust::cuda_category(),
-    throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(),
+        string::Sprintf(args...) +
-                               ss.str());
+        string::Sprintf(" at [%s:%s];", __FILE__, __LINE__));
+    // clang-format on
  }
 }
@@ -102,11 +74,12 @@ inline void throw_on_error(cudnnStatus_t stat, const Args&... args) {
  if (stat == CUDNN_STATUS_SUCCESS) {
    return;
  } else {
-    std::stringstream ss;
+    // clang-format off
-    ss << ::paddle::platform::dynload::cudnnGetErrorString(stat);
+    throw std::runtime_error(
-    ss << ", " << ::paddle::string::Sprintf(args...);
+        platform::dynload::cudnnGetErrorString(stat) + ", " +
-    ss << ::paddle::string::Sprintf(" at [%s:%s];", __FILE__, __LINE__);
+        string::Sprintf(args...) +
-    throw std::runtime_error(ss.str());
+        string::Sprintf(" at [%s:%s];", __FILE__, __LINE__));
+    // clang-format on
  }
 }
@@ -134,9 +107,8 @@ inline void throw_on_error(cublasStatus_t stat, const Args&... args) {
  } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) {
    ss << "CUBLAS: license error";
  }
-  ss << ", " << ::paddle::string::Sprintf(args...);
-  ss << ::paddle::string::Sprintf(" at [%s:%s];", __FILE__, __LINE__);
-  throw std::runtime_error(ss.str());
+  // ss is a stream, so its accumulated text must be taken with str() before
+  // it can be concatenated with the formatted user message and location.
+  throw std::runtime_error(ss.str() + ", " + string::Sprintf(args...) +
+                           string::Sprintf(" at [%s:%s];", __FILE__, __LINE__));
 }
 #endif  // PADDLE_ONLY_CPU
@@ -144,10 +116,19 @@ inline void throw_on_error(cublasStatus_t stat, const Args&... args) {
 template <typename... Args>
 inline void throw_on_error(int stat, const Args&... args) {
  if (UNLIKELY(!(stat))) {
-    PADDLE_THROW(args...);
+    throw std::runtime_error(
+        string::Sprintf(args...) +
+        string::Sprintf(" at [%s:%s];", __FILE__, __LINE__));
  }
 }
+// Throws std::runtime_error whose message is the printf-style formatted
+// arguments followed by " at [file:line];" for the place the macro is used.
+// Sprintf is fully qualified so the macro also expands correctly in code
+// that is not inside namespace paddle.
+#define PADDLE_THROW(...)                                                \
+  do {                                                                   \
+    throw std::runtime_error(                                            \
+        ::paddle::string::Sprintf(__VA_ARGS__) +                         \
+        ::paddle::string::Sprintf(" at [%s:%s];", __FILE__, __LINE__));  \
+  } while (0)
 /**
 * @brief Enforce a condition, otherwise throw an EnforceNotMet
 */

--- a/paddle/platform/enforce_test.cc
+++ b/paddle/platform/enforce_test.cc
@@ -9,8 +9,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include <gtest/gtest.h>
+#include "paddle/platform/enforce.h"
-#include <paddle/platform/enforce.h>
+#include "gtest/gtest.h"
+using namespace paddle;
 TEST(ENFORCE, OK) {
  PADDLE_ENFORCE(true, "Enforce is ok %d now %f", 123, 0.345);
@@ -23,10 +25,11 @@ TEST(ENFORCE, FAILED) {
  bool in_catch = false;
  try {
    PADDLE_ENFORCE(false, "Enforce is not ok %d at all", 123);
-  } catch (paddle::platform::EnforceNotMet err) {
+  } catch (const std::runtime_error& error) {
+    // your error handling code here
    in_catch = true;
    std::string msg = "Enforce is not ok 123 at all";
-    const char* what = err.what();
+    const char* what = error.what();
    for (size_t i = 0; i < msg.length(); ++i) {
      ASSERT_EQ(what[i], msg[i]);
    }

--- a/paddle/platform/place.cc
+++ b/paddle/platform/place.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
 #include "paddle/platform/place.h"
 namespace paddle {
@@ -7,7 +21,7 @@ namespace detail {
 class PlacePrinter : public boost::static_visitor<> {
 public:
-  PlacePrinter(std::ostream &os) : os_(os) {}
+  explicit PlacePrinter(std::ostream &os) : os_(os) {}
  void operator()(const CPUPlace &) { os_ << "CPUPlace"; }
  void operator()(const GPUPlace &p) { os_ << "GPUPlace(" << p.device << ")"; }

--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
-cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python)
+cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python
+        add_op mul_op rowwise_add_op sigmoid_op softmax_op)
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -13,15 +13,48 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include <Python.h>
+#include <paddle/framework/op_registry.h>
 #include <paddle/framework/scope.h>
+#include <paddle/pybind/tensor_bind.h>
+#include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
+#include <pybind11/stl.h>
+#include <fstream>
+#include <vector>
 namespace py = pybind11;
 namespace pd = paddle::framework;
+USE_OP(add_two);
+USE_OP(softmax);
+USE_OP(mul);
+USE_OP(rowwise_add);
+USE_OP(sigmoid);
 PYBIND11_PLUGIN(core) {
  py::module m("core", "C++ core of Paddle Paddle");
+  py::class_<pd::Tensor>(m, "Tensor", py::buffer_protocol())
+      .def_buffer([](pd::Tensor& self) -> py::buffer_info {
+        return paddle::pybind::CastToPyBuffer(self);
+      })
+      .def("get_dims",
+           [](const pd::Tensor& self) { return pd::vectorize(self.dims()); })
+      .def("set_dims",
+           [](pd::Tensor& self, const std::vector<int>& dim) {
+             self.set_dims(pd::make_ddim(dim));
+           })
+      .def("alloc_float",
+           [](pd::Tensor& self) {
+             self.mutable_data<float>(paddle::platform::CPUPlace());
+           })
+      .def("alloc_int",
+           [](pd::Tensor& self) {
+             self.mutable_data<int>(paddle::platform::CPUPlace());
+           })
+      .def("set", paddle::pybind::PyTensorSetFromArray<float>)
+      .def("set", paddle::pybind::PyTensorSetFromArray<int>);
  py::class_<pd::Variable>(m, "Variable", R"DOC(Variable Class.
 All parameter, weight, gradient are variables in Paddle.
@@ -32,7 +65,12 @@ All parameter, weight, gradient are variables in Paddle.
             *var.GetMutable<int>() = val;
           })
      .def("get_int",
-           [](const pd::Variable& var) -> int { return var.Get<int>(); });
+           [](const pd::Variable& var) -> int { return var.Get<int>(); })
+      .def("get_tensor",
+           [](pd::Variable& self) -> pd::Tensor* {
+             return self.GetMutable<pd::Tensor>();
+           },
+           py::return_value_policy::reference);
  py::class_<pd::Scope, std::shared_ptr<pd::Scope>>(m, "Scope")
      .def(py::init<const std::shared_ptr<pd::Scope>&>())
@@ -43,5 +81,37 @@ All parameter, weight, gradient are variables in Paddle.
           &pd::Scope::CreateVariable,
           py::return_value_policy::reference);
+  //! @note: Be careful! PyBind returns std::string as a unicode object, not
+  //! a Python str. If you want a str object, cast it in Python.
+  m.def("get_all_op_protos", []() -> std::vector<std::string> {
+    auto& protos = pd::OpRegistry::protos();
+    std::vector<std::string> ret_values;
+    for (auto it = protos.begin(); it != protos.end(); ++it) {
+      PADDLE_ENFORCE(it->second.IsInitialized(),
+                     "OpProto must all be initialized");
+      ret_values.emplace_back();
+      PADDLE_ENFORCE(it->second.SerializeToString(&ret_values.back()),
+                     "Serialize OpProto Error. This could be a bug of Paddle.");
+    }
+    return ret_values;
+  });
+  m.def_submodule(
+       "var_names",
+       "The module will return special predefined variable name in Paddle")
+      .def("empty", pd::OperatorBase::EMPTY_VAR_NAME)
+      .def("temp", pd::OperatorBase::TMP_VAR_NAME);
+  py::class_<pd::OperatorBase, pd::OperatorPtr>(m, "Operator")
+      .def("__str__", &pd::OperatorBase::DebugString)
+      .def_static("create", [](const std::string& protobin) {
+        pd::OpDesc desc;
+        PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
+                       "Cannot parse user input to OpDesc");
+        PADDLE_ENFORCE(desc.IsInitialized(),
+                       "User OpDesc is not initialized, reason %s",
+                       desc.InitializationErrorString());
+        return pd::OpRegistry::CreateOp(desc);
+      });
  return m.ptr();
 }
--- a/paddle/pybind/tensor_bind.h
+++ b/paddle/pybind/tensor_bind.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <paddle/framework/tensor.h>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+namespace py = pybind11;
+namespace paddle {
+namespace pybind {
+namespace details {
+// Type-dispatch helper behind CastToPyBuffer. ARGS is the list of candidate
+// element types, I is the index of the candidate tried at this step, and
+// `less` says whether I is still inside ARGS.
+template <bool less, size_t I, typename... ARGS>
+struct CastToPyBufferImpl;
+// Past the end of the candidate list: no type in ARGS matched the tensor.
+template <size_t I, typename... ARGS>
+struct CastToPyBufferImpl<false, I, ARGS...> {
+  py::buffer_info operator()(framework::Tensor &tensor) {
+    PADDLE_THROW("This type of tensor cannot be expose to Python");
+    // Not reached after the throw; presumably kept so that every path of a
+    // non-void function has a return statement.
+    return py::buffer_info();
+  }
+};
+// Candidate I: if the tensor's stored type is CUR_TYPE, describe its memory
+// as a py::buffer_info; otherwise try candidate I + 1.
+template <size_t I, typename... ARGS>
+struct CastToPyBufferImpl<true, I, ARGS...> {
+  using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
+  py::buffer_info operator()(framework::Tensor &tensor) {
+    // holder_ is dereferenced below; reject a tensor without a holder with an
+    // exception instead of crashing the Python interpreter.
+    PADDLE_ENFORCE(tensor.holder_ != nullptr,
+                   "Tensor holder is null, cannot cast to numpy array");
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(tensor.holder_->place()),
+                   "Only CPU tensor can cast to numpy array");
+    if (std::type_index(typeid(CUR_TYPE)) == tensor.holder_->type()) {
+      auto dim_vec = framework::vectorize(tensor.dims());
+      std::vector<size_t> dims_outside;
+      std::vector<size_t> strides;
+      dims_outside.resize(dim_vec.size());
+      strides.resize(dim_vec.size());
+      // Walk the dimensions from last to first so that the last one gets the
+      // smallest stride (row-major layout); strides are in bytes.
+      size_t prod = 1;
+      for (size_t i = dim_vec.size(); i != 0; --i) {
+        dims_outside[i - 1] = (size_t)dim_vec[i - 1];
+        strides[i - 1] = sizeof(CUR_TYPE) * prod;
+        prod *= dims_outside[i - 1];
+      }
+      return py::buffer_info(
+          tensor.mutable_data<CUR_TYPE>(tensor.holder_->place()),
+          sizeof(CUR_TYPE),
+          py::format_descriptor<CUR_TYPE>::format(),
+          (size_t)framework::arity(tensor.dims()),
+          dims_outside,
+          strides);
+    } else {
+      // `less` becomes false once I + 1 runs off the end of ARGS, which
+      // selects the throwing specialization above.
+      constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
+      return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
+    }
+  }
+};
+}  // namespace details
+// Describes `tensor` as a py::buffer_info, trying float and then int as the
+// element type.
+inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
+  using Caster = details::CastToPyBufferImpl<true, 0, float, int>;
+  return Caster()(tensor);
+}
+// Copies a C-style (row-major) numpy array into `self`: the tensor takes the
+// array's shape and the data is copied into the CPU buffer returned by
+// mutable_data<T>.
+template <typename T>
+void PyTensorSetFromArray(
+    framework::Tensor &self,
+    py::array_t<T, py::array::c_style | py::array::forcecast> array) {
+  // Mirror the numpy shape as a vector<int> for make_ddim.
+  const size_t rank = array.ndim();
+  std::vector<int> shape;
+  shape.reserve(rank);
+  for (size_t axis = 0; axis < rank; ++axis) {
+    shape.push_back((int)array.shape()[axis]);
+  }
+  self.set_dims(framework::make_ddim(shape));
+  // Dimensions must be set before asking the tensor for its buffer.
+  auto *dst = self.mutable_data<T>(paddle::platform::CPUPlace());
+  const size_t num_bytes = sizeof(T) * array.size();
+  std::memcpy(dst, array.data(), num_bytes);
+}
+}  // namespace pybind
+}  // namespace paddle
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -155,7 +155,8 @@ RUN apt-get update &&\
    paddle version
 ${DOCKERFILE_CUDNN_DSO}
 ${DOCKERFILE_GPU_ENV}
+ADD go/cmd/pserver/pserver /usr/bin/
+ADD go/cmd/master/master /usr/bin/
 # default command shows the paddle version and exit
 CMD ["paddle", "version"]
 EOF
--- a/paddle/scripts/docker/build_android.sh
+++ b/paddle/scripts/docker/build_android.sh
@@ -2,9 +2,9 @@
 set -xe
-mkdir -p /paddle/build
+mkdir -p /paddle/build_android
-cd /paddle/build
+cd /paddle/build_android
-rm -f /paddle/install 2>/dev/null || true
+rm -rf /paddle/install 2>/dev/null || true
 cmake -DCMAKE_SYSTEM_NAME=Android \
      -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
      -DANDROID_ABI=armeabi-v7a \
@@ -21,6 +21,3 @@ cmake -DCMAKE_SYSTEM_NAME=Android \
      ..
 make -j `nproc`
 make install
-export PATH=/paddle/install/bin:/paddle/install/opt/paddle/bin:$PATH
-paddle version
--- a/paddle/scripts/travis/check_style.sh
+++ b/paddle/scripts/travis/check_style.sh
 #!/bin/bash
 function abort(){
    echo "Your change doesn't follow PaddlePaddle's code style." 1>&2
-    echo "Please use pre-commit to reformat your code and git push again." 1>&2
+    echo "Please use pre-commit to check what is wrong." 1>&2
    exit 1
 }
@@ -13,8 +13,14 @@ export PATH=/usr/bin:$PATH
 pre-commit install
 clang-format --version
+# set up go environment for running gometalinter
+# Expansions are quoted so paths containing spaces are not word-split.
+mkdir -p "$GOPATH/src/github.com/PaddlePaddle/"
+ln -sf "$TRAVIS_BUILD_DIR" "$GOPATH/src/github.com/PaddlePaddle/Paddle"
+# Run glide in a subshell and only when the cd succeeded, so a failed cd can
+# neither run `glide install` in the wrong directory nor move this script.
+(cd "$GOPATH/src/github.com/PaddlePaddle/Paddle/go" && glide install)
 if ! pre-commit run -a ; then
-  git diff  --exit-code
+    git diff
+    exit 1
 fi
 trap : 0
--- a/paddle/trainer/NewRemoteParameterUpdater.cpp
+++ b/paddle/trainer/NewRemoteParameterUpdater.cpp
@@ -28,6 +28,17 @@ NewRemoteParameterUpdater::NewRemoteParameterUpdater(
      newGradients_(nullptr),
      pserverSpec_(pserverSpec) {}
+NewRemoteParameterUpdater::NewRemoteParameterUpdater(
+    const OptimizationConfig &config,
+    const std::string pserverSpec,
+    const bool useEtcd)
+    : trainerConfig_(config),
+      parameterClient_(-1),
+      newParameters_(nullptr),
+      newGradients_(nullptr),
+      pserverSpec_(pserverSpec),
+      useEtcd_(useEtcd) {}
 void NewRemoteParameterUpdater::init(
    const std::vector<ParameterPtr> &parameters) {
  ParameterUpdater::init(parameters);
@@ -38,8 +49,13 @@ void NewRemoteParameterUpdater::init(
  }
  // create parameter server client.
+  if (useEtcd_) {
+    parameterClient_ = paddle_new_etcd_pserver_client(
+        (char *)pserverSpec_.c_str(), FLAGS_trainer_id == 0);
+  } else {
    parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(),
                                                 FLAGS_trainer_id == 0);
+  }
  // init new parameter and gradient.
  newParameters_ = initNewParameter(PARAMETER_VALUE);

--- a/paddle/trainer/NewRemoteParameterUpdater.h
+++ b/paddle/trainer/NewRemoteParameterUpdater.h
@@ -32,6 +32,9 @@ class NewRemoteParameterUpdater : public ParameterUpdater {
 public:
  NewRemoteParameterUpdater(const OptimizationConfig& config,
                            const std::string pserverSpec);
+  NewRemoteParameterUpdater(const OptimizationConfig& config,
+                            const std::string pserverSpec,
+                            const bool useEtcd);
  ~NewRemoteParameterUpdater() {
    releaseNewParameter(newParameters_);
    releaseNewParameter(newGradients_);
@@ -111,6 +114,8 @@ protected:
  paddle_parameter** newGradients_;
  /// the specification of parameter server "host1:port,host1:port"
  std::string pserverSpec_;
+  /// true if pserverSpec_ is etcd endpoint, else pserverSpec_ is pserver addr.
+  /// Defaults to false: the two-argument constructor does not set this member,
+  /// and init() reads it to choose which client to create.
+  bool useEtcd_ = false;
 };
 }  // namespace paddle
--- a/paddle/utils/DynamicLoader.h
+++ b/paddle/utils/DynamicLoader.h
@@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#ifndef DYNAMIC_LOAD_H_
+#pragma once
-#define DYNAMIC_LOAD_H_
 #include <dlfcn.h>
 #include <memory>
@@ -59,5 +58,3 @@ void GetWarpCTCDsoHandle(void** dso_handle);
 *
 */
 void GetLapackDsoHandle(void** dso_handle);
-#endif  // DYNAMIC_LOAD_H_
--- a/paddle/utils/ThreadLocal.h
+++ b/paddle/utils/ThreadLocal.h
@@ -51,7 +51,7 @@ template <class T>
 class ThreadLocal {
 public:
  ThreadLocal() {
-    CHECK(pthread_key_create(&threadSpecificKey_, dataDestructor) == 0);
+    CHECK_EQ(pthread_key_create(&threadSpecificKey_, dataDestructor), 0);
  }
  ~ThreadLocal() { pthread_key_delete(threadSpecificKey_); }
@@ -65,7 +65,7 @@ public:
    if (!p && createLocal) {
      p = new T();
      int ret = pthread_setspecific(threadSpecificKey_, p);
-      CHECK(ret == 0);
+      CHECK_EQ(ret, 0);
    }
    return p;
  }
@@ -79,7 +79,7 @@ public:
    if (T* q = get(false)) {
      dataDestructor(q);
    }
-    CHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
+    CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
  }
  /**
@@ -112,7 +112,7 @@ private:
 template <class T>
 class ThreadLocalD {
 public:
-  ThreadLocalD() { CHECK(pthread_key_create(&threadSpecificKey_, NULL) == 0); }
+  ThreadLocalD() { CHECK_EQ(pthread_key_create(&threadSpecificKey_, NULL), 0); }
  ~ThreadLocalD() {
    pthread_key_delete(threadSpecificKey_);
    for (auto t : threadMap_) {
@@ -127,7 +127,7 @@ public:
    T* p = (T*)pthread_getspecific(threadSpecificKey_);
    if (!p) {
      p = new T();
-      CHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
+      CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
      updateMap(p);
    }
    return p;
@@ -141,7 +141,7 @@ public:
    if (T* q = (T*)pthread_getspecific(threadSpecificKey_)) {
      dataDestructor(q);
    }
-    CHECK(pthread_setspecific(threadSpecificKey_, p) == 0);
+    CHECK_EQ(pthread_setspecific(threadSpecificKey_, p), 0);
    updateMap(p);
  }

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1575,7 +1575,13 @@ class MultiClassCrossEntropySelfNormCostLayer(LayerBase):
 @config_layer('fc')
 class FCLayer(LayerBase):
-    def __init__(self, name, size, inputs, bias=True, **xargs):
+    def __init__(self,
+                 name,
+                 size,
+                 inputs,
+                 bias=True,
+                 error_clipping_threshold=None,
+                 **xargs):
        super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)
        for input_index in xrange(len(self.inputs)):
            input_layer = self.get_input_layer(input_index)
@@ -1592,6 +1598,8 @@ class FCLayer(LayerBase):
            self.create_input_parameter(input_index, psize, dims, sparse,
                                        format)
        self.create_bias_parameter(bias, self.config.size)
+        if error_clipping_threshold is not None:
+            self.config.error_clipping_threshold = error_clipping_threshold
 @config_layer('selective_fc')

--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
@@ -20,7 +20,6 @@ import trainer
 import event
 import data_type
 import topology
-import data_feeder
 import networks
 import evaluator
 from . import dataset
@@ -31,7 +30,6 @@ import op
 import pooling
 import inference
 import networks
-import py_paddle.swig_paddle as api
 import minibatch
 import plot
 import image
@@ -47,7 +45,6 @@ __all__ = [
    'data_type',
    'attr',
    'pooling',
-    'data_feeder',
    'dataset',
    'reader',
    'topology',
@@ -61,6 +58,7 @@ __all__ = [
 def init(**kwargs):
+    import py_paddle.swig_paddle as api
    args = []
    args_dict = {}
    # NOTE: append arguments if they are in ENV

--- a/python/paddle/v2/data_feeder.py
+++ b/python/paddle/v2/data_feeder.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from py_paddle import DataProviderConverter
 import collections
 import paddle.trainer.PyDataProvider2 as pydp2

--- a/python/paddle/v2/dataset/__init__.py
+++ b/python/paddle/v2/dataset/__init__.py
@@ -26,8 +26,9 @@ import sentiment
 import wmt14
 import mq2007
 import flowers
+import voc2012
 __all__ = [
-    'mnist', 'imikolov', 'imdb', 'cifar', 'movielens', 'conll05', 'sentiment'
-    'uci_housing', 'wmt14', 'mq2007', 'flowers'
+    # Every entry needs a separating comma: adjacent string literals are
+    # silently concatenated into a single name.
+    'mnist', 'imikolov', 'imdb', 'cifar', 'movielens', 'conll05', 'sentiment',
+    'uci_housing', 'wmt14', 'mq2007', 'flowers', 'voc2012'
 ]
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
@@ -22,6 +22,8 @@ import importlib
 import paddle.v2.dataset
 import cPickle
 import glob
+import cPickle as pickle
+import random
 __all__ = [
    'DATA_HOME', 'download', 'md5file', 'split', 'cluster_files_reader',
@@ -170,8 +172,6 @@ def convert(output_path,
            name_prefix,
            max_lines_to_shuffle=1000):
    import recordio
-    import cPickle as pickle
-    import random
    """
    Convert data from reader to recordio format files.
@@ -201,8 +201,10 @@ def convert(output_path,
    def write_data(w, lines):
        random.shuffle(lines)
        for i, d in enumerate(lines):
-            d = pickle.dumps(d, pickle.HIGHEST_PROTOCOL)
+            # FIXME(Yancey1989):
-            w[i % num_shards].write(d)
+            # dumps with protocol: pickle.HIGHEST_PROTOCOL
+            o = pickle.dumps(d)
+            w[i % num_shards].write(o)
    w = open_writers()
    lines = []

--- a/python/paddle/v2/dataset/mq2007.py
+++ b/python/paddle/v2/dataset/mq2007.py
@@ -212,19 +212,19 @@ def gen_pair(querylist, partial_order="full"):
        for j in range(i + 1, len(querylist)):
            query_right = querylist[j]
            if query_left.relevance_score > query_right.relevance_score:
-                labels.append(1)
+                labels.append([1])
                docpairs.append([
                    np.array(query_left.feature_vector),
                    np.array(query_right.feature_vector)
                ])
            elif query_left.relevance_score < query_right.relevance_score:
-                labels.append(1)
+                labels.append([1])
                docpairs.append([
                    np.array(query_right.feature_vector),
                    np.array(query_left.feature_vector)
                ])
    for label, pair in zip(labels, docpairs):
-        yield label, pair[0], pair[1]
+        yield np.array(label), pair[0], pair[1]
 def gen_list(querylist):

--- a/python/paddle/v2/dataset/tests/voc2012_test.py
+++ b/python/paddle/v2/dataset/tests/voc2012_test.py
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle.v2.dataset.voc2012
+import unittest
+class TestVOC(unittest.TestCase):
+    """Checks the sample counts produced by the voc2012 dataset readers."""
+    def check_reader(self, reader):
+        """Consume `reader` and return how many samples it yields.
+        For every sample `l`, asserts that `l[0].size == 3 * l[1].size`
+        (presumably a 3-channel image and its per-pixel label map --
+        TODO confirm against voc2012.py).
+        """
+        count = 0
+        for l in reader():
+            self.assertEqual(l[0].size, 3 * l[1].size)
+            count += 1
+        return count
+    # The module imported at the top of this file is `voc2012`, so the
+    # readers are looked up there.
+    def test_train(self):
+        count = self.check_reader(paddle.v2.dataset.voc2012.train())
+        self.assertEqual(count, 2913)
+    def test_test(self):
+        count = self.check_reader(paddle.v2.dataset.voc2012.test())
+        self.assertEqual(count, 1464)
+    def test_val(self):
+        count = self.check_reader(paddle.v2.dataset.voc2012.val())
+        self.assertEqual(count, 1449)
+# Run the test cases when this file is executed directly as a script.
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/dataset/voc2012.py
+++ b/python/paddle/v2/dataset/voc2012.py
--- a/python/paddle/v2/event.py
+++ b/python/paddle/v2/event.py
--- a/python/paddle/v2/framework/create_op_creation_methods.py
+++ b/python/paddle/v2/framework/create_op_creation_methods.py
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
--- a/python/paddle/v2/framework/tests/test_op_creation_methods.py
+++ b/python/paddle/v2/framework/tests/test_op_creation_methods.py
--- a/python/paddle/v2/framework/tests/test_tensor.py
+++ b/python/paddle/v2/framework/tests/test_tensor.py
--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py
--- a/python/paddle/v2/master/client.py
+++ b/python/paddle/v2/master/client.py
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
--- a/python/setup.py.in
+++ b/python/setup.py.in