Merge branch 'develop' into feature/better_network_debug_str

107b3395 · Yu Yang · 1ac0bffa · fb48cb12 · 107b3395 · 107b3395
58 changed files
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -21,10 +21,10 @@
    sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
    hooks:
    -   id: clang-formater
-   repo: https://github.com/dnephin/pre-commit-golang
+-   repo: https://github.com/PaddlePaddle/pre-commit-golang
-    sha: e4693a4c282b4fc878eda172a929f7a6508e7d16
+    sha: 16398aeccf263adaf53b2495eed0406347d76281
    hooks:
      -   id: go-fmt
-          files: (.*\.go)
+          types: [go]
-      -   id: go-lint
+      -   id: gometalinter
-          files: (.*\.go)
+          types: [go]
--- a/.travis.yml
+++ b/.travis.yml
@@ -41,6 +41,8 @@ before_install:
  - pip install rarfile
  - curl https://glide.sh/get | bash
  - eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
+  - go get -u github.com/alecthomas/gometalinter
+  - gometalinter --install
  - |
    function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -137,7 +137,8 @@ if(WITH_GPU)
 endif(WITH_GPU)
 if(USE_NNPACK)
-  list(APPEND EXTERNAL_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB} "rt")
+    include(external/nnpack)
+    list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS})
 endif(USE_NNPACK)
 add_subdirectory(proto)

--- a/Dockerfile.android
+++ b/Dockerfile.android
@@ -14,6 +14,17 @@ RUN apt-get update && \
    wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \
    apt-get clean -y
+# Install Go and glide
+RUN wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
+    tar -C /usr/local -xzf go.tgz && \
+    mkdir /root/gopath && \
+    mkdir /root/gopath/bin && \
+    mkdir /root/gopath/src && \
+    rm go.tgz
+ENV GOROOT=/usr/local/go GOPATH=/root/gopath
+# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
+ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
 # git credential to skip password typing
 RUN git config --global credential.helper store

--- a/cmake/cross_compiling/android.cmake
+++ b/cmake/cross_compiling/android.cmake
@@ -108,6 +108,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
        ENDIF()
        IF(ANDROID_ABI STREQUAL "arm64-v8a")
            SET(ANDROID_TOOLCHAIN_NAME aarch64-linux-android)
+            SET(CMAKE_SYSTEM_PROCESSOR aarch64)
        ENDIF()
        SET(ANDROID_TOOLCHAIN_PREFIX "${ANDROID_TOOLCHAIN_ROOT}/bin/${ANDROID_TOOLCHAIN_NAME}-")
    ENDIF()
@@ -166,7 +167,7 @@ IF("${CMAKE_VERSION}" VERSION_LESS "3.7.0")
    ENDIF()
    IF(ANDROID_ABI STREQUAL "arm64-v8a")
-      LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a)
+        LIST(APPEND ANDROID_COMPILER_FLAGS -march=armv8-a)
    ENDIF()
    STRING(REPLACE ";" " " ANDROID_COMPILER_FLAGS "${ANDROID_COMPILER_FLAGS}")
@@ -193,6 +194,10 @@ ELSE()
        SET(CMAKE_ANDROID_STANDALONE_TOOLCHAIN ${ANDROID_STANDALONE_TOOLCHAIN})
    ENDIF()
    SET(CMAKE_ANDROID_ARCH_ABI ${ANDROID_ABI})
-    SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE})
+    IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
-    SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON})
+        SET(CMAKE_ANDROID_ARM_MODE ${ANDROID_ARM_MODE})
+        IF(ANDROID_ABI STREQUAL "armeabi-v7a")
+            SET(CMAKE_ANDROID_ARM_NEON ${ANDROID_ARM_NEON})
+        ENDIF()
+    ENDIF()
 ENDIF()
--- a/paddle/function/nnpack/nnpack.cmake
+++ b/paddle/function/nnpack/nnpack.cmake
@@ -7,10 +7,24 @@ set(NNPACK_ROOT $ENV{NNPACK_ROOT} CACHE PATH "Folder contains NNPACK")
 find_path(NNPACK_INC_DIR nnpack.h PATHS ${NNPACK_ROOT}/include)
 find_library(NNPACK_LIB NAMES nnpack PATHS ${NNPACK_ROOT}/lib)
 find_library(PTHREADPOOL_LIB NAMES pthreadpool PATHS ${NNPACK_ROOT}/lib)
+find_library(NNPACK_UKERNELS_LIB NAMES nnpack_ukernels PATHS ${NNPACK_ROOT}/lib)
+find_library(NNPACK_CPUFEATURES_LIB NAMES cpufeatures PATHS ${NNPACK_ROOT}/lib)
 if(NNPACK_INC_DIR AND NNPACK_LIB AND PTHREADPOOL_LIB)
  set(NNPACK_FOUND ON)
  INCLUDE_DIRECTORIES(${NNPACK_INC_DIR})
+  set(NNPACK_LIBS)
+  list(APPEND NNPACK_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB})
+  if (NNPACK_UKERNELS_LIB)
+    list(APPEND NNPACK_LIBS ${NNPACK_UKERNELS_LIB})
+  endif()
+  if (NNPACK_CPUFEATURES_LIB)
+    list(APPEND NNPACK_LIBS ${NNPACK_CPUFEATURES_LIB})
+  endif()
+  if(NOT ANDROID)
+    list(APPEND NNPACK_LIBS "rt")
+  endif()
 else()
  message(FATAL_ERROR "Cannot find NNPACK in (${NNPACK_ROOT})")
 endif()
--- a/go/master/c/client.go
+++ b/go/master/c/client.go
@@ -23,7 +23,6 @@ import (
 	log "github.com/sirupsen/logrus"
 )
-var nullPtr = unsafe.Pointer(uintptr(0))
 var mu sync.Mutex
 var handleMap = make(map[C.paddle_master_client]*master.Client)
 var curHandle C.paddle_master_client
@@ -114,13 +113,13 @@ func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int {
 	if err != nil {
 		// Error
 		// TODO: return the type of error?
-		*record = (*C.uchar)(nullPtr)
+		*record = (*C.uchar)(nil)
 		return -1
 	}
 	if len(r) == 0 {
 		// Empty record
-		*record = (*C.uchar)(nullPtr)
+		*record = (*C.uchar)(nil)
 		return 0
 	}

--- a/go/master/client.go
+++ b/go/master/client.go
@@ -69,7 +69,10 @@ func (c *Client) getRecords() {
 		// We treat a task as finished whenever the last data
 		// instance of the task is read. This is not exactly
 		// correct, but a reasonable approximation.
-		c.taskFinished(t.Meta.ID)
+		err = c.taskFinished(t.Meta.ID)
+		if err != nil {
+			log.Errorln(err)
+		}
 	}
 }

--- a/go/master/client_internal_test.go
+++ b/go/master/client_internal_test.go
@@ -66,11 +66,21 @@ func TestGetFinishTask(t *testing.T) {
 	for i := 0; i < totalTask*chunkPerTask; i++ {
 		w := recordio.NewWriter(f, -1, -1)
-		w.Write(nil)
+		_, err = w.Write(nil)
+		if err != nil {
+			panic(err)
+		}
 		// call Close to force RecordIO writing a chunk.
-		w.Close()
+		err = w.Close()
+		if err != nil {
+			panic(err)
+		}
+	}
+	err = f.Close()
+	if err != nil {
+		panic(err)
 	}
-	f.Close()
 	// Manually intialize client to avoid calling c.getRecords()
 	c := &Client{}
@@ -79,7 +89,11 @@ func TestGetFinishTask(t *testing.T) {
 	ch := make(chan string, 1)
 	ch <- addr
 	go c.monitorMaster(ch)
-	c.SetDataset([]string{path})
+	err = c.SetDataset([]string{path})
+	if err != nil {
+		panic(err)
+	}
 	checkOnePass := func(i int) {
 		var tasks []Task
 		for idx := 0; idx < totalTask; idx++ {

--- a/go/master/client_test.go
+++ b/go/master/client_test.go
@@ -57,14 +57,30 @@ func TestNextRecord(t *testing.T) {
 	w := recordio.NewWriter(f, -1, -1)
 	for i := 0; i < total; i++ {
-		w.Write([]byte{byte(i)})
+		_, err = w.Write([]byte{byte(i)})
+		if err != nil {
+			panic(err)
+		}
+	}
+	err = w.Close()
+	if err != nil {
+		panic(err)
+	}
+	err = f.Close()
+	if err != nil {
+		panic(err)
 	}
-	w.Close()
-	f.Close()
 	curAddr := make(chan string, 1)
 	curAddr <- fmt.Sprintf(":%d", p)
 	c := master.NewClient(curAddr, 10)
-	c.SetDataset([]string{path})
+	err = c.SetDataset([]string{path})
+	if err != nil {
+		panic(err)
+	}
 	for pass := 0; pass < 50; pass++ {
 		received := make(map[byte]bool)
 		for i := 0; i < total; i++ {

--- a/go/master/etcd_client.go
+++ b/go/master/etcd_client.go
@@ -30,7 +30,7 @@ type EtcdClient struct {
 // NewEtcdClient creates a new EtcdClient.
 func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePath string, ttlSec int) (*EtcdClient, error) {
 	log.Debugf("Connecting to etcd at %v", endpoints)
-	// TODO(helin): gracefully shutdown etcd store. Becuase etcd
+	// TODO(helin): gracefully shutdown etcd store. Because etcd
 	// store holds a etcd lock, even though the lock will expire
 	// when the lease timeout, we need to implement graceful
 	// shutdown to release the lock.
@@ -60,7 +60,7 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat
 	}
 	log.Debugf("Successfully acquired lock at %s.", lockPath)
-	put := clientv3.OpPut(addrPath, string(addr))
+	put := clientv3.OpPut(addrPath, addr)
 	resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit()
 	if err != nil {
 		return nil, err

--- a/go/master/inmem_store.go
+++ b/go/master/inmem_store.go
@@ -4,7 +4,7 @@ import "sync"
 // InMemStore is an in memory implementation of Store interface.
 //
-// It does not tolerate the fault that casues the program to crash.
+// It does not tolerate the fault that causes the program to crash.
 type InMemStore struct {
 	mu  sync.Mutex
 	buf []byte

--- a/go/master/service.go
+++ b/go/master/service.go
@@ -160,7 +160,7 @@ func (s *Service) recover() (bool, error) {
 // snapshot *must* be called with s.mu being held.
 func (s *Service) snapshot() error {
-	// TOOD(helin): etcd request has a size limit, so the snapshot
+	// TODO(helin): etcd request has a size limit, so the snapshot
 	// size is limited by the max request size. We should either
 	// divide the snapshot into smaller chunks and save under
 	// different keys, or configure the request size to be big
@@ -289,7 +289,6 @@ func (s *Service) processFailedTask(t taskEntry, epoch int) {
 	log.Warningf("Task %v failed %d times, discard.", t.Task, t.NumFailure)
 	s.taskQueues.Todo = append(s.taskQueues.Todo, t)
-	return
 }
 func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {

--- a/go/pserver/client/c/cclient.go
+++ b/go/pserver/client/c/cclient.go
@@ -34,7 +34,6 @@ import (
 	log "github.com/sirupsen/logrus"
 )
-var nullPtr = unsafe.Pointer(uintptr(0))
 var mu sync.Mutex
 var handleMap = make(map[C.paddle_pserver_client]*client.Client)
 var curHandle C.paddle_pserver_client
@@ -63,7 +62,7 @@ func remove(client C.paddle_pserver_client) *client.Client {
 }
 func cArrayToSlice(p unsafe.Pointer, len int) []byte {
-	if p == nullPtr {
+	if p == nil {
 		return nil
 	}
@@ -101,11 +100,11 @@ func paddle_new_pserver_client(addrs *C.char, selected int) C.paddle_pserver_cli
 }
 //export paddle_new_etcd_pserver_client
-func paddle_new_etcd_pserver_client(etcd_endpoints *C.char, selected int) C.paddle_pserver_client {
+func paddle_new_etcd_pserver_client(etcdEndpoints *C.char, selected int) C.paddle_pserver_client {
 	// TODO(Longfei: use etcd lock to decide which trainer to initialize the parameters)
-	addr := C.GoString(etcd_endpoints)
+	addr := C.GoString(etcdEndpoints)
-	etcd_client := client.NewEtcd(addr)
+	etcdClient := client.NewEtcd(addr)
-	c := client.NewClient(etcd_client, etcd_client.Desired(), selector(selected != 0))
+	c := client.NewClient(etcdClient, etcdClient.Desired(), selector(selected != 0))
 	return add(c)
 }
@@ -124,20 +123,20 @@ func paddle_begin_init_params(client C.paddle_pserver_client) C.int {
 }
 //export paddle_init_param
-func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, param_config unsafe.Pointer, config_len C.int) C.int {
+func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, paramConfig unsafe.Pointer, configLen C.int) C.int {
 	et := pserver.ElementType(param.element_type)
 	name := C.GoString(param.name)
 	content := cArrayToSlice(unsafe.Pointer(param.content), int(param.content_len))
 	pc := pserver.ParameterWithConfig{
 		Param:  pserver.Parameter{Name: name, ElementType: et, Content: content},
-		Config: cArrayToSlice(param_config, int(config_len)),
+		Config: cArrayToSlice(paramConfig, int(configLen)),
 	}
 	c := get(client)
 	err := c.InitParam(pc)
 	if err != nil {
 		if err.Error() == pserver.AlreadyInitialized {
-			log.Warningf("parameter %s already initialized, treat paddle_init_param as sucessful.", name)
+			log.Warningf("parameter %s already initialized, treat paddle_init_param as successful.", name)
 			return C.PSERVER_OK
 		}
 		log.Errorln(err)
@@ -153,7 +152,7 @@ func paddle_finish_init_params(client C.paddle_pserver_client) C.int {
 	err := c.FinishInitParams()
 	if err != nil {
 		if err.Error() == pserver.AlreadyInitialized {
-			log.Warningln("parameters already initialized, treat paddle_finish_init_params as sucessful.")
+			log.Warningln("parameters already initialized, treat paddle_finish_init_params as successful.")
 			return C.PSERVER_OK
 		}
@@ -223,12 +222,12 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter,
 		p := ps[i]
 		param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))
-		if unsafe.Pointer(param) == nullPtr {
+		if unsafe.Pointer(param) == nil {
 			log.Errorln("must pre-allocate parameter.")
 			return C.PSERVER_ERROR
 		}
-		if unsafe.Pointer(param.content) != nullPtr {
+		if unsafe.Pointer(param.content) != nil {
 			if int(param.content_len) != len(p.Content) {
 				log.Errorf("the pre-allocated content len does not match parameter content len. Pre-allocated len: %d, returned len: %d", param.content_len, len(p.Content))
 				return C.PSERVER_ERROR

--- a/go/pserver/client/client.go
+++ b/go/pserver/client/client.go
@@ -233,7 +233,7 @@ func (c *Client) Save(path string) error {
 func strHash(s string) uint32 {
 	h := fnv.New32a()
-	h.Write([]byte(s))
+	_, _ = h.Write([]byte(s))
 	return h.Sum32()
 }

--- a/go/pserver/client/client_test.go
+++ b/go/pserver/client/client_test.go
@@ -79,15 +79,33 @@ func initEtcdClient() {
 		log.Errorf("err %v", err)
 	}
 	ctx, cancel := context.WithTimeout(context.Background(), timeout)
-	client.Delete(ctx, pserver.PsDesired)
+	_, err = client.Delete(ctx, pserver.PsDesired)
-	client.Delete(ctx, pserver.PsPath)
+	if err != nil {
-	client.Put(ctx, pserver.PsDesired, strconv.Itoa(numPserver))
+		panic(err)
+	}
+	_, err = client.Delete(ctx, pserver.PsPath)
+	if err != nil {
+		panic(err)
+	}
+	_, err = client.Put(ctx, pserver.PsDesired, strconv.Itoa(numPserver))
+	if err != nil {
+		panic(err)
+	}
 	ports := initClient()
 	for i := 0; i < numPserver; i++ {
-		client.Put(ctx, pserver.PsPath+strconv.Itoa(i), ":"+strconv.Itoa(ports[i]))
+		_, err = client.Put(ctx, pserver.PsPath+strconv.Itoa(i), ":"+strconv.Itoa(ports[i]))
+		if err != nil {
+			panic(err)
+		}
 	}
 	cancel()
-	client.Close()
+	err = client.Close()
+	if err != nil {
+		panic(err)
+	}
 }
 type selector bool

--- a/go/pserver/client/etcd_client.go
+++ b/go/pserver/client/etcd_client.go
@@ -12,8 +12,7 @@ import (
 )
 const (
-	// DefaultEtcdTimeout is the default etcd timeout
+	defaultEtcdTimeout time.Duration = 5 * time.Second
-	DefaultEtcdTimeout time.Duration = 5 * time.Second
 )
 // EtcdClient is used by pserver client that is a part of trainer process.
@@ -48,7 +47,7 @@ func (p *EtcdClient) Desired() int {
 		psDesired, err = strconv.Atoi(string(resp.Kvs[0].Value))
 		if err != nil {
-			log.Errorf("psDesired %s invalid %v", psDesired, err)
+			log.Errorf("psDesired %d invalid %v", psDesired, err)
 			time.Sleep(p.timeout)
 			continue
 		}
@@ -67,12 +66,12 @@ func (p *EtcdClient) List() []Server {
 	for {
 		for i := 0; i < psDesired; i++ {
 			ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
+			cancel()
 			psKey := pserver.PsPath + strconv.Itoa(i)
 			log.Debugf("checking %s", psKey)
 			resp, err := p.client.Get(ctx, psKey)
-			cancel()
 			if err != nil {
-				log.Infof("Get psKey=%s error, %v", psKey, err)
+				log.Infof("Get psKey= %s error, %v", psKey, err)
 				time.Sleep(p.timeout)
 				continue
 			}
@@ -107,11 +106,11 @@ func NewEtcd(endpoints string) *EtcdClient {
 	for {
 		cli, err = clientv3.New(clientv3.Config{
 			Endpoints:   ep,
-			DialTimeout: DefaultEtcdTimeout,
+			DialTimeout: defaultEtcdTimeout,
 		})
 		if err != nil {
 			log.Errorf("Init etcd connection failed: %v", err)
-			time.Sleep(DefaultEtcdTimeout)
+			time.Sleep(defaultEtcdTimeout)
 			continue
 		}
 		break
@@ -119,7 +118,7 @@ func NewEtcd(endpoints string) *EtcdClient {
 	log.Infof("Connected to etcd: %s\n", endpoints)
 	client := &EtcdClient{
 		client:    cli,
-		timeout:   DefaultEtcdTimeout,
+		timeout:   defaultEtcdTimeout,
 		endpoints: ep,
 	}
 	return client

--- a/go/pserver/etcd_client.go
+++ b/go/pserver/etcd_client.go
@@ -177,10 +177,10 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, er
 				break
 			}
 		}
-		if registered == true {
+		if registered {
 			return nil
 		}
-		return errors.New("not registerd, may due to already have enough pservers")
+		return errors.New("not registered, may due to already have enough pservers")
 	}, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads))
 	if err != nil {
@@ -211,8 +211,5 @@ func (e *EtcdClient) PutKey(key string, value []byte, timeout time.Duration) err
 	ctx, cancel := context.WithTimeout(context.Background(), timeout)
 	_, err := e.etcdClient.Put(ctx, key, string(value))
 	cancel()
-	if err != nil {
+	return err
-		return err
-	}
-	return nil
 }
--- a/go/pserver/optimizer.go
+++ b/go/pserver/optimizer.go
@@ -14,8 +14,6 @@ import (
 	log "github.com/sirupsen/logrus"
 )
-var nullPtr = unsafe.Pointer(uintptr(0))
 type optimizer struct {
 	opt         *C.struct_paddle_optimizer
 	elementType ElementType
@@ -23,7 +21,7 @@ type optimizer struct {
 }
 func cArrayToSlice(p unsafe.Pointer, len int) []byte {
-	if p == nullPtr {
+	if p == nil {
 		return nil
 	}
@@ -92,8 +90,8 @@ func (o *optimizer) UpdateParameter(g Gradient) error {
 }
 func (o *optimizer) Cleanup() {
-	if unsafe.Pointer(o.opt) != nullPtr {
+	if unsafe.Pointer(o.opt) != nil {
 		C.paddle_release_optimizer(o.opt)
-		o.opt = (*C.struct_paddle_optimizer)(nullPtr)
+		o.opt = (*C.struct_paddle_optimizer)(nil)
 	}
 }
--- a/go/pserver/service.go
+++ b/go/pserver/service.go
@@ -211,7 +211,7 @@ func (s *Service) GetParam(name string, parameter *Parameter) error {
 	// learning optimization methods are stochastic in
 	// nature. This race condition is allowed deliberately
 	// to save the program from making a copy of the
-	// paramter content.
+	// parameter content.
 	parameter.Name = name
 	parameter.ElementType = opt.elementType
 	parameter.Content = opt.GetWeights()
@@ -219,7 +219,7 @@ func (s *Service) GetParam(name string, parameter *Parameter) error {
 }
 // pserver save checkpoint
-func (s *Service) doCheckpoint() error {
+func (s *Service) doCheckpoint() (err error) {
 	<-s.initialized
 	s.mu.Lock()
 	defer s.mu.Unlock()
@@ -237,9 +237,9 @@ func (s *Service) doCheckpoint() error {
 	}
 	var buf bytes.Buffer
 	encoder := gob.NewEncoder(&buf)
-	err := encoder.Encode(cp)
+	err = encoder.Encode(cp)
 	if err != nil {
-		return err
+		return
 	}
 	cpMeta := checkpointMeta{}
@@ -248,10 +248,14 @@ func (s *Service) doCheckpoint() error {
 	h := md5.New()
 	cpMeta.MD5 = hex.EncodeToString(h.Sum(buf.Bytes()))
-	cpMetajson, _ := json.Marshal(cpMeta)
+	cpMetajson, err := json.Marshal(cpMeta)
+	if err != nil {
+		return
+	}
 	err = s.client.PutKey(filepath.Join(PsCheckpoint, strconv.Itoa(s.idx)), cpMetajson, 3*time.Second)
 	if err != nil {
-		return err
+		return
 	}
 	if _, err = os.Stat(cpMeta.UUID); os.IsNotExist(err) {
 		log.Info("checkpoint does not exists.")
@@ -264,15 +268,32 @@ func (s *Service) doCheckpoint() error {
 		}
 	}
 	f, err := os.Create(cpMeta.UUID)
-	defer f.Close()
 	if err != nil {
-		return err
+		return
 	}
+	defer func() {
+		closeErr := f.Close()
+		if closeErr != nil {
+			if err != nil {
+				log.Errorln(closeErr)
+			} else {
+				// Set closeErr as return value.
+				err = closeErr
+			}
+		}
+	}()
 	writer := bufio.NewWriter(f)
 	_, err = writer.Write(buf.Bytes())
-	writer.Flush()
 	if err != nil {
-		return err
+		return
 	}
-	return nil
+	err = writer.Flush()
+	if err != nil {
+		return
+	}
+	return
 }
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
-# ddim lib
+cc_library(enforce SRCS enforce.cc DEPS glog)
+cc_test(enforce_test SRCS enforce_test.cc DEPS enforce)
 cc_library(ddim SRCS ddim.cc)
 cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
 nv_test(dim_test SRCS dim_test.cu DEPS ddim)
-cc_test(tensor_test SRCS tensor_test.cc DEPS ddim glog gflags)
+cc_library(tensor SRCS tensor.cc DEPS ddim place enforce paddle_memory)
+cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
 cc_test(variable_test SRCS variable_test.cc)
 cc_test(scope_test SRCS scope_test.cc)
-cc_library(enforce SRCS enforce.cc DEPS glog gflags)
-cc_test(enforce_test SRCS enforce_test.cc DEPS enforce)
 proto_library(attr_type SRCS attr_type.proto)
 proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
 cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
 proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
 cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
-cc_library(operator SRCS operator.cc DEPS op_desc device_context enforce)
+cc_library(operator SRCS operator.cc DEPS op_desc device_context tensor)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc enforce)

--- a/paddle/framework/ddim.cc
+++ b/paddle/framework/ddim.cc
@@ -117,6 +117,8 @@ int DDim::operator[](int idx) const {
  return boost::apply_visitor(DynamicConstIndexer(idx), var);
 }
+ssize_t DDim::size() const { return arity(*this); }
 bool DDim::operator==(DDim d) const {
  if (var.which() != d.getVar().which()) {
    return false;
@@ -278,5 +280,9 @@ std::ostream& operator<<(std::ostream& os, const DDim& ddim) {
  return os;
 }
+DDim::DDim(std::initializer_list<int> init_list) {
+  *this = make_ddim(init_list);
+}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
@@ -29,6 +29,8 @@ struct DDim {
  template <int D>
  explicit DDim(const Dim<D>& in) : var(in) {}
+  /*implicit*/ DDim(std::initializer_list<int> init_list);
  template <int D>
  DDim& operator=(const Dim<D>& in) {
    var = in;
@@ -57,6 +59,8 @@ struct DDim {
  DDim operator+(DDim d) const;
  DDim operator*(DDim d) const;
+  ssize_t size() const;
 };
 /**

--- a/paddle/framework/ddim_test.cc
+++ b/paddle/framework/ddim_test.cc
@@ -49,6 +49,7 @@ TEST(DDim, Equality) {
  // arity of a DDim
  EXPECT_EQ(paddle::framework::arity(ddim), 3);
+  EXPECT_EQ(ddim.size(), 3);
  // product of a DDim
  EXPECT_EQ(paddle::framework::product(vddim), 45);

--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
 #pragma once
 #include <algorithm>
+#include <atomic>
 #include <type_traits>
 #include <unordered_map>
 #include <unordered_set>
@@ -197,6 +198,8 @@ Add a mark to which output is temporary is helpful for future optimization.
 class OpRegistry {
  using OpCreator = std::function<OperatorBase*()>;
+  using VarIndexMap = std::unordered_map<std::string, int>;
+  using VarNameList = std::vector<std::string>;
 public:
  template <typename OpType, typename ProtoMakerType>
@@ -211,24 +214,64 @@ class OpRegistry {
        op_proto.IsInitialized(),
        "Fail to initialize %s's OpProto, because %s is not initialized",
        op_type, op_proto.InitializationErrorString());
+    VarIndexMaps()[op_type].reset(new VarIndexMap());
+    auto& varmap = *VarIndexMaps()[op_type];
+    int idx = 0;
+    for (auto& var : op_proto.inputs()) {
+      varmap[var.name()] = idx++;
+    }
+    idx = 0;
+    for (auto& var : op_proto.outputs()) {
+      varmap[var.name()] = idx++;
+    }
+  }
+  /// Creates an operator of `type`, copies in its inputs, outputs and
+  /// attributes, checks the attributes and calls Init() on the result.
+  /// Fails via PADDLE_ENFORCE when `type` has no registered creator.
+  static OperatorPtr CreateOp(const std::string& type,
+                              const VarNameList& inputs,
+                              const VarNameList& outputs,
+                              const AttributeMap& attrs) {
+    auto op_create_it = creators().find(type);
+    PADDLE_ENFORCE(op_create_it != creators().end(),
+                   "Operator %s cannot be found", type);
+    // Take ownership immediately so the operator is released if any later
+    // step (attribute checking, Init, ...) throws.
+    OperatorPtr op(op_create_it->second());
+    op->type_ = type;
+    op->inputs_ = inputs;
+    op->outputs_ = outputs;
+    op->attrs_ = attrs;
+    op_checkers().at(type).Check(op->attrs_);
+    // Rewrite TMP_VAR_NAME() placeholders among the outputs.
+    GenerateTempVariableName(op.get());
+    // Share the per-type argument index map when one has been registered;
+    // otherwise in_out_idxs_ stays unset.
+    auto var_index_it = VarIndexMaps().find(type);
+    if (var_index_it != VarIndexMaps().end()) {
+      op->in_out_idxs_ = var_index_it->second;
+    }
+    op->Init();
+    return op;
  }
  static OperatorPtr CreateOp(const OpDesc& op_desc) {
-    std::string op_type = op_desc.type();
+    std::vector<std::string> inputs;
-    OperatorPtr op(creators().at(op_type)());
+    inputs.reserve((size_t)op_desc.inputs_size());
-    op->type_ = op_desc.type();
-    op->inputs_.reserve((size_t)op_desc.inputs_size());
    std::copy(op_desc.inputs().begin(), op_desc.inputs().end(),
-              std::back_inserter(op->inputs_));
+              std::back_inserter(inputs));
-    op->outputs_.reserve((size_t)op_desc.outputs_size());
+    std::vector<std::string> outputs;
+    outputs.reserve((size_t)op_desc.outputs_size());
    std::copy(op_desc.outputs().begin(), op_desc.outputs().end(),
-              std::back_inserter(op->outputs_));
+              std::back_inserter(outputs));
+    AttributeMap attrs;
    for (auto& attr : op_desc.attrs()) {
-      op->attrs_[attr.name()] = AttrTypeHelper::GetAttrValue(attr);
+      attrs[attr.name()] = AttrTypeHelper::GetAttrValue(attr);
    }
-    op_checkers().at(op_type).Check(op->attrs_);
-    op->Init();
+    return CreateOp(op_desc.type(), inputs, outputs, attrs);
-    return op;
  }
  static std::unordered_map<std::string, OpProto>& protos() {
@@ -237,6 +280,23 @@ class OpRegistry {
  };
 private:
+  static std::unordered_map<std::string, std::shared_ptr<VarIndexMap>>&
+  VarIndexMaps() {
+    static std::unordered_map<std::string, std::shared_ptr<VarIndexMap>> maps_;
+    return maps_;
+  }
+  static void GenerateTempVariableName(OperatorBase* op) {
+    static std::atomic<size_t> gUniqId(0UL);
+    for (auto& outname : op->outputs_) {
+      if (outname == OperatorBase::TMP_VAR_NAME()) {
+        outname += op->type_;
+        outname += "@";
+        outname += std::to_string(gUniqId.fetch_add(1));
+      }
+    }
+  }
  static std::unordered_map<std::string, OpCreator>& creators() {
    static std::unordered_map<std::string, OpCreator> creators_;
    return creators_;
@@ -278,7 +338,7 @@ class OpRegisterHelper {
 /**
 * Macro to Register OperatorKernel.
 */
-#define REGISTER_OP_KERNEL(type, DEVICE_TYPE, PlaceType, KernelType)      \
+#define REGISTER_OP_KERNEL(type, DEVICE_TYPE, PlaceType, ...)             \
  STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
      __reg_op_kernel_##type##_##DEVICE_TYPE##__,                         \
      "REGISTER_OP_KERNEL must be in global namespace");                  \
@@ -287,17 +347,19 @@ class OpRegisterHelper {
      ::paddle::framework::OperatorWithKernel::OpKernelKey key;           \
      key.place_ = PlaceType();                                           \
      ::paddle::framework::OperatorWithKernel::AllOpKernels()[#type][key] \
-          .reset(new KernelType());                                       \
+          .reset(new __VA_ARGS__());                                      \
    }                                                                     \
  };                                                                      \
  static __op_kernel_register__##type##__ __reg_kernel_##type##__;        \
  int __op_kernel_register_##type##_handle_##DEVICE_TYPE##__() { return 0; }
-#define REGISTER_OP_GPU_KERNEL(type, KernelType) \
+// (type, KernelType)
-  REGISTER_OP_KERNEL(type, GPU, ::paddle::platform::GPUPlace, KernelType)
+#define REGISTER_OP_GPU_KERNEL(type, ...) \
+  REGISTER_OP_KERNEL(type, GPU, ::paddle::platform::GPUPlace, __VA_ARGS__)
-#define REGISTER_OP_CPU_KERNEL(type, KernelType) \
+// (type, KernelType)
-  REGISTER_OP_KERNEL(type, CPU, ::paddle::platform::CPUPlace, KernelType)
+#define REGISTER_OP_CPU_KERNEL(type, ...) \
+  REGISTER_OP_KERNEL(type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
 /**
 * Macro to mark what Operator and Kernel we will use and tell the compiler to

--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -12,30 +12,76 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <algorithm>
 #include "paddle/framework/operator.h"
 namespace paddle {
 namespace framework {
+// Returns the variable name bound to the single-variable input `name`.
+// `name` is resolved through in_out_idxs_; when the "input_format" attribute
+// exists, the resolved index selects an entry of input_format, which gives
+// the position inside inputs_. Otherwise the index is used directly.
+const std::string& OperatorBase::Input(const std::string& name) const {
+  // in_out_idxs_ is only assigned when the registry has an index map for
+  // this operator type, so it may be empty here.
+  PADDLE_ENFORCE(in_out_idxs_ != nullptr,
+                 "in_out_idxs_ is not set, cannot look up input [%s]", name);
+  auto it = in_out_idxs_->find(name);
+  PADDLE_ENFORCE(it != in_out_idxs_->end(), "no key [%s] in in_out_idxs_",
+                 name);
+  if (attrs_.count("input_format") == 0) {
+    return inputs_.at(it->second);
+  } else {
+    const auto& input_format = GetAttr<std::vector<int>>("input_format");
+    int idx = input_format.at(it->second);
+    return inputs_.at(idx);
+  }
+}
+// Returns the variable names bound to the multi-variable input `name`: the
+// half-open slice of inputs_ delimited by two consecutive entries of the
+// "input_format" attribute.
+std::vector<std::string> OperatorBase::Inputs(const std::string& name) const {
+  // in_out_idxs_ may be unset when no index map exists for this type.
+  PADDLE_ENFORCE(in_out_idxs_ != nullptr,
+                 "in_out_idxs_ is not set, cannot look up input [%s]", name);
+  // Bind by reference: the format vector is only read, never modified.
+  const auto& input_format = GetAttr<std::vector<int>>("input_format");
+  auto offset = in_out_idxs_->at(name);
+  return std::vector<std::string>{
+      inputs_.begin() + input_format.at(offset),
+      inputs_.begin() + input_format.at(offset + 1)};
+}
+// Returns the variable name bound to the single-variable output `name`.
+// `name` is resolved through in_out_idxs_; when the "output_format"
+// attribute exists, the resolved index selects an entry of output_format,
+// which gives the position inside outputs_. Otherwise the index is used
+// directly.
+const std::string& OperatorBase::Output(const std::string& name) const {
+  // in_out_idxs_ is only assigned when the registry has an index map for
+  // this operator type, so it may be empty here.
+  PADDLE_ENFORCE(in_out_idxs_ != nullptr,
+                 "in_out_idxs_ is not set, cannot look up output [%s]", name);
+  auto it = in_out_idxs_->find(name);
+  PADDLE_ENFORCE(it != in_out_idxs_->end(), "no key [%s] in in_out_idxs_",
+                 name);
+  if (attrs_.count("output_format") == 0) {
+    return outputs_.at(it->second);
+  } else {
+    const auto& output_format = GetAttr<std::vector<int>>("output_format");
+    int idx = output_format.at(it->second);
+    return outputs_.at(idx);
+  }
+}
+// Returns the variable names bound to the multi-variable output `name`: the
+// half-open slice of outputs_ delimited by two consecutive entries of the
+// "output_format" attribute.
+std::vector<std::string> OperatorBase::Outputs(const std::string& name) const {
+  // in_out_idxs_ may be unset when no index map exists for this type.
+  PADDLE_ENFORCE(in_out_idxs_ != nullptr,
+                 "in_out_idxs_ is not set, cannot look up output [%s]", name);
+  // Bind by reference: the format vector is only read, never modified.
+  const auto& output_format = GetAttr<std::vector<int>>("output_format");
+  auto offset = in_out_idxs_->at(name);
+  return std::vector<std::string>{
+      outputs_.begin() + output_format.at(offset),
+      outputs_.begin() + output_format.at(offset + 1)};
+}
 std::string OperatorBase::DebugString() const {
  std::stringstream ss;
-  ss << "=================\n";
+  ss << "Op(" << type_ << "), inputs:(";
-  ss << "type = " << type_ << "\n";
+  for (size_t i = 0; i < inputs_.size(); ++i) {
-  ss << "inputs = [";
+    ss << inputs_[i];
-  for (auto& ipt : inputs_) {
+    if (i != inputs_.size() - 1) {
-    ss << ipt << ", ";
+      ss << ", ";
-  }
+    }
-  ss << "]\n";
-  ss << "outputs = [";
-  for (auto& opt : outputs_) {
-    ss << opt << ", ";
  }
-  ss << "]\n";
+  ss << "), outputs:(";
-  ss << "attr_keys = [";
+  for (size_t i = 0; i < outputs_.size(); ++i) {
-  for (auto& attr : attrs_) {
+    ss << outputs_[i];
-    ss << attr.first << ", ";
+    if (i != outputs_.size() - 1) {
+      ss << ", ";
+    }
  }
-  ss << "]\n";
+  ss << ").";
  return ss.str();
 }

--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -14,18 +14,20 @@ limitations under the License. */
 #pragma once
-#include <paddle/framework/attr_checker.h>
-#include <paddle/framework/op_desc.pb.h>
-#include <paddle/framework/scope.h>
-#include <paddle/framework/tensor.h>
-#include <paddle/platform/device_context.h>
-#include <paddle/platform/place.h>
-#include <paddle/utils/Error.h>
 #include <boost/variant.hpp>
 #include <string>
 #include <unordered_map>
 #include <vector>
+#include "paddle/framework/attr_checker.h"
+#include "paddle/framework/op_desc.pb.h"
+#include "paddle/framework/op_proto.pb.h"
+#include "paddle/framework/scope.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/device_context.h"
+#include "paddle/platform/place.h"
+#include "paddle/utils/Error.h"
 namespace paddle {
 namespace framework {
@@ -39,6 +41,13 @@ using OperatorPtr = std::shared_ptr<OperatorBase>;
 */
 class OperatorBase {
 public:
+  /// If a variable is an empty variable, that name will be used.
+  static std::string EMPTY_VAR_NAME() { return "@EMPTY@"; }
+  /// If a variable is a temporary variable, that name will be set in Python,
+  /// but it will be converted to a unique name in scope after OpCreator.
+  static std::string TMP_VAR_NAME() { return "@TEMP@"; }
  virtual ~OperatorBase() {}
  template <typename T>
@@ -62,11 +71,69 @@ class OperatorBase {
  virtual void Run(const ScopePtr& scope,
                   const platform::DeviceContext& dev_ctx) const = 0;
+  // Get a input with argument's name described in `op_proto`
+  const std::string& Input(const std::string& name) const;
+  // Get a input which has multiple variables.
+  // TODO add a vector_view to prevent memory copy.
+  std::vector<std::string> Inputs(const std::string& name) const;
+  // Get a output with argument's name described in `op_proto`
+  const std::string& Output(const std::string& name) const;
+  // Get an output which has multiple variables.
+  // TODO add a vector_view to prevent memory copy.
+  std::vector<std::string> Outputs(const std::string& name) const;
 public:
  std::string type_;
  std::vector<std::string> inputs_;
  std::vector<std::string> outputs_;
  AttributeMap attrs_;
+  // store the arguments' offset described in op_desc.
+  std::shared_ptr<std::unordered_map<std::string, int>> in_out_idxs_;
+};
+// KernelContext is the argument handed to OpKernel::Compute. It pairs an
+// operator with the scope and device context of one run, and resolves the
+// operator's input/output variable names to Variables through that scope.
+//
+// NOTE: all three members are references (scope_ refers to the caller's
+// shared_ptr), so a KernelContext must not outlive the objects it was
+// constructed from.
+class KernelContext {
+ public:
+  // `op` is dereferenced immediately, so it must not be null.
+  KernelContext(const OperatorBase* op, const std::shared_ptr<Scope>& scope,
+                const platform::DeviceContext& device_context)
+      : op_(*op), scope_(scope), device_context_(device_context) {}
+  // Looks up, in the scope, the variable named by the operator's index-th
+  // input. `index` is not range-checked.
+  const Variable* Input(int index) const {
+    return scope_->GetVariable(op_.inputs_[index]);
+  }
+  // Looks up, in the scope, the variable named by the operator's index-th
+  // output. `index` is not range-checked.
+  Variable* Output(int index) const {
+    return scope_->GetVariable(op_.outputs_[index]);
+  }
+  // Looks up the variable bound to the single-variable input argument `name`.
+  const Variable* Input(const std::string& name) const {
+    return scope_->GetVariable(op_.Input(name));
+  }
+  // Looks up the variable bound to the single-variable output argument `name`.
+  const Variable* Output(const std::string& name) const {
+    return scope_->GetVariable(op_.Output(name));
+  }
+  // Resolves every variable name of the multi-variable input argument `name`.
+  // The result holds one entry per name, in the order OperatorBase::Inputs
+  // returns them.
+  const std::vector<const Variable*> Inputs(const std::string& name) const {
+    auto names = op_.Inputs(name);
+    std::vector<const Variable*> res;
+    // `res` starts empty, so entries have to be appended; writing through
+    // res.begin() would run past the end of the vector.
+    res.reserve(names.size());
+    for (const auto& var_name : names) {
+      res.push_back(scope_->GetVariable(var_name));
+    }
+    return res;
+  }
+  // Resolves every variable name of the multi-variable output argument
+  // `name`. The result holds one entry per name, in the order
+  // OperatorBase::Outputs returns them.
+  const std::vector<const Variable*> Outputs(const std::string& name) const {
+    auto names = op_.Outputs(name);
+    std::vector<const Variable*> res;
+    // Append for the same reason as in Inputs(): `res` has no elements yet.
+    res.reserve(names.size());
+    for (const auto& var_name : names) {
+      res.push_back(scope_->GetVariable(var_name));
+    }
+    return res;
+  }
+  const OperatorBase& op_;
+  const std::shared_ptr<Scope>& scope_;
+  const platform::DeviceContext& device_context_;
 };
 class OpKernel {
@@ -77,25 +144,6 @@ class OpKernel {
   * device resource such as CUDA stream, cublas handle, etc. from
   * KernelContext. User should construct it before run the Operator.
   */
-  class KernelContext {
-   public:
-    KernelContext(const OperatorBase* op, const ScopePtr& scope,
-                  const platform::DeviceContext& device_context)
-        : op_(*op), scope_(scope), device_context_(device_context) {}
-    const Variable* Input(int index) const {
-      return scope_->GetVariable(op_.inputs_[index]);
-    }
-    Variable* Output(int index) const {
-      return scope_->GetVariable(op_.outputs_[index]);
-    }
-    const OperatorBase& op_;
-    const ScopePtr& scope_;
-    const platform::DeviceContext& device_context_;
-  };
  virtual void Compute(const KernelContext& context) const = 0;
  virtual ~OpKernel() {}
@@ -140,7 +188,7 @@ class OperatorWithKernel : public OperatorBase {
  void Run(const ScopePtr& scope,
           const platform::DeviceContext& dev_ctx) const final {
    auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
-    opKernel->Compute(OpKernel::KernelContext(this, scope, dev_ctx));
+    opKernel->Compute(KernelContext(this, scope, dev_ctx));
  }
  static std::unordered_map<std::string /* op_type */, OpKernelMap>&
@@ -148,6 +196,7 @@ class OperatorWithKernel : public OperatorBase {
    static std::unordered_map<std::string, OpKernelMap> g_all_op_kernels;
    return g_all_op_kernels;
  }
  void InferShape(const std::shared_ptr<Scope>& scope) const final {
    std::vector<const Tensor*> ins;
    VarNamesToTensors(scope, inputs_, &ins);

--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -30,7 +30,6 @@ class OpWithoutKernelTest : public OperatorBase {
    op_run_num++;
    ASSERT_EQ((int)inputs_.size(), 1);
    ASSERT_EQ((int)outputs_.size(), 1);
-    ASSERT_NEAR(GetAttr<float>("scale"), 3.14, 1e-5);
    ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr);
    ASSERT_EQ(x, 1);
    ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr);
@@ -86,9 +85,11 @@ class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 public:
  OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("input", "input of test op");
+    AddInput("x", "input of test op");
-    AddOutput("output", "output of test op");
+    AddOutput("y", "output of test op");
-    AddAttr<float>("scale", "scale of cosine op");
+    AddAttr<float>("scale", "scale of cosine op")
+        .SetDefault(1.0)
+        .LargerThan(0.0);
    AddComment("This is test op");
  }
 };
@@ -101,13 +102,68 @@ class OpWithKernelTest : public OperatorWithKernel {
                  const std::vector<Tensor*>& outputs) const override {}
 };
+template <typename T1, typename T2>
 class CPUKernelTest : public OpKernel {
 public:
-  void Compute(const KernelContext& context) const {
+  void Compute(const KernelContext& ctx) const {
+    std::cout << "this is cpu kernel" << std::endl;
+    std::cout << ctx.op_.DebugString() << std::endl;
    cpu_kernel_run_num++;
-    ASSERT_EQ((int)context.op_.inputs_.size(), 1);
+    ASSERT_EQ(ctx.op_.Input("x"), "IN1");
-    ASSERT_EQ((int)context.op_.outputs_.size(), 1);
+    ASSERT_EQ(ctx.op_.Output("y"), "OUT1");
-    ASSERT_NEAR(context.op_.GetAttr<float>("scale"), 3.14, 1e-5);
+  }
+};
+// multiple inputs test
+// Test operator that overrides Init/InferShape/Run directly on OperatorBase.
+// Run() asserts on scope lookups of its first input/output name and on the
+// name-based Input() accessor.
+class OperatorMultiInputsTest : public OperatorBase {
+ public:
+  // Sets the marker value that Run() asserts on.
+  void Init() override { x = 1; }
+  // No-op: this test operator performs no shape inference.
+  void InferShape(const std::shared_ptr<Scope>& scope) const override {}
+  void Run(const std::shared_ptr<Scope>& scope,
+           const platform::DeviceContext& dev_ctx) const override {
+    // The first input name is expected to be absent from the scope, while
+    // the first output name is expected to be present.
+    ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr);
+    ASSERT_EQ(x, 1);
+    ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr);
+    ASSERT_EQ(Input("x"), "IN1");
+    // NOTE(review): "y" is fetched through Input() but compared against an
+    // output-style name; presumably Output("y") was intended -- confirm.
+    ASSERT_EQ(Input("y"), "OUT1");
+  }
+ public:
+  // Marker written by Init(); starts at 0.
+  float x = 0;
+};
+// Proto/checker maker for the multi-input test op. It declares, in this
+// order, a multi-variable input "xs", a single input "k", a multi-variable
+// output "ys" and a float attribute "scale".
+class OpKernelTestMultiInputsProtoAndCheckerMaker
+    : public OpProtoAndCheckerMaker {
+ public:
+  OpKernelTestMultiInputsProtoAndCheckerMaker(OpProto* proto,
+                                              OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInputs("xs", "inputs of test op");
+    AddInput("k", "input of test op");
+    AddOutputs("ys", "outputs of test op");
+    // "scale" gets a default of 1.0 and a LargerThan(0.0) constraint.
+    AddAttr<float>("scale", "scale of cosine op")
+        .SetDefault(1.0)
+        .LargerThan(0.0);
+    AddComment("This is test op");
+  }
+};
+// CPU kernel for the multi-input test op. Compute() only verifies that the
+// name-based accessors of the operator return the variable names the test
+// case placed in the OpDesc; it performs no computation.
+class CPUKernalMultiInputsTest : public OpKernel {
+ public:
+  void Compute(const KernelContext& ctx) const {
+    // "xs" is a multi-variable input and must resolve to three names.
+    auto xs = ctx.op_.Inputs("xs");
+    ASSERT_EQ(xs.size(), 3UL);
+    ASSERT_EQ(xs[0], "x0");
+    ASSERT_EQ(xs[1], "x1");
+    ASSERT_EQ(xs[2], "x2");
+    // "k" is a single-variable input.
+    auto k = ctx.op_.Input("k");
+    ASSERT_EQ(k, "k0");
+    // "ys" is a multi-variable output and must resolve to two names.
+    auto ys = ctx.op_.Outputs("ys");
+    ASSERT_EQ(ys.size(), 2UL);
+    ASSERT_EQ(ys[0], "y0");
+    ASSERT_EQ(ys[1], "y1");
  }
 };
@@ -116,8 +172,10 @@ class CPUKernelTest : public OpKernel {
 REGISTER_OP(op_with_kernel, paddle::framework::OpWithKernelTest,
            paddle::framework::OpKernelTestProtoAndCheckerMaker);
-REGISTER_OP_CPU_KERNEL(op_with_kernel, paddle::framework::CPUKernelTest);
+REGISTER_OP_CPU_KERNEL(op_with_kernel,
+                       paddle::framework::CPUKernelTest<float, float>);
+// test with single input
 TEST(OpKernel, all) {
  paddle::framework::OpDesc op_desc;
  op_desc.set_type("op_with_kernel");
@@ -137,3 +195,47 @@ TEST(OpKernel, all) {
  op->Run(scope, cpu_device_context);
  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
 }
+REGISTER_OP(op_multi_inputs_with_kernel, paddle::framework::OpWithKernelTest,
+            paddle::framework::OpKernelTestMultiInputsProtoAndCheckerMaker);
+REGISTER_OP_CPU_KERNEL(op_multi_inputs_with_kernel,
+                       paddle::framework::CPUKernalMultiInputsTest);
+// test with multi inputs
+// Builds an OpDesc with four input names and two output names, describes how
+// they are grouped through the "input_format"/"output_format" attributes,
+// then creates and runs the op so the registered CPU kernel can check the
+// grouping.
+// NOTE(review): the format values are presumably start offsets into the flat
+// input/output lists followed by an end offset -- confirm against OpRegistry.
+TEST(OpKernel, multi_inputs) {
+  using namespace paddle::framework;
+  OpDesc op_desc;
+  op_desc.set_type("op_multi_inputs_with_kernel");
+  // Flat input list: three entries for "xs", then one entry for "k".
+  *op_desc.mutable_inputs()->Add() = "x0";
+  *op_desc.mutable_inputs()->Add() = "x1";
+  *op_desc.mutable_inputs()->Add() = "x2";
+  *op_desc.mutable_inputs()->Add() = "k0";
+  // Flat output list: two entries for "ys".
+  *op_desc.mutable_outputs()->Add() = "y0";
+  *op_desc.mutable_outputs()->Add() = "y1";
+  auto attr = op_desc.mutable_attrs()->Add();
+  attr->set_name("scale");
+  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_f(3.14);
+  auto attr0 = op_desc.mutable_attrs()->Add();
+  attr0->set_name("input_format");
+  attr0->set_type(paddle::framework::AttrType::INTS);
+  auto input_format = attr0->mutable_ints();
+  input_format->Add(0);  // "xs" starts at input 0 (x0)
+  input_format->Add(3);  // "k" starts at input 3 (k0)
+  input_format->Add(4);  // end of the input list
+  auto attr1 = op_desc.mutable_attrs()->Add();
+  attr1->set_name("output_format");
+  attr1->set_type(paddle::framework::AttrType::INTS);
+  auto output_format = attr1->mutable_ints();
+  output_format->Add(0);  // "ys" starts at output 0 (y0)
+  output_format->Add(2);  // end of the output list (one past y1)
+  paddle::platform::CPUDeviceContext cpu_device_context;
+  auto scope = std::make_shared<Scope>();
+  OperatorPtr op(paddle::framework::OpRegistry::CreateOp(op_desc));
+  op->Run(scope, cpu_device_context);
+}
--- a/paddle/framework/tensor.cc
+++ b/paddle/framework/tensor.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <paddle/framework/tensor.h>
+namespace paddle {
+namespace framework {}
+}  // namespace paddle
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -17,19 +17,24 @@ limitations under the License. */
 #include <cstdint>
 #include <cstring>
 #include <memory>
+#include <typeindex>
 #include "paddle/framework/ddim.h"
 #include "paddle/framework/enforce.h"
 #include "paddle/memory/memory.h"
 #include "paddle/platform/place.h"
 namespace paddle {
+namespace pybind {
+namespace details {  // forward declare
+template <bool less, size_t i, typename... args>
+struct CastToPyBufferImpl;
+}  // namespace details
+}  // namespace pybind
 namespace framework {
 class Tensor {
 public:
-  Tensor() : numel_(0), offset_(0) {}
+  Tensor() : offset_(0) {}
-  Tensor& operator=(const Tensor& src) = delete;
  template <typename T>
  const T* data() const {
@@ -39,21 +44,33 @@ class Tensor {
  }
  template <typename T>
-  T* mutable_data(DDim dims, paddle::platform::Place place) {
+  T* mutable_data(DDim dims, platform::Place place) {
    set_dims(dims);
    return mutable_data<T>(place);
  }
  template <typename T>
-  T* mutable_data(paddle::platform::Place place) {
+  T* mutable_data(platform::Place place) {
-    PADDLE_ENFORCE(numel_ > 0,
+    PADDLE_ENFORCE(product(dims_) > 0,
-                   "Tensor::numel_ must be larger than zero to call "
+                   "Tensor's numel must be larger than zero to call "
                   "Tensor::mutable_data. Call Tensor::set_dim first.");
    if (holder_ == nullptr ||
        !(holder_->place() ==
          place) /* some versions of boost::variant don't have operator!= */
-        || holder_->size() < numel_ * sizeof(T) + offset_) {
+        || holder_->size() < product(dims_) * sizeof(T) + offset_) {
-      holder_.reset(new PlaceholderImpl<T>(place, numel_ * sizeof(T)));
+      // Allocate a fresh memory block through the place-specific
+      // PlaceholderImpl. Any place that cannot be served must raise:
+      // PADDLE_ENFORCE only fails when its condition is false, so the
+      // unsupported branches pass `false` to fail unconditionally instead of
+      // falling through with no new allocation.
+      if (platform::is_cpu_place(place)) {
+        holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
+            boost::get<platform::CPUPlace>(place), product(dims_) * sizeof(T)));
+      } else if (platform::is_gpu_place(place)) {
+#ifdef __CUDACC__
+        holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
+            boost::get<platform::GPUPlace>(place), product(dims_) * sizeof(T)));
+#else
+        PADDLE_ENFORCE(false,
+                       "'GPUPlace' is not supported in CPU only device.");
+#endif
+      } else {
+        PADDLE_ENFORCE(false, "Unknown 'place'.");
+      }
      offset_ = 0;
    }
    return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
@@ -69,12 +86,12 @@ class Tensor {
  }
  template <typename T>
-  void CopyFrom(const Tensor& src, paddle::platform::Place dst_place) {
+  void CopyFrom(const Tensor& src, platform::Place dst_place) {
    PADDLE_ENFORCE(platform::is_cpu_place(src.holder_->place()) &&
                       platform::is_cpu_place(dst_place),
                   "Tensor::CopyFrom only support CPU now.");
    src.CheckDims<T>();
-    size_t size = src.numel_ * sizeof(T);
+    size_t size = product(src.dims_) * sizeof(T);
    set_dims(src.dims());
    const void* src_ptr = static_cast<const void*>(src.data<T>());
    void* dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
@@ -108,7 +125,6 @@ class Tensor {
      return;
    }
    dims_ = dims;
-    numel_ = product(dims_);
  }
  DDim dims() const { return dims_; }
@@ -119,53 +135,55 @@ class Tensor {
  struct Placeholder {
    virtual ~Placeholder() {}
    virtual void* ptr() const = 0;
-    virtual paddle::platform::Place place() const = 0;
+    virtual platform::Place place() const = 0;
    virtual size_t size() const = 0;
+    virtual std::type_index type() const = 0;
  };
-  template <typename T>
+  template <typename T, typename PlaceType>
  struct PlaceholderImpl : public Placeholder {
   private:
+    template <typename PType>
    class Deleter {
     public:
-      Deleter(platform::Place place) : place_(place) {}
+      Deleter(PType place) : place_(place) {}
-      void operator()(T* ptr) {
+      void operator()(T* ptr) { memory::Free(place_, static_cast<void*>(ptr)); }
-        paddle::memory::Free(place_, static_cast<void*>(ptr));
-      }
     private:
-      paddle::platform::Place place_;
+      PType place_;
    };
   public:
-    PlaceholderImpl(paddle::platform::Place place, size_t size)
+    PlaceholderImpl(PlaceType place, size_t size)
-        : ptr_(static_cast<T*>(paddle::memory::Alloc(place, size)),
+        : ptr_(static_cast<T*>(memory::Alloc(place, size)),
-               Deleter(place)),
+               Deleter<PlaceType>(place)),
          place_(place),
          size_(size) {}
    virtual void* ptr() const { return static_cast<void*>(ptr_.get()); }
    virtual size_t size() const { return size_; }
    virtual paddle::platform::Place place() const { return place_; }
+    virtual std::type_index type() const { return std::type_index(typeid(T)); }
-    std::unique_ptr<T, Deleter> ptr_;
+    std::unique_ptr<T, Deleter<PlaceType>> ptr_;
-    paddle::platform::Place place_;  // record the place of ptr_.
+    platform::Place place_;  // record the place of ptr_.
-    size_t size_;                    // size of the memory block.
+    size_t size_;            // size of the memory block.
  };
  template <typename T>
  inline void CheckDims() const {
    PADDLE_ENFORCE(holder_ != nullptr,
                   "Tenosr holds no memory. Call Tensor::mutable_data first.");
-    PADDLE_ENFORCE(holder_->size() >= numel_ * sizeof(T) + offset_,
+    PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_,
                   "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
                   "first to re-allocate memory.");
  }
  std::shared_ptr<Placeholder> holder_;  // holds the memory block if allocated.
  DDim dims_;
-  size_t numel_;   // cache of `product(dims_)`
  size_t offset_;  // marks the begin of tensor data area.
+  template <bool less, size_t i, typename... args>
+  friend struct paddle::pybind::details::CastToPyBufferImpl;
 };
 }  // namespace framework

--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -47,7 +47,7 @@ TEST(Tensor, DataAssert) {
 /* following tests are not available at present
   because Memory::Alloc() and Memory::Free() have not been ready.
+*/
 TEST(Tensor, MutableData) {
  using namespace paddle::framework;
  using namespace paddle::platform;
@@ -72,7 +72,7 @@ TEST(Tensor, MutableData) {
    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CPUPlace());
    EXPECT_EQ(p1, p2);
  }
+#ifdef __CUDACC__
  {
    Tensor src_tensor;
    float* p1 = nullptr;
@@ -94,6 +94,7 @@ TEST(Tensor, MutableData) {
    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), GPUPlace());
    EXPECT_EQ(p1, p2);
  }
+#endif
 }
 TEST(Tensor, ShareDataFrom) {
@@ -108,9 +109,11 @@ TEST(Tensor, ShareDataFrom) {
      dst_tensor.ShareDataFrom<float>(src_tensor);
    } catch (EnforceNotMet err) {
      caught = true;
-      std::string msg = "Tenosr holds no memory. Call Tensor::mutable_data
+      std::string msg =
-first."; const char* what = err.what(); for (size_t i = 0; i < msg.length();
+          "Tenosr holds no memory. Call Tensor::mutable_data first.";
-++i) { ASSERT_EQ(what[i], msg[i]);
+      const char* what = err.what();
+      for (size_t i = 0; i < msg.length(); ++i) {
+        ASSERT_EQ(what[i], msg[i]);
      }
    }
    ASSERT_TRUE(caught);
@@ -120,6 +123,7 @@ first."; const char* what = err.what(); for (size_t i = 0; i < msg.length();
    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
  }
+#ifdef __CUDACC__
  {
    Tensor src_tensor;
    Tensor dst_tensor;
@@ -127,6 +131,7 @@ first."; const char* what = err.what(); for (size_t i = 0; i < msg.length();
    dst_tensor.ShareDataFrom<int>(src_tensor);
    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
  }
+#endif
 }
 TEST(Tensor, Slice) {
@@ -155,6 +160,7 @@ TEST(Tensor, Slice) {
    EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
  }
+#ifdef __CUDACC__
  {
    Tensor src_tensor;
    src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
@@ -176,6 +182,7 @@ TEST(Tensor, Slice) {
    EXPECT_EQ(slice_data_address, slice_mutable_data_address);
    EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
  }
+#endif
 }
 TEST(Tensor, CopyFrom) {
@@ -203,4 +210,3 @@ TEST(Tensor, CopyFrom) {
    EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
  }
 }
-*/
\ No newline at end of file
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -11,7 +11,6 @@ if(WITH_GPU)
 endif()
 if(USE_NNPACK)
-  include(nnpack/nnpack.cmake)
  list(APPEND cpp_files nnpack/NNPACKConvOp.cpp)
  if(WITH_TESTING)
    add_unittest(NNPACKConvOpTest nnpack/NNPACKConvOpTest.cpp)

--- a/paddle/function/nnpack/NNPACKConvOp.cpp
+++ b/paddle/function/nnpack/NNPACKConvOp.cpp
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/function/ConvOp.h"
 DEFINE_bool(nnpack_allocate_outside,
-            false,
+            true,
            "Allocate and free workspace memory outside the NNPACK interface.");
 DEFINE_int32(nnpack_num_threads,
             0,
@@ -58,18 +58,10 @@ public:
    workspaceBuffer_ = nullptr;
    workspaceSize_ = 0;
-    threadpool_ = nullptr;
+    create_nnpack_threadpool();
-    if (FLAGS_nnpack_num_threads) {
-      threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads);
-      VLOG(3) << "Number of threads "
-              << pthreadpool_get_threads_count(threadpool_);
-    }
  }
  ~NNPACKConvFunction() {
-    if (threadpool_) {
-      pthreadpool_destroy(threadpool_);
-    }
    if (workspaceBuffer_) {
      free(workspaceBuffer_);
    }
@@ -225,14 +217,25 @@ public:
    }
  }
+  // Creates the thread pool stored in the static `threadpool_` member.
+  // Does nothing when FLAGS_nnpack_num_threads is zero or when a pool has
+  // already been created.
+  static void create_nnpack_threadpool() {
+    const bool pool_requested = FLAGS_nnpack_num_threads != 0;
+    if (!pool_requested || threadpool_ != nullptr) {
+      return;
+    }
+    threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads);
+    VLOG(3) << "Number of threads "
+            << pthreadpool_get_threads_count(threadpool_);
+  }
 private:
  nnp_convolution_algorithm algorithm_;
  nnp_convolution_transform_strategy transform_strategy_;
  void* workspaceBuffer_;
  size_t workspaceSize_;
-  pthreadpool_t threadpool_;
+  static pthreadpool_t threadpool_;
 };
+template <DeviceType Device>
+pthreadpool_t NNPACKConvFunction<Device>::threadpool_ = nullptr;
 REGISTER_TYPED_FUNC(NNPACKConv, CPU, NNPACKConvFunction);
 }  // namespace paddle
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
-if(WITH_GPU)
+function(op_library TARGET)
-    nv_library(add_op SRCS add_op.cc add_op.cu DEPS operator op_registry glog ddim)
+    # op_library(<target> SRCS <file>... [DEPS <target>...])
-else()
+    #
-    cc_library(add_op SRCS add_op.cc DEPS operator op_registry glog ddim)
+    # Creates an op library. The interface is the same as cc_library, but the
-endif()
+    # sources are split into CPU (.cc) and GPU (.cu) files and the common op
+    # dependencies (operator, op_registry) are always appended to DEPS.
+    # With WITH_GPU the target is built by nv_library from both source lists;
+    # otherwise cc_library builds it from the .cc files only.
+    #
+    # Example:
+    #   op_library(add_op SRCS add_op.cc add_op.cu)
+    set(cc_srcs)
+    set(cu_srcs)
+    set(op_common_deps operator op_registry)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(op_library "${options}" "${oneValueArgs}"
+            "${multiValueArgs}" ${ARGN})
+    # Sort every source into the CPU or GPU list by extension. The expansion
+    # is quoted so that if() compares the file name itself and never treats it
+    # as the name of another variable.
+    foreach(src ${op_library_SRCS})
+        if ("${src}" MATCHES ".*\\.cu$")
+            list(APPEND cu_srcs "${src}")
+        elseif("${src}" MATCHES ".*\\.cc$")
+            list(APPEND cc_srcs "${src}")
+        else()
+            message(FATAL_ERROR "${TARGET} Source file ${src} should only be .cc or .cu")
+        endif()
+    endforeach()
+    # A CPU implementation is mandatory.
+    list(LENGTH cc_srcs cc_srcs_len)
+    if (${cc_srcs_len} EQUAL 0)
+        message(FATAL_ERROR "The op library ${TARGET} should contains at least one .cc file")
+    endif()
+    # A GPU implementation is optional, but its absence is reported.
+    list(LENGTH cu_srcs cu_srcs_len)
+    if (${cu_srcs_len} EQUAL 0)
+        message(WARNING "The op library ${TARGET} not support GPU!")
+    endif()
+    if (WITH_GPU)
+        nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
+                ${op_common_deps})
+    else()
+        cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS}
+                ${op_common_deps})
+    endif()
+endfunction()
+op_library(add_op SRCS add_op.cc add_op.cu)
 cc_test(add_op_test SRCS add_op_test.cc DEPS add_op)
+op_library(mul_op SRCS mul_op.cc mul_op.cu)
+op_library(rowwise_add_op SRCS rowwise_add_op.cu rowwise_add_op.cc)
+op_library(sigmoid_op SRCS sigmoid_op.cu sigmoid_op.cc)
+op_library(softmax_op SRCS softmax_op.cc softmax_op.cu)
--- a/paddle/operators/add_op.cc
+++ b/paddle/operators/add_op.cc
@@ -31,8 +31,7 @@ protected:
        "Inputs/Outputs of AddOp must all be set");
    PADDLE_ENFORCE(inputs[0]->dims() == inputs[1]->dims(),
                   "Two input of Add Op's dimension must be same.");
-    // Need set dims in Tensor
+    outputs[0]->set_dims(inputs[0]->dims());
-    // outputs[0]->set_dims(inputs[0]->dims())
  }
 };

--- a/paddle/operators/add_op.h
+++ b/paddle/operators/add_op.h
@@ -8,10 +8,10 @@ namespace operators {
 template <typename Place>
 class AddKernel : public framework::OpKernel {
 public:
-  void Compute(const KernelContext &context) const override {
+  void Compute(const framework::KernelContext &context) const override {
    LOG(INFO) << "Add kernel in " << typeid(Place).name();
  }
 };
-}  // namespace op
+}  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <paddle/framework/op_registry.h>
+#include <paddle/framework/tensor.h>
+#include <paddle/operators/mul_op.h>
+namespace paddle {
+namespace operators {
+// Operator definition for "mul". Only shape inference is implemented here:
+// it validates the two matrix inputs and sets the output dims to
+// {rows of input 0, columns of input 1}.
+class MulOp : public framework::OperatorWithKernel {
+protected:
+  // Checks are performed in order, so the first violated condition is the
+  // one reported.
+  void InferShape(
+      const std::vector<const framework::Tensor *> &inputs,
+      const std::vector<framework::Tensor *> &outputs) const override {
+    PADDLE_ENFORCE(inputs.size() == 2, "The mul op must take two inputs");
+    auto dim0 = inputs[0]->dims();
+    auto dim1 = inputs[1]->dims();
+    PADDLE_ENFORCE(dim0.size() == 2 && dim1.size() == 2,
+                   "The input of mul op must be matrix");
+    // Inner dimensions must agree for the product to be defined.
+    PADDLE_ENFORCE(
+        dim0[1] == dim1[0],
+        "First matrix's width must be equal with second matrix's height.");
+    PADDLE_ENFORCE(outputs.size() == 1, "The mul op must take one output");
+    outputs[0]->set_dims({dim0[0], dim1[1]});
+  }
+};
+// Proto/checker maker for "mul": declares inputs "X" and "Y", output "Out"
+// and the op's documentation string. No attributes are declared.
+class MulOpMaker : public framework::OpProtoAndCheckerMaker {
+public:
+  MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The first input of mul op");
+    AddInput("Y", "The second input of mul op");
+    AddOutput("Out", "The output of mul op");
+    AddComment(R"DOC(
+Two Element Mul Operator.
+The equation is: Out = X * Y
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+REGISTER_OP(mul, paddle::operators::MulOp, paddle::operators::MulOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    mul, paddle::operators::MulKernel<paddle::platform::CPUPlace>);
--- a/paddle/operators/mul_op.cu
+++ b/paddle/operators/mul_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <paddle/operators/mul_op.h>
+#include <paddle/framework/op_registry.h>
+REGISTER_OP_GPU_KERNEL(mul,
+                       paddle::operators::MulKernel<paddle::platform
+                       ::GPUPlace>);
\ No newline at end of file
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <glog/logging.h>
+#include <paddle/framework/operator.h>
+namespace paddle {
+namespace operators {
+// Kernel for the "mul" op, parameterised on the place type it is registered
+// for. Compute() currently only logs the place's type name; it reads nothing
+// from `context` and produces no output.
+template <typename Place>
+class MulKernel : public framework::OpKernel {
+public:
+  void Compute(const framework::KernelContext &context) const override {
+    LOG(INFO) << "Mul kernel in " << typeid(Place).name();
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <paddle/framework/op_registry.h>
+#include <paddle/operators/rowwise_add_op.h>
+namespace paddle {
+namespace operators {
+// Operator definition for "rowwise_add". Only shape inference is implemented
+// here: input 0 must be a matrix, input 1 a vector whose length equals the
+// matrix width, and the single output takes the dims of input 0.
+class RowWiseAddOp : public framework::OperatorWithKernel {
+protected:
+  // Checks are performed in order, so the first violated condition is the
+  // one reported.
+  void InferShape(
+      const std::vector<const framework::Tensor *> &inputs,
+      const std::vector<framework::Tensor *> &outputs) const override {
+    PADDLE_ENFORCE(inputs.size() == 2UL, "Two inputs is needed by rowwise add");
+    auto dim0 = inputs[0]->dims();
+    auto dim1 = inputs[1]->dims();
+    PADDLE_ENFORCE(dim0.size() == 2, "Input 0 must be matrix");
+    PADDLE_ENFORCE(dim1.size() == 1, "The second input must be vector");
+    PADDLE_ENFORCE(dim0[1] == dim1[0], "The width of two input must be same");
+    PADDLE_ENFORCE(outputs.size() == 1, "The output size must be 1");
+    outputs[0]->set_dims(inputs[0]->dims());
+  }
+};
+// Proto/checker maker for "rowwise_add": declares inputs "X" and "b", output
+// "Out" and the op's documentation string. No attributes are declared.
+class RowWiseAddOpMaker : public framework::OpProtoAndCheckerMaker {
+public:
+  RowWiseAddOpMaker(framework::OpProto *proto,
+                    framework::OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The left input of row-wise add op, must be matrix");
+    AddInput("b", "The right input of row-wise add op, must be vector");
+    AddOutput("Out", "The output of row-wise add op");
+    AddComment(R"DOC(Row-wise Add operator
+for i in xrange(X.shape[0]):
+  Out = X[i] + b
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+REGISTER_OP(rowwise_add,
+            paddle::operators::RowWiseAddOp,
+            paddle::operators::RowWiseAddOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    rowwise_add,
+    paddle::operators::RowWiseAddKernel<paddle::platform::CPUPlace>);
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
+#include <paddle/framework/op_registry.h>
+#include <paddle/operators/rowwise_add_op.h>
+REGISTER_OP_GPU_KERNEL(
+    rowwise_add,
+    paddle::operators::RowWiseAddKernel<paddle::platform ::GPUPlace>);
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <glog/logging.h>
+#include <paddle/framework/operator.h>
+namespace paddle {
+namespace operators {
+// Kernel for the "rowwise_add" op, parameterised on the place type it is
+// registered for. Compute() currently only logs the place's type name; it
+// reads nothing from `context` and produces no output.
+template <typename Place>
+class RowWiseAddKernel : public framework::OpKernel {
+public:
+  void Compute(const framework::KernelContext &context) const override {
+    LOG(INFO) << "RowWiseAdd kernel in " << typeid(Place).name();
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/sigmoid_op.cc
+++ b/paddle/operators/sigmoid_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <paddle/framework/op_registry.h>
+#include <paddle/operators/sigmoid_op.h>
+namespace paddle {
+namespace operators {
+// Operator definition for sigmoid: checks arity and propagates the shape.
+class SigmoidOp : public framework::OperatorWithKernel {
+protected:
+  // Enforces exactly one input and one output, then gives the output the same
+  // dimensions as the input.
+  void InferShape(
+      const std::vector<const framework::Tensor *> &inputs,
+      const std::vector<framework::Tensor *> &outputs) const override {
+    PADDLE_ENFORCE(inputs.size() == 1, "Sigmoid Op only have one input");
+    PADDLE_ENFORCE(outputs.size() == 1, "Sigmoid Op only have one output");
+    outputs[0]->set_dims(inputs[0]->dims());
+  }
+};
+// Declares the sigmoid operator's proto: one input "X", one output "Y" and a
+// short comment.
+class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
+public:
+  SigmoidOpMaker(framework::OpProto *proto,
+                 framework::OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "sigmoid input");
+    // "Y" is the result, so it has to be registered as an output. Declaring it
+    // with AddInput would give the proto two inputs and no output, which
+    // contradicts SigmoidOp::InferShape (exactly one input, one output).
+    AddOutput("Y", "sigmoid output");
+    AddComment("Sigmoid function");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+REGISTER_OP(sigmoid,
+            paddle::operators::SigmoidOp,
+            paddle::operators::SigmoidOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    sigmoid, paddle::operators::SigmoidKernel<paddle::platform::CPUPlace>);
--- a/paddle/operators/sigmoid_op.cu
+++ b/paddle/operators/sigmoid_op.cu
+#include <paddle/operators/sigmoid_op.h>
+#include <paddle/framework/op_registry.h>
+REGISTER_OP_GPU_KERNEL(
+    sigmoid, paddle::operators::SigmoidKernel<paddle::platform::GPUPlace>);
--- a/paddle/operators/sigmoid_op.h
+++ b/paddle/operators/sigmoid_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <glog/logging.h>
+#include <paddle/framework/operator.h>
+namespace paddle {
+namespace operators {
+// Kernel for the sigmoid operator, parameterized on the place type it is
+// registered for. Compute() currently only logs the place type name; it does
+// not read or write any tensor.
+template <typename Place>
+class SigmoidKernel : public framework::OpKernel {
+public:
+  // Logs which Place instantiation was invoked; `context` is unused.
+  void Compute(const framework::KernelContext &context) const override {
+    LOG(INFO) << "Sigmoid kernel in " << typeid(Place).name();
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include <paddle/framework/op_registry.h>
+#include <paddle/operators/softmax_op.h>
+namespace paddle {
+namespace operators {
+// Operator definition for softmax: checks arity and propagates the shape.
+class SoftmaxOp : public framework::OperatorWithKernel {
+protected:
+  // Enforces exactly one input and one output, then gives the output the same
+  // dimensions as the input.
+  void InferShape(
+      const std::vector<const framework::Tensor *> &inputs,
+      const std::vector<framework::Tensor *> &outputs) const override {
+    PADDLE_ENFORCE(inputs.size() == 1, "Only one input is need for softmax");
+    PADDLE_ENFORCE(outputs.size() == 1, "Only one output is need for softmax");
+    outputs[0]->set_dims(inputs[0]->dims());
+  }
+};
+// Declares the softmax operator's proto: input "X", output "Y" and a short
+// comment.
+class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
+public:
+  SoftmaxOpMaker(framework::OpProto *proto,
+                 framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "input of softmax");
+    AddOutput("Y", "output of softmax");
+    AddComment("Softmax Op");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker);
+REGISTER_OP_CPU_KERNEL(softmax, ops::SoftmaxKernel<paddle::platform::CPUPlace>);
--- a/paddle/operators/softmax_op.cu
+++ b/paddle/operators/softmax_op.cu
+#include <paddle/framework/op_registry.h>
+#include <paddle/operators/softmax_op.h>
+REGISTER_OP_GPU_KERNEL(
+    softmax, paddle::operators::SoftmaxKernel<paddle::platform::GPUPlace>);
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <glog/logging.h>
+#include <paddle/framework/operator.h>
+namespace paddle {
+namespace operators {
+// Kernel for the softmax operator, parameterized on the place type it is
+// registered for. Compute() currently only logs the place type name; it does
+// not read or write any tensor.
+template <typename Place>
+class SoftmaxKernel : public framework::OpKernel {
+public:
+  // Logs which Place instantiation was invoked; `context` is unused.
+  void Compute(const framework::KernelContext &context) const override {
+    LOG(INFO) << "Softmax kernel in " << typeid(Place).name();
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
-cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python add_op)
+cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python
+        add_op mul_op rowwise_add_op sigmoid_op softmax_op)
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -15,6 +15,8 @@ limitations under the License. */
 #include <Python.h>
 #include <paddle/framework/op_registry.h>
 #include <paddle/framework/scope.h>
+#include <paddle/pybind/tensor_bind.h>
+#include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include <fstream>
@@ -24,10 +26,35 @@ namespace py = pybind11;
 namespace pd = paddle::framework;
 USE_OP(add_two);
+USE_OP(softmax);
+USE_OP(mul);
+USE_OP(rowwise_add);
+USE_OP(sigmoid);
 PYBIND11_PLUGIN(core) {
  py::module m("core", "C++ core of Paddle Paddle");
+  py::class_<pd::Tensor>(m, "Tensor", py::buffer_protocol())
+      .def_buffer([](pd::Tensor& self) -> py::buffer_info {
+        return paddle::pybind::CastToPyBuffer(self);
+      })
+      .def("get_dims",
+           [](const pd::Tensor& self) { return pd::vectorize(self.dims()); })
+      .def("set_dims",
+           [](pd::Tensor& self, const std::vector<int>& dim) {
+             self.set_dims(pd::make_ddim(dim));
+           })
+      .def("alloc_float",
+           [](pd::Tensor& self) {
+             self.mutable_data<float>(paddle::platform::CPUPlace());
+           })
+      .def("alloc_int",
+           [](pd::Tensor& self) {
+             self.mutable_data<int>(paddle::platform::CPUPlace());
+           })
+      .def("set", paddle::pybind::PyTensorSetFromArray<float>)
+      .def("set", paddle::pybind::PyTensorSetFromArray<int>);
  py::class_<pd::Variable>(m, "Variable", R"DOC(Variable Class.
 All parameter, weight, gradient are variables in Paddle.
@@ -38,7 +65,12 @@ All parameter, weight, gradient are variables in Paddle.
             *var.GetMutable<int>() = val;
           })
      .def("get_int",
-           [](const pd::Variable& var) -> int { return var.Get<int>(); });
+           [](const pd::Variable& var) -> int { return var.Get<int>(); })
+      .def("get_tensor",
+           [](pd::Variable& self) -> pd::Tensor* {
+             return self.GetMutable<pd::Tensor>();
+           },
+           py::return_value_policy::reference);
  py::class_<pd::Scope, std::shared_ptr<pd::Scope>>(m, "Scope")
      .def(py::init<const std::shared_ptr<pd::Scope>&>())
@@ -63,6 +95,23 @@ All parameter, weight, gradient are variables in Paddle.
    }
    return ret_values;
  });
+  m.def_submodule(
+       "var_names",
+       "The module will return special predefined variable name in Paddle")
+      .def("empty", pd::OperatorBase::EMPTY_VAR_NAME)
+      .def("temp", pd::OperatorBase::TMP_VAR_NAME);
+  py::class_<pd::OperatorBase, pd::OperatorPtr>(m, "Operator")
+      .def("__str__", &pd::OperatorBase::DebugString)
+      .def_static("create", [](const std::string& protobin) {
+        pd::OpDesc desc;
+        PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
+                       "Cannot parse user input to OpDesc");
+        PADDLE_ENFORCE(desc.IsInitialized(),
+                       "User OpDesc is not initialized, reason %s",
+                       desc.InitializationErrorString());
+        return pd::OpRegistry::CreateOp(desc);
+      });
  return m.ptr();
 }
--- a/paddle/pybind/tensor_bind.h
+++ b/paddle/pybind/tensor_bind.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <paddle/framework/tensor.h>
+#include <pybind11/numpy.h>
+#include <pybind11/pybind11.h>
+namespace py = pybind11;
+namespace paddle {
+namespace pybind {
+namespace details {
+// Compile-time dispatcher that converts a Tensor into a py::buffer_info by
+// trying each element type in ARGS in turn. `less` states whether index I is
+// still inside the ARGS list; I is the index of the type currently tried.
+template <bool less, size_t I, typename... ARGS>
+struct CastToPyBufferImpl;
+// Terminal case: I has run past the end of ARGS, i.e. no candidate type
+// matched the tensor.
+template <size_t I, typename... ARGS>
+struct CastToPyBufferImpl<false, I, ARGS...> {
+  py::buffer_info operator()(framework::Tensor &tensor) {
+    PADDLE_THROW("This type of tensor cannot be expose to Python");
+    // Presumably never reached after PADDLE_THROW; it gives the function a
+    // return statement on every path.
+    return py::buffer_info();
+  }
+};
+// Case where index I is inside ARGS. If the tensor's stored element type is
+// the I-th type of ARGS, export the tensor as a buffer of that type; otherwise
+// continue with the next type in the list.
+template <size_t I, typename... ARGS>
+struct CastToPyBufferImpl<true, I, ARGS...> {
+  using CUR_TYPE = typename std::tuple_element<I, std::tuple<ARGS...>>::type;
+  py::buffer_info operator()(framework::Tensor &tensor) {
+    // holder_ is dereferenced below, so reject a null holder with a clear
+    // error instead of crashing the Python process.
+    // NOTE(review): presumably holder_ stays null until memory is allocated
+    // for the tensor -- confirm against framework::Tensor.
+    PADDLE_ENFORCE(tensor.holder_ != nullptr,
+                   "Tensor holds no memory, cannot cast to numpy array");
+    PADDLE_ENFORCE(paddle::platform::is_cpu_place(tensor.holder_->place()),
+                   "Only CPU tensor can cast to numpy array");
+    if (std::type_index(typeid(CUR_TYPE)) == tensor.holder_->type()) {
+      auto dim_vec = framework::vectorize(tensor.dims());
+      std::vector<size_t> dims_outside;
+      std::vector<size_t> strides;
+      dims_outside.resize(dim_vec.size());
+      strides.resize(dim_vec.size());
+      // Walk the dimensions from last to first: `prod` is the number of
+      // elements covered by one step along dimension i - 1, so the stride of
+      // that dimension is `prod` elements expressed in bytes.
+      size_t prod = 1;
+      for (size_t i = dim_vec.size(); i != 0; --i) {
+        dims_outside[i - 1] = (size_t)dim_vec[i - 1];
+        strides[i - 1] = sizeof(CUR_TYPE) * prod;
+        prod *= dims_outside[i - 1];
+      }
+      return py::buffer_info(
+          tensor.mutable_data<CUR_TYPE>(tensor.holder_->place()),
+          sizeof(CUR_TYPE),
+          py::format_descriptor<CUR_TYPE>::format(),
+          (size_t)framework::arity(tensor.dims()),
+          dims_outside,
+          strides);
+    } else {
+      // Type mismatch: recurse on the next candidate, or on the terminal
+      // specialization once the list is exhausted.
+      constexpr bool less = I + 1 < std::tuple_size<std::tuple<ARGS...>>::value;
+      return CastToPyBufferImpl<less, I + 1, ARGS...>()(tensor);
+    }
+  }
+};
+}  // namespace details
+// Builds a Python buffer description of `tensor`, trying float and then int
+// as the element type.
+inline py::buffer_info CastToPyBuffer(framework::Tensor &tensor) {
+  using Caster = details::CastToPyBufferImpl<true, 0, float, int>;
+  return Caster()(tensor);
+}
+// Copies a numpy array of element type T into `self`: the tensor's dims are
+// set to the array's shape, a CPU buffer of type T is obtained through
+// mutable_data, and the array's bytes are memcpy'd into it.
+template <typename T>
+void PyTensorSetFromArray(
+    framework::Tensor &self,
+    py::array_t<T, py::array::c_style | py::array::forcecast> array) {
+  std::vector<int> dims;
+  dims.reserve(array.ndim());
+  for (size_t i = 0; i < array.ndim(); ++i) {
+    dims.push_back((int)array.shape()[i]);
+  }
+  self.set_dims(framework::make_ddim(dims));
+  auto *dst = self.mutable_data<T>(paddle::platform::CPUPlace());
+  // array.size() is the element count, so this copies the whole array.
+  std::memcpy(dst, array.data(), sizeof(T) * array.size());
+}
+}  // namespace pybind
+}  // namespace paddle
--- a/paddle/scripts/docker/build_android.sh
+++ b/paddle/scripts/docker/build_android.sh
@@ -2,9 +2,9 @@
 set -xe
-mkdir -p /paddle/build
+mkdir -p /paddle/build_android
-cd /paddle/build
+cd /paddle/build_android
-rm -f /paddle/install 2>/dev/null || true
+rm -rf /paddle/install 2>/dev/null || true
 cmake -DCMAKE_SYSTEM_NAME=Android \
      -DANDROID_STANDALONE_TOOLCHAIN=$ANDROID_STANDALONE_TOOLCHAIN \
      -DANDROID_ABI=armeabi-v7a \
@@ -21,6 +21,3 @@ cmake -DCMAKE_SYSTEM_NAME=Android \
      ..
 make -j `nproc`
 make install
-export PATH=/paddle/install/bin:/paddle/install/opt/paddle/bin:$PATH
-paddle version
--- a/paddle/scripts/travis/check_style.sh
+++ b/paddle/scripts/travis/check_style.sh
@@ -13,6 +13,11 @@ export PATH=/usr/bin:$PATH
 pre-commit install
 clang-format --version
+# set up go environment for running gometalinter
+mkdir -p $GOPATH/src/github.com/PaddlePaddle/
+ln -sf $TRAVIS_BUILD_DIR $GOPATH/src/github.com/PaddlePaddle/Paddle
+cd  $GOPATH/src/github.com/PaddlePaddle/Paddle/go; glide install; cd -
 if ! pre-commit run -a ; then
  git diff  --exit-code
 fi

--- a/python/paddle/v2/framework/create_op_creation_methods.py
+++ b/python/paddle/v2/framework/create_op_creation_methods.py
 import paddle.v2.framework.core as core
 import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2
+import paddle.v2.framework.proto.op_desc_pb2 as op_desc_pb2
+import paddle.v2.framework.proto.attr_type_pb2 as attr_type_pb2
+import cStringIO
 def get_all_op_protos():
+    """
+    Get all registered op proto from Paddle C++
+    :return: list of OpProto
+    """
    protostrs = core.get_all_op_protos()
    ret_values = []
    for pbstr in protostrs:
        op_proto = op_proto_pb2.OpProto.FromString(str(pbstr))
        ret_values.append(op_proto)
    return ret_values
+class OpDescCreationMethod(object):
+    """
+    A functor object that converts user input (keyword arguments) to an OpDesc
+    based on an OpProto.
+    :param op_proto: The OpProto object.
+    :type op_proto: op_proto_pb2.OpProto
+    """
+    def __init__(self, op_proto):
+        if not isinstance(op_proto, op_proto_pb2.OpProto):
+            raise TypeError("Argument should be OpProto")
+        self.__op_proto__ = op_proto
+    def __call__(self, *args, **kwargs):
+        """
+        Convert user input to OpDesc. Only keyword arguments are supported.
+        :return: OpDesc based on user input
+        :rtype: op_desc_pb2.OpDesc
+        :raises ValueError: if any positional argument is given.
+        :raises NotImplementedError: if an attribute has an unsupported type.
+        """
+        if len(args) != 0:
+            raise ValueError("Only keyword arguments is supported by Paddle")
+        op_desc = op_desc_pb2.OpDesc()
+        # Inputs
+        ipts, ipt_format, _ = OpDescCreationMethod.extract_input_or_output(
+            "input", kwargs, self.__op_proto__.inputs)
+        op_desc.inputs.extend(ipts)
+        if ipt_format is not None:
+            op_desc.attrs.extend([ipt_format])
+        # Outputs
+        outs, out_format, tmp_index = OpDescCreationMethod.extract_input_or_output(
+            "output", kwargs, self.__op_proto__.outputs)
+        op_desc.outputs.extend(outs)
+        if out_format is not None:
+            op_desc.attrs.extend([out_format])
+        # Record which outputs are temporary variables, if there are any.
+        if len(tmp_index) != 0:
+            tmp_index_attr = op_desc.attrs.add()
+            tmp_index_attr.type = attr_type_pb2.INTS
+            tmp_index_attr.name = "temporary_index"
+            tmp_index_attr.ints.extend(tmp_index)
+        # Types
+        op_desc.type = self.__op_proto__.type
+        # Attrs: copy every user-supplied, non-generated attribute into the
+        # field that matches its declared type.
+        for attr in self.__op_proto__.attrs:
+            if attr.generated:
+                continue
+            user_defined_attr = kwargs.get(attr.name, None)
+            if user_defined_attr is not None:
+                new_attr = op_desc.attrs.add()
+                new_attr.name = attr.name
+                new_attr.type = attr.type
+                if attr.type == attr_type_pb2.INT:
+                    new_attr.i = user_defined_attr
+                elif attr.type == attr_type_pb2.FLOAT:
+                    new_attr.f = user_defined_attr
+                elif attr.type == attr_type_pb2.STRING:
+                    new_attr.s = user_defined_attr
+                elif attr.type == attr_type_pb2.INTS:
+                    new_attr.ints.extend(user_defined_attr)
+                elif attr.type == attr_type_pb2.FLOATS:
+                    new_attr.floats.extend(user_defined_attr)
+                elif attr.type == attr_type_pb2.STRINGS:
+                    new_attr.strings.extend(user_defined_attr)
+                else:
+                    # attr.type is an integer enum value, so it has to be
+                    # converted before concatenation; otherwise a TypeError
+                    # would hide the intended NotImplementedError.
+                    raise NotImplementedError("Not support attribute type " +
+                                              str(attr.type))
+        return op_desc
+    @staticmethod
+    def extract_input_or_output(in_out, kwargs, meta):
+        """
+        Extract input variable names or output variable names from keyword
+        arguments, based on VarProtos.
+        :param in_out: "input" or "output"
+        :param kwargs: keyword arguments that the user passed.
+        :param meta: a list of VarProto
+        :return: Three objects: the variable names; the input_format or
+        output_format attribute (None if no input or output is multiple);
+        the temporary variable index list.
+        """
+        multiple = OpDescCreationMethod.any_is_true((m.multiple for m in meta))
+        tmp_index = []
+        retv = []
+        if multiple:
+            # var_format.ints holds cumulative offsets into retv: entry k is
+            # where the names of the k-th VarProto start.
+            var_format = op_desc_pb2.AttrDesc()
+            var_format.type = attr_type_pb2.INTS
+            var_format.name = "%s_format" % in_out
+            var_format.ints.append(0)
+            for var in meta:
+                var_name = var.name
+                if var.temporary:
+                    var_name = [core.var_names.temp()]
+                    tmp_index.append(len(retv))
+                else:
+                    var_name = kwargs.get(var_name, [])
+                if not isinstance(var_name, list):
+                    var_name = [var_name]
+                retv.extend(var_name)
+                var_format.ints.append(len(var_name) + var_format.ints[-1])
+            return retv, var_format, tmp_index
+        else:
+            for var in meta:
+                if var.temporary:
+                    retv.append(kwargs.get(var.name, core.var_names.temp()))
+                    # NOTE(review): this records len(retv) AFTER the append
+                    # (position + 1), while the `multiple` branch records it
+                    # BEFORE extending (position). The existing unit test
+                    # expects the current value, so it is kept as is; confirm
+                    # which convention the consumer of "temporary_index" wants.
+                    tmp_index.append(len(retv))
+                else:
+                    retv.append(kwargs.get(var.name, core.var_names.empty()))
+            return retv, None, tmp_index
+    @staticmethod
+    def any_is_true(generator):
+        """
+        Reduce an iterable of flags to one bool. Return True if any of them is
+        truthy, otherwise False.
+        """
+        return any(generator)
+def get_docstring_from_op_proto(op_proto):
+    """
+    Generate a docstring from an OpProto: the op comment followed by a
+    ``:param``/``:type`` pair for every input, output and attribute.
+    :param op_proto: an OpProto instance.
+    :type op_proto: op_proto_pb2.OpProto
+    :return: docstring
+    :raises TypeError: if op_proto is not an OpProto.
+    :raises RuntimeError: if an attribute has an unsupported type.
+    """
+    if not isinstance(op_proto, op_proto_pb2.OpProto):
+        raise TypeError("Input must be OpProto")
+    f = cStringIO.StringIO()
+    f.write(op_proto.comment)
+    f.write("\n")
+    def __append_param__(name, comment, type_name):
+        # Maybe replace the following line with template engine is better.
+        f.write(":param ")
+        f.write(name)
+        f.write(": ")
+        f.write(comment)
+        f.write("\n")
+        f.write(":type ")
+        f.write(name)
+        f.write(": ")
+        f.write(type_name)
+        f.write("\n")
+    for ipt in op_proto.inputs:
+        __append_param__(ipt.name, ipt.comment, "list | basestr"
+                         if ipt.multiple else "basestr")
+    temp_var_prefix = \
+        "This is a temporary variable. It does not have to set by user. "
+    for opt in op_proto.outputs:
+        __append_param__(opt.name, opt.comment if not opt.temporary else
+                         temp_var_prefix + opt.comment, "list | basestr"
+                         if opt.multiple else "basestr")
+    for attr in op_proto.attrs:
+        attr_type = None
+        if attr.type == attr_type_pb2.INT:
+            attr_type = "int"
+        elif attr.type == attr_type_pb2.FLOAT:
+            attr_type = "float"
+        elif attr.type == attr_type_pb2.STRING:
+            attr_type = "basestr"
+        elif attr.type == attr_type_pb2.INTS:
+            attr_type = "list of int"
+        elif attr.type == attr_type_pb2.FLOATS:
+            attr_type = "list of float"
+        elif attr.type == attr_type_pb2.STRINGS:
+            attr_type = "list of basestr"
+        if attr_type is None:
+            # attr.type is an integer enum value; convert it so that the
+            # intended RuntimeError is raised rather than a TypeError from
+            # str + int concatenation.
+            raise RuntimeError("Not supported attribute type " +
+                               str(attr.type))
+        __append_param__(attr.name, attr.comment, attr_type)
+    return f.getvalue()
+def create_op_creation_method(op_proto):
+    """
+    Build the op creation function for an OpProto.
+    The returned function forwards its arguments to an OpDescCreationMethod,
+    serializes the resulting OpDesc and hands it to core.Operator.create. Its
+    docstring is generated from the OpProto.
+    """
+    desc_builder = OpDescCreationMethod(op_proto)
+    def __impl__(*args, **kwargs):
+        serialized = desc_builder(*args, **kwargs).SerializeToString()
+        return core.Operator.create(serialized)
+    __impl__.__doc__ = get_docstring_from_op_proto(op_proto)
+    return __impl__
+class OpCreationsHolder(object):
+    """
+    An object that holds all op creation methods.
+    Each method is stored as an attribute named after its op type, e.g.
+    `op_creations.add_two`.
+    """
+    pass
+op_creations = OpCreationsHolder()
+def __bootstrap__():
+    """
+    Bootstrap function for this module. It dynamically creates one op creation
+    method per registered OpProto at runtime and attaches it to `op_creations`.
+    """
+    for op_proto in get_all_op_protos():
+        func = create_op_creation_method(op_proto)
+        # The op type doubles as the function name and the attribute name.
+        func.__name__ = str(op_proto.type)
+        setattr(op_creations, func.__name__, func)
+__bootstrap__()
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
 add_python_test(test_framework test_protobuf.py test_scope.py
-    test_default_scope_funcs.py test_op_creation_methods.py)
+    test_default_scope_funcs.py test_op_creation_methods.py
+    test_tensor.py)
--- a/python/paddle/v2/framework/tests/test_op_creation_methods.py
+++ b/python/paddle/v2/framework/tests/test_op_creation_methods.py
 import unittest
 import paddle.v2.framework.create_op_creation_methods as creation
+import paddle.v2.framework.core as core
+import paddle.v2.framework.proto.op_proto_pb2 as op_proto_pb2
+import paddle.v2.framework.proto.op_desc_pb2 as op_desc_pb2
+import paddle.v2.framework.proto.attr_type_pb2 as attr_type_pb2
-class TestOpCreationsMethods(unittest.TestCase):
+class TestGetAllProtos(unittest.TestCase):
-    def test_all_protos(self):
+    def test_all(self):
        all_protos = creation.get_all_op_protos()
        self.assertNotEqual(0, len(all_protos))
@@ -11,5 +15,240 @@ class TestOpCreationsMethods(unittest.TestCase):
            self.assertTrue(each.IsInitialized())
+class TestOpDescCreationMethod(unittest.TestCase):
+    def test_plain_input_output(self):
+        op = op_proto_pb2.OpProto()
+        op.type = "test"
+        ipt = op.inputs.add()
+        ipt.name = "X"
+        ipt.comment = "not matter"
+        ipt = op.inputs.add()
+        ipt.name = "Y"
+        ipt.comment = "not matter"
+        opt = op.outputs.add()
+        opt.name = "Z"
+        opt.comment = "not matter"
+        op.comment = "not matter"
+        self.assertTrue(op.IsInitialized())
+        method = creation.OpDescCreationMethod(op)
+        output = method(X="a", Y="b", Z="c")
+        expected = op_desc_pb2.OpDesc()
+        expected.type = "test"
+        expected.inputs.extend(["a", "b"])
+        expected.outputs.append("c")
+        self.assertEqual(expected, output)
+    def test_multiple_input_plain_output(self):
+        op = op_proto_pb2.OpProto()
+        op.type = "fc"
+        ipt = op.inputs.add()
+        ipt.name = "X"
+        ipt.comment = ""
+        ipt.multiple = True
+        ipt = op.inputs.add()
+        ipt.name = "W"
+        ipt.comment = ""
+        ipt.multiple = True
+        ipt = op.inputs.add()
+        ipt.name = "b"
+        ipt.comment = ""
+        out = op.outputs.add()
+        out.name = "Y"
+        out.comment = ""
+        op.comment = ""
+        self.assertTrue(op.IsInitialized())
+        method = creation.OpDescCreationMethod(op)
+        generated1 = method(X="x", W="w", b="b", Y="y")
+        expected1 = op_desc_pb2.OpDesc()
+        expected1.inputs.extend(['x', 'w', 'b'])
+        expected1.outputs.extend(['y'])
+        expected1.type = 'fc'
+        attr = expected1.attrs.add()
+        attr.name = 'input_format'
+        attr.type = attr_type_pb2.INTS
+        attr.ints.extend([0, 1, 2, 3])
+        self.assertEqual(expected1, generated1)
+        generated2 = method(
+            X=['x1', 'x2', 'x3'], b='b', W=['w1', 'w2', 'w3'], Y='y')
+        expected2 = op_desc_pb2.OpDesc()
+        expected2.inputs.extend(['x1', 'x2', 'x3', 'w1', 'w2', 'w3', 'b'])
+        expected2.outputs.extend(['y'])
+        expected2.type = 'fc'
+        attr = expected2.attrs.add()
+        attr.name = 'input_format'
+        attr.type = attr_type_pb2.INTS
+        attr.ints.extend([0, 3, 6, 7])
+        self.assertEqual(expected2, generated2)
+    def test_attrs(self):
+        op = op_proto_pb2.OpProto()
+        op.type = "test"
+        ipt = op.inputs.add()
+        ipt.name = 'X'
+        ipt.comment = ""
+        def __add_attr__(name, type):
+            attr = op.attrs.add()
+            attr.name = name
+            attr.comment = ""
+            attr.type = type
+        __add_attr__("int_attr", attr_type_pb2.INT)
+        __add_attr__("float_attr", attr_type_pb2.FLOAT)
+        __add_attr__("string_attr", attr_type_pb2.STRING)
+        __add_attr__("ints_attr", attr_type_pb2.INTS)
+        __add_attr__("floats_attr", attr_type_pb2.FLOATS)
+        __add_attr__("strings_attr", attr_type_pb2.STRINGS)
+        op.comment = ""
+        self.assertTrue(op.IsInitialized())
+        method = creation.OpDescCreationMethod(op)
+        generated = method(
+            X="a",
+            int_attr=10,
+            float_attr=3.2,
+            string_attr="test_str",
+            ints_attr=[0, 1, 2, 3, 4],
+            floats_attr=[0.2, 3.2, 4.5],
+            strings_attr=["a", "b", "c"])
+        expected = op_desc_pb2.OpDesc()
+        expected.type = "test"
+        expected.inputs.extend(['a'])
+        attr = expected.attrs.add()
+        attr.name = "int_attr"
+        attr.type = attr_type_pb2.INT
+        attr.i = 10
+        attr = expected.attrs.add()
+        attr.name = "float_attr"
+        attr.type = attr_type_pb2.FLOAT
+        attr.f = 3.2
+        attr = expected.attrs.add()
+        attr.name = "string_attr"
+        attr.type = attr_type_pb2.STRING
+        attr.s = "test_str"
+        attr = expected.attrs.add()
+        attr.name = "ints_attr"
+        attr.type = attr_type_pb2.INTS
+        attr.ints.extend([0, 1, 2, 3, 4])
+        attr = expected.attrs.add()
+        attr.name = 'floats_attr'
+        attr.type = attr_type_pb2.FLOATS
+        attr.floats.extend([0.2, 3.2, 4.5])
+        attr = expected.attrs.add()
+        attr.name = 'strings_attr'
+        attr.type = attr_type_pb2.STRINGS
+        attr.strings.extend(['a', 'b', 'c'])
+        self.assertEqual(expected, generated)
+    def test_input_temporary_output(self):
+        op = op_proto_pb2.OpProto()
+        op.type = "test"
+        out = op.outputs.add()
+        out.name = "OUT"
+        out.comment = ""
+        out = op.outputs.add()
+        out.name = "TMP"
+        out.comment = ""
+        out.temporary = True
+        out = op.outputs.add()
+        out.name = "OUT2"
+        out.comment = ""
+        op.comment = ""
+        method = creation.OpDescCreationMethod(op)
+        generated = method(OUT="a", OUT2="b")
+        desc = op_desc_pb2.OpDesc()
+        desc.outputs.extend(["a", core.var_names.temp(), "b"])
+        desc.type = "test"
+        attr = desc.attrs.add()
+        attr.name = "temporary_index"
+        attr.type = attr_type_pb2.INTS
+        attr.ints.append(2)
+        self.assertEqual(generated, desc)
+class TestOpCreationDocStr(unittest.TestCase):
+    def test_all(self):
+        op = op_proto_pb2.OpProto()
+        op.type = "test"
+        op.comment = """Test Op.
+This op is used for unit test, not a real op.
+"""
+        a = op.inputs.add()
+        a.name = "a"
+        a.comment = "Input a for test op"
+        a.multiple = True
+        b = op.inputs.add()
+        b.name = "b"
+        b.comment = "Input b for test op"
+        self.assertTrue(op.IsInitialized())
+        o1 = op.outputs.add()
+        o1.name = "output"
+        o1.comment = "The output of test op"
+        o2 = op.outputs.add()
+        o2.name = "temp output"
+        o2.comment = "The temporary output of test op"
+        o2.temporary = True
+        test_str = op.attrs.add()
+        test_str.name = "str_attr"
+        test_str.type = attr_type_pb2.STRING
+        test_str.comment = "A string attribute for test op"
+        actual = creation.get_docstring_from_op_proto(op)
+        expected_docstring = '''Test Op.
+This op is used for unit test, not a real op.
+:param a: Input a for test op
+:type a: list | basestr
+:param b: Input b for test op
+:type b: basestr
+:param output: The output of test op
+:type output: basestr
+:param temp output: This is a temporary variable. It does not have to set by user. The temporary output of test op
+:type temp output: basestr
+:param str_attr: A string attribute for test op
+:type str_attr: basestr
+'''
+        self.assertEqual(expected_docstring, actual)
+class TestOpCreations(unittest.TestCase):
+    def test_all(self):
+        add_op = creation.op_creations.add_two(X="a", Y="b", Out="z")
+        self.assertIsNotNone(add_op)
+        # Invoke C++ DebugString()
+        self.assertEqual('Op(add_two), inputs:(a, b), outputs:(z).',
+                         str(add_op))
 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/v2/framework/tests/test_tensor.py
+++ b/python/paddle/v2/framework/tests/test_tensor.py
+import paddle.v2.framework.core as core
+import unittest
+import numpy
+# Round-trip tests for the Tensor binding: allocate a tensor held by a scope
+# variable, read it into a numpy array, modify the array, write it back with
+# `set`, and read it again.
+class TestScope(unittest.TestCase):
+    def test_int_tensor(self):
+        scope = core.Scope(None)
+        var = scope.create_var("test_tensor")
+        tensor = var.get_tensor()
+        tensor.set_dims([1000, 784])
+        tensor.alloc_int()
+        # Read the tensor into a numpy array.
+        tensor_array = numpy.array(tensor)
+        self.assertEqual((1000, 784), tensor_array.shape)
+        tensor_array[3, 9] = 1
+        tensor_array[19, 11] = 2
+        # Write the modified array back into the tensor, then re-read it.
+        tensor.set(tensor_array)
+        tensor_array_2 = numpy.array(tensor)
+        self.assertEqual(1.0, tensor_array_2[3, 9])
+        self.assertEqual(2.0, tensor_array_2[19, 11])
+    def test_float_tensor(self):
+        scope = core.Scope(None)
+        var = scope.create_var("test_tensor")
+        tensor = var.get_tensor()
+        tensor.set_dims([1000, 784])
+        tensor.alloc_float()
+        # Read the tensor into a numpy array.
+        tensor_array = numpy.array(tensor)
+        self.assertEqual((1000, 784), tensor_array.shape)
+        tensor_array[3, 9] = 1.0
+        tensor_array[19, 11] = 2.0
+        # Write the modified array back into the tensor, then re-read it.
+        tensor.set(tensor_array)
+        tensor_array_2 = numpy.array(tensor)
+        self.assertAlmostEqual(1.0, tensor_array_2[3, 9])
+        self.assertAlmostEqual(2.0, tensor_array_2[19, 11])
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
-import py_paddle.swig_paddle as swig_api
 import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils
 import paddle.trainer_config_helpers.optimizers as v1_optimizers
 """
@@ -17,6 +16,7 @@ __all__ = [
 class Optimizer(object):
    def __init__(self, **kwargs):
+        import py_paddle.swig_paddle as swig_api
        if 'batch_size' in kwargs:
            del kwargs['batch_size']  # not important for python library.
@@ -35,18 +35,22 @@ class Optimizer(object):
        For each optimizer(SGD, Adam), GradientMachine should enable different
        buffers.
        """
+        import py_paddle.swig_paddle as swig_api
        tmp = swig_api.ParameterOptimizer.create(self.__opt_conf__)
        assert isinstance(tmp, swig_api.ParameterOptimizer)
        return tmp.getParameterTypes()
    def __create_local_updater__(self):
+        import py_paddle.swig_paddle as swig_api
        return swig_api.ParameterUpdater.createLocalUpdater(self.__opt_conf__)
    def __create_remote_updater__(self, pass_num, use_sparse_updater):
+        import py_paddle.swig_paddle as swig_api
        return swig_api.ParameterUpdater.createRemoteUpdater(
            self.__opt_conf__, pass_num, use_sparse_updater)
    def __create_new_remote_updater__(self, pserver_spec, use_etcd):
+        import py_paddle.swig_paddle as swig_api
        return swig_api.ParameterUpdater.createNewRemoteUpdater(
            self.__opt_conf__, pserver_spec, use_etcd)