diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index efb4dcb2dfbc63bb6905961b054cdef860cf4573..980a97a07c996eca2e8c126a6ad5ab7f340fa1e5 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -22,9 +22,11 @@ hooks: - id: clang-formater - repo: https://github.com/PaddlePaddle/pre-commit-golang - sha: 16398aeccf263adaf53b2495eed0406347d76281 + sha: 8337620115c25ff8333f1b1a493bd031049bd7c0 hooks: - - id: go-fmt - types: [go] - - id: gometalinter - types: [go] + - id: go-fmt + types: + - go + - id: gometalinter + types: + - go diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 3e6cedbb0d718cfd4454f95dedf7e02a24f2981b..f7483f6be9169eb58f0148cd3a956a8c881e1fe3 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -7,17 +7,8 @@ INCLUDE_DIRECTORIES(${EIGEN_SOURCE_DIR}/src/extern_eigen3) ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} - # for latest version, please get from official website - # URL "https://bitbucket.org/eigen/eigen/get/3.3.4.tar.gz" - # URL_MD5 "1a47e78efe365a97de0c022d127607c3" - - # for no-ssl http support, please get from bazel's mirror - # URL "http://mirror.bazel.build/bitbucket.org/eigen/eigen/get/f3a22f35b044.tar.gz" - # URL_MD5 "4645c66075982da6fa0bcf6b20f3e8f7" - - # get from github mirror GIT_REPOSITORY "https://github.com/RLovelett/eigen.git" - GIT_TAG "a46d2e7337c4656f00abe54a8115f6d76153a048" + GIT_TAG "master" PREFIX ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 34fd348893058980964d723490d9cc220a157b5a..ef31c252038ce18655913c0f41343fe6dc7dbb86 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -153,7 +153,7 @@ set(CUDA_PROPAGATE_HOST_FLAGS OFF) # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. # So, don't set these flags here. -LIST(APPEND CUDA_NVCC_FLAGS -std=c++11) +LIST(APPEND CUDA_NVCC_FLAGS -std=c++11 --default-stream per-thread) LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math) if(CMAKE_BUILD_TYPE STREQUAL "Debug") diff --git a/go/cmd/master/master.go b/go/cmd/master/master.go index 287da694915ca383dc29e6d33201dc701cb7de87..739c4c01e02b10f46c36b997f8c4700150da2a26 100644 --- a/go/cmd/master/master.go +++ b/go/cmd/master/master.go @@ -19,6 +19,8 @@ import ( "net" "net/http" "net/rpc" + "os" + "os/signal" "strconv" "strings" "time" @@ -68,6 +70,20 @@ func main() { store = &master.InMemStore{} } + shutdown := func() { + log.Infoln("shutting down gracefully") + err := store.Shutdown() + if err != nil { + log.Errorln(err) + } + } + + // Guaranteed to run even panic happens. 
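+	// (A deferred call runs during panic unwinding as well as on a normal
+	// return, so the store is shut down on both paths. Note it does not
+	// run if the process exits via log.Fatal/os.Exit below.)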
+ defer shutdown() + + c := make(chan os.Signal, 1) + signal.Notify(c, os.Interrupt) + s, err := master.NewService(store, *chunkPerTask, *taskTimeoutDur, *taskTimeoutMax) if err != nil { log.Fatal(err) @@ -84,8 +100,12 @@ func main() { log.Fatal(err) } - err = http.Serve(l, nil) - if err != nil { - log.Fatal(err) - } + go func() { + err = http.Serve(l, nil) + if err != nil { + log.Fatal(err) + } + }() + + <-c } diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go index aa81d0432b1d4f411644e0a5b703d7ea74d144b7..f9cd8f87e8f2e715c87834ee08482be0f511f681 100644 --- a/go/cmd/pserver/pserver.go +++ b/go/cmd/pserver/pserver.go @@ -18,6 +18,8 @@ import ( "net" "net/http" "net/rpc" + "os" + "os/signal" "strconv" "time" @@ -33,7 +35,8 @@ func main() { index := flag.Int("index", -1, "index of this pserver, should be larger or equal than 0") etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379", "comma separated endpoint string for pserver to connect to etcd") - etcdTimeout := flag.Duration("etcd-timeout", 5*time.Second, "timeout for etcd calls") + dialTimeout := flag.Duration("dial-timeout", 5*time.Second, "dial timeout") + etcdTTL := flag.Int("etcd-ttl", 5, "etcd time to live in seconds") numPservers := flag.Int("num-pservers", 1, "total pserver count in a training job") checkpointPath := flag.String("checkpoint-path", "/checkpoints/", "save checkpoint path") checkpointInterval := flag.Duration("checkpoint-interval", 600*time.Second, "save checkpoint per interval seconds") @@ -53,7 +56,7 @@ func main() { if *index >= 0 { idx = *index } else { - e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, *etcdTimeout) + e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, *dialTimeout, *etcdTTL) idx, err = e.Register(*port) candy.Must(err) @@ -67,6 +70,20 @@ func main() { } } + shutdown := func() { + log.Infoln("shutting down gracefully") + sErr := e.Shutdown() + if sErr != nil { + log.Errorln(sErr) + } + } + + // Guaranteed to run even panic happens. 
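+	// (Shutting down the etcd client closes its session, revoking the
+	// session lease so the keys this pserver registered expire promptly
+	// rather than waiting out the TTL.)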
+ defer shutdown() + + c := make(chan os.Signal, 1) + signal.Notify(c, os.Interrupt) + s, err := pserver.NewService(idx, *checkpointInterval, *checkpointPath, e, cp) candy.Must(err) @@ -77,7 +94,11 @@ func main() { l, err := net.Listen("tcp", ":"+strconv.Itoa(*port)) candy.Must(err) - log.Infof("start pserver at port %d", *port) - err = http.Serve(l, nil) - candy.Must(err) + go func() { + log.Infof("start pserver at port %d", *port) + err = http.Serve(l, nil) + candy.Must(err) + }() + + <-c } diff --git a/go/glide.lock b/go/glide.lock index f71ae643d68d29846611ec52d0ae7d67e4ced850..1f16abdf66422abcd0ab7987cab3499d02cf1b9c 100644 --- a/go/glide.lock +++ b/go/glide.lock @@ -1,15 +1,105 @@ -hash: a8faea3a363468a88917ddeb3b1c9ea36886fb2c622acbad42604fa9cb4d3855 -updated: 2017-07-11T10:04:40.786745417+08:00 +hash: 2a1c0eca5c07a130e3d224f9821f96cfa37a39bf6bce141c855bbc57ef569f1c +updated: 2017-07-29T07:34:48.722757905+08:00 imports: +- name: github.com/beorn7/perks + version: 4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9 + subpackages: + - quantile +- name: github.com/boltdb/bolt + version: 583e8937c61f1af6513608ccc75c97b6abdf4ff9 +- name: github.com/cockroachdb/cmux + version: 112f0506e7743d64a6eb8fedbcff13d9979bbf92 - name: github.com/coreos/etcd - version: cb2a496c4ddd1c87a9f280e116649b599999ec79 + version: c31bec0f29facff13f7c3e3d948e55dd6689ed42 subpackages: + - alarm + - auth - auth/authpb + - client - clientv3 - clientv3/concurrency + - compactor + - discovery + - embed + - error + - etcdserver + - etcdserver/api + - etcdserver/api/v2http + - etcdserver/api/v2http/httptypes + - etcdserver/api/v3client + - etcdserver/api/v3election + - etcdserver/api/v3election/v3electionpb + - etcdserver/api/v3election/v3electionpb/gw + - etcdserver/api/v3lock + - etcdserver/api/v3lock/v3lockpb + - etcdserver/api/v3lock/v3lockpb/gw + - etcdserver/api/v3rpc - etcdserver/api/v3rpc/rpctypes + - etcdserver/auth - etcdserver/etcdserverpb + - etcdserver/etcdserverpb/gw + - etcdserver/membership + - etcdserver/stats + - lease + - lease/leasehttp + - lease/leasepb + - mvcc + - mvcc/backend - mvcc/mvccpb + - pkg/adt + - pkg/contention + - pkg/cors + - pkg/cpuutil + - pkg/crc + - pkg/debugutil + - pkg/fileutil + - pkg/httputil + - pkg/idutil + - pkg/ioutil + - pkg/logutil + - pkg/monotime + - pkg/netutil + - pkg/pathutil + - pkg/pbutil + - pkg/runtime + - pkg/schedule + - pkg/srv + - pkg/tlsutil + - pkg/transport + - pkg/types + - pkg/wait + - proxy/grpcproxy/adapter + - raft + - raft/raftpb + - rafthttp + - snap + - snap/snappb + - store + - version + - wal + - wal/walpb +- name: github.com/coreos/go-semver + version: 8ab6407b697782a06568d4b7f1db25550ec2e4c6 + subpackages: + - semver +- name: github.com/coreos/go-systemd + version: 48702e0da86bd25e76cfef347e2adeb434a0d0a6 + subpackages: + - daemon + - journal + - util +- name: github.com/coreos/pkg + version: 3ac0863d7acf3bc44daf49afef8919af12f704ef + subpackages: + - capnslog +- name: github.com/dgrijalva/jwt-go + version: d2709f9f1f31ebcda9651b03077758c1f3a0018c +- name: github.com/ghodss/yaml + version: 0ca9ea5df5451ffdf184b4428c902747c2c11cd7 +- name: github.com/gogo/protobuf + version: 909568be09de550ed094403c2bf8a261b5bb730a + subpackages: + - proto - name: github.com/golang/protobuf version: 4bd1920723d7b7c925de087aa32e2187708897f7 subpackages: @@ -17,14 +107,61 @@ imports: - proto - name: github.com/golang/snappy version: 553a641470496b2327abcac10b36396bd98e45c9 +- name: github.com/google/btree + version: 925471ac9e2131377a91e1595defec898166fe49 +- name: 
github.com/grpc-ecosystem/go-grpc-prometheus + version: 6b7015e65d366bf3f19b2b2a000a831940f0f7e0 +- name: github.com/grpc-ecosystem/grpc-gateway + version: 18d159699f2e83fc5bb9ef2f79465ca3f3122676 + subpackages: + - runtime + - runtime/internal + - utilities +- name: github.com/jonboulle/clockwork + version: 2eee05ed794112d45db504eb05aa693efd2b8b09 +- name: github.com/matttproud/golang_protobuf_extensions + version: c12348ce28de40eed0136aa2b644d0ee0650e56c + subpackages: + - pbutil - name: github.com/namsral/flag version: 71ceffbeb0ba60fccc853971bb3ed4d7d90bfd04 - name: github.com/PaddlePaddle/recordio - version: edfb82af0739c84f241c87390ec5649c7b28c129 + version: 0432dee9fd4b24fb6840fb20a8c055b0c933fb81 +- name: github.com/prometheus/client_golang + version: c5b7fccd204277076155f10851dad72b76a49317 + subpackages: + - prometheus +- name: github.com/prometheus/client_model + version: 6f3806018612930941127f2a7c6c453ba2c527d2 + subpackages: + - go +- name: github.com/prometheus/common + version: 49fee292b27bfff7f354ee0f64e1bc4850462edf + subpackages: + - expfmt + - internal/bitbucket.org/ww/goautoneg + - model +- name: github.com/prometheus/procfs + version: a1dba9ce8baed984a2495b658c82687f8157b98f + subpackages: + - xfs - name: github.com/sirupsen/logrus - version: 7f976d3a76720c4c27af2ba716b85d2e0a7e38b1 + version: a3f95b5c423586578a4e099b11a46c2479628cac - name: github.com/topicai/candy version: 1b9030d056fa9f8c4b1f9c91b52fe4b8ab4cd8cc +- name: github.com/ugorji/go + version: ded73eae5db7e7a0ef6f55aace87a2873c5d2b74 + subpackages: + - codec +- name: github.com/xiang90/probing + version: 07dd2e8dfe18522e9c447ba95f2fe95262f63bb2 +- name: golang.org/x/crypto + version: 1351f936d976c60a0a48d728281922cf63eafb8d + repo: https://github.com/golang/crypto.git + vcs: git + subpackages: + - bcrypt + - blowfish - name: golang.org/x/net version: c8c74377599bd978aee1cf3b9b63a8634051cec2 subpackages: @@ -36,11 +173,15 @@ imports: - lex/httplex - trace - name: golang.org/x/sys - version: abf9c25f54453410d0c6668e519582a9e1115027 + version: 0f826bdd13b500be0f1d4004938ad978fcc6031e + repo: https://github.com/golang/sys.git + vcs: git subpackages: - unix - name: golang.org/x/text - version: cfdf022e86b4ecfb646e1efbd7db175dd623a8fa + version: 836efe42bb4aa16aaa17b9c155d8813d336ed720 + repo: https://github.com/golang/text.git + vcs: git subpackages: - secure/bidirule - transform @@ -60,4 +201,23 @@ imports: - stats - tap - transport -testImports: [] +- name: gopkg.in/yaml.v2 + version: cd8b52f8269e0feb286dfeef29f8fe4d5b397e0b +testImports: +- name: github.com/davecgh/go-spew + version: 04cdfd42973bb9c8589fd6a731800cf222fde1a9 + subpackages: + - spew +- name: github.com/docker/docker + version: b6d164e6c46d8115b146e4c3ac93784e9ef8b49e + subpackages: + - pkg/ioutils + - pkg/longpath +- name: github.com/pmezard/go-difflib + version: d8ed2627bdf02c080bf22230dbb337003b7aba2d + subpackages: + - difflib +- name: github.com/stretchr/testify + version: 05e8a0eda380579888eb53c394909df027f06991 + subpackages: + - assert diff --git a/go/glide.yaml b/go/glide.yaml index ab472c7cda9755d0399bb8376b16589be8b53057..bc23fa6ebf2c3db61e2d63e5f7e7ddcb595dfed0 100644 --- a/go/glide.yaml +++ b/go/glide.yaml @@ -6,8 +6,19 @@ import: subpackages: - clientv3 - clientv3/concurrency + - embed + - etcdserver - package: github.com/namsral/flag version: ^1.7.4-pre - package: github.com/sirupsen/logrus version: ^1.0.0 - package: github.com/topicai/candy +- package: golang.org/x/crypto + vcs: git + repo: https://github.com/golang/crypto.git +- 
package: golang.org/x/sys + vcs: git + repo: https://github.com/golang/sys.git +- package: golang.org/x/text + vcs: git + repo: https://github.com/golang/text.git diff --git a/go/master/c/client.go b/go/master/c/client.go index a2b18e4b474e039e661a3ae130379b41e76f29bd..b5759c30b1d7f7dc33e162e959c7de165e02e1da 100644 --- a/go/master/c/client.go +++ b/go/master/c/client.go @@ -18,7 +18,6 @@ package main #include #include #include - #define PADDLE_MASTER_OK 0 #define PADDLE_MASTER_ERROR -1 @@ -101,6 +100,12 @@ func paddle_release_master_client(client C.paddle_master_client) { remove(client) } +//export paddle_start_get_records +func paddle_start_get_records(client C.paddle_master_client, pass C.int) { + c := get(client) + c.StartGetRecords(int(pass)) +} + //export paddle_set_dataset func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int) C.int { c := get(client) @@ -121,15 +126,19 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int // paddle_next_record gets the nexts training record. // -// returns number of bytes of the records if success, -1 if failed. +// returns number of bytes of the records if success, -1 if failed, -2 if pass end. // //export paddle_next_record func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int { c := get(client) r, err := c.NextRecord() if err != nil { - // Error - // TODO: return the type of error? + // NOTE: use errors to indicate pass ends + if err.Error() == master.ErrAllTaskFailed.Error() || + err.Error() == master.ErrNoMoreAvailable.Error() || + err.Error() == master.ErrPassBefore.Error() { + return -2 + } *record = (*C.uchar)(nil) return -1 } diff --git a/go/master/client.go b/go/master/client.go index bbf3768d96ead1911508486410d2402ea0ac8b12..62801b9b7fe85fe27147b12160f48d988623d547 100644 --- a/go/master/client.go +++ b/go/master/client.go @@ -16,7 +16,6 @@ package master import ( "os" - "sync" "time" "github.com/PaddlePaddle/Paddle/go/connection" @@ -27,9 +26,9 @@ import ( // Client is the client of the master server. type Client struct { - conn *connection.Conn - ch chan record - initChOnce sync.Once + conn *connection.Conn + ch chan record + bufSize int } type record struct { @@ -46,11 +45,7 @@ func WithBuffer(bufSize int) func(*Client) error { if bufSize <= 0 { return nil } - - c.initChOnce.Do(func() { - c.ch = make(chan record, bufSize) - go c.getRecords() - }) + c.bufSize = bufSize return nil } } @@ -104,25 +99,41 @@ func NewClient(opts ...func(*Client) error) (*Client, error) { if err != nil { return nil, err } - } - + c.ch = make(chan record, c.bufSize) + // FIXME: connection is created asyncrosly in monitorMaster go routine, + // ensure the connection is ready for use before calling c.addClient. 
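+	// (A sketch of a sturdier fix, assuming monitorMaster could be
+	// extended to signal readiness: close a `ready` channel once the
+	// first connection is established and block on <-ready here instead
+	// of sleeping a fixed second, which can still race on slow networks.)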
+	time.Sleep(time.Second)
 	return c, nil
 }
 
-func (c *Client) getRecords() {
+// StartGetRecords must be called at the beginning of each pass.
+func (c *Client) StartGetRecords(passID int) {
+	go c.getRecords(passID)
+}
+
+func (c *Client) getRecords(passID int) {
 	for {
-		t, err := c.getTask()
+		t, err := c.getTask(passID)
 		if err != nil {
-			log.Errorf("Get task failed, sleep 3 seconds and continue, %s", err)
-			time.Sleep(3 * time.Second)
-			continue
+			if err.Error() == ErrPassBefore.Error() ||
+				err.Error() == ErrNoMoreAvailable.Error() ||
+				err.Error() == ErrAllTaskFailed.Error() {
+				c.ch <- record{nil, err}
+				break
+			}
+			if err.Error() == ErrPassAfter.Error() {
+				// wait until the last pass finishes
+				time.Sleep(time.Second * 3)
+				continue
+			}
+			log.Errorf("getTask error: %s", err)
 		}
 
 		for _, chunk := range t.Chunks {
-			f, err := os.Open(chunk.Path)
-			if err != nil {
-				log.Errorln(err)
+			f, e := os.Open(chunk.Path)
+			if e != nil {
+				log.Errorln(e)
 				continue
 			}
@@ -178,18 +189,21 @@ func (c *Client) monitorMaster(addrCh <-chan string) {
 		}
 	}
 }
 
-// SetDataset set dataset for the master server to dispatch.
+// SetDataset sets the dataset for the master server to dispatch.
+//
+// SetDataset can be called multiple times in one pass, but only the first
+// call will be honored.
 //
-// SetDataset can be call multiple times from different nodes. But
-// only the first call will be honored.
+// After all tasks are done, another call of SetDataset will start another pass.
 func (c *Client) SetDataset(globPaths []string) error {
-	return c.conn.Call("Service.SetDataset", globPaths, nil)
+	err := c.conn.Call("Service.SetDataset", globPaths, nil)
+	return err
 }
 
 // getTask gets a new task from the master server.
-func (c *Client) getTask() (Task, error) {
+func (c *Client) getTask(passID int) (Task, error) {
 	var t Task
-	err := c.conn.Call("Service.GetTask", 0, &t)
+	err := c.conn.Call("Service.GetTask", passID, &t)
 	return t, err
 }
 
@@ -208,12 +222,6 @@ func (c *Client) taskFailed(meta TaskMeta) error {
 // NextRecord will block until the next record is available. It is
 // thread-safe.
 func (c *Client) NextRecord() ([]byte, error) {
-	c.initChOnce.Do(func() {
-		// initialize with in case WithBuffer is not used.
- c.ch = make(chan record, 0) - go c.getRecords() - }) - r := <-c.ch return r.r, r.err } diff --git a/go/master/client_internal_test.go b/go/master/client_internal_test.go index ee305e2c80f54ebee2e5011ca7ff0cf5e0612f41..d5f3d79464655540a29eaa6395057aa5795c4615 100644 --- a/go/master/client_internal_test.go +++ b/go/master/client_internal_test.go @@ -54,22 +54,22 @@ func TestGetFinishTask(t *testing.T) { panic(err) } go func(l net.Listener) { - s, err := NewService(&InMemStore{}, chunkPerTask, time.Second, 1) - if err != nil { - panic(err) + s, sErr := NewService(&InMemStore{}, chunkPerTask, time.Second, 1) + if sErr != nil { + panic(sErr) } server := rpc.NewServer() - err = server.Register(s) - if err != nil { - panic(err) + sErr = server.Register(s) + if sErr != nil { + panic(sErr) } mux := http.NewServeMux() mux.Handle(rpc.DefaultRPCPath, server) - err = http.Serve(l, mux) - if err != nil { - panic(err) + sErr = http.Serve(l, mux) + if sErr != nil { + panic(sErr) } }(l) @@ -103,6 +103,7 @@ func TestGetFinishTask(t *testing.T) { ch := make(chan string, 1) ch <- addr go c.monitorMaster(ch) + err = c.SetDataset([]string{path}) if err != nil { panic(err) @@ -111,44 +112,47 @@ func TestGetFinishTask(t *testing.T) { checkOnePass := func(i int) { var tasks []Task for idx := 0; idx < totalTask; idx++ { - task, err := c.getTask() - if err != nil { - t.Fatalf("Error: %v, pass: %d\n", err, i) + task, cErr := c.getTask(i) + if cErr != nil && cErr.Error() != ErrNoMoreAvailable.Error() && cErr.Error() != ErrPassAfter.Error() { + t.Fatalf("error: %v, pass: %d\n", cErr, i) } tasks = append(tasks, task) } - _, err = c.getTask() - if err == nil { + // getting task before task finishes should return error + _, cErr := c.getTask(i) + if cErr == nil { t.Fatalf("Should get error, pass: %d\n", i) } - err = c.taskFinished(tasks[0].Meta.ID) - if err != nil { - t.Fatalf("Error: %v, pass: %d\n", err, i) + cErr = c.taskFinished(tasks[0].Meta.ID) + if cErr != nil { + t.Fatalf("Error: %v, pass: %d\n", cErr, i) } - - err = c.taskFailed(tasks[0].Meta) - if err != nil { - t.Fatalf("Error: %v, pass: %d\n", err, i) + // call taskFailed once won't put the task to failed queue, just ensure + // the call + cErr = c.taskFailed(tasks[0].Meta) + if cErr != nil { + t.Fatalf("Error: %v, pass: %d\n", cErr, i) } tasks = tasks[1:] - task, err := c.getTask() - if err != nil { - t.Fatal(err) + _, cErr = c.getTask(i) + if cErr != nil && cErr.Error() != ErrNoMoreAvailable.Error() && cErr.Error() != ErrPassAfter.Error() { + t.Fatalf("Should be ErrNoMoreAvailable or ErrPassAfter: %s", cErr) } - tasks = append(tasks, task) for _, task := range tasks { - err = c.taskFinished(task.Meta.ID) - if err != nil { - t.Fatalf("Error: %v, pass: %d\n", err, i) + cErr = c.taskFinished(task.Meta.ID) + if cErr != nil { + t.Fatal(cErr) } } } for i := 0; i < 10; i++ { + // init pass data + c.StartGetRecords(i) checkOnePass(i) } } diff --git a/go/master/client_test.go b/go/master/client_test.go index a3a434ae7e855c1cd2043d196435b42d2520f003..79b9cc844d1ff938915a622bf19a7d772682becf 100644 --- a/go/master/client_test.go +++ b/go/master/client_test.go @@ -20,8 +20,10 @@ import ( "net/http" "net/rpc" "os" + "runtime" "strconv" "strings" + "sync" "testing" "time" @@ -29,6 +31,18 @@ import ( "github.com/PaddlePaddle/recordio" ) +// tool function for testing output goroutine ids +func goid() int { + var buf [64]byte + n := runtime.Stack(buf[:], false) + idField := strings.Fields(strings.TrimPrefix(string(buf[:n]), "goroutine "))[0] + id, err := 
strconv.Atoi(idField) + if err != nil { + panic(fmt.Sprintf("cannot get goroutine id: %v", err)) + } + return id +} + func TestNextRecord(t *testing.T) { const ( path = "/tmp/master_client_TestFull" @@ -45,7 +59,7 @@ func TestNextRecord(t *testing.T) { panic(err) } go func(l net.Listener) { - s, err := master.NewService(&master.InMemStore{}, 10, time.Second, 1) + s, err := master.NewService(&master.InMemStore{}, 1, time.Second*60, 1) if err != nil { panic(err) } @@ -69,7 +83,7 @@ func TestNextRecord(t *testing.T) { panic(err) } - w := recordio.NewWriter(f, -1, -1) + w := recordio.NewWriter(f, 1, -1) for i := 0; i < total; i++ { _, err = w.Write([]byte{byte(i)}) if err != nil { @@ -87,32 +101,49 @@ func TestNextRecord(t *testing.T) { panic(err) } - c, err := master.NewClient(master.WithAddr(fmt.Sprintf(":%d", p)), master.WithBuffer(10)) - if err != nil { - panic(err) - } - - err = c.SetDataset([]string{path}) - if err != nil { - panic(err) - } - - for pass := 0; pass < 50; pass++ { - received := make(map[byte]bool) - for i := 0; i < total; i++ { - r, err := c.NextRecord() - if err != nil { - t.Fatal(pass, i, "Read error:", err) + // start several client to test task fetching + var wg sync.WaitGroup + for i := 0; i < 4; i++ { + wg.Add(1) + // test for multiple concurrent clients + go func() { + defer wg.Done() + // each go-routine needs a single client connection instance + c, e := master.NewClient(master.WithAddr(fmt.Sprintf(":%d", p)), master.WithBuffer(1)) + if e != nil { + t.Fatal(e) } - - if len(r) != 1 { - t.Fatal(pass, i, "Length should be 1.", r) + e = c.SetDataset([]string{path}) + if e != nil { + panic(e) } - - if received[r[0]] { - t.Fatal(pass, i, "Received duplicate.", received, r) + // test for n passes + for pass := 0; pass < 10; pass++ { + c.StartGetRecords(pass) + + received := make(map[byte]bool) + taskid := 0 + for { + r, e := c.NextRecord() + if e != nil { + // ErrorPassAfter will wait, else break for next pass + if e.Error() == master.ErrPassBefore.Error() || + e.Error() == master.ErrNoMoreAvailable.Error() { + break + } + t.Fatal(pass, taskid, "Read error:", e) + } + if len(r) != 1 { + t.Fatal(pass, taskid, "Length should be 1.", r) + } + if received[r[0]] { + t.Fatal(pass, taskid, "Received duplicate.", received, r) + } + taskid++ + received[r[0]] = true + } } - received[r[0]] = true - } + }() } + wg.Wait() } diff --git a/go/master/etcd_client.go b/go/master/etcd_client.go index ae6b6f776bec9ccaead4465ad233fc8ed6c3a418..94848d887e8bc4b055a7c8b89b9b7f26a39229d1 100644 --- a/go/master/etcd_client.go +++ b/go/master/etcd_client.go @@ -39,15 +39,12 @@ type EtcdClient struct { statePath string client *clientv3.Client lock *concurrency.Mutex + sess *concurrency.Session } // NewEtcdClient creates a new EtcdClient. func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePath string, ttlSec int) (*EtcdClient, error) { log.Debugf("Connecting to etcd at %v", endpoints) - // TODO(helin): gracefully shutdown etcd store. Because etcd - // store holds a etcd lock, even though the lock will expire - // when the lease timeout, we need to implement graceful - // shutdown to release the lock. cli, err := clientv3.New(clientv3.Config{ Endpoints: endpoints, DialTimeout: dialTimeout, @@ -67,12 +64,12 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat // one master running, but split-brain problem may cause // multiple master servers running), and the cluster management // software will kill one of them. 
-	log.Debugf("Trying to acquire lock at %s.", lockPath)
+	log.Infof("Trying to acquire lock at %s.", lockPath)
 	err = lock.Lock(context.TODO())
 	if err != nil {
 		return nil, err
 	}
-	log.Debugf("Successfully acquired lock at %s.", lockPath)
+	log.Infof("Successfully acquired lock at %s.", lockPath)
 
 	put := clientv3.OpPut(addrPath, addr)
 	resp, err := cli.Txn(context.Background()).If(lock.IsOwner()).Then(put).Commit()
@@ -89,6 +86,7 @@ func NewEtcdClient(endpoints []string, addr string, lockPath, addrPath, statePat
 		statePath: statePath,
 		client:    cli,
 		lock:      lock,
+		sess:      sess,
 	}
 
 	return e, nil
@@ -157,6 +155,21 @@ func (e *EtcdClient) Load() ([]byte, error) {
 	return state, nil
 }
 
+// Shutdown shuts down the etcd client gracefully.
+func (e *EtcdClient) Shutdown() error {
+	err := e.sess.Close()
+	newErr := e.client.Close()
+	if newErr != nil {
+		if err == nil {
+			err = newErr
+		} else {
+			log.Errorln(newErr)
+		}
+	}
+
+	return err
+}
+
 // GetKey gets the value by the specify key.
 func GetKey(c *clientv3.Client, key string, timeout time.Duration) (string, error) {
 	ctx, cancel := context.WithTimeout(context.Background(), timeout)
diff --git a/go/master/inmem_store.go b/go/master/inmem_store.go
index ffd663f7f0b25c29f0bab082d27b29dcfeb60826..a5bd2d4fe150cd34c699ccfae1f3d3e0fb2ef3d6 100644
--- a/go/master/inmem_store.go
+++ b/go/master/inmem_store.go
@@ -40,3 +40,8 @@ func (m *InMemStore) Load() ([]byte, error) {
 
 	return m.buf, nil
 }
+
+// Shutdown shuts down the in-memory store.
+func (m *InMemStore) Shutdown() error {
+	return nil
+}
diff --git a/go/master/service.go b/go/master/service.go
index d1ec8939e18e8f4a7b4578a9399e2fa9f24325f3..d30e9a33229c0aff354417771b5bf2ae6a781715 100644
--- a/go/master/service.go
+++ b/go/master/service.go
@@ -19,6 +19,7 @@ import (
 	"compress/gzip"
 	"encoding/gob"
 	"errors"
+	"math/rand"
 	"os"
 	"path/filepath"
 	"sync"
@@ -33,10 +34,23 @@ const (
 	dialTimeout = 5 * time.Second
 )
 
+// ErrAllTaskFailed occurs when every task is in the done or failed state.
+var ErrAllTaskFailed = errors.New("all task finished")
+
+// ErrNoMoreAvailable occurs when no task is in todo but not all are done or failed.
+var ErrNoMoreAvailable = errors.New("no more available task")
+
+// ErrPassBefore occurs when the client-side pass number is smaller than the master's counter.
+var ErrPassBefore = errors.New("pass number smaller than master")
+
+// ErrPassAfter occurs when the client-side pass number is larger than the master's counter.
+var ErrPassAfter = errors.New("pass number larger than master")
+
 // Store is the interface for save and load the master state.
 type Store interface {
 	Save([]byte) error
 	Load() ([]byte, error)
+	Shutdown() error
 }
 
 // Chunk is a chunk of data consisted of several data instances.
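// A minimal client-side sketch of the pass protocol these errors define
// (hypothetical usage, mirroring the updated client_test.go later in this
// patch; numPasses and process are placeholders, not part of this change):
//
//	for pass := 0; pass < numPasses; pass++ {
//		c.StartGetRecords(pass)
//		for {
//			r, err := c.NextRecord()
//			if err != nil {
//				// ErrPassBefore / ErrNoMoreAvailable mark the end of the pass.
//				break
//			}
//			process(r) // placeholder for user code
//		}
//	}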
@@ -75,17 +89,26 @@ type Service struct {
 	chunksPerTask int
 	timeoutDur    time.Duration
 	failureMax    int
-	ready         chan struct{}
 	store         Store
 
-	mu         sync.Mutex
-	initDone   bool
-	taskQueues taskQueues
+	ready    chan struct{}
+	initDone bool
+
+	mu            sync.Mutex
+	taskQueues    taskQueues
+	currPass      int
+	jobTasks      []taskEntry
+	savingTrainer string
 }
 
 func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
-	id := 0
+	// generate a unique id across the job using nanosecond timestamp + random int + counter
+	// FIXME(typhoonzero): this is a workaround, use uuid
+	randStart := rand.Int()
+	counter := 0
+	timestamp := time.Now().Nanosecond()
+	id := timestamp + randStart + counter
 	if chunksPerTask <= 0 {
 		chunksPerTask = 1
 	}
@@ -95,7 +118,8 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
 	for i, c := range chunks {
 		if i%chunksPerTask == 0 && len(cur.Task.Chunks) > 0 {
 			cur.Task.Meta.ID = id
-			id++
+			counter++
+			id = timestamp + randStart + counter
 			result = append(result, cur)
 			cur.Task.Chunks = nil
 		}
@@ -266,19 +290,21 @@ func (s *Service) SetDataset(globPaths []string, _ *int) error {
 		return err
 	}
 
-	s.taskQueues.Todo = partition(chunks, s.chunksPerTask)
+	s.jobTasks = partition(chunks, s.chunksPerTask)
+	s.taskQueues.Todo = s.jobTasks
 
 	err = s.snapshot()
 	if err != nil {
 		log.Errorln(err)
 		return err
 	}
-
 	close(s.ready)
 	s.initDone = true
 	return nil
 }
 
+// processFailedTask re-dispatches a failed task unless it has already
+// failed s.failureMax times, in which case the task is discarded.
 func (s *Service) processFailedTask(t taskEntry, epoch int) {
 	if t.Task.Meta.Epoch != epoch {
 		// new epoch, task launched after the
@@ -302,8 +328,9 @@ func (s *Service) processFailedTask(t taskEntry, epoch int) {
 		return
 	}
 
-	log.Warningf("Task %v failed %d times, discard.", t.Task, t.NumFailure)
+	log.Warningf("Task %v failed %d times, re-dispatch.", t.Task, t.NumFailure)
 	s.taskQueues.Todo = append(s.taskQueues.Todo, t)
+	return
 }
 
 func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
@@ -331,37 +358,30 @@ func (s *Service) logFields() log.Fields {
 }
 
 // GetTask gets a new task from the service.
-func (s *Service) GetTask(_ int, task *Task) error {
+// passID is the client-side pass count.
+func (s *Service) GetTask(passID int, task *Task) error {
 	select {
 	case <-s.ready:
 	}
 
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
+	if passID < s.currPass {
+		return ErrPassBefore
+	}
+	if passID > s.currPass {
+		// A client may run ahead of the master's pass counter when it is
+		// faster than the others.
+		return ErrPassAfter
+	}
 	if len(s.taskQueues.Todo) == 0 {
-		if len(s.taskQueues.Done) == 0 {
-			if len(s.taskQueues.Pending) == 0 {
-				err := errors.New("all task failed")
-				log.WithFields(s.logFields()).Warningln("All tasks failed.")
-				return err
-			}
-
-			// TODO(helin): client need to retry in this
-			// error case. Gotcha: RPC client can't
-			// compare returned error with predefined
-			// errors like io.EOF, because the error
-			// instance deserialized from RPC is a
-			// different instance than the error defined
-			// in package. So we need to figure out a way
-			// for client to check this error correctly.
-			err := errors.New("no more available task")
-			log.WithFields(s.logFields()).Warningln("No more available task.")
-			return err
+		if len(s.taskQueues.Done) == 0 && len(s.taskQueues.Pending) == 0 {
+			log.WithFields(s.logFields()).Warningln("All tasks failed, may start next pass")
+			return ErrAllTaskFailed
 		}
-		s.taskQueues.Todo = s.taskQueues.Done
-		s.taskQueues.Done = nil
-		log.WithFields(s.logFields()).Infoln("No more todo task, but trainer is requesting task to do. Move all done task to todo.")
+		log.WithFields(s.logFields()).Warningln("No more available task.")
+		return ErrNoMoreAvailable
 	}
 
 	t := s.taskQueues.Todo[0]
@@ -381,7 +401,7 @@
 }
 
 // TaskFinished tell the service that a task is finished.
-func (s *Service) TaskFinished(taskID int, _ *int) error {
+func (s *Service) TaskFinished(taskID int, dummy *int) error {
 	select {
 	case <-s.ready:
 	}
@@ -401,11 +421,14 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
 	delete(s.taskQueues.Pending, taskID)
 
 	log.WithFields(s.logFields()).Infof("Task #%d finished.", taskID)
-
-	if len(s.taskQueues.Pending) == 0 && len(s.taskQueues.Todo) == 0 {
-		log.WithFields(s.logFields()).Infoln("No more todo and pending task, start a new pass.")
-		s.taskQueues.Todo = append(s.taskQueues.Todo, s.taskQueues.Done...)
-		s.taskQueues.Done = nil
+	if len(s.taskQueues.Todo) == 0 && len(s.taskQueues.Pending) == 0 {
+		// increase the master-side pass count when all tasks are finished
+		s.currPass++
+		s.taskQueues.Todo = s.jobTasks
+		s.taskQueues.Done = []taskEntry{}
+		// TODO(typhoonzero): deal with failed tasks
+		s.taskQueues.Failed = []taskEntry{}
+		log.WithFields(s.logFields()).Warningf("all task finished, add new pass data, newpass: %d.", s.currPass)
 	}
 
 	err := s.snapshot()
@@ -416,7 +439,7 @@
 }
 
 // TaskFailed tells the service that a task is failed.
-func (s *Service) TaskFailed(meta TaskMeta, _ *int) error {
+func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error {
 	select {
 	case <-s.ready:
 	}
diff --git a/go/master/service_internal_test.go b/go/master/service_internal_test.go
index 69a882fc33668a8cdefa30ae394f6c605f3bf099..bd1a939a55553b558181d91a757c487d0f97b40b 100644
--- a/go/master/service_internal_test.go
+++ b/go/master/service_internal_test.go
@@ -44,7 +44,8 @@ func TestPartionIndex(t *testing.T) {
 	cs := make([]Chunk, 100)
 	ts := partition(cs, 20)
 	for i := range ts {
-		if ts[i].Task.Meta.ID != i {
+		// test the auto-increment ids
+		if i > 0 && ts[i].Task.Meta.ID != ts[i-1].Task.Meta.ID+1 {
 			t.Error(ts[i], i)
 		}
 	}
diff --git a/go/master/service_test.go b/go/master/service_test.go
new file mode 100644
index 0000000000000000000000000000000000000000..5f91910ecc8cf32289e71e2e41e8b283acc115e6
--- /dev/null
+++ b/go/master/service_test.go
@@ -0,0 +1,68 @@
+package master_test
+
+import (
+	"os"
+	"testing"
+	"time"
+
+	"github.com/PaddlePaddle/Paddle/go/master"
+	"github.com/coreos/etcd/clientv3"
+	"github.com/coreos/etcd/embed"
+	"github.com/docker/docker/pkg/ioutils"
+	"github.com/stretchr/testify/assert"
+)
+
+func TestNewServiceWithEtcd(t *testing.T) {
+	// set up an embedded etcd server
+	etcdDir, err := ioutils.TempDir("", "")
+	if err != nil {
+		t.Fatal(err)
+	}
+	cfg := embed.NewConfig()
+	cfg.Dir = etcdDir
+	e, err := embed.StartEtcd(cfg)
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer func() {
+		e.Close()
+		if err := os.RemoveAll(etcdDir); err != nil {
+			t.Fatal(err)
+		}
+	}()
+	select {
+	case <-e.Server.ReadyNotify():
+		t.Log("Server is ready!")
+	case <-time.After(60 * time.Second):
+		e.Server.Stop() // trigger a shutdown
+		t.Fatal("Server took too long to start!")
+	}
+
+	ep := []string{"127.0.0.1:2379"}
+	masterAddr := "127.0.0.1:3306"
+	store, err := master.NewEtcdClient(ep, masterAddr, master.DefaultLockPath, master.DefaultAddrPath, master.DefaultStatePath, 30)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	_, err = master.NewService(store, 10, 10, 3)
+	if err != nil {
+		t.Fatal(err)
+	}
+	cli, err :=
clientv3.New(clientv3.Config{ + Endpoints: ep, + DialTimeout: 3 * time.Second, + }) + if err != nil { + t.Fatal(err) + } + v, err := master.GetKey(cli, master.DefaultAddrPath, 3*time.Second) + if err != nil { + t.Fatal(err) + } + if err := cli.Close(); err != nil { + t.Fatal(err) + } + // test master process registry itself into etcd server. + assert.Equal(t, masterAddr, v, "master process should registry itself into etcd server.") +} diff --git a/go/pserver/client/c/cclient.go b/go/pserver/client/c/cclient.go index 0f7e20cdd8d20e37b586c22377a89fca4c3cf7ce..14ad0774550f6e5a5d8610d6007904cd2820432c 100644 --- a/go/pserver/client/c/cclient.go +++ b/go/pserver/client/c/cclient.go @@ -55,10 +55,10 @@ var curHandle C.paddle_pserver_client func add(c *client.Client) C.paddle_pserver_client { mu.Lock() defer mu.Unlock() - client := curHandle + cli := curHandle curHandle++ - handleMap[client] = c - return client + handleMap[cli] = c + return cli } func get(client C.paddle_pserver_client) *client.Client { diff --git a/go/pserver/client/c/test/test_train.py b/go/pserver/client/c/test/test_train.py index e9264592b4f18fddf68b198d73bf907206e77a3f..85cb399590f7a5e7e73285ca87c49ea5f24afb32 100644 --- a/go/pserver/client/c/test/test_train.py +++ b/go/pserver/client/c/test/test_train.py @@ -6,16 +6,19 @@ import cPickle as pickle etcd_ip = os.getenv("MASTER_IP", "127.0.0.1") etcd_endpoint = "http://" + etcd_ip + ":2379" +print "connecting to master, etcd endpoints: ", etcd_endpoint +master_client = master.client(etcd_endpoint, 5, 64) def cloud_reader(): - print "connecting to master, etcd endpoints: ", etcd_endpoint - master_client = master.client(etcd_endpoint, 5, 64) + global master_client master_client.set_dataset( - ["/pfs/dlnel/public/dataset/uci_housing/uci_housing-*-of-*"]) + ["/pfs/dlnel/public/dataset/uci_housing/uci_housing-*"], passes=30) while 1: r, e = master_client.next_record() if not r: + if e != -2: # other errors + print "get record error:", e break yield pickle.loads(r) @@ -27,10 +30,12 @@ def main(): # network config x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) y_predict = paddle.layer.fc(input=x, - param_attr=paddle.attr.Param(name='w'), + param_attr=paddle.attr.Param( + name='w', learning_rate=1e-3), size=1, act=paddle.activation.Linear(), - bias_attr=paddle.attr.Param(name='b')) + bias_attr=paddle.attr.Param( + name='b', learning_rate=1e-3)) y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1)) cost = paddle.layer.mse_cost(input=y_predict, label=y) @@ -38,9 +43,8 @@ def main(): parameters = paddle.parameters.create(cost) # create optimizer of new remote updater to pserver - optimizer = paddle.optimizer.Momentum(momentum=0) + optimizer = paddle.optimizer.Momentum(momentum=0, learning_rate=1e-3) - print "etcd endoint: ", etcd_endpoint trainer = paddle.trainer.SGD(cost=cost, parameters=parameters, update_equation=optimizer, @@ -51,6 +55,8 @@ def main(): # event_handler to print training and testing info def event_handler(event): if isinstance(event, paddle.event.EndIteration): + # FIXME: for cloud data reader, pass number is managed by master + # should print the server side pass number if event.batch_id % 100 == 0: print "Pass %d, Batch %d, Cost %f" % ( event.pass_id, event.batch_id, event.cost) diff --git a/go/pserver/etcd_client.go b/go/pserver/etcd_client.go index 98ff8ce827c7cfcd9122cb043f2a6226057cc95a..4fb26307667295ab825d07be6c3d1d4b33f6eb8b 100644 --- a/go/pserver/etcd_client.go +++ b/go/pserver/etcd_client.go @@ -34,16 +34,19 @@ const 
( PsPath = "/ps/" // PsCheckpoint is the etcd path for store checkpoints information PsCheckpoint = "/checkpoints/" + + retryTimeout = 5 * time.Second ) // EtcdClient is the etcd client that the pserver uses for fault // tolerance, service registry and coordination. type EtcdClient struct { - numPservers int - etcdEndpoints string - etcdClient *clientv3.Client - // etcdTimeout is also used as retry intervals. - etcdTimeout time.Duration + numPservers int + endpoints string + client *clientv3.Client + sess *concurrency.Session + dialTimeout time.Duration + ttlSec int // FIXME: ensure GetExternalIP gets the correct ip for trainers to connect. externalIP string // desired number of pservers in the job. @@ -52,11 +55,12 @@ type EtcdClient struct { } // NewEtcdClient creates an EtcdClient -func NewEtcdClient(endpoints string, numPservers int, timeout time.Duration) *EtcdClient { +func NewEtcdClient(endpoints string, numPservers int, dialtimeout time.Duration, ttlSec int) *EtcdClient { return &EtcdClient{ - etcdTimeout: timeout, - numPservers: numPservers, - etcdEndpoints: endpoints, + dialTimeout: dialtimeout, + ttlSec: ttlSec, + numPservers: numPservers, + endpoints: endpoints, } } @@ -64,7 +68,6 @@ func NewEtcdClient(endpoints string, numPservers int, timeout time.Duration) *Et // // Register returns the index of the current pserver. func (e *EtcdClient) Register(port int) (int, error) { - var err error e.externalIP, err = networkhelper.GetExternalIP() if err != nil { @@ -72,19 +75,26 @@ func (e *EtcdClient) Register(port int) (int, error) { } // initialize connection to etcd. - ep := strings.Split(e.etcdEndpoints, ",") + ep := strings.Split(e.endpoints, ",") for { cli, err := clientv3.New(clientv3.Config{ Endpoints: ep, - DialTimeout: e.etcdTimeout, + DialTimeout: e.dialTimeout, }) if err != nil { log.Errorf("connect to etcd error: %v", err) - time.Sleep(e.etcdTimeout) + time.Sleep(retryTimeout) + continue + } + e.client = cli + sess, err := concurrency.NewSession(cli, concurrency.WithTTL(e.ttlSec)) + if err != nil { + log.Errorf("create etcd session error: %v", err) + time.Sleep(retryTimeout) continue } - e.etcdClient = cli - log.Debugf("inited client to %s", e.etcdEndpoints) + e.sess = sess + log.Debugf("inited client to %s", e.endpoints) break } // init /ps_desired using transaction, for multiple pservers may want to write @@ -95,7 +105,7 @@ func (e *EtcdClient) Register(port int) (int, error) { cancel() if err != nil { log.Warn(err) - time.Sleep(e.etcdTimeout) + time.Sleep(retryTimeout) continue } break @@ -106,18 +116,18 @@ func (e *EtcdClient) Register(port int) (int, error) { // wait and set s.desired init value for { ctx, cancel := context.WithTimeout(context.Background(), time.Second) - resp, err := e.etcdClient.Get(ctx, PsDesired) + resp, err := e.client.Get(ctx, PsDesired) cancel() if err != nil { log.Errorf("getting %s error: %v", PsDesired, err) - time.Sleep(e.etcdTimeout) + time.Sleep(retryTimeout) continue } if len(resp.Kvs) != 0 { e.desired, err = strconv.Atoi(string(resp.Kvs[0].Value)) if err != nil { log.Errorf("value of %s invalid %v\n", PsDesired, err) - time.Sleep(e.etcdTimeout) + time.Sleep(retryTimeout) // NOTE: wait util ps_desired value change continue } @@ -134,7 +144,7 @@ func (e *EtcdClient) Register(port int) (int, error) { cancel() if err != nil { log.Warn(err) - time.Sleep(e.etcdTimeout) + time.Sleep(retryTimeout) continue } break @@ -144,10 +154,10 @@ func (e *EtcdClient) Register(port int) (int, error) { } func (e *EtcdClient) initDesiredPservers(ctx 
context.Context, numPservers int) (*clientv3.TxnResponse, error) { - return concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error { + return concurrency.NewSTM(e.client, func(c concurrency.STM) error { dsStr := c.Get(PsDesired) if dsStr == "" { - c.Put(PsDesired, strconv.Itoa(numPservers)) + c.Put(PsDesired, strconv.Itoa(numPservers), clientv3.WithLease(e.sess.Lease())) } return nil }, concurrency.WithAbortContext(ctx), concurrency.WithIsolation(concurrency.RepeatableReads)) @@ -156,7 +166,7 @@ func (e *EtcdClient) initDesiredPservers(ctx context.Context, numPservers int) ( // registerPserverEtcd registers pserver node on etcd using transaction. func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, error) { var idx int - _, err := concurrency.NewSTM(e.etcdClient, func(c concurrency.STM) error { + _, err := concurrency.NewSTM(e.client, func(c concurrency.STM) error { registered := false for i := 0; i < e.desired; i++ { psKey := PsPath + strconv.Itoa(i) @@ -165,26 +175,10 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, er log.Debugf("got value (%s) for key: %s", ps, psKey) if ps == "" { - resp, err := e.etcdClient.Grant(context.TODO(), 5) - if err != nil { - log.Fatal(err) - } // find the first id and write info pserverAddr := e.externalIP + ":" + strconv.Itoa(port) - c.Put(psKey, pserverAddr, clientv3.WithLease(resp.ID)) + c.Put(psKey, pserverAddr, clientv3.WithLease(e.sess.Lease())) log.Debugf("set pserver node %s with value %s", psKey, pserverAddr) - ch, kaerr := e.etcdClient.KeepAlive(context.TODO(), resp.ID) - if kaerr != nil { - log.Errorf("keepalive etcd node error: %v", kaerr) - return kaerr - } - - // Eat the keep alive message so etcd - // will not expire the lease. - go func(ch <-chan *clientv3.LeaseKeepAliveResponse) { - ka := <-ch - log.Debugf("keepalive: %d\n", ka.TTL) - }(ch) log.Debug("register finished") idx = i registered = true @@ -207,7 +201,7 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context, port int) (int, er // GetKey gets the value by the specified key func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) { ctx, cancel := context.WithTimeout(context.Background(), timeout) - resp, err := e.etcdClient.Get(ctx, key) + resp, err := e.client.Get(ctx, key) cancel() if err != nil { return []byte{}, err @@ -223,7 +217,27 @@ func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) { // PutKey put into etcd with value by key specified func (e *EtcdClient) PutKey(key string, value []byte, timeout time.Duration) error { ctx, cancel := context.WithTimeout(context.Background(), timeout) - _, err := e.etcdClient.Put(ctx, key, string(value)) + _, err := e.client.Put(ctx, key, string(value), clientv3.WithLease(e.sess.Lease())) cancel() return err } + +// Shutdown shuts down the etcd client gracefully. 
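+// It closes the concurrency session first, which revokes the session lease
+// (and with it every key registered under that lease), then closes the
+// underlying client; the first error encountered is returned.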
+func (e *EtcdClient) Shutdown() error { + var err error + if e.sess != nil { + err = e.sess.Close() + } + + if e.client != nil { + newErr := e.client.Close() + if newErr != nil { + if err != nil { + log.Errorln(newErr) + } else { + err = newErr + } + } + } + return err +} diff --git a/paddle/api/Evaluator.cpp b/paddle/api/Evaluator.cpp index 681e3a380912339c531c16c88f43255c2f34c32f..fcda6eaf031c02f2314298f88b3af2c08ba6fa11 100644 --- a/paddle/api/Evaluator.cpp +++ b/paddle/api/Evaluator.cpp @@ -37,7 +37,7 @@ std::vector Evaluator::getNames() const { double Evaluator::getValue(const std::string name) const { paddle::Error err; double v = m->rawPtr->getValue(name, &err); - if (err) { + if (!err.isOK()) { throw std::runtime_error(err.msg()); } return v; diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h index f55197c8c9ebb4a0f67ab915abfefd6a45cd13aa..9f84db72da24b0e678520b077f9cba7ffc2d589a 100644 --- a/paddle/cuda/include/hl_cnn.h +++ b/paddle/cuda/include/hl_cnn.h @@ -17,73 +17,6 @@ limitations under the License. */ #include "hl_base.h" -/** - * @brief Shrink column to feature. - * - * @param[in] dataCol expand data. - * @param[in] channels number of channel. - * @param[in] height image height. - * @param[in] width image width. - * @param[in] blockH filter height. - * @param[in] blockW filter width. - * @param[in] strideH stride height. - * @param[in] strideW stride width. - * @param[in] paddingH padding height. - * @param[in] paddingW padding width. - * @param[in] outputH output height. - * @param[in] outputW output width. - * @param[out] dataIm output image data. - * @param[in] alpha - * @param[in] beta - */ -extern void hl_shrink_col2feature(const real* dataCol, - size_t channels, - size_t height, - size_t width, - size_t blockH, - size_t blockW, - size_t strideH, - size_t strideW, - size_t paddingH, - size_t paddingW, - size_t outputH, - size_t outputW, - real* dataIm, - real alpha = 1.0f, - real beta = 0.0f); - -/** - * @brief Expand feature to column. - * - * @param[in] dataIm input image data. - * @param[in] channels number of channel. - * @param[in] height image height. - * @param[in] width image width. - * @param[in] blockH filter height. - * @param[in] blockW filter width. - * @param[in] strideH stride height. - * @param[in] strideW stride width. - * @param[in] paddingH padding height. - * @param[in] paddingW padding width. - * @param[in] outputH output height. - * @param[in] outputW output width. - * @param[out] dataCol expand data. - * - */ -extern void hl_expand_feature2col(const real* dataIm, - size_t channels, - size_t height, - size_t width, - size_t blockH, - size_t blockW, - size_t strideH, - size_t strideW, - size_t paddingH, - size_t paddingW, - size_t outputH, - size_t outputW, - real* dataCol); - /** * @brief Maximum pool forward. * diff --git a/paddle/cuda/include/stub/hl_cnn_stub.h b/paddle/cuda/include/stub/hl_cnn_stub.h index 039551c6cc69525e71c8c311f78fb6dec07d7fed..2bbb9fa8dfd5eeac9d55aa67a28ebfbffa2acd46 100644 --- a/paddle/cuda/include/stub/hl_cnn_stub.h +++ b/paddle/cuda/include/stub/hl_cnn_stub.h @@ -17,36 +17,6 @@ limitations under the License. 
*/ #include "hl_cnn.h" -inline void hl_shrink_col2feature(const real* dataCol, - size_t channels, - size_t height, - size_t width, - size_t blockH, - size_t blockW, - size_t strideH, - size_t strideW, - size_t paddingH, - size_t paddingW, - size_t outputH, - size_t outputW, - real* dataIm, - real alpha, - real beta) {} - -inline void hl_expand_feature2col(const real* dataIm, - size_t channels, - size_t height, - size_t width, - size_t blockH, - size_t blockW, - size_t strideH, - size_t strideW, - size_t paddingH, - size_t paddingW, - size_t outputH, - size_t outputW, - real* dataCol) {} - inline void hl_maxpool_forward(const int frameCnt, const real* inputData, const int channels, diff --git a/paddle/cuda/src/hl_cuda_cnn.cu b/paddle/cuda/src/hl_cuda_cnn.cu index b94f4d8fe4a251750c527d4b686fcc8f452d4606..b6e3e63a4f52261e49467bd82fdabd063e81460e 100644 --- a/paddle/cuda/src/hl_cuda_cnn.cu +++ b/paddle/cuda/src/hl_cuda_cnn.cu @@ -18,134 +18,6 @@ limitations under the License. */ #include "hl_cnn.h" #include "hl_device_functions.cuh" -__global__ void KeFeature2col(size_t n, size_t height, const real* data_im, - size_t blockH, size_t blockW, size_t width, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t height_col, size_t width_col, - real* data_col) { - size_t index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if (index < n) { - size_t w_out = index % width_col; - index /= width_col; - size_t h_out = index % height_col; - size_t channel_in = index / height_col; - size_t channel_out = channel_in * blockH * blockW; - size_t h_in = h_out * strideH; - size_t w_in = w_out * strideW; - - data_col += (channel_out * height_col + h_out) * width_col + w_out; - for (size_t i = 0; i < blockH; ++i) { - for (size_t j = 0; j < blockW; ++j) { - int rIdx = int(h_in+i); - int cIdx = int(w_in+j); - if ((rIdx-(int)paddingH) >= (int)height || - (rIdx-(int)paddingH) < 0 || - (cIdx-(int)paddingW) >= (int)width || - (cIdx-(int)paddingW) < 0) { - *data_col = 0; - } else { - rIdx = rIdx + channel_in*height - paddingH; - cIdx = cIdx - paddingW; - *data_col = data_im[rIdx* width + cIdx]; - } - data_col += height_col * width_col; - } - } - } -} - -void hl_expand_feature2col(const real* dataIm, size_t channels, - size_t height, size_t width, - size_t blockH, size_t blockW, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t outputH, size_t outputW, - real* dataCol) { - size_t numKernels = channels * outputH * outputW; - - size_t blocks = (numKernels + 1024 -1) / 1024; - size_t blockX = 512; - size_t blockY = (blocks+512-1)/512; - dim3 threads(1024, 1); - dim3 grid(blockX, blockY); - KeFeature2col<<< grid, threads, 0, STREAM_DEFAULT >>> - (numKernels, height, dataIm, blockH, blockW, width, - strideH, strideW, paddingH, paddingW, - outputH, outputW, dataCol); - CHECK_SYNC("hl_expand_feature2col failed"); -} - -__global__ void KeCol2Feature(size_t n, const real* data_col, size_t height, - size_t width, size_t channels, - size_t blockH, size_t blockW, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t height_col, size_t width_col, - real* data_im, real alpha, real beta) { - size_t index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if (index < n) { - real val = 0; - int w = int(index % width); - int h = int((index / width) % height); - int c = int(index / (width * height)); - if ((w - (int)paddingW) >= 0 && - (w - (int)paddingW) < (width-2 * paddingW) && - (h - (int)paddingH) >= 0 && - (h - 
paddingH) < (height - 2 * paddingH)) { - // compute the start and end of the output - int w_col_start = - (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1; - int w_col_end = - min((int)(w / (int)strideW + 1), (int)(width_col)); - int h_col_start = - (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1; - int h_col_end = min(int(h / strideH + 1), int(height_col)); - for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { - for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - // the col location: [c * width * height + h_out, w_out] - int c_col = int(c * blockH* blockW) + \ - (h - h_col * (int)strideH) * (int)blockW + - (w - w_col * (int)strideW); - val += data_col[(c_col * height_col + h_col) * width_col + w_col]; - } - } - h -= paddingH; - w -= paddingW; - real tD = data_im[c*((width-2*paddingW) * (height-2*paddingH)) + - h*(width-2*paddingW) + w]; - data_im[c*((width-2*paddingW) * (height-2*paddingH)) + - h*(width-2*paddingW) + w] = alpha * val + beta*tD; - } - } -} - -void hl_shrink_col2feature(const real * dataCol, size_t channels, - size_t height, size_t width, - size_t blockH, size_t blockW, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t outputH, size_t outputW, - real* dataIm, real alpha, real beta) { - size_t numKernels = channels * (height + 2*paddingH) * (width + 2*paddingW); - - size_t blocks = (numKernels + 1024 -1) / 1024; - size_t blockX = 512; - size_t blockY = (blocks+512-1)/512; - dim3 threads(1024, 1); - dim3 grid(blockX, blockY); - - // To avoid involving atomic operations, we will launch one kernel per - // bottom dimension, and then in the kernel add up the top dimensions. - KeCol2Feature<<< grid, threads, 0, STREAM_DEFAULT >>> - (numKernels, dataCol, height + 2*paddingH, width + 2*paddingW, - channels, blockH, blockW, strideH, strideW, paddingH, paddingW, - outputH, outputW, dataIm, alpha, beta); - CHECK_SYNC("hl_shrink_col2feature failed"); -} - __global__ void KeMaxPoolForward(const int nthreads, const real* inputData, const int channels, const int height, const int width, diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu index 0fe2877f89f8d0fbc4db40c400037be30bb87ff7..4f650ce03ccb2d14cc2997e9cd426acb91439539 100644 --- a/paddle/cuda/src/hl_cuda_sequence.cu +++ b/paddle/cuda/src/hl_cuda_sequence.cu @@ -330,7 +330,7 @@ __global__ void KeSequenceAvgForward(real* dst, } sum = mode == 1 ? sum : (mode == 0 ? sum / seqLength : sum * my_rsqrt((real)seqLength)); - dst[gid] = sum; + dst[gid] += sum; } } diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 433edbfda742d3be9915eade7b0a455398a501dc..21cb7c7265e0052630b68954fa25f9189e641e7b 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -3,7 +3,7 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) -cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory) +cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context) cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) @@ -29,7 +29,5 @@ py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc. 
add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_dependencies(framework_py_proto framework_py_proto_init) -proto_library(net_proto SRCS net_proto.proto DEPS op_proto) -# cc_library(net SRCS net.cc DEPS operator net_proto op_registry fc_op) -cc_library(net SRCS net.cc DEPS operator net_proto op_registry) +cc_library(net SRCS net.cc DEPS op_registry) cc_test(net_op_test SRCS net_op_test.cc DEPS net add_op mul_op sigmoid_op softmax_op fc_op) diff --git a/paddle/framework/detail/tensor-inl.h b/paddle/framework/detail/tensor-inl.h new file mode 100644 index 0000000000000000000000000000000000000000..e7ff09dd5c954378afeca299e901277c3ebdb96a --- /dev/null +++ b/paddle/framework/detail/tensor-inl.h @@ -0,0 +1,142 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/memory/memcpy.h" + +namespace paddle { +namespace framework { + +template +inline void Tensor::check_memory_size() const { + PADDLE_ENFORCE(holder_ != nullptr, + "Tenosr holds no memory. Call Tensor::mutable_data first."); + PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_, + "Tensor's dims_ is out of bound. Call Tensor::mutable_data " + "first to re-allocate memory."); +} + +template +inline const T* Tensor::data() const { + check_memory_size(); + return reinterpret_cast( + reinterpret_cast(holder_->ptr()) + offset_); +} + +template +inline T* Tensor::data() { + check_memory_size(); + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); +} + +template +inline T* Tensor::mutable_data(DDim dims, platform::Place place) { + static_assert(std::is_pod::value, "T must be POD"); + Resize(dims); + return mutable_data(place); +} + +template +inline T* Tensor::mutable_data(platform::Place place) { + static_assert(std::is_pod::value, "T must be POD"); + PADDLE_ENFORCE(product(dims_) > 0, + "Tensor's numel must be larger than zero to call " + "Tensor::mutable_data. 
Call Tensor::set_dim first."); + /* some versions of boost::variant don't have operator!= */ + size_t size = product(dims_) * sizeof(T); + if (holder_ == nullptr || !(holder_->place() == place) || + holder_->size() < size + offset_) { + if (platform::is_cpu_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), size)); + } +#ifndef PADDLE_ONLY_CPU + else if (platform::is_gpu_place(place)) { + holder_.reset(new PlaceholderImpl( + boost::get(place), size)); + } +#endif + offset_ = 0; + } + return reinterpret_cast(reinterpret_cast(holder_->ptr()) + + offset_); +} + +template +inline void Tensor::ShareDataWith(const Tensor& src) { + src.check_memory_size(); + *this = src; +} + +template +inline void Tensor::CopyFrom(const Tensor& src, + const platform::Place& dst_place) { + src.check_memory_size(); + Resize(src.dims()); + + auto src_place = src.holder_->place(); + auto src_ptr = static_cast(src.data()); + + auto dst_ptr = static_cast(mutable_data(dst_place)); + + auto size = product(src.dims_) * sizeof(T); + + if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size); + } +#ifndef PADDLE_ONLY_CPU + else if (platform::is_gpu_place(src_place) && + platform::is_cpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size, 0); + } else if (platform::is_cpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size, 0); + } else if (platform::is_gpu_place(src_place) && + platform::is_gpu_place(dst_place)) { + memory::Copy(boost::get(dst_place), dst_ptr, + boost::get(src_place), src_ptr, size, 0); + } + +#endif +} + +template +inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const { + check_memory_size(); + PADDLE_ENFORCE(begin_idx >= 0, "Slice begin index is less than zero."); + PADDLE_ENFORCE(end_idx <= dims_[0], "Slice end index is out of bound."); + PADDLE_ENFORCE(begin_idx < end_idx, + "Begin index must be less than end index."); + PADDLE_ENFORCE(dims_[0] != 1, "Can not slice a tensor with dims_[0] = 1."); + int base = product(dims_) / dims_[0]; + Tensor dst; + dst.holder_ = holder_; + DDim dst_dims = dims_; + dst_dims[0] = end_idx - begin_idx; + dst.Resize(dst_dims); + dst.offset_ = offset_ + begin_idx * base * sizeof(T); + return dst; +} + +inline void Tensor::Resize(const DDim& dims) { dims_ = dims; } + +inline const DDim& Tensor::dims() const { return dims_; } + +} // namespace framework +} // namespace paddle diff --git a/paddle/framework/net.cc b/paddle/framework/net.cc index bc23b63b35d37eea01ae6b9b8891e9cd94615898..2cd378c6b21303d1a24206ba3010b0d035aaa766 100644 --- a/paddle/framework/net.cc +++ b/paddle/framework/net.cc @@ -20,17 +20,7 @@ namespace paddle { namespace framework { -std::shared_ptr AddBackwardOp(std::shared_ptr ForwardOps) { - auto grad_ops = std::make_shared(); - for (auto& op : ForwardOps->ops_) { - auto op_grad = OpRegistry::CreateGradOp(op); - grad_ops->AddOp(op_grad); - } - grad_ops->CompleteAddOp(); - return grad_ops; -} - -void PlainNet::CompleteAddOp(bool calc) { +void NetOp::CompleteAddOp(bool calc) { add_op_done_ = true; if (!calc) return; std::unordered_set input_set; @@ -70,7 +60,7 @@ void PlainNet::CompleteAddOp(bool calc) { attrs_["temporary_index"] = tmp_index; } -std::string PlainNet::DebugString() const { +std::string NetOp::DebugString() const { std::ostringstream os; os << 
diff --git a/paddle/framework/net.h b/paddle/framework/net.h
index 3264f1f565e3efc188e7835cb9b44e5741e1eea8..089c1355951f59d51db16d4b4bdce4282d6e5c25 100644
--- a/paddle/framework/net.h
+++ b/paddle/framework/net.h
@@ -37,21 +37,7 @@ namespace framework {
  * This is the base class of network, all the networks should implement the APIs
  * it defines.
  */
-class Net : public OperatorBase {
- public:
-  virtual void AddOp(const std::shared_ptr<OperatorBase>& op) = 0;
-  virtual void CompleteAddOp(bool calc) = 0;
-};
-
-using NetPtr = std::shared_ptr<Net>;
-
-/**
- * @brief a basic implementation of Net.
- *
- * PlainNet is a very simple Net, it create a list of operators, and run them
- * sequentially following the order they added.
- */
-class PlainNet : public Net {
+class NetOp : public OperatorBase {
  public:
   /**
    * Infer all the operators' input and output variables' shapes, will be called
@@ -80,15 +66,17 @@
   /**
    * @brief Add an operator by ptr
    */
-  void AddOp(const std::shared_ptr<OperatorBase>& op) override {
+  void AddOp(const std::shared_ptr<OperatorBase>& op) {
     PADDLE_ENFORCE(!add_op_done_, "Cannot AddOp when this network is sealed");
     ops_.push_back(op);
   }
 
-  void CompleteAddOp(bool calculate = true) override;
+  void CompleteAddOp(bool calculate = true);
 
   std::string DebugString() const override;
 
+  bool IsNetOp() const override;
+
   std::vector<std::shared_ptr<OperatorBase>> ops_;
 
  private:
@@ -100,7 +88,5 @@
   }
 };
 
-std::shared_ptr<PlainNet> AddBackwardOp(std::shared_ptr<PlainNet> ForwardOps);
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/net_op_test.cc b/paddle/framework/net_op_test.cc
index 20b42cbb4923590804a7806ac42347590c73d62f..8048311fe54ee1827fb5b91577478a1d30803e43 100644
--- a/paddle/framework/net_op_test.cc
+++ b/paddle/framework/net_op_test.cc
@@ -40,7 +40,7 @@ void AssertSameVectorWithoutOrder(const std::vector<T>& expected,
 }
 
 TEST(OpKernel, all) {
-  auto net = std::make_shared<PlainNet>();
+  auto net = std::make_shared<NetOp>();
   ASSERT_NE(net, nullptr);
 
   auto op1 = std::make_shared<TestOp>();
@@ -69,30 +69,23 @@ TEST(OpKernel, all) {
   net->Run(scope, dev_ctx);
   ASSERT_EQ(2, infer_shape_cnt);
   ASSERT_EQ(2, run_cnt);
-  ASSERT_THROW(net->AddOp(op2), std::runtime_error);
-}
-
-TEST(AddBackwardOp, TestGradOp) {
-  auto net = std::make_shared<PlainNet>();
-  ASSERT_NE(net, nullptr);
-  net->AddOp(framework::OpRegistry::CreateOp("mul", {"X", "Y"}, {"Out"}, {}));
-  net->AddOp(
-      framework::OpRegistry::CreateOp("add_two", {"X", "Y"}, {"Out"}, {}));
-  net->AddOp(framework::OpRegistry::CreateOp("add_two", {"X", "Y"}, {""}, {}));
-  auto grad_ops = AddBackwardOp(net);
-  for (auto& op : grad_ops->ops_) {
-    op->DebugString();
-  }
+  ASSERT_THROW(net->AddOp(op2), paddle::platform::EnforceNotMet);
 }
 
-// TODO(zhihong): add fc grad without registering.
-// TEST(AddBackwardOp, TestNoGradOp) {
-// auto net = std::make_shared<PlainNet>();
-// ASSERT_NE(net, nullptr);
-// net->AddOp(framework::OpRegistry::CreateOp("fc", {"X", "W", "b"}, {"Y"},
-// {})); auto grad_ops = AddBackwardOp(net); for (auto& op : grad_ops->ops_) {
-// op->DebugString();
-// }
-// }
+//! TODO(yuyang18): Refine Backward Op.
+// TEST(AddBackwardOp, TestGradOp) {
+// auto net = std::make_shared<NetOp>();
+// ASSERT_NE(net, nullptr);
+// net->AddOp(framework::OpRegistry::CreateOp("mul", {"X", "Y"}, {"Out"}, {}));
+// net->AddOp(
+// framework::OpRegistry::CreateOp("add_two", {"X", "Y"}, {"Out"}, {}));
+// net->AddOp(framework::OpRegistry::CreateOp("add_two", {"X", "Y"}, {""},
+// {}));
+// auto grad_ops = AddBackwardOp(net);
+// for (auto& op : grad_ops->ops_) {
+// op->DebugString();
+// }
+//}
 
 }  // namespace framework
 }  // namespace paddle
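With `IsNetOp()` now on OperatorBase, callers can branch on "is this a network" without RTTI. A small sketch of how this might be used (the traversal helper is hypothetical; `NetOp::ops_` is the public member added above):

    // Hypothetical helper: counts leaf operators, recursing into nested nets.
    size_t CountLeafOps(
        const std::shared_ptr<paddle::framework::OperatorBase>& op) {
      if (!op->IsNetOp()) return 1;
      auto net = std::static_pointer_cast<paddle::framework::NetOp>(op);
      size_t n = 0;
      for (auto& sub : net->ops_) n += CountLeafOps(sub);
      return n;
    }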
diff --git a/paddle/framework/net_proto.proto b/paddle/framework/net_proto.proto
deleted file mode 100644
index 0779f49fe2a9a6d0d1ea5ec11ba3befeb0a67fa1..0000000000000000000000000000000000000000
--- a/paddle/framework/net_proto.proto
+++ /dev/null
@@ -1,15 +0,0 @@
-syntax="proto2";
-package paddle.framework;
-
-import "op_proto.proto";
-
-message NetDesc {
-  // network identification
-  optional string name = 1;
-  // operator contains in network
-  repeated OpProto operators = 2;
-  // network type to run with. e.g "plainNet", "DAG"
-  optional string net_type = 3;
-  // num worker always
-  optional int32 num_workers = 4;
-}
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index f16deae028d76dc40d6bc589648b461c430c3c98..384f0f631dd9b9a4dd7c0c628340afe668bc248f 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -403,15 +403,16 @@ class GradOpRegisterHelper {
   STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
       __reg_op_kernel_##type##_##DEVICE_TYPE##__,                         \
       "REGISTER_OP_KERNEL must be in global namespace");                  \
-  struct __op_kernel_register__##type##__ {                               \
-    __op_kernel_register__##type##__() {                                  \
+  struct __op_kernel_register__##type##__##DEVICE_TYPE##__ {              \
+    __op_kernel_register__##type##__##DEVICE_TYPE##__() {                 \
      ::paddle::framework::OperatorWithKernel::OpKernelKey key;            \
      key.place_ = PlaceType();                                            \
      ::paddle::framework::OperatorWithKernel::AllOpKernels()[#type][key]  \
          .reset(new __VA_ARGS__());                                       \
    }                                                                      \
  };                                                                       \
-  static __op_kernel_register__##type##__ __reg_kernel_##type##__;        \
+  static __op_kernel_register__##type##__##DEVICE_TYPE##__                \
+      __reg_kernel_##type##__##DEVICE_TYPE##__;                           \
   int __op_kernel_register_##type##_handle_##DEVICE_TYPE##__() { return 0; }
 
 // (type, KernelType)
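The rename above fixes a symbol clash: registering kernels for both devices for the same op in one translation unit previously defined the same registrar struct and static twice. A stripped-down sketch of the pattern (not the actual macro; names are illustrative):

    // Keying the registrar on DEVICE_TYPE yields distinct symbols, e.g.
    // demo_reg_add_CPU__ and demo_reg_add_GPU__, so both can coexist.
    #define DEMO_REGISTER_KERNEL(type, DEVICE_TYPE)                  \
      struct demo_reg_##type##_##DEVICE_TYPE##__ {                   \
        demo_reg_##type##_##DEVICE_TYPE##__() { /* insert kernel */ } \
      };                                                             \
      static demo_reg_##type##_##DEVICE_TYPE##__                     \
          demo_var_##type##_##DEVICE_TYPE##__;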
diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc
index 05095372d835e7137daedb548b4bb78043e586ea..2ef781bf8672c8aa53ae32a44f1ea61973f3792c 100644
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
@@ -90,7 +90,7 @@ TEST(OpRegistry, IllegalAttr) {
   bool caught = false;
   try {
     paddle::framework::OpRegistry::CreateOp(op_desc);
-  } catch (std::runtime_error& err) {
+  } catch (paddle::platform::EnforceNotMet err) {
     caught = true;
     std::string msg = "larger_than check fail";
     const char* err_msg = err.what();
@@ -136,7 +136,7 @@ TEST(OpRegistry, CustomChecker) {
   bool caught = false;
   try {
     paddle::framework::OpRegistry::CreateOp(op_desc);
-  } catch (std::runtime_error& err) {
+  } catch (paddle::platform::EnforceNotMet err) {
     caught = true;
     std::string msg = "Attribute 'test_attr' is required!";
     const char* err_msg = err.what();
@@ -154,7 +154,7 @@ TEST(OpRegistry, CustomChecker) {
   caught = false;
   try {
     paddle::framework::OpRegistry::CreateOp(op_desc);
-  } catch (std::runtime_error& err) {
+  } catch (paddle::platform::EnforceNotMet err) {
     caught = true;
     std::string msg = "'test_attr' must be even!";
     const char* err_msg = err.what();
@@ -192,7 +192,7 @@ TEST(ProtoMaker, DuplicatedAttr) {
   pd::OpProto op_proto;
   pd::OpAttrChecker op_checker;
   auto proto_maker = TestAttrProtoMaker(&op_proto, &op_checker);
-  ASSERT_THROW(proto_maker.Validate(), std::runtime_error);
+  ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
 }
 
 class TestInOutProtoMaker : public pd::OpProtoAndCheckerMaker {
@@ -208,5 +208,5 @@ TEST(ProtoMaker, DuplicatedInOut) {
   pd::OpProto op_proto;
   pd::OpAttrChecker op_checker;
   auto proto_maker = TestInOutProtoMaker(&op_proto, &op_checker);
-  ASSERT_THROW(proto_maker.Validate(), std::runtime_error);
+  ASSERT_THROW(proto_maker.Validate(), paddle::platform::EnforceNotMet);
 }
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 0a317dffa961a365ad8981aa8cae53c27e80c16b..745508e6ac6e8f310a8ebd8b8d0762fb8ab39bd4 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -34,22 +34,26 @@ KernelContext::GetEigenDevice() const {
 #endif
 
 const std::string& OperatorBase::Input(const std::string& name) const {
+  PADDLE_ENFORCE(in_out_idxs_ != nullptr,
+                 "Input Output Indices could not be nullptr");
   auto it = in_out_idxs_->find(name);
   PADDLE_ENFORCE(it != in_out_idxs_->end(), "no key [%s] in in_out_idxs_",
                  name);
   if (attrs_.count("input_format") == 0) {
-    return inputs_[it->second];
+    return inputs_.at((size_t)it->second);
   } else {
     const auto& input_format = GetAttr<std::vector<int>>("input_format");
     int idx = input_format[it->second];
-    return inputs_.at(idx);
+    return inputs_.at((size_t)idx);
   }
 }
 
 std::vector<std::string> OperatorBase::Inputs(const std::string& name) const {
+  PADDLE_ENFORCE(in_out_idxs_ != nullptr, "IO Idx could not be nullptr");
   auto input_format = GetAttr<std::vector<int>>("input_format");
   auto offset = in_out_idxs_->at(name);
+  PADDLE_ENFORCE(input_format.at((size_t)offset + 1) <= inputs_.size(),
+                 "Input Out Of Range");
 
   return std::vector<std::string>{
       inputs_.begin() + input_format.at(offset),
@@ -57,23 +61,25 @@ std::vector<std::string> OperatorBase::Inputs(const std::string& name) const {
 }
 
 const std::string& OperatorBase::Output(const std::string& name) const {
+  PADDLE_ENFORCE(in_out_idxs_ != nullptr, "InOut Indice could not be nullptr");
   auto it = in_out_idxs_->find(name);
   PADDLE_ENFORCE(it != in_out_idxs_->end(), "no key [%s] in in_out_idxs_",
                  name);
   if (attrs_.count("output_format") == 0) {
-    return outputs_[it->second];
+    return outputs_.at((size_t)it->second);
   } else {
     const auto& output_format = GetAttr<std::vector<int>>("output_format");
     int idx = output_format[it->second];
-    return outputs_.at(idx);
+    return outputs_.at((size_t)idx);
   }
 }
 
 std::vector<std::string> OperatorBase::Outputs(const std::string& name) const {
+  PADDLE_ENFORCE(in_out_idxs_ != nullptr, "InOut Indice could not be nullptr");
   auto output_format = GetAttr<std::vector<int>>("output_format");
   auto offset = in_out_idxs_->at(name);
-
+  PADDLE_ENFORCE(output_format.at((size_t)offset + 1) <= outputs_.size(),
+                 "Output Out of Range");
   return std::vector<std::string>{
       outputs_.begin() + output_format.at(offset),
       outputs_.begin() + output_format.at(offset + 1)};
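`Inputs(name)` treats the `input_format` attribute as an offset table: entries i and i + 1 bracket the slice of `inputs_` owned by named argument i. A worked sketch with hypothetical values:

    #include <string>
    #include <vector>

    // Mirrors OperatorBase::Inputs: argument "X" owns slots [0, 3) of inputs_.
    std::vector<std::string> inputs = {"x0", "x1", "x2", "b"};
    std::vector<int> input_format = {0, 3, 4};  // per-argument offsets
    int offset = 0;  // in_out_idxs_ entry for "X"
    std::vector<std::string> X(inputs.begin() + input_format.at(offset),
                               inputs.begin() + input_format.at(offset + 1));
    // X == {"x0", "x1", "x2"}. The new PADDLE_ENFORCE rejects a format whose
    // end offset exceeds inputs.size() with EnforceNotMet instead of reading
    // past the end of the vector.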
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 9ba661968cd2bb60c7aa76513363db65b54c8923..8d9ca92565b3d34c5ede7c66cc9eb0d629e8d4e4 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -90,15 +90,17 @@ class OperatorBase {
   virtual void Run(const std::shared_ptr<Scope>& scope,
                    const platform::DeviceContext& dev_ctx) const = 0;
 
-  // Get a input with argument's name described in `op_proto`
+  virtual bool IsNetOp() const { return false; }
+
+  //! Get an input with argument's name described in `op_proto`
   const std::string& Input(const std::string& name) const;
-  // Get a input which has multiple variables.
-  // TODO add a vector_view to prevent memory copy.
+  //! Get an input which has multiple variables.
+  //! TODO add a vector_view to prevent memory copy.
   std::vector<std::string> Inputs(const std::string& name) const;
-  // Get a output with argument's name described in `op_proto`
+  //! Get an output with argument's name described in `op_proto`
   const std::string& Output(const std::string& name) const;
-  // Get an output which has multiple variables.
-  // TODO add a vector_view to prevent memory copy.
+  //! Get an output which has multiple variables.
+  //! TODO add a vector_view to prevent memory copy.
   std::vector<std::string> Outputs(const std::string& name) const;
 
  public:
@@ -199,7 +201,9 @@ class OperatorWithKernel : public OperatorBase {
       place_ = dev_ctx.GetPlace();
     }
 
-    bool operator==(const OpKernelKey& o) const { return place_ == o.place_; }
+    bool operator==(const OpKernelKey& o) const {
+      return platform::places_are_same_class(place_, o.place_);
+    }
   };
 
   struct OpKernelHash {
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index 79c9ffd1a677346fac7712373681acdbaa8116d6..4faaf841440ba30b79c83d09fea977186bd0270a 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -56,7 +56,9 @@ class Scope {
     if (var) {
       return var;
     } else {
-      vars_[name] = std::unique_ptr<Variable>(new Variable());
+      auto ptr = new Variable();
+      name_to_var_[name] = std::unique_ptr<Variable>(ptr);
+      var_to_name_[ptr] = name;
       return GetVariable(name);
     }
   }
@@ -68,8 +70,8 @@ class Scope {
    * from it's parent scope. Return nullptr if not found.
    */
   Variable* GetVariable(const std::string& name) const {
-    auto it = vars_.find(name);
-    if (it != vars_.end()) {
+    auto it = name_to_var_.find(name);
+    if (it != name_to_var_.end()) {
       return it->second.get();
     } else if (parent_ != nullptr) {
       return parent_->GetVariable(name);
@@ -84,12 +86,21 @@ class Scope {
    * Find if there is a Variable in this scope and it's parent scope
    */
   bool HasVariable(const std::string& name) const {
-    return (vars_.find(name) != vars_.end() ||
+    return (name_to_var_.find(name) != name_to_var_.end() ||
            (parent_ && parent_->HasVariable(name)));
   }
 
+  std::string GetVariableName(Variable* const var) const {
+    try {
+      return var_to_name_.at(var);
+    } catch (...) {
+      return "";
+    }
+  }
+
  private:
-  std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
+  std::unordered_map<Variable*, std::string> var_to_name_;
+  std::unordered_map<std::string, std::unique_ptr<Variable>> name_to_var_;
   std::shared_ptr<Scope> parent_{nullptr};
 };
diff --git a/paddle/framework/scope_test.cc b/paddle/framework/scope_test.cc
index df1afb200ce9e75c5b1e40f2da56667487ae3576..ff069c7be002e9bcfd63225c3d80aa958935ba14 100644
--- a/paddle/framework/scope_test.cc
+++ b/paddle/framework/scope_test.cc
@@ -40,6 +40,11 @@ TEST(Scope, Create) {
   /// already exist.
   Variable* var4 = scope->CreateVariable("a");
   EXPECT_EQ(var4, var2);
+
+  EXPECT_EQ("a", scope->GetVariableName(var4));
+  Scope scope2;
+  auto var = scope2.CreateVariable("tmp");
+  EXPECT_EQ("", scope->GetVariableName(var));
 }
 
 TEST(Scope, Parent) {
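`GetVariableName` is backed by a second map kept in sync at creation time: the forward map owns the Variable, the reverse map records its name. A sketch of the round trip (variable names hypothetical):

    #include <cassert>

    auto scope = std::make_shared<paddle::framework::Scope>();
    paddle::framework::Variable* w = scope->CreateVariable("weights");
    // name_to_var_: name -> owning unique_ptr; var_to_name_: raw ptr -> name.
    assert(scope->GetVariable("weights") == w);
    assert(scope->GetVariableName(w) == "weights");
    // A pointer this scope never created misses var_to_name_, so the
    // catch(...) path returns the empty string, as the test above checks.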
diff --git a/paddle/framework/tensor.cc b/paddle/framework/tensor.cc
index 964f15ab66bca7da75824e192e61600c29e572c0..ea7b2a1f7b17d9abc2c2e14de5ecd1cf4a1a5027 100644
--- a/paddle/framework/tensor.cc
+++ b/paddle/framework/tensor.cc
@@ -12,7 +12,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <paddle/framework/tensor.h>
+#include "paddle/framework/tensor.h"
 
 namespace paddle {
 namespace framework {}
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index a36f375d2e42ee3c46ddef42954335cba7eb88f2..76070f636b0971f4a136042e056c59adb5dc2d40 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -20,6 +20,7 @@ limitations under the License. */
 #include <typeindex>
 #include "paddle/framework/ddim.h"
 #include "paddle/memory/memory.h"
+#include "paddle/platform/device_context.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
 #include "unsupported/Eigen/CXX11/Tensor"
@@ -31,9 +32,11 @@ template <bool less, size_t i, typename... args>
 struct CastToPyBufferImpl;
 }  // namespace details
 }  // namespace pybind
+
 namespace framework {
 
 class Tensor {
+ public:
   template <bool less, size_t i, typename... args>
   friend struct paddle::pybind::details::CastToPyBufferImpl;
@@ -46,143 +49,122 @@ class Tensor {
  public:
   Tensor() : offset_(0) {}
 
+  /*! Return a pointer to mutable memory block. */
   template <typename T>
-  const T* data() const {
-    EnforceSufficientMemory<T>();
-    return reinterpret_cast<const T*>(
-        reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
-  }
+  inline T* data();
 
+  /*! Return a pointer to constant memory block. */
   template <typename T>
-  T* data() {
-    EnforceSufficientMemory<T>();
-    return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
-                                offset_);
-  }
-
-  template <typename T,
-            typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
-  T* mutable_data(DDim dims, platform::Place place) {
-    Resize(dims);
-    return mutable_data<T>(place);
-  }
-
-  template <typename T,
-            typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
-  T* mutable_data(platform::Place place) {
-    PADDLE_ENFORCE(product(dims_) > 0,
-                   "Tensor's numel must be larger than zero to call "
-                   "Tensor::mutable_data. Call Tensor::set_dim first.");
-    if (holder_ == nullptr ||
-        !(holder_->place() ==
-          place) /* some versions of boost::variant don't have operator!= */
-        || holder_->size() < product(dims_) * sizeof(T) + offset_) {
-      if (platform::is_cpu_place(place)) {
-        holder_.reset(new PlaceholderImpl<T, platform::CPUPlace>(
-            boost::get<platform::CPUPlace>(place), product(dims_) * sizeof(T)));
-      } else if (platform::is_gpu_place(place)) {
-#ifdef PADDLE_ONLY_CPU
-        PADDLE_THROW("'GPUPlace' is not supported in CPU only device.");
-#else
-        holder_.reset(new PlaceholderImpl<T, platform::GPUPlace>(
-            boost::get<platform::GPUPlace>(place), product(dims_) * sizeof(T)));
-#endif
-      } else {
-        PADDLE_THROW("Unknown 'place'.");
-      }
-      offset_ = 0;
-    }
-    return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
-                                offset_);
-  }
+  inline const T* data() const;
 
+  /**
+   * @brief   Return a pointer to mutable memory block.
+   * @note    If not exist, then allocation.
+   */
   template <typename T>
-  void ShareDataWith(const Tensor& src) {
-    src.EnforceSufficientMemory<T>();
-    *this = src;
-  }
+  inline T* mutable_data(platform::Place place);
+
+  /**
+   * @brief     Return a pointer to mutable memory block.
+   *
+   * @param[in] dims    The dimensions of the memory block.
+   * @param[in] place   The place of the memory block.
+   *
+   * @note      If not exist, then allocation.
+   */
+  template <typename T>
+  inline T* mutable_data(DDim dims, platform::Place place);
+
+  /*! Return the dimensions of the memory block. */
+  inline const DDim& dims() const;
+
+  /*! Resize the dimensions of the memory block. */
+  inline void Resize(const DDim& dims);
+
+  /*! The internal of two tensors share the same memory block. */
+  template <typename T>
+  inline void ShareDataWith(const Tensor& src);
+
+  /**
+   * @brief   Copy the content of external tensor to a new place.
+   *
+   * @param[in] src   The external tensor.
+   * @param[in] ctx   The device context contains place where to store.
+   *
+   * @note    CopyFrom supports CPU <-> GPU, GPU <-> GPU.
+ */ + template + inline void CopyFrom(const Tensor& src, const platform::Place& dst_place); + + /** + * @brief Return the slice of the tensor. + * + * @param[in] begin_idx The begin index of the slice. + * @param[in] end_idx The end index of the slice. + */ template - void CopyFrom(const Tensor& src, platform::Place dst_place) { - PADDLE_ENFORCE(platform::is_cpu_place(src.holder_->place()) && - platform::is_cpu_place(dst_place), - "Tensor::CopyFrom only support CPU now."); - src.EnforceSufficientMemory(); - size_t size = product(src.dims_) * sizeof(T); - Resize(src.dims()); - const void* src_ptr = static_cast(src.data()); - void* dst_ptr = static_cast(mutable_data(dst_place)); - memcpy(dst_ptr, src_ptr, size); - } + inline Tensor Slice(const int& begin_idx, const int& end_idx) const; + private: template - Tensor Slice(const int& begin_idx, const int& end_idx) const { - EnforceSufficientMemory(); - PADDLE_ENFORCE(begin_idx >= 0, "Slice begin index is less than zero."); - PADDLE_ENFORCE(end_idx <= dims_[0], "Slice end index is out of bound."); - PADDLE_ENFORCE(begin_idx < end_idx, - "Begin index must be less than end index."); - PADDLE_ENFORCE(dims_[0] != 1, "Can not slice a tensor with dims_[0] = 1."); - int base = product(dims_) / dims_[0]; - Tensor dst; - dst.holder_ = holder_; - DDim dst_dims = dims_; - dst_dims[0] = end_idx - begin_idx; - dst.Resize(dst_dims); - dst.offset_ = offset_ + begin_idx * base * sizeof(T); - return dst; - } - - void Resize(const DDim& dims) { dims_ = dims; } - - const DDim& dims() const { return dims_; } + inline void check_memory_size() const; private: - // Placeholder hides type T, so it doesn't appear as a template - // parameter of Variable. + /** + * @note Placeholder hides type T, so it doesn't appear as a template + * parameter of Variable. + */ struct Placeholder { virtual ~Placeholder() {} virtual void* ptr() const = 0; - virtual platform::Place place() const = 0; virtual size_t size() const = 0; virtual std::type_index type() const = 0; + virtual platform::Place place() const = 0; }; - template + template struct PlaceholderImpl : public Placeholder { - PlaceholderImpl(PlaceType place, size_t size) + PlaceholderImpl(Place place, size_t size) : ptr_(static_cast(memory::Alloc(place, size)), - memory::PODDeleter(place)), + memory::PODDeleter(place)), place_(place), - size_(size) {} + size_(size) { + PADDLE_ENFORCE(ptr_ != nullptr, "Insufficient %s memory to allocation.", + is_cpu_place(place_) ? "CPU" : "GPU"); + } - virtual void* ptr() const { return static_cast(ptr_.get()); } virtual size_t size() const { return size_; } - virtual paddle::platform::Place place() const { return place_; } + virtual platform::Place place() const { return place_; } + virtual void* ptr() const { return static_cast(ptr_.get()); } virtual std::type_index type() const { return std::type_index(typeid(T)); } - std::unique_ptr> ptr_; - platform::Place place_; // record the place of ptr_. - size_t size_; // size of the memory block. + /*! the pointer of memory block. */ + std::unique_ptr> ptr_; + + /*! the place of memory block. */ + platform::Place place_; + + /*! the size of memory block. */ + size_t size_; }; - template - inline void EnforceSufficientMemory() const { - PADDLE_ENFORCE(holder_ != nullptr, - "Tenosr holds no memory. Call Tensor::mutable_data first."); - PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_, - "Tensor's dims_ is out of bound. 
Call Tensor::mutable_data " - "first to re-allocate memory."); - } - - std::shared_ptr holder_; // holds the memory block if allocated. + /*! holds the memory block if allocated. */ + std::shared_ptr holder_; + + /*! points to dimensions of memory block. */ DDim dims_; - // A PlaceHolder may be shared by more than one tensor. Some of them may be - // slices of the others. So the offset_ is introduced here to indicate the - // byte offset between PlaceHolder::ptr_ and where tensor's data really - // begins. + + /** + * @brief A PlaceHolder may be shared by more than one tensor. + * + * @note Some of them may be slices of the others. So the offset_ + * is introduced here to indicate the byte offset between + * PlaceHolder::ptr_ and where the tensor data really begins. + */ size_t offset_; }; } // namespace framework } // namespace paddle + +#include "paddle/framework/detail/tensor-inl.h" diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc index 089844dc0164dae8067846a8e6846d47fb1b0833..ef1cc10b840896d9ab97f963fc12a4971cd74e1f 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -33,7 +33,7 @@ TEST(Tensor, DataAssert) { bool caught = false; try { src_tensor.data(); - } catch (std::runtime_error& err) { + } catch (paddle::platform::EnforceNotMet err) { caught = true; std::string msg = "Tenosr holds no memory. Call Tensor::mutable_data first."; @@ -72,7 +72,8 @@ TEST(Tensor, MutableData) { p2 = src_tensor.mutable_data(make_ddim({2, 2}), CPUPlace()); EXPECT_EQ(p1, p2); } -#ifdef __CUDACC__ + +#ifndef PADDLE_ONLY_CPU { Tensor src_tensor; float* p1 = nullptr; @@ -107,7 +108,7 @@ TEST(Tensor, ShareDataWith) { bool caught = false; try { dst_tensor.ShareDataWith(src_tensor); - } catch (std::runtime_error& err) { + } catch (paddle::platform::EnforceNotMet err) { caught = true; std::string msg = "Tenosr holds no memory. 
Call Tensor::mutable_data first."; @@ -123,7 +124,7 @@ TEST(Tensor, ShareDataWith) { ASSERT_EQ(src_tensor.data(), dst_tensor.data()); } -#ifdef __CUDACC__ +#ifndef PADDLE_ONLY_CPU { Tensor src_tensor; Tensor dst_tensor; @@ -160,7 +161,7 @@ TEST(Tensor, Slice) { EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address); } -#ifdef __CUDACC__ +#ifndef PADDLE_ONLY_CPU { Tensor src_tensor; src_tensor.mutable_data(make_ddim({6, 9}), GPUPlace()); @@ -188,25 +189,74 @@ TEST(Tensor, Slice) { TEST(Tensor, CopyFrom) { using namespace paddle::framework; using namespace paddle::platform; + { + Tensor src_tensor; + Tensor dst_tensor; + + int* src_ptr = src_tensor.mutable_data(make_ddim({3, 3}), CPUPlace()); + + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); - Tensor src_tensor; - int* src_ptr = src_tensor.mutable_data(make_ddim({3, 3}), CPUPlace()); - int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - memcpy(src_ptr, arr, 9 * sizeof(int)); - Tensor dst_tensor; - dst_tensor.CopyFrom(src_tensor, CPUPlace()); - const int* dst_ptr = dst_tensor.data(); - ASSERT_NE(src_ptr, dst_ptr); - for (size_t i = 0; i < 9; ++i) { - EXPECT_EQ(src_ptr[i], dst_ptr[i]); + auto cpu_place = new paddle::platform::CPUPlace(); + dst_tensor.CopyFrom(src_tensor, *cpu_place); + + const int* dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + Tensor slice_tensor = src_tensor.Slice(1, 2); + dst_tensor.CopyFrom(slice_tensor, *cpu_place); + const int* slice_ptr = slice_tensor.data(); + dst_ptr = dst_tensor.data(); + ASSERT_NE(dst_ptr, slice_ptr); + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } } +#ifndef PADDLE_ONLY_CPU + { + Tensor src_tensor; + Tensor gpu_tensor; + Tensor dst_tensor; + + int* src_ptr = src_tensor.mutable_data(make_ddim({3, 3}), CPUPlace()); + + int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; + memcpy(src_ptr, arr, 9 * sizeof(int)); + + // CPU Tensor to GPU Tensor + auto gpu_place = new paddle::platform::GPUPlace(0); + gpu_tensor.CopyFrom(src_tensor, *gpu_place); + + // GPU Tensor to CPU Tensor + auto cpu_place = new paddle::platform::CPUPlace(); + dst_tensor.CopyFrom(gpu_tensor, *cpu_place); + + // Compare Tensors + const int* dst_ptr = dst_tensor.data(); + ASSERT_NE(src_ptr, dst_ptr); + for (size_t i = 0; i < 9; ++i) { + EXPECT_EQ(src_ptr[i], dst_ptr[i]); + } + + Tensor slice_tensor = src_tensor.Slice(1, 2); + + // CPU Slice Tensor to GPU Tensor + gpu_tensor.CopyFrom(slice_tensor, *gpu_place); - Tensor slice_tensor = src_tensor.Slice(1, 2); - dst_tensor.CopyFrom(slice_tensor, CPUPlace()); - const int* slice_ptr = slice_tensor.data(); - dst_ptr = dst_tensor.data(); - ASSERT_NE(dst_ptr, slice_ptr); - for (size_t i = 0; i < 3; ++i) { - EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + // GPU Tensor to CPU Tensor + dst_tensor.CopyFrom(gpu_tensor, *cpu_place); + + // Compare Slice Tensors + const int* slice_ptr = slice_tensor.data(); + dst_ptr = dst_tensor.data(); + ASSERT_NE(dst_ptr, slice_ptr); + for (size_t i = 0; i < 3; ++i) { + EXPECT_EQ(dst_ptr[i], slice_ptr[i]); + } } +#endif } diff --git a/paddle/function/BlockExpandOp.cpp b/paddle/function/BlockExpandOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a89b6bba45843d81264819cad6ba053f28314f6b --- /dev/null +++ b/paddle/function/BlockExpandOp.cpp @@ -0,0 +1,202 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "Function.h" +#include "Im2Col.h" + +namespace paddle { + +/* + * \brief Converts the image data of four dimensions(NCHW) into + * a sequence data of three dimensions(NST) in the forward calculation, + * which is reversed in the backward calculation. + * Where N is batch size, S is the length of the sequence after each + * image is expanded, T is the size of each time step in the sequence. + * + * Arguments in forward function: + * \param inputs[0] Image data of NCHW format. + * \param outputs[0] Sequence data of NST format. + * + * Arguments in backward function: + * \param inputs[0] Sequence data of NST format. + * \param outputs[0] Image data of NCHW format. + */ +class BlockExpandFunction : public FunctionBase { +public: + void init(const FuncConfig& config) override { + // function arguments + strides_ = config.get>("strides"); + paddings_ = config.get>("paddings"); + blocks_ = config.get>("blocks"); + + // number of inputs and outputs + numInputs_ = 1; + numOutputs_ = 1; + } + + void checkShape(const TensorShape& image, const TensorShape& sequence) const { + // image shape should be 4-dimensional. + CHECK_EQ(image.ndims(), (size_t)4); + // sequence shape should be 3-dimensional. + CHECK_EQ(sequence.ndims(), (size_t)3); + // The batchSize of the image needs to be equal to + // the batchSize of the sequence. + CHECK_EQ(image[0], sequence[0]); + } + + // Calculate the shape of colData based on the shape of the image + // and the shape of the sequence. 
+ TensorShape getColShape(const TensorShape& image, + const TensorShape& sequence) const { + size_t inputChannels = image[1]; + size_t inputHeight = image[2]; + size_t inputWidth = image[3]; + size_t seqLength = sequence[1]; + size_t stepSize = sequence[2]; + size_t outputHeight = + 1 + + (inputHeight + 2 * paddingH() - blockH() + strideH() - 1) / strideH(); + size_t outputWidth = + 1 + + (inputWidth + 2 * paddingW() - blockW() + strideW() - 1) / strideW(); + CHECK_EQ(seqLength, outputHeight * outputWidth); + CHECK_EQ(stepSize, inputChannels * blockH() * blockW()); + + // [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth] + return TensorShape({outputHeight, + outputWidth, + inputChannels, + (size_t)blockH(), + (size_t)blockW()}); + } + +protected: + std::vector strides_; + std::vector paddings_; + std::vector blocks_; + + inline int strideH() const { return strides_[0]; } + + inline int strideW() const { return strides_[1]; } + + inline int paddingH() const { return paddings_[0]; } + + inline int paddingW() const { return paddings_[1]; } + + inline int blockH() const { return blocks_[0]; } + + inline int blockW() const { return blocks_[1]; } +}; + +template +class BlockExpandForward : public BlockExpandFunction { +public: + void init(const FuncConfig& config) override { + BlockExpandFunction::init(config); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& image = inputs[0].shape(); + const TensorShape& sequence = outputs[0].shape(); + checkShape(image, sequence); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); + CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); + const TensorShape& image = inputs[0].shape(); + const TensorShape& sequence = outputs[0].shape(); + + TensorShape imShape = TensorShape({image[1], image[2], image[3]}); + TensorShape colShape = getColShape(image, sequence); + size_t batchSize = image[0]; + + real* imageData = inputs[0].data(); + real* seqData = outputs[0].data(); + Im2ColFunctor im2col; + for (size_t i = 0; i < batchSize; i++) { + // The result of im2col is [outputHeight, outputWidth, + // inputChannels, filterHeight, filterWidth], and it is easy to + // reshape into [seqLength, stepSize], where seqLength is equal + // output_height * output_width, stepSize is equal + // input_channels * filter_height * filter_width + im2col(imageData, + imShape, + seqData, + colShape, + strideH(), + strideW(), + paddingH(), + paddingW()); + imageData += imShape.getElements(); + seqData += colShape.getElements(); + } + } +}; + +template +class BlockExpandBackward : public BlockExpandFunction { +public: + void init(const FuncConfig& config) override { + BlockExpandFunction::init(config); + } + + void check(const BufferArgs& inputs, const BufferArgs& outputs) override { + const TensorShape& image = outputs[0].shape(); + const TensorShape& sequence = inputs[0].shape(); + checkShape(image, sequence); + } + + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(numInputs_, inputs.size()); + CHECK_EQ(numOutputs_, outputs.size()); + check(inputs, outputs); + // Since the implementation of Col2ImFunctor is ADD_TO, + // this function only supports ADD_TO mode. 
+ CHECK_EQ(outputs[0].getArgType(), ADD_TO); + const TensorShape& image = outputs[0].shape(); + const TensorShape& sequence = inputs[0].shape(); + + TensorShape imShape = TensorShape({image[1], image[2], image[3]}); + TensorShape colShape = getColShape(image, sequence); + size_t batchSize = image[0]; + + real* imageData = outputs[0].data(); + real* seqData = inputs[0].data(); + Col2ImFunctor col2im; + for (size_t i = 0; i < batchSize; i++) { + col2im(imageData, + imShape, + seqData, + colShape, + strideH(), + strideW(), + paddingH(), + paddingW()); + imageData += imShape.getElements(); + seqData += colShape.getElements(); + } + } +}; + +REGISTER_TYPED_FUNC(BlockExpand, CPU, BlockExpandForward); +REGISTER_TYPED_FUNC(BlockExpandGrad, CPU, BlockExpandBackward); +#ifndef PADDLE_ONLY_CPU +REGISTER_TYPED_FUNC(BlockExpand, GPU, BlockExpandForward); +REGISTER_TYPED_FUNC(BlockExpandGrad, GPU, BlockExpandBackward); +#endif + +} // namespace paddle diff --git a/paddle/function/BlockExpandOpTest.cpp b/paddle/function/BlockExpandOpTest.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5e4897e72ba9fab2dd9e25d90313dc1b4d38e2d4 --- /dev/null +++ b/paddle/function/BlockExpandOpTest.cpp @@ -0,0 +1,107 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include "FunctionTest.h" + +namespace paddle { + +TEST(BlockExpandForward, real) { + for (size_t batchSize : {5, 32}) { + for (size_t channels : {1, 5, 32}) { + for (size_t inputHeight : {5, 33, 100}) { + for (size_t inputWidth : {5, 32, 96}) { + for (size_t block : {1, 3, 5}) { + for (size_t stride : {1, 2}) { + for (size_t padding : {0, 1}) { + // init Test object + std::vector strides = {stride, stride}; + std::vector paddings = {padding, padding}; + std::vector blocks = {block, block}; + CpuGpuFuncCompare test("BlockExpand", + FuncConfig() + .set("strides", strides) + .set("paddings", paddings) + .set("blocks", blocks)); + + size_t outputHeight = + 1 + + (inputHeight + 2 * padding - block + stride - 1) / stride; + size_t outputWidth = + 1 + + (inputWidth + 2 * padding - block + stride - 1) / stride; + TensorShape inputShape = + TensorShape({batchSize, channels, inputHeight, inputWidth}); + TensorShape outputShape = + TensorShape({batchSize, + outputHeight * outputWidth, + channels * block * block}); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, inputShape)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, outputShape)); + // run Function + test.run(); + } + } + } + } + } + } + } +} + +TEST(BlockExpandBackward, real) { + for (size_t batchSize : {5, 32}) { + for (size_t channels : {1, 5, 32}) { + for (size_t inputHeight : {5, 33, 100}) { + for (size_t inputWidth : {5, 32, 96}) { + for (size_t block : {1, 3, 5}) { + for (size_t stride : {1, 2}) { + for (size_t padding : {0, 1}) { + // init Test object + std::vector strides = {stride, stride}; + std::vector paddings = {padding, padding}; + std::vector blocks = {block, block}; + CpuGpuFuncCompare test("BlockExpandGrad", + FuncConfig() + .set("strides", strides) + .set("paddings", paddings) + .set("blocks", blocks)); + + size_t outputHeight = + 1 + + (inputHeight + 2 * padding - block + stride - 1) / stride; + size_t outputWidth = + 1 + + (inputWidth + 2 * padding - block + stride - 1) / stride; + TensorShape inputShape = + TensorShape({batchSize, channels, inputHeight, inputWidth}); + TensorShape outputShape = + TensorShape({batchSize, + outputHeight * outputWidth, + channels * block * block}); + test.addInputs(BufferArg(VALUE_TYPE_FLOAT, outputShape)); + test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, inputShape), + ADD_TO); + // run Function + test.run(); + } + } + } + } + } + } + } +} + +} // namespace paddle diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt index 2bec00cdb2d32d01a5a24e662bcca07f4154939c..93304f73037690b5cf3ac8189aabc28f51316a77 100644 --- a/paddle/function/CMakeLists.txt +++ b/paddle/function/CMakeLists.txt @@ -36,10 +36,12 @@ if(WITH_GPU) add_simple_unittest(MulOpTest) add_simple_unittest(CosSimOpTest) add_simple_unittest(RowConvOpTest) + add_simple_unittest(BlockExpandOpTest) add_simple_unittest(CropOpTest) endif() add_simple_unittest(ConvOpTest) +add_simple_unittest(Im2ColTest) endif() add_style_check_target(paddle_function ${h_files}) diff --git a/paddle/function/GemmConvOp.cpp b/paddle/function/GemmConvOp.cpp index 00880effc59cc80b2761fb6a4d9f3246439afd3f..9deb2739fcfff935a98a0b5b31b5d11819d81227 100644 --- a/paddle/function/GemmConvOp.cpp +++ b/paddle/function/GemmConvOp.cpp @@ -12,101 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "GemmConvOp.h" +#include "ConvOp.h" #include "GemmFunctor.h" +#include "Im2Col.h" #include "paddle/math/MemoryHandle.h" namespace paddle { -/* - * imData = [input_channels, input_height, input_width] - * colData = [input_channels, filter_height, filter_width, - * output_height, output_width] - */ -template -class Im2ColFunctor { -public: - void operator()(const T* imData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* colData) { - int channelsCol = inputChannels * filterHeight * filterWidth; - - for (int c = 0; c < channelsCol; ++c) { - int wOffset = c % filterWidth; - int hOffset = (c / filterWidth) % filterHeight; - int c_im = c / filterWidth / filterHeight; - for (int h = 0; h < outputHeight; ++h) { - for (int w = 0; w < outputWidth; ++w) { - int imRowIdx = h * strideHeight + hOffset; - int imColIdx = w * strideWidth + wOffset; - if ((imRowIdx - paddingHeight) < 0 || - (imRowIdx - paddingHeight) >= inputHeight || - (imColIdx - paddingWidth) < 0 || - (imColIdx - paddingWidth) >= inputWidth) { - colData[(c * outputHeight + h) * outputWidth + w] = T(0); - } else { - imRowIdx += c_im * inputHeight - paddingHeight; - imColIdx -= paddingWidth; - colData[(c * outputHeight + h) * outputWidth + w] = - imData[imRowIdx * inputWidth + imColIdx]; - } - } - } - } - } -}; - -template -class Col2ImFunctor { -public: - void operator()(const T* colData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* imData) { - int channelsCol = inputChannels * filterHeight * filterWidth; - - for (int c = 0; c < channelsCol; ++c) { - int wOffset = c % filterWidth; - int hOffset = (c / filterWidth) % filterHeight; - int c_im = c / filterWidth / filterHeight; - for (int h = 0; h < outputHeight; ++h) { - for (int w = 0; w < outputWidth; ++w) { - int imRowIdx = h * strideHeight + hOffset; - int imColIdx = w * strideWidth + wOffset; - if ((imRowIdx - paddingHeight) >= 0 && - (imRowIdx - paddingHeight) < inputHeight && - (imColIdx - paddingWidth) >= 0 && - (imColIdx - paddingWidth) < inputWidth) { - imRowIdx += c_im * inputHeight - paddingHeight; - imColIdx -= paddingWidth; - imData[imRowIdx * inputWidth + imColIdx] += - colData[(c * outputHeight + h) * outputWidth + w]; - } - } - } - } - } -}; - /* * \brief Forward calculation of convolution. 
*/ @@ -154,15 +66,20 @@ public: real* inputData = inputs[0].data(); real* filterData = inputs[1].data(); real* outputData = outputs[0].data(); - - size_t size = inputChannels / groups_ * filterHeight * filterWidth * - outputHeight * outputWidth; - resizeBuffer(size); + TensorShape imShape = + TensorShape({inputChannels / groups_, inputHeight, inputWidth}); + TensorShape colShape = TensorShape({inputChannels / groups_, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + + resizeBuffer(colShape.getElements()); real* colData = reinterpret_cast(memory_->getBuf()); - Im2ColFunctor im2col; + Im2ColFunctor im2col; GemmFunctor gemm; - size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth; + size_t inputOffset = imShape.getElements(); size_t outputOffset = (outputChannels / groups_) * outputHeight * outputWidth; size_t filterOffset = filter.getElements() / groups_; @@ -170,18 +87,13 @@ public: for (size_t i = 0; i < batchSize; i++) { for (size_t g = 0; g < groups_; g++) { im2col(inputData + g * inputOffset, - inputChannels / groups_, - inputHeight, - inputWidth, - filterHeight, - filterWidth, + imShape, + colData, + colShape, strideH(), strideW(), paddingH(), - paddingW(), - outputHeight, - outputWidth, - colData); + paddingW()); int M = outputChannels / groups_; int N = outputHeight * outputWidth; @@ -247,15 +159,20 @@ public: real* outputGrad = inputs[0].data(); real* filterData = inputs[1].data(); real* inputGrad = outputs[0].data(); - - size_t size = inputChannels / groups_ * filterHeight * filterWidth * - outputHeight * outputWidth; - resizeBuffer(size); + TensorShape imShape = + TensorShape({inputChannels / groups_, inputHeight, inputWidth}); + TensorShape colShape = TensorShape({inputChannels / groups_, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + + resizeBuffer(colShape.getElements()); real* colData = reinterpret_cast(memory_->getBuf()); - Col2ImFunctor col2im; + Col2ImFunctor col2im; GemmFunctor gemm; - size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth; + size_t inputOffset = imShape.getElements(); size_t outputOffset = (outputChannels / groups_) * outputHeight * outputWidth; size_t filterOffset = filter.getElements() / groups_; @@ -278,20 +195,14 @@ public: 0.0f, colData, N); - - col2im(colData, - inputChannels / groups_, - inputHeight, - inputWidth, - filterHeight, - filterWidth, + col2im(inputGrad + g * inputOffset, + imShape, + colData, + colShape, strideH(), strideW(), paddingH(), - paddingW(), - outputHeight, - outputWidth, - inputGrad + g * inputOffset); + paddingW()); } inputGrad += inputChannels * inputHeight * inputWidth; outputGrad += outputChannels * outputHeight * outputWidth; @@ -344,33 +255,33 @@ public: real* outputGrad = inputs[0].data(); real* inputData = inputs[1].data(); real* filterGrad = outputs[0].data(); - - size_t size = inputChannels / groups_ * filterHeight * filterWidth * - outputHeight * outputWidth; - resizeBuffer(size); + TensorShape imShape = + TensorShape({inputChannels / groups_, inputHeight, inputWidth}); + TensorShape colShape = TensorShape({inputChannels / groups_, + filterHeight, + filterWidth, + outputHeight, + outputWidth}); + + resizeBuffer(colShape.getElements()); real* colData = reinterpret_cast(memory_->getBuf()); - Im2ColFunctor im2col; + Im2ColFunctor im2col; GemmFunctor gemm; - size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth; + size_t inputOffset = imShape.getElements(); size_t outputOffset = (outputChannels / groups_) * outputHeight * 
outputWidth; size_t filterOffset = filter.getElements() / groups_; for (size_t i = 0; i < batchSize; i++) { for (size_t g = 0; g < groups_; g++) { im2col(inputData + g * inputOffset, - inputChannels / groups_, - inputHeight, - inputWidth, - filterHeight, - filterWidth, + imShape, + colData, + colShape, strideH(), strideW(), paddingH(), - paddingW(), - outputHeight, - outputWidth, - colData); + paddingW()); int M = outputChannels / groups_; int K = outputHeight * outputWidth; diff --git a/paddle/function/GemmConvOp.h b/paddle/function/GemmConvOp.h deleted file mode 100644 index 9f11cce597a07ce2a54f518be30b657c26ab7516..0000000000000000000000000000000000000000 --- a/paddle/function/GemmConvOp.h +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "ConvOp.h" - -namespace paddle { - -/* - * imData = [input_channels, input_height, input_width] - * colData = [input_channels, filter_height, filter_width, - * output_height, output_width] - */ -template -class Im2ColFunctor { -public: - void operator()(const T* imData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* colData); -}; - -template -class Col2ImFunctor { -public: - void operator()(const T* colData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* imData); -}; - -} // namespace paddle diff --git a/paddle/function/GemmConvOpGpu.cu b/paddle/function/GemmConvOpGpu.cu deleted file mode 100644 index 2a1795ff0fb5643ea436c94fe893fe866056fccb..0000000000000000000000000000000000000000 --- a/paddle/function/GemmConvOpGpu.cu +++ /dev/null @@ -1,186 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "ConvOp.h" -#include "GemmConvOp.h" - -namespace paddle { - -template -__global__ -void im2col(const T* data_im, int numOuts, int height, int width, - int blockH, int blockW, - int strideH, int strideW, - int paddingH, int paddingW, - int height_col, int width_col, - T* data_col) { - int index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if (index < numOuts) { - int w_out = index % width_col; - index /= width_col; - int h_out = index % height_col; - int channel_in = index / height_col; - int channel_out = channel_in * blockH * blockW; - int h_in = h_out * strideH; - int w_in = w_out * strideW; - - data_col += (channel_out * height_col + h_out) * width_col + w_out; - for (int i = 0; i < blockH; ++i) { - for (int j = 0; j < blockW; ++j) { - int rIdx = int(h_in+i); - int cIdx = int(w_in+j); - if ((rIdx-(int)paddingH) >= (int)height || - (rIdx-(int)paddingH) < 0 || - (cIdx-(int)paddingW) >= (int)width || - (cIdx-(int)paddingW) < 0) { - *data_col = 0; - } else { - rIdx = rIdx + channel_in*height - paddingH; - cIdx = cIdx - paddingW; - *data_col = data_im[rIdx* width + cIdx]; - } - data_col += height_col * width_col; - } - } - } -} - -template -class Im2ColFunctor { -public: - void operator()(const T* imData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* colData) { - int numKernels = inputChannels * outputHeight * outputWidth; - int blocks = (numKernels + 1024 -1) / 1024; - int blockX = 512; - int blockY = (blocks + 512 - 1) / 512; - dim3 threads(1024, 1); - dim3 grid(blockX, blockY); - im2col<<< grid, threads, 0, STREAM_DEFAULT >>> - (imData, numKernels, inputHeight, inputWidth, filterHeight, filterWidth, - strideHeight, strideWidth, paddingHeight, paddingWidth, - outputHeight, outputWidth, colData); - CHECK_SYNC("Im2ColFunctor GPU failed"); - } -}; - -template -__global__ -void col2im(size_t n, const T* data_col, size_t height, - size_t width, size_t channels, - size_t blockH, size_t blockW, - size_t strideH, size_t strideW, - size_t paddingH, size_t paddingW, - size_t height_col, size_t width_col, - T* data_im) { - size_t index = - (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x; - if (index < n) { - T val = 0; - int w = int(index % width); - int h = int((index / width) % height); - int c = int(index / (width * height)); - if ((w - (int)paddingW) >= 0 && - (w - (int)paddingW) < (width-2 * paddingW) && - (h - (int)paddingH) >= 0 && - (h - paddingH) < (height - 2 * paddingH)) { - // compute the start and end of the output - int w_col_start = - (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1; - int w_col_end = - min((int)(w / (int)strideW + 1), (int)(width_col)); - int h_col_start = - (h < (int)blockH) ? 
0 : (h - (int)blockH) / (int)strideH + 1; - int h_col_end = min(int(h / strideH + 1), int(height_col)); - for (int h_col = h_col_start; h_col < h_col_end; ++h_col) { - for (int w_col = w_col_start; w_col < w_col_end; ++w_col) { - // the col location: [c * width * height + h_out, w_out] - int c_col = int(c * blockH* blockW) + \ - (h - h_col * (int)strideH) * (int)blockW + - (w - w_col * (int)strideW); - val += data_col[(c_col * height_col + h_col) * width_col + w_col]; - } - } - h -= paddingH; - w -= paddingW; - data_im[c*((width-2*paddingW) * (height-2*paddingH)) + - h*(width-2*paddingW) + w] += val; - } - } -} - -template -class Col2ImFunctor { -public: - void operator()(const T* colData, - int inputChannels, - int inputHeight, - int inputWidth, - int filterHeight, - int filterWidth, - int strideHeight, - int strideWidth, - int paddingHeight, - int paddingWidth, - int outputHeight, - int outputWidth, - T* imData) { - size_t numKernels = inputChannels * (inputHeight + 2*paddingHeight) - * (inputWidth + 2*paddingWidth); - - size_t blocks = (numKernels + 1024 -1) / 1024; - size_t blockX = 512; - size_t blockY = (blocks+512-1)/512; - dim3 threads(1024, 1); - dim3 grid(blockX, blockY); - - // To avoid involving atomic operations, we will launch one kernel per - // bottom dimension, and then in the kernel add up the top dimensions. - col2im<<< grid, threads, 0, STREAM_DEFAULT >>> - (numKernels, - colData, - inputHeight + 2*paddingHeight, - inputWidth + 2*paddingWidth, - inputChannels, - filterHeight, - filterWidth, - strideHeight, - strideWidth, - paddingHeight, - paddingWidth, - outputHeight, - outputWidth, - imData); - CHECK_SYNC("Col2ImFunctor GPU failed"); - } -}; - -template class Im2ColFunctor; -template class Im2ColFunctor; -template class Col2ImFunctor; -template class Col2ImFunctor; - -} // namespace paddle diff --git a/paddle/function/Im2Col.h b/paddle/function/Im2Col.h new file mode 100644 index 0000000000000000000000000000000000000000..48e2e32f9256fb49c67ba25e9b5a47d72499758b --- /dev/null +++ b/paddle/function/Im2Col.h @@ -0,0 +1,96 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "TensorShape.h" +#include "TensorType.h" + +namespace paddle { + +/* The storage format of the coldata in the Im2ColFunctor and Col2ImFunctor. */ +enum ColFormat { kCFO = 0, kOCF = 1 }; + +/* + * \brief Converts the image data of three dimensions(CHW) into a colData of + * five dimensions in the Im2ColFunctor calculation, + * And in the Col2ImFunctor calculation, it is reversed. + * + * \param imData Image data. + * \param imShape The shape of imData, + * [inputChannels, inputHeight, inputWidth]. + * \param colData Column data. + * \param colShape The shape of colData. + * + * If the template argument Format is kCFO, the shape of colData is: + * [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth] + * So, it is easy to reshape into a convolution matrix for convolution + * calculation based on matrix multiplication. 
+ * The shape of convolution matrix is [height, width], where the height is equal + * inputChannels * filterHeight * filterWidth, and the width is equal + * outputHeight * outputWidth. + * + * Reshape: + * shape of colData shape of convolution matrix + * [inputChannels, + * filterHeight, + * filterWidth, ======> [height, width] + * outputHeight, + * outputWidth] + * + * If the template argument Format is kOCF, the shape of colData is: + * [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth] + * So, it is easy to reshape into a sequence matrix for rnn calculation. + * The shape of sequence matrix is [seqLength, stepSize], where the seqLength + * is equal outputHeight * outputWidth, and the stepSize is equal + * inputChannels * filterHeight * filterWidth. + * + * Reshape: + * shape of colData shape of sequence matrix + * [outputHeight, + * outputWidth, + * inputChannels, ======> [seqLength, stepSize] + * filterHeight, + * filterWidth] + * + * \note The caller needs to ensure that imShape.inputChannels is equal to + * colShape.inputChannels. + */ +template +class Im2ColFunctor { +public: + void operator()(const T* imData, + const TensorShape& imShape, + T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth); +}; + +template +class Col2ImFunctor { +public: + void operator()(T* imData, + const TensorShape& imShape, + const T* colData, + const TensorShape& colShape, + int strideHeight, + int strideWidth, + int paddingHeight, + int paddingWidth); +}; + +} // namespace paddle diff --git a/paddle/function/Im2ColOp.cpp b/paddle/function/Im2ColOp.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b7d1eb1eded7a7471fd5833a649916d3ee3e598e --- /dev/null +++ b/paddle/function/Im2ColOp.cpp @@ -0,0 +1,235 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
diff --git a/paddle/function/Im2ColOp.cpp b/paddle/function/Im2ColOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b7d1eb1eded7a7471fd5833a649916d3ee3e598e
--- /dev/null
+++ b/paddle/function/Im2ColOp.cpp
@@ -0,0 +1,235 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Im2Col.h"
+
+namespace paddle {
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ */
+template <class T>
+class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputHeight = colShape[3];
+    int outputWidth = colShape[4];
+    int channelsCol = inputChannels * filterHeight * filterWidth;
+
+    for (int c = 0; c < channelsCol; ++c) {
+      int wOffset = c % filterWidth;
+      int hOffset = (c / filterWidth) % filterHeight;
+      int c_im = c / filterWidth / filterHeight;
+      for (int h = 0; h < outputHeight; ++h) {
+        for (int w = 0; w < outputWidth; ++w) {
+          int imRowIdx = h * strideHeight + hOffset;
+          int imColIdx = w * strideWidth + wOffset;
+          if ((imRowIdx - paddingHeight) < 0 ||
+              (imRowIdx - paddingHeight) >= inputHeight ||
+              (imColIdx - paddingWidth) < 0 ||
+              (imColIdx - paddingWidth) >= inputWidth) {
+            colData[(c * outputHeight + h) * outputWidth + w] = T(0);
+          } else {
+            imRowIdx += c_im * inputHeight - paddingHeight;
+            imColIdx -= paddingWidth;
+            colData[(c * outputHeight + h) * outputWidth + w] =
+                imData[imRowIdx * inputWidth + imColIdx];
+          }
+        }
+      }
+    }
+  }
+};
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ */
+template <class T>
+class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputHeight = colShape[3];
+    int outputWidth = colShape[4];
+    int channelsCol = inputChannels * filterHeight * filterWidth;
+
+    for (int c = 0; c < channelsCol; ++c) {
+      int wOffset = c % filterWidth;
+      int hOffset = (c / filterWidth) % filterHeight;
+      int c_im = c / filterWidth / filterHeight;
+      for (int h = 0; h < outputHeight; ++h) {
+        for (int w = 0; w < outputWidth; ++w) {
+          int imRowIdx = h * strideHeight + hOffset;
+          int imColIdx = w * strideWidth + wOffset;
+          if ((imRowIdx - paddingHeight) >= 0 &&
+              (imRowIdx - paddingHeight) < inputHeight &&
+              (imColIdx - paddingWidth) >= 0 &&
+              (imColIdx - paddingWidth) < inputWidth) {
+            imRowIdx += c_im * inputHeight - paddingHeight;
+            imColIdx -= paddingWidth;
+            imData[imRowIdx * inputWidth + imColIdx] +=
+                colData[(c * outputHeight + h) * outputWidth + w];
+          }
+        }
+      }
+    }
+  }
+};
+
+template class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, float>;
+template class Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, double>;
+template class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, float>;
+template class Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, double>;
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
+template <class T>
+class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+    for (int outputH = 0; outputH < outputHeight; ++outputH) {
+      for (int outputW = 0; outputW < outputWidth; ++outputW) {
+        for (int channel = 0; channel < inputChannels; ++channel) {
+          for (int filterH = 0; filterH < filterHeight; ++filterH) {
+            for (int filterW = 0; filterW < filterWidth; ++filterW) {
+              int imRowOffset =
+                  outputH * strideHeight + filterH - paddingHeight;
+              int imColOffset = outputW * strideWidth + filterW - paddingWidth;
+              int colDataOffset =
+                  (((outputH * outputWidth + outputW) * inputChannels +
+                    channel) *
+                       filterHeight +
+                   filterH) *
+                      filterWidth +
+                  filterW;
+              if (imRowOffset < 0 || imRowOffset >= inputHeight ||
+                  imColOffset < 0 || imColOffset >= inputWidth) {
+                colData[colDataOffset] = T(0);
+              } else {
+                int imDataOffset =
+                    (channel * inputHeight + imRowOffset) * inputWidth +
+                    imColOffset;
+                colData[colDataOffset] = imData[imDataOffset];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
+template <class T>
+class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+    for (int outputH = 0; outputH < outputHeight; ++outputH) {
+      for (int outputW = 0; outputW < outputWidth; ++outputW) {
+        for (int channel = 0; channel < inputChannels; ++channel) {
+          for (int filterH = 0; filterH < filterHeight; ++filterH) {
+            for (int filterW = 0; filterW < filterWidth; ++filterW) {
+              int imRowOffset =
+                  outputH * strideHeight + filterH - paddingHeight;
+              int imColOffset = outputW * strideWidth + filterW - paddingWidth;
+              int colDataOffset =
+                  (((outputH * outputWidth + outputW) * inputChannels +
+                    channel) *
+                       filterHeight +
+                   filterH) *
+                      filterWidth +
+                  filterW;
+              if (imRowOffset >= 0 && imRowOffset < inputHeight &&
+                  imColOffset >= 0 && imColOffset < inputWidth) {
+                int imDataOffset =
+                    (channel * inputHeight + imRowOffset) * inputWidth +
+                    imColOffset;
+                imData[imDataOffset] += colData[colDataOffset];
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+template class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, float>;
+template class Im2ColFunctor<kOCF, DEVICE_TYPE_CPU, double>;
+template class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, float>;
+template class Col2ImFunctor<kOCF, DEVICE_TYPE_CPU, double>;
+
+}  // namespace paddle
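One property of the CPU pair above that is easy to verify and worth keeping in mind for review: Col2ImFunctor accumulates (`+=`) rather than assigns, so an im2col/col2im round trip multiplies each pixel by the number of filter windows covering it. A standalone check (a sketch under the same assumptions as the earlier example; `roundTripCheck` is an invented name):

```cpp
#include <cassert>
#include <vector>
#include "Im2Col.h"

void roundTripCheck() {
  using namespace paddle;
  const size_t C = 1, H = 5, W = 5, kH = 3, kW = 3;
  const size_t outH = H - kH + 1, outW = W - kW + 1;  // stride 1, no padding

  TensorShape imShape({C, H, W});
  TensorShape colShape({C, kH, kW, outH, outW});
  std::vector<float> im(imShape.getElements(), 1.0f);
  std::vector<float> col(colShape.getElements(), 0.0f);
  std::vector<float> back(imShape.getElements(), 0.0f);

  Im2ColFunctor<kCFO, DEVICE_TYPE_CPU, float> im2col;
  Col2ImFunctor<kCFO, DEVICE_TYPE_CPU, float> col2im;
  im2col(im.data(), imShape, col.data(), colShape, 1, 1, 0, 0);
  col2im(back.data(), imShape, col.data(), colShape, 1, 1, 0, 0);

  // The center pixel lies in all kH * kW = 9 windows, so it comes back as 9.
  assert(back[2 * W + 2] == 9.0f);
}
```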
diff --git a/paddle/function/Im2ColOpGpu.cu b/paddle/function/Im2ColOpGpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..15ba854009636d027447d104071163100d5e3f4b
--- /dev/null
+++ b/paddle/function/Im2ColOpGpu.cu
@@ -0,0 +1,381 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Im2Col.h"
+#include "hl_device_functions.cuh"
+
+namespace paddle {
+
+template <class T>
+__global__ void im2col(const T* data_im, int numOuts, int height, int width,
+                       int blockH, int blockW,
+                       int strideH, int strideW,
+                       int paddingH, int paddingW,
+                       int height_col, int width_col,
+                       T* data_col) {
+  int index = (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < numOuts) {
+    int w_out = index % width_col;
+    index /= width_col;
+    int h_out = index % height_col;
+    int channel_in = index / height_col;
+    int channel_out = channel_in * blockH * blockW;
+    int h_in = h_out * strideH;
+    int w_in = w_out * strideW;
+
+    data_col += (channel_out * height_col + h_out) * width_col + w_out;
+    for (int i = 0; i < blockH; ++i) {
+      for (int j = 0; j < blockW; ++j) {
+        int rIdx = int(h_in + i);
+        int cIdx = int(w_in + j);
+        if ((rIdx - (int)paddingH) >= (int)height ||
+            (rIdx - (int)paddingH) < 0 ||
+            (cIdx - (int)paddingW) >= (int)width ||
+            (cIdx - (int)paddingW) < 0) {
+          *data_col = 0;
+        } else {
+          rIdx = rIdx + channel_in * height - paddingH;
+          cIdx = cIdx - paddingW;
+          *data_col = data_im[rIdx * width + cIdx];
+        }
+        data_col += height_col * width_col;
+      }
+    }
+  }
+}
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ */
+template <class T>
+class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputHeight = colShape[3];
+    int outputWidth = colShape[4];
+
+    int numKernels = inputChannels * outputHeight * outputWidth;
+    int blocks = (numKernels + 1024 - 1) / 1024;
+    int blockX = 512;
+    int blockY = (blocks + 512 - 1) / 512;
+    dim3 threads(1024, 1);
+    dim3 grid(blockX, blockY);
+    im2col<<< grid, threads, 0, STREAM_DEFAULT >>>
+        (imData, numKernels, inputHeight, inputWidth,
+         filterHeight, filterWidth, strideHeight, strideWidth,
+         paddingHeight, paddingWidth, outputHeight, outputWidth, colData);
+    CHECK_SYNC("Im2ColFunctor GPU failed");
+  }
+};
+
+template <class T>
+__global__ void col2im(size_t n, const T* data_col, size_t height,
+                       size_t width, size_t channels,
+                       size_t blockH, size_t blockW,
+                       size_t strideH, size_t strideW,
+                       size_t paddingH, size_t paddingW,
+                       size_t height_col, size_t width_col,
+                       T* data_im) {
+  size_t index =
+      (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < n) {
+    T val = 0;
+    int w = int(index % width);
+    int h = int((index / width) % height);
+    int c = int(index / (width * height));
+    if ((w - (int)paddingW) >= 0 &&
+        (w - (int)paddingW) < int(width - 2 * paddingW) &&
+        (h - (int)paddingH) >= 0 &&
+        (h - (int)paddingH) < int(height - 2 * paddingH)) {
+      // compute the start and end of the output
+      int w_col_start =
+          (w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1;
+      int w_col_end = min((int)(w / (int)strideW + 1), (int)(width_col));
+      int h_col_start =
+          (h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1;
+      int h_col_end = min(int(h / strideH + 1), int(height_col));
+      for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
+        for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
+          // the col location: [c * width * height + h_out, w_out]
+          int c_col = int(c * blockH * blockW) +
+                      (h - h_col * (int)strideH) * (int)blockW +
+                      (w - w_col * (int)strideW);
+          val += data_col[(c_col * height_col + h_col) * width_col + w_col];
+        }
+      }
+      h -= paddingH;
+      w -= paddingW;
+      data_im[c * ((width - 2 * paddingW) * (height - 2 * paddingH)) +
+              h * (width - 2 * paddingW) + w] += val;
+    }
+  }
+}
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [inputChannels, filterHeight, filterWidth, outputHeight, outputWidth]
+ */
+template <class T>
+class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[1];
+    int filterWidth = colShape[2];
+    int outputHeight = colShape[3];
+    int outputWidth = colShape[4];
+
+    size_t numKernels = inputChannels * (inputHeight + 2 * paddingHeight) *
+                        (inputWidth + 2 * paddingWidth);
+
+    size_t blocks = (numKernels + 1024 - 1) / 1024;
+    size_t blockX = 512;
+    size_t blockY = (blocks + 512 - 1) / 512;
+    dim3 threads(1024, 1);
+    dim3 grid(blockX, blockY);
+
+    // To avoid atomic operations, launch one thread per (padded) input
+    // element and let it accumulate the column entries that reference it.
+    col2im<<< grid, threads, 0, STREAM_DEFAULT >>>
+        (numKernels,
+         colData,
+         inputHeight + 2 * paddingHeight,
+         inputWidth + 2 * paddingWidth,
+         inputChannels,
+         filterHeight,
+         filterWidth,
+         strideHeight,
+         strideWidth,
+         paddingHeight,
+         paddingWidth,
+         outputHeight,
+         outputWidth,
+         imData);
+    CHECK_SYNC("Col2ImFunctor GPU failed");
+  }
+};
+
+template class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, float>;
+template class Im2ColFunctor<kCFO, DEVICE_TYPE_GPU, double>;
+template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, float>;
+template class Col2ImFunctor<kCFO, DEVICE_TYPE_GPU, double>;
+
+template <class T>
+__global__ void im2colOCF(const T* imData, T* colData,
+                          int inputChannels,
+                          int inputHeight, int inputWidth,
+                          int filterHeight, int filterWidth,
+                          int strideHeight, int strideWidth,
+                          int paddingHeight, int paddingWidth,
+                          int outputHeight, int outputWidth) {
+  int swId = blockIdx.x;
+  int shId = blockIdx.y;
+  for (int channelId = threadIdx.z;
+       channelId < inputChannels;
+       channelId += blockDim.z) {
+    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
+      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
+        int widthOffset = idx + swId * strideWidth - paddingWidth;
+        int heightOffset = idy + shId * strideHeight - paddingHeight;
+        int imOffset = widthOffset + heightOffset * inputWidth +
+                       channelId * inputHeight * inputWidth;
+
+        int colOffset = idx + idy * filterWidth +
+                        channelId * filterHeight * filterWidth +
+                        (shId * outputWidth + swId) *
+                            (inputChannels * filterHeight * filterWidth);
+
+        if (heightOffset >= inputHeight || heightOffset < 0 ||
+            widthOffset >= inputWidth || widthOffset < 0) {
+          colData[colOffset] = T(0);
+        } else {
+          colData[colOffset] = imData[imOffset];
+        }
+      }
+    }
+  }
+}
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
+template <class T>
+class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(const T* imData,
+                  const TensorShape& imShape,
+                  T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+
+    int blockDimX = 0;
+    int blockDimY = 0;
+    if (filterHeight <= 4 && filterWidth <= 4) {
+      blockDimX = 4;
+      blockDimY = 4;
+    } else if (filterHeight <= 8 && filterWidth <= 8) {
+      blockDimX = 8;
+      blockDimY = 8;
+    } else if (filterHeight <= 16 && filterWidth <= 16) {
+      blockDimX = 16;
+      blockDimY = 16;
+    } else {
+      blockDimX = 32;
+      blockDimY = 32;
+    }
+
+    int blockDimZ = 1024 / blockDimX / blockDimY;
+    dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
+    dim3 grid(outputWidth, outputHeight);
+    im2colOCF<<< grid, threads, 0, STREAM_DEFAULT >>>
+        (imData, colData, inputChannels, inputHeight, inputWidth,
+         filterHeight, filterWidth, strideHeight, strideWidth,
+         paddingHeight, paddingWidth, outputHeight, outputWidth);
+    CHECK_SYNC("Im2ColFunctor GPU failed");
+  }
+};
+
+template <class T>
+__global__ void col2imOCF(T* imData, const T* colData,
+                          int inputChannels,
+                          int inputHeight, int inputWidth,
+                          int filterHeight, int filterWidth,
+                          int strideHeight, int strideWidth,
+                          int paddingHeight, int paddingWidth,
+                          int outputHeight, int outputWidth) {
+  int swId = blockIdx.x;
+  int shId = blockIdx.y;
+  for (int channelId = threadIdx.z;
+       channelId < inputChannels;
+       channelId += blockDim.z) {
+    for (int idy = threadIdx.y; idy < filterHeight; idy += blockDim.y) {
+      for (int idx = threadIdx.x; idx < filterWidth; idx += blockDim.x) {
+        int widthOffset = idx + swId * strideWidth - paddingWidth;
+        int heightOffset = idy + shId * strideHeight - paddingHeight;
+        int imOffset = widthOffset + heightOffset * inputWidth +
+                       channelId * inputHeight * inputWidth;
+
+        int colOffset = idx + idy * filterWidth +
+                        channelId * filterHeight * filterWidth +
+                        (shId * outputWidth + swId) *
+                            (inputChannels * filterHeight * filterWidth);
+
+        if (heightOffset >= 0 && heightOffset < inputHeight &&
+            widthOffset >= 0 && widthOffset < inputWidth) {
+          paddle::paddleAtomicAdd(imData + imOffset, colData[colOffset]);
+        }
+      }
+    }
+  }
+}
+
+/*
+ * imShape = [inputChannels, inputHeight, inputWidth]
+ * colShape =
+ *   [outputHeight, outputWidth, inputChannels, filterHeight, filterWidth]
+ */
+template <class T>
+class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(T* imData,
+                  const TensorShape& imShape,
+                  const T* colData,
+                  const TensorShape& colShape,
+                  int strideHeight,
+                  int strideWidth,
+                  int paddingHeight,
+                  int paddingWidth) {
+    int inputChannels = imShape[0];
+    int inputHeight = imShape[1];
+    int inputWidth = imShape[2];
+    int filterHeight = colShape[3];
+    int filterWidth = colShape[4];
+    int outputHeight = colShape[0];
+    int outputWidth = colShape[1];
+
+    int blockDimX = 0;
+    int blockDimY = 0;
+    if (filterHeight <= 4 && filterWidth <= 4) {
+      blockDimX = 4;
+      blockDimY = 4;
+    } else if (filterHeight <= 8 && filterWidth <= 8) {
+      blockDimX = 8;
+      blockDimY = 8;
+    } else if (filterHeight <= 16 && filterWidth <= 16) {
+      blockDimX = 16;
+      blockDimY = 16;
+    } else {
+      blockDimX = 32;
+      blockDimY = 32;
+    }
+
+    int blockDimZ = 1024 / blockDimX / blockDimY;
+    dim3 threads(blockDimX, blockDimY, std::min(blockDimZ, inputChannels));
+    dim3 grid(outputWidth, outputHeight);
+    col2imOCF<<< grid, threads, 0, STREAM_DEFAULT >>>
+        (imData, colData, inputChannels, inputHeight, inputWidth,
+         filterHeight, filterWidth, strideHeight, strideWidth,
+         paddingHeight, paddingWidth, outputHeight, outputWidth);
+    CHECK_SYNC("Col2ImFunctor GPU failed");
+  }
+};
+
+template class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, float>;
+template class Im2ColFunctor<kOCF, DEVICE_TYPE_GPU, double>;
+template class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, float>;
+template class Col2ImFunctor<kOCF, DEVICE_TYPE_GPU, double>;
+
+}  // namespace paddle
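Two GPU details above deserve a note. First, the kCFO col2im kernel avoids atomics by assigning one thread per (padded) input element and summing the column entries that reference it, while the OCF kernels let threads race on the image and rely on paddle::paddleAtomicAdd instead. Second, both OCF functors pick their thread-block shape from a small ladder; a condensed restatement of that heuristic follows (a sketch only; `ocfBlockShape` is an invented helper, not part of the patch):

```cpp
#include <algorithm>

// Thread-block shape used by the OCF kernels: the smallest square tile from
// {4, 8, 16, 32} that covers the filter, with the remaining threads (out of
// 1024) spread across input channels in the z dimension.
inline void ocfBlockShape(int filterHeight, int filterWidth, int inputChannels,
                          int* x, int* y, int* z) {
  int side = std::max(filterHeight, filterWidth);
  int dim = 4;
  while (dim < side && dim < 32) dim *= 2;  // 4 -> 8 -> 16 -> 32
  *x = dim;
  *y = dim;
  *z = std::min(1024 / (dim * dim), inputChannels);
}
```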
diff --git a/paddle/function/Im2ColTest.cpp b/paddle/function/Im2ColTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..acc88a553abe7ac58b629aba9b850df58cee7f81
--- /dev/null
+++ b/paddle/function/Im2ColTest.cpp
@@ -0,0 +1,125 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Im2Col.h"
+#include <gtest/gtest.h>
+#include "Function.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/tests/TensorCheck.h"
+
+namespace paddle {
+
+template <DeviceType Device, class T>
+void TestIm2ColFunctor() {
+  for (size_t channels : {1, 5, 32}) {
+    for (size_t inputHeight : {5, 33, 100}) {
+      for (size_t inputWidth : {5, 32, 96}) {
+        for (size_t filterHeight : {1, 5}) {
+          for (size_t filterWidth : {3, 7}) {
+            for (size_t stride : {1, 2}) {
+              for (size_t padding : {0, 1}) {
+                if (inputHeight <= filterHeight || inputWidth <= filterWidth)
+                  break;
+                if (padding >= filterHeight || padding >= filterWidth) break;
+                size_t outputHeight =
+                    (inputHeight - filterHeight + 2 * padding + stride) /
+                    stride;
+                size_t outputWidth =
+                    (inputWidth - filterWidth + 2 * padding + stride) / stride;
+
+                TensorShape imShape =
+                    TensorShape({channels, inputHeight, inputWidth});
+                TensorShape colShape1 = TensorShape({channels,
+                                                     filterHeight,
+                                                     filterWidth,
+                                                     outputHeight,
+                                                     outputWidth});
+                TensorShape colShape2 = TensorShape({outputHeight,
+                                                     outputWidth,
+                                                     channels,
+                                                     filterHeight,
+                                                     filterWidth});
+
+                size_t height = channels * filterHeight * filterWidth;
+                size_t width = outputHeight * outputWidth;
+                VectorPtr input1 = Vector::create(imShape.getElements(), false);
+                VectorPtr input2 = Vector::create(imShape.getElements(), false);
+                MatrixPtr output1 = Matrix::create(height, width, false, false);
+                MatrixPtr output2 = Matrix::create(width, height, false, false);
+                input1->uniform(0.001, 1);
+                input2->copyFrom(*input1);
+
+                Im2ColFunctor<kCFO, Device, T> im2Col1;
+                Im2ColFunctor<kOCF, Device, T> im2Col2;
+                im2Col1(input1->getData(),
+                        imShape,
+                        output1->getData(),
+                        colShape1,
+                        stride,
+                        stride,
+                        padding,
+                        padding);
+                im2Col2(input2->getData(),
+                        imShape,
+                        output2->getData(),
+                        colShape2,
+                        stride,
+                        stride,
+                        padding,
+                        padding);
+
+                // The transposition of the result of ColFormat == kCFO
+                // is equal to the result of ColFormat == kOCF.
+                MatrixPtr test;
+                output2->transpose(test, true);
+                autotest::TensorCheckErr(*output1, *test);
+
+                Col2ImFunctor<kCFO, Device, T> col2Im1;
+                Col2ImFunctor<kOCF, Device, T> col2Im2;
+                col2Im1(input1->getData(),
+                        imShape,
+                        output1->getData(),
+                        colShape1,
+                        stride,
+                        stride,
+                        padding,
+                        padding);
+                col2Im2(input2->getData(),
+                        imShape,
+                        output2->getData(),
+                        colShape2,
+                        stride,
+                        stride,
+                        padding,
+                        padding);
+
+                autotest::TensorCheckErr(*input1, *input2);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+TEST(Im2ColFunctor, CPU) { TestIm2ColFunctor<DEVICE_TYPE_CPU, float>(); }
+
+#ifndef PADDLE_ONLY_CPU
+
+TEST(Im2ColFunctor, GPU) { TestIm2ColFunctor<DEVICE_TYPE_GPU, float>(); }
+
+#endif
+
+}  // namespace paddle
diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index a40530f41313be27dc1c2606501c6c00bed11c8b..81cc3c890b6d4ad048e4edc03208c85778244078 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -207,8 +207,8 @@ Error __must_check backward(Argument& act) {
     argument_.value->setData(act.value->getData() + offset, 1UL, size);
     argument_.grad->setData(act.grad->getData() + offset, 1UL, size);
 
-    Error status = softmax_.backward(argument_);
-    if (!status) return status;
+    Error err = softmax_.backward(argument_);
+    if (!err.isOK()) return err;
   }
   return Error();
 }
diff --git a/paddle/gserver/layers/BlockExpandLayer.cpp b/paddle/gserver/layers/BlockExpandLayer.cpp
index 2bafeb92158c56efe32f90742807f0af07bda5af..3b1f34635917290c6e4a9230b546892cc5cb7bfa 100644
--- a/paddle/gserver/layers/BlockExpandLayer.cpp
+++ b/paddle/gserver/layers/BlockExpandLayer.cpp
@@ -37,6 +37,22 @@ bool BlockExpandLayer::init(const LayerMap& layerMap,
   imgSizeH_ = blockConf.img_size_y();
   imgSizeW_ = blockConf.img_size_x();
 
+  std::vector<size_t> strides = {(size_t)strideH_, (size_t)strideW_};
+  std::vector<size_t> paddings = {(size_t)paddingH_, (size_t)paddingW_};
+  std::vector<size_t> blocks = {(size_t)blockH_, (size_t)blockW_};
+  createFunction(forward_,
+                 "BlockExpand",
+                 FuncConfig()
+                     .set("strides", strides)
+                     .set("paddings", paddings)
+                     .set("blocks", blocks));
+  createFunction(backward_,
+                 "BlockExpandGrad",
+                 FuncConfig()
+                     .set("strides", strides)
+                     .set("paddings", paddings)
+                     .set("blocks", blocks));
+
   return true;
 }
 
@@ -63,48 +79,27 @@ void BlockExpandLayer::forward(PassType passType) {
   Layer::forward(passType);
 
   size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight();
-
   size_t blockNum = getBlockNum();
   size_t blockSize = blockH_ * blockW_ * channels_;
   resetOutput(blockNum * batchSize, blockSize);
-  Argument& out = getOutput();
-  MatrixPtr outV = getOutputValue();
 
-  MatrixPtr input = getPrev(0)->getOutputValue();
-  Matrix::resizeOrCreate(outVTrans_, blockSize, blockNum, false, useGpu_);
+  // calculate output_.value
+  inputShape_ = TensorShape({batchSize, channels_, imgSizeH_, imgSizeW_});
+  outputShape_ = TensorShape({batchSize, blockNum, blockSize});
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), inputShape_);
+  outputs.addArg(*getOutputValue(), outputShape_, ASSIGN_TO);
+  forward_[0]->calc(inputs, outputs);
+
+  // calculate output_.sequenceStartPositions and output_.cpuSequenceDims
+  Argument& out = getOutput();
   ICpuGpuVector::resizeOrCreate(
       out.sequenceStartPositions, batchSize + 1, false);
   IVector::resizeOrCreate(out.cpuSequenceDims, 2 * batchSize, false);
   int* start = out.sequenceStartPositions->getMutableData(false);
   int* dims = out.cpuSequenceDims->getData();
   for (size_t i = 0; i < batchSize; i++) {
-    outVTrans_->zeroMem();
-    /* expand each 
block as one row */ - MatrixPtr inputTmp = - Matrix::create(input->getData() + i * input->getWidth(), - 1, - input->getWidth(), - false, - useGpu_); - outVTrans_->convExpand(*inputTmp, - imgSizeH_, - imgSizeW_, - channels_, - blockH_, - blockW_, - strideH_, - strideW_, - paddingH_, - paddingW_, - outputH_, - outputW_); - MatrixPtr outVTmp = - Matrix::create(outV->getData() + i * blockNum * blockSize, - blockNum, - blockSize, - false, - useGpu_); - outVTrans_->transpose(outVTmp, false); start[i] = i * blockNum; dims[2 * i] = outputH_; dims[2 * i + 1] = outputW_; @@ -113,48 +108,13 @@ void BlockExpandLayer::forward(PassType passType) { } void BlockExpandLayer::backward(const UpdateCallback& callback) { - size_t blockNum = outputH_ * outputW_; - size_t blockSize = blockH_ * blockW_ * channels_; /* Calculate the input layers error */ - MatrixPtr preGrad = inputLayers_[0]->getOutputGrad(); - if (!preGrad) { - return; - } - MatrixPtr grad = getOutputGrad(); - MatrixPtr gradTrans = Matrix::create(blockSize, blockNum, false, useGpu_); - size_t batchSize = preGrad->getHeight(); - - CHECK_EQ(batchSize * blockNum, grad->getHeight()); - CHECK_EQ(blockSize, grad->getWidth()); - - for (size_t i = 0; i < batchSize; i++) { - MatrixPtr gradTmp = - Matrix::create(grad->getData() + i * blockNum * blockSize, - blockNum, - blockSize, - false, - useGpu_); - gradTmp->transpose(gradTrans, false); - MatrixPtr preGradTmp = - Matrix::create(preGrad->getData() + i * preGrad->getWidth(), - 1, - preGrad->getWidth(), - false, - useGpu_); - preGradTmp->convShrink(*gradTrans, - imgSizeH_, - imgSizeW_, - channels_, - blockH_, - blockW_, - strideH_, - strideW_, - paddingH_, - paddingW_, - outputH_, - outputW_, - 1.0, - 1.0); + if (getInputGrad(0)) { + BufferArgs inputs; + BufferArgs outputs; + inputs.addArg(*getOutputGrad(), outputShape_); + outputs.addArg(*getInputGrad(0), inputShape_, ADD_TO); + backward_[0]->calc(inputs, outputs); } } diff --git a/paddle/gserver/layers/BlockExpandLayer.h b/paddle/gserver/layers/BlockExpandLayer.h index 8f347400e60ec84fc1b5fdbc1c911a8768b306d0..15ce73ab8b2ca16ba1e9329ed5c00dc7239e8b93 100644 --- a/paddle/gserver/layers/BlockExpandLayer.h +++ b/paddle/gserver/layers/BlockExpandLayer.h @@ -50,8 +50,8 @@ protected: size_t blockH_, blockW_, strideH_, strideW_, paddingH_, paddingW_; size_t imgSizeH_, imgSizeW_, outputH_, outputW_, channels_; - /// auxiliary variable, which saves the transposed output value. 
- MatrixPtr outVTrans_; + TensorShape inputShape_; + TensorShape outputShape_; public: explicit BlockExpandLayer(const LayerConfig& config) : Layer(config) {} diff --git a/paddle/gserver/layers/ConvBaseProjection.cpp b/paddle/gserver/layers/ConvBaseProjection.cpp index d1e932ded595c90cbe6040c330c5c8663d81e2b4..eb6b0445c95a9e9a7acd5d693ecdb11a263f41fd 100644 --- a/paddle/gserver/layers/ConvBaseProjection.cpp +++ b/paddle/gserver/layers/ConvBaseProjection.cpp @@ -87,9 +87,6 @@ void ConvBaseProjection::initCudnn() { bwdDataLimitBytes_ = 0; bwdFilterLimitBytes_ = 0; workSpaceInBytes_ = 0; - - batchNum_ = 0; - isSelectAlgo_ = false; } void ConvBaseProjection::reshapeTensorDesc(int batchSize) { @@ -142,32 +139,25 @@ void ConvBaseProjection::reshape(int batchSize) { CHECK_EQ(width, out_->value->getWidth()); CHECK_EQ(calInputSize(), in_->value->getWidth()); - isSelectAlgo_ = (batchSize == batchNum_); - batchNum_ = batchSize; - - if (!isSelectAlgo_) { - reshapeTensorDesc(batchSize); - hl_conv_workspace(imageDesc_, - outputDesc_, - filterDesc_, - convDesc_, - &fwdAlgo_, - &fwdLimitBytes_, - &bwdDataAlgo_, - &bwdDataLimitBytes_, - &bwdFilterAlgo_, - &bwdFilterLimitBytes_); - - size_t maxWorkSpace = 0; - maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); - maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_); - workSpaceInBytes_ = maxWorkSpace; - - VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_ - << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_; - } - - isSelectAlgo_ = true; + reshapeTensorDesc(batchSize); + hl_conv_workspace(imageDesc_, + outputDesc_, + filterDesc_, + convDesc_, + &fwdAlgo_, + &fwdLimitBytes_, + &bwdDataAlgo_, + &bwdDataLimitBytes_, + &bwdFilterAlgo_, + &bwdFilterLimitBytes_); + + size_t maxWorkSpace = 0; + maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); + maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_); + workSpaceInBytes_ = maxWorkSpace; + + VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_ + << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_; } void *ConvBaseProjection::getSpaceBytes(size_t size) { diff --git a/paddle/gserver/layers/ConvBaseProjection.h b/paddle/gserver/layers/ConvBaseProjection.h index 4a33aa1837dfc36dbead60deaccbc6b772fe4754..e9d9f8f1b2937b3a3b7323c43ef5608ffc5f82ca 100644 --- a/paddle/gserver/layers/ConvBaseProjection.h +++ b/paddle/gserver/layers/ConvBaseProjection.h @@ -101,12 +101,6 @@ protected: size_t bwdFilterLimitBytes_; /// Size of total work space. size_t workSpaceInBytes_; - - /// Whether to call cuDNN api to choose conv algorithm. - bool isSelectAlgo_; - /// batchNum is used to record batch size. If the batch size is changed, - /// the selection algorithm will be called. 
- int batchNum_; bool bias_; std::unique_ptr weight_; diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 4431d613f655c1d0c8da13bb5ac9225980c650ad..27f7d95b752d4a423bf99fa425b10b2816575d6a 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -1016,81 +1016,6 @@ void GpuMatrix::check(std::ostream& os, Matrix& refMat, bool printDiff) { LOG(INFO) << "the diffCnt is " << diffCnt; } -void GpuMatrix::convExpand(Matrix& feature, - int feaImgHeight, - int feaImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW) { - CHECK(feature.useGpu_ == true) << "Matrix type are not equal"; - - CHECK_EQ(size_t(feaImgHeight * feaImgWidth * channels), - feature.getHeight() * feature.getWidth()) - << "Matrix dimensions are not equal"; - - size_t elemCnt = outputH * outputW * blockH * blockW * channels; - CHECK_EQ(elemCnt, height_ * width_) << "Matrix dimensions are not equal"; - - hl_expand_feature2col(feature.getData(), - channels, - feaImgHeight, - feaImgWidth, - blockH, - blockW, - strideH, - strideW, - paddingH, - paddingW, - outputH, - outputW, - getData()); -} - -void GpuMatrix::convShrink(Matrix& expandFeat, - int thisImgHeight, - int thisImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW, - real alpha, - real beta) { - CHECK(expandFeat.useGpu_ == true) << "Matrix type are not equal"; - CHECK_EQ(size_t(thisImgHeight * thisImgWidth * channels), - getHeight() * getWidth()) - << "Matrix dimensions are not equal"; - - size_t elemCnt = outputH * outputW * blockW * blockH * channels; - CHECK(elemCnt == expandFeat.getHeight() * expandFeat.getWidth()) - << "Matrix dimensions are not equal"; - hl_shrink_col2feature(expandFeat.getData(), - channels, - thisImgHeight, - thisImgWidth, - blockH, - blockW, - strideH, - strideW, - paddingH, - paddingW, - outputH, - outputW, - getData(), - alpha, - beta); -} - void GpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW, @@ -1777,103 +1702,6 @@ void CpuMatrix::inverse(MatrixPtr& matInv, bool memAlloc) { CHECK_EQ(info, 0); } -void CpuMatrix::convExpand(Matrix& feature, - int feaImgHeight, - int feaImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW) { - CHECK(feature.useGpu_ == false) << "Matrix type are not equal"; - - CHECK_EQ(size_t(feaImgHeight * feaImgWidth * channels), - feature.getHeight() * feature.getWidth()) - << "Matrix dimensions are not equal"; - - size_t elemCnt = outputH * outputW * blockH * blockW * channels; - CHECK_EQ(elemCnt, height_ * width_) << "Matrix dimensions are not equal"; - - int channelsCol = channels * blockH * blockW; - real* srcData = feature.getData(); - for (int c = 0; c < channelsCol; ++c) { - int wOffset = c % blockW; - int hOffset = (c / blockW) % blockH; - int c_im = c / blockH / blockW; - for (int h = 0; h < outputH; ++h) { - for (int w = 0; w < outputW; ++w) { - // no c_im*height to Exclude the channel number - int imgRowIdx = h * strideH + hOffset; - int imgColIdx = w * strideW + wOffset; - if ((imgRowIdx - paddingH) < 0 || - (imgRowIdx - paddingH) >= feaImgHeight || - (imgColIdx - paddingW) < 0 || - (imgColIdx - paddingW) >= feaImgWidth) { - data_[(c * outputH + h) * outputW + w] = 0; - } else { - imgRowIdx += c_im * feaImgHeight - paddingH; - imgColIdx -= paddingW; - data_[(c * outputH + h) * outputW + w] = - 
srcData[imgRowIdx * feaImgWidth + imgColIdx]; - } - } - } - } -} - -void CpuMatrix::convShrink(Matrix& expandFeat, - int thisImgHeight, - int thisImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW, - real alpha, - real beta) { - CHECK(expandFeat.useGpu_ == false) << "Matrix type are not equal"; - CHECK_EQ(size_t(thisImgHeight * thisImgWidth * channels), - getHeight() * getWidth()) - << "Matrix dimensions are not equal"; - - size_t elemCnt = outputH * outputW * blockH * blockW * channels; - - CHECK(elemCnt == expandFeat.getHeight() * expandFeat.getWidth()) - << "Matrix dimensions are not equal"; - - real* expandData = expandFeat.getData(); - int channelsCol = channels * blockH * blockW; - for (int c = 0; c < channelsCol; ++c) { - int wOffset = c % blockW; - int hOffset = (c / blockW) % blockH; - int c_im = c / blockW / blockH; - for (int h = 0; h < outputH; ++h) { - for (int w = 0; w < outputW; ++w) { - int imRowIdx = h * strideH + hOffset; - int imColIdx = w * strideW + wOffset; - if ((imRowIdx - paddingH) >= 0 && - (imRowIdx - paddingH) < thisImgHeight && - (imColIdx - paddingW) >= 0 && - (imColIdx - paddingW) < thisImgWidth) { - imRowIdx += c_im * thisImgHeight - paddingH; - imColIdx -= paddingW; - data_[imRowIdx * thisImgWidth + imColIdx] = - alpha * expandData[(c * outputH + h) * outputW + w] + - beta * data_[imRowIdx * thisImgWidth + imColIdx]; - } - } - } - } -} - void CpuMatrix::maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW, diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index 7dfd593225065e18830b2b0c0ce854fe7a2d5178..bb802bbb2c75289a45d987b22ad41ce8b1e95c98 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -859,49 +859,6 @@ public: LOG(FATAL) << "Not implemented"; } - /** - * This function is used to calculate the convolution: - * - * It will expand a feature matrix according to the - * convolution filters - */ - virtual void convExpand(Matrix& feature, - int feaImgHeight, - int feaImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW) { - LOG(FATAL) << "Not implemeted"; - } - - /** - * This function is the reverse implementation of convExpand: - * - * Its function is to restore a expanded-matrix into a feature matrix - */ - virtual void convShrink(Matrix& expandColMat, - int thisImgHeight, - int thisImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW, - real alpha = 1.0f, - real beta = 0.0f) { - LOG(FATAL) << "Not implemeted"; - } - /** * Pooling forward operation, pick out the largest element * in the sizeX of value @@ -1335,34 +1292,6 @@ public: void classificationError(Matrix& output, IVector& label, size_t topkSize = 1); - void convExpand(Matrix& feature, - int feaImgHeight, - int feaImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW); - - void convShrink(Matrix& expandColMat, - int thisImgHeight, - int thisImgWidth, - int channels, - int blockH, - int blochW, - int strideH, - int strideW, - int paddingH, - int paddingWreal, - int outputH, - int outputW, - real alpha = 1.0f, - real beta = 0.0f); - void maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW, @@ -1522,34 +1451,6 @@ public: MatrixPtr clone(size_t height, size_t width, bool useGpu = false); - 
void convExpand(Matrix& feature, - int feaImgHeight, - int feaImgWidth, - int channels, - int blcokH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW); - - void convShrink(Matrix& expandFeat, - int thisImgHeight, - int thisImgWidth, - int channels, - int blockH, - int blockW, - int strideH, - int strideW, - int paddingH, - int paddingW, - int outputH, - int outputW, - real alpha = 1.0f, - real beta = 0.0f); - void maxPoolForward(Matrix& inputMat, size_t imgSizeH, size_t imgSizeW, diff --git a/paddle/memory/detail/buddy_allocator.cc b/paddle/memory/detail/buddy_allocator.cc index 27c1b4033b53b059d38ed88694b20b429cbb4cce..bb44970109c05d239e6b92d90b2079b752fa0104 100644 --- a/paddle/memory/detail/buddy_allocator.cc +++ b/paddle/memory/detail/buddy_allocator.cc @@ -27,12 +27,11 @@ BuddyAllocator::BuddyAllocator(SystemAllocator* system_allocator, system_allocator_(std::move(system_allocator)) {} BuddyAllocator::~BuddyAllocator() { - DLOG(INFO) << "BuddyAllocator Disconstructor makes sure that all of these " - "have actually been freed"; + VLOG(3) << "BuddyAllocator Disconstructor makes sure that all of these " + "have actually been freed"; while (!pool_.empty()) { auto block = static_cast(std::get<2>(*pool_.begin())); - DLOG(INFO) << "Free from block (" << block << ", " << max_chunk_size_ - << ")"; + VLOG(3) << "Free from block (" << block << ", " << max_chunk_size_ << ")"; system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); cache_.invalidate(block); @@ -52,12 +51,11 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { // acquire the allocator lock std::lock_guard lock(mutex_); - DLOG(INFO) << "Allocate " << unaligned_size << " bytes from chunk size " - << size; + VLOG(3) << "Allocate " << unaligned_size << " bytes from chunk size " << size; // if the allocation is huge, send directly to the system allocator if (size > max_chunk_size_) { - DLOG(INFO) << "Allocate from system allocator."; + VLOG(3) << "Allocate from system allocator."; return SystemAlloc(size); } @@ -72,9 +70,9 @@ void* BuddyAllocator::Alloc(size_t unaligned_size) { return nullptr; } } else { - DLOG(INFO) << "Allocation from existing memory block " << std::get<2>(*it) - << " at address " - << reinterpret_cast(std::get<2>(*it))->data(); + VLOG(3) << "Allocation from existing memory block " << std::get<2>(*it) + << " at address " + << reinterpret_cast(std::get<2>(*it))->data(); } total_used_ += size; @@ -91,10 +89,10 @@ void BuddyAllocator::Free(void* p) { // Acquire the allocator lock std::lock_guard lock(mutex_); - DLOG(INFO) << "Free from address " << block; + VLOG(3) << "Free from address " << block; if (block->type(cache_) == MemoryBlock::HUGE_CHUNK) { - DLOG(INFO) << "Free directly from system allocator"; + VLOG(3) << "Free directly from system allocator"; system_allocator_->Free(block, block->total_size(cache_), block->index(cache_)); @@ -111,8 +109,8 @@ void BuddyAllocator::Free(void* p) { // Trying to merge the right buddy if (block->has_right_buddy(cache_)) { - DLOG(INFO) << "Merging this block " << block << " with its right buddy " - << block->right_buddy(cache_); + VLOG(3) << "Merging this block " << block << " with its right buddy " + << block->right_buddy(cache_); auto right_buddy = block->right_buddy(cache_); @@ -129,8 +127,8 @@ void BuddyAllocator::Free(void* p) { // Trying to merge the left buddy if (block->has_left_buddy(cache_)) { - DLOG(INFO) << "Merging this block " << block << " with its left buddy " - << block->left_buddy(cache_); 
+ VLOG(3) << "Merging this block " << block << " with its left buddy " + << block->left_buddy(cache_); auto left_buddy = block->left_buddy(cache_); @@ -146,8 +144,8 @@ void BuddyAllocator::Free(void* p) { } // Dumping this block into pool - DLOG(INFO) << "Inserting free block (" << block << ", " - << block->total_size(cache_) << ")"; + VLOG(3) << "Inserting free block (" << block << ", " + << block->total_size(cache_) << ")"; pool_.insert( IndexSizeAddress(block->index(cache_), block->total_size(cache_), block)); @@ -166,7 +164,7 @@ void* BuddyAllocator::SystemAlloc(size_t size) { size_t index = 0; void* p = system_allocator_->Alloc(index, size); - DLOG(INFO) << "Allocated " << p << " from system allocator."; + VLOG(3) << "Allocated " << p << " from system allocator."; if (p == nullptr) return nullptr; @@ -192,8 +190,8 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool() { if (p == nullptr) return pool_.end(); - DLOG(INFO) << "Creating and inserting new block " << p - << " from system allocator"; + VLOG(3) << "Creating and inserting new block " << p + << " from system allocator"; static_cast(p)->init(cache_, MemoryBlock::FREE_CHUNK, index, max_chunk_size_, nullptr, nullptr); @@ -237,19 +235,19 @@ void* BuddyAllocator::SplitToAlloc(BuddyAllocator::PoolSet::iterator it, auto block = static_cast(std::get<2>(*it)); pool_.erase(it); - DLOG(INFO) << "Split block (" << block << ", " << block->total_size(cache_) - << ") into"; + VLOG(3) << "Split block (" << block << ", " << block->total_size(cache_) + << ") into"; block->split(cache_, size); - DLOG(INFO) << "Left block (" << block << ", " << block->total_size(cache_) - << ")"; + VLOG(3) << "Left block (" << block << ", " << block->total_size(cache_) + << ")"; block->set_type(cache_, MemoryBlock::ARENA_CHUNK); // the rest of memory if exist if (block->has_right_buddy(cache_)) { if (block->right_buddy(cache_)->type(cache_) == MemoryBlock::FREE_CHUNK) { - DLOG(INFO) << "Insert right block (" << block->right_buddy(cache_) << ", " - << block->right_buddy(cache_)->total_size(cache_) << ")"; + VLOG(3) << "Insert right block (" << block->right_buddy(cache_) << ", " + << block->right_buddy(cache_)->total_size(cache_) << ")"; pool_.insert( IndexSizeAddress(block->right_buddy(cache_)->index(cache_), @@ -276,7 +274,7 @@ void BuddyAllocator::CleanIdleFallBackAlloc() { return; } - DLOG(INFO) << "Return block " << block << " to fallback allocator."; + VLOG(3) << "Return block " << block << " to fallback allocator."; system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); cache_.invalidate(block); @@ -312,7 +310,7 @@ void BuddyAllocator::CleanIdleNormalAlloc() { MemoryBlock* block = static_cast(std::get<2>(*pool)); - DLOG(INFO) << "Return block " << block << " to base allocator."; + VLOG(3) << "Return block " << block << " to base allocator."; system_allocator_->Free(block, max_chunk_size_, block->index(cache_)); cache_.invalidate(block); diff --git a/paddle/memory/memcpy.cc b/paddle/memory/memcpy.cc index 098931c887479ce6f1afc8b90e4003758d88c018..aaab1142ca18d3319469a4d685fde9d30929113f 100644 --- a/paddle/memory/memcpy.cc +++ b/paddle/memory/memcpy.cc @@ -35,7 +35,7 @@ void Copy(platform::CPUPlace dst_place, platform::GPUPlace src_place, const void* src, size_t num, cudaStream_t stream) { - platform::GPUPlaceGuard g(src_place.device); + platform::SetDeviceId(src_place.device); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); } @@ -45,7 +45,7 @@ void Copy(platform::GPUPlace dst_place, platform::CPUPlace 
src_place, const void* src, size_t num, cudaStream_t stream) { - platform::GPUPlaceGuard g(dst_place.device); + platform::SetDeviceId(dst_place.device); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); } @@ -56,7 +56,7 @@ void Copy(platform::GPUPlace dst_place, const void* src, size_t num, cudaStream_t stream) { if (dst_place == src_place) { - platform::GPUPlaceGuard g(src_place.device); + platform::SetDeviceId(src_place.device); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream); } else { platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num, diff --git a/paddle/memory/memcpy.h b/paddle/memory/memcpy.h index 99b1c2e1c3e5ae4facaeb4fd0b773a7531448f03..2b9c0eada6e8406fc81baec7f331a8dd5b8b0ec1 100644 --- a/paddle/memory/memcpy.h +++ b/paddle/memory/memcpy.h @@ -20,13 +20,39 @@ limitations under the License. */ namespace paddle { namespace memory { +/** + * \brief Copy memory from one place to another place. + * + * \param[in] DstPlace Destination allocation place (CPU). + * \param[in] dst Destination memory address. + * \param[in] SrcPlace Source allocation place (CPU). + * \param[in] src Source memory address. + * \param[in] num memory size in bytes to copy. + * + */ template void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num); #ifndef PADDLE_ONLY_CPU + +/** + * \brief Copy memory from one place to another place. + * + * \param[in] DstPlace Destination allocation place (CPU or GPU). + * \param[in] dst Destination memory address. + * \param[in] SrcPlace Source allocation place (CPU or GPU). + * \param[in] src Source memory address. + * \param[in] num memory size in bytes to copy. + * \param[in] stream CUDA stream. + * + * \note For GPU memory copy, CUDA stream need to be specified + * for asynchronously memory copy. + * + */ template void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num, cudaStream_t stream); + #endif // PADDLE_ONLY_CPU } // namespace memory diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index c2e046926fafd8f4cfc4cd81d8f32e3882ff02ec..207025f9b1c64f0f8943f9fae5edefc9328a1d26 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -60,6 +60,7 @@ detail::BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { platform::GpuMaxChunkSize()); } } + platform::SetDeviceId(gpu_id); return as[gpu_id]; } diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h index 5e0d64707299acb22aacff0fad237c135f614d9c..44f567caf9c19775f17988b5142b7693b41a126d 100644 --- a/paddle/memory/memory.h +++ b/paddle/memory/memory.h @@ -20,19 +20,53 @@ limitations under the License. */ namespace paddle { namespace memory { +/** + * \brief Allocate memory block in one place. + * + * \param[in] place Allocation place (CPU or GPU). + * \param[in] size Allocation size. + * + * \return Allocated memory block address. + * + * \note If return nullptr, it indicates memory allocation failed + * because insufficient memory in current system. When Alloc + * function is invoked, you must check the returned memory + * address is valid or not. + */ template -void* Alloc(Place, size_t); +void* Alloc(Place place, size_t size); +/** + * \brief Free memory block in one place. + * + * \param[in] place Allocation place (CPU or GPU). + * \param[in] ptr Memory block address to free. + * + */ template -void Free(Place, void*); +void Free(Place place, void* ptr); +/** + * \brief Total size of used memory in one place. + * + * \param[in] place Allocation place (CPU or GPU). 
+ *
+ */
 template <typename Place>
-size_t Used(Place);
+size_t Used(Place place);
 
-template <typename T,
-          typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
+/**
+ * \brief Free memory block in one place.
+ *
+ * \note In some cases, a custom deleter is used to
+ *       deallocate the memory automatically for
+ *       std::unique_ptr<T> in tensor.h.
+ *
+ */
+template <typename T, typename Place>
 class PODDeleter {
+  static_assert(std::is_pod<T>::value, "T must be POD");
+
  public:
   PODDeleter(Place place) : place_(place) {}
   void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index ded963deb2b21ab797ee661b6c42b9f3495c671e..4e34ba012038dda897244b3d491a5127ee10bf45 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -48,10 +48,15 @@ op_library(mul_op SRCS mul_op.cc mul_op.cu)
 op_library(rowwise_add_op SRCS rowwise_add_op.cu rowwise_add_op.cc)
 op_library(sigmoid_op SRCS sigmoid_op.cu sigmoid_op.cc)
 op_library(softmax_op SRCS softmax_op.cc softmax_op.cu)
-op_library(random_op SRCS random_op.cc random_op.cu)
+op_library(gaussian_random_op SRCS gaussian_random_op.cc gaussian_random_op.cu)
 op_library(cross_entropy_op SRCS cross_entropy_op.cc cross_entropy_op.cu)
 op_library(fc_op SRCS fc_op.cc DEPS mul_op rowwise_add_op sigmoid_op softmax_op net)
 
 op_library(sgd_op SRCS sgd_op.cc sgd_op.cu)
+
+op_library(recurrent_network_op SRCS recurrent_network_op.cc DEPS op_desc
+tensor op_registry operator net)
+cc_test(recurrent_network_op_test SRCS recurrent_network_op_test.cc DEPS
+recurrent_network_op gtest mul_op add_op)
diff --git a/paddle/operators/add_op.cc b/paddle/operators/add_op.cc
index 8d415fbd2e72af556e21f89c37d31b9fad130e3d..1424b0284372d8dfe9eb93ee251b121a48b19b0b 100644
--- a/paddle/operators/add_op.cc
+++ b/paddle/operators/add_op.cc
@@ -13,17 +13,14 @@ See the License for the specific language governing permissions and
 limitations under the License. 
*/ #include "paddle/operators/add_op.h" -#include "paddle/framework/op_registry.h" -#include "paddle/framework/tensor.h" namespace paddle { namespace operators { -class AddOp : public framework::OperatorWithKernel { +class AddOp : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override { + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override { PADDLE_ENFORCE(inputs.size() == 2, "Input size of AddOp must be two"); PADDLE_ENFORCE(outputs.size() == 1, "Output size of AddOp must be one"); PADDLE_ENFORCE( @@ -35,10 +32,10 @@ protected: } }; -class AddOpMaker : public framework::OpProtoAndCheckerMaker { +class AddOpMaker : public OpProtoAndCheckerMaker { public: - AddOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { + AddOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The first input of add op"); AddInput("Y", "The second input of add op"); AddOutput("Out", "The output of add op"); @@ -50,11 +47,10 @@ The equation is: Out = X + Y } }; -class AddOpGrad : public framework::OperatorWithKernel { +class AddOpGrad : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override {} + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override {} std::string DebugString() const override { LOG(INFO) << "AddOpGrad"; return ""; @@ -64,7 +60,6 @@ protected: } // namespace operators } // namespace paddle -REGISTER_OP(add_two, paddle::operators::AddOp, paddle::operators::AddOpMaker); -REGISTER_GRADIENT_OP(add_two, add_two_grad, paddle::operators::AddOpGrad); -REGISTER_OP_CPU_KERNEL( - add_two, paddle::operators::AddKernel); +REGISTER_OP(add_two, ops::AddOp, ops::AddOpMaker); +REGISTER_GRADIENT_OP(add_two, add_two_grad, ops::AddOpGrad); +REGISTER_OP_CPU_KERNEL(add_two, ops::AddKernel); diff --git a/paddle/operators/add_op.cu b/paddle/operators/add_op.cu index 2e5a755f92e4d1fa487152ed453fe3b2823062ed..79d8de6cd46e1c72b14b0554c7be7b4eee281f4c 100644 --- a/paddle/operators/add_op.cu +++ b/paddle/operators/add_op.cu @@ -1,5 +1,4 @@ -#include "paddle/operators/add_op.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/add_op.h" -REGISTER_OP_GPU_KERNEL(add_two, - paddle::operators::AddKernel); \ No newline at end of file +REGISTER_OP_GPU_KERNEL(add_two, ops::AddKernel); diff --git a/paddle/operators/add_op.h b/paddle/operators/add_op.h index 39d54a63bd16cdafeec1cfcd86ef5d142382e880..0c39433788e1e07e30aaadc4766028219b05bfa5 100644 --- a/paddle/operators/add_op.h +++ b/paddle/operators/add_op.h @@ -13,27 +13,24 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once -#include "glog/logging.h" -#include "paddle/framework/eigen.h" -#include "paddle/framework/operator.h" +#include "paddle/operators/type_alias.h" namespace paddle { namespace operators { template -class AddKernel : public framework::OpKernel { +class AddKernel : public OpKernel { public: - void Compute(const framework::KernelContext& context) const override { - auto input0 = context.Input(0)->Get(); - auto input1 = context.Input(1)->Get(); - auto* output = context.Output(0)->GetMutable(); + void Compute(const KernelContext& context) const override { + auto input0 = context.Input(0)->Get(); + auto input1 = context.Input(1)->Get(); + auto output = context.Output(0)->GetMutable(); output->mutable_data(context.GetPlace()); - framework::EigenVector::Flatten(*output).device( + EigenVector::Flatten(*output).device( *(context.GetEigenDevice())) = - framework::EigenVector::Flatten(input0) + - framework::EigenVector::Flatten(input1); + EigenVector::Flatten(input0) + EigenVector::Flatten(input1); } }; diff --git a/paddle/operators/cross_entropy_op.cc b/paddle/operators/cross_entropy_op.cc index 7d7bb09f3d63bef49913c3c7501082c509c45653..46c88d4d1a28eeedd02eb699562244651ead6d68 100644 --- a/paddle/operators/cross_entropy_op.cc +++ b/paddle/operators/cross_entropy_op.cc @@ -13,17 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/operators/cross_entropy_op.h" -#include "paddle/framework/op_registry.h" -#include "paddle/framework/tensor.h" namespace paddle { namespace operators { -class OnehotCrossEntropyOp : public framework::OperatorWithKernel { +class OnehotCrossEntropyOp : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override { + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override { PADDLE_ENFORCE(inputs.size() == 2, "Input size of OnehotCrossEntropyOp must be two"); PADDLE_ENFORCE(outputs.size() == 1, @@ -35,15 +32,14 @@ protected: PADDLE_ENFORCE(inputs[0]->dims().size() == 2, "X's dimension must be 2."); PADDLE_ENFORCE(outputs[0]->dims().size() == 1, "label's dimension must be 1."); - outputs[0]->Resize(framework::make_ddim({inputs[0]->dims()[0]})); + outputs[0]->Resize({inputs[0]->dims()[0]}); } }; -class OnehotCrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { +class OnehotCrossEntropyOpMaker : public OpProtoAndCheckerMaker { public: - OnehotCrossEntropyOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { + OnehotCrossEntropyOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The first input of OnehotCrossEntropyOp"); AddInput("label", "The second input of OnehotCrossEntropyOp"); AddOutput("Y", "The output of OnehotCrossEntropyOp"); @@ -59,9 +55,7 @@ OnehotCrossEntropy Operator. 
} // namespace paddle REGISTER_OP(onehot_cross_entropy, - paddle::operators::OnehotCrossEntropyOp, - paddle::operators::OnehotCrossEntropyOpMaker); -REGISTER_OP_CPU_KERNEL( - onehot_cross_entropy, - paddle::operators::OnehotCrossEntropyOpKernel<::paddle::platform::CPUPlace, - float>); + ops::OnehotCrossEntropyOp, + ops::OnehotCrossEntropyOpMaker); +REGISTER_OP_CPU_KERNEL(onehot_cross_entropy, + ops::OnehotCrossEntropyOpKernel); diff --git a/paddle/operators/cross_entropy_op.cu b/paddle/operators/cross_entropy_op.cu index 1bcdcb7ea650a361cad376ecdd5e96fe8e8f7c94..19e4b74596a0f59edd04db830ec6f6f481373465 100644 --- a/paddle/operators/cross_entropy_op.cu +++ b/paddle/operators/cross_entropy_op.cu @@ -1,6 +1,4 @@ #include "paddle/operators/cross_entropy_op.h" -#include "paddle/framework/op_registry.h" REGISTER_OP_GPU_KERNEL(onehot_cross_entropy, - paddle::operators::OnehotCrossEntropyOpKernel< - ::paddle::platform::GPUPlace, float>); \ No newline at end of file + ops::OnehotCrossEntropyOpKernel); \ No newline at end of file diff --git a/paddle/operators/cross_entropy_op.h b/paddle/operators/cross_entropy_op.h index ad2c7f34e1fd91b97287b4c5f4004d5b79ea4f82..0383df46be3a3cea7dde8f1b45857e64d5a2f2d8 100644 --- a/paddle/operators/cross_entropy_op.h +++ b/paddle/operators/cross_entropy_op.h @@ -13,23 +13,21 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "glog/logging.h" -#include "paddle/framework/operator.h" +#include "paddle/operators/type_alias.h" namespace paddle { namespace operators { template -class OnehotCrossEntropyOpKernel : public framework::OpKernel { +class OnehotCrossEntropyOpKernel : public OpKernel { public: constexpr T LOG_THRESHOLD() const { return static_cast(1e-20); } - void Compute(const framework::KernelContext& context) const override { - auto X = context.Input(0)->Get(); + void Compute(const KernelContext& context) const override { + auto X = context.Input(0)->Get(); const T* X_data = X.data(); - const int* label_data = - context.Input(1)->Get().data(); - auto* Y = context.Output(0)->GetMutable(); + const int* label_data = context.Input(1)->Get().data(); + auto* Y = context.Output(0)->GetMutable(); Y->mutable_data(context.GetPlace()); diff --git a/paddle/operators/fc_op.cc b/paddle/operators/fc_op.cc index 01e96f4c4817466e3266ca57a0d0ae2368b3e097..c4a9f5937f4fa8c60989bea1726cedbb73330156 100644 --- a/paddle/operators/fc_op.cc +++ b/paddle/operators/fc_op.cc @@ -12,41 +12,38 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/framework/net.h" -#include "paddle/framework/op_registry.h" -#include "paddle/framework/operator.h" +#include "type_alias.h" namespace paddle { namespace operators { -class FullyConnectedOp : public framework::PlainNet { +class FullyConnectedOp : public NetOp { public: void Init() override { - AddOp(framework::OpRegistry::CreateOp("mul", - { - Input("X"), Input("W"), - }, - {Output("before_act")}, - {})); + AddOp(OpRegistry::CreateOp("mul", + { + Input("X"), Input("W"), + }, + {Output("before_act")}, + {})); auto b = Input("b"); - if (b != framework::OperatorBase::EMPTY_VAR_NAME()) { - AddOp(framework::OpRegistry::CreateOp("rowwise_add", - {Output("before_act"), Input("b")}, - {Output("before_act")}, - {})); + if (b != EMPTY_VAR_NAME()) { + AddOp(OpRegistry::CreateOp("rowwise_add", + {Output("before_act"), Input("b")}, + {Output("before_act")}, + {})); } auto activation = GetAttr("activation"); - AddOp(framework::OpRegistry::CreateOp( + AddOp(OpRegistry::CreateOp( activation, {Output("before_act")}, {Output("Y")}, {})); CompleteAddOp(false); } }; -class FullyConnectedOpMaker : public framework::OpProtoAndCheckerMaker { +class FullyConnectedOpMaker : public OpProtoAndCheckerMaker { public: - FullyConnectedOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) + FullyConnectedOpMaker(OpProto *proto, OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "the input of fc operator"); AddInput("W", "the weight of fc operator"); @@ -71,6 +68,4 @@ USE_OP(rowwise_add); USE_OP(sigmoid); USE_OP(softmax); -REGISTER_OP(fc, - paddle::operators::FullyConnectedOp, - paddle::operators::FullyConnectedOpMaker); +REGISTER_OP(fc, ops::FullyConnectedOp, ops::FullyConnectedOpMaker); diff --git a/paddle/operators/random_op.cc b/paddle/operators/gaussian_random_op.cc similarity index 100% rename from paddle/operators/random_op.cc rename to paddle/operators/gaussian_random_op.cc index 674c85134529c3bfb7a8810c9a1ee15fe249c05a..7afc0cd56b2d4719d2346f1d826e0a7bd0a796b9 100644 --- a/paddle/operators/random_op.cc +++ b/paddle/operators/gaussian_random_op.cc @@ -12,9 +12,9 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/operators/random_op.h" #include "glog/logging.h" #include "paddle/framework/op_registry.h" +#include "paddle/operators/random_op.h" namespace paddle { namespace operators { diff --git a/paddle/operators/random_op.cu b/paddle/operators/gaussian_random_op.cu similarity index 100% rename from paddle/operators/random_op.cu rename to paddle/operators/gaussian_random_op.cu diff --git a/paddle/operators/random_op.h b/paddle/operators/gaussian_random_op.h similarity index 100% rename from paddle/operators/random_op.h rename to paddle/operators/gaussian_random_op.h diff --git a/paddle/operators/mul_op.cc b/paddle/operators/mul_op.cc index cd74c8b976d18ffecd50077cc81e1fce56bea155..22c1b78005358a934c57d487f5b0cff133f61f0c 100644 --- a/paddle/operators/mul_op.cc +++ b/paddle/operators/mul_op.cc @@ -13,17 +13,14 @@ limitations under the License. 
*/ #include "paddle/operators/mul_op.h" -#include "paddle/framework/op_registry.h" -#include "paddle/framework/tensor.h" namespace paddle { namespace operators { -class MulOp : public framework::OperatorWithKernel { +class MulOp : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override { + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override { PADDLE_ENFORCE(inputs.size() == 2, "The mul op must take two inputs"); auto dim0 = inputs[0]->dims(); auto dim1 = inputs[1]->dims(); @@ -37,10 +34,10 @@ protected: } }; -class MulOpMaker : public framework::OpProtoAndCheckerMaker { +class MulOpMaker : public OpProtoAndCheckerMaker { public: - MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { + MulOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "The first input of mul op"); AddInput("Y", "The second input of mul op"); AddOutput("Out", "The output of mul op"); @@ -52,11 +49,10 @@ The equation is: Out = X * Y } }; -class MulOpGrad : public framework::OperatorWithKernel { +class MulOpGrad : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override {} + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override {} std::string DebugString() const override { LOG(INFO) << "MulGrad"; return ""; @@ -66,8 +62,7 @@ protected: } // namespace operators } // namespace paddle -REGISTER_OP(mul, paddle::operators::MulOp, paddle::operators::MulOpMaker); -REGISTER_GRADIENT_OP(mul, mul_grad, paddle::operators::MulOpGrad); +REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker); +REGISTER_GRADIENT_OP(mul, mul_grad, ops::MulOpGrad); -REGISTER_OP_CPU_KERNEL( - mul, paddle::operators::MulKernel); +REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel); diff --git a/paddle/operators/mul_op.cu b/paddle/operators/mul_op.cu index 3ee581dc77dc08e6e47b240588811fbc7c6ea303..c27fc886ce7238a13c8ef86bce673a2b54949a9d 100644 --- a/paddle/operators/mul_op.cu +++ b/paddle/operators/mul_op.cu @@ -13,8 +13,5 @@ limitations under the License. 
*/ #include "paddle/operators/mul_op.h" -#include "paddle/framework/op_registry.h" -REGISTER_OP_GPU_KERNEL(mul, - paddle::operators::MulKernel); \ No newline at end of file +REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel); \ No newline at end of file diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h index e6bad7fb9da2d489666aa67f032552e48a86c6cb..467975044638a3f034ceec84173e8d3fed43cc0c 100644 --- a/paddle/operators/mul_op.h +++ b/paddle/operators/mul_op.h @@ -14,30 +14,27 @@ #pragma once -#include "glog/logging.h" -#include "paddle/framework/eigen.h" -#include "paddle/framework/operator.h" +#include "paddle/operators/type_alias.h" namespace paddle { namespace operators { template -class MulKernel : public framework::OpKernel { +class MulKernel : public OpKernel { public: - void Compute(const framework::KernelContext& context) const override { + void Compute(const KernelContext& context) const override { Eigen::array, 1> dim_pair = { {Eigen::IndexPair(1, 0)}}; - auto input0 = context.Input(0)->Get(); - auto input1 = context.Input(1)->Get(); - auto* output = context.Output(0)->GetMutable(); + auto input0 = context.Input(0)->Get(); + auto input1 = context.Input(1)->Get(); + auto* output = context.Output(0)->GetMutable(); output->mutable_data(context.GetPlace()); - framework::EigenMatrix::From(*output).device( - *(context.GetEigenDevice())) = - framework::EigenMatrix::From(input0).contract( - framework::EigenMatrix::From(input1), dim_pair); + EigenMatrix::From(*output).device(*(context.GetEigenDevice())) = + EigenMatrix::From(input0).contract(EigenMatrix::From(input1), + dim_pair); } }; } // namespace operators diff --git a/paddle/operators/recurrent_network_op.cc b/paddle/operators/recurrent_network_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..1a101d6ddf149d608dbdbe048ef43d86bacbcc16 --- /dev/null +++ b/paddle/operators/recurrent_network_op.cc @@ -0,0 +1,419 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/
+
+#include "paddle/operators/recurrent_network_op.h"
+
+#include
+#include
+#include
+
+#include "paddle/framework/net.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+
+namespace rnn {
+
+void SegmentInputs(std::vector<std::shared_ptr<Scope>>& step_scopes,
+                   const std::vector<Link>& inlinks,
+                   const size_t seq_len) {
+  PADDLE_ENFORCE(!inlinks.empty(), "no inlinks are provided.");
+  for (size_t i = 0; i < inlinks.size(); ++i) {
+    Tensor* input =
+        step_scopes[0]->GetVariable(inlinks[i].external)->GetMutable<Tensor>();
+    DDim dims = input->dims();
+    PADDLE_ENFORCE(static_cast<size_t>(dims[0]) == seq_len,
+                   "all the inlinks must have the same length");
+    DDim step_dims = slice_ddim(dims, 1, dims.size());
+    for (size_t j = 0; j < seq_len; j++) {
+      Tensor* step_input = step_scopes[j]
+                               ->CreateVariable(inlinks[i].internal)
+                               ->GetMutable<Tensor>();
+      *step_input = input->Slice<float>(j, j + 1);
+      step_input->Resize(step_dims);
+    }
+  }
+}
+
+void ConcatOutputs(std::vector<std::shared_ptr<Scope>>& step_scopes,
+                   const std::vector<Link>& outlinks,
+                   const size_t seq_len) {
+  for (size_t i = 0; i < outlinks.size(); i++) {
+    Tensor* output =
+        step_scopes[0]->GetVariable(outlinks[i].external)->GetMutable<Tensor>();
+
+    // TODO(qingqing) remove following code after adding
+    // InferShape in RecurrentGradientOp
+    DDim step_dims = step_scopes[0]
+                         ->GetVariable(outlinks[i].internal)
+                         ->GetMutable<Tensor>()
+                         ->dims();
+    std::vector<int> dims_vec = vectorize(step_dims);
+    dims_vec.insert(dims_vec.begin(), seq_len);
+    output->mutable_data<float>(make_ddim(dims_vec), platform::CPUPlace());
+
+    for (size_t j = 0; j < seq_len; j++) {
+      Tensor* step_output = step_scopes[j]
+                                ->GetVariable(outlinks[i].internal)
+                                ->GetMutable<Tensor>();
+      // TODO(luotao02) the data type and platform::DeviceContext() should be
+      // set correctly
+      (output->Slice<float>(j, j + 1))
+          .CopyFrom<float>(*step_output, platform::CPUPlace());
+    }
+  }
+}
+
+void LinkMemories(std::vector<std::shared_ptr<Scope>>& scopes,
+                  const std::vector<MemoryAttr>& memories,
+                  size_t step_id,
+                  int offset) {
+  PADDLE_ENFORCE(step_id < scopes.size(),
+                 "step [%d] is out of range of step scopes' size [%d]",
+                 step_id,
+                 scopes.size());
+  PADDLE_ENFORCE(static_cast<int>(step_id) + offset >= 0,
+                 "offset [%d] must be larger than -[%d]",
+                 offset,
+                 step_id);
+  PADDLE_ENFORCE(step_id + offset < scopes.size(),
+                 "offset [%d] is out of range, it must be less than (%d - %d)",
+                 offset,
+                 scopes.size(),
+                 step_id);
+  std::shared_ptr<Scope> scope = scopes[step_id];
+  std::shared_ptr<Scope> linked_scope = scopes[step_id + offset];
+  for (auto& attr : memories) {
+    auto mem = scope->CreateVariable(attr.pre_var)->GetMutable<Tensor>();
+    // maybe sharing the variable is better?
+    auto linked_mem = linked_scope->GetVariable(attr.var)->GetMutable<Tensor>();
+    mem->ShareDataWith<float>(*linked_mem);
+
+    // TODO(qingqing) remove following code
+    // the memory of the current step should be allocated in the step net
+    auto m = scope->CreateVariable(attr.var)->GetMutable<Tensor>();
+    // for unit test, as addOp and mulOp are null currently, without
+    // mutable_data, mem.data() in the output will be an error. We will
+    // remove this line after merging the correct addOp and mulOp.
+    m->mutable_data<float>(mem->dims(), platform::CPUPlace());
+  }
+}
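
`rnn::LinkMemories` above never copies state: the pre-state variable of one step scope is made to share the buffer of a neighboring step's state, with offset -1 used by the forward pass and offset +1 by the backward pass. A minimal standalone analogy of that sharing, using toy types and hypothetical names rather than the framework API:

```c++
// Toy model of rnn::LinkMemories: "pre_h" of one step aliases "h" of the
// neighboring step, so writes through one name are visible through the other.
#include <cassert>
#include <map>
#include <memory>
#include <string>
#include <vector>

using Buffer = std::shared_ptr<std::vector<float>>;
using Scope = std::map<std::string, Buffer>;

// offset = -1 links forward (step i sees step i-1's state);
// offset = +1 links backward for the gradient pass.
void LinkMemories(std::vector<Scope>& scopes, size_t step_id, int offset) {
  Scope& scope = scopes[step_id];
  Scope& linked = scopes[step_id + offset];
  scope["pre_h"] = linked["h"];  // share the buffer, do not copy it
}

int main() {
  std::vector<Scope> scopes(4);
  for (auto& s : scopes) s["h"] = std::make_shared<std::vector<float>>(8, 1.f);
  for (size_t i = 1; i < scopes.size(); ++i) LinkMemories(scopes, i, -1);
  (*scopes[0]["h"])[0] = 42.f;  // visible through step 1's "pre_h" alias
  assert((*scopes[1]["pre_h"])[0] == 42.f);
  return 0;
}
```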
+
+void InitArgument(const ArgumentName& name,
+                  Argument* arg,
+                  const OperatorBase& op) {
+  arg->step_net = op.Input(name.step_net);
+  arg->step_scopes = op.Output(name.step_scopes);
+
+  auto inlinks = op.Inputs(name.inlinks);
+  auto inlink_alias = op.GetAttr<std::vector<std::string>>(name.inlink_alias);
+  PADDLE_ENFORCE(inlinks.size() == inlink_alias.size(),
+                 "the sizes of inlinks and inlink_alias do not match: %d, %d",
+                 inlinks.size(),
+                 inlink_alias.size());
+  for (size_t i = 0; i < inlinks.size(); ++i) {
+    rnn::Link link;
+    link.external = inlinks[i];
+    link.internal = inlink_alias[i];
+    (arg->inlinks).push_back(link);
+  }
+
+  auto outlinks = op.Outputs(name.outlinks);
+  auto outlink_alias = op.GetAttr<std::vector<std::string>>(name.outlink_alias);
+  PADDLE_ENFORCE(outlinks.size() == outlink_alias.size(),
+                 "the sizes of outlinks and outlink_alias do not match: %d, %d",
+                 outlinks.size(),
+                 outlink_alias.size());
+  for (size_t i = 0; i < outlinks.size(); ++i) {
+    rnn::Link link;
+    link.external = outlinks[i];
+    link.internal = outlink_alias[i];
+    (arg->outlinks).push_back(link);
+  }
+
+  auto boot_memories = op.Inputs(name.boot_memories);
+
+  // attributes
+  auto memories = op.GetAttr<std::vector<std::string>>(name.memories);
+  auto pre_memories = op.GetAttr<std::vector<std::string>>(name.pre_memories);
+
+  PADDLE_ENFORCE(memories.size() == boot_memories.size(),
+                 "the sizes of memories and boot_memories do not match: %d, %d",
+                 memories.size(),
+                 boot_memories.size());
+  PADDLE_ENFORCE(pre_memories.size() == boot_memories.size(),
+                 "the sizes of pre_memories and boot_memories do not match: %d, %d",
+                 pre_memories.size(),
+                 boot_memories.size());
+  PADDLE_ENFORCE(memories.size() > 0, "at least one memory should be set");
+
+  for (size_t i = 0; i < memories.size(); ++i) {
+    rnn::MemoryAttr mem_attr;
+    mem_attr.var = memories[i];
+    mem_attr.pre_var = pre_memories[i];
+    mem_attr.boot_var = boot_memories[i];
+    (arg->memories).push_back(mem_attr);
+  }
+}
+
+}  // namespace rnn
+
+void RecurrentAlgorithm::InferShape(const std::shared_ptr<Scope>& scope) const {
+  seq_len_ = scope->GetVariable((arg_->inlinks[0]).external)
+                 ->GetMutable<Tensor>()
+                 ->dims()[0];
+  CreateScopes(scope);
+  auto step_scopes = GetStepScopes(scope);
+
+  // SegmentInputs is called in InferShape. The input must hold memory in
+  // SegmentInputs. But the other ops only set the dimension of the output in
+  // InferShape. That is a problem: does the RNN op need InferShape or not?
+  // Do the following functions (SegmentInputs, InitMemories, ...) need to be
+  // rewritten for the RNN op?
+ rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_); + + InitMemories(step_scopes[0]); + + PADDLE_ENFORCE(scope->HasVariable(arg_->step_net), + "stepnet [%s] is not in scope.", + arg_->step_net); + Variable* net = scope->GetVariable(arg_->step_net); + PADDLE_ENFORCE(net != nullptr, "failed to get step net"); + // If the InferShape is called in OperatorBase's run function, + // the rnn op only needs to do InferShape for the first time step + for (size_t i = 0; i < seq_len_; i++) { + if (i > 0) { + rnn::LinkMemories(step_scopes, arg_->memories, i, -1); + } + net->GetMutable()->InferShape(step_scopes[i]); + } + + auto outlinks = arg_->outlinks; + for (size_t i = 0; i < outlinks.size(); i++) { + DDim step_dims = step_scopes[0] + ->GetVariable(outlinks[i].internal) + ->GetMutable() + ->dims(); + std::vector dims_vec = vectorize(step_dims); + // now only support fixed length + dims_vec.insert(dims_vec.begin(), seq_len_); + Tensor* output = + step_scopes[0]->GetVariable(outlinks[i].external)->GetMutable(); + output->Resize(make_ddim(dims_vec)); + } +} + +void RecurrentAlgorithm::Run(const std::shared_ptr& scope, + const platform::DeviceContext& dev_ctx) const { + auto step_scopes = GetStepScopes(scope); + + Variable* net = scope->GetVariable(arg_->step_net); + for (size_t step_id = 0; step_id < seq_len_; step_id++) { + // the link memory is done in InferShape + // maybe remove following code after testing + if (step_id > 0) { + rnn::LinkMemories(step_scopes, arg_->memories, step_id, -1); + } + net->GetMutable()->Run(step_scopes[step_id], dev_ctx); + } + + rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_); +} + +void RecurrentAlgorithm::CreateScopes(std::shared_ptr scope) const { + // TODO(xxx) Only two scopes are needed for inference, this case will be + // supported later. + auto step_scopes = scope->GetVariable(arg_->step_scopes) + ->GetMutable>>(); + + if (seq_len_ > step_scopes->size()) { + for (size_t i = step_scopes->size(); i < seq_len_; ++i) { + std::shared_ptr step_scope = std::make_shared(scope); + + // Now all variables in scope must be created outside of op. 
+ auto net_op = scope->GetVariable(arg_->step_net)->GetMutable(); + for (auto& input : net_op->inputs_) { + step_scope->CreateVariable(input); + } + for (auto& output : net_op->outputs_) { + step_scope->CreateVariable(output); + } + + step_scopes->push_back(std::make_shared(step_scope)); + } + } +} + +void RecurrentAlgorithm::InitMemories(std::shared_ptr step_scope) const { + for (auto& attr : arg_->memories) { + Tensor* pre_mem = + step_scope->CreateVariable(attr.pre_var)->GetMutable(); + PADDLE_ENFORCE(step_scope->HasVariable(attr.boot_var), + "memory [%s]'s boot variable [%s] not exists", + attr.var, + attr.boot_var); + Tensor* boot_mem = + step_scope->GetVariable(attr.boot_var)->GetMutable(); + pre_mem->ShareDataWith(*boot_mem); + + // TODO(qingqing) remove following code + // the memory of current step should be allocated in step net + // here for unit test + auto cur_step_mem = + step_scope->CreateVariable(attr.var)->GetMutable(); + cur_step_mem->mutable_data(boot_mem->dims(), platform::CPUPlace()); + } +} + +const rnn::ArgumentName RecurrentOp::kArgName{"step_net", + "step_scopes", + "inlinks", + "outlinks", + "inlink_alias", + "outlink_alias", + "memories", + "pre_memories", + "boot_memories"}; + +const rnn::ArgumentName RecurrentGradientOp::kArgName{"step_net", + "step_scopes", + "outlink@grad", + "inlink@grad", + "inlink_alias", + "outlink_alias", + "memories", + "pre_memories", + "boot_memories@grad"}; + +void RecurrentOp::Init() { + OperatorBase::Init(); + std::unique_ptr arg(new rnn::Argument()); + rnn::InitArgument(kArgName, arg.get(), *this); + alg_.Init(std::move(arg)); +} + +class RecurrentAlgorithmProtoAndCheckerMaker : public OpProtoAndCheckerMaker { +public: + RecurrentAlgorithmProtoAndCheckerMaker(OpProto* proto, + OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + const auto& name = RecurrentOp::kArgName; + // inputs and outputs stored in proto + AddInputs(name.inlinks, + "the input that need to be segmented for each step."); + AddInputs(name.boot_memories, "variables to initialize memories."); + AddInput(name.step_net, "network shared by all steps."); + + AddOutputs(name.outlinks, + "the output that need to concated for all steps."); + AddOutput(name.step_scopes, "step scopes"); + + // Attributes stored in AttributeMap + AddAttr>(name.inlink_alias, "alias of inlinks"); + AddAttr>(name.outlink_alias, "alias of outlinks"); + AddAttr>(name.pre_memories, + "names of pre-memories"); + AddAttr>(name.memories, "names of memories"); + + AddComment("This is a recurrent group operator."); + } +}; + +void RecurrentGradientAlgorithm::Run( + const std::shared_ptr& scope, + const platform::DeviceContext& dev_ctx) const { + auto step_scopes = GetStepScopes(scope); + rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_); + PADDLE_ENFORCE(scope->HasVariable(arg_->step_net), + "step net is not in scope."); + Variable* net = scope->GetVariable(arg_->step_net); + PADDLE_ENFORCE(net != nullptr, "failed to get step net"); + for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) { + if (static_cast(step_id) != seq_len_ - 1) { + rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1); + } + net->GetMutable()->Run(step_scopes[step_id], dev_ctx); + } + LinkBootMemoryGradients(step_scopes[0]); + rnn::ConcatOutputs(step_scopes, arg_->outlinks, seq_len_); +} + +void RecurrentGradientAlgorithm::LinkBootMemoryGradients( + std::shared_ptr step_scope) const { + for (auto& attr : arg_->memories) { + Tensor* mem_grad = + 
step_scope->CreateVariable(attr.var)->GetMutable<Tensor>();
+    PADDLE_ENFORCE(mem_grad != nullptr,
+                   "boot_tensor should be retrieved before");
+    PADDLE_ENFORCE(step_scope->HasVariable(attr.boot_var),
+                   "memory [%s]'s boot variable [%s] does not exist",
+                   attr.var,
+                   attr.boot_var);
+    Tensor* boot_mem_grad =
+        step_scope->CreateVariable(attr.boot_var)->GetMutable<Tensor>();
+    boot_mem_grad->ShareDataWith<float>(*mem_grad);
+  }
+}
+
+void RecurrentGradientAlgorithm::InferShape(
+    const std::shared_ptr<Scope>& scope) const {
+  seq_len_ = scope->GetVariable((arg_->inlinks[0]).external)
+                 ->GetMutable<Tensor>()
+                 ->dims()[0];
+  auto step_scopes = GetStepScopes(scope);
+  rnn::SegmentInputs(step_scopes, arg_->inlinks, seq_len_);
+
+  PADDLE_ENFORCE(scope->HasVariable(arg_->step_net),
+                 "step net is not in scope.");
+  Variable* net = scope->GetVariable(arg_->step_net);
+  PADDLE_ENFORCE(net != nullptr, "failed to get step net");
+
+  for (int step_id = seq_len_ - 1; step_id >= 0; --step_id) {
+    if (static_cast<size_t>(step_id) != seq_len_ - 1) {
+      rnn::LinkMemories(step_scopes, arg_->memories, step_id, 1);
+    }
+    net->GetMutable<NetOp>()->InferShape(step_scopes[step_id]);
+  }
+
+  auto outlinks = arg_->outlinks;
+  for (size_t i = 0; i < outlinks.size(); i++) {
+    DDim step_dims = step_scopes[0]
+                         ->GetVariable(outlinks[i].internal)
+                         ->GetMutable<Tensor>()
+                         ->dims();
+    std::vector<int> dims_vec = vectorize(step_dims);
+    // now only support fixed length
+    dims_vec.insert(dims_vec.begin(), seq_len_);
+    Tensor* output =
+        step_scopes[0]->GetVariable(outlinks[i].external)->GetMutable<Tensor>();
+    output->Resize(make_ddim(dims_vec));
+  }
+  LinkBootMemoryGradients(step_scopes[0]);
+}
+
+void RecurrentGradientOp::Init() {
+  OperatorBase::Init();
+  std::unique_ptr<rnn::Argument> arg(new rnn::Argument());
+  rnn::InitArgument(kArgName, arg.get(), *this);
+  alg_.Init(std::move(arg));
+}
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP(recurrent_op,
+            paddle::operators::RecurrentOp,
+            paddle::operators::RecurrentAlgorithmProtoAndCheckerMaker);
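
The registration above completes the forward operator. Its `InitArgument` (earlier in this file) leans on one convention: the i-th entries of the `memories`, `pre_memories` and `boot_memories` lists describe the same recurrent state. A standalone sketch of that zipping, with hypothetical names and none of the framework types:

```c++
// Zip three parallel name lists into per-state attributes, mirroring the
// convention InitArgument enforces with its size checks.
#include <cassert>
#include <iostream>
#include <string>
#include <vector>

struct MemoryAttr {
  std::string var, pre_var, boot_var;
};

std::vector<MemoryAttr> ZipMemories(const std::vector<std::string>& memories,
                                    const std::vector<std::string>& pre_memories,
                                    const std::vector<std::string>& boot_memories) {
  assert(memories.size() == pre_memories.size());
  assert(memories.size() == boot_memories.size());
  std::vector<MemoryAttr> attrs;
  for (size_t i = 0; i < memories.size(); ++i) {
    attrs.push_back({memories[i], pre_memories[i], boot_memories[i]});
  }
  return attrs;
}

int main() {
  auto attrs = ZipMemories({"rnn/h"}, {"rnn/h@pre"}, {"h_boot"});
  std::cout << attrs[0].var << " <- " << attrs[0].pre_var
            << " (boot: " << attrs[0].boot_var << ")\n";
  return 0;
}
```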
diff --git a/paddle/operators/recurrent_network_op.h b/paddle/operators/recurrent_network_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..8946c8ce38117c391edcf56558c640ebd0d7f75c
--- /dev/null
+++ b/paddle/operators/recurrent_network_op.h
@@ -0,0 +1,216 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+using namespace paddle::framework;
+
+namespace rnn {
+
+/**
+ * Memory of an RNN (same as the role of `Memory` in PaddlePaddle).
+ *
+ * Memory attributes cached by this op; dims will be inferred from the
+ * boot memories in the parent scope. Other attributes are copied from the
+ * Op's proto attributes.
+ */
+struct MemoryAttr {
+  // name of current state variable
+  std::string var;
+  // name of previous step's state variable
+  std::string pre_var;
+  // name of the variable used to init this memory (same role as `boot_layer`
+  // in PaddlePaddle), which is stored in the parent scope.
+  std::string boot_var;
+};
+
+struct Link {
+  // the input or output link's name.
+  std::string internal;
+  // alias to avoid duplicate keys in scopes.
+  std::string external;
+};
+
+struct Argument {
+  std::string step_net;
+  std::string step_scopes;
+  std::vector<Link> inlinks;
+  std::vector<Link> outlinks;
+  std::vector<MemoryAttr> memories;
+};
+
+struct ArgumentName {
+  std::string step_net;
+  std::string step_scopes;
+  std::string inlinks;
+  std::string outlinks;
+  std::string inlink_alias;   // the alias of inlinks in step net.
+  std::string outlink_alias;  // the alias of outlinks in step net.
+  std::string memories;       // the memory name
+  std::string pre_memories;   // the previous memory name
+  std::string boot_memories;  // the boot memory name
+};
+
+/**
+ * Prepare inputs for each step net.
+ */
+void SegmentInputs(std::vector<std::shared_ptr<Scope>>& step_scopes,
+                   const std::vector<Link>& inlinks,
+                   const size_t seq_len);
+
+/**
+ * Process outputs of step nets and merge them into variables.
+ */
+void ConcatOutputs(std::vector<std::shared_ptr<Scope>>& step_scopes,
+                   const std::vector<Link>& outlinks,
+                   const size_t seq_len);
+
+void LinkMemories(std::vector<std::shared_ptr<Scope>>& step_scopes,
+                  const std::vector<MemoryAttr>& memories,
+                  size_t step_id,
+                  int offset);
+
+void InitArgument(const ArgumentName& name,
+                  Argument* arg,
+                  const OperatorBase& op);
+
+}  // namespace rnn
+
+// The sequence format in RecurrentOp is Tensor now.
+// TODO:
+// 1. No-padding computing for sequences with indefinite length in one batch.
+// 2. Hierarchical RNN for sequences with sub-sequences.
+// 3. Internal Memory.
+// 4. More complex RNN architectures, such as Gated Feedback RNN.
+//    Refer to: https://arxiv.org/pdf/1502.02367.pdf
+
+class RecurrentAlgorithm {
+public:
+  void Run(const std::shared_ptr<Scope>& scope,
+           const platform::DeviceContext& dev_ctx) const;
+
+  void Init(std::unique_ptr<rnn::Argument> arg) { arg_ = std::move(arg); }
+
+  /**
+   * InferShape must be called before Run.
+   */
+  void InferShape(const std::shared_ptr<Scope>& scope) const;
+
+protected:
+  /*
+   * The step scopes will be stored in the parent scope as a variable.
+   *
+   * NOTE the scopes are reused by both the forward and the backward pass, so
+   * they are created once and expanded if more steps are needed.
+   */
+  void CreateScopes(std::shared_ptr<Scope> scope) const;
+
+  inline const std::vector<std::shared_ptr<Scope>>& GetStepScopes(
+      std::shared_ptr<Scope> scope) const {
+    return *(scope->GetVariable(arg_->step_scopes))
+                ->GetMutable<std::vector<std::shared_ptr<Scope>>>();
+  }
+
+  void InitMemories(std::shared_ptr<Scope> step_scopes) const;
+
+private:
+  std::unique_ptr<rnn::Argument> arg_;
+  mutable size_t seq_len_;
+};
+
+class RecurrentGradientAlgorithm {
+  /**
+   * RNN's backward algorithm.
+   *
+   * To accelerate the development of RecurrentGradientOp, we decouple the
+   * RNN's algorithm from the `OperatorBase` implementation: the former
+   * contains the core implementation of an RNN and stays stable even if the
+   * framework changes a lot, while the latter is a wrapper that acts as an
+   * adapter to make the RNN an operator.
+   */
+public:
+  void Init(std::unique_ptr<rnn::Argument> arg) { arg_ = std::move(arg); }
+
+  void Run(const std::shared_ptr<Scope>& scope,
+           const platform::DeviceContext& dev_ctx) const;
+
+  void LinkBootMemoryGradients(std::shared_ptr<Scope> step_scopes) const;
+
+  /**
+   * InferShape must be called before Run.
+ */ + void InferShape(const std::shared_ptr& scope) const; + +protected: + inline const std::vector>& GetStepScopes( + std::shared_ptr scope) const { + return *(scope->GetVariable(arg_->step_scopes)) + ->GetMutable>>(); + } + +private: + std::unique_ptr arg_; + mutable size_t seq_len_; +}; + +class RecurrentOp final : public OperatorBase { +public: + void Init() override; + + /** + * InferShape must be called before Run. + */ + virtual void InferShape(const std::shared_ptr& scope) const override { + alg_.InferShape(scope); + } + + virtual void Run(const std::shared_ptr& scope, + const platform::DeviceContext& dev_ctx) const override { + alg_.Run(scope, dev_ctx); + } + + static const rnn::ArgumentName kArgName; + +private: + RecurrentAlgorithm alg_; +}; + +class RecurrentGradientOp final : public OperatorBase { +public: + void Init() override; + + /** + * InferShape must be called before Run. + */ + virtual void InferShape(const std::shared_ptr& scope) const override { + alg_.InferShape(scope); + } + + virtual void Run(const std::shared_ptr& scope, + const platform::DeviceContext& dev_ctx) const override { + alg_.Run(scope, dev_ctx); + } + + static const rnn::ArgumentName kArgName; + +private: + RecurrentGradientAlgorithm alg_; +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/recurrent_network_op_test.cc b/paddle/operators/recurrent_network_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..6784ac6001ad1b464d65814cff1ad6247826ad66 --- /dev/null +++ b/paddle/operators/recurrent_network_op_test.cc @@ -0,0 +1,400 @@ +/* + Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+*/ + +#include +#include + +#include "paddle/framework/net.h" +#include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" +#include "paddle/framework/tensor.h" +#include "paddle/operators/recurrent_network_op.h" + +namespace paddle { +namespace operators { + +class RecurrentOpTest : public ::testing::Test { +protected: + virtual void SetUp() override { + CreateGlobalVariables(); + CreateStepNet(); + CreateRNNOp(); + } + + virtual void TearDown() override {} + + void CreateGlobalVariables() { + scope_ = std::make_shared(); + // create input, and init content + LOG(INFO) << "create global variable x"; + for (auto inlink : std::vector{"x", "x0", "x1", "h"}) { + Variable* x = scope_->CreateVariable(inlink); + DDim dims = make_ddim(std::vector{ + 10 /*sent size*/, 20 /*batch size*/, 30 /*input dim*/}); + x->GetMutable()->mutable_data(dims, platform::CPUPlace()); + } + // create output alias just for test + for (auto inlink : std::vector{"h@alias"}) { + Variable* x = scope_->CreateVariable(inlink); + DDim dims = + make_ddim(std::vector{20 /*batch size*/, 30 /*input dim*/}); + x->GetMutable()->mutable_data(dims, platform::CPUPlace()); + } + + LOG(INFO) << "create global variable w"; + Variable* w = scope_->CreateVariable("rnn/w"); + w->GetMutable()->mutable_data( + make_ddim(std::vector{30, 30}), platform::CPUPlace()); + + for (auto boot : std::vector{"x_boot", "h_boot"}) { + LOG(INFO) << "create global variable " << boot; + Variable* h_boot = scope_->CreateVariable(boot); + h_boot->GetMutable()->mutable_data( + make_ddim(std::vector{20 /*batch size*/, 30 /*input dim*/}), + platform::CPUPlace()); + } + + LOG(INFO) << "create variable step_scopes"; + scope_->CreateVariable("step_scopes"); + + LOG(INFO) << "create variable h"; + scope_->CreateVariable("h"); + } + + void CreateRNNOp() { + OpDesc op_desc; + + op_desc.set_type("recurrent_op"); + // inlinks 0 + op_desc.add_inputs("x"); + op_desc.add_inputs("x0"); + op_desc.add_inputs("x1"); + // boot_memories 3 + op_desc.add_inputs("x_boot"); + op_desc.add_inputs("h_boot"); + // step net 5 + op_desc.add_inputs("step_net"); + // outlinks 6 + op_desc.add_outputs("h"); + // step scopes 7 + op_desc.add_outputs("step_scopes"); + + auto _input_format = std::vector{ + 0, // in_link + 3, // memories + 5 // step_net + }; + auto input_format = op_desc.add_attrs(); + input_format->set_name("input_format"); + input_format->set_type(paddle::framework::AttrType::INTS); + for (auto i : _input_format) { + input_format->add_ints(i); + } + + auto output_format = op_desc.add_attrs(); + output_format->set_name("output_format"); + output_format->set_type(paddle::framework::AttrType::INTS); + for (auto i : std::vector{0, 1, 2}) { + output_format->add_ints(i); + } + + auto inlink_alias = op_desc.add_attrs(); + inlink_alias->set_name("inlink_alias"); + inlink_alias->set_type(paddle::framework::AttrType::STRINGS); + + auto outlink_alias = op_desc.add_attrs(); + outlink_alias->set_name("outlink_alias"); + outlink_alias->set_type(paddle::framework::AttrType::STRINGS); + + auto pre_memories = op_desc.add_attrs(); + pre_memories->set_name("pre_memories"); + pre_memories->set_type(paddle::framework::AttrType::STRINGS); + + auto memories = op_desc.add_attrs(); + memories->set_name("memories"); + memories->set_type(paddle::framework::AttrType::STRINGS); + + // create inlink_alias + for (const auto& item : + std::vector{"x@alias", "x0@alias", "x1@alias"}) { + inlink_alias->add_strings(item); + } + // pre memories + for (const auto& item : + 
std::vector{"rnn/x@pre", "rnn/h@pre"}) { + pre_memories->add_strings(item); + } + // memories + for (const auto& item : std::vector{"rnn/x", "rnn/h"}) { + memories->add_strings(item); + } + // output alias + for (const auto& item : std::vector{"h@alias"}) { + outlink_alias->add_strings(item); + } + + rnn_op_ = OpRegistry::CreateOp(op_desc); + + LOG(INFO) << "rnn_op finish init"; + } + + void CreateStepNet() { + LOG(INFO) << "create variable step_net"; + Variable* var = scope_->CreateVariable("step_net"); + auto net = var->GetMutable(); + // rnn/s is net's input or output? + net->inputs_ = {"rnn/h@pre", "rnn/w", "rnn/x"}; + net->inputs_ = {"rnn/s", "rnn/h"}; + net->AddOp( + OpRegistry::CreateOp("mul", {"rnn/h@pre", "rnn/w"}, {"rnn/s"}, {})); + + net->AddOp( + OpRegistry::CreateOp("add_two", {"rnn/x", "rnn/s"}, {"rnn/h"}, {})); + net->CompleteAddOp(); + } + + // father scope + std::shared_ptr scope_; + std::shared_ptr rnn_op_; +}; + +TEST_F(RecurrentOpTest, Run) { + platform::CPUDeviceContext ctx; + rnn_op_->InferShape(scope_); + rnn_op_->Run(scope_, ctx); +} + +class RecurrentGradientAlgorithmTest : public ::testing::Test { +protected: + virtual void SetUp() override { + CreateGlobalVariables(); + CreateStepScopes(); + CreateStepNet(); + CreateRNNGradientAlgorithm(); + + // segment inputs + SegmentInputs(); + // link forward memories + LinkeMemories(); + } + + virtual void TearDown() override {} + + void CreateGlobalVariables() { + scope_ = std::make_shared(); + // inputs: x + LOG(INFO) << "create global variable x"; + Variable* x = scope_->CreateVariable("x"); + DDim dims = + make_ddim({10 /*sent size*/, 20 /*batch size*/, 30 /*input dim*/}); + x->GetMutable()->mutable_data(dims, platform::CPUPlace()); + // inputs: h_boot + LOG(INFO) << "create global variable h_boot"; + Variable* h_boot = scope_->CreateVariable("h_boot"); + h_boot->GetMutable()->mutable_data( + make_ddim({20 /*batch size*/, 30 /*input dim*/}), platform::CPUPlace()); + // inputs: w + LOG(INFO) << "create global variable w"; + Variable* w = scope_->CreateVariable("rnn/w"); + w->GetMutable()->mutable_data(make_ddim({30, 30}), + platform::CPUPlace()); + // inputs: h_grad + LOG(INFO) << "create variable h_grad"; + Variable* dh = scope_->CreateVariable("h_grad"); + dh->GetMutable()->mutable_data(make_ddim({10, 20, 30}), + platform::CPUPlace()); + // inputs: step_scopes + LOG(INFO) << "create variable step_scopes"; + scope_->CreateVariable("step_scopes"); + // inputs: step_net + LOG(INFO) << "create variable step_net"; + scope_->CreateVariable("step_net"); + // outputs: w_grad + LOG(INFO) << "create global variable w_grad"; + scope_->CreateVariable("rnn/w_grad"); + // outputs: x_grad + LOG(INFO) << "create global variable x_grad"; + scope_->CreateVariable("x_grad"); + // outputs: h_boot_grad + LOG(INFO) << "create global variable h_boot_grad"; + scope_->CreateVariable("h_boot_grad"); + } + + void CreateStepScopes() { + std::vector>* step_scopes = + scope_->GetVariable("step_scopes") + ->GetMutable>>(); + for (int i = 0; i < 10; ++i) { + auto scope = std::make_shared(scope_); + auto pre_t = scope->CreateVariable("rnn/pre_h")->GetMutable(); + pre_t->mutable_data(make_ddim({20, 30}), platform::CPUPlace()); + auto tensor = scope->CreateVariable("rnn/h")->GetMutable(); + tensor->mutable_data(make_ddim({20, 30}), platform::CPUPlace()); + + // for unit test of ConcatOutputs + auto xg = scope->CreateVariable("rnn/x_grad")->GetMutable(); + xg->mutable_data(make_ddim({20, 30}), platform::CPUPlace()); + + step_scopes->push_back(scope); + } 
+
+    // last time step
+    auto g = (*step_scopes)[9]
+                 ->CreateVariable("rnn/h_pre_grad")
+                 ->GetMutable<Tensor>();
+    g->mutable_data<float>(make_ddim({20, 30}), platform::CPUPlace());
+  }
+
+  void CreateRNNGradientAlgorithm() {
+    std::unique_ptr<rnn::Argument> arg(new rnn::Argument());
+    arg->step_net = "step_net";
+    arg->step_scopes = "step_scopes";
+    rnn::Link inlink;
+    inlink.external = "h_grad";
+    inlink.internal = "rnn/h_grad";
+    arg->inlinks = std::vector<rnn::Link>{inlink};
+
+    rnn::Link outlink;
+    outlink.external = "x_grad";
+    outlink.internal = "rnn/x_grad";
+    arg->outlinks = std::vector<rnn::Link>{outlink};
+
+    rnn::MemoryAttr mem_attr;
+    mem_attr.pre_var = "rnn/h_pre_grad";
+    mem_attr.var = "rnn/h_grad";
+    mem_attr.boot_var = "h_boot_grad";
+    arg->memories = std::vector<rnn::MemoryAttr>{mem_attr};
+
+    rnn_grad_algo_.Init(std::move(arg));
+  }
+
+  void CreateStepNet() {
+    LOG(INFO) << "create variable step_net";
+    Variable* var = scope_->CreateVariable("step_net");
+    auto net = var->GetMutable<NetOp>();
+    net->AddOp(OpRegistry::CreateOp("mul",
+                                    {"rnn/h_pre", "rnn/w", "rnn/s_grad"},
+                                    {"rnn/h_pre_grad", "rnn/w_grad"},
+                                    {}));
+
+    net->AddOp(OpRegistry::CreateOp(
+        "add_two", {"rnn/h_grad"}, {"rnn/x_grad", "rnn/s_grad"}, {}));
+    net->CompleteAddOp();
+  }
+
+  void SegmentInputs() {
+    LOG(INFO) << "segment inputs";
+    std::vector<std::string> inlinks = {"x"};
+    std::vector<std::string> inlinks_alias = {"rnn/x"};
+
+    rnn::Link inlink;
+    inlink.external = "x";
+    inlink.internal = "rnn/x";
+    std::vector<std::shared_ptr<Scope>>* step_scopes =
+        scope_->GetVariable("step_scopes")
+            ->GetMutable<std::vector<std::shared_ptr<Scope>>>();
+    rnn::SegmentInputs(*step_scopes, std::vector<rnn::Link>{inlink}, 10);
+  }
+
+  void LinkMemories() {
+    LOG(INFO) << "link memories";
+    rnn::MemoryAttr mem_attr;
+    mem_attr.pre_var = "rnn/h_pre";
+    mem_attr.var = "rnn/h";
+    mem_attr.boot_var = "boot_h";
+    std::vector<rnn::MemoryAttr> memories;
+    memories.push_back(mem_attr);
+    std::vector<std::shared_ptr<Scope>>* step_scopes =
+        scope_->GetVariable("step_scopes")
+            ->GetMutable<std::vector<std::shared_ptr<Scope>>>();
+    for (int i = 1; i < 10; ++i) {
+      rnn::LinkMemories(*step_scopes, memories, i, -1);
+    }
+  }
+
+  std::shared_ptr<Scope> scope_;
+  RecurrentGradientAlgorithm rnn_grad_algo_;
+};
+
+// TEST_F(RecurrentGradientAlgorithmTest, Run) {
+//   platform::CPUDeviceContext ctx;
+//   rnn_grad_algo_.Run(scope_, ctx);
+// }
+
+}  // namespace operators
+}  // namespace paddle
+
+TEST(RecurrentOp, LinkMemories) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  using namespace paddle::operators;
+
+  // create and init step scopes
+  int len = 10;
+  std::vector<std::shared_ptr<Scope>> step_scopes;
+  for (int i = 0; i < len; ++i) {
+    auto scope = std::make_shared<Scope>();
+    scope->CreateVariable("pre_h");
+    auto tensor = scope->CreateVariable("h")->GetMutable<Tensor>();
+    float* data = tensor->mutable_data<float>(make_ddim({15, 20}), CPUPlace());
+    for (int j = 0; j < 15 * 20; ++j) {
+      data[j] = rand() * (1. / (double)RAND_MAX);
+    }
+    step_scopes.push_back(scope);
+  }
+
+  // create MemoryAttr
+  rnn::MemoryAttr mem_attr;
+  mem_attr.pre_var = "pre_h";
+  mem_attr.var = "h";
+  mem_attr.boot_var = "boot_h";
+  std::vector<rnn::MemoryAttr> memories;
+  memories.push_back(mem_attr);
+
+  for (int i = 1; i < len; ++i) {
+    rnn::LinkMemories(step_scopes, memories, i, -1);
+  }
+  // check
+  for (int i = 0; i < len - 1; ++i) {
+    const float* a =
+        step_scopes[i]->GetVariable("h")->GetMutable<Tensor>()->data<float>();
+    const float* b = step_scopes[i + 1]
+                         ->GetVariable("pre_h")
+                         ->GetMutable<Tensor>()
+                         ->data<float>();
+    for (size_t j = 0; j < 15 * 20; ++j) {
+      ASSERT_FLOAT_EQ(a[j], b[j]);
+    }
+  }
+
+  for (int i = len - 2; i >= 0; --i) {
+    rnn::LinkMemories(step_scopes, memories, i, 1);
+  }
+  // check
+  for (int i = len - 2; i >= 0; --i) {
+    const float* a = step_scopes[i]
+                         ->GetVariable("pre_h")
+                         ->GetMutable<Tensor>()
+                         ->data<float>();
+    const float* b = step_scopes[i + 1]
+                         ->GetVariable("h")
+                         ->GetMutable<Tensor>()
+                         ->data<float>();
+    for (size_t j = 0; j < 15 * 20; ++j) {
+      ASSERT_FLOAT_EQ(a[j], b[j]);
+    }
+  }
+}
+
+USE_OP(add_two);
+USE_OP(mul);
diff --git a/paddle/operators/rnn_design.md b/paddle/operators/rnn_design.md
new file mode 100644
index 0000000000000000000000000000000000000000..3d38b9a0ad225fd8e0c1bb037474b292b1887f5b
--- /dev/null
+++ b/paddle/operators/rnn_design.md
@@ -0,0 +1,239 @@
+# Design for variable-length input to RNNs
+For learning from variable-length sequences, the current mainstream frameworks such as TensorFlow, PyTorch, Caffe2 and MXNet all use padding,
+that is, the sequences of different lengths within a mini-batch are padded with zeros to a fixed length before taking part in the computation.
+
+The existing RNNs in Paddle, including `RecurrentLayerGroup`, already support variable-length sequences without padding. Based on the ideas of that module, this document designs the variable-length sequence support of the refactored framework.
+
+## Background
+Since a tensor must have a definite shape, the mainstream tensor-based frameworks have to use zero-padding to complete variable-length sequences into tensors of fixed shape when storing them.
+
+Padding is a compromise a framework makes to handle variable-length sequences; from the user's point of view, people naturally mind the padding when using RNN-like models,
+which is why there are long discussions in PyTorch about supporting variable-length sequences without padding [3].
+
+Since padding costs extra memory and computation, TensorFlow and MXNet both use bucketing as an optimization [1][2],
+but whether padding or bucketing, it is an extra burden on the user.
+
+Therefore, **native support for variable-length sequences directly satisfies the user's most immediate need, and can be counted as a major advantage of Paddle among the current mainstream platforms**.
+
+Supporting variable-length sequences, however, requires some changes to the current framework; the following discusses how to support them with minimal modification.
+
+## `LODTensor`, a data format for multi-level sequences
+Currently Paddle stores the data of a mini-batch in one-dimensional memory,
+and uses an extra `Argument.sequenceStartPositions` to store the boundaries of each sentence.
+
+Paddle uses `Argument.subSequenceStartPositions` to store two-level sequence information; higher-level sequences cannot be supported directly.
+
+To store `N-level` sequences, this document defines the sequence information as the following data structure:
+
+```c++
+std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
+```
+
+or, more explicitly,
+
+```c++
+typedef std::vector<int> level_t;
+std::vector<level_t> lod_start_pos;
+```
+
+Each `level_t` here stores the offsets of one granularity (level), consistent with what Paddle does today.
+
+To pass the sequence information around more transparently, we introduce a new kind of tensor called `LODTensor`[4];
+its tensor-related interfaces are all inherited directly from `Tensor`, with sequence-related interfaces added on top.
+Thus, when operating on a `LODTensor`, an ordinary `Op` treats it simply as a `Tensor`,
+while a sequence-aware `Op` additionally uses the variable-length-sequence interfaces of `LODTensor`.
+
+`LODTensor` is defined as follows:
+
+```c++
+class LODTensor : public Tensor {
+public:
+  size_t Levels() const { return seq_start_positions_.size(); }
+  size_t Elements(int level = 0) const {
+    return seq_start_positions_[level].size();
+  }
+  // slice of level[elem_begin: elem_end]
+  // NOTE low performance in slice seq_start_positions_.
+  // TODO should call Tensor's Slice.
+  LODTensor LODSlice(int level, int elem_begin, int elem_end) const;
+
+  // slice with tensor's data shared with this.
+  LODTensor LODSliceShared(int level, int elem_begin, int elem_end) const;
+
+  // copy other's lod_start_pos_, to share LOD info.
+  // NOTE the LOD info should not be changed.
+  void ShareConstLODFrom(const LODTensor &other) {
+    lod_start_pos_ = other.lod_start_pos_;
+  }
+  // copy other's lod_start_pos_'s content, free to mutate.
+  void ShareMutableLODFrom(const LODTensor &other) {
+    lod_start_pos_ = std::make_shared<std::vector<std::vector<int>>>(
+        other.lod_start_pos_->begin(), other.lod_start_pos_->end());
+  }
+
+private:
+  std::shared_ptr<std::vector<std::vector<int>>> lod_start_pos_;
+};
+```
+
+Here `lod_start_pos_` uses a `shared_ptr` to reduce the cost of storage and copying;
+`LODTensor` can be regarded as an extension of `Tensor` that is almost fully compatible with the original `Tensor`.
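
As a standalone illustration of the structure just defined (hypothetical numbers, no Paddle dependency), the sketch below builds a two-level `lod_start_pos` for a mini-batch and shows how sequence lengths fall out of adjacent offsets:

```c++
#include <cstddef>
#include <iostream>
#include <vector>

int main() {
  // Level 0 partitions 6 words into 3 sentences of 3, 1 and 2 words;
  // level 1 gives per-word offsets into the underlying data buffer.
  std::vector<std::vector<int>> lod_start_pos = {
      {0, 3, 4, 6},              // level 0: sentence boundaries over words
      {0, 2, 5, 7, 9, 12, 13},   // level 1: word boundaries over raw elements
  };
  // The length of sequence i at a level is the difference of two offsets.
  const auto& level0 = lod_start_pos[0];
  for (size_t i = 0; i + 1 < level0.size(); ++i) {
    std::cout << "sequence " << i << " has "
              << level0[i + 1] - level0[i] << " elements\n";
  }
  return 0;
}
```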
+
+## Framework support
+### Replacing the framework's existing `Tensor` calls with `LODTensor`
+To pass `LODTensor` around, many of the `Tensor`s in the framework need to become `LODTensor`s.
+A simple implementation is to **replace all existing `Tensor`s with `LODTensor` directly, which only requires changing the `Tensor`-creating interfaces in `pybind.cc`**.
+
+Besides, users may need to be aware of sequences (for example, visualizing a sequence requires parsing the sequences a model outputs), so some sequence-related APIs also need to be exposed to the Python layer.
+
+### Passing `lod_start_pos` along the Op call chain
+The framework needs to support the following features to pass `lod_start_pos` along:
+
+1. Pass it as a `shared_ptr`
+   - an Op that does not modify the content of `lod_start_pos` acts as a consumer
+   - an Op that modifies `lod_start_pos` acts as a producer
+   - by convention, a consumer only copies the `shared_ptr` passed to it
+   - a producer creates its own independent memory to store its modification, and exposes a `shared_ptr` to the subsequent consumers
+   - since passing is implemented by copying the `shared_ptr`, the framework only needs to pass `lod_start_pos` once
+
+2. Be transparent to Ops that are unaware of `lod_start_pos`
+3. A producer Op that needs to modify `lod_start_pos` can update its own `lod_start_pos` data at `Run` time
+
+The concrete design is split into the following three subsections.
+
+#### Passing `lod_start_pos`
+
+- when `lod_start_pos` does not need to be modified, call `ShareConstLODFrom` of LODTensor to implement the copy
+- when it does, call `ShareMutableLODFrom`, which allocates its own memory to store the modification
+
+#### Framework transparency
+The passing has to be added to the initialization that runs before the network runs, and only needs to happen once. A preliminary scheme based on the current framework design is:
+
+- add an attribute `do_mutate_lod_info` to the Op's `attrs`, defaulting to `false`
+  - an Op that needs to modify `lod_start_pos` sets it to `true` when defining its `OpProto`
+- `InferShape` of `OperatorBase` reads `do_mutate_lod_info` and calls the related `LODTensor` methods to copy `lod_start_pos`
+- add a member `is_lod_inited{false}` to `OperatorBase` to guarantee the passing happens only once
+
+The logic looks like:
+
+```c++
+class OperatorBase {
+public:
+  // ...
+  void InferShape() {
+    if (!is_lod_inited) {
+      bool do_mutate_lod_info = GetAttr<bool>("do_mutate_lod_info");
+      // find an input having LOD to copy
+      auto lod_input = ValidLODInput();
+      for (auto &output : outputs) {
+        if (do_mutate_lod_info) {
+          output.ShareMutableLODFrom(lod_input);
+        } else {
+          output.ShareConstLODFrom(lod_input);
+        }
+      }
+      is_lod_inited = true;
+    }
+
+    // call op's InferShape
+    // ...
+  }
+
+private:
+  // ...
+  bool is_lod_inited{false};
+};
+```
+
+This way, passing the `lod_start_pos` information is completely transparent to the implementation of non-LOD Ops.
+
+#### Updating `lod_start_pos`
+As introduced in the previous subsection, for an Op that needs to modify `lod_start_pos`, `OperatorBase` allocates a piece of its own memory to store the modification;
+the Op updates its own `lod_start_pos` inside its `Run` implementation,
+and all ops that depend on its outputs get the update automatically through the shared pointer.
+
+## Sorting by length
+After sorting by length, the batch sizes of successive time steps decrease naturally, so the steps can be fed directly into the Net for batched computation.
+
+For example, the original input:
+
+```
+origin:
+xxxx
+xx
+xxx
+
+-> sorted:
+xxxx
+xxx
+xx
+```
+
+After `SegmentInputs`, there will be 4 time steps, and the inputs of each time step are as follows (arranged vertically):
+
+```
+0 1 2 3
+x x x x
+x x x
+x x
+```
+
+To track where the sequences end up after sorting, we use
+```c++
+struct SortedSeqItem {
+  void *start{nullptr};
+  void *end{nullptr};
+};
+
+std::vector<SortedSeqItem> sorted_seqs;
+```
+to record the positions of the sequences after sorting, and add a new interface
+
+```c++
+std::vector<SortedSeqItem> SortBySeqLen(const LODTensor& tensor);
+```
+
+Because the order of the input sequences changes, the following existing interfaces need corresponding modifications:
+
+- InitMemories: memories need to be rearranged according to `sorted_seqs`
+- SegmentInputs
+- ConcatOutputs
+
+Besides, since `sorted_seqs` needs to be reused by `RecurrentGradientOp`, it becomes a new output of `RecurrentOp` and is later passed in as an input of `RecurrentGradientOp`.
+
+## InitMemories
+Because the order of the sequences changes, the order of the elements in the batch of `boot_memories` also needs to be rearranged accordingly.
+
+## SegmentInputs
+`SegmentInputs` relies on the information in `sorted_seqs`; it slices the original sequences horizontally, following the sorted order, into the inputs of each step.
+
+That is, the transformation below:
+```
+origin:
+xxxx
+xx
+xxx
+
+   |
+   |
+  \ /
+   !
+0 1 2 3
+x x x x
+x x x
+x x
+```
+## ConcatOutputs
+`ConcatOutputs` needs to
+
+- restore the outputs of every time step to the order of the original input sequences (to prevent the order from being shuffled during inference)
+- concatenate every sequence into a regular mini-batch representation
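
A standalone sketch of the index bookkeeping this section implies (hypothetical names; the real `SortedSeqItem` stores raw pointers, plain indices are used here): sort positions by descending length, then keep the inverse permutation that `ConcatOutputs`-style code can use to restore the user's original order:

```c++
#include <algorithm>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> seq_len = {4, 2, 3};  // lengths in the user's order
  std::vector<size_t> order(seq_len.size());
  std::iota(order.begin(), order.end(), 0);
  // Sort indices by descending length so per-step batch sizes shrink
  // monotonically (4, 3, 2 here).
  std::sort(order.begin(), order.end(),
            [&](size_t a, size_t b) { return seq_len[a] > seq_len[b]; });
  // inverse[i] tells where original sequence i ended up after sorting;
  // restoring the original order is a gather through this table.
  std::vector<size_t> inverse(order.size());
  for (size_t pos = 0; pos < order.size(); ++pos) inverse[order[pos]] = pos;
  for (size_t i = 0; i < inverse.size(); ++i)
    std::cout << "original seq " << i << " -> sorted slot " << inverse[i] << "\n";
  return 0;
}
```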
+
+## References
+1. [Tensorflow Bucketing](https://www.tensorflow.org/versions/r0.12/api_docs/python/contrib.training/bucketing)
+2. [mxnet Bucketing](http://mxnet.io/how_to/bucketing.html)
+3. [variable length input in RNN scenario](https://discuss.pytorch.org/t/about-the-variable-length-input-in-rnn-scenario/345/5)
+4. [Level of details](https://en.wikipedia.org/wiki/Level_of_detail)
diff --git a/paddle/operators/rowwise_add_op.cc b/paddle/operators/rowwise_add_op.cc
index e04d69fa72a2f54cc1cc0829d12e0da1609b3383..4129422fa744b2a7cf135b681efa73ffb2ebcdcc 100644
--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
@@ -13,15 +13,13 @@ limitations under the License. */
 
 #include "paddle/operators/rowwise_add_op.h"
-#include "paddle/framework/op_registry.h"
+
 namespace paddle {
 namespace operators {
 
-class RowWiseAddOp : public framework::OperatorWithKernel {
+class RowWiseAddOp : public OperatorWithKernel {
 protected:
-  void InferShape(
-      const std::vector<const framework::Tensor *> &inputs,
-      const std::vector<framework::Tensor *> &outputs) const override {
+  void InferShape(const std::vector<const Tensor *> &inputs,
+                  const std::vector<Tensor *> &outputs) const override {
     PADDLE_ENFORCE(inputs.size() == 2UL, "Two inputs are needed by rowwise add");
     auto dim0 = inputs[0]->dims();
     auto dim1 = inputs[1]->dims();
@@ -34,11 +32,10 @@ protected:
   }
 };
 
-class RowWiseAddOpMaker : public framework::OpProtoAndCheckerMaker {
+class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
 public:
-  RowWiseAddOpMaker(framework::OpProto *proto,
-                    framework::OpAttrChecker *op_checker)
-      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+  RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "The left input of row-wise add op, must be matrix");
     AddInput("b", "The right input of row-wise add op, must be vector");
     AddOutput("Out", "The output of row-wise add op");
@@ -53,9 +50,6 @@ for i in xrange(X.shape[0]):
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP(rowwise_add,
-            paddle::operators::RowWiseAddOp,
-            paddle::operators::RowWiseAddOpMaker);
-REGISTER_OP_CPU_KERNEL(
-    rowwise_add,
-    paddle::operators::RowWiseAddKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP(rowwise_add, ops::RowWiseAddOp, ops::RowWiseAddOpMaker);
+REGISTER_OP_CPU_KERNEL(rowwise_add,
+                       ops::RowWiseAddKernel<ops::CPUPlace, float>);
diff --git a/paddle/operators/rowwise_add_op.cu b/paddle/operators/rowwise_add_op.cu
index 5dfac4fd2cf9b7da24dcfa5e7583b9ece12bad1e..4b33e38ebabe853e179fe70ef7fde0a80b9050e2 100644
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
@@ -1,6 +1,4 @@
-#include "paddle/framework/op_registry.h"
 #include "paddle/operators/rowwise_add_op.h"
 
-REGISTER_OP_GPU_KERNEL(
-    rowwise_add,
-    paddle::operators::RowWiseAddKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(rowwise_add,
+                       ops::RowWiseAddKernel<ops::GPUPlace, float>);
diff --git a/paddle/operators/rowwise_add_op.h b/paddle/operators/rowwise_add_op.h
index dc47fe7c847bd0c8c179ac0a5f44b8cc541b47cb..4596925e9322f373c822608fd9aa6ecee6144d4c 100644
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
@@ -13,25 +13,23 @@ limitations under the License. */
*/ #pragma once -#include "glog/logging.h" -#include "paddle/framework/eigen.h" -#include "paddle/framework/operator.h" +#include "paddle/operators/type_alias.h" namespace paddle { namespace operators { template -class RowWiseAddKernel : public framework::OpKernel { +class RowWiseAddKernel : public OpKernel { public: - void Compute(const framework::KernelContext& context) const override { - auto in0 = context.Input(0)->Get(); - auto in1 = context.Input(1)->Get(); - auto* out = context.Output(0)->GetMutable(); + void Compute(const KernelContext& context) const override { + auto in0 = context.Input(0)->Get(); + auto in1 = context.Input(1)->Get(); + auto* out = context.Output(0)->GetMutable(); out->mutable_data(context.GetPlace()); - auto input = framework::EigenMatrix::From(in0); - auto bias = framework::EigenVector::From(in1); - auto output = framework::EigenMatrix::From(*out); + auto input = EigenMatrix::From(in0); + auto bias = EigenVector::From(in1); + auto output = EigenMatrix::From(*out); const int bias_size = bias.dimension(0); const int rest_size = input.size() / bias_size; diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc index 66ab1e001142bfb005d3c2e2ea29e01a32dce507..f6c654a9e7083704e353c276e0abc975f4e61ef9 100644 --- a/paddle/operators/sgd_op.cc +++ b/paddle/operators/sgd_op.cc @@ -13,17 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/operators/sgd_op.h" -#include "paddle/framework/op_registry.h" -#include "paddle/framework/tensor.h" namespace paddle { namespace operators { -class SGDOp : public framework::OperatorWithKernel { +class SGDOp : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override { + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override { PADDLE_ENFORCE(inputs.size() == 2, "Input size of SGDOp must be two"); PADDLE_ENFORCE(outputs.size() == 1, "Output size of SGDOp must be one"); PADDLE_ENFORCE(inputs[0] != nullptr, "inputs[0] mast be set"); @@ -35,10 +32,10 @@ protected: } }; -class SGDOpMaker : public framework::OpProtoAndCheckerMaker { +class SGDOpMaker : public OpProtoAndCheckerMaker { public: - SGDOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { + SGDOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("param", "input parameter"); AddInput("grad", "input gradient"); AddOutput("param_out", "output parameter"); @@ -55,7 +52,5 @@ param_out = param - learning_rate * grad; } // namespace operators } // namespace paddle -REGISTER_OP(sgd, paddle::operators::SGDOp, paddle::operators::SGDOpMaker); -typedef paddle::operators::SGDOpKernel<::paddle::platform::CPUPlace, float> - SGDOpKernel_CPU_float; -REGISTER_OP_CPU_KERNEL(sgd, SGDOpKernel_CPU_float); +REGISTER_OP(sgd, ops::SGDOp, ops::SGDOpMaker); +REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel); diff --git a/paddle/operators/sgd_op.cu b/paddle/operators/sgd_op.cu index 400425db10896e3970fc7468e34aba596a536184..f8f5b90cab460b4457cfb0a88bfc012bafe0fbc2 100644 --- a/paddle/operators/sgd_op.cu +++ b/paddle/operators/sgd_op.cu @@ -1,5 +1,3 @@ #include "paddle/operators/sgd_op.h" -#include "paddle/framework/op_registry.h" -typedef paddle::operators::SGDOpKernel<::paddle::platform::GPUPlace, float> SGDOpKernel_GPU_float; -REGISTER_OP_GPU_KERNEL(sgd, SGDOpKernel_GPU_float); \ No 
newline at end of file +REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel); \ No newline at end of file diff --git a/paddle/operators/sgd_op.h b/paddle/operators/sgd_op.h index 4b2d214618e5c7c15695bd66604139d805255c47..65179d323bd991b8b4e196c069a11cd901c62082 100644 --- a/paddle/operators/sgd_op.h +++ b/paddle/operators/sgd_op.h @@ -13,28 +13,24 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include "glog/logging.h" -#include "paddle/framework/eigen.h" -#include "paddle/framework/operator.h" +#include "paddle/operators/type_alias.h" namespace paddle { namespace operators { template -class SGDOpKernel : public framework::OpKernel { +class SGDOpKernel : public OpKernel { public: - void Compute(const framework::KernelContext& ctx) const override { - auto param = ctx.Input("param")->Get(); - auto grad = ctx.Input("grad")->Get(); - auto* param_out = ctx.Output(0)->GetMutable(); + void Compute(const KernelContext& ctx) const override { + auto param = ctx.Input("param")->Get(); + auto grad = ctx.Input("grad")->Get(); + auto* param_out = ctx.Output(0)->GetMutable(); float lr = ctx.op_.GetAttr("learning_rate"); param_out->mutable_data(ctx.GetPlace()); - framework::EigenVector::Flatten(*param_out) - .device(*(ctx.GetEigenDevice())) = - framework::EigenVector::Flatten(param) - - lr * framework::EigenVector::Flatten(grad); + EigenVector::Flatten(*param_out).device(*(ctx.GetEigenDevice())) = + EigenVector::Flatten(param) - lr * EigenVector::Flatten(grad); } }; diff --git a/paddle/operators/sigmoid_op.cc b/paddle/operators/sigmoid_op.cc index bf63af28b003daad0ab8c223e71a561437ee663a..716f1d9c4dbc45e2d5569f8d634b06fd988a149c 100644 --- a/paddle/operators/sigmoid_op.cc +++ b/paddle/operators/sigmoid_op.cc @@ -13,37 +13,33 @@ limitations under the License. 
*/ #include "paddle/operators/sigmoid_op.h" -#include "paddle/framework/op_registry.h" namespace paddle { namespace operators { -class SigmoidOp : public framework::OperatorWithKernel { +class SigmoidOp : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override { + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override { PADDLE_ENFORCE(inputs.size() == 1, "Sigmoid Op only have one input"); PADDLE_ENFORCE(outputs.size() == 1, "Sigmoid Op only have one output"); outputs[0]->Resize(inputs[0]->dims()); } }; -class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker { +class SigmoidOpMaker : public OpProtoAndCheckerMaker { public: - SigmoidOpMaker(framework::OpProto *proto, - framework::OpAttrChecker *op_checker) - : framework::OpProtoAndCheckerMaker(proto, op_checker) { + SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "sigmoid input"); AddOutput("Y", "sigmoid output"); AddComment("Sigmoid function"); } }; -class SigmoidOpGrad : public framework::OperatorWithKernel { +class SigmoidOpGrad : public OperatorWithKernel { protected: - void InferShape( - const std::vector &inputs, - const std::vector &outputs) const override {} + void InferShape(const std::vector &inputs, + const std::vector &outputs) const override {} std::string DebugString() const override { LOG(INFO) << "SigmoidGrad"; return ""; @@ -53,11 +49,7 @@ protected: } // namespace operators } // namespace paddle -REGISTER_OP(sigmoid, - paddle::operators::SigmoidOp, - paddle::operators::SigmoidOpMaker); -REGISTER_GRADIENT_OP(sigmoid, sigmoid_grad, paddle::operators::SigmoidOpGrad); +REGISTER_OP(sigmoid, ops::SigmoidOp, ops::SigmoidOpMaker); +REGISTER_GRADIENT_OP(sigmoid, sigmoid_grad, ops::SigmoidOpGrad); -REGISTER_OP_CPU_KERNEL( - sigmoid, - paddle::operators::SigmoidKernel); +REGISTER_OP_CPU_KERNEL(sigmoid, ops::SigmoidKernel); diff --git a/paddle/operators/sigmoid_op.cu b/paddle/operators/sigmoid_op.cu index ed344b2bfd4a9eeef2ce79746bec608469503c9c..f679b20418f04eff4310efe4e121963ce5a235e0 100644 --- a/paddle/operators/sigmoid_op.cu +++ b/paddle/operators/sigmoid_op.cu @@ -1,5 +1,3 @@ #include "paddle/operators/sigmoid_op.h" -#include "paddle/framework/op_registry.h" -REGISTER_OP_GPU_KERNEL( - sigmoid, paddle::operators::SigmoidKernel); +REGISTER_OP_GPU_KERNEL(sigmoid, ops::SigmoidKernel); diff --git a/paddle/operators/sigmoid_op.h b/paddle/operators/sigmoid_op.h index 2b9356246c471853b53af1d73f8b2a3c206db7ad..896a6f5d83e0f96de50e3aaae6f545172bf5da14 100644 --- a/paddle/operators/sigmoid_op.h +++ b/paddle/operators/sigmoid_op.h @@ -14,25 +14,23 @@ #pragma once -#include "glog/logging.h" -#include "paddle/framework/eigen.h" -#include "paddle/framework/operator.h" +#include "paddle/operators/type_alias.h" namespace paddle { namespace operators { template -class SigmoidKernel : public framework::OpKernel { +class SigmoidKernel : public OpKernel { public: - void Compute(const framework::KernelContext& context) const override { - auto input = context.Input(0)->Get(); - auto* output = context.Output(0)->GetMutable(); + void Compute(const KernelContext& context) const override { + auto input = context.Input(0)->Get(); + auto* output = context.Output(0)->GetMutable(); output->mutable_data(context.GetPlace()); - framework::EigenVector::Flatten(*output).device( + EigenVector::Flatten(*output).device( *(context.GetEigenDevice())) = - 1.0 / (1.0 + (-1.0 * 
+        1.0 / (1.0 + (-1.0 * EigenVector<T>::Flatten(input)).exp());
   }
 };
 }  // namespace operators
diff --git a/paddle/operators/softmax_op.cc b/paddle/operators/softmax_op.cc
index 82f72fa19f690bebdff01629e75d17eecd6ada74..df60b62fa6ac8d67c9dadc40ec49aaedab92bc88 100644
--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -12,16 +12,14 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/operators/softmax_op.h"
-#include "paddle/framework/op_registry.h"
 
 namespace paddle {
 namespace operators {
 
-class SoftmaxOp : public framework::OperatorWithKernel {
+class SoftmaxOp : public OperatorWithKernel {
 protected:
-  void InferShape(
-      const std::vector<const framework::Tensor *> &inputs,
-      const std::vector<framework::Tensor *> &outputs) const override {
+  void InferShape(const std::vector<const Tensor *> &inputs,
+                  const std::vector<Tensor *> &outputs) const override {
     PADDLE_ENFORCE(inputs.size() == 1, "Only one input is needed for softmax");
     PADDLE_ENFORCE(inputs[0]->dims().size() == 2,
                    "The input of softmax op must be a matrix");
@@ -31,10 +29,9 @@ protected:
   }
 };
 
-class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
+class SoftmaxOpMaker : public OpProtoAndCheckerMaker {
 public:
-  SoftmaxOpMaker(framework::OpProto *proto,
-                 framework::OpAttrChecker *op_checker)
+  SoftmaxOpMaker(OpProto *proto, OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "input of softmax");
     AddOutput("Y", "output of softmax");
@@ -42,11 +39,10 @@ public:
   }
 };
 
-class SoftmaxOpGrad : public framework::OperatorWithKernel {
+class SoftmaxOpGrad : public OperatorWithKernel {
 protected:
-  void InferShape(
-      const std::vector<const framework::Tensor *> &inputs,
-      const std::vector<framework::Tensor *> &outputs) const override {}
+  void InferShape(const std::vector<const Tensor *> &inputs,
+                  const std::vector<Tensor *> &outputs) const override {}
   std::string DebugString() const override {
     LOG(INFO) << "SoftmaxOpGrad";
     return "";
@@ -56,9 +52,6 @@ protected:
 }  // namespace operators
 }  // namespace paddle
 
-namespace ops = paddle::operators;
-
 REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker);
-REGISTER_GRADIENT_OP(softmax, softmax_grad, paddle::operators::SoftmaxOpGrad);
-REGISTER_OP_CPU_KERNEL(softmax,
-                       ops::SoftmaxKernel<paddle::platform::CPUPlace, float>);
+REGISTER_GRADIENT_OP(softmax, softmax_grad, ops::SoftmaxOpGrad);
+REGISTER_OP_CPU_KERNEL(softmax, ops::SoftmaxKernel<ops::CPUPlace, float>);
diff --git a/paddle/operators/softmax_op.cu b/paddle/operators/softmax_op.cu
index 60676191eb9460868a266d0e4f70357fa78bec2c..a1f6944a369fe5148ffcfeabf3bf7063dcbc2664 100644
--- a/paddle/operators/softmax_op.cu
+++ b/paddle/operators/softmax_op.cu
@@ -1,5 +1,4 @@
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/softmax_op.h"
 
-REGISTER_OP_GPU_KERNEL(
-    softmax, paddle::operators::SoftmaxKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(softmax, ops::SoftmaxKernel<ops::GPUPlace, float>);
diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h
index 500c188dbfcf28ae52c2d5b06466539e115acc4a..625a87b58560231572c1cca2a21bd0c47c8cb296 100644
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
@@ -14,23 +14,21 @@
 
 #pragma once
 
-#include "glog/logging.h"
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/operator.h"
+#include "paddle/operators/type_alias.h"
 
 namespace paddle {
 namespace operators {
 
 template <typename Place, typename T>
-class SoftmaxKernel : public framework::OpKernel {
+class SoftmaxKernel : public OpKernel {
 public:
-  void Compute(const framework::KernelContext& context) const override {
-    auto input = context.Input(0)->Get<framework::Tensor>();
-    auto*
diff --git a/paddle/operators/softmax_op.h b/paddle/operators/softmax_op.h
index 500c188dbfcf28ae52c2d5b06466539e115acc4a..625a87b58560231572c1cca2a21bd0c47c8cb296 100644
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
@@ -14,23 +14,21 @@

 #pragma once

-#include "glog/logging.h"
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/operator.h"
+#include "paddle/operators/type_alias.h"

 namespace paddle {
 namespace operators {

 template <typename Place, typename T>
-class SoftmaxKernel : public framework::OpKernel {
+class SoftmaxKernel : public OpKernel {
 public:
-  void Compute(const framework::KernelContext& context) const override {
-    auto input = context.Input(0)->Get<framework::Tensor>();
-    auto* output = context.Output(0)->GetMutable<framework::Tensor>();
+  void Compute(const KernelContext& context) const override {
+    auto input = context.Input(0)->Get<Tensor>();
+    auto* output = context.Output(0)->GetMutable<Tensor>();
     output->mutable_data<T>(context.GetPlace());

-    auto logits = framework::EigenMatrix<T>::From(input);
-    auto softmax = framework::EigenMatrix<T>::From(*output);
+    auto logits = EigenMatrix<T>::From(input);
+    auto softmax = EigenMatrix<T>::From(*output);

     const int kBatchDim = 0;
     const int kClassDim = 1;
diff --git a/paddle/operators/type_alias.h b/paddle/operators/type_alias.h
new file mode 100644
index 0000000000000000000000000000000000000000..b712e457ff60e8b30b87c0d549693d53e9f05d59
--- /dev/null
+++ b/paddle/operators/type_alias.h
@@ -0,0 +1,51 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/net.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using OpKernel = framework::OpKernel;
+using KernelContext = framework::KernelContext;
+template <typename T,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+template <typename T,
+          size_t D,
+          int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
+using Tensor = framework::Tensor;
+using OperatorWithKernel = framework::OperatorWithKernel;
+using OpProtoAndCheckerMaker = framework::OpProtoAndCheckerMaker;
+using OpProto = framework::OpProto;
+using OpAttrChecker = framework::OpAttrChecker;
+using CPUPlace = platform::CPUPlace;
+using GPUPlace = platform::GPUPlace;
+using NetOp = framework::NetOp;
+using OpRegistry = framework::OpRegistry;
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
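type_alias.h pulls the framework and platform names every operator needs into the ops namespace. A hedged sketch of what a kernel written against these aliases looks like; ReluKernel is hypothetical (not part of this diff) and only mirrors the structure of the SigmoidKernel above:

#include "paddle/operators/type_alias.h"

namespace paddle {
namespace operators {

// With the aliases, framework::OpKernel, framework::Tensor,
// framework::KernelContext etc. lose their namespace prefixes.
template <typename Place, typename T>
class ReluKernel : public OpKernel {
 public:
  void Compute(const KernelContext& context) const override {
    auto input = context.Input(0)->Get<Tensor>();
    auto* output = context.Output(0)->GetMutable<Tensor>();
    output->mutable_data<T>(context.GetPlace());

    auto x = EigenVector<T>::Flatten(input);
    // relu(x) = max(x, 0), evaluated on the context's Eigen device.
    EigenVector<T>::Flatten(*output).device(
        *(context.GetEigenDevice<Place>())) = x.cwiseMax(x.constant(0));
  }
};

}  // namespace operators
}  // namespace paddle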
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
index 9c1d94e9e703caf2db92ca4a8eac975317e6b945..f80c36b5b23b4e8bac3bee52d5492ffaa43778d0 100644
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -20,12 +20,104 @@ Eigen::DefaultDevice* DeviceContext::get_eigen_device<Eigen::DefaultDevice>()
   return reinterpret_cast<const CPUDeviceContext*>(this)->eigen_device();
 }

+CPUDeviceContext::CPUDeviceContext() {
+  random_seed_ = std::chrono::system_clock::now().time_since_epoch().count();
+  eigen_device_.reset(new Eigen::DefaultDevice());
+}
+
+CPUDeviceContext::CPUDeviceContext(CPUPlace place) {
+  random_seed_ = std::chrono::system_clock::now().time_since_epoch().count();
+  eigen_device_.reset(new Eigen::DefaultDevice());
+}
+
+Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
+  return eigen_device_.get();
+}
+
+Place CPUDeviceContext::GetPlace() const { return CPUPlace(); }
+
 #ifndef PADDLE_ONLY_CPU
+
 template <>
 Eigen::GpuDevice* DeviceContext::get_eigen_device<Eigen::GpuDevice>() const {
   return reinterpret_cast<const CUDADeviceContext*>(this)->eigen_device();
 }
-#endif
+
+CUDADeviceContext::CUDADeviceContext(GPUPlace place) : place_(place) {
+  random_seed_ = std::chrono::system_clock::now().time_since_epoch().count();
+  SetDeviceId(place_.device);
+  // TODO(qijun) Passing a created cuda stream to Eigen::CudaStreamDevice
+  // directly here causes a segmentation fault. We must implement a class
+  // derived from Eigen::StreamInterface, and reinitialize it with a cuda
+  // stream and a gpu id later. Please refer to the implementation of class
+  // EigenCudaStreamDevice in TensorFlow.
+  //
+  // We find that CUDA 7 introduces a new option, the per-thread default stream,
+  // that has two effects. Please refer to https://devblogs.nvidia.com/
+  // parallelforall/gpu-pro-tip-cuda-7-streams-simplify-concurrency/
+  //
+  // So we decided to use the default stream and add the --default-stream
+  // per-thread nvcc flag. Then, two threads with two CUDADeviceContexts will
+  // run in parallel.
+  eigen_stream_.reset(new Eigen::CudaStreamDevice());
+  eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
+}
+
+CUDADeviceContext::~CUDADeviceContext() {
+  SetDeviceId(place_.device);
+  Wait();
+  if (cublas_handle_) {
+    PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_));
+  }
+
+  if (cudnn_handle_) {
+    PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
+  }
+
+  if (curand_generator_) {
+    PADDLE_ENFORCE(dynload::curandDestroyGenerator(curand_generator_));
+  }
+  eigen_stream_.reset();
+  eigen_device_.reset();
+}
+
+Place CUDADeviceContext::GetPlace() const { return place_; }
+
+void CUDADeviceContext::Wait() const {
+  PADDLE_ENFORCE(cudaStreamSynchronize(0));
+}
+
+Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
+  return eigen_device_.get();
+}
+
+cublasHandle_t CUDADeviceContext::cublas_handle() {
+  if (!cublas_handle_) {
+    SetDeviceId(place_.device);
+    PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_));
+  }
+  return cublas_handle_;
+}
+
+cudnnHandle_t CUDADeviceContext::cudnn_handle() {
+  if (!cudnn_handle_) {
+    SetDeviceId(place_.device);
+    PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
+  }
+  return cudnn_handle_;
+}
+
+curandGenerator_t CUDADeviceContext::curand_generator() {
+  if (!curand_generator_) {
+    SetDeviceId(place_.device);
+    PADDLE_ENFORCE(dynload::curandCreateGenerator(&curand_generator_,
+                                                  CURAND_RNG_PSEUDO_DEFAULT));
+    PADDLE_ENFORCE(dynload::curandSetPseudoRandomGeneratorSeed(
+        curand_generator_, random_seed_));
+  }
+  return curand_generator_;
+}
+
+#endif  // PADDLE_ONLY_CPU
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index 239c25a90c4f98f72fdfb3dff9dfb011e316b101..f5182707611318669b98e6f7b49344604bf7ab4a 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -41,17 +41,13 @@ class DeviceContext {
 class CPUDeviceContext : public DeviceContext {
  public:
   typedef std::mt19937 random_generator_type;
-  CPUDeviceContext() {
-    random_seed_ = std::chrono::system_clock::now().time_since_epoch().count();
-    eigen_device_.reset(new Eigen::DefaultDevice());
-  }
+  CPUDeviceContext();
+  CPUDeviceContext(CPUPlace);
+  virtual ~CPUDeviceContext() {}

-  Eigen::DefaultDevice* eigen_device() const { return eigen_device_.get(); }
+  Eigen::DefaultDevice* eigen_device() const;

-  Place GetPlace() const override {
-    Place retv = CPUPlace();
-    return retv;
-  }
+  Place GetPlace() const override;

   random_generator_type& RandGenerator() {
     if (!rand_generator_) {
@@ -68,122 +64,45 @@ class CPUDeviceContext : public DeviceContext {

 #ifndef PADDLE_ONLY_CPU

-class GPUPlaceGuard {
- public:
-  explicit GPUPlaceGuard(GPUPlace new_place) : previous_(GetCurrentDeviceId()) {
-    if (previous_ != new_place) {
-      paddle::platform::SetDeviceId(new_place.device);
-    }
-  }
-
-  ~GPUPlaceGuard() 
{ paddle::platform::SetDeviceId(previous_.device); } - - private: - GPUPlace previous_; -}; - class CUDADeviceContext : public DeviceContext { public: - CUDADeviceContext() { - random_seed_ = std::chrono::system_clock::now().time_since_epoch().count(); - } - explicit CUDADeviceContext(const GPUPlace gpu_place) : gpu_place_(gpu_place) { - GPUPlaceGuard guard(gpu_place_); - PADDLE_ENFORCE(cudaStreamCreate(&stream_), "cudaStreamCreate failed"); - eigen_stream_.reset(new Eigen::CudaStreamDevice(&stream_)); - eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); - } + explicit CUDADeviceContext(GPUPlace); + virtual ~CUDADeviceContext(); - Place GetPlace() const override { - Place retv = GPUPlace(); - return retv; - } + /*! \brief Wait for all operations completion in the stream. */ + void Wait() const; - void Wait() { - PADDLE_ENFORCE(cudaStreamSynchronize(stream_), - "cudaStreamSynchronize failed"); - } + /*! \brief Return place in the device context. */ + Place GetPlace() const override; - curandGenerator_t RandGenerator() { - if (!rand_generator_) { - GPUPlaceGuard guard(gpu_place_); - PADDLE_ENFORCE(paddle::platform::dynload::curandCreateGenerator( - &rand_generator_, CURAND_RNG_PSEUDO_DEFAULT), - "curandCreateGenerator failed"); - PADDLE_ENFORCE( - paddle::platform::dynload::curandSetPseudoRandomGeneratorSeed( - rand_generator_, random_seed_), - "curandSetPseudoRandomGeneratorSeed failed"); - PADDLE_ENFORCE( - paddle::platform::dynload::curandSetStream(rand_generator_, stream_), - "curandSetStream failed"); - } - return rand_generator_; - } + /*! \brief Return eigen device in the device context. */ + Eigen::GpuDevice* eigen_device() const; - cudaStream_t stream() { return stream_; } + // clang-format off + /*! \brief Return cublas handle in the device context. */ + cublasHandle_t cublas_handle (); - Eigen::GpuDevice* eigen_device() const { return eigen_device_.get(); } + /*! \brief Return cudnn handle in the device context. */ + cudnnHandle_t cudnn_handle (); - cublasHandle_t cublas_handle() { - if (!blas_handle_) { - GPUPlaceGuard guard(gpu_place_); - PADDLE_ENFORCE(paddle::platform::dynload::cublasCreate(&blas_handle_), - "cublasCreate failed"); - PADDLE_ENFORCE( - paddle::platform::dynload::cublasSetStream(blas_handle_, stream_), - "cublasSetStream failed"); - } - return blas_handle_; - } - - cudnnHandle_t cudnn_handle() { - if (!dnn_handle_) { - GPUPlaceGuard guard(gpu_place_); - PADDLE_ENFORCE(paddle::platform::dynload::cudnnCreate(&dnn_handle_), - "cudnnCreate failed"); - PADDLE_ENFORCE( - paddle::platform::dynload::cudnnSetStream(dnn_handle_, stream_), - "cudnnSetStream failed"); - } - return dnn_handle_; - } - - ~CUDADeviceContext() { - Wait(); - if (blas_handle_) { - PADDLE_ENFORCE(paddle::platform::dynload::cublasDestroy(blas_handle_), - "cublasDestroy failed"); - } - - if (dnn_handle_) { - PADDLE_ENFORCE(paddle::platform::dynload::cudnnDestroy(dnn_handle_), - "cudnnDestroy failed"); - } - - if (rand_generator_) { - PADDLE_ENFORCE( - paddle::platform::dynload::curandDestroyGenerator(rand_generator_), - "curandDestroyGenerator failed"); - } - eigen_stream_.reset(); - eigen_device_.reset(); - PADDLE_ENFORCE(cudaStreamDestroy(stream_), "cudaStreamDestroy failed"); - } + /*! \brief Return curand handle in the device context. 
*/ + curandGenerator_t curand_generator(); + // clang-format on private: - GPUPlace gpu_place_; - cudaStream_t stream_; + GPUPlace place_; - std::unique_ptr eigen_stream_; + private: std::unique_ptr eigen_device_; + std::unique_ptr eigen_stream_; - cublasHandle_t blas_handle_{nullptr}; - - cudnnHandle_t dnn_handle_{nullptr}; - + private: unsigned random_seed_; - curandGenerator_t rand_generator_{nullptr}; + // clang-format off + cudnnHandle_t cudnn_handle_ = nullptr; + cublasHandle_t cublas_handle_ = nullptr; + curandGenerator_t curand_generator_ = nullptr; + // clang-format on }; #endif diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h index b06ab8a2f184e7bb7dd9cb39f377b087c5258dc4..fd4adbd9deca12ad6c3a59cfd5d30fb0cb6fcf98 100644 --- a/paddle/platform/enforce.h +++ b/paddle/platform/enforce.h @@ -36,6 +36,21 @@ limitations under the License. */ namespace paddle { namespace platform { +struct EnforceNotMet : public std::exception { + std::exception_ptr exp_; + std::string err_str_; + + EnforceNotMet(std::exception_ptr e, const char* f, int l) : exp_(e) { + try { + std::rethrow_exception(exp_); + } catch (const std::exception& exp) { + err_str_ = string::Sprintf("%s at [%s:%d]", exp.what(), f, l); + } + } + + const char* what() const noexcept { return err_str_.c_str(); } +}; + // Because most enforce conditions would evaluate to true, we can use // __builtin_expect to instruct the C++ compiler to generate code that // always forces branch prediction of true. @@ -43,18 +58,11 @@ namespace platform { // For more details, please check https://stackoverflow.com/a/43870188/724872. #define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) -template -inline void throw_on_error(T e) { - throw_on_error(e, ""); -} - template inline typename std::enable_if::type throw_on_error( int stat, const Args&... args) { if (UNLIKELY(!(stat))) { - throw std::runtime_error( - string::Sprintf(args...) + - string::Sprintf(" at [%s:%s];", __FILE__, __LINE__)); + throw std::runtime_error(string::Sprintf(args...)); } } @@ -64,12 +72,8 @@ template inline typename std::enable_if::type throw_on_error( cudaError_t e, const Args&... args) { if (UNLIKELY(e)) { - // clang-format off - throw thrust::system_error( - e, thrust::cuda_category(), - string::Sprintf(args...) + - string::Sprintf(" at [%s:%s];", __FILE__, __LINE__)); - // clang-format on + throw thrust::system_error(e, thrust::cuda_category(), + string::Sprintf(args...)); } } @@ -77,12 +81,8 @@ template inline typename std::enable_if::type throw_on_error( curandStatus_t stat, const Args&... args) { if (stat != CURAND_STATUS_SUCCESS) { - // clang-format off - throw thrust::system_error( - cudaErrorLaunchFailure, thrust::cuda_category(), - string::Sprintf(args...) + - string::Sprintf(" at [%s:%s];", __FILE__, __LINE__)); - // clang-format on + throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), + string::Sprintf(args...)); } } @@ -92,12 +92,8 @@ inline typename std::enable_if::type throw_on_error( if (stat == CUDNN_STATUS_SUCCESS) { return; } else { - // clang-format off - throw std::runtime_error( - platform::dynload::cudnnGetErrorString(stat) + - string::Sprintf(args...) 
+ - string::Sprintf(" at [%s:%s];", __FILE__, __LINE__)); - // clang-format on + throw std::runtime_error(platform::dynload::cudnnGetErrorString(stat) + + string::Sprintf(args...)); } } @@ -126,22 +122,32 @@ inline typename std::enable_if::type throw_on_error( } else if (stat == CUBLAS_STATUS_LICENSE_ERROR) { err = "CUBLAS: license error, "; } - throw std::runtime_error(err + string::Sprintf(args...) + - string::Sprintf(" at [%s:%s];", __FILE__, __LINE__)); + throw std::runtime_error(err + string::Sprintf(args...)); } #endif // PADDLE_ONLY_CPU -#define PADDLE_THROW(...) \ - do { \ - throw std::runtime_error( \ - string::Sprintf(__VA_ARGS__) + \ - string::Sprintf(" at [%s:%s];", __FILE__, __LINE__)); \ +template +inline void throw_on_error(T e) { + throw_on_error(e, ""); +} + +#define PADDLE_THROW(...) \ + do { \ + throw ::paddle::platform::EnforceNotMet( \ + std::make_exception_ptr( \ + std::runtime_error(string::Sprintf(__VA_ARGS__))), \ + __FILE__, __LINE__); \ } while (0) -#define PADDLE_ENFORCE(...) \ - do { \ - ::paddle::platform::throw_on_error(__VA_ARGS__); \ +#define PADDLE_ENFORCE(...) \ + do { \ + try { \ + ::paddle::platform::throw_on_error(__VA_ARGS__); \ + } catch (...) { \ + throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ + __FILE__, __LINE__); \ + } \ } while (0) } // namespace platform diff --git a/paddle/platform/enforce_test.cc b/paddle/platform/enforce_test.cc index d7152f81509a35e4ce36d5649e7d209f51e34b86..2ac31812a80d8dd57ce82234cb5835e029a46067 100644 --- a/paddle/platform/enforce_test.cc +++ b/paddle/platform/enforce_test.cc @@ -23,7 +23,7 @@ TEST(ENFORCE, FAILED) { bool in_catch = false; try { PADDLE_ENFORCE(false, "Enforce is not ok %d at all", 123); - } catch (const std::runtime_error& error) { + } catch (paddle::platform::EnforceNotMet error) { // your error handling code here in_catch = true; std::string msg = "Enforce is not ok 123 at all"; diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt index 8010369b410e7dc5bebdb63bec981fcfd90235a3..a8994366bc34ee3ec2c39e3482fc3757b089e61f 100644 --- a/paddle/pybind/CMakeLists.txt +++ b/paddle/pybind/CMakeLists.txt @@ -1,2 +1,3 @@ cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python add_op fc_op sgd_op cross_entropy_op random_op) + add_op fc_op sgd_op cross_entropy_op recurrent_network_op guassian_random_op) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index 88deb56207c04c202260d846ea36f80fdb5f9685..5f3b24a4a1370a49a792d02a4a814c8044741b25 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -37,6 +37,7 @@ USE_OP(sigmoid); USE_OP(softmax); USE_OP(rowwise_add); USE_OP(gaussian_random); +USE_OP_WITHOUT_KERNEL(recurrent_op); template void ExposeOperator(ClassType& m) { @@ -49,6 +50,11 @@ void ExposeOperator(ClassType& m) { .def("__str__", &ClassType::type::DebugString); } +static size_t UniqueIntegerGenerator() { + static std::atomic generator; + return generator.fetch_add(1); +} + PYBIND11_PLUGIN(core) { py::module m("core", "C++ core of PaddlePaddle"); @@ -90,6 +96,11 @@ All parameter, weight, gradient are variables in Paddle. [](pd::Variable& self) -> pd::Tensor* { return self.GetMutable(); }, + py::return_value_policy::reference) + .def("get_net", + [](pd::Variable& self) -> pd::NetOp* { + return self.GetMutable(); + }, py::return_value_policy::reference); py::class_>(m, "Scope") @@ -99,7 +110,8 @@ All parameter, weight, gradient are variables in Paddle. 
py::return_value_policy::reference) .def("create_var", &pd::Scope::CreateVariable, - py::return_value_policy::reference); + py::return_value_policy::reference) + .def("get_var_name", &pd::Scope::GetVariableName); //! @note: Be careful! PyBind will return std::string as an unicode, not //! Python str. If you want a str object, you should cast them in Python. @@ -141,24 +153,25 @@ All parameter, weight, gradient are variables in Paddle. }); ExposeOperator(operator_base); - using PlainNetPtr = std::shared_ptr; - py::class_ plain_net(m, "PlainNet"); - - plain_net - .def_static("create", - []() -> std::shared_ptr { - auto retv = std::make_shared(); - retv->type_ = "plain_net"; - return retv; - }) - .def("add_op", &pd::PlainNet::AddOp) + py::class_> net(m, "Net"); + + net.def_static("create", + []() -> std::shared_ptr { + auto retv = std::make_shared(); + retv->type_ = "plain_net"; + return retv; + }) + .def("add_op", &pd::NetOp::AddOp) .def("add_op", - [](PlainNetPtr& self, const PlainNetPtr& plain_net) -> void { - self->AddOp(std::static_pointer_cast(plain_net)); + [](pd::NetOp& self, const std::shared_ptr& net) -> void { + self.AddOp(std::static_pointer_cast(net)); }) - .def("complete_add_op", &pd::PlainNet::CompleteAddOp) - .def("complete_add_op", [](PlainNetPtr& self) { self->CompleteAddOp(); }); - ExposeOperator(plain_net); + .def("complete_add_op", &pd::NetOp::CompleteAddOp) + .def("complete_add_op", + [](std::shared_ptr& self) { self->CompleteAddOp(); }); + ExposeOperator(net); + + m.def("unique_integer", UniqueIntegerGenerator); return m.ptr(); } diff --git a/paddle/trainer/NewRemoteParameterUpdater.cpp b/paddle/trainer/NewRemoteParameterUpdater.cpp index a830ceba5772846cd9255a3eeb26e8d6a17dcfbc..e1558e3fdfbcf296be0ee64202132f53bf901be9 100644 --- a/paddle/trainer/NewRemoteParameterUpdater.cpp +++ b/paddle/trainer/NewRemoteParameterUpdater.cpp @@ -76,7 +76,11 @@ void NewRemoteParameterUpdater::init( sgdConfigV2->set_decay(paramConfig.decay_rate()); optimizeConfigV2.set_lr_policy(paddle::OptimizerConfig::Const); auto constlr = optimizeConfigV2.mutable_const_lr(); - constlr->set_learning_rate(paramConfig.learning_rate()); + if (paramConfig.has_learning_rate()) { + constlr->set_learning_rate(paramConfig.learning_rate()); + } else { + constlr->set_learning_rate(trainerConfig_.learning_rate()); + } if (trainerConfig_.algorithm() == "sgd") { optimizeConfigV2.set_optimizer(paddle::OptimizerConfig::SGD); // FIXME: config all algorithms diff --git a/paddle/utils/Error.h b/paddle/utils/Error.h index 27ddaab3f003110a2684a871a2de17afb473d660..7cde98306026ca1de76089749aaea265d151da33 100644 --- a/paddle/utils/Error.h +++ b/paddle/utils/Error.h @@ -126,9 +126,11 @@ public: } /** - * @brief operator bool, return True if there is something error. + * @brief check this status by glog. + * @note It is a temp method used during cleaning Paddle code. It will be + * removed later. */ - operator bool() const { return !this->isOK(); } + void check() const { CHECK(this->isOK()) << msg(); } /** * @brief isOK return True if there is no error. @@ -136,13 +138,6 @@ public: */ bool isOK() const { return msg_ == nullptr; } - /** - * @brief check this status by glog. - * @note It is a temp method used during cleaning Paddle code. It will be - * removed later. 
- */ - void check() const { CHECK(this->isOK()) << msg(); } - private: std::shared_ptr msg_; }; diff --git a/paddle/utils/tests/test_Error.cpp b/paddle/utils/tests/test_Error.cpp index fdf326b17a1c8baa87e2a17fafae253565d1e699..6f311fa6b80191de1e11ce1f63c31b64fe2eeb80 100644 --- a/paddle/utils/tests/test_Error.cpp +++ b/paddle/utils/tests/test_Error.cpp @@ -18,17 +18,17 @@ limitations under the License. */ TEST(Error, testAll) { paddle::Error error; - ASSERT_FALSE(error); + ASSERT_TRUE(error.isOK()); error = paddle::Error("I'm the error"); - ASSERT_TRUE(error); + ASSERT_FALSE(error.isOK()); ASSERT_STREQ("I'm the error", error.msg()); error = paddle::Error("error2"); - ASSERT_TRUE(error); + ASSERT_FALSE(error.isOK()); ASSERT_STREQ("error2", error.msg()); int i = 3; auto error3 = paddle::Error("error%d", i); - ASSERT_TRUE(error3); + ASSERT_FALSE(error3.isOK()); ASSERT_STREQ("error3", error3.msg()); } diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index fc112f1327f5ad5f1bdd04873394b1fa0e761e29..5477158ecb8646992ebdded0b15cce50720ebf36 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -2055,8 +2055,7 @@ class BatchNormLayer(LayerBase): # Automatically select cudnn_batch_norm for GPU and batch_norm for CPU. # Also based on cudnn version. use_cudnn = use_gpu and batch_norm_type != "batch_norm" and \ - ((not parallel_nn) or self.config.device > -1) and \ - cudnn_version >= 4007 + ((not parallel_nn) or self.config.device > -1) self.layer_type = "cudnn_batch_norm" if use_cudnn else "batch_norm" super(BatchNormLayer, self).__init__( name, self.layer_type, 0, inputs=inputs, **xargs) diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py index 9b9f979bb615f37ec1dc9baa154d28741b1400d5..ecba87191045cff6c05014010e60575741238f8d 100644 --- a/python/paddle/trainer_config_helpers/attrs.py +++ b/python/paddle/trainer_config_helpers/attrs.py @@ -272,7 +272,7 @@ class ExtraLayerAttribute(object): for key in self.attr: if not hasattr(self, 'can_%s' % key) or \ not getattr(self, 'can_%s' % key): - raise NotImplementedError("Layer %s cannot support %s" % + raise NotImplementedError("Layer %s does not support %s" % (layer_name, key)) @staticmethod diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 21eba71527e60833e0c69b344ecc639626faa529..14f072fc55109d770edf469ad7c574b8dda8a434 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -865,7 +865,7 @@ def data_layer(name, size, height=None, width=None, layer_attr=None): @wrap_name_default("embedding") @wrap_param_attr_default() -@layer_support(ERROR_CLIPPING) +@layer_support(ERROR_CLIPPING, DROPOUT) def embedding_layer(input, size, name=None, param_attr=None, layer_attr=None): """ Define a embedding Layer. 
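The layers.py hunks that follow mostly move ERROR_CLIPPING and DROPOUT support between layers. A hedged sketch of how that support is exercised from a v1 config (argument names taken from ExtraLayerAttribute above; the check in attrs.py maps drop_rate to can_drop_rate):

# Illustrative only; embedding_layer now declares ERROR_CLIPPING and DROPOUT,
# so these attributes pass the can_<attr> check instead of raising
# NotImplementedError.
from paddle.trainer_config_helpers import *

word = data_layer(name="word", size=10000)
emb = embedding_layer(
    input=word,
    size=256,
    layer_attr=ExtraLayerAttribute(
        drop_rate=0.5,                   # checked against can_drop_rate
        error_clipping_threshold=10.0))  # relies on ERROR_CLIPPING support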
@@ -1320,7 +1320,7 @@ def pooling_layer(input, @wrap_act_default(param_names=['gate_act'], act=SigmoidActivation()) @wrap_act_default(param_names=["act", 'state_act'], act=TanhActivation()) @wrap_name_default("lstmemory") -@layer_support(DROPOUT) +@layer_support() def lstmemory(input, name=None, size=None, @@ -1429,7 +1429,7 @@ def lstmemory(input, @wrap_act_default(param_names=['gate_act'], act=SigmoidActivation()) @wrap_act_default(param_names=["act"], act=TanhActivation()) @wrap_name_default("gru") -@layer_support(DROPOUT) +@layer_support() def grumemory(input, size=None, name=None, @@ -1793,7 +1793,7 @@ def repeat_layer(input, @wrap_name_default("seqreshape") @wrap_act_default(act=IdentityActivation()) @wrap_bias_attr_default(has_bias=False) -@layer_support() +@layer_support(ERROR_CLIPPING, DROPOUT) def seq_reshape_layer(input, reshape_size, act=None, @@ -2703,7 +2703,7 @@ def img_cmrnorm_layer(input, default_factory=lambda _: ParamAttr(initial_mean=1.0, initial_std=0.)) @wrap_act_default(act=ReluActivation()) @wrap_name_default("batch_norm") -@layer_support(DROPOUT) +@layer_support(DROPOUT, ERROR_CLIPPING) def batch_norm_layer(input, act=None, name=None, @@ -2783,15 +2783,6 @@ def batch_norm_layer(input, :return: LayerOutput object. :rtype: LayerOutput """ - if not isinstance(act, ReluActivation): - logger.log(logging.WARN, - "%s is not recommend for batch normalization's activation, " - "maybe the relu is better" % act.name) - - if not isinstance(input.activation, LinearActivation): - logger.log(logging.WARN, - "The activation should be inside batch normalization, the " - "previous layer's activation may be Linear") if num_channels is None: if input.num_filters is not None: @@ -2861,7 +2852,7 @@ def sum_to_one_norm_layer(input, name=None, layer_attr=None): @wrap_name_default("addto") @wrap_act_default(act=LinearActivation()) @wrap_bias_attr_default(has_bias=False) -@layer_support(DROPOUT) +@layer_support(DROPOUT, ERROR_CLIPPING) def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None): """ AddtoLayer. @@ -2940,7 +2931,7 @@ def addto_layer(input, act=None, name=None, bias_attr=None, layer_attr=None): @wrap_act_default(act=IdentityActivation()) @wrap_name_default("concat") -@layer_support() +@layer_support(DROPOUT, ERROR_CLIPPING) def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None): """ Concat all input vector into one huge vector. @@ -3024,7 +3015,7 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None): @wrap_name_default("seqconcat") @wrap_act_default(act=IdentityActivation()) @wrap_bias_attr_default(has_bias=False) -@layer_support() +@layer_support(DROPOUT, ERROR_CLIPPING) def seq_concat_layer(a, b, act=None, name=None, layer_attr=None, bias_attr=None): """ @@ -3177,7 +3168,7 @@ def memory(name, @wrap_act_default(param_names=['state_act'], act=TanhActivation()) @wrap_act_default(act=TanhActivation()) @wrap_name_default('lstm_step') -@layer_support(ERROR_CLIPPING, DROPOUT) +@layer_support() def lstm_step_layer(input, state, size=None, @@ -4480,7 +4471,7 @@ def tensor_layer(a, @wrap_param_attr_default() @wrap_bias_attr_default() @wrap_act_default() -@layer_support() +@layer_support(DROPOUT, ERROR_CLIPPING) def selective_fc_layer(input, size, select=None, @@ -5974,7 +5965,7 @@ def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None): """ The crop layer crops images by offset and shape. User can set crop shape by args 'shape' explicitly or by reference input layer. - + The example usage is: .. 
code-block:: python
diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py
index 07ab2c9b1898f0ec7a5ca168912f2f03597b094a..5bea980611904b37a4a5d4e2cbbee13503a61ff0 100644
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
@@ -34,6 +34,7 @@ import minibatch
 import plot
 import image
 import model
+import paddle.trainer.config_parser as cp

 __all__ = [
     'optimizer',
@@ -58,6 +59,8 @@ __all__ = [
     'model',
 ]

+cp.begin_parse()
+

 def init(**kwargs):
     import py_paddle.swig_paddle as api
@@ -73,6 +76,11 @@ def init(**kwargs):
     for key in args_dict.keys():
         args.append('--%s=%s' % (key, str(args_dict[key])))

+    if 'use_gpu' in kwargs:
+        cp.g_command_config_args['use_gpu'] = kwargs['use_gpu']
+    assert 'parallel_nn' not in kwargs, ("currently 'parallel_nn' is not "
+                                         "supported in v2 APIs.")
+
     api.initPaddle(*args)
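A standalone sketch of the flag plumbing in init() above: keyword arguments become --key=value strings for api.initPaddle, with use_gpu additionally mirrored into the config parser (to_flags is a hypothetical helper; sorted() is used only to make the output deterministic):

def to_flags(**kwargs):
    args = []
    for key in sorted(kwargs.keys()):
        args.append('--%s=%s' % (key, str(kwargs[key])))
    return args

print(to_flags(trainer_count=1, use_gpu=False))
# ['--trainer_count=1', '--use_gpu=False']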
""" - assert num_shards >= 1 - assert max_lines_to_shuffle >= 1 - - def open_writers(): - w = [] - for i in range(0, num_shards): - n = "%s/%s-%05d-of-%05d" % (output_path, name_prefix, i, - num_shards - 1) - w.append(recordio.writer(n)) - - return w - - def close_writers(w): - for i in range(0, num_shards): - w[i].close() + assert line_count >= 1 + indx_f = 0 - def write_data(w, lines): + def write_data(indx_f, lines): random.shuffle(lines) - for i, d in enumerate(lines): + filename = "%s/%s-%05d" % (output_path, name_prefix, indx_f) + writer = recordio.writer(filename) + for l in lines: # FIXME(Yancey1989): # dumps with protocol: pickle.HIGHEST_PROTOCOL - o = pickle.dumps(d) - w[i % num_shards].write(o) + writer.write(cPickle.dumps(l)) + writer.close() - w = open_writers() lines = [] - for i, d in enumerate(reader()): lines.append(d) - if i % max_lines_to_shuffle == 0 and i >= max_lines_to_shuffle: - write_data(w, lines) + if i % line_count == 0 and i >= line_count: + write_data(indx_f, lines) lines = [] + indx_f += 1 continue - write_data(w, lines) - close_writers(w) + write_data(indx_f, lines) diff --git a/python/paddle/v2/dataset/mq2007.py b/python/paddle/v2/dataset/mq2007.py index cffb319ad8f56ccddba3fef63e1b6ec68e5bac1e..b705c9109b2b6769c9fafa9241db5d81c682f9e3 100644 --- a/python/paddle/v2/dataset/mq2007.py +++ b/python/paddle/v2/dataset/mq2007.py @@ -242,9 +242,9 @@ def gen_list(querylist): if not isinstance(querylist, QueryList): querylist = QueryList(querylist) querylist._correct_ranking_() - relevance_score_list = [query.relevance_score for query in querylist] + relevance_score_list = [[query.relevance_score] for query in querylist] feature_vector_list = [query.feature_vector for query in querylist] - yield np.array(relevance_score_list).T, np.array(feature_vector_list) + yield np.array(relevance_score_list), np.array(feature_vector_list) def query_filter(querylists): diff --git a/python/paddle/v2/framework/create_op_creation_methods.py b/python/paddle/v2/framework/create_op_creation_methods.py index 7248c3f52a9902e8c08ac2f1405801a5710459e5..b034efffb69030cb09e09ea545e9bff6f1744671 100644 --- a/python/paddle/v2/framework/create_op_creation_methods.py +++ b/python/paddle/v2/framework/create_op_creation_methods.py @@ -220,6 +220,9 @@ def create_op_creation_method(op_proto): __impl__.all_input_args = [var.name for var in op_proto.inputs] __impl__.all_output_args = [var.name for var in op_proto.outputs] __impl__.all_attr_args = [attr.name for attr in op_proto.attrs] + __impl__.all_not_temp_output_args = [ + var.name for var in op_proto.outputs if not var.temporary + ] return __impl__ diff --git a/python/paddle/v2/framework/network.py b/python/paddle/v2/framework/network.py new file mode 100644 index 0000000000000000000000000000000000000000..c85e87413ef45f40755709e134a277b8d8d1e233 --- /dev/null +++ b/python/paddle/v2/framework/network.py @@ -0,0 +1,124 @@ +import paddle.v2.framework.core as core +from paddle.v2.framework.create_op_creation_methods import op_creations +from default_scope_funcs import create_var, get_var, get_cur_scope + +__all__ = ['Network'] # Only expose Network + + +class NetworkFunctor(object): + """ + Network Op Creation Function. Used internally in this module. + It convert string input to Variable. If it is not created before, just + create in scope. + + It is a functor object. means the instances are callable. + + :param func: The op creation function which generated in Python. + :param net: The Network instance. 
+ """ + + def __init__(self, func, net): + self.func = func + self.net = net + + def __call__(self, *args, **kwargs): + if len(args) != 0: + raise ValueError("Paddle must use keyword argument") + inputs = self.func.all_input_args + for ipt in inputs: + if ipt in kwargs: + var = kwargs[ipt] + if isinstance(var, basestring): + var = create_var(var) + if not isinstance(var, core.Variable): + raise TypeError( + "Input of op creation must be string or variable") + + kwargs[ipt] = get_cur_scope().get_var_name(var) + + notemp_outputs = self.func.all_not_temp_output_args + + for name in notemp_outputs: + if name not in kwargs: + kwargs[ + name] = self.func.__name__ + "@OUT@%d" % core.unique_integer( + ) + + outputs = self.func.all_output_args + for opt in outputs: + if opt in kwargs: + var = kwargs[opt] + if isinstance(var, basestring): + var = create_var(var) + if not isinstance(var, core.Variable): + raise TypeError( + "Output of op creation must be string or variable") + kwargs[opt] = get_cur_scope().get_var_name(var) + + op = self.func(**kwargs) + + self.net.net.add_op(op) + + lst = [get_var(kwargs[opt]) for opt in notemp_outputs] + if len(lst) == 1: + return lst[0] + elif len(lst) == 0: + return None + else: + return lst + + +class Network(object): + """ + The network concept. It avoid user to manually create operator, create + variable, and combine them into a Net. Just use Network.xxx can create the + operator, create variables in default scope, and add them into `self.net`. + + For example: + + .. code-block: python + + net = Network() + out = net.add_two(X="a", Y="b") + fc_out = net.fc(X="out", W="fc.w") + + net.run(...) + """ + + def __init__(self): + self.net = core.Net.create() + funcs = (func_name for func_name in dir(op_creations) + if not func_name.startswith("__")) + + # TODO(yuyang18): This code can work, but do not generate a good + # docstring, try to give a better way generate function in runtime + # later. 
+ for func_name in funcs: + func = getattr(op_creations, func_name) + impl = NetworkFunctor(func, self) + setattr(self, func_name, impl.__call__) + self.__complete_add_op__ = False + + def infer_shape(self): + self.complete_add_op() + self.net.infer_shape(get_cur_scope()) + + def run(self, device_context): + self.complete_add_op() + self.net.run(get_cur_scope(), device_context) + + def __str__(self): + return str(self.net) + + def complete_add_op(self): + if not self.__complete_add_op__: + self.net.complete_add_op() + self.__complete_add_op__ = True + + +if __name__ == '__main__': + net = Network() + out = net.add_two(X="a", Y="b") + fc_out = net.fc(X=out, W="fc.w", b="fc.b", activation="softmax") + net.complete_add_op() + print net diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt index 254e8d37d13b2c0d2d6dcceb6c96ac0c2c02592d..deaa350133a88d8c9902b3c120f6583580ccd759 100644 --- a/python/paddle/v2/framework/tests/CMakeLists.txt +++ b/python/paddle/v2/framework/tests/CMakeLists.txt @@ -3,7 +3,7 @@ add_python_test(test_framework test_scope.py test_default_scope_funcs.py test_op_creation_methods.py - test_plain_net.py + test_net.py test_tensor.py test_fc_op.py test_add_two_op.py @@ -13,4 +13,5 @@ add_python_test(test_framework test_sigmoid_op.py test_softmax_op.py test_rowwise_add_op.py - test_random_op.py) + test_random_op.py + test_network.py) diff --git a/python/paddle/v2/framework/tests/test_plain_net.py b/python/paddle/v2/framework/tests/test_net.py similarity index 92% rename from python/paddle/v2/framework/tests/test_plain_net.py rename to python/paddle/v2/framework/tests/test_net.py index 53c8dd6c225df2fef9069816935e2778c36c10ee..c4cc5b43bc9cfb857c4d82d5f6a2940235949dc0 100644 --- a/python/paddle/v2/framework/tests/test_plain_net.py +++ b/python/paddle/v2/framework/tests/test_net.py @@ -5,11 +5,11 @@ import unittest class TestNet(unittest.TestCase): def test_net_all(self): - net = core.PlainNet.create() + net = core.Net.create() op1 = op_creations.add_two(X="X", Y="Y", Out="Out") net.add_op(op1) - net2 = core.PlainNet.create() + net2 = core.Net.create() net2.add_op(op_creations.fc(X="X", W="w", Y="fc.out")) net2.complete_add_op(True) net.add_op(net2) diff --git a/python/paddle/v2/framework/tests/test_network.py b/python/paddle/v2/framework/tests/test_network.py new file mode 100644 index 0000000000000000000000000000000000000000..6d53e233e959bd39b558ac97cdca381135505f8d --- /dev/null +++ b/python/paddle/v2/framework/tests/test_network.py @@ -0,0 +1,32 @@ +from paddle.v2.framework.network import Network +import paddle.v2.framework.core as core +import unittest + + +class TestNet(unittest.TestCase): + def test_net_all(self): + net = Network() + out = net.add_two(X="X", Y="Y") + fc_out = net.fc(X=out, W="w") + net.complete_add_op() + self.assertTrue(isinstance(fc_out, core.Variable)) + self.assertEqual( + '''Op(plain_net), inputs:(@EMPTY@, X, Y, w), outputs:(@TEMP@fc@0, add_two@OUT@0, fc@OUT@1). + Op(add_two), inputs:(X, Y), outputs:(add_two@OUT@0). + Op(fc), inputs:(add_two@OUT@0, w, @EMPTY@), outputs:(fc@OUT@1, @TEMP@fc@0). + Op(mul), inputs:(add_two@OUT@0, w), outputs:(@TEMP@fc@0). + Op(sigmoid), inputs:(@TEMP@fc@0), outputs:(fc@OUT@1). +''', str(net)) + + net2 = Network() + tmp = net2.add_two(X="X", Y="Y") + self.assertTrue(isinstance(tmp, core.Variable)) + net2.complete_add_op() + self.assertEqual( + '''Op(plain_net), inputs:(X, Y), outputs:(add_two@OUT@2). + Op(add_two), inputs:(X, Y), outputs:(add_two@OUT@2). 
+''', str(net2))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_recurrent_op.py b/python/paddle/v2/framework/tests/test_recurrent_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..0457e3f16a709140180ce433c1d56d146f0b6974
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_recurrent_op.py
@@ -0,0 +1,92 @@
+import paddle.v2.framework.core as core
+import unittest
+import numpy as np
+import paddle.v2.framework.create_op_creation_methods as creation
+
+ops = creation.op_creations
+
+
+def create_tensor(scope, name, shape):
+    tensor = scope.create_var(name).get_tensor()
+    tensor.set_dims(shape)
+    tensor.alloc_float()
+    tensor.set(np.random.random(shape))
+    return tensor
+
+
+class TestRNN(unittest.TestCase):
+    '''
+    Test RNNOp
+
+    equation:
+        h_t = \sigma (W x_t + U h_{t-1})
+    weights:
+        - W
+        - U
+    vars:
+        - x
+    memories:
+        - h
+    outputs:
+        - h
+    '''
+
+    def init(self):
+        input_dim = 30
+        batch_size = 50
+        weight_dim = 15
+
+        self.scope = core.Scope(None)
+
+        # create vars
+        create_tensor(self.scope, "x", [batch_size, input_dim])
+        create_tensor(self.scope, "W", [input_dim, weight_dim])
+        create_tensor(self.scope, "U", [weight_dim, weight_dim])
+        create_tensor(self.scope, "h_boot", [batch_size, weight_dim])
+
+        x_alias = "x@alias"
+        y_alias = "y@alias"
+        memory = "h@alias"
+        prememory = "h@pre"
+        output = "rnn_out"
+        output_alias = "rnn_out@alias"
+
+        # create step net
+        stepnet_var = self.scope.create_var("stepnet")
+        stepnet = stepnet_var.get_net()
+        # stepnet = core.Net.create()
+        x_fc_op = ops.fc(X=x_alias, W="W", Y="Wx")
+        h_fc_op = ops.fc(X=prememory, W="U", Y="Uh")
+        sum_op = ops.add_two(X="Wx", Y="Uh", Out="sum")
+        sig_op = ops.sigmoid(X="sum", Y=memory)
+        stepnet.add_op(x_fc_op)
+        stepnet.add_op(h_fc_op)
+        stepnet.add_op(sum_op)
+        stepnet.add_op(sig_op)
+        stepnet.complete_add_op(True)
+
+        # create RNNOp
+        rnnop = ops.recurrent_op(
+            # inputs
+            inlinks=["x"],
+            boot_memories=["h_boot"],
+            step_net="stepnet",
+            # outputs
+            outlinks=[output],
+            step_scopes="step_scopes",
+            # attributes
+            inlink_alias=["x@alias"],
+            outlink_alias=[output_alias],
+            pre_memories=[prememory],
+            memories=[memory])
+
+        ctx = core.DeviceContext.cpu_context()
+        rnnop.infer_shape(self.scope)
+        rnnop.run(self.scope, ctx)
+
+    def test_recurrent(self):
+        self.init()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py
index 40134a3270c3579fd2f6a891af66ff241050f60c..4dcc3ab57e7e6dfbe040ac61025e55b9e48b4415 100644
--- a/python/paddle/v2/inference.py
+++ b/python/paddle/v2/inference.py
@@ -35,6 +35,13 @@ class Inference(object):
                 name = param.getName()
                 assert isinstance(val, api.Vector)
                 val.copyFromNumpyArray(parameters.get(name).flatten())
+                # setValueUpdated is called in the randomize, zeroMem and
+                # load functions in paddle/parameter/Parameter.cpp. But in
+                # inference mode it is never called, so the parameter would
+                # not be dispatched in MultiGradientMachine for multi-GPU.
+                # setValueUpdated is therefore called here, though it would
+                # be better to call this function in one place.
+ param.setValueUpdated() self.__gradient_machine__ = gm self.__data_types__ = topo.data_type() diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py index 4ade1c6f329ae39769279963af6809f938807bdd..6a2bb8d337b7667aa2b1e3ef0815bb80f6e38d6a 100644 --- a/python/paddle/v2/layer.py +++ b/python/paddle/v2/layer.py @@ -324,6 +324,3 @@ def parse_network(output_layers, extra_layers=None): def get_layer(name): return config_base.__layer_map__.get(name) - - -cp.begin_parse() diff --git a/python/paddle/v2/master/client.py b/python/paddle/v2/master/client.py index 4dc31bff583ee933b33e475d9421c21a7bb74449..b658a81630733fea3976b812afe819d76de4cb25 100644 --- a/python/paddle/v2/master/client.py +++ b/python/paddle/v2/master/client.py @@ -1,8 +1,15 @@ import ctypes import os -path = os.path.join(os.path.dirname(__file__), "libpaddle_master.so") -lib = ctypes.cdll.LoadLibrary(path) +__lib__ = None + + +def get_c_lib(): + global __lib__ + if __lib__ is None: + path = os.path.join(os.path.dirname(__file__), "libpaddle_master.so") + __lib__ = ctypes.cdll.LoadLibrary(path) + return __lib__ class client(object): @@ -11,8 +18,8 @@ class client(object): """ def __init__(self, etcd_endpoints, timeout_sec, buf_size=0): - self.c = lib.paddle_new_etcd_master_client(etcd_endpoints, timeout_sec, - buf_size) + self.c = get_c_lib().paddle_new_etcd_master_client( + etcd_endpoints, timeout_sec, buf_size) def request_save_model(self, trainer_id, block_ms): """request to save model @@ -32,20 +39,20 @@ class client(object): saving the model, -1 if error happened. """ - return lib.paddle_request_save_model(self.c, trainer_id, block_ms) + return get_c_lib().paddle_request_save_model(self.c, trainer_id, + block_ms) def release(self): - lib.paddle_release_master_client(self.c) + get_c_lib().paddle_release_master_client(self.c) self.c = None def set_dataset(self, paths): holder_type = ctypes.c_char_p * len(paths) holder = holder_type() - print paths for idx, path in enumerate(paths): c_ptr = ctypes.c_char_p(path) holder[idx] = c_ptr - lib.paddle_set_dataset(self.c, holder, len(paths)) + get_c_lib().paddle_set_dataset(self.c, holder, len(paths)) def next_record(self): """gets next record for training @@ -56,7 +63,7 @@ class client(object): """ p = ctypes.c_char_p() ret = ctypes.pointer(p) - size = lib.paddle_next_record(self.c, ret) + size = get_c_lib().paddle_next_record(self.c, ret) if size < 0: # Error return None, size @@ -67,5 +74,5 @@ class client(object): record = ret.contents.value[:size] # Memory created from C should be freed. - lib.mem_free(ret.contents) + get_c_lib().mem_free(ret.contents) return record, 0
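A hedged usage sketch for the lazily loaded master client above (the endpoint and paths are made up; it assumes a running master service and a libpaddle_master.so next to the module, which is now only loaded on first use rather than at import time):

# Illustrative only.
from paddle.v2.master.client import client

c = client("http://127.0.0.1:2379", 5)
c.set_dataset(["/data/train-00000.recordio", "/data/train-00001.recordio"])

while True:
    record, err = c.next_record()
    if record is None:  # err < 0 signals an error; otherwise no record left
        break
    # feed `record` (a pickled instance) to the trainer here

c.release()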