diff --git a/.gitignore b/.gitignore
index 275173b9677bffe028152fe8eadb3384329aeb5a..5c2fb134ae896651cb2edc2004112a4246e52359 100644
--- a/.gitignore
+++ b/.gitignore
@@ -19,3 +19,6 @@ third_party/
 
 # clion workspace.
 cmake-build-*
+
+# generated while compiling
+python/paddle/v2/framework/core.so
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 15a7c6b07417adfacd461e95c0b92f658e1e11cc..2c713db3e38548f242e53b2fb5f436db81f843ef 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -97,6 +97,7 @@ include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
+include(external/pybind11)    # download pybind11
 
 include(cudnn)              # set cudnn libraries, must before configure
 include(configure)          # add paddle env configuration
diff --git a/README.md b/README.md
index fa16cc3cf2ef9c1200a19e03192c94c65fc08679..2a6beeb342b34f8e91ef509d7d41f286a666480c 100644
--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@
 
 
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/develop/doc/)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/doc_cn/)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://doc.paddlepaddle.org/develop/doc/)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://doc.paddlepaddle.org/develop/doc_cn/)
 [![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -61,35 +61,36 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
 ## Installation
 
 It is recommended to check out the
-[Docker installation guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html)
+[Docker installation guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html)
 before looking into the
-[build from source guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html)
+[build from source guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html)
 
 ## Documentation
 
-We provide [English](http://www.paddlepaddle.org/develop/doc/) and
-[Chinese](http://www.paddlepaddle.org/doc_cn/) documentation.
+We provide [English](http://doc.paddlepaddle.org/develop/doc/) and
+[Chinese](http://doc.paddlepaddle.org/doc_cn/) documentation.
 
 - [Deep Learning 101](http://book.paddlepaddle.org/index.html)
 
   You might want to start from the this online interactive book that can run in Jupyter Notebook.
 
-- [Distributed Training](http://www.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)
+- [Distributed Training](http://doc.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)
 
   You can run distributed training jobs on MPI clusters.
 
-- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html)
+- [Distributed Training on Kubernetes](http://doc.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html)
 
    You can also run distributed training jobs on Kubernetes clusters.
 
-- [Python API](http://www.paddlepaddle.org/develop/doc/api/index_en.html)
+- [Python API](http://doc.paddlepaddle.org/develop/doc/api/index_en.html)
 
    Our new API enables much shorter programs.
 
-- [How to Contribute](http://www.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html)
+- [How to Contribute](http://doc.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html)
 
    We appreciate your contributions!
 
+
 ## Ask Questions
 
 You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues).
diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..9391c285c7544669a5b1a078b7473d7a656c1bb4
--- /dev/null
+++ b/cmake/external/pybind11.cmake
@@ -0,0 +1,30 @@
+INCLUDE(ExternalProject)
+
+SET(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind)
+
+INCLUDE_DIRECTORIES(${PYBIND_SOURCE_DIR}/src/extern_pybind/include)
+
+ExternalProject_Add(
+        extern_pybind
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY  "https://github.com/pybind/pybind11.git"
+        GIT_TAG         "v2.1.1"
+        PREFIX          ${PYBIND_SOURCE_DIR}
+        UPDATE_COMMAND  ""
+        CONFIGURE_COMMAND ""
+        BUILD_COMMAND     ""
+        INSTALL_COMMAND   ""
+        TEST_COMMAND      ""
+)
+
+if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/pybind_dummy.c)
+    file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
+    add_library(pybind STATIC ${dummyfile})
+else()
+    add_library(pybind INTERFACE)
+endif()
+
+add_dependencies(pybind extern_pybind)
+
+LIST(APPEND external_project_dependencies pybind)
diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
index 6546b2c83bc8f81f89e4018a2216f191bbeb0d21..67a359d4b5f4cca8fc8e74eab4d4acb4cc12baed 100644
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -18,6 +18,9 @@ INCLUDE(python_module)
 FIND_PACKAGE(PythonInterp 2.7)
 IF(WITH_PYTHON)
     FIND_PACKAGE(PythonLibs 2.7)
+    # Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
+    ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
+    SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
 ENDIF(WITH_PYTHON)
 
 SET(py_env "")
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 7a996dea92b13bdac054a987a004a3d54ff02da2..c31e62fc08b531a38a851b71a033e14277eff015 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -109,7 +109,9 @@ set(COMMON_FLAGS
     -Wno-unused-function
     -Wno-error=literal-suffix
     -Wno-error=sign-compare
-    -Wno-error=unused-local-typedefs)
+    -Wno-error=unused-local-typedefs
+    -Wno-error=parentheses-equality # Warnings in Pybind11
+)
 
 set(GPU_COMMON_FLAGS
     -fPIC
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 725cf28037182bfd0a27d49491bc8a479a9b4440..83e3d155d038cc65f3e372f0e4ba0aaee2e29690 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -93,6 +93,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR})
 if(NOT APPLE)
     find_package(Threads REQUIRED)
     link_libraries(${CMAKE_THREAD_LIBS_INIT})
+    set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -ldl")
 endif(NOT APPLE)
 
 function(merge_static_libs TARGET_NAME)
diff --git a/doc/howto/dev/new_layer_cn.rst b/doc/howto/dev/new_layer_cn.rst
index 9489a921c70ad6ee5709f46445554f5d9640162c..75037e693b32f923ee7dc9dfec322495fe4ce10a 100644
--- a/doc/howto/dev/new_layer_cn.rst
+++ b/doc/howto/dev/new_layer_cn.rst
@@ -37,7 +37,7 @@
 
    \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x}
 
-假设 :math:`z = f(W^T x + b)` ，那么
+假设 :math:`z = W^T x + b` ，那么
 
 .. math::
 
diff --git a/doc/howto/dev/new_layer_en.rst b/doc/howto/dev/new_layer_en.rst
index 46481f5ead33dc6a26507e021fd9ae0f8316e940..110a9fb38f890a766bb4480e91feb22d3b0838a5 100644
--- a/doc/howto/dev/new_layer_en.rst
+++ b/doc/howto/dev/new_layer_en.rst
@@ -29,7 +29,7 @@ Fully connected layer takes a dense input vector with dimension :math:`D_i`. It
 
 where :math:`f(.)` is an nonlinear *activation* function, such as sigmoid, tanh, and Relu.
 
-The transformation matrix :math:`W` and bias vector :math:`b` are the *parameters* of the layer. The *parameters* of a layer are learned during training in the *backward pass*. The backward pass computes the gradients of the output function with respect to all parameters and inputs. The optimizer can use chain rule to compute the gradients of the loss function with respect to each parameter. 
+The transformation matrix :math:`W` and bias vector :math:`b` are the *parameters* of the layer. The *parameters* of a layer are learned during training in the *backward pass*. The backward pass computes the gradients of the output function with respect to all parameters and inputs. The optimizer can use chain rule to compute the gradients of the loss function with respect to each parameter.
 
 Suppose our loss function is :math:`c(y)`, then
 
@@ -37,7 +37,7 @@ Suppose our loss function is :math:`c(y)`, then
 
    \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x}
 
-Suppose :math:`z = f(W^T x + b)`, then
+Suppose :math:`z = W^T x + b`, then
 
 .. math::
 
@@ -48,7 +48,7 @@ This derivative can be automatically computed by our base layer class.
 Then, for fully connected layer, we need to compute:
 
 .. math::
-  
+
    \frac{\partial z}{\partial x} = W, \frac{\partial z_j}{\partial W_{ij}} = x_i, \frac{\partial z}{\partial b} = \mathbf 1
 
 where :math:`\mathbf 1` is an all one vector, :math:`W_{ij}` is the number at the i-th row and j-th column of the matrix :math:`W`, :math:`z_j` is the j-th component of the vector :math:`z`, and :math:`x_i` is the i-th component of the vector :math:`x`.
@@ -322,7 +322,7 @@ All the gradient check unit tests are located in :code:`paddle/gserver/tests/tes
                       /* weight */ true);
       }
     }
-    
+
 If you are creating a new file for the test, such as :code:`paddle/gserver/tests/testFCGrad.cpp`, you need to add the file to :code:`paddle/gserver/tests/CMakeLists.txt`. An example is given below. All the unit tests will run when you execute the command :code:`make tests`. Notice that some layers might need high accuracy for the gradient check unit tests to work well. You need to configure :code:`WITH_DOUBLE` to `ON` when configuring cmake.
 
 .. code-block:: bash
diff --git a/doc/howto/dev/write_docs_cn.rst b/doc/howto/dev/write_docs_cn.rst
index d536f53abc031e9d279ace0e231a381a2f1e81b6..36e5d420c986fc8d88eefee4aa221dba0a0480f2 100644
--- a/doc/howto/dev/write_docs_cn.rst
+++ b/doc/howto/dev/write_docs_cn.rst
@@ -41,7 +41,7 @@ PaddlePaddle文档需要准备的环境相对较复杂，所以我们推荐使
 
     python -c "import py_paddle"
 
-如果提示错误，那么用户需要在本地编译安装PaddlePaddle，请参考 `源码编译文档 <http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html>`_ 。
+如果提示错误，那么用户需要在本地编译安装PaddlePaddle，请参考 `源码编译文档 <http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html>`_ 。
 注意，用户在首次编译安装PaddlePaddle时，请将WITH_DOC选项关闭。在编译安装正确之后，请再次确认py_paddle包已经安装，即可进行下一步操作。
 
 如果提示正确，可以执行以下命令编译生成文档，即
@@ -68,9 +68,9 @@ PaddlePaddle文档使用 `sphinx`_ 自动生成，用户可以参考sphinx教程
 如何更新www.paddlepaddle.org文档
 ================================
 
-开发者给PaddlePaddle代码增加的注释以PR的形式提交到github中，提交方式可参见 `贡献文档 <http://paddlepaddle.org/develop/doc_cn/howto/dev/contribute_to_paddle_cn.html>`_ 。
-目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 <http://www.paddlepaddle.org/develop/doc_cn/>`_ 和
-`英文文档 <http://www.paddlepaddle.org/develop/doc/>`_ 。
+开发者给PaddlePaddle代码增加的注释以PR的形式提交到github中，提交方式可参见 `贡献文档 <http://doc.paddlepaddle.org/develop/doc_cn/howto/dev/contribute_to_paddle_cn.html>`_ 。
+目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 <http://doc.paddlepaddle.org/develop/doc_cn/>`_ 和
+`英文文档 <http://doc.paddlepaddle.org/develop/doc/>`_ 。
 
 
 
diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go
index 31ef450f032f756fb32a0444a7e94a18ec2918a0..0ecb1242c3c3d5246125c9ce946001ccf9cbec24 100644
--- a/go/cmd/pserver/pserver.go
+++ b/go/cmd/pserver/pserver.go
@@ -20,6 +20,8 @@ func main() {
 		"comma separated endpoint string for pserver to connect to etcd")
 	etcdTimeout := flag.Int("etcd-timeout", 5, "timeout for etcd calls")
 	numPservers := flag.Int("num-pservers", 1, "total pserver count in a training job")
+	checkpointPath := flag.String("checkpoint-path", "/checkpoints/", "save checkpoint path")
+	checkpointInterval := flag.Int("checkpoint-interval", 600, "save checkpoint per interval seconds")
 	logLevel := flag.String("log-level", "info",
 		"log level, possible values: debug, info, warning, error, fatal, panic")
 	flag.Parse()
@@ -31,18 +33,20 @@ func main() {
 	log.SetLevel(level)
 
 	var idx int
+	var cp pserver.Checkpoint
+	var e *pserver.EtcdClient
 	if *index >= 0 {
 		idx = *index
 	} else {
 		timeout := time.Second * time.Duration((*etcdTimeout))
-		e := pserver.NewEtcdClient(*etcdEndpoint, *numPservers, timeout)
+		e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, timeout)
 		idx, err = e.Register()
 		if err != nil {
 			panic(err)
 		}
 	}
 
-	s, err := pserver.NewService(idx)
+	s, err := pserver.NewService(idx, *checkpointInterval, *checkpointPath, e, cp)
 	if err != nil {
 		panic(err)
 	}
diff --git a/go/master/client.go b/go/master/client.go
index 05383f1bf40c0e2b1f974e802ee8fd6aac109b00..a2ca3f3ef8ce300e3df09a302d74b56ee23c6d10 100644
--- a/go/master/client.go
+++ b/go/master/client.go
@@ -68,7 +68,7 @@ func (c *Client) getRecords() {
 		// We treat a task as finished whenever the last data
 		// instance of the task is read. This is not exactly
 		// correct, but a reasonable approximation.
-		c.taskFinished(t.ID)
+		c.taskFinished(t.Meta.ID)
 	}
 }
 
@@ -118,6 +118,11 @@ func (c *Client) taskFinished(taskID int) error {
 	return c.conn.Call("Service.TaskFinished", taskID, nil)
 }
 
+// TaskFailed tell the master server as task is failed.
+func (c *Client) taskFailed(meta TaskMeta) error {
+	return c.conn.Call("Service.TaskFailed", meta, nil)
+}
+
 // NextRecord returns next record in the dataset.
 //
 // NextRecord will block until the next record is available. It is
diff --git a/go/master/client_internal_test.go b/go/master/client_internal_test.go
index 364dce7b58cf6366af711bde9107559a762563a4..49263474c8fe2410ffb6db93a9647f5ab066b06b 100644
--- a/go/master/client_internal_test.go
+++ b/go/master/client_internal_test.go
@@ -95,10 +95,16 @@ func TestGetFinishTask(t *testing.T) {
 			t.Fatalf("Should get error, pass: %d\n", i)
 		}
 
-		err = c.taskFinished(tasks[0].ID)
+		err = c.taskFinished(tasks[0].Meta.ID)
 		if err != nil {
 			t.Fatalf("Error: %v, pass: %d\n", err, i)
 		}
+
+		err = c.taskFailed(tasks[0].Meta)
+		if err != nil {
+			t.Fatalf("Error: %v, pass: %d\n", err, i)
+		}
+
 		tasks = tasks[1:]
 		task, err := c.getTask()
 		if err != nil {
@@ -107,7 +113,7 @@ func TestGetFinishTask(t *testing.T) {
 		tasks = append(tasks, task)
 
 		for _, task := range tasks {
-			err = c.taskFinished(task.ID)
+			err = c.taskFinished(task.Meta.ID)
 			if err != nil {
 				t.Fatalf("Error: %v, pass: %d\n", err, i)
 			}
diff --git a/go/master/service.go b/go/master/service.go
index 58e68e744859933aa74cac231356d4ff9dfb8d7b..a6050ab99437244dade83f2943f6649453d47fff 100644
--- a/go/master/service.go
+++ b/go/master/service.go
@@ -31,30 +31,36 @@ type Chunk struct {
 	Index recordio.Index // chunk index
 }
 
+// TaskMeta is a struct which stores task's meta info.
+type TaskMeta struct {
+	ID    int
+	Epoch int
+}
+
 // Task is the basic unit of data instances assigned to trainers.
 type Task struct {
-	ID     int
+	Meta   TaskMeta
 	Chunks []Chunk
 }
 
 type taskEntry struct {
-	Epoch      int
-	NumTimeout int
-	Task       Task
+	Task Task
+	// A task fails if it's timeout or trainer reports it exits unnormally.
+	NumFailure int
 }
 
 type taskQueues struct {
 	Todo    []taskEntry
 	Pending map[int]taskEntry // map from task ID to task entry
 	Done    []taskEntry
-	Failed  []Task
+	Failed  []taskEntry
 }
 
 // Service is the master server service.
 type Service struct {
 	chunksPerTask int
 	timeoutDur    time.Duration
-	timeoutMax    int
+	failureMax    int
 	ready         chan struct{}
 	store         Store
 
@@ -73,7 +79,7 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
 	var cur taskEntry
 	for i, c := range chunks {
 		if i%chunksPerTask == 0 && len(cur.Task.Chunks) > 0 {
-			cur.Task.ID = id
+			cur.Task.Meta.ID = id
 			id++
 			result = append(result, cur)
 			cur.Task.Chunks = nil
@@ -83,7 +89,7 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
 	}
 
 	if len(cur.Task.Chunks) > 0 {
-		cur.Task.ID = id
+		cur.Task.Meta.ID = id
 		result = append(result, cur)
 	}
 
@@ -91,11 +97,11 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
 }
 
 // NewService creates a new service.
-func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, timeoutMax int) (*Service, error) {
+func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, failureMax int) (*Service, error) {
 	s := &Service{}
 	s.chunksPerTask = chunksPerTask
 	s.timeoutDur = timeoutDur
-	s.timeoutMax = timeoutMax
+	s.failureMax = failureMax
 	s.taskQueues = taskQueues{}
 	s.taskQueues.Pending = make(map[int]taskEntry)
 	s.ready = make(chan struct{})
@@ -257,6 +263,34 @@ func (s *Service) SetDataset(globPaths []string, dummy *int) error {
 	return nil
 }
 
+func (s *Service) processFailedTask(t taskEntry, epoch int) {
+	if t.Task.Meta.Epoch != epoch {
+		// new epoch, task launched after the
+		// schedule of this timeout check or failed status report.
+		return
+	}
+
+	defer func() {
+		err := s.snapshot()
+		if err != nil {
+			log.Errorln(err)
+		}
+	}()
+
+	delete(s.taskQueues.Pending, t.Task.Meta.ID)
+
+	t.NumFailure++
+	if t.NumFailure > s.failureMax {
+		log.Warningf("Task %v failed %d times, discard.", t.Task, t.NumFailure)
+		s.taskQueues.Failed = append(s.taskQueues.Failed, t)
+		return
+	}
+
+	log.Warningf("Task %v failed %d times, discard.", t.Task, t.NumFailure)
+	s.taskQueues.Todo = append(s.taskQueues.Todo, t)
+	return
+}
+
 func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
 	return func() {
 		s.mu.Lock()
@@ -267,30 +301,7 @@ func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
 			return
 		}
 
-		if t.Epoch != epoch {
-			// new epoch, task launched after the
-			// schedule of this timeout check.
-			return
-		}
-
-		defer func() {
-			err := s.snapshot()
-			if err != nil {
-				log.Errorln(err)
-			}
-		}()
-
-		delete(s.taskQueues.Pending, t.Task.ID)
-
-		t.NumTimeout++
-		if t.NumTimeout > s.timeoutMax {
-			log.Warningf("Task %v timed out %d times, discard.", t.Task, t.NumTimeout)
-			s.taskQueues.Failed = append(s.taskQueues.Failed, t.Task)
-			return
-		}
-
-		log.Warningf("Task %v timed out %d times, retry.", t.Task, t.NumTimeout)
-		s.taskQueues.Todo = append(s.taskQueues.Todo, t)
+		s.processFailedTask(t, epoch)
 	}
 }
 
@@ -339,18 +350,18 @@ func (s *Service) GetTask(dummy int, task *Task) error {
 	}
 
 	t := s.taskQueues.Todo[0]
-	t.Epoch++
+	t.Task.Meta.Epoch++
 	s.taskQueues.Todo = s.taskQueues.Todo[1:]
-	s.taskQueues.Pending[t.Task.ID] = t
+	s.taskQueues.Pending[t.Task.Meta.ID] = t
 	err := s.snapshot()
 	if err != nil {
 		return err
 	}
 
 	*task = t.Task
-	log.WithFields(s.logFields()).Infof("Task #%d dispatched.", task.ID)
+	log.WithFields(s.logFields()).Infof("Task #%v dispatched.", t.Task.Meta)
 
-	time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.ID, t.Epoch))
+	time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.Meta.ID, t.Task.Meta.Epoch))
 	return nil
 }
 
@@ -365,13 +376,12 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
 
 	t, ok := s.taskQueues.Pending[taskID]
 	if !ok {
-		err := errors.New("pending task not found")
 		log.WithFields(s.logFields()).Warningln("Pending task #%d not found.", taskID)
-		return err
+		return nil
 	}
 
 	// task finished, reset timeout
-	t.NumTimeout = 0
+	t.NumFailure = 0
 	s.taskQueues.Done = append(s.taskQueues.Done, t)
 	delete(s.taskQueues.Pending, taskID)
 
@@ -389,3 +399,22 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
 	}
 	return err
 }
+
+// TaskFailed tells the service that a task is failed.
+func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error {
+	select {
+	case <-s.ready:
+	}
+
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	t, ok := s.taskQueues.Pending[meta.ID]
+	if !ok {
+		log.WithFields(s.logFields()).Warningln("TaskFailed:Pending task #%v not found.", t.Task.Meta)
+		return nil
+	}
+
+	s.processFailedTask(t, meta.Epoch)
+	return nil
+}
diff --git a/go/master/service_internal_test.go b/go/master/service_internal_test.go
index bc435b505c014ca13ed5fc16b33a21ebb089a3b7..9c0d1d0a39fc8cb2b29fd0e3a4ba0c9b255f80fb 100644
--- a/go/master/service_internal_test.go
+++ b/go/master/service_internal_test.go
@@ -30,7 +30,7 @@ func TestPartionIndex(t *testing.T) {
 	cs := make([]Chunk, 100)
 	ts := partition(cs, 20)
 	for i := range ts {
-		if ts[i].Task.ID != i {
+		if ts[i].Task.Meta.ID != i {
 			t.Error(ts[i], i)
 		}
 	}
diff --git a/go/pserver/client/c/test/test_train.py b/go/pserver/client/c/test/test_train.py
index 68e1d9b269209b695e27f91a656dc2d8e527b4cd..d6922672f4c1253e62cfe54965f6c2f3b5e6c7bf 100644
--- a/go/pserver/client/c/test/test_train.py
+++ b/go/pserver/client/c/test/test_train.py
@@ -19,7 +19,7 @@ def main():
     # create parameters
     parameters = paddle.parameters.create(cost)
 
-    # create optimizer
+    # create optimizer of new remote updater to pserver
     optimizer = paddle.optimizer.Momentum(momentum=0)
 
     #TODO(zhihong) : replace optimizer with new OptimizerConfig
diff --git a/go/pserver/client/client_test.go b/go/pserver/client/client_test.go
index 29b400812c9dc3a5f44700eacbf7ba043248f2f2..2b72a202b55eab4b0e9107b93807f3ceea95f099 100644
--- a/go/pserver/client/client_test.go
+++ b/go/pserver/client/client_test.go
@@ -42,7 +42,8 @@ func initClient() [numPserver]int {
 		ports[i] = p
 
 		go func(l net.Listener) {
-			s, err := pserver.NewService(0)
+			var cp pserver.Checkpoint
+			s, err := pserver.NewService(0, 1, "", nil, cp)
 			if err != nil {
 				panic(err)
 			}
@@ -174,7 +175,7 @@ func TestNativeClient(t *testing.T) {
 // TODO: tmperary disable etcdClient test for dependency of etcd)
 func EtcdClient(t *testing.T) {
 	initEtcdClient()
-	etcd_client := client.NewEtcd(etcdEndpoints)
-	c2 := client.NewClient(etcd_client, etcd_client.Desired(), selector(true))
+	etcdClient := client.NewEtcd(etcdEndpoints)
+	c2 := client.NewClient(etcdClient, etcdClient.Desired(), selector(true))
 	ClientTest(t, c2)
 }
diff --git a/go/pserver/etcd_client.go b/go/pserver/etcd_client.go
index 37b8d522c1bd07acb41b9515a6d9bc15eae9aa32..1f77787150d16052e3588e9c1795c8d5dafa08e6 100644
--- a/go/pserver/etcd_client.go
+++ b/go/pserver/etcd_client.go
@@ -18,6 +18,8 @@ const (
 	PsDesired = "/ps_desired"
 	// PsAddr is the base dir for pserver to store their addr
 	PsPath = "/ps/"
+	// PsCheckpoint is the etcd path for store checkpoints information
+	PsCheckpoint = "/checkpoints/"
 )
 
 // EtcdClient is the etcd client that the pserver uses for fault
@@ -186,3 +188,14 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) {
 
 	return idx, nil
 }
+
+// PutKey put into etcd with value by key specified
+func (e *EtcdClient) PutKey(key string, value []byte, timeout int) error {
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second*time.Duration(timeout))
+	_, err := e.etcdClient.Put(ctx, key, string(value))
+	cancel()
+	if err != nil {
+		return err
+	}
+	return nil
+}
diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go
index 54d108209402c27e79a9948f60ecbdadeffc7d9b..dcb871129985bba7fbabdfae7fbebd38caad4c7f 100644
--- a/go/pserver/optimizer.go
+++ b/go/pserver/optimizer.go
@@ -35,22 +35,30 @@ func cArrayToSlice(p unsafe.Pointer, len int) []byte {
 	return (*[1 << 30]byte)(p)[:len:len]
 }
 
-func newOptimizer(paramWithConfigs ParameterWithConfig) *optimizer {
+func newOptimizer(paramWithConfigs ParameterWithConfig, State []byte) *optimizer {
 	o := &optimizer{}
 	o.elementType = paramWithConfigs.Param.ElementType
 	p := paramWithConfigs.Param
 	c := paramWithConfigs.Config
+	s := State
+	paramBufferSize := C.size_t(len(p.Content) / C.sizeof_float)
 	log.WithFields(log.Fields{
 		"ElementType": p.ElementType,
-		"ParamSize":   len(p.Content),
+		"ParamSize":   paramBufferSize,
 		"ConfigSize":  len(c),
+		"StateSize":   len(s),
 	}).Info("New Optimizer Created with config:")
 	var cbuffer unsafe.Pointer
-	cbuffer = C.malloc(C.size_t(len(p.Content)))
-	C.memcpy(cbuffer, unsafe.Pointer(&p.Content[0]), C.size_t(len(p.Content)))
+	cbuffer = C.malloc(paramBufferSize)
+
+	C.memcpy(cbuffer, unsafe.Pointer(&p.Content[0]), paramBufferSize)
+	var cstate unsafe.Pointer
+	if len(s) != 0 {
+		cstate = unsafe.Pointer(&s[0])
+	}
+
 	o.opt = C.paddle_create_optimizer((*C.uchar)(&c[0]), C.int(len(c)),
-		C.paddle_element_type(p.ElementType), cbuffer, C.int(len(p.Content)/C.sizeof_float),
-		(*C.char)(nullPtr), 0)
+		C.paddle_element_type(p.ElementType), cbuffer, C.int(paramBufferSize), (*C.char)(cstate), C.int(len(s)))
 	return o
 }
 
@@ -60,6 +68,12 @@ func (o *optimizer) GetWeights() []byte {
 	return cArrayToSlice(buffer, int(bufferLen)*C.sizeof_float)
 }
 
+func (o *optimizer) GetStates() []byte {
+	var cbuffer *C.char
+	cbufferLen := C.paddle_optimizer_get_state(o.opt, &cbuffer)
+	return cArrayToSlice(unsafe.Pointer(cbuffer), int(cbufferLen))
+}
+
 func (o *optimizer) UpdateParameter(g Gradient) error {
 	if o.elementType != g.ElementType {
 		return fmt.Errorf("Name: %s, parameter and gradient element type not match, parameter: %v, gradient: %v", g.Name, o.elementType, g.ElementType)
diff --git a/go/pserver/optimizer_test.go b/go/pserver/optimizer_test.go
index 0b2f4cfa41a630645c128ac13826de9d8b1d521b..d19e9de92e0b33b1d9619adb615a24884097a38f 100644
--- a/go/pserver/optimizer_test.go
+++ b/go/pserver/optimizer_test.go
@@ -19,6 +19,6 @@ func TestOptimizerCreateRelease(t *testing.T) {
 		Param:  p,
 		Config: config,
 	}
-	o := newOptimizer(param)
+	o := newOptimizer(param, nil)
 	o.Cleanup()
 }
diff --git a/go/pserver/service.go b/go/pserver/service.go
index ad16a5708d10bdcb5189a1e1e8abf13c54a72265..6b52d0d896f8bc04fab6c9b68911523cbb7ac8b9 100644
--- a/go/pserver/service.go
+++ b/go/pserver/service.go
@@ -1,9 +1,21 @@
 package pserver
 
 import (
+	"bufio"
+	"bytes"
+	"crypto/md5"
+	"encoding/gob"
+	"encoding/hex"
+	"encoding/json"
 	"errors"
 	"fmt"
+	"os"
+	"path/filepath"
+	"strconv"
 	"sync"
+	"time"
+
+	log "github.com/sirupsen/logrus"
 )
 
 // ElementType is the type of elements of a Parameter.
@@ -39,26 +51,55 @@ type ParameterWithConfig struct {
 	Config []byte // parameter configuration in Proto Buffer format
 }
 
+// ParameterCheckpoint is Parameter and State checkpoint
+type ParameterCheckpoint struct {
+	ParamConfig ParameterWithConfig
+	State       []byte
+}
+
+// checkpoint signature
+type checkpointMeta struct {
+	UUID      string `json:"uuid"`
+	Md5sum    string `json:"md5sum"`
+	Timestamp string `json:"timestamp"`
+}
+
+// Checkpoint is the pserver shard persist in file
+type Checkpoint []ParameterCheckpoint
+
 // Gradient is the gradient of the parameter.
 type Gradient Parameter
 
 // Service is the RPC service for pserver.
 type Service struct {
-	initialized chan struct{}
-	idx         int
-
-	mu     sync.Mutex
-	optMap map[string]*optimizer
+	initialized        chan struct{}
+	idx                int
+	checkpointInterval time.Duration
+	checkpointPath     string
+	client             *EtcdClient
+	mu                 sync.Mutex
+	optMap             map[string]*optimizer
 }
 
 // NewService creates a new service, will bypass etcd registration if no
 // endpoints specified.
-func NewService(idx int) (*Service, error) {
+func NewService(idx int, seconds int, path string, client *EtcdClient, cp Checkpoint) (*Service, error) {
 	s := &Service{
-		idx: idx,
+		idx:                idx,
+		checkpointInterval: time.Second * time.Duration(seconds),
+		checkpointPath:     path,
+		client:             client,
 	}
 	s.optMap = make(map[string]*optimizer)
 	s.initialized = make(chan struct{})
+
+	if cp != nil {
+		for _, item := range cp {
+			p := item.ParamConfig
+			st := item.State
+			s.optMap[p.Param.Name] = newOptimizer(p, st)
+		}
+	}
 	return s, nil
 }
 
@@ -78,7 +119,7 @@ func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) er
 	// TODO(helin): check if paramWithConfigs.Param.Content is
 	// properly memory aligned, if not, make copy to a memory
 	// aligned region.
-	s.optMap[paramWithConfigs.Param.Name] = newOptimizer(paramWithConfigs)
+	s.optMap[paramWithConfigs.Param.Name] = newOptimizer(paramWithConfigs, nil)
 	return nil
 }
 
@@ -139,10 +180,57 @@ func (s *Service) GetParam(name string, parameter *Parameter) error {
 	return nil
 }
 
-// Save tells the parameter server to save parameters.
-func (s *Service) Save(path string, dummy *int) error {
+// pserver save checkpoint
+func (s *Service) doCheckpoint() error {
 	<-s.initialized
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	cp := make([]ParameterCheckpoint, 0, len(s.optMap))
+	index := 0
+	for name, opt := range s.optMap {
+		var pc ParameterCheckpoint
+		pc.ParamConfig.Param.Name = name
+		pc.ParamConfig.Param.ElementType = opt.elementType
+		pc.ParamConfig.Param.Content = opt.GetWeights()
+		pc.State = opt.GetStates()
+		cp[index] = pc
+		index++
+	}
+	var buf bytes.Buffer
+	encoder := gob.NewEncoder(&buf)
+	err := encoder.Encode(cp)
+	if err != nil {
+		return err
+	}
+
+	cpMeta := checkpointMeta{}
+	cpMeta.UUID = s.checkpointPath + strconv.Itoa(s.idx)
+	cpMeta.Timestamp = time.Now().String()
+	h := md5.New()
+	cpMeta.Md5sum = hex.EncodeToString(h.Sum(buf.Bytes()))
 
-	// TODO
+	cpMetajson, _ := json.Marshal(cpMeta)
+	err = s.client.PutKey(filepath.Join(PsCheckpoint, strconv.Itoa(s.idx)), cpMetajson, 3)
+	if err != nil {
+		return err
+	}
+	if _, err = os.Stat(cpMeta.UUID); os.IsNotExist(err) {
+		log.Info("checkpoint does not exists.")
+	} else {
+		err = os.Remove(cpMeta.UUID)
+		log.Infof("checkpoint %s already exsits, removing ", cpMeta.UUID)
+	}
+	f, err := os.Create(cpMeta.UUID)
+	defer f.Close()
+	if err != nil {
+		return err
+	}
+	writer := bufio.NewWriter(f)
+	_, err = writer.Write(buf.Bytes())
+	writer.Flush()
+	if err != nil {
+		return err
+	}
 	return nil
 }
diff --git a/go/pserver/service_test.go b/go/pserver/service_test.go
index b6d20d2c8b7ba0ccd7ab46669a597a21dc11c381..9bf1a48a596f3e3e73a2e4df651855fd5f4e775f 100644
--- a/go/pserver/service_test.go
+++ b/go/pserver/service_test.go
@@ -15,7 +15,8 @@ const (
 )
 
 func TestServiceFull(t *testing.T) {
-	s, err := pserver.NewService(0)
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, 1, "", nil, cp)
 	if err != nil {
 		t.Error(err)
 	}
@@ -86,7 +87,8 @@ func TestServiceFull(t *testing.T) {
 }
 
 func TestMultipleInit(t *testing.T) {
-	s, err := pserver.NewService(0)
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, 1, "", nil, cp)
 	if err != nil {
 		t.Error(err)
 	}
@@ -102,7 +104,8 @@ func TestMultipleInit(t *testing.T) {
 }
 
 func TestUninitialized(t *testing.T) {
-	s, err := pserver.NewService(0)
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, 1, "", nil, cp)
 	err = s.SendGrad(pserver.Gradient{}, nil)
 	if err.Error() != pserver.Uninitialized {
 		t.FailNow()
@@ -110,7 +113,8 @@ func TestUninitialized(t *testing.T) {
 }
 
 func TestBlockUntilInitialized(t *testing.T) {
-	s, err := pserver.NewService(0)
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, 1, "", nil, cp)
 	if err != nil {
 		t.Error(err)
 	}
@@ -128,16 +132,6 @@ func TestBlockUntilInitialized(t *testing.T) {
 		ch <- struct{}{}
 	}()
 
-	wg.Add(1)
-	go func() {
-		err := s.Save("", nil)
-		if err != nil {
-			errCh <- err
-		}
-		wg.Done()
-		ch <- struct{}{}
-	}()
-
 	time.Sleep(50 * time.Millisecond)
 
 	select {
@@ -170,3 +164,7 @@ func TestBlockUntilInitialized(t *testing.T) {
 
 	wg.Wait()
 }
+
+func TestCheckpointSpeed(t *testing.T) {
+	//TODO(zhihong): test speed
+}
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 307e99bbe3a833f1fe26057ec38d0b96e04bc0fe..58a35564f83928ee0bdaad63b154ed57d8d8a735 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -15,6 +15,7 @@ if(Boost_FOUND)
   add_subdirectory(memory)
   add_subdirectory(platform)
   add_subdirectory(framework)
+  add_subdirectory(pybind)
 endif()
 
 if(WITH_C_API)
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index 4409c6feae218222b7c0216760cebe4ae8e235cb..aac49fdb7a04ac566ad24c6d17f9af991241e45b 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -11,8 +11,14 @@ proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
 cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
 proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
 cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
-cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_proto op_desc)
+cc_library(operator SRCS operator.cc DEPS op_desc protobuf)
+cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
+cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc)
+cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry operator)
 py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto)
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)
+
+proto_library(net_proto SRCS net_proto.proto DEPS op_proto)
+cc_library(net SRCS net.cc DEPS net_proto)
diff --git a/paddle/framework/dim.h b/paddle/framework/dim.h
index bcde291d12d429a3f2cd41fa6d0ee606c7c9c92f..883fdc55eb929ebc51e8ae05938e9d07374406ce 100644
--- a/paddle/framework/dim.h
+++ b/paddle/framework/dim.h
@@ -266,29 +266,6 @@ HOSTDEVICE inline bool contained(const Dim<1>& idx, const Dim<1>& size) {
   return ((0 <= idx.head) && (idx.head < size.head));
 }
 
-/**
- * \brief Check if a size and a stride create a Fortran order contiguous
- * block of memory.
- */
-template <int i>
-HOST bool contiguous(const Dim<i>& size, const Dim<i>& stride, int mul = 1) {
-  if (product(size) == 0) return true;
-  int contiguous_stride = get<0>(size) == 1 ? 0 : mul;
-  return (get<0>(stride) == contiguous_stride &&
-          contiguous(size.tail, stride.tail, mul * get<0>(size)));
-}
-
-///\cond HIDDEN
-// Base case of contiguous, check the nth stride is the size of
-// the prefix multiply of n-1 dims.
-template <>
-inline bool contiguous(const Dim<1>& size, const Dim<1>& stride, int mul) {
-  if (get<0>(size) == 0) return true;
-  int contiguous_stride = get<0>(size) == 1 ? 0 : mul;
-  return get<0>(stride) == contiguous_stride;
-}
-///\endcond
-
 /**
  * \brief Compute exclusive prefix-multiply of a Dim.
  */
@@ -306,31 +283,6 @@ HOSTDEVICE inline Dim<1> ex_prefix_mul(const Dim<1>& src, int mul) {
 }
 ///\endcond
 
-/**
- * \brief Calculate strides of a contiguous array of the given size
- *
- * Sets the stride for any dimension with an extent of 1 to 0.
- * \param size Dim object containing the size of the array.
- * \param base The base stride to use.
- * \return Dim object the same size as \p size with the strides.
- */
-template <int i>
-HOSTDEVICE Dim<i> contiguous_strides(const Dim<i>& size, int base = 1) {
-  int stride = size.head == 1 ? 0 : base;
-  return Dim<i>(stride, contiguous_strides(size.tail, base * size.head));
-}
-
-///\cond HIDDEN
-
-// Base case of contiguous_strides
-template <>
-HOSTDEVICE inline Dim<1> contiguous_strides(const Dim<1>& size, int base) {
-  int stride = size.head == 1 ? 0 : base;
-  return Dim<1>(stride);
-}
-
-///\endcond
-
 /**
  * Add two dimensions together
  */
diff --git a/paddle/framework/dim_test.cu b/paddle/framework/dim_test.cu
index 809bf04826637195425a32c054c94e00ef940df9..05217415196f3ec3ce9b5de7cb2f82c9de960ba7 100644
--- a/paddle/framework/dim_test.cu
+++ b/paddle/framework/dim_test.cu
@@ -58,24 +58,6 @@ TEST(Dim, Equality) {
     EXPECT_EQ(paddle::framework::get<1>(c), 3);
     EXPECT_EQ(paddle::framework::get<2>(c), 12);
 
-    // contiguous_strides
-    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(10, 1, 10));
-    EXPECT_EQ(paddle::framework::get<0>(c), 1);
-    EXPECT_EQ(paddle::framework::get<1>(c), 0);
-    EXPECT_EQ(paddle::framework::get<2>(c), 10);
-    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(10, 10, 1));
-    EXPECT_EQ(paddle::framework::get<0>(c), 1);
-    EXPECT_EQ(paddle::framework::get<1>(c), 10);
-    EXPECT_EQ(paddle::framework::get<2>(c), 0);
-    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(1, 10, 10));
-    EXPECT_EQ(paddle::framework::get<0>(c), 0);
-    EXPECT_EQ(paddle::framework::get<1>(c), 1);
-    EXPECT_EQ(paddle::framework::get<2>(c), 10);
-    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(2, 3, 4));
-    EXPECT_EQ(paddle::framework::get<0>(c), 1);
-    EXPECT_EQ(paddle::framework::get<1>(c), 2);
-    EXPECT_EQ(paddle::framework::get<2>(c), 6);
-
     // generate from an index
     auto size = paddle::framework::make_dim(4, 5, 2);
     c = paddle::framework::Dim<3>(14, size);
@@ -101,16 +83,6 @@ TEST(Dim, Bool) {
     EXPECT_TRUE(a == a);
     EXPECT_FALSE(a == b);
     EXPECT_TRUE(a == c);
-
-    // contiguous check
-    int x = 4, y = 5, z = 2;
-    paddle::framework::Dim<3> sizef(x, y, z);
-    paddle::framework::Dim<3> stridea(1, x, x*y);
-    paddle::framework::Dim<3> strideb(2, 2*x, 2*x*y);
-    paddle::framework::Dim<3> stridec(1, x, 2*x*y);
-    EXPECT_TRUE(paddle::framework::contiguous(sizef, stridea));
-    EXPECT_FALSE(paddle::framework::contiguous(sizef, strideb));
-    EXPECT_FALSE(paddle::framework::contiguous(sizef, stridec));
 }
 
 TEST(Dim, Print) {
diff --git a/paddle/framework/net.cc b/paddle/framework/net.cc
new file mode 100644
index 0000000000000000000000000000000000000000..73b3051235ee90b31bd65acb22f454fc13d64da9
--- /dev/null
+++ b/paddle/framework/net.cc
@@ -0,0 +1,20 @@
+#include "paddle/framework/net.h"
+
+namespace paddle {
+namespace framework {
+
+PlainNet::PlainNet(const NetDesc& def) {}
+
+void PlainNet::InferShape(Scope* scope) {
+  for (auto& op : ops_) {
+    op.InferShape();
+  }
+}
+
+void PlainNet::Run(std::shared_ptr<Scope> scope, DeviceContext* ctx) {
+  for (auto& op : ops_) {
+    op.Run(ctx);
+  }
+}
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/net.h b/paddle/framework/net.h
new file mode 100644
index 0000000000000000000000000000000000000000..76992e07282904fd1074bb0ced2367a8d20e3ec2
--- /dev/null
+++ b/paddle/framework/net.h
@@ -0,0 +1,171 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/net_proto.pb.h"
+#include "paddle/framework/op_proto.pb.h"
+#include "paddle/framework/scope.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+using namespace paddle::platform;
+
+// operator's index stored in a network.
+typedef int OpIndex;
+/**
+ * NOTE following codes are some definitions of unimplemented concepts.
+ * We write some basic implementation to make Net compilable. These APIs will
+ * keep updating if the concepts related are implemented.
+ */
+
+struct OpDesc;
+struct OpAttrs {};
+
+class Operator {
+ public:
+  Operator(const OpDesc &def) {}
+  void InferShape() {}
+  void Run(DeviceContext *ctx) {}
+};
+
+/**
+ * @brief Network that manage the operators it has.
+ *
+ * Network is the container and controller of a set of operators, user can build
+ * a real network from a NetDesc which is a protobuf message and use
+ * Network.Run() * to run all the operators in the network.
+
+ * A network object knows all Operators belonging to this network. Variables,
+ * which are inputs and outputs of these operators, are created and managed by a
+ * hierarchy of Scope objects.
+ *
+ * This is the base class of network, all the networks should implement the apis
+ * it defines.
+ */
+class Net {
+ public:
+  /**
+   * @brief Infer shapes of all inputs and outputs of operators.
+   */
+  virtual void InferShape(Scope *scope) = 0;
+  /**
+   * @brief Run the network.
+   *
+   * Run all the operators and return success(true) or not, with all the
+   * variables are located in `scope`. `context` describes the detail execution
+   * environment for ops. `begin` and `end` specify the scope of `ops_` to run,
+   * If no positive indexes are provided, all operators in `ops_` will run.
+   */
+  virtual void Run(std::shared_ptr<Scope> scope, DeviceContext *ctx) = 0;
+
+  /**
+   * @brief Add an Operator according to `def`.
+   */
+  virtual OpIndex AddOp(const OpProto &def) = 0;
+
+  /**
+   * @brief Add optimizer operators acctording to `attrs`.
+   */
+  virtual void AddOptimizerOps(const OpAttrs &attrs) = 0;
+
+  /**
+   * @brief Add backward operators.
+   */
+  virtual void AddBackwardOps() = 0;
+
+  /**
+   * @brief Create a network.
+   */
+  static std::unique_ptr<Net> Create(const NetDesc &def = NetDesc());
+
+  virtual ~Net() {}
+};
+
+/**
+ * @brief a basic implementation of Net.
+ *
+ * PlainNet is a very simple Net, it create a list of operators, and run them
+ * sequentially following the order they added.
+ */
+class PlainNet : public Net {
+ public:
+  /**
+   * @brief Initialize a PlainNet.
+   *
+   * Initialize from  a network describe by `def`. NetDesc is the definition of
+   * a network.
+   */
+  PlainNet(const NetDesc &def);
+
+  /**
+   * Infer all the operators' input and output varialbes' shapes, will be called
+   * before every mini-batch
+   */
+  virtual void InferShape(Scope *scope) override;
+
+  /**
+   * @brief Run the network.
+   *
+   * Run all the operators with the `scope`, if no scope is provided, default
+   * scope will be used instead. If no OpContext is provicded, default context
+   * will be used.
+   */
+  virtual void Run(std::shared_ptr<Scope> scope, DeviceContext *ctx) override;
+
+  /**
+   * @brief Add an operator to this network.
+   */
+  virtual OpIndex AddOp(const OpProto &def) override;
+
+  /**
+   * @brief Add all optimizer operators related into the network.
+   */
+  virtual void AddOptimizerOps(const OpAttrs &attrs) override;
+
+  /**
+   * @brief Add all backward operators related into the network.
+   */
+  virtual void AddBackwardOps() override;
+
+  virtual ~PlainNet() override {}
+
+ protected:
+  /**
+   * @brief Build the network.
+   *
+   * Create operators accordding to `def`, will be called by the constructor.
+   */
+  void BuildNet(const NetDesc &def);
+
+  /**
+   * @brief Add an operator into this network.
+   *
+   * Add a operator which is identified as `type` and has attributes described
+   * in `attrs`, the `inputs` are the keys of readonly input variables,
+   * `outputs` are keys of mutable output variables. An `OpIndex` will be
+   * returned to indicate the offset of the new operator in `ops_`.
+   */
+  OpIndex AddOp(const std::string &type, const std::vector<std::string> &inputs,
+                const std::vector<std::string> &outputs,
+                const OpAttrs &attrs = OpAttrs());
+
+ private:
+  // the operators owned by `Network`.
+  std::vector<Operator> ops_;
+};
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/net_proto.proto b/paddle/framework/net_proto.proto
new file mode 100644
index 0000000000000000000000000000000000000000..0779f49fe2a9a6d0d1ea5ec11ba3befeb0a67fa1
--- /dev/null
+++ b/paddle/framework/net_proto.proto
@@ -0,0 +1,15 @@
+syntax="proto2";
+package paddle.framework;
+
+import "op_proto.proto";
+
+message NetDesc {
+  // network identification
+  optional string name = 1;
+  // operator contains in network
+  repeated OpProto operators = 2;
+  // network type to run with. e.g "plainNet", "DAG"
+  optional string net_type = 3;
+  // num worker always
+  optional int32 num_workers = 4;
+}
diff --git a/paddle/framework/net_test.cc b/paddle/framework/net_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a8e31c1497519ce60da004bc0a3e52403593497c
--- /dev/null
+++ b/paddle/framework/net_test.cc
@@ -0,0 +1,24 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/net.h"
+#include "paddle/framework/op_registry.h"
+
+#include <gtest/gtest.h>
+
+namespace paddle {
+namespace framework {
+class FakeFC : public Operator {}
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/framework/op_registry.cc b/paddle/framework/op_registry.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4b35e04e681b414c36cf6d9aee9e64dd68ba5da9
--- /dev/null
+++ b/paddle/framework/op_registry.cc
@@ -0,0 +1,36 @@
+#include <paddle/framework/op_registry.h>
+
+namespace paddle {
+namespace framework {
+
+template <>
+void AttrTypeHelper::SetAttrType<int>(AttrProto* attr) {
+  attr->set_type(paddle::framework::AttrType::INT);
+}
+
+template <>
+void AttrTypeHelper::SetAttrType<float>(AttrProto* attr) {
+  attr->set_type(paddle::framework::AttrType::FLOAT);
+}
+
+template <>
+void AttrTypeHelper::SetAttrType<std::string>(AttrProto* attr) {
+  attr->set_type(paddle::framework::AttrType::STRING);
+}
+
+template <>
+void AttrTypeHelper::SetAttrType<std::vector<int>>(AttrProto* attr) {
+  attr->set_type(paddle::framework::AttrType::INTS);
+}
+
+template <>
+void AttrTypeHelper::SetAttrType<std::vector<float>>(AttrProto* attr) {
+  attr->set_type(paddle::framework::AttrType::FLOATS);
+}
+
+template <>
+void AttrTypeHelper::SetAttrType<std::vector<std::string>>(AttrProto* attr) {
+  attr->set_type(paddle::framework::AttrType::STRINGS);
+}
+}  // namespace framework
+}  // namespace paddle
\ No newline at end of file
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index 81241b5342d8900c205dd62f2a62dc2496010560..02c99d50bb50cbd49a56a2282e55c148d4e6af16 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -1,26 +1,14 @@
 #pragma once
 
+#include <algorithm>
 #include "paddle/framework/attr_checker.h"
-
-//#include "paddle/framework/op_base.h"
 #include "paddle/framework/op_desc.pb.h"
 #include "paddle/framework/op_proto.pb.h"
+#include "paddle/framework/operator.h"
 
 namespace paddle {
 namespace framework {
 
-//==================For test================//
-class OpBase {
- public:
-  std::vector<std::string> inputs_;
-  std::vector<std::string> outputs_;
-  AttributeMap attr_map_;
-
-  virtual std::string Run() const = 0;
-  virtual ~OpBase() {}
-};
-//=========================================//
-
 // helper class to set attribute type
 struct AttrTypeHelper {
   template <typename T>
@@ -64,36 +52,6 @@ struct AttrTypeHelper {
   }
 };
 
-template <>
-void AttrTypeHelper::SetAttrType<int>(AttrProto* attr) {
-  attr->set_type(paddle::framework::AttrType::INT);
-}
-
-template <>
-void AttrTypeHelper::SetAttrType<float>(AttrProto* attr) {
-  attr->set_type(paddle::framework::AttrType::FLOAT);
-}
-
-template <>
-void AttrTypeHelper::SetAttrType<std::string>(AttrProto* attr) {
-  attr->set_type(paddle::framework::AttrType::STRING);
-}
-
-template <>
-void AttrTypeHelper::SetAttrType<std::vector<int>>(AttrProto* attr) {
-  attr->set_type(paddle::framework::AttrType::INTS);
-}
-
-template <>
-void AttrTypeHelper::SetAttrType<std::vector<float>>(AttrProto* attr) {
-  attr->set_type(paddle::framework::AttrType::FLOATS);
-}
-
-template <>
-void AttrTypeHelper::SetAttrType<std::vector<std::string>>(AttrProto* attr) {
-  attr->set_type(paddle::framework::AttrType::STRINGS);
-}
-
 // this class not only make proto but also init attribute checkers.
 class OpProtoAndCheckerMaker {
  public:
@@ -103,22 +61,22 @@ class OpProtoAndCheckerMaker {
  protected:
   void AddInput(const std::string& name, const std::string& comment) {
     auto input = proto_->mutable_inputs()->Add();
-    *(input->mutable_name()) = name;
-    *(input->mutable_comment()) = comment;
+    *input->mutable_name() = name;
+    *input->mutable_comment() = comment;
   }
 
   void AddOutput(const std::string& name, const std::string& comment) {
     auto output = proto_->mutable_outputs()->Add();
-    *(output->mutable_name()) = name;
-    *(output->mutable_comment()) = comment;
+    *output->mutable_name() = name;
+    *output->mutable_comment() = comment;
   }
 
   template <typename T>
   TypedAttrChecker<T>& AddAttr(const std::string& name,
                                const std::string& comment) {
     auto attr = proto_->mutable_attrs()->Add();
-    *(attr->mutable_name()) = name;
-    *(attr->mutable_comment()) = comment;
+    *attr->mutable_name() = name;
+    *attr->mutable_comment() = comment;
     AttrTypeHelper::SetAttrType<T>(attr);
     return op_checker_->AddAttrChecker<T>(name);
   }
@@ -134,49 +92,52 @@ class OpProtoAndCheckerMaker {
 };
 
 class OpRegistry {
-  typedef std::function<OpBase*()> OpCreator;
+  using OpCreator = std::function<OperatorBase*()>;
 
  public:
   template <typename OpType, typename ProtoMakerType>
   static void RegisterOp(const std::string& op_type) {
-    creators_[op_type] = []() { return new OpType; };
-    OpProto& op_proto = protos_[op_type];
-    OpAttrChecker& op_checker = op_checkers_[op_type];
+    creators()[op_type] = [] { return new OpType; };
+    OpProto& op_proto = protos()[op_type];
+    OpAttrChecker& op_checker = op_checkers()[op_type];
     ProtoMakerType(&op_proto, &op_checker);
-    PADDLE_ENFORCE(op_proto.IsInitialized() == true,
+    PADDLE_ENFORCE(op_proto.IsInitialized(),
                    "Fail to initialize %s's OpProto !", op_type);
   }
 
-  static OpBase* CreateOp(const OpDesc& op_desc) {
+  static OperatorBase* CreateOp(const OpDesc& op_desc) {
     std::string op_type = op_desc.type();
-    OpBase* op = (creators_.at(op_type))();
-    (op->inputs_).resize(op_desc.inputs_size());
-    for (int i = 0; i < op_desc.inputs_size(); ++i) {
-      (op->inputs_)[i] = op_desc.inputs(i);
-    }
-    (op->outputs_).resize(op_desc.outputs_size());
-    for (int i = 0; i < op_desc.outputs_size(); ++i) {
-      (op->outputs_)[i] = op_desc.outputs(i);
+    OperatorBase* op = creators().at(op_type)();
+    op->desc_ = op_desc;
+    op->inputs_.reserve((size_t)op_desc.inputs_size());
+    std::copy(op_desc.inputs().begin(), op_desc.inputs().end(),
+              std::back_inserter(op->inputs_));
+    op->outputs_.reserve((size_t)op_desc.outputs_size());
+    std::copy(op_desc.outputs().begin(), op_desc.outputs().end(),
+              std::back_inserter(op->outputs_));
+    for (auto& attr : op_desc.attrs()) {
+      op->attrs_[attr.name()] = AttrTypeHelper::GetAttrValue(attr);
     }
-    for (int i = 0; i < op_desc.attrs_size(); ++i) {
-      const AttrDesc& ith_attr = op_desc.attrs(i);
-      std::string name = ith_attr.name();
-      (op->attr_map_)[name] = AttrTypeHelper::GetAttrValue(ith_attr);
-    }
-    const OpAttrChecker& op_checker = op_checkers_.at(op_type);
-    op_checker.Check(op->attr_map_);
+    op_checkers().at(op_type).Check(op->attrs_);
     return op;
   }
 
  private:
-  static std::unordered_map<std::string, OpCreator> creators_;
-  static std::unordered_map<std::string, OpProto> protos_;
-  static std::unordered_map<std::string, OpAttrChecker> op_checkers_;
-};
+  static std::unordered_map<std::string, OpCreator>& creators() {
+    static std::unordered_map<std::string, OpCreator> creators_;
+    return creators_;
+  }
+
+  static std::unordered_map<std::string, OpProto>& protos() {
+    static std::unordered_map<std::string, OpProto> protos_;
+    return protos_;
+  };
 
-std::unordered_map<std::string, std::function<OpBase*()>> OpRegistry::creators_;
-std::unordered_map<std::string, OpProto> OpRegistry::protos_;
-std::unordered_map<std::string, OpAttrChecker> OpRegistry::op_checkers_;
+  static std::unordered_map<std::string, OpAttrChecker>& op_checkers() {
+    static std::unordered_map<std::string, OpAttrChecker> op_checkers_;
+    return op_checkers_;
+  };
+};
 
 template <typename OpType, typename ProtoMakerType>
 class OpRegisterHelper {
@@ -194,60 +155,5 @@ class OpRegisterHelper {
   const OpRegisterHelper<__op_class, __op_maker_class>               \
       __op_class##Register::reg(#__op_type);
 
-// Demos
-
-class CosineOp : public OpBase {
- public:
-  virtual std::string Run() const {
-    std::string msg = "CosineOp runs! scale = " +
-                      std::to_string(boost::get<float>(attr_map_.at("scale")));
-    return msg;
-  }
-};
-
-class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
- public:
-  CosineOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("input", "input of cosine op");
-    AddOutput("output", "output of cosine op");
-    AddAttr<float>("scale", "scale of cosine op")
-        .SetDefault(1.0)
-        .LargerThan(0.0);
-    AddType("cos");
-    AddComment("This is cos op");
-  }
-};
-
-REGISTER_OP(CosineOp, CosineOpProtoAndCheckerMaker, cos_sim)
-
-class MyTestOp : public OpBase {
- public:
-  virtual std::string Run() const {
-    std::string msg =
-        "MyTestOp runs! test_attr = " +
-        std::to_string(boost::get<int>(attr_map_.at("test_attr")));
-    return msg;
-  }
-};
-
-class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
- public:
-  MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("input", "input of cosine op");
-    AddOutput("output", "output of cosine op");
-    auto my_checker = [](int i) {
-      PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!");
-    };
-    AddAttr<int>("test_attr", "a simple test attribute")
-        .AddCustomChecker(my_checker);
-    AddType("my_test_op");
-    AddComment("This is my_test op");
-  }
-};
-
-REGISTER_OP(MyTestOp, MyTestOpProtoAndCheckerMaker, my_test_op)
-
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc
index 17849ca0191db644884e766342b30461abf50298..f5d45a80bb8e9fa095e7d6adc6370918b3f87f5a 100644
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
@@ -1,25 +1,81 @@
 #include "paddle/framework/op_registry.h"
 #include <gtest/gtest.h>
 
+using namespace paddle::framework;
+
+namespace paddle {
+namespace framework {
+class CosineOp : public OperatorBase {
+ public:
+  void Run(const std::shared_ptr<Scope>& scope,
+           const platform::DeviceContext& dev_ctx) const override {}
+  void InferShape(const std::shared_ptr<Scope>& scope) const override {}
+};
+
+class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+ public:
+  CosineOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("input", "input of cosine op");
+    AddOutput("output", "output of cosine op");
+    AddAttr<float>("scale", "scale of cosine op")
+        .SetDefault(1.0)
+        .LargerThan(0.0);
+    AddType("cos");
+    AddComment("This is cos op");
+  }
+};
+
+REGISTER_OP(CosineOp, CosineOpProtoAndCheckerMaker, cos_sim)
+
+class MyTestOp : public OperatorBase {
+ public:
+  void InferShape(const std::shared_ptr<Scope>& scope) const override {}
+  void Run(const std::shared_ptr<Scope>& scope,
+           const platform::DeviceContext& dev_ctx) const override {}
+
+ public:
+};
+
+class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+ public:
+  MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("input", "input of cosine op");
+    AddOutput("output", "output of cosine op");
+    auto my_checker = [](int i) {
+      PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!");
+    };
+    AddAttr<int>("test_attr", "a simple test attribute")
+        .AddCustomChecker(my_checker);
+    AddType("my_test_op");
+    AddComment("This is my_test op");
+  }
+};
+
+REGISTER_OP(MyTestOp, MyTestOpProtoAndCheckerMaker, my_test_op)
+}  // namespace framework
+}  // namespace paddle
+
 TEST(OpRegistry, CreateOp) {
   paddle::framework::OpDesc op_desc;
   op_desc.set_type("cos_sim");
   op_desc.add_inputs("aa");
   op_desc.add_outputs("bb");
 
+  float scale = 3.3;
   auto attr = op_desc.mutable_attrs()->Add();
   attr->set_name("scale");
   attr->set_type(paddle::framework::AttrType::FLOAT);
-  attr->set_f(3.3);
+  attr->set_f(scale);
 
-  paddle::framework::OpBase* op =
+  paddle::framework::OperatorBase* op =
       paddle::framework::OpRegistry::CreateOp(op_desc);
-  std::string debug_str = op->Run();
-  std::string str = "CosineOp runs! scale = " + std::to_string(3.3);
-  ASSERT_EQ(str.size(), debug_str.size());
-  for (size_t i = 0; i < debug_str.length(); ++i) {
-    ASSERT_EQ(debug_str[i], str[i]);
-  }
+  auto scope = std::make_shared<Scope>();
+  paddle::platform::CPUDeviceContext dev_ctx;
+  op->Run(scope, dev_ctx);
+  float scale_get = op->GetAttr<float>("scale");
+  ASSERT_EQ(scale_get, scale);
 }
 
 TEST(OpRegistry, IllegalAttr) {
@@ -35,7 +91,7 @@ TEST(OpRegistry, IllegalAttr) {
 
   bool caught = false;
   try {
-    paddle::framework::OpBase* op __attribute__((unused)) =
+    paddle::framework::OperatorBase* op __attribute__((unused)) =
         paddle::framework::OpRegistry::CreateOp(op_desc);
   } catch (paddle::framework::EnforceNotMet err) {
     caught = true;
@@ -54,15 +110,14 @@ TEST(OpRegistry, DefaultValue) {
   op_desc.add_inputs("aa");
   op_desc.add_outputs("bb");
 
-  paddle::framework::OpBase* op =
+  ASSERT_TRUE(op_desc.IsInitialized());
+
+  paddle::framework::OperatorBase* op =
       paddle::framework::OpRegistry::CreateOp(op_desc);
-  std::string debug_str = op->Run();
-  float default_value = 1.0;
-  std::string str = "CosineOp runs! scale = " + std::to_string(default_value);
-  ASSERT_EQ(str.size(), debug_str.size());
-  for (size_t i = 0; i < debug_str.length(); ++i) {
-    ASSERT_EQ(debug_str[i], str[i]);
-  }
+  auto scope = std::make_shared<Scope>();
+  paddle::platform::CPUDeviceContext dev_ctx;
+  op->Run(scope, dev_ctx);
+  ASSERT_EQ(op->GetAttr<float>("scale"), 1.0);
 }
 
 TEST(OpRegistry, CustomChecker) {
@@ -74,7 +129,7 @@ TEST(OpRegistry, CustomChecker) {
   // attr 'test_attr' is not set
   bool caught = false;
   try {
-    paddle::framework::OpBase* op __attribute__((unused)) =
+    paddle::framework::OperatorBase* op __attribute__((unused)) =
         paddle::framework::OpRegistry::CreateOp(op_desc);
   } catch (paddle::framework::EnforceNotMet err) {
     caught = true;
@@ -93,7 +148,7 @@ TEST(OpRegistry, CustomChecker) {
   attr->set_i(3);
   caught = false;
   try {
-    paddle::framework::OpBase* op __attribute__((unused)) =
+    paddle::framework::OperatorBase* op __attribute__((unused)) =
         paddle::framework::OpRegistry::CreateOp(op_desc);
   } catch (paddle::framework::EnforceNotMet err) {
     caught = true;
@@ -111,12 +166,16 @@ TEST(OpRegistry, CustomChecker) {
   attr->set_name("test_attr");
   attr->set_type(paddle::framework::AttrType::INT);
   attr->set_i(4);
-  paddle::framework::OpBase* op =
+  paddle::framework::OperatorBase* op =
       paddle::framework::OpRegistry::CreateOp(op_desc);
-  std::string debug_str = op->Run();
-  std::string str = "MyTestOp runs! test_attr = " + std::to_string(4);
-  ASSERT_EQ(str.size(), debug_str.size());
-  for (size_t i = 0; i < debug_str.length(); ++i) {
-    ASSERT_EQ(debug_str[i], str[i]);
-  }
+  paddle::platform::CPUDeviceContext dev_ctx;
+  auto scope = std::make_shared<Scope>();
+  op->Run(scope, dev_ctx);
+  int test_attr = op->GetAttr<int>("test_attr");
+  ASSERT_EQ(test_attr, 4);
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
\ No newline at end of file
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8f7adff8b3982e91a3d7f6d598cd62d5005d5f17
--- /dev/null
+++ b/paddle/framework/operator.cc
@@ -0,0 +1,43 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace framework {
+
+std::string OperatorBase::DebugString() const {
+  std::stringstream ss;
+  ss << "=================\n";
+  ss << "type = " << desc_.type() << "\n";
+  ss << "inputs = [";
+  for (auto& ipt : inputs_) {
+    ss << ipt << ", ";
+  }
+  ss << "]\n";
+  ss << "outputs = [";
+  for (auto& opt : outputs_) {
+    ss << opt << ", ";
+  }
+  ss << "]\n";
+  ss << "attr_keys = [";
+  for (auto& attr : attrs_) {
+    ss << attr.first << ", ";
+  }
+  ss << "]\n";
+  return ss.str();
+}
+
+}  // namespace framework
+}  // namespace paddle
\ No newline at end of file
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
new file mode 100644
index 0000000000000000000000000000000000000000..0ce422e007c39ce0c3f5f7a89650cc211919ea8f
--- /dev/null
+++ b/paddle/framework/operator.h
@@ -0,0 +1,150 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <paddle/framework/attr_checker.h>
+#include <paddle/framework/op_desc.pb.h>
+#include <paddle/framework/scope.h>
+#include <paddle/platform/device_context.h>
+#include <paddle/platform/place.h>
+#include <paddle/utils/Error.h>
+#include <boost/variant.hpp>
+#include <string>
+#include <unordered_map>
+#include <vector>
+
+namespace paddle {
+namespace framework {
+
+class OperatorBase;
+
+/**
+ * OperatorBase has the basic element that Net will call to do computation.
+ * Only CreateOperator from OpRegistry will new Operator directly. User
+ * should always construct a proto message OpDesc and call
+ * OpRegistry::CreateOp(op_desc) to get an Operator instance.
+ */
+class OperatorBase {
+ public:
+  virtual ~OperatorBase() {}
+
+  template <typename T>
+  inline const T& GetAttr(const std::string& name) const {
+    PADDLE_ENFORCE(attrs_.count(name) != 0, "%s should be in AttributeMap",
+                   name);
+    return boost::get<T>(attrs_.at(name));
+  }
+
+  std::string DebugString() const;
+
+  /// InferShape infer the size of Variables used by this Operator with
+  /// information inside scope
+  virtual void InferShape(const std::shared_ptr<Scope>& scope) const = 0;
+
+  /// Net will call this function to Run an op.
+  virtual void Run(const std::shared_ptr<Scope>& scope,
+                   const platform::DeviceContext& dev_ctx) const = 0;
+
+ protected:
+  std::string Type() const { return desc_.type(); }
+
+ public:
+  OpDesc desc_;
+  std::vector<std::string> inputs_;
+  std::vector<std::string> outputs_;
+  AttributeMap attrs_;
+};
+
+class OpKernel {
+ public:
+  /**
+   * KernelContext is the only parameter of Kernel Run function.
+   * Run will get input/output variables, state such as momentum and
+   * device resource such as CUDA stream, cublas handle, etc. from
+   * KernelContext. User should construct it before run the Operator.
+   */
+  class KernelContext {
+   public:
+    KernelContext(const OperatorBase* op, const std::shared_ptr<Scope>& scope,
+                  const platform::DeviceContext& device_context)
+        : op_(*op), scope_(scope), device_context_(device_context) {}
+
+    const Variable* Input(int index) const {
+      return scope_->GetVariable(op_.inputs_[index]);
+    }
+
+    Variable* Output(int index) const {
+      return scope_->GetVariable(op_.outputs_[index]);
+    }
+
+    const OperatorBase& op_;
+    const std::shared_ptr<Scope>& scope_;
+    const platform::DeviceContext& device_context_;
+  };
+
+  virtual void Compute(const KernelContext& context) const = 0;
+
+  virtual ~OpKernel() {}
+};
+
+class OperatorWithKernel : public OperatorBase {
+ public:
+  struct OpKernelKey {
+    platform::Place place_;
+
+    OpKernelKey() = default;
+    OpKernelKey(const platform::DeviceContext& dev_ctx) {
+      place_ = dev_ctx.GetPlace();
+    }
+
+    bool operator==(const OpKernelKey& o) const { return place_ == o.place_; }
+  };
+
+  struct OpKernelHash {
+    std::hash<bool> hash_;
+    size_t operator()(const OpKernelKey& key) const {
+      return hash_(platform::is_gpu_place(key.place_));
+    }
+  };
+
+  using OpKernelMap =
+      std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;
+
+  void Run(const std::shared_ptr<Scope>& scope,
+           const platform::DeviceContext& dev_ctx) const final {
+    auto& opKernel = AllOpKernels().at(Type()).at(OpKernelKey(dev_ctx));
+    opKernel->Compute(OpKernel::KernelContext(this, scope, dev_ctx));
+  }
+
+  static std::unordered_map<std::string /* op_type */, OpKernelMap>&
+  AllOpKernels() {
+    static std::unordered_map<std::string, OpKernelMap> g_all_op_kernels;
+    return g_all_op_kernels;
+  };
+};
+
+}  // namespace framework
+}  // namespace paddle
+
+#define REGISTER_OP_KERNEL(type, PlaceType, KernelType)                   \
+  struct __op_kernel_register__##type##__ {                               \
+    __op_kernel_register__##type##__() {                                  \
+      ::paddle::framework::OperatorWithKernel::OpKernelKey key;           \
+      key.place_ = PlaceType();                                           \
+      ::paddle::framework::OperatorWithKernel::AllOpKernels()[#type][key] \
+          .reset(new KernelType());                                       \
+    }                                                                     \
+  };                                                                      \
+  static __op_kernel_register__##type##__ __reg_kernel_##type##__
diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..86f45f108a5d2189894fd59483a84b039a010ab3
--- /dev/null
+++ b/paddle/framework/operator_test.cc
@@ -0,0 +1,73 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/operator.h"
+#include "gtest/gtest.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace framework {
+
+class OperatorTest : public OperatorBase {
+ public:
+  void InferShape(const std::shared_ptr<Scope>& scope) const override {}
+  void Run(const std::shared_ptr<Scope>& scope,
+           const platform::DeviceContext& dev_ctx) const override {
+    float scale = GetAttr<float>("scale");
+    ASSERT_NEAR(scale, 3.14, 1e-5);
+    ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr);
+    ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr);
+  }
+};
+
+class OperatorTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+ public:
+  OperatorTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("input", "input of test op");
+    AddOutput("output", "output of test op");
+    AddAttr<float>("scale", "scale of cosine op")
+        .SetDefault(1.0)
+        .LargerThan(0.0);
+    AddType("test_operator");
+    AddComment("This is test op");
+  }
+};
+
+REGISTER_OP(OperatorTest, OperatorTestProtoAndCheckerMaker, test_operator)
+
+TEST(OperatorBase, all) {
+  OpDesc op_desc;
+  op_desc.set_type("test_operator");
+  *op_desc.mutable_inputs()->Add() = "IN1";
+  *op_desc.mutable_outputs()->Add() = "OUT1";
+  auto attr = op_desc.mutable_attrs()->Add();
+  attr->set_name("scale");
+  attr->set_type(paddle::framework::AttrType::FLOAT);
+  float scale = 3.14;
+  attr->set_f(scale);
+
+  platform::CPUDeviceContext device_context;
+  auto scope = std::make_shared<Scope>();
+
+  OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc);
+  ASSERT_EQ(op->GetAttr<float>("scale"), scale);
+  scope->CreateVariable("OUT1");
+  op->Run(scope, device_context);
+  std::cout << op->DebugString() << std::endl;
+  delete op;
+}
+
+}  // namespace framework
+}  // namespace paddle
\ No newline at end of file
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index ce5d98b04e6b53fcedc4fc4610d9390e64846b2a..a0945e8055625ca4c21ea1c3fa9f27321ca9ba3c 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #pragma once
 
+#include <cstdint>
 #include <memory>
 #include <type_traits>
 #include "paddle/framework/ddim.h"
@@ -26,31 +27,65 @@ namespace framework {
 
 class Tensor {
  public:
+  Tensor() : offset_(0) {}
+
+  explicit Tensor(const DDim& dims) : dims_(dims), offset_(0) {}
+
   template <typename T>
   const T* data() const {
-    PADDLE_ENFORCE(holder_ != nullptr,
-                   "Tensor::data must be called after Tensor::mutable_data.");
-    return static_cast<const T*>(holder_->Ptr());
+    PADDLE_ENFORCE(
+        holder_ != nullptr,
+        "Tenosr has not been initialized. Call Tensor::mutable_data first.");
+    return reinterpret_cast<const T*>(
+        reinterpret_cast<uintptr_t>(holder_->Ptr()) + offset_);
   }
 
   template <typename T,  // must be POD types
             typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
   T* mutable_data(DDim dims, paddle::platform::Place place) {
+    dims_ = dims;
     if (holder_ == nullptr ||
         !(holder_->Place() ==
           place) /* some versions of boost::variant don't have operator!= */
-        || holder_->Size() < product(dims) * sizeof(T)) {
+        || holder_->Size() < product(dims) * sizeof(T) + offset_) {
       holder_.reset(new PlaceholderImpl<T>(place, product(dims) * sizeof(T)));
+      offset_ = 0;
     }
-    return static_cast<T*>(holder_->Ptr());
+    return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->Ptr()) +
+                                offset_);
   }
 
-  template <typename T,  // must be POD types
-            typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
-  T* mutable_data(DDim dims) {
-    return mutable_data<T>(dims, paddle::platform::get_place());
+  void ShareDataFrom(const Tensor& src) {
+    PADDLE_ENFORCE(src.holder_ != nullptr,
+                   "Can not share data from an uninitialized tensor.");
+    holder_ = src.holder_;
+    dims_ = src.dims_;
+    offset_ = src.offset_;
   }
 
+  Tensor Slice(const int& begin_idx, const int& end_idx) const {
+    PADDLE_ENFORCE(holder_ != nullptr,
+                   "The sliced tenosr has not been initialized.");
+    PADDLE_ENFORCE(begin_idx >= 0 && end_idx <= dims_[0],
+                   "Slice index is less than zero or out of bound.");
+    PADDLE_ENFORCE(begin_idx < end_idx,
+                   "Begin index must be less than end index.");
+    PADDLE_ENFORCE(dims_[0] != 1, "Can not slice a tensor with dims_[0] = 1.");
+    std::vector<int> d = vectorize(dims_);
+    int base = 1;
+    for (size_t i = 1; i < d.size(); ++i) {
+      base *= d[i];
+    }
+    Tensor dst;
+    dst.holder_ = holder_;
+    dst.dims_ = dims_;
+    dst.dims_[0] = end_idx - begin_idx;
+    dst.offset_ = offset_ + begin_idx * base * holder_->TypeSize();
+    return dst;
+  }
+
+  DDim dims() const { return dims_; }
+
  private:
   // Placeholder hides type T, so it doesn't appear as a template
   // parameter of Variable.
@@ -59,6 +94,7 @@ class Tensor {
     virtual void* Ptr() const = 0;
     virtual paddle::platform::Place Place() const = 0;
     virtual size_t Size() const = 0;
+    virtual size_t TypeSize() const = 0;
   };
 
   template <typename T>
@@ -85,6 +121,7 @@ class Tensor {
     virtual void* Ptr() const { return static_cast<void*>(ptr_.get()); }
     virtual size_t Size() const { return size_; }
     virtual paddle::platform::Place Place() const { return place_; }
+    virtual size_t TypeSize() const { return sizeof(T); }
 
     std::unique_ptr<T, Deleter> ptr_;
     paddle::platform::Place place_;  // record the place of ptr_.
@@ -92,6 +129,8 @@ class Tensor {
   };
 
   std::shared_ptr<Placeholder> holder_;  // holds the memory block if allocated.
+  DDim dims_;
+  size_t offset_;  // marks the begin of tensor data area.
 };
 
 }  // namespace framework
diff --git a/paddle/framework/tensor_test.cc b/paddle/framework/tensor_test.cc
index 727d81f8d72e39ec564c42a48bf7ff64204adfff..f4822838cfbd27656232a23b14f716f2fbe510e0 100644
--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -15,15 +15,27 @@
 #include <gtest/gtest.h>
 #include <string>
 
-TEST(Tensor, ASSERT) {
-  paddle::framework::Tensor cpu_tensor;
+TEST(Tensor, Dims) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  Tensor tt(make_ddim({2, 3, 4}));
+  DDim dims = tt.dims();
+  ASSERT_EQ(arity(dims), 3);
+  for (int i = 0; i < 3; ++i) {
+    EXPECT_EQ(i + 2, dims[i]);
+  }
+}
+
+TEST(Tensor, DataAssert) {
+  paddle::framework::Tensor src_tensor;
 
   bool caught = false;
   try {
-    const double* p __attribute__((unused)) = cpu_tensor.data<double>();
+    src_tensor.data<double>();
   } catch (paddle::framework::EnforceNotMet err) {
     caught = true;
-    std::string msg = "Tensor::data must be called after Tensor::mutable_data.";
+    std::string msg =
+        "Tenosr has not been initialized. Call Tensor::mutable_data first.";
     const char* what = err.what();
     for (size_t i = 0; i < msg.length(); ++i) {
       ASSERT_EQ(what[i], msg[i]);
@@ -32,54 +44,138 @@ TEST(Tensor, ASSERT) {
   ASSERT_TRUE(caught);
 }
 
-/* mutable_data() is not tested at present
+/* following tests are not available at present
    because Memory::Alloc() and Memory::Free() have not been ready.
 
 TEST(Tensor, MutableData) {
   using namespace paddle::framework;
   using namespace paddle::platform;
   {
-    Tensor cpu_tensor;
+    Tensor src_tensor;
     float* p1 = nullptr;
     float* p2 = nullptr;
     // initialization
-    p1 = cpu_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CPUPlace());
+    p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), CPUPlace());
     EXPECT_NE(p1, nullptr);
-    // set cpu_tensor a new dim with large size
+    // set src_tensor a new dim with large size
     // momery is supposed to be re-allocated
-    p2 = cpu_tensor.mutable_data<float>(make_ddim({3, 4}));
+    p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), CPUPlace());
     EXPECT_NE(p2, nullptr);
     EXPECT_NE(p1, p2);
-    // set cpu_tensor a new dim with same size
+    // set src_tensor a new dim with same size
     // momery block is supposed to be unchanged
-    p1 = cpu_tensor.mutable_data<float>(make_ddim({2, 2, 3}));
+    p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), CPUPlace());
     EXPECT_EQ(p1, p2);
-    // set cpu_tensor a new dim with smaller size
+    // set src_tensor a new dim with smaller size
     // momery block is supposed to be unchanged
-    p2 = cpu_tensor.mutable_data<float>(make_ddim({2, 2}));
+    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), CPUPlace());
     EXPECT_EQ(p1, p2);
   }
 
   {
-    Tensor gpu_tensor;
+    Tensor src_tensor;
     float* p1 = nullptr;
     float* p2 = nullptr;
     // initialization
-    p1 = gpu_tensor.mutable_data<float>(make_ddim({1, 2, 3}), GPUPlace());
+    p1 = src_tensor.mutable_data<float>(make_ddim({1, 2, 3}), GPUPlace());
     EXPECT_NE(p1, nullptr);
-    // set gpu_tensor a new dim with large size
+    // set src_tensor a new dim with large size
     // momery is supposed to be re-allocated
-    p2 = gpu_tensor.mutable_data<float>(make_ddim({3, 4}));
+    p2 = src_tensor.mutable_data<float>(make_ddim({3, 4}), GPUPlace());
     EXPECT_NE(p2, nullptr);
     EXPECT_NE(p1, p2);
-    // set gpu_tensor a new dim with same size
+    // set src_tensor a new dim with same size
     // momery block is supposed to be unchanged
-    p1 = gpu_tensor.mutable_data<float>(make_ddim({2, 2, 3}));
+    p1 = src_tensor.mutable_data<float>(make_ddim({2, 2, 3}), GPUPlace());
     EXPECT_EQ(p1, p2);
-    // set gpu_tensor a new dim with smaller size
+    // set src_tensor a new dim with smaller size
     // momery block is supposed to be unchanged
-    p2 = gpu_tensor.mutable_data<float>(make_ddim({2, 2}));
+    p2 = src_tensor.mutable_data<float>(make_ddim({2, 2}), GPUPlace());
     EXPECT_EQ(p1, p2);
   }
 }
-*/
+
+TEST(Tensor, ShareDataFrom) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  {
+    Tensor src_tensor;
+    Tensor dst_tensor;
+    // Try to share data form uninitialized tensor
+    bool caught = false;
+    try {
+      dst_tensor.ShareDataFrom(src_tensor);
+    } catch (EnforceNotMet err) {
+      caught = true;
+      std::string msg = "Can not share data from an uninitialized tensor.";
+      const char* what = err.what();
+      for (size_t i = 0; i < msg.length(); ++i) {
+        ASSERT_EQ(what[i], msg[i]);
+      }
+    }
+    ASSERT_TRUE(caught);
+
+    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CPUPlace());
+    dst_tensor.ShareDataFrom(src_tensor);
+    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
+  }
+
+  {
+    Tensor src_tensor;
+    Tensor dst_tensor;
+    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace());
+    dst_tensor.ShareDataFrom(src_tensor);
+    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
+  }
+}
+
+TEST(Tensor, Slice) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  {
+    Tensor src_tensor;
+    src_tensor.mutable_data<int>(make_ddim({5, 3, 4}), CPUPlace());
+    Tensor slice_tensor = src_tensor.Slice(1, 3);
+    DDim slice_dims = slice_tensor.dims();
+    ASSERT_EQ(arity(slice_dims), 3);
+    EXPECT_EQ(slice_dims[0], 2);
+    EXPECT_EQ(slice_dims[1], 3);
+    EXPECT_EQ(slice_dims[2], 4);
+
+    uintptr_t src_data_address =
+        reinterpret_cast<uintptr_t>(src_tensor.data<int>());
+    uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
+        src_tensor.mutable_data<int>(src_tensor.dims(), CPUPlace()));
+    uintptr_t slice_data_address =
+        reinterpret_cast<uintptr_t>(slice_tensor.data<int>());
+    uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>(
+        slice_tensor.mutable_data<int>(slice_tensor.dims(), CPUPlace()));
+    EXPECT_EQ(src_data_address, src_mutable_data_address);
+    EXPECT_EQ(slice_data_address, slice_mutable_data_address);
+    EXPECT_EQ(src_data_address + 3 * 4 * 1 * sizeof(int), slice_data_address);
+  }
+
+  {
+    Tensor src_tensor;
+    src_tensor.mutable_data<double>(make_ddim({6, 9}), GPUPlace());
+    Tensor slice_tensor = src_tensor.Slice(2, 6);
+    DDim slice_dims = slice_tensor.dims();
+    ASSERT_EQ(arity(slice_dims), 2);
+    EXPECT_EQ(slice_dims[0], 4);
+    EXPECT_EQ(slice_dims[1], 9);
+
+    uintptr_t src_data_address =
+        reinterpret_cast<uintptr_t>(src_tensor.data<double>());
+    uintptr_t src_mutable_data_address = reinterpret_cast<uintptr_t>(
+        src_tensor.mutable_data<double>(src_tensor.dims(), GPUPlace()));
+    uintptr_t slice_data_address =
+        reinterpret_cast<uintptr_t>(slice_tensor.data<double>());
+    uintptr_t slice_mutable_data_address = reinterpret_cast<uintptr_t>(
+        slice_tensor.mutable_data<double>(slice_tensor.dims(), GPUPlace()));
+    EXPECT_EQ(src_data_address, src_mutable_data_address);
+    EXPECT_EQ(slice_data_address, slice_mutable_data_address);
+    EXPECT_EQ(src_data_address + 9 * 2 * sizeof(double), slice_data_address);
+  }
+}
+
+*/
\ No newline at end of file
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index ebacd5d6dc850ebde5212bba39aea4dde8e4eb03..7a198aec6cf12c92cb24a8e560508d06db5e1dcf 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -1,8 +1,14 @@
 add_subdirectory(dynload)
 
-nv_test(cuda_test SRCS cuda_test.cu)
+nv_test(cuda_test SRCS cuda_test.cu DEPS dyload_cuda)
 
 cc_library(place SRCS place.cc)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
+IF(WITH_GPU)
+    set(GPU_CTX_DEPS dyload_cuda dynamic_loader )
+ELSE()
+    set(GPU_CTX_DEPS)
+ENDIF()
 
-nv_test(device_context_test SRCS device_context_test.cc DEPS dynamic_loader place eigen3 glog gflags)
+cc_library(device_context SRCS device_context.cc DEPS place eigen3 ${GPU_CTX_DEPS})
+nv_test(device_context_test SRCS device_context_test.cc DEPS device_context glog gflags)
diff --git a/paddle/platform/cuda.h b/paddle/platform/cuda.h
index 5ed36c0f02549e29fc680eba097c9d851fa346e5..96889abf9eb14dd203eb55ffd0b720450323b38e 100644
--- a/paddle/platform/cuda.h
+++ b/paddle/platform/cuda.h
@@ -28,19 +28,19 @@ inline void throw_on_error(cudaError_t e, const char* message) {
   }
 }
 
-int GetDeviceCount(void) {
+inline int GetDeviceCount(void) {
   int count;
   throw_on_error(cudaGetDeviceCount(&count), "cudaGetDeviceCount failed");
   return count;
 }
 
-int GetCurrentDeviceId(void) {
+inline int GetCurrentDeviceId(void) {
   int device_id;
   throw_on_error(cudaGetDevice(&device_id), "cudaGetDevice failed");
   return device_id;
 }
 
-void SetDeviceId(int device_id) {
+inline void SetDeviceId(int device_id) {
   throw_on_error(cudaSetDevice(device_id), "cudaSetDevice failed");
 }
 
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a2dea2ed1e11817c23dd2dc55a578d8fbd21ecb2
--- /dev/null
+++ b/paddle/platform/device_context.cc
@@ -0,0 +1,13 @@
+#include <paddle/platform/device_context.h>
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+namespace dummy {
+// Make DeviceContext A library.
+int DUMMY_VAR_FOR_DEV_CTX = 0;
+
+}  // namespace dummy
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
\ No newline at end of file
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index fcef0a5e3058f1c9d54f9e06a54a09286e2454fd..e3c2cd2647ff519e205d9aac813dd132d05dafef 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -22,8 +22,8 @@ limitations under the License. */
 #include "paddle/platform/dynload/curand.h"
 #define EIGEN_USE_GPU
 #endif
-#include "paddle/platform/place.h"
-#include "unsupported/Eigen/CXX11/Tensor"
+#include <paddle/platform/place.h>
+#include <unsupported/Eigen/CXX11/Tensor>
 
 namespace paddle {
 namespace platform {
@@ -31,11 +31,19 @@ namespace platform {
 class DeviceContext {
  public:
   virtual ~DeviceContext() {}
+  virtual Place GetPlace() const = 0;
 };
 
-class CPUDeviceContext : public DeviceContext {};
+class CPUDeviceContext : public DeviceContext {
+ public:
+  Place GetPlace() const override {
+    Place retv = CPUPlace();
+    return retv;
+  }
+};
 
 #ifndef PADDLE_ONLY_CPU
+
 class GPUPlaceGuard {
  public:
   explicit GPUPlaceGuard(GPUPlace new_place) : previous_(GetCurrentDeviceId()) {
@@ -60,6 +68,11 @@ class CUDADeviceContext : public DeviceContext {
     eigen_device_ = new Eigen::GpuDevice(eigen_stream_);
   }
 
+  Place GetPlace() const override {
+    Place retv = GPUPlace();
+    return retv;
+  }
+
   void Wait() {
     paddle::platform::throw_on_error(cudaStreamSynchronize(stream_),
                                      "cudaStreamSynchronize failed");
diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt
index 9f829b70128655f018c59db32b95d3f1789da5fc..4a8866b3d364542f315978859e96290c6f067f6f 100644
--- a/paddle/platform/dynload/CMakeLists.txt
+++ b/paddle/platform/dynload/CMakeLists.txt
@@ -1 +1,2 @@
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags)
+nv_library(dyload_cuda SRCS cublas.cc cudnn.cc curand.cc)
diff --git a/paddle/platform/dynload/cublas.cc b/paddle/platform/dynload/cublas.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4e3dfdaefb2348346e8f917b1f6c758bf6d91a1a
--- /dev/null
+++ b/paddle/platform/dynload/cublas.cc
@@ -0,0 +1,15 @@
+#include <paddle/platform/dynload/cublas.h>
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+std::once_flag cublas_dso_flag;
+void *cublas_dso_handle = nullptr;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+CUBLAS_BLAS_ROUTINE_EACH(DEFINE_WRAP);
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/dynload/cublas.h b/paddle/platform/dynload/cublas.h
index 258cc88031a71e9fee65b5445bd5537d6782e226..47c7a8ec21fc6e84d5f45f68b7dc8a766003be75 100644
--- a/paddle/platform/dynload/cublas.h
+++ b/paddle/platform/dynload/cublas.h
@@ -23,8 +23,8 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag cublas_dso_flag;
-void *cublas_dso_handle = nullptr;
+extern std::once_flag cublas_dso_flag;
+extern void *cublas_dso_handle;
 
 /**
  * The following macro definition can generate structs
@@ -34,10 +34,10 @@ void *cublas_dso_handle = nullptr;
  * note: default dynamic linked libs
  */
 #ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CUBLAS_WRAP(__name)                            \
+#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)                    \
   struct DynLoad__##__name {                                        \
     template <typename... Args>                                     \
-    cublasStatus_t operator()(Args... args) {                       \
+    inline cublasStatus_t operator()(Args... args) {                \
       typedef cublasStatus_t (*cublasFunc)(Args...);                \
       std::call_once(cublas_dso_flag,                               \
                      paddle::platform::dynload::GetCublasDsoHandle, \
@@ -45,62 +45,46 @@ void *cublas_dso_handle = nullptr;
       void *p_##__name = dlsym(cublas_dso_handle, #__name);         \
       return reinterpret_cast<cublasFunc>(p_##__name)(args...);     \
     }                                                               \
-  } __name;  // struct DynLoad__##__name
+  };                                                                \
+  extern DynLoad__##__name __name
 #else
-#define DYNAMIC_LOAD_CUBLAS_WRAP(__name)      \
-  struct DynLoad__##__name {                  \
-    template <typename... Args>               \
-    cublasStatus_t operator()(Args... args) { \
-      return __name(args...);                 \
-    }                                         \
-  } __name;  // struct DynLoad__##__name
+#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
+  struct DynLoad__##__name {                     \
+    inline template <typename... Args>           \
+    cublasStatus_t operator()(Args... args) {    \
+      return __name(args...);                    \
+    }                                            \
+  };                                             \
+  extern DynLoad__##__name __name
 #endif
 
-#define DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) DYNAMIC_LOAD_CUBLAS_WRAP(__name)
+#define DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \
+  DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)
 
-// include all needed cublas functions in HPPL
-// clang-format off
 #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \
-  __macro(cublasSgemv)                    \
-  __macro(cublasDgemv)                    \
-  __macro(cublasSgemm)                    \
-  __macro(cublasDgemm)                    \
-  __macro(cublasSgeam)                    \
-  __macro(cublasDgeam)                    \
+  __macro(cublasSgemv);                   \
+  __macro(cublasDgemv);                   \
+  __macro(cublasSgemm);                   \
+  __macro(cublasDgemm);                   \
+  __macro(cublasSgeam);                   \
+  __macro(cublasDgeam);
 
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasCreate)
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasDestroy)
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetStream)
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetPointerMode)
-DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasGetPointerMode)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetrfBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetriBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetrfBatched)
-DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetriBatched)
-CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
+DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasCreate);
+DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasDestroy);
+DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetStream);
+DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasSetPointerMode);
+DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(cublasGetPointerMode);
+DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmBatched);
+DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmBatched);
+DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmBatched);
+DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmBatched);
+DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetrfBatched);
+DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetriBatched);
+DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgetrfBatched);
 
-#undef DYNAMIC_LOAD_CUBLAS_WRAP
-#undef DYNAMIC_LOAD_CUBLAS_V2_WRAP
-#undef CUBLAS_BLAS_ROUTINE_EACH
+CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP);
 
-// clang-format on
-#ifndef PADDLE_TYPE_DOUBLE
-#define CUBLAS_GEAM paddle::platform::dynload::cublasSgeam
-#define CUBLAS_GEMV paddle::platform::dynload::cublasSgemv
-#define CUBLAS_GEMM paddle::platform::dynload::cublasSgemm
-#define CUBLAS_GETRF paddle::platform::dynload::cublasSgetrfBatched
-#define CUBLAS_GETRI paddle::platform::dynload::cublasSgetriBatched
-#else
-#define CUBLAS_GEAM paddle::platform::dynload::cublasDgeam
-#define CUBLAS_GEMV paddle::platform::dynload::cublasDgemv
-#define CUBLAS_GEMM paddle::platform::dynload::cublasDgemm
-#define CUBLAS_GETRF paddle::platform::dynload::cublasDgetrfBatched
-#define CUBLAS_GETRI paddle::platform::dynload::cublasDgetriBatched
-#endif
+#undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/dynload/cudnn.cc b/paddle/platform/dynload/cudnn.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8b5e15b5efcdae6a1eed09f002eb2f4f2163035f
--- /dev/null
+++ b/paddle/platform/dynload/cudnn.cc
@@ -0,0 +1,28 @@
+#include <paddle/platform/dynload/cudnn.h>
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+std::once_flag cudnn_dso_flag;
+void* cudnn_dso_handle = nullptr;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+CUDNN_DNN_ROUTINE_EACH(DEFINE_WRAP);
+CUDNN_DNN_ROUTINE_EACH_R2(DEFINE_WRAP);
+
+#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R3
+CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DEFINE_WRAP);
+#endif
+
+#ifdef CUDNN_DNN_ROUTINE_EACH_AFTER_R4
+CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
+#endif
+
+#ifdef CUDNN_DNN_ROUTINE_EACH_R5
+CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
+#endif
+
+}  // namespace dynload
+}  // namespace platform
+}  // namespace paddle
\ No newline at end of file
diff --git a/paddle/platform/dynload/cudnn.h b/paddle/platform/dynload/cudnn.h
index 0a9562c573cdfe059ef7caa39ba62efa87225e41..ef0dd85b083dc2335dd5c70d3dc5f59eda25daeb 100644
--- a/paddle/platform/dynload/cudnn.h
+++ b/paddle/platform/dynload/cudnn.h
@@ -23,12 +23,12 @@ namespace paddle {
 namespace platform {
 namespace dynload {
 
-std::once_flag cudnn_dso_flag;
-void* cudnn_dso_handle = nullptr;
+extern std::once_flag cudnn_dso_flag;
+extern void* cudnn_dso_handle;
 
 #ifdef PADDLE_USE_DSO
 
-#define DYNAMIC_LOAD_CUDNN_WRAP(__name)                            \
+#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)                    \
   struct DynLoad__##__name {                                       \
     template <typename... Args>                                    \
     auto operator()(Args... args) -> decltype(__name(args...)) {   \
@@ -39,17 +39,19 @@ void* cudnn_dso_handle = nullptr;
       void* p_##__name = dlsym(cudnn_dso_handle, #__name);         \
       return reinterpret_cast<cudnn_func>(p_##__name)(args...);    \
     }                                                              \
-  } __name; /* struct DynLoad__##__name */
+  };                                                               \
+  extern struct DynLoad__##__name __name
 
 #else
 
-#define DYNAMIC_LOAD_CUDNN_WRAP(__name)                          \
+#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name)                  \
   struct DynLoad__##__name {                                     \
     template <typename... Args>                                  \
     auto operator()(Args... args) -> decltype(__name(args...)) { \
       return __name(args...);                                    \
     }                                                            \
-  } __name; /* struct DynLoad__##__name */
+  };                                                             \
+  extern DynLoad__##__name __name
 
 #endif
 
@@ -57,80 +59,73 @@ void* cudnn_dso_handle = nullptr;
  * include all needed cudnn functions in HPPL
  * different cudnn version has different interfaces
  **/
-// clang-format off
-#define CUDNN_DNN_ROUTINE_EACH(__macro)                   \
-  __macro(cudnnSetTensor4dDescriptor)                     \
-  __macro(cudnnSetTensor4dDescriptorEx)                   \
-  __macro(cudnnGetConvolutionNdForwardOutputDim)          \
-  __macro(cudnnGetConvolutionForwardAlgorithm)            \
-  __macro(cudnnCreateTensorDescriptor)                    \
-  __macro(cudnnDestroyTensorDescriptor)                   \
-  __macro(cudnnCreateFilterDescriptor)                    \
-  __macro(cudnnSetFilter4dDescriptor)                     \
-  __macro(cudnnSetPooling2dDescriptor)                    \
-  __macro(cudnnDestroyFilterDescriptor)                   \
-  __macro(cudnnCreateConvolutionDescriptor)               \
-  __macro(cudnnCreatePoolingDescriptor)                   \
-  __macro(cudnnDestroyPoolingDescriptor)                  \
-  __macro(cudnnSetConvolution2dDescriptor)                \
-  __macro(cudnnDestroyConvolutionDescriptor)              \
-  __macro(cudnnCreate)                                    \
-  __macro(cudnnDestroy)                                   \
-  __macro(cudnnSetStream)                                 \
-  __macro(cudnnActivationForward)                         \
-  __macro(cudnnConvolutionForward)                        \
-  __macro(cudnnConvolutionBackwardBias)                   \
-  __macro(cudnnGetConvolutionForwardWorkspaceSize)        \
-  __macro(cudnnTransformTensor)                           \
-  __macro(cudnnPoolingForward)                            \
-  __macro(cudnnPoolingBackward)                           \
-  __macro(cudnnSoftmaxBackward)                           \
-  __macro(cudnnSoftmaxForward)                            \
-  __macro(cudnnGetVersion)                                \
-  __macro(cudnnGetErrorString)
-CUDNN_DNN_ROUTINE_EACH(DYNAMIC_LOAD_CUDNN_WRAP)
-
-#define CUDNN_DNN_ROUTINE_EACH_R2(__macro)                \
-  __macro(cudnnAddTensor)                                 \
-  __macro(cudnnConvolutionBackwardData)                   \
-  __macro(cudnnConvolutionBackwardFilter)
-CUDNN_DNN_ROUTINE_EACH_R2(DYNAMIC_LOAD_CUDNN_WRAP)
+#define CUDNN_DNN_ROUTINE_EACH(__macro)             \
+  __macro(cudnnSetTensor4dDescriptor);              \
+  __macro(cudnnSetTensor4dDescriptorEx);            \
+  __macro(cudnnGetConvolutionNdForwardOutputDim);   \
+  __macro(cudnnGetConvolutionForwardAlgorithm);     \
+  __macro(cudnnCreateTensorDescriptor);             \
+  __macro(cudnnDestroyTensorDescriptor);            \
+  __macro(cudnnCreateFilterDescriptor);             \
+  __macro(cudnnSetFilter4dDescriptor);              \
+  __macro(cudnnSetPooling2dDescriptor);             \
+  __macro(cudnnDestroyFilterDescriptor);            \
+  __macro(cudnnCreateConvolutionDescriptor);        \
+  __macro(cudnnCreatePoolingDescriptor);            \
+  __macro(cudnnDestroyPoolingDescriptor);           \
+  __macro(cudnnSetConvolution2dDescriptor);         \
+  __macro(cudnnDestroyConvolutionDescriptor);       \
+  __macro(cudnnCreate);                             \
+  __macro(cudnnDestroy);                            \
+  __macro(cudnnSetStream);                          \
+  __macro(cudnnActivationForward);                  \
+  __macro(cudnnConvolutionForward);                 \
+  __macro(cudnnConvolutionBackwardBias);            \
+  __macro(cudnnGetConvolutionForwardWorkspaceSize); \
+  __macro(cudnnTransformTensor);                    \
+  __macro(cudnnPoolingForward);                     \
+  __macro(cudnnPoolingBackward);                    \
+  __macro(cudnnSoftmaxBackward);                    \
+  __macro(cudnnSoftmaxForward);                     \
+  __macro(cudnnGetVersion);                         \
+  __macro(cudnnGetErrorString);
+CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+
+#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \
+  __macro(cudnnAddTensor);                 \
+  __macro(cudnnConvolutionBackwardData);   \
+  __macro(cudnnConvolutionBackwardFilter);
+CUDNN_DNN_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 
 // APIs available after R3:
 #if CUDNN_VERSION >= 3000
-#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro)              \
-  __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize)     \
-  __macro(cudnnGetConvolutionBackwardDataAlgorithm)           \
-  __macro(cudnnGetConvolutionBackwardFilterAlgorithm)         \
-  __macro(cudnnGetConvolutionBackwardDataWorkspaceSize)
-CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DYNAMIC_LOAD_CUDNN_WRAP)
-#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R3
+#define CUDNN_DNN_ROUTINE_EACH_AFTER_R3(__macro)           \
+  __macro(cudnnGetConvolutionBackwardFilterWorkspaceSize); \
+  __macro(cudnnGetConvolutionBackwardDataAlgorithm);       \
+  __macro(cudnnGetConvolutionBackwardFilterAlgorithm);     \
+  __macro(cudnnGetConvolutionBackwardDataWorkspaceSize);
+CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
 
-
 // APIs available after R4:
 #if CUDNN_VERSION >= 4007
-#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro)             \
-  __macro(cudnnBatchNormalizationForwardTraining)            \
-  __macro(cudnnBatchNormalizationForwardInference)           \
-  __macro(cudnnBatchNormalizationBackward)
-CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DYNAMIC_LOAD_CUDNN_WRAP)
-#undef CUDNN_DNN_ROUTINE_EACH_AFTER_R4
+#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro)    \
+  __macro(cudnnBatchNormalizationForwardTraining);  \
+  __macro(cudnnBatchNormalizationForwardInference); \
+  __macro(cudnnBatchNormalizationBackward);
+CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
 
 // APIs in R5
 #if CUDNN_VERSION >= 5000
-#define CUDNN_DNN_ROUTINE_EACH_R5(__macro)                    \
-  __macro(cudnnCreateActivationDescriptor)                    \
-  __macro(cudnnSetActivationDescriptor)                       \
-  __macro(cudnnGetActivationDescriptor)                       \
-  __macro(cudnnDestroyActivationDescriptor)
-CUDNN_DNN_ROUTINE_EACH_R5(DYNAMIC_LOAD_CUDNN_WRAP)
-#undef CUDNN_DNN_ROUTINE_EACH_R5
+#define CUDNN_DNN_ROUTINE_EACH_R5(__macro)  \
+  __macro(cudnnCreateActivationDescriptor); \
+  __macro(cudnnSetActivationDescriptor);    \
+  __macro(cudnnGetActivationDescriptor);    \
+  __macro(cudnnDestroyActivationDescriptor);
+CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
 
-#undef CUDNN_DNN_ROUTINE_EACH
-// clang-format on
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/dynload/curand.cc b/paddle/platform/dynload/curand.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5c1fab992c98569d4a95b6e699d97d428511e48e
--- /dev/null
+++ b/paddle/platform/dynload/curand.cc
@@ -0,0 +1,15 @@
+#include <paddle/platform/dynload/curand.h>
+
+namespace paddle {
+namespace platform {
+namespace dynload {
+
+std::once_flag curand_dso_flag;
+void *curand_dso_handle;
+
+#define DEFINE_WRAP(__name) DynLoad__##__name __name
+
+CURAND_RAND_ROUTINE_EACH(DEFINE_WRAP);
+}
+}
+}
\ No newline at end of file
diff --git a/paddle/platform/dynload/curand.h b/paddle/platform/dynload/curand.h
index 9dc0a25c0fbdc3f73f1dd82206e8940972b0b7f5..d8c46bc41e18d013a80cd0a9116a4b1a52bf5854 100644
--- a/paddle/platform/dynload/curand.h
+++ b/paddle/platform/dynload/curand.h
@@ -22,10 +22,10 @@ limitations under the License. */
 namespace paddle {
 namespace platform {
 namespace dynload {
-std::once_flag curand_dso_flag;
-void *curand_dso_handle = nullptr;
+extern std::once_flag curand_dso_flag;
+extern void *curand_dso_handle;
 #ifdef PADDLE_USE_DSO
-#define DYNAMIC_LOAD_CURAND_WRAP(__name)                            \
+#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name)                    \
   struct DynLoad__##__name {                                        \
     template <typename... Args>                                     \
     curandStatus_t operator()(Args... args) {                       \
@@ -36,32 +36,29 @@ void *curand_dso_handle = nullptr;
       void *p_##__name = dlsym(curand_dso_handle, #__name);         \
       return reinterpret_cast<curandFunc>(p_##__name)(args...);     \
     }                                                               \
-  } __name; /* struct DynLoad__##__name */
+  };                                                                \
+  extern DynLoad__##__name __name
 #else
-#define DYNAMIC_LOAD_CURAND_WRAP(__name)      \
-  struct DynLoad__##__name {                  \
-    template <typename... Args>               \
-    curandStatus_t operator()(Args... args) { \
-      return __name(args...);                 \
-    }                                         \
-  } __name; /* struct DynLoad__##__name */
+#define DECLARE_DYNAMIC_LOAD_CURAND_WRAP(__name) \
+  struct DynLoad__##__name {                     \
+    template <typename... Args>                  \
+    curandStatus_t operator()(Args... args) {    \
+      return __name(args...);                    \
+    }                                            \
+  };                                             \
+  extern DynLoad__##__name __name
 #endif
 
-/* include all needed curand functions in HPPL */
-// clang-format off
-#define CURAND_RAND_ROUTINE_EACH(__macro)    \
-  __macro(curandCreateGenerator)             \
-  __macro(curandSetStream)                   \
-  __macro(curandSetPseudoRandomGeneratorSeed)\
-  __macro(curandGenerateUniform)             \
-  __macro(curandGenerateUniformDouble)       \
-  __macro(curandDestroyGenerator)
-// clang-format on
+#define CURAND_RAND_ROUTINE_EACH(__macro)      \
+  __macro(curandCreateGenerator);              \
+  __macro(curandSetStream);                    \
+  __macro(curandSetPseudoRandomGeneratorSeed); \
+  __macro(curandGenerateUniform);              \
+  __macro(curandGenerateUniformDouble);        \
+  __macro(curandDestroyGenerator);
 
-CURAND_RAND_ROUTINE_EACH(DYNAMIC_LOAD_CURAND_WRAP)
+CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
 
-#undef CURAND_RAND_ROUTINE_EACH
-#undef DYNAMIC_LOAD_CURAND_WRAP
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..af85fdeecb57729d7fb580ebd4c59c1afc61d61a
--- /dev/null
+++ b/paddle/pybind/CMakeLists.txt
@@ -0,0 +1 @@
+cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python)
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
new file mode 100644
index 0000000000000000000000000000000000000000..55aebc59eca50ad33e8a5357c5ca29d4101f754b
--- /dev/null
+++ b/paddle/pybind/pybind.cc
@@ -0,0 +1,46 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <paddle/framework/scope.h>
+#include <pybind11/pybind11.h>
+
+namespace py = pybind11;
+namespace pd = paddle::framework;
+
+PYBIND11_PLUGIN(core) {
+  py::module m("core", "C++ core of Paddle Paddle");
+
+  py::class_<pd::Variable>(m, "Variable", R"DOC(Variable Class.
+
+All parameter, weight, gradient are variables in Paddle.
+)DOC")
+      .def("is_int", [](const pd::Variable& var) { return var.IsType<int>(); })
+      .def("set_int",
+           [](pd::Variable& var, int val) -> void {
+             *var.GetMutable<int>() = val;
+           })
+      .def("get_int",
+           [](const pd::Variable& var) -> int { return var.Get<int>(); });
+
+  py::class_<pd::Scope, std::shared_ptr<pd::Scope>>(m, "Scope")
+      .def(py::init<const std::shared_ptr<pd::Scope>&>())
+      .def("get_var",
+           &pd::Scope::GetVariable,
+           py::return_value_policy::reference)
+      .def("create_var",
+           &pd::Scope::CreateVariable,
+           py::return_value_policy::reference);
+
+  return m.ptr();
+}
\ No newline at end of file
diff --git a/paddle/trainer/NewRemoteParameterUpdater.cpp b/paddle/trainer/NewRemoteParameterUpdater.cpp
index f25ce2f7f06f6da0feab27da61b8e49689cbe213..b359d9da2167bf459504e15c3140b3d956f417f3 100644
--- a/paddle/trainer/NewRemoteParameterUpdater.cpp
+++ b/paddle/trainer/NewRemoteParameterUpdater.cpp
@@ -22,7 +22,8 @@ DECLARE_string(save_dir);
 namespace paddle {
 NewRemoteParameterUpdater::NewRemoteParameterUpdater(
     const OptimizationConfig &config, const std::string pserverSpec)
-    : parameterClient_(-1),
+    : trainerConfig_(config),
+      parameterClient_(-1),
       newParameters_(nullptr),
       newGradients_(nullptr),
       pserverSpec_(pserverSpec) {}
@@ -51,7 +52,22 @@ void NewRemoteParameterUpdater::init(
     LOG(INFO) << "paddle_begin_init_params start";
     for (int i = 0; i < parameterSize(); ++i) {
       auto paramConfig = parameters_[i]->getConfig();
-      std::string bytes = paramConfig.SerializeAsString();
+      LOG(INFO) << "old param config: " << paramConfig.DebugString();
+      // FIXME(typhoonzero): convert old paramConfig to optimizerConfig
+      OptimizerConfig optimizeConfigV2;
+      auto sgdConfigV2 = optimizeConfigV2.mutable_sgd();
+      sgdConfigV2->set_momentum(paramConfig.momentum());
+      sgdConfigV2->set_decay(paramConfig.decay_rate());
+      optimizeConfigV2.set_lr_policy(paddle::OptimizerConfig::Const);
+      auto constlr = optimizeConfigV2.mutable_const_lr();
+      constlr->set_learning_rate(paramConfig.learning_rate());
+      if (trainerConfig_.algorithm() == "sgd") {
+        optimizeConfigV2.set_optimizer(paddle::OptimizerConfig::SGD);
+        // FIXME: config all algorithms
+      } else {
+        optimizeConfigV2.set_optimizer(paddle::OptimizerConfig::SGD);
+      }
+      std::string bytes = optimizeConfigV2.SerializeAsString();
       const char *array = bytes.data();
       int size = (int)bytes.size();
       paddle_init_param(
@@ -83,4 +99,4 @@ void NewRemoteParameterUpdater::finishBatch(real cost) {
 void NewRemoteParameterUpdater::startPass() {}
 
 bool NewRemoteParameterUpdater::finishPass() { return true; }
-}
+}  // namespace paddle
diff --git a/paddle/trainer/NewRemoteParameterUpdater.h b/paddle/trainer/NewRemoteParameterUpdater.h
index f735185f62b3491a63e34cfc4a2ef73dae12243e..dfed00bc216b1d41bb7520619b76702f9fe650f2 100644
--- a/paddle/trainer/NewRemoteParameterUpdater.h
+++ b/paddle/trainer/NewRemoteParameterUpdater.h
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <functional>
 #include <thread>
+#include "OptimizerConfig.pb.h"
 #include "ParameterUpdater.h"
 #include "libpaddle_pserver_cclient.h"
 #include "paddle/pserver/ParameterClient2.h"
@@ -101,6 +102,7 @@ private:
   }
 
 protected:
+  const OptimizationConfig& trainerConfig_;
   /// internal parameter client object for exchanging data with pserver
   paddle_pserver_client parameterClient_;
   /// the parameters for new pserver client
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 13a1802ee3790b1255fc11f5b2053e3342c61914..0171f9d8ccd6045cb876d57684269a2a49e77f96 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -26,10 +26,17 @@ endif(WITH_GOLANG)
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
 
+
+add_custom_command(OUTPUT ${PROJ_ROOT}/python/paddle/v2/framework/core.so
+        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PROJ_ROOT}/python/paddle/v2/framework/core.so
+        DEPENDS paddle_pybind)
+add_custom_target(copy_paddle_pybind ALL DEPENDS ${PROJ_ROOT}/python/paddle/v2/framework/core.so)
+
+
 add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
     COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
     COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp
-    DEPENDS gen_proto_py framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
+    DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
 
 add_custom_target(paddle_python ALL DEPENDS
     ${OUTPUT_DIR}/.timestamp)
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index b77932ce5f09470329a97cc0a6273942a9155c6a..810bea913ec79b2df0eb63ed5a4fd411549ff2e9 100755
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -1395,7 +1395,7 @@ def inputs(layers, *args):
     if len(args) != 0:
         layers.extend(args)
 
-    Inputs(* [l.name for l in layers])
+    Inputs(*[l.name for l in layers])
 
 
 def outputs(layers, *args):
@@ -1408,6 +1408,8 @@ def outputs(layers, *args):
     :return:
     """
 
+    traveled = set()
+
     def __dfs_travel__(layer,
                        predicate=lambda x: x.layer_type == LayerType.DATA):
         """
@@ -1419,6 +1421,11 @@ def outputs(layers, *args):
         :type layer: LayerOutput
         :return:
         """
+        if layer in traveled:
+            return []
+        else:
+            traveled.add(layer)
+
         assert isinstance(layer, LayerOutput), "layer is %s" % (layer)
         retv = []
         if layer.parents is not None:
@@ -1438,7 +1445,7 @@ def outputs(layers, *args):
     assert len(layers) > 0
 
     if HasInputsSet():  # input already set
-        Outputs(* [l.name for l in layers])
+        Outputs(*[l.name for l in layers])
         return  # just return outputs.
 
     if len(layers) != 1:
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index a939c41ad01922e421f7bcd93851df7447a6799f..70e342fb79ab51e3376ea6ad8f593c4c3a1fff18 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -6,6 +6,7 @@ img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cos
 test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
 test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
 test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
-test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer)
+test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer
+test_recursive_topology)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
index b7d74f85ab4ca3f434dfa45516dfee7227b6ceee..96fb1d4ebde08b1bca2ffd09e8db0895842cbfd3 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
@@ -131,6 +131,7 @@ input_layer_names: "weight"
 input_layer_names: "multi_class_label"
 output_layer_names: "__cost_0__"
 output_layer_names: "__mse_cost_0__"
+output_layer_names: "__nce_layer_0__"
 evaluators {
   name: "classification_error_evaluator"
   type: "classification_error"
@@ -154,6 +155,7 @@ sub_models {
   input_layer_names: "multi_class_label"
   output_layer_names: "__cost_0__"
   output_layer_names: "__mse_cost_0__"
+  output_layer_names: "__nce_layer_0__"
   evaluator_names: "classification_error_evaluator"
   is_recurrent_layer_group: false
 }
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..8133aa9c8d3e7c6843d1b27b70e87d394a1e0e47
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr
@@ -0,0 +1,497 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 100
+  active_type: ""
+}
+layers {
+  name: "__addto_0__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+  }
+  inputs {
+    input_layer_name: "data"
+  }
+}
+layers {
+  name: "__addto_1__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_0__"
+  }
+  inputs {
+    input_layer_name: "__addto_0__"
+  }
+}
+layers {
+  name: "__addto_2__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_1__"
+  }
+  inputs {
+    input_layer_name: "__addto_1__"
+  }
+}
+layers {
+  name: "__addto_3__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_2__"
+  }
+  inputs {
+    input_layer_name: "__addto_2__"
+  }
+}
+layers {
+  name: "__addto_4__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_3__"
+  }
+  inputs {
+    input_layer_name: "__addto_3__"
+  }
+}
+layers {
+  name: "__addto_5__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_4__"
+  }
+  inputs {
+    input_layer_name: "__addto_4__"
+  }
+}
+layers {
+  name: "__addto_6__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_5__"
+  }
+  inputs {
+    input_layer_name: "__addto_5__"
+  }
+}
+layers {
+  name: "__addto_7__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_6__"
+  }
+  inputs {
+    input_layer_name: "__addto_6__"
+  }
+}
+layers {
+  name: "__addto_8__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_7__"
+  }
+  inputs {
+    input_layer_name: "__addto_7__"
+  }
+}
+layers {
+  name: "__addto_9__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_8__"
+  }
+  inputs {
+    input_layer_name: "__addto_8__"
+  }
+}
+layers {
+  name: "__addto_10__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_9__"
+  }
+  inputs {
+    input_layer_name: "__addto_9__"
+  }
+}
+layers {
+  name: "__addto_11__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_10__"
+  }
+  inputs {
+    input_layer_name: "__addto_10__"
+  }
+}
+layers {
+  name: "__addto_12__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_11__"
+  }
+  inputs {
+    input_layer_name: "__addto_11__"
+  }
+}
+layers {
+  name: "__addto_13__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_12__"
+  }
+  inputs {
+    input_layer_name: "__addto_12__"
+  }
+}
+layers {
+  name: "__addto_14__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_13__"
+  }
+  inputs {
+    input_layer_name: "__addto_13__"
+  }
+}
+layers {
+  name: "__addto_15__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_14__"
+  }
+  inputs {
+    input_layer_name: "__addto_14__"
+  }
+}
+layers {
+  name: "__addto_16__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_15__"
+  }
+  inputs {
+    input_layer_name: "__addto_15__"
+  }
+}
+layers {
+  name: "__addto_17__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_16__"
+  }
+  inputs {
+    input_layer_name: "__addto_16__"
+  }
+}
+layers {
+  name: "__addto_18__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_17__"
+  }
+  inputs {
+    input_layer_name: "__addto_17__"
+  }
+}
+layers {
+  name: "__addto_19__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_18__"
+  }
+  inputs {
+    input_layer_name: "__addto_18__"
+  }
+}
+layers {
+  name: "__addto_20__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_19__"
+  }
+  inputs {
+    input_layer_name: "__addto_19__"
+  }
+}
+layers {
+  name: "__addto_21__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_20__"
+  }
+  inputs {
+    input_layer_name: "__addto_20__"
+  }
+}
+layers {
+  name: "__addto_22__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_21__"
+  }
+  inputs {
+    input_layer_name: "__addto_21__"
+  }
+}
+layers {
+  name: "__addto_23__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_22__"
+  }
+  inputs {
+    input_layer_name: "__addto_22__"
+  }
+}
+layers {
+  name: "__addto_24__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_23__"
+  }
+  inputs {
+    input_layer_name: "__addto_23__"
+  }
+}
+layers {
+  name: "__addto_25__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_24__"
+  }
+  inputs {
+    input_layer_name: "__addto_24__"
+  }
+}
+layers {
+  name: "__addto_26__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_25__"
+  }
+  inputs {
+    input_layer_name: "__addto_25__"
+  }
+}
+layers {
+  name: "__addto_27__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_26__"
+  }
+  inputs {
+    input_layer_name: "__addto_26__"
+  }
+}
+layers {
+  name: "__addto_28__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_27__"
+  }
+  inputs {
+    input_layer_name: "__addto_27__"
+  }
+}
+layers {
+  name: "__addto_29__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_28__"
+  }
+  inputs {
+    input_layer_name: "__addto_28__"
+  }
+}
+layers {
+  name: "__addto_30__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_29__"
+  }
+  inputs {
+    input_layer_name: "__addto_29__"
+  }
+}
+layers {
+  name: "__addto_31__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_30__"
+  }
+  inputs {
+    input_layer_name: "__addto_30__"
+  }
+}
+layers {
+  name: "__fc_layer_0__"
+  type: "fc"
+  size: 32
+  active_type: "relu"
+  inputs {
+    input_layer_name: "__addto_31__"
+    input_parameter_name: "___fc_layer_0__.w0"
+  }
+  bias_parameter_name: "___fc_layer_0__.wbias"
+}
+layers {
+  name: "__fc_layer_1__"
+  type: "fc"
+  size: 10
+  active_type: "softmax"
+  inputs {
+    input_layer_name: "__fc_layer_0__"
+    input_parameter_name: "___fc_layer_1__.w0"
+  }
+  bias_parameter_name: "___fc_layer_1__.wbias"
+}
+parameters {
+  name: "___fc_layer_0__.w0"
+  size: 3200
+  initial_mean: 0.0
+  initial_std: 0.1
+  dims: 100
+  dims: 32
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___fc_layer_0__.wbias"
+  size: 32
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 32
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___fc_layer_1__.w0"
+  size: 320
+  initial_mean: 0.0
+  initial_std: 0.176776695297
+  dims: 32
+  dims: 10
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___fc_layer_1__.wbias"
+  size: 10
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 10
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "data"
+output_layer_names: "__fc_layer_1__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "__addto_0__"
+  layer_names: "__addto_1__"
+  layer_names: "__addto_2__"
+  layer_names: "__addto_3__"
+  layer_names: "__addto_4__"
+  layer_names: "__addto_5__"
+  layer_names: "__addto_6__"
+  layer_names: "__addto_7__"
+  layer_names: "__addto_8__"
+  layer_names: "__addto_9__"
+  layer_names: "__addto_10__"
+  layer_names: "__addto_11__"
+  layer_names: "__addto_12__"
+  layer_names: "__addto_13__"
+  layer_names: "__addto_14__"
+  layer_names: "__addto_15__"
+  layer_names: "__addto_16__"
+  layer_names: "__addto_17__"
+  layer_names: "__addto_18__"
+  layer_names: "__addto_19__"
+  layer_names: "__addto_20__"
+  layer_names: "__addto_21__"
+  layer_names: "__addto_22__"
+  layer_names: "__addto_23__"
+  layer_names: "__addto_24__"
+  layer_names: "__addto_25__"
+  layer_names: "__addto_26__"
+  layer_names: "__addto_27__"
+  layer_names: "__addto_28__"
+  layer_names: "__addto_29__"
+  layer_names: "__addto_30__"
+  layer_names: "__addto_31__"
+  layer_names: "__fc_layer_0__"
+  layer_names: "__fc_layer_1__"
+  input_layer_names: "data"
+  output_layer_names: "__fc_layer_1__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py b/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a693f8dff06dec6e71eeb488da9c807c35e4c9b
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py
@@ -0,0 +1,16 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+din = data_layer(name='data', size=100)
+
+enc = din
+for i in range(32):
+    enc = addto_layer([enc, enc])
+
+pred = fc_layer(
+    input=fc_layer(
+        input=enc, size=32, act=ReluActivation()),
+    size=10,
+    act=SoftmaxActivation())
+outputs(pred)
diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py
index e1dc4f4c30051202e8fd077087679c4fd6cbd7a0..2a631c365f27a6039021a56268a62017638c2739 100644
--- a/python/paddle/v2/dataset/wmt14.py
+++ b/python/paddle/v2/dataset/wmt14.py
@@ -32,9 +32,9 @@ MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
 # this is a small set of data for test. The original data is too large and will be add later.
 URL_TRAIN = 'http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz'
 MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c'
-# this is the pretrained model, whose bleu = 26.92
+# BLEU of this trained model is 26.92
 URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz'
-MD5_MODEL = '4ce14a26607fb8a1cc23bcdedb1895e4'
+MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3'
 
 START = "<s>"
 END = "<e>"
diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt
index 8cb0c5c3765a00b45177117925e320e61a1b609a..d809917af14aaf0ffc3a3d95fd5d8fbe028a61cb 100644
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
@@ -1 +1 @@
-add_python_test(test_framework test_protobuf.py)
+add_python_test(test_framework test_protobuf.py test_scope.py)
diff --git a/python/paddle/v2/framework/tests/test_protobuf.py b/python/paddle/v2/framework/tests/test_protobuf.py
index f0e60191991e2f24b0a1972afb0f6cbd3aaa4008..b8702477e64203e735bff05b115eafbb2a52172d 100644
--- a/python/paddle/v2/framework/tests/test_protobuf.py
+++ b/python/paddle/v2/framework/tests/test_protobuf.py
@@ -24,3 +24,7 @@ class TestFrameworkProto(unittest.TestCase):
         attr.type = attr_type_lib.FLOAT
         op_proto.type = "cos"
         self.assertTrue(op_proto.IsInitialized())
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_scope.py b/python/paddle/v2/framework/tests/test_scope.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0ee45cfc75e486c693a00d92a97ac0970195581
--- /dev/null
+++ b/python/paddle/v2/framework/tests/test_scope.py
@@ -0,0 +1,37 @@
+import paddle.v2.framework.core
+import unittest
+
+
+class TestScope(unittest.TestCase):
+    def test_create_destroy(self):
+        paddle_c = paddle.v2.framework.core
+        scope = paddle_c.Scope(None)
+        self.assertIsNotNone(scope)
+        scope_with_parent = paddle_c.Scope(scope)
+        self.assertIsNotNone(scope_with_parent)
+
+    def test_none_variable(self):
+        paddle_c = paddle.v2.framework.core
+        scope = paddle_c.Scope(None)
+        self.assertIsNone(scope.get_var("test"))
+
+    def test_create_var_get_var(self):
+        paddle_c = paddle.v2.framework.core
+        scope = paddle_c.Scope(None)
+        var_a = scope.create_var("var_a")
+        self.assertIsNotNone(var_a)
+        self.assertIsNotNone(scope.get_var('var_a'))
+        scope2 = paddle_c.Scope(scope)
+        self.assertIsNotNone(scope2.get_var('var_a'))
+
+    def test_var_get_int(self):
+        paddle_c = paddle.v2.framework.core
+        scope = paddle_c.Scope(None)
+        var = scope.create_var("test_int")
+        var.set_int(10)
+        self.assertTrue(var.is_int())
+        self.assertEqual(10, var.get_int())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py
index 8124e219ba499333ecdf4b34ff5352e281aaa016..390c22ee552c506fde1567efba1326a6d735ad2e 100644
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
@@ -66,6 +66,8 @@ class Optimizer(object):
             if use_sparse_remote_updater:
                         gradient_machine.prefetch(in_args)
                         parameter_updater.getParametersRemote()
+
+        :param pserver_spec: pserver location, eg: localhost:3000
         :return: parameter_updater
         """
         if is_local:
diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
index f9658a8c5df9562073c8a187074a6cb3459ac5d9..96c6c4b89a2f2e2c3ecb95213e0e0191b1998f50 100644
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -41,6 +41,7 @@ class SGD(object):
     :type parameters: paddle.v2.parameters.Parameters
     :param extra_layers: Some layers in the neural network graph are not
                          in the path of cost layer.
+    :param pserver_spec: pserver location, eg: localhost:3000
     :type extra_layers: paddle.v2.config_base.Layer
     """
 
diff --git a/python/setup.py.in b/python/setup.py.in
index a422b3832f4c9c60bc5406277f9ada7032f85f51..271ee6e5526981ad94710315d1472b0f4069a1aa 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -29,7 +29,9 @@ setup(name='paddle',
       description='Parallel Distributed Deep Learning',
       install_requires=setup_requires,
       packages=packages,
-      package_data={'paddle.v2.master': ['libpaddle_master.so'], },
+      package_data={'paddle.v2.master': ['libpaddle_master.so'],
+            'paddle.v2.framework': ['core.so']
+      },
       package_dir={
           '': '${CMAKE_CURRENT_SOURCE_DIR}',
           # The paddle.v2.framework.proto will be generated while compiling.