Merge branch 'develop' of https://github.com/paddlepaddle/paddle into voc_dataset

a5239ac7 · wanghaoshuang · a698824a · c1ae3396 · a5239ac7 · a5239ac7
57 changed file
--- a/.gitignore
+++ b/.gitignore
@@ -19,3 +19,6 @@ third_party/
 # clion workspace.
 cmake-build-*
+# generated while compiling
+python/paddle/v2/framework/core.so
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -97,6 +97,7 @@ include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
 include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
+include(external/pybind11)    # download pybind11
 include(cudnn)              # set cudnn libraries, must before configure
 include(configure)          # add paddle env configuration

--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/develop/doc/)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://doc.paddlepaddle.org/develop/doc/)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/doc_cn/)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://doc.paddlepaddle.org/develop/doc_cn/)
 [![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -61,35 +61,36 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
 ## Installation
 It is recommended to check out the
-[Docker installation guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html)
+[Docker installation guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html)
 before looking into the
-[build from source guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html)
+[build from source guide](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html)
 ## Documentation
-We provide [English](http://www.paddlepaddle.org/develop/doc/) and
+We provide [English](http://doc.paddlepaddle.org/develop/doc/) and
-[Chinese](http://www.paddlepaddle.org/doc_cn/) documentation.
+[Chinese](http://doc.paddlepaddle.org/doc_cn/) documentation.
 - [Deep Learning 101](http://book.paddlepaddle.org/index.html)
  You might want to start from this online interactive book that can run in Jupyter Notebook.
- [Distributed Training](http://www.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)
+- [Distributed Training](http://doc.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)
  You can run distributed training jobs on MPI clusters.
- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html)
+- [Distributed Training on Kubernetes](http://doc.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html)
   You can also run distributed training jobs on Kubernetes clusters.
- [Python API](http://www.paddlepaddle.org/develop/doc/api/index_en.html)
+- [Python API](http://doc.paddlepaddle.org/develop/doc/api/index_en.html)
   Our new API enables much shorter programs.
- [How to Contribute](http://www.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html)
+- [How to Contribute](http://doc.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html)
   We appreciate your contributions!
 ## Ask Questions
 You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues).

--- a/cmake/external/pybind11.cmake
+++ b/cmake/external/pybind11.cmake
+# Downloads the header-only pybind11 sources as an external project and exposes
+# them through the `pybind` target. Nothing is configured, built or installed;
+# only the include directory of the checkout is used.
+INCLUDE(ExternalProject)
+SET(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind)
+# ExternalProject places the checkout in <PREFIX>/src/<project name>.
+INCLUDE_DIRECTORIES(${PYBIND_SOURCE_DIR}/src/extern_pybind/include)
+ExternalProject_Add(
+        extern_pybind
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY  "https://github.com/pybind/pybind11.git"
+        GIT_TAG         "v2.1.1"
+        PREFIX          ${PYBIND_SOURCE_DIR}
+        UPDATE_COMMAND  ""
+        CONFIGURE_COMMAND ""
+        BUILD_COMMAND     ""
+        INSTALL_COMMAND   ""
+        TEST_COMMAND      ""
+)
+# add_dependencies() on an INTERFACE library needs CMake >= 3.3; older versions
+# get a dummy static library that only exists to carry the dependency.
+# if() dereferences the bare variable name itself, so no ${} expansion is needed.
+if (CMAKE_VERSION VERSION_LESS "3.3.0")
+    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/pybind_dummy.c)
+    # Use a symbol name specific to this library (it was `dummy_any`, presumably
+    # copied from another external) so dummy objects can never clash at link time.
+    file(WRITE ${dummyfile} "const char * dummy_pybind = \"${dummyfile}\";")
+    add_library(pybind STATIC ${dummyfile})
+else()
+    add_library(pybind INTERFACE)
+endif()
+# Make sure the sources are downloaded before anything depending on `pybind` builds.
+add_dependencies(pybind extern_pybind)
+LIST(APPEND external_project_dependencies pybind)
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -18,6 +18,9 @@ INCLUDE(python_module)
 FIND_PACKAGE(PythonInterp 2.7)
 IF(WITH_PYTHON)
    FIND_PACKAGE(PythonLibs 2.7)
+    # Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
+    ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
+    SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})
 ENDIF(WITH_PYTHON)
 SET(py_env "")

--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -109,7 +109,9 @@ set(COMMON_FLAGS
    -Wno-unused-function
    -Wno-error=literal-suffix
    -Wno-error=sign-compare
-    -Wno-error=unused-local-typedefs)
+    -Wno-error=unused-local-typedefs
+    -Wno-error=parentheses-equality # Warnings in Pybind11
+)
 set(GPU_COMMON_FLAGS
    -fPIC

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -93,6 +93,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR})
 if(NOT APPLE)
    find_package(Threads REQUIRED)
    link_libraries(${CMAKE_THREAD_LIBS_INIT})
+    set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -ldl")
 endif(NOT APPLE)
 function(merge_static_libs TARGET_NAME)

--- a/doc/howto/dev/new_layer_cn.rst
+++ b/doc/howto/dev/new_layer_cn.rst
@@ -37,7 +37,7 @@
   \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x}
-假设 :math:`z = f(W^T x + b)` ，那么
+假设 :math:`z = W^T x + b` ，那么
 .. math::

--- a/doc/howto/dev/new_layer_en.rst
+++ b/doc/howto/dev/new_layer_en.rst
@@ -37,7 +37,7 @@ Suppose our loss function is :math:`c(y)`, then
   \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x}
-Suppose :math:`z = f(W^T x + b)`, then
+Suppose :math:`z = W^T x + b`, then
 .. math::

--- a/doc/howto/dev/write_docs_cn.rst
+++ b/doc/howto/dev/write_docs_cn.rst
@@ -41,7 +41,7 @@ PaddlePaddle文档需要准备的环境相对较复杂，所以我们推荐使
    python -c "import py_paddle"
-如果提示错误，那么用户需要在本地编译安装PaddlePaddle，请参考 `源码编译文档 <http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html>`_ 。
+如果提示错误，那么用户需要在本地编译安装PaddlePaddle，请参考 `源码编译文档 <http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html>`_ 。
 注意，用户在首次编译安装PaddlePaddle时，请将WITH_DOC选项关闭。在编译安装正确之后，请再次确认py_paddle包已经安装，即可进行下一步操作。
 如果提示正确，可以执行以下命令编译生成文档，即
@@ -68,9 +68,9 @@ PaddlePaddle文档使用 `sphinx`_ 自动生成，用户可以参考sphinx教程
 如何更新www.paddlepaddle.org文档
 ================================
-开发者给PaddlePaddle代码增加的注释以PR的形式提交到github中，提交方式可参见 `贡献文档 <http://paddlepaddle.org/develop/doc_cn/howto/dev/contribute_to_paddle_cn.html>`_ 。
+开发者给PaddlePaddle代码增加的注释以PR的形式提交到github中，提交方式可参见 `贡献文档 <http://doc.paddlepaddle.org/develop/doc_cn/howto/dev/contribute_to_paddle_cn.html>`_ 。
-目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 <http://www.paddlepaddle.org/develop/doc_cn/>`_ 和
+目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 <http://doc.paddlepaddle.org/develop/doc_cn/>`_ 和
-`英文文档 <http://www.paddlepaddle.org/develop/doc/>`_ 。
+`英文文档 <http://doc.paddlepaddle.org/develop/doc/>`_ 。

--- a/go/cmd/pserver/pserver.go
+++ b/go/cmd/pserver/pserver.go
@@ -20,6 +20,8 @@ func main() {
 		"comma separated endpoint string for pserver to connect to etcd")
 	etcdTimeout := flag.Int("etcd-timeout", 5, "timeout for etcd calls")
 	numPservers := flag.Int("num-pservers", 1, "total pserver count in a training job")
+	checkpointPath := flag.String("checkpoint-path", "/checkpoints/", "save checkpoint path")
+	checkpointInterval := flag.Int("checkpoint-interval", 600, "save checkpoint per interval seconds")
 	logLevel := flag.String("log-level", "info",
 		"log level, possible values: debug, info, warning, error, fatal, panic")
 	flag.Parse()
@@ -31,18 +33,20 @@ func main() {
 	log.SetLevel(level)
 	var idx int
+	var cp pserver.Checkpoint
+	var e *pserver.EtcdClient
 	if *index >= 0 {
 		idx = *index
 	} else {
 		timeout := time.Second * time.Duration((*etcdTimeout))
-		e := pserver.NewEtcdClient(*etcdEndpoint, *numPservers, timeout)
+		e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, timeout)
 		idx, err = e.Register()
 		if err != nil {
 			panic(err)
 		}
 	}
-	s, err := pserver.NewService(idx)
+	s, err := pserver.NewService(idx, *checkpointInterval, *checkpointPath, e, cp)
 	if err != nil {
 		panic(err)
 	}

--- a/go/master/client.go
+++ b/go/master/client.go
@@ -68,7 +68,7 @@ func (c *Client) getRecords() {
 		// We treat a task as finished whenever the last data
 		// instance of the task is read. This is not exactly
 		// correct, but a reasonable approximation.
-		c.taskFinished(t.ID)
+		c.taskFinished(t.Meta.ID)
 	}
 }
@@ -118,6 +118,11 @@ func (c *Client) taskFinished(taskID int) error {
 	return c.conn.Call("Service.TaskFinished", taskID, nil)
 }
+// taskFailed tells the master server that a task has failed.
+func (c *Client) taskFailed(meta TaskMeta) error {
+	return c.conn.Call("Service.TaskFailed", meta, nil)
+}
 // NextRecord returns next record in the dataset.
 //
 // NextRecord will block until the next record is available. It is

--- a/go/master/client_internal_test.go
+++ b/go/master/client_internal_test.go
@@ -95,10 +95,16 @@ func TestGetFinishTask(t *testing.T) {
 			t.Fatalf("Should get error, pass: %d\n", i)
 		}
-		err = c.taskFinished(tasks[0].ID)
+		err = c.taskFinished(tasks[0].Meta.ID)
 		if err != nil {
 			t.Fatalf("Error: %v, pass: %d\n", err, i)
 		}
+		err = c.taskFailed(tasks[0].Meta)
+		if err != nil {
+			t.Fatalf("Error: %v, pass: %d\n", err, i)
+		}
 		tasks = tasks[1:]
 		task, err := c.getTask()
 		if err != nil {
@@ -107,7 +113,7 @@ func TestGetFinishTask(t *testing.T) {
 		tasks = append(tasks, task)
 		for _, task := range tasks {
-			err = c.taskFinished(task.ID)
+			err = c.taskFinished(task.Meta.ID)
 			if err != nil {
 				t.Fatalf("Error: %v, pass: %d\n", err, i)
 			}

--- a/go/master/service.go
+++ b/go/master/service.go
@@ -31,30 +31,36 @@ type Chunk struct {
 	Index recordio.Index // chunk index
 }
+// TaskMeta is a struct which stores task's meta info.
+type TaskMeta struct {
+	ID    int
+	Epoch int
+}
 // Task is the basic unit of data instances assigned to trainers.
 type Task struct {
-	ID     int
+	Meta   TaskMeta
 	Chunks []Chunk
 }
 type taskEntry struct {
-	Epoch      int
-	NumTimeout int
 	Task Task
+	// A task fails if it times out or the trainer reports that it exited abnormally.
+	NumFailure int
 }
 type taskQueues struct {
 	Todo    []taskEntry
 	Pending map[int]taskEntry // map from task ID to task entry
 	Done    []taskEntry
-	Failed  []Task
+	Failed  []taskEntry
 }
 // Service is the master server service.
 type Service struct {
 	chunksPerTask int
 	timeoutDur    time.Duration
-	timeoutMax    int
+	failureMax    int
 	ready         chan struct{}
 	store         Store
@@ -73,7 +79,7 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
 	var cur taskEntry
 	for i, c := range chunks {
 		if i%chunksPerTask == 0 && len(cur.Task.Chunks) > 0 {
-			cur.Task.ID = id
+			cur.Task.Meta.ID = id
 			id++
 			result = append(result, cur)
 			cur.Task.Chunks = nil
@@ -83,7 +89,7 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
 	}
 	if len(cur.Task.Chunks) > 0 {
-		cur.Task.ID = id
+		cur.Task.Meta.ID = id
 		result = append(result, cur)
 	}
@@ -91,11 +97,11 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
 }
 // NewService creates a new service.
-func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, timeoutMax int) (*Service, error) {
+func NewService(store Store, chunksPerTask int, timeoutDur time.Duration, failureMax int) (*Service, error) {
 	s := &Service{}
 	s.chunksPerTask = chunksPerTask
 	s.timeoutDur = timeoutDur
-	s.timeoutMax = timeoutMax
+	s.failureMax = failureMax
 	s.taskQueues = taskQueues{}
 	s.taskQueues.Pending = make(map[int]taskEntry)
 	s.ready = make(chan struct{})
@@ -257,19 +263,10 @@ func (s *Service) SetDataset(globPaths []string, dummy *int) error {
 	return nil
 }
-func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
+func (s *Service) processFailedTask(t taskEntry, epoch int) {
-	return func() {
+	if t.Task.Meta.Epoch != epoch {
-		s.mu.Lock()
-		defer s.mu.Unlock()
-		t, ok := s.taskQueues.Pending[taskID]
-		if !ok {
-			return
-		}
-		if t.Epoch != epoch {
 		// new epoch, task launched after the
-			// schedule of this timeout check.
+		// schedule of this timeout check or failed status report.
 		return
 	}
@@ -280,17 +277,31 @@ func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
 		}
 	}()
-		delete(s.taskQueues.Pending, t.Task.ID)
+	delete(s.taskQueues.Pending, t.Task.Meta.ID)
-		t.NumTimeout++
+	t.NumFailure++
-		if t.NumTimeout > s.timeoutMax {
+	if t.NumFailure > s.failureMax {
-			log.Warningf("Task %v timed out %d times, discard.", t.Task, t.NumTimeout)
+		log.Warningf("Task %v failed %d times, discard.", t.Task, t.NumFailure)
-			s.taskQueues.Failed = append(s.taskQueues.Failed, t.Task)
+		s.taskQueues.Failed = append(s.taskQueues.Failed, t)
 		return
 	}
-		log.Warningf("Task %v timed out %d times, retry.", t.Task, t.NumTimeout)
+	// Still within the failure limit: the task goes back to the todo queue, so
+	// report a retry rather than a discard.
+	log.Warningf("Task %v failed %d times, retry.", t.Task, t.NumFailure)
 	s.taskQueues.Todo = append(s.taskQueues.Todo, t)
+	return
+}
+func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
+	return func() {
+		s.mu.Lock()
+		defer s.mu.Unlock()
+		t, ok := s.taskQueues.Pending[taskID]
+		if !ok {
+			return
+		}
+		s.processFailedTask(t, epoch)
 	}
 }
@@ -339,18 +350,18 @@ func (s *Service) GetTask(dummy int, task *Task) error {
 	}
 	t := s.taskQueues.Todo[0]
-	t.Epoch++
+	t.Task.Meta.Epoch++
 	s.taskQueues.Todo = s.taskQueues.Todo[1:]
-	s.taskQueues.Pending[t.Task.ID] = t
+	s.taskQueues.Pending[t.Task.Meta.ID] = t
 	err := s.snapshot()
 	if err != nil {
 		return err
 	}
 	*task = t.Task
-	log.WithFields(s.logFields()).Infof("Task #%d dispatched.", task.ID)
+	log.WithFields(s.logFields()).Infof("Task #%v dispatched.", t.Task.Meta)
-	time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.ID, t.Epoch))
+	time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.Meta.ID, t.Task.Meta.Epoch))
 	return nil
 }
@@ -365,13 +376,12 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
 	t, ok := s.taskQueues.Pending[taskID]
 	if !ok {
-		err := errors.New("pending task not found")
 		log.WithFields(s.logFields()).Warningln("Pending task #%d not found.", taskID)
-		return err
+		return nil
 	}
 	// task finished, reset timeout
-	t.NumTimeout = 0
+	t.NumFailure = 0
 	s.taskQueues.Done = append(s.taskQueues.Done, t)
 	delete(s.taskQueues.Pending, taskID)
@@ -389,3 +399,22 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
 	}
 	return err
 }
+// TaskFailed tells the service that a task has failed.
+//
+// meta carries the ID of the task and the epoch the reporter saw. The call
+// first waits on s.ready. If no pending task has that ID, the report is logged
+// and ignored; otherwise the task is handed to processFailedTask together with
+// the reported epoch. The method always returns nil.
+func (s *Service) TaskFailed(meta TaskMeta, dummy *int) error {
+	<-s.ready
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	t, ok := s.taskQueues.Pending[meta.ID]
+	if !ok {
+		// t is the zero value when the lookup fails, so log the meta that was
+		// reported. Warningf is required for the format verb to be expanded.
+		log.WithFields(s.logFields()).Warningf("TaskFailed:Pending task #%v not found.", meta)
+		return nil
+	}
+	s.processFailedTask(t, meta.Epoch)
+	return nil
+}
--- a/go/master/service_internal_test.go
+++ b/go/master/service_internal_test.go
@@ -30,7 +30,7 @@ func TestPartionIndex(t *testing.T) {
 	cs := make([]Chunk, 100)
 	ts := partition(cs, 20)
 	for i := range ts {
-		if ts[i].Task.ID != i {
+		if ts[i].Task.Meta.ID != i {
 			t.Error(ts[i], i)
 		}
 	}

--- a/go/pserver/client/c/test/test_train.py
+++ b/go/pserver/client/c/test/test_train.py
@@ -19,7 +19,7 @@ def main():
    # create parameters
    parameters = paddle.parameters.create(cost)
-    # create optimizer
+    # create optimizer of new remote updater to pserver
    optimizer = paddle.optimizer.Momentum(momentum=0)
    #TODO(zhihong) : replace optimizer with new OptimizerConfig

--- a/go/pserver/client/client_test.go
+++ b/go/pserver/client/client_test.go
@@ -42,7 +42,8 @@ func initClient() [numPserver]int {
 		ports[i] = p
 		go func(l net.Listener) {
-			s, err := pserver.NewService(0)
+			var cp pserver.Checkpoint
+			s, err := pserver.NewService(0, 1, "", nil, cp)
 			if err != nil {
 				panic(err)
 			}
@@ -174,7 +175,7 @@ func TestNativeClient(t *testing.T) {
 // TODO: temporarily disable the etcdClient test because it depends on etcd.
 func EtcdClient(t *testing.T) {
 	initEtcdClient()
-	etcd_client := client.NewEtcd(etcdEndpoints)
+	etcdClient := client.NewEtcd(etcdEndpoints)
-	c2 := client.NewClient(etcd_client, etcd_client.Desired(), selector(true))
+	c2 := client.NewClient(etcdClient, etcdClient.Desired(), selector(true))
 	ClientTest(t, c2)
 }
--- a/go/pserver/etcd_client.go
+++ b/go/pserver/etcd_client.go
@@ -18,6 +18,8 @@ const (
 	PsDesired = "/ps_desired"
 	// PsPath is the base dir for pservers to store their addresses
 	PsPath = "/ps/"
+	// PsCheckpoint is the etcd path for storing checkpoint information
+	PsCheckpoint = "/checkpoints/"
 )
 // EtcdClient is the etcd client that the pserver uses for fault
@@ -186,3 +188,14 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) {
 	return idx, nil
 }
+// PutKey writes value under key in etcd. The request context is cancelled
+// after timeout seconds. Any error from the etcd Put call is returned as is.
+func (e *EtcdClient) PutKey(key string, value []byte, timeout int) error {
+	limit := time.Second * time.Duration(timeout)
+	ctx, cancel := context.WithTimeout(context.Background(), limit)
+	defer cancel()
+	_, putErr := e.etcdClient.Put(ctx, key, string(value))
+	return putErr
+}
--- a/go/pserver/optimizer.go
+++ b/go/pserver/optimizer.go
@@ -35,22 +35,30 @@ func cArrayToSlice(p unsafe.Pointer, len int) []byte {
 	return (*[1 << 30]byte)(p)[:len:len]
 }
-func newOptimizer(paramWithConfigs ParameterWithConfig) *optimizer {
+// newOptimizer creates the C optimizer for one parameter.
+//
+// paramWithConfigs supplies the parameter bytes and the serialized optimizer
+// config. State is an optional serialized optimizer state; nil or empty means
+// no state is passed to the C library. The parameter content is copied into a
+// buffer allocated with C.malloc before it is handed to the C library.
+func newOptimizer(paramWithConfigs ParameterWithConfig, State []byte) *optimizer {
 	o := &optimizer{}
 	o.elementType = paramWithConfigs.Param.ElementType
 	p := paramWithConfigs.Param
 	c := paramWithConfigs.Config
+	s := State
+	// Number of float elements in the parameter, NOT its size in bytes.
+	paramBufferSize := C.size_t(len(p.Content) / C.sizeof_float)
 	log.WithFields(log.Fields{
 		"ElementType": p.ElementType,
-		"ParamSize":   len(p.Content),
+		"ParamSize":   paramBufferSize,
 		"ConfigSize":  len(c),
+		"StateSize":   len(s),
 	}).Info("New Optimizer Created with config:")
 	var cbuffer unsafe.Pointer
 	// Allocate and copy the full byte length of the content. Using the element
 	// count here would copy only a fraction of the parameter while the C side
 	// is told the buffer holds paramBufferSize floats.
 	cbuffer = C.malloc(C.size_t(len(p.Content)))
 	C.memcpy(cbuffer, unsafe.Pointer(&p.Content[0]), C.size_t(len(p.Content)))
+	var cstate unsafe.Pointer
+	if len(s) != 0 {
+		cstate = unsafe.Pointer(&s[0])
+	}
 	o.opt = C.paddle_create_optimizer((*C.uchar)(&c[0]), C.int(len(c)),
-		C.paddle_element_type(p.ElementType), cbuffer, C.int(len(p.Content)/C.sizeof_float),
-		(*C.char)(nullPtr), 0)
+		C.paddle_element_type(p.ElementType), cbuffer, C.int(paramBufferSize), (*C.char)(cstate), C.int(len(s)))
 	return o
 }
@@ -60,6 +68,12 @@ func (o *optimizer) GetWeights() []byte {
 	return cArrayToSlice(buffer, int(bufferLen)*C.sizeof_float)
 }
+// GetStates asks the C optimizer for its state via paddle_optimizer_get_state
+// and returns a byte slice over the buffer it reports. The slice is built with
+// cArrayToSlice, which wraps the C memory without copying it.
+func (o *optimizer) GetStates() []byte {
+	var state *C.char
+	stateLen := int(C.paddle_optimizer_get_state(o.opt, &state))
+	return cArrayToSlice(unsafe.Pointer(state), stateLen)
+}
 func (o *optimizer) UpdateParameter(g Gradient) error {
 	if o.elementType != g.ElementType {
 		return fmt.Errorf("Name: %s, parameter and gradient element type not match, parameter: %v, gradient: %v", g.Name, o.elementType, g.ElementType)

--- a/go/pserver/optimizer_test.go
+++ b/go/pserver/optimizer_test.go
@@ -19,6 +19,6 @@ func TestOptimizerCreateRelease(t *testing.T) {
 		Param:  p,
 		Config: config,
 	}
-	o := newOptimizer(param)
+	o := newOptimizer(param, nil)
 	o.Cleanup()
 }
--- a/go/pserver/service.go
+++ b/go/pserver/service.go
 package pserver
 import (
+	"bufio"
+	"bytes"
+	"crypto/md5"
+	"encoding/gob"
+	"encoding/hex"
+	"encoding/json"
 	"errors"
 	"fmt"
+	"os"
+	"path/filepath"
+	"strconv"
 	"sync"
+	"time"
+	log "github.com/sirupsen/logrus"
 )
 // ElementType is the type of elements of a Parameter.
@@ -39,6 +51,22 @@ type ParameterWithConfig struct {
 	Config []byte // parameter configuration in Proto Buffer format
 }
+// ParameterCheckpoint is Parameter and State checkpoint
+type ParameterCheckpoint struct {
+	ParamConfig ParameterWithConfig
+	State       []byte
+}
+// checkpointMeta is the checkpoint signature, serialized as JSON.
+type checkpointMeta struct {
+	UUID      string `json:"uuid"`
+	Md5sum    string `json:"md5sum"`
+	Timestamp string `json:"timestamp"`
+}
+// Checkpoint is the pserver shard state that is persisted to a file
+type Checkpoint []ParameterCheckpoint
 // Gradient is the gradient of the parameter.
 type Gradient Parameter
@@ -46,19 +74,32 @@ type Gradient Parameter
 type Service struct {
 	initialized        chan struct{}
 	idx                int
+	checkpointInterval time.Duration
+	checkpointPath     string
+	client             *EtcdClient
 	mu                 sync.Mutex
 	optMap             map[string]*optimizer
 }
 // NewService creates a new service, will bypass etcd registration if no
 // endpoints specified.
-func NewService(idx int) (*Service, error) {
+func NewService(idx int, seconds int, path string, client *EtcdClient, cp Checkpoint) (*Service, error) {
 	s := &Service{
 		idx:                idx,
+		checkpointInterval: time.Second * time.Duration(seconds),
+		checkpointPath:     path,
+		client:             client,
 	}
 	s.optMap = make(map[string]*optimizer)
 	s.initialized = make(chan struct{})
+	if cp != nil {
+		for _, item := range cp {
+			p := item.ParamConfig
+			st := item.State
+			s.optMap[p.Param.Name] = newOptimizer(p, st)
+		}
+	}
 	return s, nil
 }
@@ -78,7 +119,7 @@ func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) er
 	// TODO(helin): check if paramWithConfigs.Param.Content is
 	// properly memory aligned, if not, make copy to a memory
 	// aligned region.
-	s.optMap[paramWithConfigs.Param.Name] = newOptimizer(paramWithConfigs)
+	s.optMap[paramWithConfigs.Param.Name] = newOptimizer(paramWithConfigs, nil)
 	return nil
 }
@@ -139,10 +180,57 @@ func (s *Service) GetParam(name string, parameter *Parameter) error {
 	return nil
 }
-// Save tells the parameter server to save parameters.
-func (s *Service) Save(path string, dummy *int) error {
+// doCheckpoint saves a checkpoint of this pserver shard.
+//
+// It waits until the service is initialized, then, holding s.mu, gob-encodes
+// the name, element type, weights and optimizer state of every entry in
+// optMap. A JSON checkpointMeta (file path, md5 of the encoded bytes,
+// timestamp) is put into etcd under PsCheckpoint/<idx>, and the encoded bytes
+// are written to the file checkpointPath + <idx>. The first error encountered
+// is returned.
+//
+// NOTE(review): ParamConfig.Config is not filled in here, while newOptimizer
+// reads the first byte of the config -- confirm how checkpoints are restored.
+func (s *Service) doCheckpoint() error {
 	<-s.initialized
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	// The slice has length 0 and only a preallocated capacity, so entries must
+	// be appended; assigning by index would panic.
+	cp := make([]ParameterCheckpoint, 0, len(s.optMap))
+	for name, opt := range s.optMap {
+		var pc ParameterCheckpoint
+		pc.ParamConfig.Param.Name = name
+		pc.ParamConfig.Param.ElementType = opt.elementType
+		pc.ParamConfig.Param.Content = opt.GetWeights()
+		pc.State = opt.GetStates()
+		cp = append(cp, pc)
+	}
+	var buf bytes.Buffer
+	encoder := gob.NewEncoder(&buf)
+	err := encoder.Encode(cp)
+	if err != nil {
+		return err
+	}
-	// TODO
+	cpMeta := checkpointMeta{}
+	cpMeta.UUID = s.checkpointPath + strconv.Itoa(s.idx)
+	cpMeta.Timestamp = time.Now().String()
+	// md5.Sum hashes the payload. hash.Hash.Sum(b) would only append the
+	// digest of the (empty) data written so far to b.
+	digest := md5.Sum(buf.Bytes())
+	cpMeta.Md5sum = hex.EncodeToString(digest[:])
+	cpMetajson, err := json.Marshal(cpMeta)
+	if err != nil {
+		return err
+	}
+	// NewService may be given a nil etcd client (e.g. when the pserver index
+	// is set explicitly); in that case only the local file is written.
+	if s.client != nil {
+		err = s.client.PutKey(filepath.Join(PsCheckpoint, strconv.Itoa(s.idx)), cpMetajson, 3)
+		if err != nil {
+			return err
+		}
+	}
+	if _, err = os.Stat(cpMeta.UUID); os.IsNotExist(err) {
+		log.Info("checkpoint does not exists.")
+	} else {
+		err = os.Remove(cpMeta.UUID)
+		log.Infof("checkpoint %s already exsits, removing ", cpMeta.UUID)
+	}
+	f, err := os.Create(cpMeta.UUID)
+	if err != nil {
+		return err
+	}
+	// Register Close only after Create succeeded, so f is known to be valid.
+	defer f.Close()
+	writer := bufio.NewWriter(f)
+	_, err = writer.Write(buf.Bytes())
+	if err != nil {
+		return err
+	}
-	return nil
+	// Buffered bytes reach the file only on Flush, so its error matters too.
+	return writer.Flush()
 }
--- a/go/pserver/service_test.go
+++ b/go/pserver/service_test.go
@@ -15,7 +15,8 @@ const (
 )
 func TestServiceFull(t *testing.T) {
-	s, err := pserver.NewService(0)
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, 1, "", nil, cp)
 	if err != nil {
 		t.Error(err)
 	}
@@ -86,7 +87,8 @@ func TestServiceFull(t *testing.T) {
 }
 func TestMultipleInit(t *testing.T) {
-	s, err := pserver.NewService(0)
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, 1, "", nil, cp)
 	if err != nil {
 		t.Error(err)
 	}
@@ -102,7 +104,8 @@ func TestMultipleInit(t *testing.T) {
 }
 func TestUninitialized(t *testing.T) {
-	s, err := pserver.NewService(0)
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, 1, "", nil, cp)
 	err = s.SendGrad(pserver.Gradient{}, nil)
 	if err.Error() != pserver.Uninitialized {
 		t.FailNow()
@@ -110,7 +113,8 @@ func TestUninitialized(t *testing.T) {
 }
 func TestBlockUntilInitialized(t *testing.T) {
-	s, err := pserver.NewService(0)
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, 1, "", nil, cp)
 	if err != nil {
 		t.Error(err)
 	}
@@ -128,16 +132,6 @@ func TestBlockUntilInitialized(t *testing.T) {
 		ch <- struct{}{}
 	}()
-	wg.Add(1)
-	go func() {
-		err := s.Save("", nil)
-		if err != nil {
-			errCh <- err
-		}
-		wg.Done()
-		ch <- struct{}{}
-	}()
 	time.Sleep(50 * time.Millisecond)
 	select {
@@ -170,3 +164,7 @@ func TestBlockUntilInitialized(t *testing.T) {
 	wg.Wait()
 }
+func TestCheckpointSpeed(t *testing.T) {
+	//TODO(zhihong): test speed
+}
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -15,6 +15,8 @@ if(Boost_FOUND)
  add_subdirectory(memory)
  add_subdirectory(platform)
  add_subdirectory(framework)
+  add_subdirectory(operators)
+  add_subdirectory(pybind)
 endif()
 if(WITH_C_API)

--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -11,8 +11,14 @@ proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
 cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
 proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
 cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
-cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_proto op_desc)
+cc_library(operator SRCS operator.cc DEPS op_desc protobuf)
+cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
+cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc)
+cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry operator)
 py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto)
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)
+proto_library(net_proto SRCS net_proto.proto DEPS op_proto)
+cc_library(net SRCS net.cc DEPS net_proto)
--- a/paddle/framework/net.cc
+++ b/paddle/framework/net.cc
+#include "paddle/framework/net.h"
+namespace paddle {
+namespace framework {
+// `def` is not used yet: the constructor body is empty.
+PlainNet::PlainNet(const NetDesc& def) {}
+// Calls InferShape() on every operator, in the order they are stored in
+// `ops_`. `scope` is currently not used.
+void PlainNet::InferShape(Scope* scope) {
+  for (auto it = ops_.begin(); it != ops_.end(); ++it) {
+    it->InferShape();
+  }
+}
+// Calls Run(ctx) on every operator, in the order they are stored in `ops_`.
+// `scope` is currently not used.
+void PlainNet::Run(std::shared_ptr<Scope> scope, DeviceContext* ctx) {
+  for (auto it = ops_.begin(); it != ops_.end(); ++it) {
+    it->Run(ctx);
+  }
+}
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/net.h
+++ b/paddle/framework/net.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include "paddle/framework/net_proto.pb.h"
+#include "paddle/framework/op_proto.pb.h"
+#include "paddle/framework/scope.h"
+#include "paddle/platform/device_context.h"
+namespace paddle {
+namespace framework {
+using namespace paddle::platform;
+// operator's index stored in a network.
+typedef int OpIndex;
+/**
+ * NOTE: the following code defines placeholders for concepts that are not
+ * implemented yet. They are just enough to make Net compile, and these APIs
+ * will keep changing as the related concepts get implemented.
+ */
+struct OpDesc;
+struct OpAttrs {};
+class Operator {
+ public:
+  Operator(const OpDesc &def) {}
+  void InferShape() {}
+  void Run(DeviceContext *ctx) {}
+};
+/**
+ * @brief Network that manages the operators it has.
+ *
+ * Network is the container and controller of a set of operators. Users can
+ * build a real network from a NetDesc, which is a protobuf message, and use
+ * Network.Run() to run all the operators in the network.
+ * A network object knows all Operators belonging to this network. Variables,
+ * which are inputs and outputs of these operators, are created and managed by a
+ * hierarchy of Scope objects.
+ *
+ * This is the base class of network, all the networks should implement the apis
+ * it defines.
+ */
+class Net {
+ public:
+  /**
+   * @brief Infer shapes of all inputs and outputs of operators.
+   */
+  virtual void InferShape(Scope *scope) = 0;
+  /**
+   * @brief Run the network.
+   *
+   * Run all the operators, with all the variables located in `scope`. `ctx`
+   * describes the execution environment for the operators. The method returns
+   * nothing and takes no operator range: there is no way to select a subset
+   * of the operators through this interface.
+   */
+  virtual void Run(std::shared_ptr<Scope> scope, DeviceContext *ctx) = 0;
+  /**
+   * @brief Add an Operator according to `def`.
+   */
+  virtual OpIndex AddOp(const OpProto &def) = 0;
+  /**
+   * @brief Add optimizer operators according to `attrs`.
+   */
+  virtual void AddOptimizerOps(const OpAttrs &attrs) = 0;
+  /**
+   * @brief Add backward operators.
+   */
+  virtual void AddBackwardOps() = 0;
+  /**
+   * @brief Create a network.
+   */
+  static std::unique_ptr<Net> Create(const NetDesc &def = NetDesc());
+  virtual ~Net() {}
+};
+/**
+ * @brief a basic implementation of Net.
+ *
+ * PlainNet is a very simple Net, it create a list of operators, and run them
+ * sequentially following the order they added.
+ */
+class PlainNet : public Net {
+ public:
+  /**
+   * @brief Initialize a PlainNet.
+   *
+   * Initialize from a network described by `def`. NetDesc is the definition
+   * of a network.
+   */
+  PlainNet(const NetDesc &def);
+  /**
+   * Infer the shapes of all the operators' input and output variables. Will be
+   * called before every mini-batch.
+   */
+  virtual void InferShape(Scope *scope) override;
+  /**
+   * @brief Run the network.
+   *
+   * Run all the operators with `scope` and the device context `ctx`.
+   * NOTE(review): both arguments are required by this signature; earlier text
+   * promised a default scope and a default context, which the declaration does
+   * not provide -- confirm the intended behavior.
+   */
+  virtual void Run(std::shared_ptr<Scope> scope, DeviceContext *ctx) override;
+  /**
+   * @brief Add an operator to this network.
+   */
+  virtual OpIndex AddOp(const OpProto &def) override;
+  /**
+   * @brief Add all optimizer operators related into the network.
+   */
+  virtual void AddOptimizerOps(const OpAttrs &attrs) override;
+  /**
+   * @brief Add all backward operators related into the network.
+   */
+  virtual void AddBackwardOps() override;
+  virtual ~PlainNet() override {}
+ protected:
+  /**
+   * @brief Build the network.
+   *
+   * Create operators according to `def`; meant to be called by the
+   * constructor.
+   */
+  void BuildNet(const NetDesc &def);
+  /**
+   * @brief Add an operator into this network.
+   *
+   * Add an operator which is identified as `type` and has attributes described
+   * in `attrs`, the `inputs` are the keys of readonly input variables,
+   * `outputs` are keys of mutable output variables. An `OpIndex` will be
+   * returned to indicate the offset of the new operator in `ops_`.
+   */
+  OpIndex AddOp(const std::string &type, const std::vector<std::string> &inputs,
+                const std::vector<std::string> &outputs,
+                const OpAttrs &attrs = OpAttrs());
+ private:
+  // the operators owned by `Network`.
+  std::vector<Operator> ops_;
+};
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/net_proto.proto
+++ b/paddle/framework/net_proto.proto
+syntax="proto2";
+package paddle.framework;
+import "op_proto.proto";
+// Definition of a network.
+message NetDesc {
+  // Identifier of the network.
+  optional string name = 1;
+  // Operators contained in the network.
+  repeated OpProto operators = 2;
+  // Type of network to run with, e.g. "plainNet", "DAG".
+  optional string net_type = 3;
+  // Number of workers. NOTE(review): the original comment read "num worker
+  // always"; how this value is consumed is not visible here -- confirm.
+  optional int32 num_workers = 4;
+}
--- a/paddle/framework/net_test.cc
+++ b/paddle/framework/net_test.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#include "paddle/framework/net.h"
+#include "paddle/framework/op_registry.h"
+#include <gtest/gtest.h>
+namespace paddle {
+namespace framework {
+// Placeholder operator type for the Net tests; it adds nothing to Operator.
+class FakeFC : public Operator {};
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/op_registry.cc
+++ b/paddle/framework/op_registry.cc
+#include <paddle/framework/op_registry.h>
+namespace paddle {
+namespace framework {
+// Explicit specializations of AttrTypeHelper::SetAttrType: each one tags the
+// AttrProto with the AttrType enumerator that matches the C++ type.
+template <>
+void AttrTypeHelper::SetAttrType<int>(AttrProto* attr) {
+  attr->set_type(AttrType::INT);
+}
+template <>
+void AttrTypeHelper::SetAttrType<float>(AttrProto* attr) {
+  attr->set_type(AttrType::FLOAT);
+}
+template <>
+void AttrTypeHelper::SetAttrType<std::string>(AttrProto* attr) {
+  attr->set_type(AttrType::STRING);
+}
+template <>
+void AttrTypeHelper::SetAttrType<std::vector<int>>(AttrProto* attr) {
+  attr->set_type(AttrType::INTS);
+}
+template <>
+void AttrTypeHelper::SetAttrType<std::vector<float>>(AttrProto* attr) {
+  attr->set_type(AttrType::FLOATS);
+}
+template <>
+void AttrTypeHelper::SetAttrType<std::vector<std::string>>(AttrProto* attr) {
+  attr->set_type(AttrType::STRINGS);
+}
+}  // namespace framework
+}  // namespace paddle
\ No newline at end of file
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
 #pragma once
+#include <algorithm>
 #include "paddle/framework/attr_checker.h"
-//#include "paddle/framework/op_base.h"
 #include "paddle/framework/op_desc.pb.h"
 #include "paddle/framework/op_proto.pb.h"
+#include "paddle/framework/operator.h"
 namespace paddle {
 namespace framework {
-//==================For test================//
-class OpBase {
- public:
-  std::vector<std::string> inputs_;
-  std::vector<std::string> outputs_;
-  AttributeMap attr_map_;
-  virtual std::string Run() const = 0;
-  virtual ~OpBase() {}
-};
-//=========================================//
 // helper class to set attribute type
 struct AttrTypeHelper {
  template <typename T>
@@ -64,36 +52,6 @@ struct AttrTypeHelper {
  }
 };
-template <>
-void AttrTypeHelper::SetAttrType<int>(AttrProto* attr) {
-  attr->set_type(paddle::framework::AttrType::INT);
-}
-template <>
-void AttrTypeHelper::SetAttrType<float>(AttrProto* attr) {
-  attr->set_type(paddle::framework::AttrType::FLOAT);
-}
-template <>
-void AttrTypeHelper::SetAttrType<std::string>(AttrProto* attr) {
-  attr->set_type(paddle::framework::AttrType::STRING);
-}
-template <>
-void AttrTypeHelper::SetAttrType<std::vector<int>>(AttrProto* attr) {
-  attr->set_type(paddle::framework::AttrType::INTS);
-}
-template <>
-void AttrTypeHelper::SetAttrType<std::vector<float>>(AttrProto* attr) {
-  attr->set_type(paddle::framework::AttrType::FLOATS);
-}
-template <>
-void AttrTypeHelper::SetAttrType<std::vector<std::string>>(AttrProto* attr) {
-  attr->set_type(paddle::framework::AttrType::STRINGS);
-}
 // this class not only make proto but also init attribute checkers.
 class OpProtoAndCheckerMaker {
 public:
@@ -103,22 +61,22 @@ class OpProtoAndCheckerMaker {
 protected:
  void AddInput(const std::string& name, const std::string& comment) {
    auto input = proto_->mutable_inputs()->Add();
-    *(input->mutable_name()) = name;
+    *input->mutable_name() = name;
-    *(input->mutable_comment()) = comment;
+    *input->mutable_comment() = comment;
  }
  void AddOutput(const std::string& name, const std::string& comment) {
    auto output = proto_->mutable_outputs()->Add();
-    *(output->mutable_name()) = name;
+    *output->mutable_name() = name;
-    *(output->mutable_comment()) = comment;
+    *output->mutable_comment() = comment;
  }
  template <typename T>
  TypedAttrChecker<T>& AddAttr(const std::string& name,
                               const std::string& comment) {
    auto attr = proto_->mutable_attrs()->Add();
-    *(attr->mutable_name()) = name;
+    *attr->mutable_name() = name;
-    *(attr->mutable_comment()) = comment;
+    *attr->mutable_comment() = comment;
    AttrTypeHelper::SetAttrType<T>(attr);
    return op_checker_->AddAttrChecker<T>(name);
  }
@@ -134,50 +92,53 @@ class OpProtoAndCheckerMaker {
 };
 class OpRegistry {
-  typedef std::function<OpBase*()> OpCreator;
+  using OpCreator = std::function<OperatorBase*()>;
 public:
  template <typename OpType, typename ProtoMakerType>
  static void RegisterOp(const std::string& op_type) {
-    creators_[op_type] = []() { return new OpType; };
+    creators()[op_type] = [] { return new OpType; };
-    OpProto& op_proto = protos_[op_type];
+    OpProto& op_proto = protos()[op_type];
-    OpAttrChecker& op_checker = op_checkers_[op_type];
+    OpAttrChecker& op_checker = op_checkers()[op_type];
    ProtoMakerType(&op_proto, &op_checker);
-    PADDLE_ENFORCE(op_proto.IsInitialized() == true,
+    PADDLE_ENFORCE(op_proto.IsInitialized(),
                   "Fail to initialize %s's OpProto !", op_type);
  }
-  static OpBase* CreateOp(const OpDesc& op_desc) {
+  static OperatorBase* CreateOp(const OpDesc& op_desc) {
    std::string op_type = op_desc.type();
-    OpBase* op = (creators_.at(op_type))();
+    OperatorBase* op = creators().at(op_type)();
-    (op->inputs_).resize(op_desc.inputs_size());
+    op->desc_ = op_desc;
-    for (int i = 0; i < op_desc.inputs_size(); ++i) {
+    op->inputs_.reserve((size_t)op_desc.inputs_size());
-      (op->inputs_)[i] = op_desc.inputs(i);
+    std::copy(op_desc.inputs().begin(), op_desc.inputs().end(),
-    }
+              std::back_inserter(op->inputs_));
-    (op->outputs_).resize(op_desc.outputs_size());
+    op->outputs_.reserve((size_t)op_desc.outputs_size());
-    for (int i = 0; i < op_desc.outputs_size(); ++i) {
+    std::copy(op_desc.outputs().begin(), op_desc.outputs().end(),
-      (op->outputs_)[i] = op_desc.outputs(i);
+              std::back_inserter(op->outputs_));
-    }
+    for (auto& attr : op_desc.attrs()) {
-    for (int i = 0; i < op_desc.attrs_size(); ++i) {
+      op->attrs_[attr.name()] = AttrTypeHelper::GetAttrValue(attr);
-      const AttrDesc& ith_attr = op_desc.attrs(i);
+    }
-      std::string name = ith_attr.name();
+    op_checkers().at(op_type).Check(op->attrs_);
-      (op->attr_map_)[name] = AttrTypeHelper::GetAttrValue(ith_attr);
-    }
-    const OpAttrChecker& op_checker = op_checkers_.at(op_type);
-    op_checker.Check(op->attr_map_);
    return op;
  }
 private:
+  static std::unordered_map<std::string, OpCreator>& creators() {
    static std::unordered_map<std::string, OpCreator> creators_;
+    return creators_;
+  }
+  static std::unordered_map<std::string, OpProto>& protos() {
    static std::unordered_map<std::string, OpProto> protos_;
+    return protos_;
+  };
+  static std::unordered_map<std::string, OpAttrChecker>& op_checkers() {
    static std::unordered_map<std::string, OpAttrChecker> op_checkers_;
+    return op_checkers_;
+  };
 };
-std::unordered_map<std::string, std::function<OpBase*()>> OpRegistry::creators_;
-std::unordered_map<std::string, OpProto> OpRegistry::protos_;
-std::unordered_map<std::string, OpAttrChecker> OpRegistry::op_checkers_;
 template <typename OpType, typename ProtoMakerType>
 class OpRegisterHelper {
 public:
@@ -194,60 +155,5 @@ class OpRegisterHelper {
  const OpRegisterHelper<__op_class, __op_maker_class>               \
      __op_class##Register::reg(#__op_type);
-// Demos
-class CosineOp : public OpBase {
- public:
-  virtual std::string Run() const {
-    std::string msg = "CosineOp runs! scale = " +
-                      std::to_string(boost::get<float>(attr_map_.at("scale")));
-    return msg;
-  }
-};
-class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
- public:
-  CosineOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("input", "input of cosine op");
-    AddOutput("output", "output of cosine op");
-    AddAttr<float>("scale", "scale of cosine op")
-        .SetDefault(1.0)
-        .LargerThan(0.0);
-    AddType("cos");
-    AddComment("This is cos op");
-  }
-};
-REGISTER_OP(CosineOp, CosineOpProtoAndCheckerMaker, cos_sim)
-class MyTestOp : public OpBase {
- public:
-  virtual std::string Run() const {
-    std::string msg =
-        "MyTestOp runs! test_attr = " +
-        std::to_string(boost::get<int>(attr_map_.at("test_attr")));
-    return msg;
-  }
-};
-class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
- public:
-  MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("input", "input of cosine op");
-    AddOutput("output", "output of cosine op");
-    auto my_checker = [](int i) {
-      PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!");
-    };
-    AddAttr<int>("test_attr", "a simple test attribute")
-        .AddCustomChecker(my_checker);
-    AddType("my_test_op");
-    AddComment("This is my_test op");
-  }
-};
-REGISTER_OP(MyTestOp, MyTestOpProtoAndCheckerMaker, my_test_op)
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
 #include "paddle/framework/op_registry.h"
 #include <gtest/gtest.h>
+#include "paddle/framework/operator.h"
+#include "paddle/operators/demo_op.h"
+using namespace paddle::framework;
+namespace paddle {
+namespace framework {
+// Test operator whose Run only prints the operator's debug string.
+class CosineOp : public OperatorWithKernel {
+ public:
+  void Run(const OpRunContext* context) const override {
+    const std::string description = DebugString();
+    printf("%s\n", description.c_str());
+  }
+};
+// Fills in the proto and attribute checker of the cosine test operator: one
+// input, one output and a float attribute "scale" with default 1.0 that is
+// checked with LargerThan(0.0).
+class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+ public:
+  CosineOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("input", "input of cosine op");
+    AddOutput("output", "output of cosine op");
+    TypedAttrChecker<float>& scale_checker =
+        AddAttr<float>("scale", "scale of cosine op");
+    scale_checker.SetDefault(1.0).LargerThan(0.0);
+    AddType("cos");
+    AddComment("This is cos op");
+  }
+};
+REGISTER_OP(CosineOp, CosineOpProtoAndCheckerMaker, cos_sim)
+// Test operator whose Run prints the debug string and then the integer
+// attribute "test_attr".
+class MyTestOp : public OperatorWithKernel {
+ public:
+  void Run(const OpRunContext* ctx) const override {
+    const std::string description = DebugString();
+    printf("%s\n", description.c_str());
+    // Read the attribute only after the first line is printed, as before.
+    const int test_attr = ctx->op_->GetAttr<int>("test_attr");
+    printf("test_attr = %d\n", test_attr);
+  }
+};
+// Fills in the proto and attribute checker of MyTestOp: one input, one output
+// and an int attribute "test_attr" guarded by a custom checker that enforces
+// an even value.
+class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+ public:
+  MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("input", "input of cosine op");
+    AddOutput("output", "output of cosine op");
+    auto require_even = [](int value) {
+      PADDLE_ENFORCE(value % 2 == 0, "'test_attr' must be even!");
+    };
+    TypedAttrChecker<int>& attr_checker =
+        AddAttr<int>("test_attr", "a simple test attribute");
+    attr_checker.AddCustomChecker(require_even);
+    AddType("my_test_op");
+    AddComment("This is my_test op");
+  }
+};
+REGISTER_OP(MyTestOp, MyTestOpProtoAndCheckerMaker, my_test_op)
+}  // namespace framework
+}  // namespace paddle
 TEST(OpRegistry, CreateOp) {
  paddle::framework::OpDesc op_desc;
@@ -7,19 +64,19 @@ TEST(OpRegistry, CreateOp) {
  op_desc.add_inputs("aa");
  op_desc.add_outputs("bb");
+  float scale = 3.3;
  auto attr = op_desc.mutable_attrs()->Add();
  attr->set_name("scale");
  attr->set_type(paddle::framework::AttrType::FLOAT);
-  attr->set_f(3.3);
+  attr->set_f(scale);
-  paddle::framework::OpBase* op =
+  paddle::framework::OperatorBase* op =
      paddle::framework::OpRegistry::CreateOp(op_desc);
-  std::string debug_str = op->Run();
+  auto scope = std::make_shared<Scope>();
-  std::string str = "CosineOp runs! scale = " + std::to_string(3.3);
+  auto dev_ctx = DeviceContext();
-  ASSERT_EQ(str.size(), debug_str.size());
+  op->Run(scope, &dev_ctx);
-  for (size_t i = 0; i < debug_str.length(); ++i) {
+  float scale_get = op->GetAttr<float>("scale");
-    ASSERT_EQ(debug_str[i], str[i]);
+  ASSERT_EQ(scale_get, scale);
-  }
 }
 TEST(OpRegistry, IllegalAttr) {
@@ -35,7 +92,7 @@ TEST(OpRegistry, IllegalAttr) {
  bool caught = false;
  try {
-    paddle::framework::OpBase* op __attribute__((unused)) =
+    paddle::framework::OperatorBase* op __attribute__((unused)) =
        paddle::framework::OpRegistry::CreateOp(op_desc);
  } catch (paddle::framework::EnforceNotMet err) {
    caught = true;
@@ -54,15 +111,14 @@ TEST(OpRegistry, DefaultValue) {
  op_desc.add_inputs("aa");
  op_desc.add_outputs("bb");
-  paddle::framework::OpBase* op =
+  ASSERT_TRUE(op_desc.IsInitialized());
+  paddle::framework::OperatorBase* op =
      paddle::framework::OpRegistry::CreateOp(op_desc);
-  std::string debug_str = op->Run();
+  auto scope = std::make_shared<Scope>();
-  float default_value = 1.0;
+  auto dev_ctx = DeviceContext();
-  std::string str = "CosineOp runs! scale = " + std::to_string(default_value);
+  op->Run(scope, &dev_ctx);
-  ASSERT_EQ(str.size(), debug_str.size());
+  ASSERT_EQ(op->GetAttr<float>("scale"), 1.0);
-  for (size_t i = 0; i < debug_str.length(); ++i) {
-    ASSERT_EQ(debug_str[i], str[i]);
-  }
 }
 TEST(OpRegistry, CustomChecker) {
@@ -74,7 +130,7 @@ TEST(OpRegistry, CustomChecker) {
  // attr 'test_attr' is not set
  bool caught = false;
  try {
-    paddle::framework::OpBase* op __attribute__((unused)) =
+    paddle::framework::OperatorBase* op __attribute__((unused)) =
        paddle::framework::OpRegistry::CreateOp(op_desc);
  } catch (paddle::framework::EnforceNotMet err) {
    caught = true;
@@ -93,7 +149,7 @@ TEST(OpRegistry, CustomChecker) {
  attr->set_i(3);
  caught = false;
  try {
-    paddle::framework::OpBase* op __attribute__((unused)) =
+    paddle::framework::OperatorBase* op __attribute__((unused)) =
        paddle::framework::OpRegistry::CreateOp(op_desc);
  } catch (paddle::framework::EnforceNotMet err) {
    caught = true;
@@ -111,12 +167,16 @@ TEST(OpRegistry, CustomChecker) {
  attr->set_name("test_attr");
  attr->set_type(paddle::framework::AttrType::INT);
  attr->set_i(4);
-  paddle::framework::OpBase* op =
+  paddle::framework::OperatorBase* op =
      paddle::framework::OpRegistry::CreateOp(op_desc);
-  std::string debug_str = op->Run();
+  auto dev_ctx = DeviceContext();
-  std::string str = "MyTestOp runs! test_attr = " + std::to_string(4);
+  auto scope = std::make_shared<Scope>();
-  ASSERT_EQ(str.size(), debug_str.size());
+  op->Run(scope, &dev_ctx);
-  for (size_t i = 0; i < debug_str.length(); ++i) {
+  int test_attr = op->GetAttr<int>("test_attr");
-    ASSERT_EQ(debug_str[i], str[i]);
+  ASSERT_EQ(test_attr, 4);
-  }
+}
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
 }
\ No newline at end of file
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/framework/operator.h"
+namespace paddle {
+namespace framework {
+/// Builds a multi-line summary of this operator: a separator line, the type
+/// taken from `desc_`, then the input names, the output names and the
+/// attribute keys. Every list entry is followed by ", ".
+std::string OperatorBase::DebugString() const {
+  std::stringstream out;
+  out << "=================\n"
+      << "type = " << desc_.type() << "\n"
+      << "inputs = [";
+  for (const auto& name : inputs_) {
+    out << name << ", ";
+  }
+  out << "]\n"
+      << "outputs = [";
+  for (const auto& name : outputs_) {
+    out << name << ", ";
+  }
+  out << "]\n"
+      << "attr_keys = [";
+  for (const auto& entry : attrs_) {
+    out << entry.first << ", ";
+  }
+  out << "]\n";
+  return out.str();
+}
+/// Looks up, in the scope, the variable named by the `index`-th entry of the
+/// operator's `inputs_`. `at()` is used so that an out-of-range index throws
+/// std::out_of_range instead of reading outside the vector.
+const Variable* OpRunContext::Input(int index) const {
+  return scope_->GetVariable(op_->inputs_.at(index));
+}
+/// Looks up, in the scope, the variable named by the `index`-th entry of the
+/// operator's `outputs_`. `at()` is used so that an out-of-range index throws
+/// std::out_of_range instead of reading outside the vector.
+Variable* OpRunContext::Output(int index) const {
+  return scope_->GetVariable(op_->outputs_.at(index));
+}
+}  // namespace framework
+}  // namespace paddle
\ No newline at end of file
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <boost/variant.hpp>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "paddle/framework/attr_checker.h"
+#include "paddle/framework/op_desc.pb.h"
+#include "paddle/framework/scope.h"
+#include "paddle/utils/Error.h"
+namespace paddle {
+namespace framework {
+class OperatorBase;
+// Empty device-context type; it currently has no members.
+class DeviceContext {};
+/**
+ * OpRunContext is the only parameter of Operator's Run function.
+ * Run will get input/output variables, state such as momentum and
+ * device resource such as CUDA stream, cublas handle, etc. from
+ * OpRunContext. User should construct it before run the Operator.
+ */
+class OpRunContext {
+ public:
+  /// Stores the operator, the scope and the device context it is given.
+  OpRunContext(const OperatorBase* op, const std::shared_ptr<Scope> scope,
+               const DeviceContext* device_context)
+      : op_(op),
+        scope_(scope),
+        device_context_(device_context) {}
+  /// Variable for the operator's input at position `index`.
+  const Variable* Input(int index) const;
+  /// Variable for the operator's output at position `index`.
+  Variable* Output(int index) const;
+  // The members below are public and read directly by operators.
+  const OperatorBase* op_;
+  const std::shared_ptr<Scope> scope_;
+  const DeviceContext* device_context_;
+};
+/**
+ * OperatorBase has the basic element that Net will call to do computation.
+ * Only CreateOperator from OpRegistry will new Operator directly. User
+ * should always construct a proto message OpDesc and call
+ * OpRegistry::CreateOp(op_desc) to get an Operator instance.
+ */
+class OperatorBase {
+ public:
+  virtual ~OperatorBase() {}
+  /// Returns the attribute stored under `name`, read as type `T`. Enforces
+  /// that `name` is present in `attrs_`.
+  template <typename T>
+  inline const T& GetAttr(const std::string& name) const {
+    auto found = attrs_.find(name);
+    PADDLE_ENFORCE(found != attrs_.end(), "%s should be in AttributeMap",
+                   name);
+    return boost::get<T>(found->second);
+  }
+  /// Human-readable summary of this operator.
+  std::string DebugString() const;
+  /// InferShape infers the size of the Variables used by this Operator from
+  /// the information inside `scope`.
+  virtual void InferShape(const std::shared_ptr<Scope>& scope) const = 0;
+  /// Net will call this function to run an op.
+  virtual void Run(const std::shared_ptr<Scope>& scope,
+                   const DeviceContext* dev_ctx) const = 0;
+  // Public state, filled in by OpRegistry::CreateOp.
+  OpDesc desc_;
+  std::vector<std::string> inputs_;
+  std::vector<std::string> outputs_;
+  AttributeMap attrs_;
+};
+/// Operator base class that adapts the (scope, device context) entry point of
+/// OperatorBase to a single OpRunContext argument.
+class OperatorWithKernel : public OperatorBase {
+ public:
+  virtual ~OperatorWithKernel() {}
+  /// Default shape inference: does nothing.
+  void InferShape(const std::shared_ptr<Scope>& scope) const override {}
+  /// Wraps `this`, `scope` and `dev_ctx` in an OpRunContext and forwards to
+  /// Run(const OpRunContext*).
+  void Run(const std::shared_ptr<Scope>& scope,
+           const DeviceContext* dev_ctx) const override {
+    OpRunContext op_ctx(this, scope, dev_ctx);
+    Run(&op_ctx);
+  }
+  /// When implementing an Op, you should implement this function.
+  /// This function should be moved to OpKernel later.
+  virtual void Run(const OpRunContext* context) const = 0;
+};
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/framework/operator.h"
+#include "gtest/gtest.h"
+#include "paddle/framework/op_registry.h"
+namespace paddle {
+namespace framework {
+// Test operator: checks that its first input and output variables do not
+// exist in the scope yet, creates "output1" in the scope, then prints the
+// "scale" attribute and the debug string.
+class OperatorTest : public OperatorWithKernel {
+ public:
+  void Run(const OpRunContext* ctx) const override {
+    float scale = ctx->op_->GetAttr<float>("scale");
+    PADDLE_ENFORCE(ctx->Input(0) == nullptr, "Input(0) should not initialized");
+    // The message names the slot that is actually checked: Output(0).
+    PADDLE_ENFORCE(ctx->Output(0) == nullptr,
+                   "Output(0) should not initialized");
+    auto output1 = ctx->scope_->CreateVariable("output1");
+    PADDLE_ENFORCE(output1 != nullptr, "should create output1 from scope");
+    printf("get attr %s = %f\n", "scale", scale);
+    printf("%s\n", DebugString().c_str());
+  }
+};
+// Fills in the proto and attribute checker of OperatorTest: one input, one
+// output and a float attribute "scale" with default 1.0 that is checked with
+// LargerThan(0.0).
+class OperatorTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+ public:
+  OperatorTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("input", "input of test op");
+    AddOutput("output", "output of test op");
+    TypedAttrChecker<float>& scale_checker =
+        AddAttr<float>("scale", "scale of cosine op");
+    scale_checker.SetDefault(1.0).LargerThan(0.0);
+    AddType("test_operator");
+    AddComment("This is test op");
+  }
+};
+REGISTER_OP(OperatorTest, OperatorTestProtoAndCheckerMaker, test_operator)
+// Creates a "test_operator" through the registry, checks that its inputs,
+// outputs and "scale" attribute match the OpDesc, then runs it.
+TEST(OperatorBase, DebugString) {
+  OpDesc op_desc;
+  op_desc.set_type("test_operator");
+  std::vector<std::string> inputs = {"IN1", "IN2"};
+  for (auto& input : inputs) {
+    op_desc.add_inputs(input);
+  }
+  std::vector<std::string> outputs = {"OUT1", "OUT2"};
+  for (auto& output : outputs) {
+    op_desc.add_outputs(output);
+  }
+  auto attr = op_desc.mutable_attrs()->Add();
+  attr->set_name("scale");
+  attr->set_type(paddle::framework::AttrType::FLOAT);
+  float scale = 3.14;
+  attr->set_f(scale);
+  DeviceContext device_context;
+  auto scope = std::make_shared<Scope>();
+  // CreateOp returns a raw owning pointer; hold it in a unique_ptr so the
+  // operator is released even when an ASSERT_* returns early.
+  std::unique_ptr<OperatorBase> op(
+      paddle::framework::OpRegistry::CreateOp(op_desc));
+  ASSERT_EQ(op->inputs_, inputs);
+  ASSERT_EQ(op->outputs_, outputs);
+  ASSERT_EQ(op->GetAttr<float>("scale"), scale);
+  op->Run(scope, &device_context);
+}
+}  // namespace framework
+}  // namespace paddle
\ No newline at end of file
--- a/paddle/operators/.clang-format
+++ b/paddle/operators/.clang-format
+---
+Language:        Cpp
+BasedOnStyle:  Google
+Standard:  Cpp11 
+...
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
--- a/paddle/operators/demo_op.h
+++ b/paddle/operators/demo_op.h
+#pragma once
+#include "paddle/framework/op_registry.h"
+using namespace paddle::framework;
+namespace paddle {
+namespace operators {
+// Demo operator whose Run only prints the operator's debug string.
+class CosineOp : public OperatorWithKernel {
+ public:
+  void Run(const OpRunContext *context) const override {
+    const std::string description = DebugString();
+    printf("%s\n", description.c_str());
+  }
+};
+// Fills in the proto and attribute checker of the demo cosine operator: one
+// input, one output and a float attribute "scale" with default 1.0 that is
+// checked with LargerThan(0.0).
+class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+ public:
+  CosineOpProtoAndCheckerMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("input", "input of cosine op");
+    AddOutput("output", "output of cosine op");
+    TypedAttrChecker<float> &scale_checker =
+        AddAttr<float>("scale", "scale of cosine op");
+    scale_checker.SetDefault(1.0).LargerThan(0.0);
+    AddType("cos");
+    AddComment("This is cos op");
+  }
+};
+REGISTER_OP(CosineOp, CosineOpProtoAndCheckerMaker, cos_sim)
+// Demo operator whose Run only prints the operator's debug string.
+class MyTestOp : public OperatorWithKernel {
+ public:
+  void Run(const OpRunContext *context) const override {
+    const std::string description = DebugString();
+    printf("%s\n", description.c_str());
+  }
+};
+// Fills in the proto and attribute checker of the demo MyTestOp: one input,
+// one output and an int attribute "test_attr" guarded by a custom checker
+// that enforces an even value.
+class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+ public:
+  MyTestOpProtoAndCheckerMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("input", "input of cosine op");
+    AddOutput("output", "output of cosine op");
+    auto require_even = [](int value) {
+      PADDLE_ENFORCE(value % 2 == 0, "'test_attr' must be even!");
+    };
+    TypedAttrChecker<int> &attr_checker =
+        AddAttr<int>("test_attr", "a simple test attribute");
+    attr_checker.AddCustomChecker(require_even);
+    AddType("my_test_op");
+    AddComment("This is my_test op");
+  }
+};
+REGISTER_OP(MyTestOp, MyTestOpProtoAndCheckerMaker, my_test_op)
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -4,3 +4,5 @@ nv_test(cuda_test SRCS cuda_test.cu)
 cc_library(place SRCS place.cc)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
+nv_test(device_context_test SRCS device_context_test.cc DEPS dynamic_loader place eigen3 glog gflags)
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/framework/enforce.h"
+#ifndef PADDLE_ONLY_CPU
+#include "paddle/platform/cuda.h"
+#include "paddle/platform/dynload/cublas.h"
+#include "paddle/platform/dynload/cudnn.h"
+#include "paddle/platform/dynload/curand.h"
+#define EIGEN_USE_GPU
+#endif
+#include "paddle/platform/place.h"
+#include "unsupported/Eigen/CXX11/Tensor"
+namespace paddle {
+namespace platform {
+// Polymorphic base of the device contexts declared in this header.
+class DeviceContext {
+ public:
+  virtual ~DeviceContext() = default;
+};
+// Device context for the CPU; it adds no members to DeviceContext.
+class CPUDeviceContext : public DeviceContext {};
+#ifndef PADDLE_ONLY_CPU
+// Scoped device switch: the constructor records the current device id and
+// selects `new_place` when it differs; the destructor selects the recorded
+// device again.
+class GPUPlaceGuard {
+ public:
+  explicit GPUPlaceGuard(GPUPlace new_place) : previous_(GetCurrentDeviceId()) {
+    const bool needs_switch = previous_ != new_place;
+    if (needs_switch) {
+      SetDeviceId(new_place.device);
+    }
+  }
+  ~GPUPlaceGuard() { SetDeviceId(previous_.device); }
+ private:
+  // Device that was current when the guard was constructed.
+  GPUPlace previous_;
+};
+// Device context for one GPU. It owns a CUDA stream and the Eigen device
+// built on it, and lazily creates cuBLAS, cuDNN and cuRAND handles that are
+// bound to that stream.
+class CUDADeviceContext : public DeviceContext {
+ public:
+  // Creates the stream on `gpu_place` and wraps it in an Eigen GPU device.
+  explicit CUDADeviceContext(const GPUPlace gpu_place) : gpu_place_(gpu_place) {
+    GPUPlaceGuard guard(gpu_place_);
+    paddle::platform::throw_on_error(cudaStreamCreate(&stream_),
+                                     "cudaStreamCreate failed");
+    eigen_stream_ = new Eigen::CudaStreamDevice(&stream_);
+    eigen_device_ = new Eigen::GpuDevice(eigen_stream_);
+  }
+  // Blocks until the work queued on the stream has finished.
+  void Wait() {
+    paddle::platform::throw_on_error(cudaStreamSynchronize(stream_),
+                                     "cudaStreamSynchronize failed");
+  }
+  cudaStream_t stream() { return stream_; }
+  // Returns a copy of the Eigen device.
+  Eigen::GpuDevice eigen_device() { return *eigen_device_; }
+  // Creates the cuBLAS handle on first use and binds it to the stream.
+  cublasHandle_t cublas_handle() {
+    if (!blas_handle_) {
+      GPUPlaceGuard guard(gpu_place_);
+      PADDLE_ENFORCE(paddle::platform::dynload::cublasCreate(&blas_handle_) ==
+                         CUBLAS_STATUS_SUCCESS,
+                     "cublasCreate failed");
+      PADDLE_ENFORCE(paddle::platform::dynload::cublasSetStream(
+                         blas_handle_, stream_) == CUBLAS_STATUS_SUCCESS,
+                     "cublasSetStream failed");
+    }
+    return blas_handle_;
+  }
+  // Creates the cuDNN handle on first use and binds it to the stream.
+  cudnnHandle_t cudnn_handle() {
+    if (!dnn_handle_) {
+      GPUPlaceGuard guard(gpu_place_);
+      PADDLE_ENFORCE(paddle::platform::dynload::cudnnCreate(&dnn_handle_) ==
+                         CUDNN_STATUS_SUCCESS,
+                     "cudnnCreate failed");
+      PADDLE_ENFORCE(paddle::platform::dynload::cudnnSetStream(
+                         dnn_handle_, stream_) == CUDNN_STATUS_SUCCESS,
+                     "cudnnSetStream failed");
+    }
+    return dnn_handle_;
+  }
+  // Creates the cuRAND generator on first use, seeds it with `random_seed_`
+  // and binds it to the stream.
+  curandGenerator_t curand_generator() {
+    if (!rand_generator_) {
+      GPUPlaceGuard guard(gpu_place_);
+      PADDLE_ENFORCE(paddle::platform::dynload::curandCreateGenerator(
+                         &rand_generator_, CURAND_RNG_PSEUDO_DEFAULT) ==
+                         CURAND_STATUS_SUCCESS,
+                     "curandCreateGenerator failed");
+      PADDLE_ENFORCE(
+          paddle::platform::dynload::curandSetPseudoRandomGeneratorSeed(
+              rand_generator_, random_seed_) == CURAND_STATUS_SUCCESS,
+          "curandSetPseudoRandomGeneratorSeed failed");
+      PADDLE_ENFORCE(paddle::platform::dynload::curandSetStream(
+                         rand_generator_, stream_) == CURAND_STATUS_SUCCESS,
+                     "curandSetStream failed");
+    }
+    return rand_generator_;
+  }
+  // Waits for the stream, then releases whichever handles were created, the
+  // Eigen wrappers and finally the stream itself.
+  // NOTE(review): the checks below appear to throw on failure; an exception
+  // leaving a destructor terminates the program -- confirm this is intended.
+  ~CUDADeviceContext() {
+    Wait();
+    if (blas_handle_) {
+      PADDLE_ENFORCE(paddle::platform::dynload::cublasDestroy(blas_handle_) ==
+                         CUBLAS_STATUS_SUCCESS,
+                     "cublasDestroy failed");
+    }
+    if (dnn_handle_) {
+      PADDLE_ENFORCE(paddle::platform::dynload::cudnnDestroy(dnn_handle_) ==
+                         CUDNN_STATUS_SUCCESS,
+                     "cudnnDestroy failed");
+    }
+    if (rand_generator_) {
+      PADDLE_ENFORCE(paddle::platform::dynload::curandDestroyGenerator(
+                         rand_generator_) == CURAND_STATUS_SUCCESS,
+                     "curandDestroyGenerator failed");
+    }
+    // The device was constructed from the stream wrapper, so release the
+    // device first and the wrapper it points at second.
+    delete eigen_device_;
+    delete eigen_stream_;
+    paddle::platform::throw_on_error(cudaStreamDestroy(stream_),
+                                     "cudaStreamDestroy failed");
+  }
+ private:
+  GPUPlace gpu_place_;
+  cudaStream_t stream_;
+  Eigen::CudaStreamDevice* eigen_stream_{nullptr};
+  Eigen::GpuDevice* eigen_device_{nullptr};
+  cublasHandle_t blas_handle_{nullptr};
+  cudnnHandle_t dnn_handle_{nullptr};
+  // Nothing in this class assigns the seed, so give it a defined value;
+  // reading it uninitialized in curand_generator() was undefined behaviour.
+  int random_seed_{0};
+  curandGenerator_t rand_generator_{nullptr};
+};
+#endif
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/platform/device_context_test.cc
+++ b/paddle/platform/device_context_test.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/platform/device_context.h"
+#include "gtest/gtest.h"
+// For every visible device, builds a CUDADeviceContext and checks that the
+// Eigen stream and the cuDNN, cuBLAS and cuRAND handles are non-null.
+TEST(CUDADeviceContext, Init) {
+  int count = paddle::platform::GetDeviceCount();
+  for (int i = 0; i < count; i++) {
+    // Automatic storage: the context is destroyed at the end of each
+    // iteration, including when a failing ASSERT_* returns early.
+    paddle::platform::CUDADeviceContext device_context(i);
+    Eigen::GpuDevice gpu_device = device_context.eigen_device();
+    ASSERT_NE(nullptr, gpu_device.stream());
+    cudnnHandle_t cudnn_handle = device_context.cudnn_handle();
+    ASSERT_NE(nullptr, cudnn_handle);
+    cublasHandle_t cublas_handle = device_context.cublas_handle();
+    ASSERT_NE(nullptr, cublas_handle);
+    curandGenerator_t curand_handle = device_context.curand_generator();
+    ASSERT_NE(nullptr, curand_handle);
+  }
+}
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
+cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python)
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <paddle/framework/scope.h>
+#include <pybind11/pybind11.h>
+namespace py = pybind11;
+namespace pd = paddle::framework;
+// Defines the Python module "core": a `Variable` class with int accessors and
+// a `Scope` class with variable lookup and creation.
+PYBIND11_PLUGIN(core) {
+  py::module m("core", "C++ core of Paddle Paddle");
+  py::class_<pd::Variable> variable(m, "Variable", R"DOC(Variable Class.
+All parameter, weight, gradient are variables in Paddle.
+)DOC");
+  variable.def("is_int",
+               [](const pd::Variable& var) { return var.IsType<int>(); });
+  variable.def("set_int", [](pd::Variable& var, int val) -> void {
+    *var.GetMutable<int>() = val;
+  });
+  variable.def("get_int",
+               [](const pd::Variable& var) -> int { return var.Get<int>(); });
+  py::class_<pd::Scope, std::shared_ptr<pd::Scope>> scope(m, "Scope");
+  scope.def(py::init<const std::shared_ptr<pd::Scope>&>());
+  // Both methods return pointers that Python must not take ownership of,
+  // hence the `reference` return policy.
+  scope.def("get_var", &pd::Scope::GetVariable,
+            py::return_value_policy::reference);
+  scope.def("create_var", &pd::Scope::CreateVariable,
+            py::return_value_policy::reference);
+  return m.ptr();
+}
\ No newline at end of file
--- a/paddle/trainer/NewRemoteParameterUpdater.cpp
+++ b/paddle/trainer/NewRemoteParameterUpdater.cpp
@@ -22,7 +22,8 @@ DECLARE_string(save_dir);
 namespace paddle {
 NewRemoteParameterUpdater::NewRemoteParameterUpdater(
    const OptimizationConfig &config, const std::string pserverSpec)
-    : parameterClient_(-1),
+    : trainerConfig_(config),
+      parameterClient_(-1),
      newParameters_(nullptr),
      newGradients_(nullptr),
      pserverSpec_(pserverSpec) {}
@@ -51,7 +52,22 @@ void NewRemoteParameterUpdater::init(
    LOG(INFO) << "paddle_begin_init_params start";
    for (int i = 0; i < parameterSize(); ++i) {
      auto paramConfig = parameters_[i]->getConfig();
-      std::string bytes = paramConfig.SerializeAsString();
+      LOG(INFO) << "old param config: " << paramConfig.DebugString();
+      // FIXME(typhoonzero): convert old paramConfig to optimizerConfig
+      OptimizerConfig optimizeConfigV2;
+      auto sgdConfigV2 = optimizeConfigV2.mutable_sgd();
+      sgdConfigV2->set_momentum(paramConfig.momentum());
+      sgdConfigV2->set_decay(paramConfig.decay_rate());
+      optimizeConfigV2.set_lr_policy(paddle::OptimizerConfig::Const);
+      auto constlr = optimizeConfigV2.mutable_const_lr();
+      constlr->set_learning_rate(paramConfig.learning_rate());
+      if (trainerConfig_.algorithm() == "sgd") {
+        optimizeConfigV2.set_optimizer(paddle::OptimizerConfig::SGD);
+        // FIXME: config all algorithms
+      } else {
+        optimizeConfigV2.set_optimizer(paddle::OptimizerConfig::SGD);
+      }
+      std::string bytes = optimizeConfigV2.SerializeAsString();
      const char *array = bytes.data();
      int size = (int)bytes.size();
      paddle_init_param(
@@ -83,4 +99,4 @@ void NewRemoteParameterUpdater::finishBatch(real cost) {
 void NewRemoteParameterUpdater::startPass() {}
 bool NewRemoteParameterUpdater::finishPass() { return true; }
-}
+}  // namespace paddle
--- a/paddle/trainer/NewRemoteParameterUpdater.h
+++ b/paddle/trainer/NewRemoteParameterUpdater.h
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <functional>
 #include <thread>
+#include "OptimizerConfig.pb.h"
 #include "ParameterUpdater.h"
 #include "libpaddle_pserver_cclient.h"
 #include "paddle/pserver/ParameterClient2.h"
@@ -101,6 +102,7 @@ private:
  }
 protected:
+  /// copy of the trainer's optimization config; stored by value so it cannot
+  /// dangle if the object passed to the constructor is destroyed first
+  OptimizationConfig trainerConfig_;
  /// internal parameter client object for exchanging data with pserver
  paddle_pserver_client parameterClient_;
  /// the parameters for new pserver client

--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -26,10 +26,17 @@ endif(WITH_GOLANG)
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
    ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
+# Copy the built paddle_pybind library into the Python package tree as
+# core.so. ${CMAKE_COMMAND} is used (rather than a bare `cmake` looked up on
+# PATH) so the copy runs with the same CMake that configured the build.
+add_custom_command(OUTPUT ${PROJ_ROOT}/python/paddle/v2/framework/core.so
+        COMMAND ${CMAKE_COMMAND} -E copy $<TARGET_FILE:paddle_pybind> ${PROJ_ROOT}/python/paddle/v2/framework/core.so
+        DEPENDS paddle_pybind
+        VERBATIM)
+# Named target so other rules can depend on the copy having been made.
+add_custom_target(copy_paddle_pybind ALL DEPENDS ${PROJ_ROOT}/python/paddle/v2/framework/core.so)
 add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
    COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp
-    DEPENDS gen_proto_py framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
+    DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
 add_custom_target(paddle_python ALL DEPENDS
    ${OUTPUT_DIR}/.timestamp)

--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -1395,7 +1395,7 @@ def inputs(layers, *args):
    if len(args) != 0:
        layers.extend(args)
-    Inputs(* [l.name for l in layers])
+    Inputs(*[l.name for l in layers])
 def outputs(layers, *args):
@@ -1408,6 +1408,8 @@ def outputs(layers, *args):
    :return:
    """
+    traveled = set()
    def __dfs_travel__(layer,
                       predicate=lambda x: x.layer_type == LayerType.DATA):
        """
@@ -1419,6 +1421,11 @@ def outputs(layers, *args):
        :type layer: LayerOutput
        :return:
        """
+        if layer in traveled:
+            return []
+        else:
+            traveled.add(layer)
        assert isinstance(layer, LayerOutput), "layer is %s" % (layer)
        retv = []
        if layer.parents is not None:
@@ -1438,7 +1445,7 @@ def outputs(layers, *args):
    assert len(layers) > 0
    if HasInputsSet():  # input already set
-        Outputs(* [l.name for l in layers])
+        Outputs(*[l.name for l in layers])
        return  # just return outputs.
    if len(layers) != 1:

--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -6,6 +6,7 @@ img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cos
 test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
 test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
 test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
-test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer)
+test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer
+test_recursive_topology)
 export whole_configs=(test_split_datasource)
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
@@ -131,6 +131,7 @@ input_layer_names: "weight"
 input_layer_names: "multi_class_label"
 output_layer_names: "__cost_0__"
 output_layer_names: "__mse_cost_0__"
+output_layer_names: "__nce_layer_0__"
 evaluators {
  name: "classification_error_evaluator"
  type: "classification_error"
@@ -154,6 +155,7 @@ sub_models {
  input_layer_names: "multi_class_label"
  output_layer_names: "__cost_0__"
  output_layer_names: "__mse_cost_0__"
+  output_layer_names: "__nce_layer_0__"
  evaluator_names: "classification_error_evaluator"
  is_recurrent_layer_group: false
 }

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_recursive_topology.protostr
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 100
+  active_type: ""
+}
+layers {
+  name: "__addto_0__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "data"
+  }
+  inputs {
+    input_layer_name: "data"
+  }
+}
+layers {
+  name: "__addto_1__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_0__"
+  }
+  inputs {
+    input_layer_name: "__addto_0__"
+  }
+}
+layers {
+  name: "__addto_2__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_1__"
+  }
+  inputs {
+    input_layer_name: "__addto_1__"
+  }
+}
+layers {
+  name: "__addto_3__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_2__"
+  }
+  inputs {
+    input_layer_name: "__addto_2__"
+  }
+}
+layers {
+  name: "__addto_4__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_3__"
+  }
+  inputs {
+    input_layer_name: "__addto_3__"
+  }
+}
+layers {
+  name: "__addto_5__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_4__"
+  }
+  inputs {
+    input_layer_name: "__addto_4__"
+  }
+}
+layers {
+  name: "__addto_6__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_5__"
+  }
+  inputs {
+    input_layer_name: "__addto_5__"
+  }
+}
+layers {
+  name: "__addto_7__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_6__"
+  }
+  inputs {
+    input_layer_name: "__addto_6__"
+  }
+}
+layers {
+  name: "__addto_8__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_7__"
+  }
+  inputs {
+    input_layer_name: "__addto_7__"
+  }
+}
+layers {
+  name: "__addto_9__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_8__"
+  }
+  inputs {
+    input_layer_name: "__addto_8__"
+  }
+}
+layers {
+  name: "__addto_10__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_9__"
+  }
+  inputs {
+    input_layer_name: "__addto_9__"
+  }
+}
+layers {
+  name: "__addto_11__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_10__"
+  }
+  inputs {
+    input_layer_name: "__addto_10__"
+  }
+}
+layers {
+  name: "__addto_12__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_11__"
+  }
+  inputs {
+    input_layer_name: "__addto_11__"
+  }
+}
+layers {
+  name: "__addto_13__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_12__"
+  }
+  inputs {
+    input_layer_name: "__addto_12__"
+  }
+}
+layers {
+  name: "__addto_14__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_13__"
+  }
+  inputs {
+    input_layer_name: "__addto_13__"
+  }
+}
+layers {
+  name: "__addto_15__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_14__"
+  }
+  inputs {
+    input_layer_name: "__addto_14__"
+  }
+}
+layers {
+  name: "__addto_16__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_15__"
+  }
+  inputs {
+    input_layer_name: "__addto_15__"
+  }
+}
+layers {
+  name: "__addto_17__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_16__"
+  }
+  inputs {
+    input_layer_name: "__addto_16__"
+  }
+}
+layers {
+  name: "__addto_18__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_17__"
+  }
+  inputs {
+    input_layer_name: "__addto_17__"
+  }
+}
+layers {
+  name: "__addto_19__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_18__"
+  }
+  inputs {
+    input_layer_name: "__addto_18__"
+  }
+}
+layers {
+  name: "__addto_20__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_19__"
+  }
+  inputs {
+    input_layer_name: "__addto_19__"
+  }
+}
+layers {
+  name: "__addto_21__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_20__"
+  }
+  inputs {
+    input_layer_name: "__addto_20__"
+  }
+}
+layers {
+  name: "__addto_22__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_21__"
+  }
+  inputs {
+    input_layer_name: "__addto_21__"
+  }
+}
+layers {
+  name: "__addto_23__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_22__"
+  }
+  inputs {
+    input_layer_name: "__addto_22__"
+  }
+}
+layers {
+  name: "__addto_24__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_23__"
+  }
+  inputs {
+    input_layer_name: "__addto_23__"
+  }
+}
+layers {
+  name: "__addto_25__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_24__"
+  }
+  inputs {
+    input_layer_name: "__addto_24__"
+  }
+}
+layers {
+  name: "__addto_26__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_25__"
+  }
+  inputs {
+    input_layer_name: "__addto_25__"
+  }
+}
+layers {
+  name: "__addto_27__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_26__"
+  }
+  inputs {
+    input_layer_name: "__addto_26__"
+  }
+}
+layers {
+  name: "__addto_28__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_27__"
+  }
+  inputs {
+    input_layer_name: "__addto_27__"
+  }
+}
+layers {
+  name: "__addto_29__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_28__"
+  }
+  inputs {
+    input_layer_name: "__addto_28__"
+  }
+}
+layers {
+  name: "__addto_30__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_29__"
+  }
+  inputs {
+    input_layer_name: "__addto_29__"
+  }
+}
+layers {
+  name: "__addto_31__"
+  type: "addto"
+  size: 100
+  active_type: ""
+  inputs {
+    input_layer_name: "__addto_30__"
+  }
+  inputs {
+    input_layer_name: "__addto_30__"
+  }
+}
+layers {
+  name: "__fc_layer_0__"
+  type: "fc"
+  size: 32
+  active_type: "relu"
+  inputs {
+    input_layer_name: "__addto_31__"
+    input_parameter_name: "___fc_layer_0__.w0"
+  }
+  bias_parameter_name: "___fc_layer_0__.wbias"
+}
+layers {
+  name: "__fc_layer_1__"
+  type: "fc"
+  size: 10
+  active_type: "softmax"
+  inputs {
+    input_layer_name: "__fc_layer_0__"
+    input_parameter_name: "___fc_layer_1__.w0"
+  }
+  bias_parameter_name: "___fc_layer_1__.wbias"
+}
+parameters {
+  name: "___fc_layer_0__.w0"
+  size: 3200
+  initial_mean: 0.0
+  initial_std: 0.1
+  dims: 100
+  dims: 32
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___fc_layer_0__.wbias"
+  size: 32
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 32
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___fc_layer_1__.w0"
+  size: 320
+  initial_mean: 0.0
+  initial_std: 0.176776695297
+  dims: 32
+  dims: 10
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___fc_layer_1__.wbias"
+  size: 10
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 10
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "data"
+output_layer_names: "__fc_layer_1__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "__addto_0__"
+  layer_names: "__addto_1__"
+  layer_names: "__addto_2__"
+  layer_names: "__addto_3__"
+  layer_names: "__addto_4__"
+  layer_names: "__addto_5__"
+  layer_names: "__addto_6__"
+  layer_names: "__addto_7__"
+  layer_names: "__addto_8__"
+  layer_names: "__addto_9__"
+  layer_names: "__addto_10__"
+  layer_names: "__addto_11__"
+  layer_names: "__addto_12__"
+  layer_names: "__addto_13__"
+  layer_names: "__addto_14__"
+  layer_names: "__addto_15__"
+  layer_names: "__addto_16__"
+  layer_names: "__addto_17__"
+  layer_names: "__addto_18__"
+  layer_names: "__addto_19__"
+  layer_names: "__addto_20__"
+  layer_names: "__addto_21__"
+  layer_names: "__addto_22__"
+  layer_names: "__addto_23__"
+  layer_names: "__addto_24__"
+  layer_names: "__addto_25__"
+  layer_names: "__addto_26__"
+  layer_names: "__addto_27__"
+  layer_names: "__addto_28__"
+  layer_names: "__addto_29__"
+  layer_names: "__addto_30__"
+  layer_names: "__addto_31__"
+  layer_names: "__fc_layer_0__"
+  layer_names: "__fc_layer_1__"
+  input_layer_names: "data"
+  output_layer_names: "__fc_layer_1__"
+  is_recurrent_layer_group: false
+}
--- a/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_recursive_topology.py
+from paddle.trainer_config_helpers import *
+settings(batch_size=1000, learning_rate=1e-5)
+din = data_layer(name='data', size=100)
+enc = din
+for i in range(32):
+    enc = addto_layer([enc, enc])
+pred = fc_layer(
+    input=fc_layer(
+        input=enc, size=32, act=ReluActivation()),
+    size=10,
+    act=SoftmaxActivation())
+outputs(pred)
--- a/python/paddle/v2/dataset/wmt14.py
+++ b/python/paddle/v2/dataset/wmt14.py
@@ -32,9 +32,9 @@ MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
 # this is a small set of data for test. The original data is too large and will be add later.
 URL_TRAIN = 'http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz'
 MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c'
-# this is the pretrained model, whose bleu = 26.92
+# BLEU of this trained model is 26.92
 URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz'
-MD5_MODEL = '4ce14a26607fb8a1cc23bcdedb1895e4'
+MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3'
 START = "<s>"
 END = "<e>"

--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
-add_python_test(test_framework test_protobuf.py)
+add_python_test(test_framework test_protobuf.py test_scope.py)
--- a/python/paddle/v2/framework/tests/test_protobuf.py
+++ b/python/paddle/v2/framework/tests/test_protobuf.py
@@ -24,3 +24,7 @@ class TestFrameworkProto(unittest.TestCase):
        attr.type = attr_type_lib.FLOAT
        op_proto.type = "cos"
        self.assertTrue(op_proto.IsInitialized())
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/v2/framework/tests/test_scope.py
+++ b/python/paddle/v2/framework/tests/test_scope.py
+import paddle.v2.framework.core
+import unittest
+class TestScope(unittest.TestCase):
+    def test_create_destroy(self):
+        paddle_c = paddle.v2.framework.core
+        scope = paddle_c.Scope(None)
+        self.assertIsNotNone(scope)
+        scope_with_parent = paddle_c.Scope(scope)
+        self.assertIsNotNone(scope_with_parent)
+    def test_none_variable(self):
+        paddle_c = paddle.v2.framework.core
+        scope = paddle_c.Scope(None)
+        self.assertIsNone(scope.get_var("test"))
+    def test_create_var_get_var(self):
+        paddle_c = paddle.v2.framework.core
+        scope = paddle_c.Scope(None)
+        var_a = scope.create_var("var_a")
+        self.assertIsNotNone(var_a)
+        self.assertIsNotNone(scope.get_var('var_a'))
+        scope2 = paddle_c.Scope(scope)
+        self.assertIsNotNone(scope2.get_var('var_a'))
+    def test_var_get_int(self):
+        paddle_c = paddle.v2.framework.core
+        scope = paddle_c.Scope(None)
+        var = scope.create_var("test_int")
+        var.set_int(10)
+        self.assertTrue(var.is_int())
+        self.assertEqual(10, var.get_int())
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
@@ -66,6 +66,8 @@ class Optimizer(object):
            if use_sparse_remote_updater:
                        gradient_machine.prefetch(in_args)
                        parameter_updater.getParametersRemote()
+        :param pserver_spec: pserver location, e.g. localhost:3000
        :return: parameter_updater
        """
        if is_local:

--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -41,6 +41,7 @@ class SGD(object):
    :type parameters: paddle.v2.parameters.Parameters
    :param extra_layers: Some layers in the neural network graph are not
                         in the path of cost layer.
     :type extra_layers: paddle.v2.config_base.Layer
+    :param pserver_spec: pserver location, e.g. localhost:3000
    """

--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -29,7 +29,9 @@ setup(name='paddle',
      description='Parallel Distributed Deep Learning',
      install_requires=setup_requires,
      packages=packages,
-      package_data={'paddle.v2.master': ['libpaddle_master.so'], },
+      package_data={'paddle.v2.master': ['libpaddle_master.so'],
+            'paddle.v2.framework': ['core.so']
+      },
      package_dir={
          '': '${CMAKE_CURRENT_SOURCE_DIR}',
          # The paddle.v2.framework.proto will be generated while compiling.