diff --git a/.gitignore b/.gitignore
index 5c2fb134ae896651cb2edc2004112a4246e52359..c84b2fc8c79d6e2c9c83e2b830ab176295846fd0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,3 +22,6 @@ cmake-build-*
 
 # generated while compiling
 python/paddle/v2/framework/core.so
+CMakeFiles
+cmake_install.cmake
+
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2c713db3e38548f242e53b2fb5f436db81f843ef..6bc6a8077c1e9b242af50f1b912e91bbc6c3a9fb 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -140,6 +140,10 @@ endif(USE_NNPACK)
 
 add_subdirectory(proto)
 
+# "add_subdirectory(go)" should be placed after the following loine,
+# because it depends on paddle/optimizer.
+add_subdirectory(paddle/optimizer)
+
 # "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
 # placed after this block, because they depends on it.
 if(WITH_GOLANG)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 83e3d155d038cc65f3e372f0e4ba0aaee2e29690..71ee266611df27cd72c586c49ce2765e6ea0471b 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -93,7 +93,7 @@ include_directories(${CMAKE_CURRENT_BINARY_DIR})
 if(NOT APPLE)
     find_package(Threads REQUIRED)
     link_libraries(${CMAKE_THREAD_LIBS_INIT})
-    set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -ldl")
+    set(CMAKE_CXX_LINK_EXECUTABLE "${CMAKE_CXX_LINK_EXECUTABLE} -ldl -lrt")
 endif(NOT APPLE)
 
 function(merge_static_libs TARGET_NAME)
@@ -301,7 +301,7 @@ function(go_library TARGET_NAME)
 
   file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go")
   string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
-  # FIXME: link path
+
   add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
     COMMAND rm "${${TARGET_NAME}_LIB_PATH}"
     # Golang build source code
@@ -309,7 +309,7 @@ function(go_library TARGET_NAME)
     -o "${${TARGET_NAME}_LIB_PATH}"
     "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}"
     # must run under GOPATH
-  WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
+    WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
   add_dependencies(${TARGET_NAME} go_vendor)
 endfunction(go_library)
 
@@ -320,14 +320,11 @@ function(go_binary TARGET_NAME)
   cmake_parse_arguments(go_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
   string(REPLACE "${PADDLE_GO_PATH}/" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 
-  # FIXME: link path
   add_custom_command(OUTPUT ${TARGET_NAME}_timestamp
-      COMMAND env LIBRARY_PATH=${CMAKE_BINARY_DIR}/go/pserver/client/c/:$ENV{LIBRARY_PATH}
-      GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build
+    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build
     -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
     "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${go_binary_SRCS}"
     WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
-  # TODO: don't know what ${TARGET_NAME}_link does
   add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${TARGET_NAME}_timestamp ${go_binary_DEPS})
   install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME} DESTINATION bin)
 endfunction(go_binary)
@@ -335,15 +332,18 @@ endfunction(go_binary)
 function(go_test TARGET_NAME)
   set(options OPTIONAL)
   set(oneValueArgs "")
-  set(multiValueArgs SRCS DEPS)
+  set(multiValueArgs DEPS)
   cmake_parse_arguments(go_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
-  add_custom_command(OUTPUT ${TARGET_NAME}_timestamp
+  string(REPLACE "${PADDLE_GO_PATH}" "" CMAKE_CURRENT_SOURCE_REL_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+  add_custom_target(${TARGET_NAME} ALL DEPENDS go_vendor ${go_test_DEPS})
+  add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
     COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} test
     -c -o "${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}"
-    ${go_test_SRCS}
+    ".${CMAKE_CURRENT_SOURCE_REL_DIR}"
+    WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go")
+  add_test(NAME ${TARGET_NAME}
+    COMMAND ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}
     WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
-  add_custom_target(${TARGET_NAME} ALL DEPENDS ${TARGET_NAME}_timestamp ${go_test_DEPS})
-  add_test(${TARGET_NAME} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME})
 endfunction(go_test)
 
 function(proto_library TARGET_NAME)
diff --git a/go/CMakeLists.txt b/go/CMakeLists.txt
index f00c70a0589a4f41a23164a95d505d4310d9157b..29ce909c6442014fa0b64c6ca018a61b92c840e9 100644
--- a/go/CMakeLists.txt
+++ b/go/CMakeLists.txt
@@ -17,3 +17,7 @@ add_subdirectory(pserver/client/c)
 add_subdirectory(cmd/pserver)
 add_subdirectory(cmd/master)
 add_subdirectory(master/c)
+add_subdirectory(master)
+add_subdirectory(pserver)
+add_subdirectory(pserver/client)
+add_subdirectory(utils/networkhelper)
diff --git a/go/cmd/master/CMakeLists.txt b/go/cmd/master/CMakeLists.txt
index 1058ffa86b3f00b5e9525edca39a843da9b62db8..9e149967e71c9439bb00b973aa8723a809604aaf 100644
--- a/go/cmd/master/CMakeLists.txt
+++ b/go/cmd/master/CMakeLists.txt
@@ -12,4 +12,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-go_binary(master SRC master.go DEPS paddle_go_optimizer)
+go_binary(master SRC master.go)
diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go
index 0ecb1242c3c3d5246125c9ce946001ccf9cbec24..48351ab6d016b3594a4d983ba0980b2b0eb11c89 100644
--- a/go/cmd/pserver/pserver.go
+++ b/go/cmd/pserver/pserver.go
@@ -8,6 +8,7 @@ import (
 	"time"
 
 	"github.com/namsral/flag"
+	"github.com/topicai/candy"
 
 	"github.com/PaddlePaddle/Paddle/go/pserver"
 	log "github.com/sirupsen/logrus"
@@ -18,53 +19,47 @@ func main() {
 	index := flag.Int("index", -1, "index of this pserver, should be larger or equal than 0")
 	etcdEndpoint := flag.String("etcd-endpoint", "http://127.0.0.1:2379",
 		"comma separated endpoint string for pserver to connect to etcd")
-	etcdTimeout := flag.Int("etcd-timeout", 5, "timeout for etcd calls")
+	etcdTimeout := flag.Duration("etcd-timeout", 5*time.Second, "timeout for etcd calls")
 	numPservers := flag.Int("num-pservers", 1, "total pserver count in a training job")
 	checkpointPath := flag.String("checkpoint-path", "/checkpoints/", "save checkpoint path")
-	checkpointInterval := flag.Int("checkpoint-interval", 600, "save checkpoint per interval seconds")
+	checkpointInterval := flag.Duration("checkpoint-interval", 600*time.Second, "save checkpoint per interval seconds")
 	logLevel := flag.String("log-level", "info",
 		"log level, possible values: debug, info, warning, error, fatal, panic")
 	flag.Parse()
 
 	level, err := log.ParseLevel(*logLevel)
-	if err != nil {
-		panic(err)
-	}
+	candy.Must(err)
+
 	log.SetLevel(level)
 
 	var idx int
-	var cp pserver.Checkpoint
+
+	var cp *pserver.Checkpoint
 	var e *pserver.EtcdClient
 	if *index >= 0 {
 		idx = *index
 	} else {
-		timeout := time.Second * time.Duration((*etcdTimeout))
-		e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, timeout)
+		e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, *etcdTimeout)
 		idx, err = e.Register()
+		candy.Must(err)
+
+		cp, err = pserver.NewCheckpointFromFile(*checkpointPath, idx, e)
 		if err != nil {
-			panic(err)
+			log.Errorf("Fetch checkpoint failed, %s", err)
 		}
 	}
 
 	s, err := pserver.NewService(idx, *checkpointInterval, *checkpointPath, e, cp)
-	if err != nil {
-		panic(err)
-	}
+	candy.Must(err)
+
 	err = rpc.Register(s)
-	if err != nil {
-		panic(err)
-	}
+	candy.Must(err)
 
 	rpc.HandleHTTP()
 	l, err := net.Listen("tcp", ":"+strconv.Itoa(*port))
-	if err != nil {
-		panic(err)
-	}
+	candy.Must(err)
 
 	log.Infof("start pserver at port %d", *port)
 	err = http.Serve(l, nil)
-
-	if err != nil {
-		panic(err)
-	}
+	candy.Must(err)
 }
diff --git a/go/glide.lock b/go/glide.lock
index 190a222338b24b7edac76c72d07df0b2cbd7d9be..f71ae643d68d29846611ec52d0ae7d67e4ced850 100644
--- a/go/glide.lock
+++ b/go/glide.lock
@@ -1,8 +1,8 @@
-hash: b8f18ce6784bd3fadd9fed0b8443e7b658234ea785ae1f220723ae2c1f652aa7
-updated: 2017-06-27T14:05:48.925262819+08:00
+hash: a8faea3a363468a88917ddeb3b1c9ea36886fb2c622acbad42604fa9cb4d3855
+updated: 2017-07-11T10:04:40.786745417+08:00
 imports:
 - name: github.com/coreos/etcd
-  version: 61fc123e7a8b14a0a258aa3f5c4159861b1ec2e7
+  version: cb2a496c4ddd1c87a9f280e116649b599999ec79
   subpackages:
   - auth/authpb
   - clientv3
@@ -22,7 +22,9 @@ imports:
 - name: github.com/PaddlePaddle/recordio
   version: edfb82af0739c84f241c87390ec5649c7b28c129
 - name: github.com/sirupsen/logrus
-  version: 202f25545ea4cf9b191ff7f846df5d87c9382c2b
+  version: 7f976d3a76720c4c27af2ba716b85d2e0a7e38b1
+- name: github.com/topicai/candy
+  version: 1b9030d056fa9f8c4b1f9c91b52fe4b8ab4cd8cc
 - name: golang.org/x/net
   version: c8c74377599bd978aee1cf3b9b63a8634051cec2
   subpackages:
@@ -34,11 +36,11 @@ imports:
   - lex/httplex
   - trace
 - name: golang.org/x/sys
-  version: f7928cfef4d09d1b080aa2b6fd3ca9ba1567c733
+  version: abf9c25f54453410d0c6668e519582a9e1115027
   subpackages:
   - unix
 - name: golang.org/x/text
-  version: 4e9ab9ee170f2a39bd66c92b3e0a47ff47a4bc77
+  version: cfdf022e86b4ecfb646e1efbd7db175dd623a8fa
   subpackages:
   - secure/bidirule
   - transform
diff --git a/go/glide.yaml b/go/glide.yaml
index 05c5d15ca22b6a3d85bee8e1f31d222034ce5314..ab472c7cda9755d0399bb8376b16589be8b53057 100644
--- a/go/glide.yaml
+++ b/go/glide.yaml
@@ -10,3 +10,4 @@ import:
   version: ^1.7.4-pre
 - package: github.com/sirupsen/logrus
   version: ^1.0.0
+- package: github.com/topicai/candy
diff --git a/go/master/CMakeLists.txt b/go/master/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..30531e6469297be1624ea590ea71b1c996b58ed4
--- /dev/null
+++ b/go/master/CMakeLists.txt
@@ -0,0 +1,3 @@
+if(WITH_TESTING)
+  go_test(master_test)
+endif()
diff --git a/go/pserver/CMakeLists.txt b/go/pserver/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6267040a6eb421ef5006a83625cf24a8124f5320
--- /dev/null
+++ b/go/pserver/CMakeLists.txt
@@ -0,0 +1,3 @@
+if(WITH_TESTING)
+  go_test(pserver_test DEPS paddle_go_optimizer)
+endif()
diff --git a/go/pserver/client/CMakeLists.txt b/go/pserver/client/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..0052bb460bbe3a8fc1e898cac8c3d42caec098a7
--- /dev/null
+++ b/go/pserver/client/CMakeLists.txt
@@ -0,0 +1,3 @@
+if(WITH_TESTING)
+  go_test(pserver_client_test DEPS paddle_go_optimizer)
+endif()
diff --git a/go/pserver/client/c/.gitignore b/go/pserver/client/c/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..4bf05c85386dfcef83453a663dffc5d62efcbcc0
--- /dev/null
+++ b/go/pserver/client/c/.gitignore
@@ -0,0 +1 @@
+libpaddle_go_optimizer.a
diff --git a/go/pserver/client/c/CMakeLists.txt b/go/pserver/client/c/CMakeLists.txt
index 93a0a27f858f8654e0a6114abe7e326b086b8bf9..c6333eab550c9a2b71bcaf20b69b2bc0a9b9c529 100644
--- a/go/pserver/client/c/CMakeLists.txt
+++ b/go/pserver/client/c/CMakeLists.txt
@@ -1,5 +1,13 @@
 cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf)
 target_link_libraries(paddle_go_optimizer stdc++ m)
+
+# Copy library to the required place.
+# See: go/pserver/optimizer.go:
+# // #cgo LDFLAGS: ${SRCDIR}/client/c/libpaddle_go_optimizer.a -lstdc++ -lm
+add_custom_command(TARGET paddle_go_optimizer POST_BUILD
+  COMMAND cp "${CMAKE_CURRENT_BINARY_DIR}/libpaddle_go_optimizer.a" "${CMAKE_CURRENT_SOURCE_DIR}"
+  )
+
 go_library(paddle_pserver_cclient STATIC DEPS paddle_go_optimizer)
 if(WITH_TESTING)
   # FIXME: this test requires pserver which is not managed by the test
diff --git a/go/pserver/etcd_client.go b/go/pserver/etcd_client.go
index 1f77787150d16052e3588e9c1795c8d5dafa08e6..4a694b97f47b2ab85d1e109ef7545d104194b5cf 100644
--- a/go/pserver/etcd_client.go
+++ b/go/pserver/etcd_client.go
@@ -16,7 +16,7 @@ import (
 const (
 	// PsDesired is etcd path for store desired pserver count
 	PsDesired = "/ps_desired"
-	// PsAddr is the base dir for pserver to store their addr
+	// PsPath is the base dir for pserver to store their addr
 	PsPath = "/ps/"
 	// PsCheckpoint is the etcd path for store checkpoints information
 	PsCheckpoint = "/checkpoints/"
@@ -189,9 +189,25 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) {
 	return idx, nil
 }
 
+// GetKey gets the value by the specified key
+func (e *EtcdClient) GetKey(key string, timeout time.Duration) ([]byte, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	resp, err := e.etcdClient.Get(ctx, key)
+	cancel()
+	if err != nil {
+		return []byte{}, err
+	}
+	kvs := resp.Kvs
+	if len(kvs) == 0 {
+		return []byte{}, nil
+	}
+	v := kvs[0].Value
+	return v, nil
+}
+
 // PutKey put into etcd with value by key specified
-func (e *EtcdClient) PutKey(key string, value []byte, timeout int) error {
-	ctx, cancel := context.WithTimeout(context.Background(), time.Second*time.Duration(timeout))
+func (e *EtcdClient) PutKey(key string, value []byte, timeout time.Duration) error {
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
 	_, err := e.etcdClient.Put(ctx, key, string(value))
 	cancel()
 	if err != nil {
diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go
index dcb871129985bba7fbabdfae7fbebd38caad4c7f..a6b73dd5a187727f29b6bb0c8b949e5c8bea2195 100644
--- a/go/pserver/optimizer.go
+++ b/go/pserver/optimizer.go
@@ -1,8 +1,7 @@
 package pserver
 
 // #cgo CFLAGS: -I ../../
-// //FIXME: ldflags contain "build" path
-// #cgo LDFLAGS: ${SRCDIR}/../../build/go/pserver/client/c/libpaddle_go_optimizer.a -lstdc++ -lm
+// #cgo LDFLAGS: ${SRCDIR}/client/c/libpaddle_go_optimizer.a -lstdc++ -lm
 // #include "paddle/optimizer/optimizer.h"
 // #include <stdlib.h>
 // #include <string.h>
diff --git a/go/pserver/service.go b/go/pserver/service.go
index 6b52d0d896f8bc04fab6c9b68911523cbb7ac8b9..65db6970a703606cb74dc9f7677757f16f1e7181 100644
--- a/go/pserver/service.go
+++ b/go/pserver/service.go
@@ -9,6 +9,7 @@ import (
 	"encoding/json"
 	"errors"
 	"fmt"
+	"io/ioutil"
 	"os"
 	"path/filepath"
 	"strconv"
@@ -21,14 +22,14 @@ import (
 // ElementType is the type of elements of a Parameter.
 type ElementType int
 
+// RPC error message.
 const (
-	// AlreadyInitialized is true if pserver is initialized
-	AlreadyInitialized = "pserver already initialized"
-	// Uninitialized is true if pserver not fully initialized
-	Uninitialized = "pserver not fully initialized"
+	AlreadyInitialized  = "pserver already initialized"
+	Uninitialized       = "pserver not fully initialized"
+	CheckpointMD5Failed = "checkpoint file MD5 validation failed"
 )
 
-// Supported element types
+// Supported element types.
 const (
 	Int32 ElementType = iota
 	UInt32
@@ -51,21 +52,15 @@ type ParameterWithConfig struct {
 	Config []byte // parameter configuration in Proto Buffer format
 }
 
-// ParameterCheckpoint is Parameter and State checkpoint
-type ParameterCheckpoint struct {
-	ParamConfig ParameterWithConfig
-	State       []byte
-}
-
-// checkpoint signature
+// checkpointMeta saves checkpoint metadata
 type checkpointMeta struct {
 	UUID      string `json:"uuid"`
-	Md5sum    string `json:"md5sum"`
-	Timestamp string `json:"timestamp"`
+	MD5       string `json:"md5"`
+	Timestamp int64  `json:"timestamp"`
 }
 
 // Checkpoint is the pserver shard persist in file
-type Checkpoint []ParameterCheckpoint
+type Checkpoint []parameterCheckpoint
 
 // Gradient is the gradient of the parameter.
 type Gradient Parameter
@@ -81,12 +76,53 @@ type Service struct {
 	optMap             map[string]*optimizer
 }
 
+// parameterCheckpoint saves parameter checkpoint
+type parameterCheckpoint struct {
+	ParameterWithConfig
+	State []byte
+}
+
+// NewCheckpointFromFile loads parameters and state from checkpoint file
+func NewCheckpointFromFile(cpPath string, idx int, e *EtcdClient) (*Checkpoint, error) {
+	v, err := e.GetKey(PsPath+string(idx), 3*time.Second)
+	if err != nil {
+		return nil, err
+	}
+
+	var cpMeta checkpointMeta
+	if err = json.Unmarshal(v, &cpMeta); err != nil {
+		return nil, err
+	}
+
+	fn := filepath.Join(cpPath, cpMeta.UUID)
+	if _, err = os.Stat(fn); os.IsNotExist(err) {
+		return nil, err
+	}
+	content, err := ioutil.ReadFile(fn)
+	if err != nil {
+		return nil, err
+	}
+
+	h := md5.New()
+	md5 := hex.EncodeToString(h.Sum(content))
+	if md5 != cpMeta.MD5 {
+		return nil, errors.New(CheckpointMD5Failed)
+	}
+
+	dec := gob.NewDecoder(bytes.NewReader(content))
+	cp := &Checkpoint{}
+	if err = dec.Decode(cp); err != nil {
+		return nil, err
+	}
+	return cp, nil
+}
+
 // NewService creates a new service, will bypass etcd registration if no
-// endpoints specified.
-func NewService(idx int, seconds int, path string, client *EtcdClient, cp Checkpoint) (*Service, error) {
+// endpoints specified. It will recovery from checkpoint file if a exists a specified checkpoint.
+func NewService(idx int, interval time.Duration, path string, client *EtcdClient, cp *Checkpoint) (*Service, error) {
 	s := &Service{
 		idx:                idx,
-		checkpointInterval: time.Second * time.Duration(seconds),
+		checkpointInterval: interval,
 		checkpointPath:     path,
 		client:             client,
 	}
@@ -94,10 +130,12 @@ func NewService(idx int, seconds int, path string, client *EtcdClient, cp Checkp
 	s.initialized = make(chan struct{})
 
 	if cp != nil {
-		for _, item := range cp {
-			p := item.ParamConfig
-			st := item.State
-			s.optMap[p.Param.Name] = newOptimizer(p, st)
+		for _, item := range *cp {
+			p := ParameterWithConfig{
+				Param:  item.Param,
+				Config: item.Config,
+			}
+			s.optMap[p.Param.Name] = newOptimizer(p, item.State)
 		}
 	}
 	return s, nil
@@ -186,13 +224,13 @@ func (s *Service) doCheckpoint() error {
 	s.mu.Lock()
 	defer s.mu.Unlock()
 
-	cp := make([]ParameterCheckpoint, 0, len(s.optMap))
+	cp := make([]parameterCheckpoint, len(s.optMap))
 	index := 0
 	for name, opt := range s.optMap {
-		var pc ParameterCheckpoint
-		pc.ParamConfig.Param.Name = name
-		pc.ParamConfig.Param.ElementType = opt.elementType
-		pc.ParamConfig.Param.Content = opt.GetWeights()
+		var pc parameterCheckpoint
+		pc.Param.Name = name
+		pc.Param.ElementType = opt.elementType
+		pc.Param.Content = opt.GetWeights()
 		pc.State = opt.GetStates()
 		cp[index] = pc
 		index++
@@ -206,12 +244,12 @@ func (s *Service) doCheckpoint() error {
 
 	cpMeta := checkpointMeta{}
 	cpMeta.UUID = s.checkpointPath + strconv.Itoa(s.idx)
-	cpMeta.Timestamp = time.Now().String()
+	cpMeta.Timestamp = time.Now().UnixNano()
 	h := md5.New()
-	cpMeta.Md5sum = hex.EncodeToString(h.Sum(buf.Bytes()))
+	cpMeta.MD5 = hex.EncodeToString(h.Sum(buf.Bytes()))
 
 	cpMetajson, _ := json.Marshal(cpMeta)
-	err = s.client.PutKey(filepath.Join(PsCheckpoint, strconv.Itoa(s.idx)), cpMetajson, 3)
+	err = s.client.PutKey(filepath.Join(PsCheckpoint, strconv.Itoa(s.idx)), cpMetajson, 3*time.Second)
 	if err != nil {
 		return err
 	}
@@ -219,7 +257,11 @@ func (s *Service) doCheckpoint() error {
 		log.Info("checkpoint does not exists.")
 	} else {
 		err = os.Remove(cpMeta.UUID)
-		log.Infof("checkpoint %s already exsits, removing ", cpMeta.UUID)
+		if err != nil {
+			log.Infof("Removing checkpoint %s failed", cpMeta.UUID)
+		} else {
+			log.Infof("checkpoint %s already exsits, removing ", cpMeta.UUID)
+		}
 	}
 	f, err := os.Create(cpMeta.UUID)
 	defer f.Close()
diff --git a/go/utils/networkhelper/CMakeLists.txt b/go/utils/networkhelper/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..db6cf211d8c0b124856ca5c5fd2c49763b1b4a64
--- /dev/null
+++ b/go/utils/networkhelper/CMakeLists.txt
@@ -0,0 +1,3 @@
+if(WITH_TESTING)
+  go_test(network_helper_test)
+endif()
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 2c1eb7521d896e6d67021e4d020f274bb32d123c..0b5e9a2599dbec6d40eb70609d61ebea2b7e1486 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -8,14 +8,12 @@ add_subdirectory(gserver)
 add_subdirectory(pserver)
 add_subdirectory(trainer)
 add_subdirectory(scripts)
-add_subdirectory(optimizer)
 add_subdirectory(string)
 
 if(Boost_FOUND)
   add_subdirectory(memory)
   add_subdirectory(platform)
   add_subdirectory(framework)
-  add_subdirectory(operators)
   add_subdirectory(pybind)
 endif()
 
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index a91477b327170b11c6ae1424cea87760a5f0faac..7aea6086f5c95ed019aa302f0dfbde707c3e90d9 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -12,7 +12,7 @@ cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
 proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
 cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
 cc_library(operator SRCS operator.cc DEPS op_desc protobuf)
-cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry)
+cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry place)
 cc_library(op_registry SRCS op_registry.cc DEPS op_proto op_desc)
 cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry operator)
 py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto)
diff --git a/paddle/framework/dim.h b/paddle/framework/dim.h
index bcde291d12d429a3f2cd41fa6d0ee606c7c9c92f..883fdc55eb929ebc51e8ae05938e9d07374406ce 100644
--- a/paddle/framework/dim.h
+++ b/paddle/framework/dim.h
@@ -266,29 +266,6 @@ HOSTDEVICE inline bool contained(const Dim<1>& idx, const Dim<1>& size) {
   return ((0 <= idx.head) && (idx.head < size.head));
 }
 
-/**
- * \brief Check if a size and a stride create a Fortran order contiguous
- * block of memory.
- */
-template <int i>
-HOST bool contiguous(const Dim<i>& size, const Dim<i>& stride, int mul = 1) {
-  if (product(size) == 0) return true;
-  int contiguous_stride = get<0>(size) == 1 ? 0 : mul;
-  return (get<0>(stride) == contiguous_stride &&
-          contiguous(size.tail, stride.tail, mul * get<0>(size)));
-}
-
-///\cond HIDDEN
-// Base case of contiguous, check the nth stride is the size of
-// the prefix multiply of n-1 dims.
-template <>
-inline bool contiguous(const Dim<1>& size, const Dim<1>& stride, int mul) {
-  if (get<0>(size) == 0) return true;
-  int contiguous_stride = get<0>(size) == 1 ? 0 : mul;
-  return get<0>(stride) == contiguous_stride;
-}
-///\endcond
-
 /**
  * \brief Compute exclusive prefix-multiply of a Dim.
  */
@@ -306,31 +283,6 @@ HOSTDEVICE inline Dim<1> ex_prefix_mul(const Dim<1>& src, int mul) {
 }
 ///\endcond
 
-/**
- * \brief Calculate strides of a contiguous array of the given size
- *
- * Sets the stride for any dimension with an extent of 1 to 0.
- * \param size Dim object containing the size of the array.
- * \param base The base stride to use.
- * \return Dim object the same size as \p size with the strides.
- */
-template <int i>
-HOSTDEVICE Dim<i> contiguous_strides(const Dim<i>& size, int base = 1) {
-  int stride = size.head == 1 ? 0 : base;
-  return Dim<i>(stride, contiguous_strides(size.tail, base * size.head));
-}
-
-///\cond HIDDEN
-
-// Base case of contiguous_strides
-template <>
-HOSTDEVICE inline Dim<1> contiguous_strides(const Dim<1>& size, int base) {
-  int stride = size.head == 1 ? 0 : base;
-  return Dim<1>(stride);
-}
-
-///\endcond
-
 /**
  * Add two dimensions together
  */
diff --git a/paddle/framework/dim_test.cu b/paddle/framework/dim_test.cu
index 809bf04826637195425a32c054c94e00ef940df9..05217415196f3ec3ce9b5de7cb2f82c9de960ba7 100644
--- a/paddle/framework/dim_test.cu
+++ b/paddle/framework/dim_test.cu
@@ -58,24 +58,6 @@ TEST(Dim, Equality) {
     EXPECT_EQ(paddle::framework::get<1>(c), 3);
     EXPECT_EQ(paddle::framework::get<2>(c), 12);
 
-    // contiguous_strides
-    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(10, 1, 10));
-    EXPECT_EQ(paddle::framework::get<0>(c), 1);
-    EXPECT_EQ(paddle::framework::get<1>(c), 0);
-    EXPECT_EQ(paddle::framework::get<2>(c), 10);
-    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(10, 10, 1));
-    EXPECT_EQ(paddle::framework::get<0>(c), 1);
-    EXPECT_EQ(paddle::framework::get<1>(c), 10);
-    EXPECT_EQ(paddle::framework::get<2>(c), 0);
-    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(1, 10, 10));
-    EXPECT_EQ(paddle::framework::get<0>(c), 0);
-    EXPECT_EQ(paddle::framework::get<1>(c), 1);
-    EXPECT_EQ(paddle::framework::get<2>(c), 10);
-    c = paddle::framework::contiguous_strides(paddle::framework::Dim<3>(2, 3, 4));
-    EXPECT_EQ(paddle::framework::get<0>(c), 1);
-    EXPECT_EQ(paddle::framework::get<1>(c), 2);
-    EXPECT_EQ(paddle::framework::get<2>(c), 6);
-
     // generate from an index
     auto size = paddle::framework::make_dim(4, 5, 2);
     c = paddle::framework::Dim<3>(14, size);
@@ -101,16 +83,6 @@ TEST(Dim, Bool) {
     EXPECT_TRUE(a == a);
     EXPECT_FALSE(a == b);
     EXPECT_TRUE(a == c);
-
-    // contiguous check
-    int x = 4, y = 5, z = 2;
-    paddle::framework::Dim<3> sizef(x, y, z);
-    paddle::framework::Dim<3> stridea(1, x, x*y);
-    paddle::framework::Dim<3> strideb(2, 2*x, 2*x*y);
-    paddle::framework::Dim<3> stridec(1, x, 2*x*y);
-    EXPECT_TRUE(paddle::framework::contiguous(sizef, stridea));
-    EXPECT_FALSE(paddle::framework::contiguous(sizef, strideb));
-    EXPECT_FALSE(paddle::framework::contiguous(sizef, stridec));
 }
 
 TEST(Dim, Print) {
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index 02c99d50bb50cbd49a56a2282e55c148d4e6af16..248c7a1a3b866ae3bf2af33d0ff67b92d0f9c456 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -147,13 +147,13 @@ class OpRegisterHelper {
   }
 };
 
-#define REGISTER_OP(__op_class, __op_maker_class, __op_type)         \
-  class __op_class##Register {                                       \
-   private:                                                          \
-    const static OpRegisterHelper<__op_class, __op_maker_class> reg; \
-  };                                                                 \
-  const OpRegisterHelper<__op_class, __op_maker_class>               \
-      __op_class##Register::reg(#__op_type);
+#define REGISTER_OP(type, op_class, op_maker_class)                         \
+  class op_class##Register {                                                \
+   private:                                                                 \
+    const static OpRegisterHelper<op_class, op_maker_class> reg;            \
+  };                                                                        \
+  const OpRegisterHelper<op_class, op_maker_class> op_class##Register::reg( \
+      #type)
 
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc
index c4baafc2aebc8d009a388635bbab180d86a4b914..f5162fb870a91e566a0d2b1419050fe0799b199b 100644
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
@@ -1,17 +1,15 @@
 #include "paddle/framework/op_registry.h"
 #include <gtest/gtest.h>
-#include "paddle/framework/operator.h"
-#include "paddle/operators/demo_op.h"
 
 using namespace paddle::framework;
 
 namespace paddle {
 namespace framework {
-class CosineOp : public OperatorWithKernel {
+class CosineOp : public OperatorBase {
  public:
-  void Run(const OpRunContext* context) const override {
-    printf("%s\n", DebugString().c_str());
-  }
+  void Run(const std::shared_ptr<Scope>& scope,
+           const platform::DeviceContext& dev_ctx) const override {}
+  void InferShape(const std::shared_ptr<Scope>& scope) const override {}
 };
 
 class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
@@ -28,14 +26,15 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
   }
 };
 
-REGISTER_OP(CosineOp, CosineOpProtoAndCheckerMaker, cos_sim)
+REGISTER_OP(cos_sim, CosineOp, CosineOpProtoAndCheckerMaker);
+
+class MyTestOp : public OperatorBase {
+ public:
+  void InferShape(const std::shared_ptr<Scope>& scope) const override {}
+  void Run(const std::shared_ptr<Scope>& scope,
+           const platform::DeviceContext& dev_ctx) const override {}
 
-class MyTestOp : public OperatorWithKernel {
  public:
-  void Run(const OpRunContext* ctx) const override {
-    printf("%s\n", DebugString().c_str());
-    printf("test_attr = %d\n", ctx->op_->GetAttr<int>("test_attr"));
-  }
 };
 
 class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
@@ -54,7 +53,7 @@ class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
   }
 };
 
-REGISTER_OP(MyTestOp, MyTestOpProtoAndCheckerMaker, my_test_op)
+REGISTER_OP(my_test_op, MyTestOp, MyTestOpProtoAndCheckerMaker);
 }  // namespace framework
 }  // namespace paddle
 
@@ -73,8 +72,8 @@ TEST(OpRegistry, CreateOp) {
   paddle::framework::OperatorBase* op =
       paddle::framework::OpRegistry::CreateOp(op_desc);
   auto scope = std::make_shared<Scope>();
-  auto dev_ctx = DeviceContext();
-  op->Run(scope, &dev_ctx);
+  paddle::platform::CPUDeviceContext dev_ctx;
+  op->Run(scope, dev_ctx);
   float scale_get = op->GetAttr<float>("scale");
   ASSERT_EQ(scale_get, scale);
 }
@@ -116,8 +115,8 @@ TEST(OpRegistry, DefaultValue) {
   paddle::framework::OperatorBase* op =
       paddle::framework::OpRegistry::CreateOp(op_desc);
   auto scope = std::make_shared<Scope>();
-  auto dev_ctx = DeviceContext();
-  op->Run(scope, &dev_ctx);
+  paddle::platform::CPUDeviceContext dev_ctx;
+  op->Run(scope, dev_ctx);
   ASSERT_EQ(op->GetAttr<float>("scale"), 1.0);
 }
 
@@ -169,9 +168,9 @@ TEST(OpRegistry, CustomChecker) {
   attr->set_i(4);
   paddle::framework::OperatorBase* op =
       paddle::framework::OpRegistry::CreateOp(op_desc);
-  auto dev_ctx = DeviceContext();
+  paddle::platform::CPUDeviceContext dev_ctx;
   auto scope = std::make_shared<Scope>();
-  op->Run(scope, &dev_ctx);
+  op->Run(scope, dev_ctx);
   int test_attr = op->GetAttr<int>("test_attr");
   ASSERT_EQ(test_attr, 4);
 }
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index 3db3706e47dfab49f54b3f1f9f2e41c53fc3f298..8f7adff8b3982e91a3d7f6d598cd62d5005d5f17 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -39,13 +39,5 @@ std::string OperatorBase::DebugString() const {
   return ss.str();
 }
 
-const Variable* OpRunContext::Input(int index) const {
-  return scope_->GetVariable(op_->inputs_[index]);
-}
-
-Variable* OpRunContext::Output(int index) const {
-  return scope_->GetVariable(op_->outputs_[index]);
-}
-
 }  // namespace framework
 }  // namespace paddle
\ No newline at end of file
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 6570d5869814a198195968606d041055e847ca08..0ce422e007c39ce0c3f5f7a89650cc211919ea8f 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -14,44 +14,22 @@ limitations under the License. */
 
 #pragma once
 
+#include <paddle/framework/attr_checker.h>
+#include <paddle/framework/op_desc.pb.h>
+#include <paddle/framework/scope.h>
+#include <paddle/platform/device_context.h>
+#include <paddle/platform/place.h>
+#include <paddle/utils/Error.h>
 #include <boost/variant.hpp>
 #include <string>
 #include <unordered_map>
 #include <vector>
 
-#include "paddle/framework/attr_checker.h"
-#include "paddle/framework/op_desc.pb.h"
-#include "paddle/framework/scope.h"
-#include "paddle/utils/Error.h"
-
 namespace paddle {
 namespace framework {
 
 class OperatorBase;
 
-class DeviceContext {};
-
-/**
- * OpRunContext is the only parameter of Operator's Run function.
- * Run will get input/output variables, state such as momentum and
- * device resource such as CUDA stream, cublas handle, etc. from
- * OpRunContext. User should construct it before run the Operator.
- */
-class OpRunContext {
- public:
-  OpRunContext(const OperatorBase* op, const std::shared_ptr<Scope> scope,
-               const DeviceContext* device_context)
-      : op_(op), scope_(scope), device_context_(device_context) {}
-
-  const Variable* Input(int index) const;
-  Variable* Output(int index) const;
-
- public:
-  const OperatorBase* op_;
-  const std::shared_ptr<Scope> scope_;
-  const DeviceContext* device_context_;
-};
-
 /**
  * OperatorBase has the basic element that Net will call to do computation.
  * Only CreateOperator from OpRegistry will new Operator directly. User
@@ -77,7 +55,10 @@ class OperatorBase {
 
   /// Net will call this function to Run an op.
   virtual void Run(const std::shared_ptr<Scope>& scope,
-                   const DeviceContext* dev_ctx) const = 0;
+                   const platform::DeviceContext& dev_ctx) const = 0;
+
+ protected:
+  std::string Type() const { return desc_.type(); }
 
  public:
   OpDesc desc_;
@@ -86,22 +67,84 @@ class OperatorBase {
   AttributeMap attrs_;
 };
 
+class OpKernel {
+ public:
+  /**
+   * KernelContext is the only parameter of Kernel Run function.
+   * Run will get input/output variables, state such as momentum and
+   * device resource such as CUDA stream, cublas handle, etc. from
+   * KernelContext. User should construct it before run the Operator.
+   */
+  class KernelContext {
+   public:
+    KernelContext(const OperatorBase* op, const std::shared_ptr<Scope>& scope,
+                  const platform::DeviceContext& device_context)
+        : op_(*op), scope_(scope), device_context_(device_context) {}
+
+    const Variable* Input(int index) const {
+      return scope_->GetVariable(op_.inputs_[index]);
+    }
+
+    Variable* Output(int index) const {
+      return scope_->GetVariable(op_.outputs_[index]);
+    }
+
+    const OperatorBase& op_;
+    const std::shared_ptr<Scope>& scope_;
+    const platform::DeviceContext& device_context_;
+  };
+
+  virtual void Compute(const KernelContext& context) const = 0;
+
+  virtual ~OpKernel() {}
+};
+
 class OperatorWithKernel : public OperatorBase {
  public:
-  virtual ~OperatorWithKernel() {}
+  struct OpKernelKey {
+    platform::Place place_;
 
-  virtual void InferShape(const std::shared_ptr<Scope>& scope) const {}
+    OpKernelKey() = default;
+    OpKernelKey(const platform::DeviceContext& dev_ctx) {
+      place_ = dev_ctx.GetPlace();
+    }
+
+    bool operator==(const OpKernelKey& o) const { return place_ == o.place_; }
+  };
+
+  struct OpKernelHash {
+    std::hash<bool> hash_;
+    size_t operator()(const OpKernelKey& key) const {
+      return hash_(platform::is_gpu_place(key.place_));
+    }
+  };
+
+  using OpKernelMap =
+      std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;
 
   void Run(const std::shared_ptr<Scope>& scope,
-           const DeviceContext* dev_ctx) const {
-    OpRunContext op_ctx(this, scope, dev_ctx);
-    Run(&op_ctx);
+           const platform::DeviceContext& dev_ctx) const final {
+    auto& opKernel = AllOpKernels().at(Type()).at(OpKernelKey(dev_ctx));
+    opKernel->Compute(OpKernel::KernelContext(this, scope, dev_ctx));
   }
 
-  /// when implement an Op, your should implement this function.
-  /// this function should be moved to OpKernel later
-  virtual void Run(const OpRunContext* context) const = 0;
+  static std::unordered_map<std::string /* op_type */, OpKernelMap>&
+  AllOpKernels() {
+    static std::unordered_map<std::string, OpKernelMap> g_all_op_kernels;
+    return g_all_op_kernels;
+  };
 };
 
 }  // namespace framework
 }  // namespace paddle
+
+#define REGISTER_OP_KERNEL(type, PlaceType, KernelType)                   \
+  struct __op_kernel_register__##type##__ {                               \
+    __op_kernel_register__##type##__() {                                  \
+      ::paddle::framework::OperatorWithKernel::OpKernelKey key;           \
+      key.place_ = PlaceType();                                           \
+      ::paddle::framework::OperatorWithKernel::AllOpKernels()[#type][key] \
+          .reset(new KernelType());                                       \
+    }                                                                     \
+  };                                                                      \
+  static __op_kernel_register__##type##__ __reg_kernel_##type##__
diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc
index 48808dabb2711936550d04cb49003e87663d3d27..be8c4be2d429648b3c8a708c7f8bdcae3ff2d283 100644
--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -19,17 +19,15 @@ limitations under the License. */
 namespace paddle {
 namespace framework {
 
-class OperatorTest : public OperatorWithKernel {
+class OperatorTest : public OperatorBase {
  public:
-  void Run(const OpRunContext* ctx) const override {
-    float scale = ctx->op_->GetAttr<float>("scale");
-    PADDLE_ENFORCE(ctx->Input(0) == nullptr, "Input(0) should not initialized");
-    PADDLE_ENFORCE(ctx->Output(0) == nullptr,
-                   "Output(1) should not initialized");
-    auto output1 = ctx->scope_->CreateVariable("output1");
-    PADDLE_ENFORCE(output1 != nullptr, "should create output1 from scope");
-    printf("get attr %s = %f\n", "scale", scale);
-    printf("%s\n", DebugString().c_str());
+  void InferShape(const std::shared_ptr<Scope>& scope) const override {}
+  void Run(const std::shared_ptr<Scope>& scope,
+           const platform::DeviceContext& dev_ctx) const override {
+    float scale = GetAttr<float>("scale");
+    ASSERT_NEAR(scale, 3.14, 1e-5);
+    ASSERT_EQ(scope->GetVariable(inputs_[0]), nullptr);
+    ASSERT_NE(scope->GetVariable(outputs_[0]), nullptr);
   }
 };
 
@@ -47,34 +45,79 @@ class OperatorTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
   }
 };
 
-REGISTER_OP(OperatorTest, OperatorTestProtoAndCheckerMaker, test_operator)
+REGISTER_OP(test_operator, OperatorTest, OperatorTestProtoAndCheckerMaker);
 
-TEST(OperatorBase, DebugString) {
+TEST(OperatorBase, all) {
   OpDesc op_desc;
   op_desc.set_type("test_operator");
-  std::vector<std::string> inputs = {"IN1", "IN2"};
-  for (auto& input : inputs) {
-    op_desc.add_inputs(input);
-  }
-  std::vector<std::string> outputs = {"OUT1", "OUT2"};
-  for (auto& output : outputs) {
-    op_desc.add_outputs(output);
-  }
+  *op_desc.mutable_inputs()->Add() = "IN1";
+  *op_desc.mutable_outputs()->Add() = "OUT1";
   auto attr = op_desc.mutable_attrs()->Add();
   attr->set_name("scale");
   attr->set_type(paddle::framework::AttrType::FLOAT);
   float scale = 3.14;
   attr->set_f(scale);
 
-  DeviceContext device_context;
+  platform::CPUDeviceContext device_context;
   auto scope = std::make_shared<Scope>();
 
   OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc);
-  ASSERT_EQ(op->inputs_, inputs);
-  ASSERT_EQ(op->outputs_, outputs);
   ASSERT_EQ(op->GetAttr<float>("scale"), scale);
-  op->Run(scope, &device_context);
+  scope->CreateVariable("OUT1");
+  op->Run(scope, device_context);
+  std::cout << op->DebugString() << std::endl;
+  delete op;
 }
 
+class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+ public:
+  OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("input", "input of test op");
+    AddOutput("output", "output of test op");
+    AddAttr<float>("scale", "scale of cosine op")
+        .SetDefault(1.0)
+        .LargerThan(0.0);
+    AddType("test_operator");
+    AddComment("This is test op");
+  }
+};
+
+class OpWithKernelTest : public OperatorWithKernel {
+ public:
+  void InferShape(const std::shared_ptr<Scope>& scope) const override {}
+};
+
+class CPUKernelTest : public OpKernel {
+ public:
+  void Compute(const KernelContext& context) const {
+    float scale = context.op_.GetAttr<float>("scale");
+    ASSERT_NEAR(scale, 3.14, 1e-5);
+    std::cout << "this is cpu kernel" << std::endl;
+    std::cout << context.op_.DebugString() << std::endl;
+  }
+};
+
+REGISTER_OP(op_with_kernel, OpWithKernelTest, OpKernelTestProtoAndCheckerMaker);
+REGISTER_OP_KERNEL(op_with_kernel, platform::CPUPlace, CPUKernelTest);
+
+TEST(OpKernel, all) {
+  OpDesc op_desc;
+  op_desc.set_type("op_with_kernel");
+  *op_desc.mutable_inputs()->Add() = "IN1";
+  *op_desc.mutable_outputs()->Add() = "OUT1";
+  auto attr = op_desc.mutable_attrs()->Add();
+  attr->set_name("scale");
+  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_f(3.14);
+
+  platform::CPUDeviceContext cpu_device_context;
+  auto scope = std::make_shared<Scope>();
+
+  OperatorBase* op = paddle::framework::OpRegistry::CreateOp(op_desc);
+  op->Run(scope, cpu_device_context);
+
+  delete op;
+}
 }  // namespace framework
 }  // namespace paddle
\ No newline at end of file
diff --git a/paddle/operators/.clang-format b/paddle/operators/.clang-format
deleted file mode 100644
index 29282dc87e2c499988c17d90d47d44cd5cf7f115..0000000000000000000000000000000000000000
--- a/paddle/operators/.clang-format
+++ /dev/null
@@ -1,5 +0,0 @@
----
-Language:        Cpp
-BasedOnStyle:  Google
-Standard:  Cpp11 
-...
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/paddle/operators/demo_op.h b/paddle/operators/demo_op.h
deleted file mode 100644
index d0b7420b4e25d21f718d5e10d62faeb475931a18..0000000000000000000000000000000000000000
--- a/paddle/operators/demo_op.h
+++ /dev/null
@@ -1,59 +0,0 @@
-#pragma once
-
-#include "paddle/framework/op_registry.h"
-
-using namespace paddle::framework;
-
-namespace paddle {
-namespace operators {
-
-class CosineOp : public OperatorWithKernel {
- public:
-  void Run(const OpRunContext *context) const override {
-    printf("%s\n", DebugString().c_str());
-  }
-};
-
-class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
- public:
-  CosineOpProtoAndCheckerMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("input", "input of cosine op");
-    AddOutput("output", "output of cosine op");
-    AddAttr<float>("scale", "scale of cosine op")
-        .SetDefault(1.0)
-        .LargerThan(0.0);
-    AddType("cos");
-    AddComment("This is cos op");
-  }
-};
-
-REGISTER_OP(CosineOp, CosineOpProtoAndCheckerMaker, cos_sim)
-
-class MyTestOp : public OperatorWithKernel {
- public:
-  void Run(const OpRunContext *context) const override {
-    printf("%s\n", DebugString().c_str());
-  }
-};
-
-class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
- public:
-  MyTestOpProtoAndCheckerMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("input", "input of cosine op");
-    AddOutput("output", "output of cosine op");
-    auto my_checker = [](int i) {
-      PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!");
-    };
-    AddAttr<int>("test_attr", "a simple test attribute")
-        .AddCustomChecker(my_checker);
-    AddType("my_test_op");
-    AddComment("This is my_test op");
-  }
-};
-
-REGISTER_OP(MyTestOp, MyTestOpProtoAndCheckerMaker, my_test_op)
-
-}  // namespace operators
-}  // namespace operators
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index 7a198aec6cf12c92cb24a8e560508d06db5e1dcf..358d14f4555e1d046c8e7b91e23d54fb504926e5 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -1,11 +1,12 @@
 add_subdirectory(dynload)
 
-nv_test(cuda_test SRCS cuda_test.cu DEPS dyload_cuda)
+nv_test(cuda_test SRCS cuda_test.cu)
 
 cc_library(place SRCS place.cc)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
+
 IF(WITH_GPU)
-    set(GPU_CTX_DEPS dyload_cuda dynamic_loader )
+    set(GPU_CTX_DEPS dynload_cuda dynamic_loader)
 ELSE()
     set(GPU_CTX_DEPS)
 ENDIF()
diff --git a/paddle/platform/device_context.cc b/paddle/platform/device_context.cc
index a2dea2ed1e11817c23dd2dc55a578d8fbd21ecb2..960ef0a5955bfe5f7d33b7c8e4524176b0dbfda6 100644
--- a/paddle/platform/device_context.cc
+++ b/paddle/platform/device_context.cc
@@ -1,13 +1,30 @@
-#include <paddle/platform/device_context.h>
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/device_context.h"
 
 namespace paddle {
 namespace platform {
-namespace dynload {
-namespace dummy {
-// Make DeviceContext A library.
-int DUMMY_VAR_FOR_DEV_CTX = 0;
 
-}  // namespace dummy
-}  // namespace dynload
+template <>
+Eigen::DefaultDevice* DeviceContext::get_eigen_device<Eigen::DefaultDevice>() {
+  return reinterpret_cast<CPUDeviceContext*>(this)->eigen_device();
+}
+
+#ifndef PADDLE_ONLY_CPU
+template <>
+Eigen::GpuDevice* DeviceContext::get_eigen_device<Eigen::GpuDevice>() {
+  return reinterpret_cast<CUDADeviceContext*>(this)->eigen_device();
+}
+#endif
+
 }  // namespace platform
-}  // namespace paddle
\ No newline at end of file
+}  // namespace paddle
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
index 160eb4e12060b36c4fefba499d4e83b9aab92848..7de07d06bed885d6529a884fb81fedbdaba78f4a 100644
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -1,11 +1,8 @@
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-
     http://www.apache.org/licenses/LICENSE-2.0
-
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -22,8 +19,9 @@ limitations under the License. */
 #include "paddle/platform/dynload/curand.h"
 #define EIGEN_USE_GPU
 #endif
-#include "paddle/platform/place.h"
-#include "unsupported/Eigen/CXX11/Tensor"
+#include <paddle/platform/place.h>
+#include <memory>
+#include <unsupported/Eigen/CXX11/Tensor>
 
 namespace paddle {
 namespace platform {
@@ -31,9 +29,29 @@ namespace platform {
 class DeviceContext {
  public:
   virtual ~DeviceContext() {}
+  virtual Place GetPlace() const = 0;
+
+  template <typename DeviceType>
+  DeviceType* get_eigen_device();
 };
 
-class CPUDeviceContext : public DeviceContext {};
+class CPUDeviceContext : public DeviceContext {
+ public:
+  Eigen::DefaultDevice* eigen_device() {
+    if (!eigen_device_) {
+      eigen_device_.reset(new Eigen::DefaultDevice());
+    }
+    return eigen_device_.get();
+  }
+
+  Place GetPlace() const override {
+    Place retv = CPUPlace();
+    return retv;
+  }
+
+ private:
+  std::unique_ptr<Eigen::DefaultDevice> eigen_device_;
+};
 
 #ifndef PADDLE_ONLY_CPU
 
@@ -57,8 +75,13 @@ class CUDADeviceContext : public DeviceContext {
     GPUPlaceGuard guard(gpu_place_);
     paddle::platform::throw_on_error(cudaStreamCreate(&stream_),
                                      "cudaStreamCreate failed");
-    eigen_stream_ = new Eigen::CudaStreamDevice(&stream_);
-    eigen_device_ = new Eigen::GpuDevice(eigen_stream_);
+    eigen_stream_.reset(new Eigen::CudaStreamDevice(&stream_));
+    eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
+  }
+
+  Place GetPlace() const override {
+    Place retv = GPUPlace();
+    return retv;
   }
 
   void Wait() {
@@ -68,7 +91,7 @@ class CUDADeviceContext : public DeviceContext {
 
   cudaStream_t stream() { return stream_; }
 
-  Eigen::GpuDevice eigen_device() { return *eigen_device_; }
+  Eigen::GpuDevice* eigen_device() { return eigen_device_.get(); }
 
   cublasHandle_t cublas_handle() {
     if (!blas_handle_) {
@@ -133,10 +156,8 @@ class CUDADeviceContext : public DeviceContext {
                          rand_generator_) == CURAND_STATUS_SUCCESS,
                      "curandDestroyGenerator failed");
     }
-
-    delete eigen_stream_;
-    delete eigen_device_;
-
+    eigen_stream_.reset();
+    eigen_device_.reset();
     paddle::platform::throw_on_error(cudaStreamDestroy(stream_),
                                      "cudaStreamDestroy failed");
   }
@@ -145,8 +166,8 @@ class CUDADeviceContext : public DeviceContext {
   GPUPlace gpu_place_;
   cudaStream_t stream_;
 
-  Eigen::CudaStreamDevice* eigen_stream_;
-  Eigen::GpuDevice* eigen_device_;
+  std::unique_ptr<Eigen::CudaStreamDevice> eigen_stream_;
+  std::unique_ptr<Eigen::GpuDevice> eigen_device_;
 
   cublasHandle_t blas_handle_{nullptr};
 
@@ -155,6 +176,8 @@ class CUDADeviceContext : public DeviceContext {
   int random_seed_;
   curandGenerator_t rand_generator_{nullptr};
 };
+
 #endif
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc
index 61be4a307dbf073be7dff4564183240834cc7df6..af2ce17fc2238dda62e9888ebe9426edcd55d2bc 100644
--- a/paddle/platform/device_context_test.cc
+++ b/paddle/platform/device_context_test.cc
@@ -15,13 +15,26 @@ limitations under the License. */
 #include "paddle/platform/device_context.h"
 #include "gtest/gtest.h"
 
-TEST(CUDADeviceContext, Init) {
+using DEVICE_GPU = Eigen::GpuDevice;
+TEST(Device, Init) {
+  int count = paddle::platform::GetDeviceCount();
+  for (int i = 0; i < count; i++) {
+    paddle::platform::DeviceContext* device_context =
+        new paddle::platform::CUDADeviceContext(i);
+    Eigen::GpuDevice* gpu_device =
+        device_context->template get_eigen_device<DEVICE_GPU>();
+    ASSERT_NE(nullptr, gpu_device);
+    delete device_context;
+  }
+}
+
+TEST(Device, CUDADeviceContext) {
   int count = paddle::platform::GetDeviceCount();
   for (int i = 0; i < count; i++) {
     paddle::platform::CUDADeviceContext* device_context =
         new paddle::platform::CUDADeviceContext(i);
-    Eigen::GpuDevice gpu_device = device_context->eigen_device();
-    ASSERT_NE(nullptr, gpu_device.stream());
+    Eigen::GpuDevice* gpu_device = device_context->eigen_device();
+    ASSERT_NE(nullptr, gpu_device);
     cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
     ASSERT_NE(nullptr, cudnn_handle);
     cublasHandle_t cublas_handle = device_context->cublas_handle();
diff --git a/paddle/platform/dynload/CMakeLists.txt b/paddle/platform/dynload/CMakeLists.txt
index 4a8866b3d364542f315978859e96290c6f067f6f..d205ead84598e04eea523be32139959a02e0dd83 100644
--- a/paddle/platform/dynload/CMakeLists.txt
+++ b/paddle/platform/dynload/CMakeLists.txt
@@ -1,2 +1,2 @@
 cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags)
-nv_library(dyload_cuda SRCS cublas.cc cudnn.cc curand.cc)
+nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc)
diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc
index 55aebc59eca50ad33e8a5357c5ca29d4101f754b..f9f87acf15a6b62c343cc0e3db9ebc7e0aabb786 100644
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <Python.h>
 #include <paddle/framework/scope.h>
 #include <pybind11/pybind11.h>
 
@@ -43,4 +44,4 @@ All parameter, weight, gradient are variables in Paddle.
            py::return_value_policy::reference);
 
   return m.ptr();
-}
\ No newline at end of file
+}