Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into crop_layer

de5ded6b · wanghaoshuang · 69b12225 · 15f021a9 · de5ded6b · de5ded6b
43 changed file
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -21,3 +21,10 @@
    sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
    hooks:
    -   id: clang-formater
+-   repo: https://github.com/dnephin/pre-commit-golang
+    sha: e4693a4c282b4fc878eda172a929f7a6508e7d16
+    hooks:
+      -   id: go-fmt
+          files: (.*\.go)
+      -   id: go-lint
+          files: (.*\.go)
--- a/.travis.yml
+++ b/.travis.yml
@@ -37,12 +37,13 @@ before_install:
  # protobuf version.
  - pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker
  - pip install rarfile
+  - curl https://glide.sh/get | bash
  - eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
  - |
    function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:
  - |
-    export WITH_GOLANG=ON && timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
+    timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi;
 notifications:
  email:

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,6 +16,7 @@ cmake_minimum_required(VERSION 3.0)

 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 set(PROJ_ROOT ${CMAKE_CURRENT_SOURCE_DIR})
+set(PROJ_BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR})

 include(system)


--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -38,12 +38,14 @@ ExternalProject_Add(
    CMAKE_ARGS      -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
    CMAKE_ARGS      -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
+    CMAKE_ARGS      -DCMAKE_INSTALL_LIBDIR=${GLOG_INSTALL_DIR}/lib
    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
    CMAKE_ARGS      -DWITH_GFLAGS=ON
    CMAKE_ARGS      -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
    CMAKE_ARGS      -DBUILD_TESTING=OFF
    CMAKE_ARGS      -DCMAKE_BUILD_TYPE=Release
    CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${GLOG_INSTALL_DIR}
+                     -DCMAKE_INSTALL_LIBDIR:PATH=${GLOG_INSTALL_DIR}/lib
                     -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
                     -DCMAKE_BUILD_TYPE:STRING=Release
 )

--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -17,6 +17,65 @@ INCLUDE(ExternalProject)
 FIND_PACKAGE(Protobuf QUIET)
 SET(PROTOBUF_FOUND "OFF")

+# Fallback definition of protobuf_generate_python, used only when the running
+# CMake does not already provide a command with that name.
+#
+# Usage: protobuf_generate_python(<out_var> <proto file>...)
+#   For each listed .proto file a custom command is registered that runs
+#   ${Protobuf_PROTOC_EXECUTABLE} with --python_out ${CMAKE_CURRENT_BINARY_DIR};
+#   the path of the resulting <name>_pb2.py is appended to <out_var>, which is
+#   exported to the caller's scope.
+#   Include paths passed to protoc: the directory of every proto file when
+#   PROTOBUF_GENERATE_CPP_APPEND_PATH is set, otherwise
+#   ${CMAKE_CURRENT_SOURCE_DIR}; plus every entry of Protobuf_IMPORT_DIRS
+#   (PROTOBUF_IMPORT_DIRS is accepted as a legacy spelling).
+# NOTE(review): the command relies on Protobuf_PROTOC_EXECUTABLE being set;
+# confirm it is defined on the old CMake versions where this fallback is active.
+if(NOT COMMAND protobuf_generate_python)  # before cmake 3.4, protobuf_generate_python is not defined.
+    function(protobuf_generate_python SRCS)
+        # shameless copy from https://github.com/Kitware/CMake/blob/master/Modules/FindProtobuf.cmake
+        if(NOT ARGN)
+            message(SEND_ERROR "Error: PROTOBUF_GENERATE_PYTHON() called without any proto files")
+            return()
+        endif()
+
+        if(PROTOBUF_GENERATE_CPP_APPEND_PATH)
+            # Create an include path for each file specified
+            foreach(FIL ${ARGN})
+                get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
+                get_filename_component(ABS_PATH ${ABS_FIL} PATH)
+                list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
+                if(${_contains_already} EQUAL -1)
+                    list(APPEND _protobuf_include_path -I ${ABS_PATH})
+                endif()
+            endforeach()
+        else()
+            set(_protobuf_include_path -I ${CMAKE_CURRENT_SOURCE_DIR})
+        endif()
+
+        # Accept the legacy upper-case variable name when the new one is unset.
+        if(DEFINED PROTOBUF_IMPORT_DIRS AND NOT DEFINED Protobuf_IMPORT_DIRS)
+            set(Protobuf_IMPORT_DIRS "${PROTOBUF_IMPORT_DIRS}")
+        endif()
+
+        # Add every import directory that is not on the include path yet.
+        if(DEFINED Protobuf_IMPORT_DIRS)
+            foreach(DIR ${Protobuf_IMPORT_DIRS})
+                get_filename_component(ABS_PATH ${DIR} ABSOLUTE)
+                list(FIND _protobuf_include_path ${ABS_PATH} _contains_already)
+                if(${_contains_already} EQUAL -1)
+                    list(APPEND _protobuf_include_path -I ${ABS_PATH})
+                endif()
+            endforeach()
+        endif()
+
+        # Start from an empty output list.
+        set(${SRCS})
+        foreach(FIL ${ARGN})
+            get_filename_component(ABS_FIL ${FIL} ABSOLUTE)
+            get_filename_component(FIL_WE ${FIL} NAME_WE)
+            if(NOT PROTOBUF_GENERATE_CPP_APPEND_PATH)
+                # Keep the proto file's relative directory in the output name.
+                get_filename_component(FIL_DIR ${FIL} DIRECTORY)
+                if(FIL_DIR)
+                    set(FIL_WE "${FIL_DIR}/${FIL_WE}")
+                endif()
+            endif()
+
+            list(APPEND ${SRCS} "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py")
+            add_custom_command(
+                    OUTPUT "${CMAKE_CURRENT_BINARY_DIR}/${FIL_WE}_pb2.py"
+                    COMMAND  ${Protobuf_PROTOC_EXECUTABLE} --python_out ${CMAKE_CURRENT_BINARY_DIR} ${_protobuf_include_path} ${ABS_FIL}
+                    DEPENDS ${ABS_FIL} ${Protobuf_PROTOC_EXECUTABLE}
+                    COMMENT "Running Python protocol buffer compiler on ${FIL}"
+                    VERBATIM )
+        endforeach()
+
+        # Export the collected output paths to the caller.
+        set(${SRCS} ${${SRCS}} PARENT_SCOPE)
+    endfunction()
+endif()

 # Print and set the protobuf library information,
 # finish this cmake process and exit from this file.

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -88,7 +88,7 @@
 #

 # including binary directory for generated headers.
-include_directories(${CMAKE_BINARY_DIR})
+include_directories(${CMAKE_CURRENT_BINARY_DIR})

 if(NOT APPLE)
    find_package(Threads REQUIRED)
@@ -99,15 +99,33 @@ function(merge_static_libs TARGET_NAME)
  set(libs ${ARGN})
  list(REMOVE_DUPLICATES libs)

-  # First get the file names of the libraries to be merged
+  # Get all propagation dependencies from the merged libraries
  foreach(lib ${libs})
-    set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
+    list(APPEND libs_deps ${${lib}_LIB_DEPENDS})
  endforeach()

  if(APPLE) # Use OSX's libtool to merge archives
+    # To produce a library we need at least one source file.
+    # It is created by add_custom_command below and will
+    # also help to track dependencies.
    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
+
+    # Make the generated dummy source file depend on all static input
+    # libs. If an input lib changes, the source file is touched,
+    # which causes the desired effect (relink).
+    add_custom_command(OUTPUT ${dummyfile}
+      COMMAND ${CMAKE_COMMAND} -E touch ${dummyfile}
+      DEPENDS ${libs})
+
+    # Generate dummy static lib
    file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
    add_library(${TARGET_NAME} STATIC ${dummyfile})
+    target_link_libraries(${TARGET_NAME} ${libs_deps})
+
+    foreach(lib ${libs})
+      # Get the file names of the libraries to be merged
+      set(libfiles ${libfiles} $<TARGET_FILE:${lib}>)
+    endforeach()
 		add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
      COMMAND rm "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a"
      COMMAND /usr/bin/libtool -static -o "${CMAKE_CURRENT_BINARY_DIR}/lib${TARGET_NAME}.a" ${libfiles})
@@ -117,7 +135,8 @@ function(merge_static_libs TARGET_NAME)
      set(objdir ${lib}.objdir)

      add_custom_command(OUTPUT ${objdir}
-        COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir})
+        COMMAND ${CMAKE_COMMAND} -E make_directory ${objdir}
+        DEPENDS ${lib})

      add_custom_command(OUTPUT ${objlistfile}
        COMMAND ${CMAKE_AR} -x "$<TARGET_FILE:${lib}>"
@@ -134,18 +153,18 @@ function(merge_static_libs TARGET_NAME)
      list(APPEND mergebases "${mergebase}")
    endforeach()

-    # We need a target for the output merged library
    add_library(${TARGET_NAME} STATIC ${mergebases})
+    target_link_libraries(${TARGET_NAME} ${libs_deps})
+
+    # Get the file name of the generated library
    set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")

    foreach(lib ${libs})
      add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-      COMMAND ${CMAKE_AR} ru ${outlibfile} @"../${lib}.objlist"
+        COMMAND ${CMAKE_AR} cr ${outlibfile} *.o
+        COMMAND ${CMAKE_RANLIB} ${outlibfile}
        WORKING_DIRECTORY ${lib}.objdir)
    endforeach()
-
-    add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-      COMMAND ${CMAKE_RANLIB} ${outlibfile})
  endif()
 endfunction(merge_static_libs)

@@ -192,7 +211,7 @@ function(cc_test TARGET_NAME)
    set(multiValueArgs SRCS DEPS)
    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_executable(${TARGET_NAME} ${cc_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main -lstdc++ -lm)
+    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
    add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
  endif()
@@ -285,7 +304,7 @@ function(go_library TARGET_NAME)
  add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
    COMMAND rm "${${TARGET_NAME}_LIB_PATH}"
    # Golang build source code
-    COMMAND env LIBRARY_PATH=${CMAKE_BINARY_DIR}/go/pserver/client/c/:$ENV{LIBRARY_PATH} GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
+    COMMAND GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
    -o "${${TARGET_NAME}_LIB_PATH}"
    "./${CMAKE_CURRENT_SOURCE_REL_DIR}/${GO_SOURCE}"
    # must run under GOPATH
@@ -335,3 +354,12 @@ function(proto_library TARGET_NAME)
  protobuf_generate_cpp(proto_srcs proto_hdrs ${proto_library_SRCS})
  cc_library(${TARGET_NAME} SRCS ${proto_srcs} DEPS ${proto_library_DEPS} protobuf)
 endfunction()
+
+# py_proto_compile(<target> SRCS <proto file>...)
+#
+# Creates a custom target <target>, built as part of ALL, that depends on the
+# Python sources which protobuf_generate_python produces for the given .proto
+# files.
+function(py_proto_compile TARGET_NAME)
+  # Declare all three keyword lists explicitly: `options` was previously left
+  # unset, so cmake_parse_arguments could pick up a stray `options` variable
+  # from the calling scope.
+  set(options "")
+  set(oneValueArgs "")
+  set(multiValueArgs SRCS)
+  cmake_parse_arguments(py_proto_compile "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  set(py_srcs)
+  protobuf_generate_python(py_srcs ${py_proto_compile_SRCS})
+  add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs})
+endfunction()
--- a/doc_theme/templates/layout.html
+++ b/doc_theme/templates/layout.html
@@ -101,7 +101,7 @@
    </div>
    <div class="site-nav-links">
      <div class="site-menu">
-        <a class="fork-on-github" href="https://github.com/PaddlePaddle/Paddle" target="_blank"><i class="fa fa-github"></i>Folk me on Github</a>
+        <a class="fork-on-github" href="https://github.com/PaddlePaddle/Paddle" target="_blank"><i class="fa fa-github"></i>Fork me on Github</a>
        <div class="language-switcher dropdown">
          <a type="button" data-toggle="dropdown">
            <span>English</span>

--- a/go/cmd/pserver/pserver.go
+++ b/go/cmd/pserver/pserver.go
@@ -20,6 +20,8 @@ func main() {
 		"comma separated endpoint string for pserver to connect to etcd")
 	etcdTimeout := flag.Int("etcd-timeout", 5, "timeout for etcd calls")
 	numPservers := flag.Int("num-pservers", 1, "total pserver count in a training job")
+	checkpointPath := flag.String("checkpoint-path", "/checkpoints/", "save checkpoint path")
+	checkpointInterval := flag.Int("checkpoint-interval", 600, "save checkpoint per interval seconds")
 	logLevel := flag.String("log-level", "info",
 		"log level, possible values: debug, info, warning, error, fatal, panic")
 	flag.Parse()
@@ -31,18 +33,20 @@ func main() {
 	log.SetLevel(level)

 	var idx int
+	var cp pserver.Checkpoint
+	var e *pserver.EtcdClient
 	if *index >= 0 {
 		idx = *index
 	} else {
 		timeout := time.Second * time.Duration((*etcdTimeout))
-		e := pserver.NewEtcdClient(*etcdEndpoint, *numPservers, timeout)
+		e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, timeout)
 		idx, err = e.Register()
 		if err != nil {
 			panic(err)
 		}
 	}

-	s, err := pserver.NewService(idx)
+	s, err := pserver.NewService(idx, *checkpointInterval, *checkpointPath, e, cp)
 	if err != nil {
 		panic(err)
 	}

--- a/go/master/c/client.go
+++ b/go/master/c/client.go
@@ -104,11 +104,22 @@ func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int
 	return C.PADDLE_MASTER_OK
 }

+// return value:
+//     0:ok
+//    -1:error
 //export paddle_next_record
 func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int {
 	c := get(client)
-	r := c.NextRecord()
+	r, err := c.NextRecord()
+	if err != nil {
+		// Error
+		// TODO: return the type of error?
+		*record = (*C.uchar)(nullPtr)
+		return -1
+	}
+
 	if len(r) == 0 {
+		// Empty record
 		*record = (*C.uchar)(nullPtr)
 		return 0
 	}

--- a/go/master/client.go
+++ b/go/master/client.go
@@ -11,7 +11,12 @@ import (
 // Client is the client of the master server.
 type Client struct {
 	conn *connection.Conn
-	ch   chan []byte
+	ch   chan record
+}
+
+type record struct {
+	r   []byte
+	err error
 }

 // NewClient creates a new Client.
@@ -21,7 +26,7 @@ type Client struct {
 func NewClient(addrCh <-chan string, bufSize int) *Client {
 	c := &Client{}
 	c.conn = connection.New()
-	c.ch = make(chan []byte, bufSize)
+	c.ch = make(chan record, bufSize)
 	go c.monitorMaster(addrCh)
 	go c.getRecords()
 	return c
@@ -46,10 +51,11 @@ func (c *Client) getRecords() {

 			s := recordio.NewRangeScanner(f, &chunk.Index, -1, -1)
 			for s.Scan() {
-				c.ch <- s.Record()
+				c.ch <- record{s.Record(), nil}
 			}

 			if s.Err() != nil {
+				c.ch <- record{nil, s.Err()}
 				log.Errorln(err, chunk.Path)
 			}

@@ -116,6 +122,7 @@ func (c *Client) taskFinished(taskID int) error {
 //
 // NextRecord will block until the next record is available. It is
 // thread-safe.
-func (c *Client) NextRecord() []byte {
-	return <-c.ch
+func (c *Client) NextRecord() ([]byte, error) {
+	r := <-c.ch
+	return r.r, r.err
 }
--- a/go/master/client_test.go
+++ b/go/master/client_test.go
@@ -68,12 +68,17 @@ func TestNextRecord(t *testing.T) {
 	for pass := 0; pass < 50; pass++ {
 		received := make(map[byte]bool)
 		for i := 0; i < total; i++ {
-			r := c.NextRecord()
+			r, err := c.NextRecord()
+			if err != nil {
+				t.Fatal(pass, i, "Read error:", err)
+			}
+
 			if len(r) != 1 {
-				t.Fatal("Length should be 1.", r)
+				t.Fatal(pass, i, "Length should be 1.", r)
 			}
+
 			if received[r[0]] {
-				t.Fatal("Received duplicate.", received, r)
+				t.Fatal(pass, i, "Received duplicate.", received, r)
 			}
 			received[r[0]] = true
 		}

--- a/go/pserver/client/c/CMakeLists.txt
+++ b/go/pserver/client/c/CMakeLists.txt
 cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf)
+target_link_libraries(paddle_go_optimizer stdc++ m)
 go_library(paddle_pserver_cclient STATIC DEPS paddle_go_optimizer)
 if(WITH_TESTING)
-    # TODO: add unit test
-    #add_subdirectory(test)
+  # FIXME: this test requires pserver which is not managed by the test
+  # we need some kind of e2e testing mechanism.
+  # add_subdirectory(test)
 endif()
--- a/go/pserver/etcd_client.go
+++ b/go/pserver/etcd_client.go
@@ -18,6 +18,8 @@ const (
 	PsDesired = "/ps_desired"
 	// PsAddr is the base dir for pserver to store their addr
 	PsPath = "/ps/"
+	// PsCheckpoint is the etcd path for storing checkpoint information
+	PsCheckpoint = "/checkpoints/"
 )

 // EtcdClient is the etcd client that the pserver uses for fault
@@ -186,3 +188,14 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) {

 	return idx, nil
 }
+
+// PutKey stores value under key in etcd. The request context is cancelled
+// after timeout seconds.
+func (e *EtcdClient) PutKey(key string, value []byte, timeout int) error {
+	limit := time.Second * time.Duration(timeout)
+	ctx, cancel := context.WithTimeout(context.Background(), limit)
+	// Release the context's resources once the Put call has returned.
+	defer cancel()
+
+	_, err := e.etcdClient.Put(ctx, key, string(value))
+	return err
+}
--- a/go/pserver/optimizer.go
+++ b/go/pserver/optimizer.go
 package pserver

 // #cgo CFLAGS: -I ../../
-// #cgo LDFLAGS: -lpaddle_go_optimizer -lstdc++ -lm
+// //FIXME: ldflags contain "build" path
+// #cgo LDFLAGS: ${SRCDIR}/../../build/go/pserver/client/c/libpaddle_go_optimizer.a -lstdc++ -lm
 // #include "paddle/optimizer/optimizer.h"
 // #include <stdlib.h>
 // #include <string.h>
@@ -34,29 +35,41 @@ func cArrayToSlice(p unsafe.Pointer, len int) []byte {
 	return (*[1 << 30]byte)(p)[:len:len]
 }

-func newOptimizer(paramWithConfigs ParameterWithConfig) *optimizer {
+func newOptimizer(paramWithConfigs ParameterWithConfig, State []byte) *optimizer {
 	o := &optimizer{}
 	o.elementType = paramWithConfigs.Param.ElementType
 	p := paramWithConfigs.Param
 	c := paramWithConfigs.Config
+	s := State
 	log.WithFields(log.Fields{
 		"ElementType": p.ElementType,
 		"ParamSize":   len(p.Content),
 		"ConfigSize":  len(c),
+		"StateSize":   len(s),
 	}).Info("New Optimizer Created with config:")
 	var cbuffer unsafe.Pointer
 	cbuffer = C.malloc(C.size_t(len(p.Content)))
 	C.memcpy(cbuffer, unsafe.Pointer(&p.Content[0]), C.size_t(len(p.Content)))
+	var cstate unsafe.Pointer
+	if len(s) != 0 {
+		cstate = unsafe.Pointer(&s[0])
+	}
+
 	o.opt = C.paddle_create_optimizer((*C.uchar)(&c[0]), C.int(len(c)),
-		C.paddle_element_type(p.ElementType), cbuffer, C.int(len(p.Content)/C.sizeof_float),
-		(*C.char)(nullPtr), 0)
+		C.paddle_element_type(p.ElementType), cbuffer, C.int(len(p.Content)/C.sizeof_float), (*C.char)(cstate), C.int(len(s)))
 	return o
 }

 func (o *optimizer) GetWeights() []byte {
 	var buffer unsafe.Pointer
-	buffer_len := C.paddle_optimizer_get_weights(o.opt, &buffer)
-	return cArrayToSlice(buffer, int(buffer_len)*C.sizeof_float)
+	bufferLen := C.paddle_optimizer_get_weights(o.opt, &buffer)
+	return cArrayToSlice(buffer, int(bufferLen)*C.sizeof_float)
+}
+
+// GetStates returns a byte slice over the state buffer reported by
+// paddle_optimizer_get_state for this optimizer.
+func (o *optimizer) GetStates() []byte {
+	var state *C.char
+	stateLen := C.paddle_optimizer_get_state(o.opt, &state)
+	return cArrayToSlice(unsafe.Pointer(state), int(stateLen))
 }

 func (o *optimizer) UpdateParameter(g Gradient) error {

--- a/go/pserver/optimizer_test.go
+++ b/go/pserver/optimizer_test.go
@@ -19,6 +19,6 @@ func TestOptimizerCreateRelease(t *testing.T) {
 		Param:  p,
 		Config: config,
 	}
-	o := newOptimizer(param)
+	o := newOptimizer(param, nil)
 	o.Cleanup()
 }
--- a/go/pserver/service.go
+++ b/go/pserver/service.go
 package pserver

 import (
+	"bufio"
+	"bytes"
+	"crypto/md5"
+	"encoding/gob"
+	"encoding/hex"
+	"encoding/json"
 	"errors"
 	"fmt"
+	"os"
+	"path/filepath"
+	"strconv"
 	"sync"
+	"time"
+
+	log "github.com/sirupsen/logrus"
 )

 // ElementType is the type of elements of a Parameter.
 type ElementType int

 const (
+	// AlreadyInitialized is the message text for an already initialized pserver
 	AlreadyInitialized = "pserver already initialized"
+	// Uninitialized is the error message for a pserver that is not fully initialized
 	Uninitialized = "pserver not fully initialized"
 )

@@ -37,6 +51,22 @@ type ParameterWithConfig struct {
 	Config []byte // parameter configuration in Proto Buffer format
 }

+// ParameterCheckpoint is Parameter and State checkpoint
+type ParameterCheckpoint struct {
+	ParamConfig ParameterWithConfig
+	State       []byte
+}
+
+// checkpoint signature
+type checkpointMeta struct {
+	UUID      string `json:"uuid"`
+	Md5sum    string `json:"md5sum"`
+	Timestamp string `json:"timestamp"`
+}
+
+// Checkpoint is the pserver shard persisted in a file
+type Checkpoint []ParameterCheckpoint
+
 // Gradient is the gradient of the parameter.
 type Gradient Parameter

@@ -44,19 +74,32 @@ type Gradient Parameter
 type Service struct {
 	initialized        chan struct{}
 	idx                int
-
+	checkpointInterval time.Duration
+	checkpointPath     string
+	client             *EtcdClient
 	mu                 sync.Mutex
 	optMap             map[string]*optimizer
 }

 // NewService creates a new service, will bypass etcd registration if no
 // endpoints specified.
-func NewService(idx int) (*Service, error) {
+func NewService(idx int, seconds int, path string, client *EtcdClient, cp Checkpoint) (*Service, error) {
 	s := &Service{
 		idx:                idx,
+		checkpointInterval: time.Second * time.Duration(seconds),
+		checkpointPath:     path,
+		client:             client,
 	}
 	s.optMap = make(map[string]*optimizer)
 	s.initialized = make(chan struct{})
+
+	if cp != nil {
+		for _, item := range cp {
+			p := item.ParamConfig
+			st := item.State
+			s.optMap[p.Param.Name] = newOptimizer(p, st)
+		}
+	}
 	return s, nil
 }

@@ -76,7 +119,7 @@ func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) er
 	// TODO(helin): check if paramWithConfigs.Param.Content is
 	// properly memory aligned, if not, make copy to a memory
 	// aligned region.
-	s.optMap[paramWithConfigs.Param.Name] = newOptimizer(paramWithConfigs)
+	s.optMap[paramWithConfigs.Param.Name] = newOptimizer(paramWithConfigs, nil)
 	return nil
 }

@@ -137,10 +180,57 @@ func (s *Service) GetParam(name string, parameter *Parameter) error {
 	return nil
 }

-// Save tells the parameter server to save parameters.
-func (s *Service) Save(path string, dummy *int) error {
+// doCheckpoint saves a checkpoint of this pserver shard.
+//
+// It blocks until the service is initialized, then, under s.mu, gob-encodes
+// the name, element type, weights and optimizer state of every entry in
+// s.optMap. The checkpoint metadata (file path, md5 of the encoded payload,
+// timestamp) is published to etcd under PsCheckpoint/<idx> when an etcd
+// client is configured, and the payload is written to
+// s.checkpointPath + <idx>, replacing any previous file.
+// It returns the first encoding, etcd or file error encountered.
+func (s *Service) doCheckpoint() error {
 	<-s.initialized
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	// The slice starts empty with reserved capacity, so entries must be
+	// appended; indexing into it would panic.
+	cp := make([]ParameterCheckpoint, 0, len(s.optMap))
+	for name, opt := range s.optMap {
+		var pc ParameterCheckpoint
+		pc.ParamConfig.Param.Name = name
+		pc.ParamConfig.Param.ElementType = opt.elementType
+		pc.ParamConfig.Param.Content = opt.GetWeights()
+		pc.State = opt.GetStates()
+		cp = append(cp, pc)
+	}
+	var buf bytes.Buffer
+	encoder := gob.NewEncoder(&buf)
+	err := encoder.Encode(cp)
+	if err != nil {
+		return err
+	}

-	// TODO
+	cpMeta := checkpointMeta{}
+	cpMeta.UUID = s.checkpointPath + strconv.Itoa(s.idx)
+	cpMeta.Timestamp = time.Now().String()
+	// md5.Sum hashes the payload itself; hash.Hash.Sum(b) would only append
+	// the digest of the (empty) hash state to b.
+	digest := md5.Sum(buf.Bytes())
+	cpMeta.Md5sum = hex.EncodeToString(digest[:])
+
+	cpMetajson, err := json.Marshal(cpMeta)
+	if err != nil {
+		return err
+	}
+	// s.client is nil when the service was created without an etcd client;
+	// in that case only the checkpoint file is written.
+	if s.client != nil {
+		err = s.client.PutKey(filepath.Join(PsCheckpoint, strconv.Itoa(s.idx)), cpMetajson, 3)
+		if err != nil {
+			return err
+		}
+	}
+	if _, err = os.Stat(cpMeta.UUID); os.IsNotExist(err) {
+		log.Info("checkpoint does not exists.")
+	} else {
+		log.Infof("checkpoint %s already exsits, removing ", cpMeta.UUID)
+		// A failed removal is not fatal: os.Create below truncates an
+		// existing file, so just report it.
+		if rmErr := os.Remove(cpMeta.UUID); rmErr != nil {
+			log.Warnln(rmErr)
+		}
+	}
+	f, err := os.Create(cpMeta.UUID)
+	if err != nil {
+		return err
+	}
+	// Register Close only after Create succeeded, so f is never nil here.
+	defer f.Close()
+	writer := bufio.NewWriter(f)
+	if _, err = writer.Write(buf.Bytes()); err != nil {
+		return err
+	}
+	// Flush can fail too; without this check a short write would go unnoticed.
+	if err = writer.Flush(); err != nil {
+		return err
+	}
 	return nil
 }
--- a/go/pserver/service_test.go
+++ b/go/pserver/service_test.go
@@ -15,7 +15,8 @@ const (
 )

 func TestServiceFull(t *testing.T) {
-	s, err := pserver.NewService(0)
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, 1, "", nil, cp)
 	if err != nil {
 		t.Error(err)
 	}
@@ -86,7 +87,8 @@ func TestServiceFull(t *testing.T) {
 }

 func TestMultipleInit(t *testing.T) {
-	s, err := pserver.NewService(0)
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, 1, "", nil, cp)
 	if err != nil {
 		t.Error(err)
 	}
@@ -102,7 +104,8 @@ func TestMultipleInit(t *testing.T) {
 }

 func TestUninitialized(t *testing.T) {
-	s, err := pserver.NewService(0)
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, 1, "", nil, cp)
 	err = s.SendGrad(pserver.Gradient{}, nil)
 	if err.Error() != pserver.Uninitialized {
 		t.FailNow()
@@ -110,7 +113,8 @@ func TestUninitialized(t *testing.T) {
 }

 func TestBlockUntilInitialized(t *testing.T) {
-	s, err := pserver.NewService(0)
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, 1, "", nil, cp)
 	if err != nil {
 		t.Error(err)
 	}
@@ -128,16 +132,6 @@ func TestBlockUntilInitialized(t *testing.T) {
 		ch <- struct{}{}
 	}()

-	wg.Add(1)
-	go func() {
-		err := s.Save("", nil)
-		if err != nil {
-			errCh <- err
-		}
-		wg.Done()
-		ch <- struct{}{}
-	}()
-
 	time.Sleep(50 * time.Millisecond)

 	select {
@@ -170,3 +164,7 @@ func TestBlockUntilInitialized(t *testing.T) {

 	wg.Wait()
 }
+
+func TestCheckpointSpeed(t *testing.T) {
+	//TODO(zhihong): test speed
+}
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -9,6 +9,10 @@ cc_test(enforce_test SRCS enforce_test.cc)
 proto_library(attr_type SRCS attr_type.proto)
 proto_library(op_proto SRCS op_proto.proto DEPS attr_type)
 cc_test(op_proto_test SRCS op_proto_test.cc DEPS op_proto protobuf)
-
 proto_library(op_desc SRCS op_desc.proto DEPS attr_type)
 cc_test(op_desc_test SRCS op_desc_test.cc DEPS op_desc protobuf)
+cc_test(op_registry_test SRCS op_registry_test.cc DEPS op_proto op_desc)
+py_proto_compile(framework_py_proto SRCS attr_type.proto op_proto.proto op_desc.proto)
+# Generate an empty __init__.py to make framework_py_proto as a valid python module.
+add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
+add_dependencies(framework_py_proto framework_py_proto_init)
--- a/paddle/framework/attr_checker.h
+++ b/paddle/framework/attr_checker.h
+#pragma once
+
+#include <boost/variant.hpp>
+#include <functional>
+#include <string>
+#include <unordered_map>
+#include <vector>
+#include "paddle/framework/enforce.h"
+
+namespace paddle {
+namespace framework {
+
+typedef boost::variant<boost::blank, int, float, std::string, std::vector<int>,
+                       std::vector<float>, std::vector<std::string>>
+    Attribute;
+typedef std::unordered_map<std::string, Attribute> AttributeMap;
+
+// Functor that checks an attribute value against a lower bound: the value
+// must be strictly greater than the bound given at construction. The check
+// is made with PADDLE_ENFORCE; the value itself is not modified.
+template <typename T>
+class LargerThanChecker {
+ public:
+  LargerThanChecker(T lower_bound) : lower_bound_(lower_bound) {}
+  void operator()(T& value) const {
+    PADDLE_ENFORCE(value > lower_bound_, "larger_than check fail");
+  }
+
+ private:
+  T lower_bound_;  // exclusive lower bound
+};
+
+// we can provide users more common Checker, like 'LessThanChecker',
+// 'BetweenChecker'...
+
+// Functor that overwrites the value it is given with the default value
+// stored at construction.
+template <typename T>
+class DefaultValueSetter {
+ public:
+  DefaultValueSetter(T default_value) : default_value_(default_value) {}
+  void operator()(T& value) const { value = default_value_; }
+
+ private:
+  T default_value_;  // value assigned by operator()
+};
+
+// Checks whether one attribute, identified by name and holding a value of
+// type T, fits its limits. An attribute can have more than one limit, and
+// at most one default value that is used when the attribute is absent.
+template <typename T>
+class TypedAttrChecker {
+  typedef std::function<void(T&)> ValueChecker;
+
+ public:
+  TypedAttrChecker(const std::string& attr_name) : attr_name_(attr_name) {}
+
+  // Adds the limit "value > lower_bound". Returns *this for chaining.
+  TypedAttrChecker& LargerThan(const T& lower_bound) {
+    value_checkers_.push_back(LargerThanChecker<T>(lower_bound));
+    return *this;
+  }
+
+  // we can add more common limits, like LessThan(), Between()...
+
+  // Sets the default value; enforces that no default was set before.
+  // Returns *this for chaining.
+  TypedAttrChecker& SetDefault(const T& default_value) {
+    PADDLE_ENFORCE(default_value_setter_.empty(),
+                   "%s can't have more than one default value!", attr_name_);
+    default_value_setter_.push_back(DefaultValueSetter<T>(default_value));
+    return *this;
+  }
+
+  // Allows users to provide their own checker. Returns *this for chaining.
+  TypedAttrChecker& AddCustomChecker(const ValueChecker& checker) {
+    value_checkers_.push_back(checker);
+    return *this;
+  }
+
+  // Applies this checker to attr_map: inserts the default value when the
+  // attribute is missing (enforcing that a default exists), then runs every
+  // registered value checker on the attribute's value.
+  void operator()(AttributeMap& attr_map) const {
+    if (!attr_map.count(attr_name_)) {
+      // user do not set this attr
+      PADDLE_ENFORCE(!default_value_setter_.empty(),
+                     "Attribute '%s' is required!", attr_name_);
+      // default_value_setter_ has no more than one element
+      T val;
+      (default_value_setter_[0])(val);
+      attr_map[attr_name_] = val;
+    }
+    Attribute& attr = attr_map.at(attr_name_);
+    // boost::get on a reference throws boost::bad_get when the variant does
+    // not currently hold a T.
+    T& attr_value = boost::get<T>(attr);
+    for (const auto& checker : value_checkers_) {
+      checker(attr_value);
+    }
+  }
+
+ private:
+  std::string attr_name_;
+  std::vector<ValueChecker> value_checkers_;
+  // Holds zero or one setter; a vector is used so "unset" can be detected.
+  std::vector<ValueChecker> default_value_setter_;
+};
+
+// Checks whether all attributes of an op fit their own limits. It owns one
+// type-erased checker per registered attribute.
+class OpAttrChecker {
+  typedef std::function<void(AttributeMap&)> AttrChecker;
+
+ public:
+  // Registers a TypedAttrChecker<T> for attr_name and returns a reference to
+  // the stored instance so that limits can be chained onto it.
+  // NOTE(review): the reference points at the target held inside the
+  // std::function stored in attr_checkers_; confirm it stays valid when
+  // later registrations grow the vector.
+  template <typename T>
+  TypedAttrChecker<T>& AddAttrChecker(const std::string& attr_name) {
+    attr_checkers_.push_back(TypedAttrChecker<T>(attr_name));
+    AttrChecker& checker = attr_checkers_.back();
+    return *(checker.target<TypedAttrChecker<T>>());
+  }
+
+  // Runs every registered attribute checker on attr_map, in registration
+  // order.
+  void Check(AttributeMap& attr_map) const {
+    for (const auto& checker : attr_checkers_) {
+      checker(attr_map);
+    }
+  }
+
+ private:
+  std::vector<AttrChecker> attr_checkers_;
+};
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
+#pragma once
+
+#include "paddle/framework/attr_checker.h"
+
+//#include "paddle/framework/op_base.h"
+#include "paddle/framework/op_desc.pb.h"
+#include "paddle/framework/op_proto.pb.h"
+
+namespace paddle {
+namespace framework {
+
+//==================For test================//
+// Minimal operator interface: the input names, output names and attribute
+// map are public members, and Run() must be implemented by subclasses.
+class OpBase {
+ public:
+  std::vector<std::string> inputs_;   // input names
+  std::vector<std::string> outputs_;  // output names
+  AttributeMap attr_map_;             // attribute name -> value
+
+  virtual std::string Run() const = 0;
+  virtual ~OpBase() {}
+};
+//=========================================//
+
+// Helper for translating between C++ attribute types and the protobuf
+// attribute representation.
+struct AttrTypeHelper {
+  // Sets the type field of `attr` to the AttrType matching T. Only the
+  // explicit specializations are defined.
+  template <typename T>
+  static void SetAttrType(AttrProto* attr);
+
+  // Converts attr_desc into an Attribute holding the value that matches
+  // attr_desc.type(); repeated fields are copied into std::vector.
+  // An unknown type triggers PADDLE_ENFORCE(false, ...).
+  static Attribute GetAttrValue(const AttrDesc& attr_desc) {
+    switch (attr_desc.type()) {
+      case paddle::framework::AttrType::INT: {
+        return attr_desc.i();
+      }
+      case paddle::framework::AttrType::FLOAT: {
+        return attr_desc.f();
+      }
+      case paddle::framework::AttrType::STRING: {
+        return attr_desc.s();
+      }
+      case paddle::framework::AttrType::INTS: {
+        std::vector<int> val(attr_desc.ints_size());
+        for (int i = 0; i < attr_desc.ints_size(); ++i) {
+          val[i] = attr_desc.ints(i);
+        }
+        return val;
+      }
+      case paddle::framework::AttrType::FLOATS: {
+        std::vector<float> val(attr_desc.floats_size());
+        for (int i = 0; i < attr_desc.floats_size(); ++i) {
+          val[i] = attr_desc.floats(i);
+        }
+        return val;
+      }
+      case paddle::framework::AttrType::STRINGS: {
+        std::vector<std::string> val(attr_desc.strings_size());
+        for (int i = 0; i < attr_desc.strings_size(); ++i) {
+          val[i] = attr_desc.strings(i);
+        }
+        return val;
+      }
+    }
+    PADDLE_ENFORCE(false, "Unknown OpDesc::AttrDesc::type !");
+    // Fallback return value placed after the enforce above.
+    return boost::blank();
+  }
+};
+
+// Explicit specializations of AttrTypeHelper::SetAttrType, one per supported
+// attribute type. Each sets the type field of `attr` to the matching
+// AttrType value.
+//
+// A full specialization of a function template is an ordinary function and is
+// not implicitly inline, so these header-level definitions are marked
+// `inline` to keep the header safe to include from several translation units.
+template <>
+inline void AttrTypeHelper::SetAttrType<int>(AttrProto* attr) {
+  attr->set_type(paddle::framework::AttrType::INT);
+}
+
+template <>
+inline void AttrTypeHelper::SetAttrType<float>(AttrProto* attr) {
+  attr->set_type(paddle::framework::AttrType::FLOAT);
+}
+
+template <>
+inline void AttrTypeHelper::SetAttrType<std::string>(AttrProto* attr) {
+  attr->set_type(paddle::framework::AttrType::STRING);
+}
+
+template <>
+inline void AttrTypeHelper::SetAttrType<std::vector<int>>(AttrProto* attr) {
+  attr->set_type(paddle::framework::AttrType::INTS);
+}
+
+template <>
+inline void AttrTypeHelper::SetAttrType<std::vector<float>>(AttrProto* attr) {
+  attr->set_type(paddle::framework::AttrType::FLOATS);
+}
+
+template <>
+inline void AttrTypeHelper::SetAttrType<std::vector<std::string>>(
+    AttrProto* attr) {
+  attr->set_type(paddle::framework::AttrType::STRINGS);
+}
+
+// Base class for op "makers": it fills in an OpProto and, at the same time,
+// registers the attribute checkers of the op. Both pointers are stored
+// without taking ownership; the helpers below are protected.
+class OpProtoAndCheckerMaker {
+ public:
+  OpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : proto_(proto), op_checker_(op_checker) {}
+
+ protected:
+  // Appends an input entry with the given name and comment to the proto.
+  void AddInput(const std::string& name, const std::string& comment) {
+    auto input = proto_->mutable_inputs()->Add();
+    *(input->mutable_name()) = name;
+    *(input->mutable_comment()) = comment;
+  }
+
+  // Appends an output entry with the given name and comment to the proto.
+  void AddOutput(const std::string& name, const std::string& comment) {
+    auto output = proto_->mutable_outputs()->Add();
+    *(output->mutable_name()) = name;
+    *(output->mutable_comment()) = comment;
+  }
+
+  // Appends an attribute entry of type T to the proto and registers a
+  // TypedAttrChecker<T> for it; the returned checker can be used to chain
+  // limits and a default value.
+  template <typename T>
+  TypedAttrChecker<T>& AddAttr(const std::string& name,
+                               const std::string& comment) {
+    auto attr = proto_->mutable_attrs()->Add();
+    *(attr->mutable_name()) = name;
+    *(attr->mutable_comment()) = comment;
+    AttrTypeHelper::SetAttrType<T>(attr);
+    return op_checker_->AddAttrChecker<T>(name);
+  }
+
+  // Sets the op type name stored in the proto.
+  void AddType(const std::string& op_type) { proto_->set_type(op_type); }
+
+  // Sets the op-level comment stored in the proto.
+  void AddComment(const std::string& comment) {
+    *(proto_->mutable_comment()) = comment;
+  }
+
+  OpProto* proto_;
+  OpAttrChecker* op_checker_;
+};
+
+// Static registry keyed by operator type name. For every registered type it
+// keeps a factory (creators_), the OpProto (protos_) and the attribute checker
+// (op_checkers_).
+class OpRegistry {
+  typedef std::function<OpBase*()> OpCreator;
+
+ public:
+  // Registers OpType under `op_type`. The map entries for the proto and the
+  // checker are default-constructed by operator[] and then populated by
+  // constructing a temporary ProtoMakerType on them; the proto must come out
+  // fully initialized or PADDLE_ENFORCE fires.
+  template <typename OpType, typename ProtoMakerType>
+  static void RegisterOp(const std::string& op_type) {
+    creators_[op_type] = []() { return new OpType; };
+    OpProto& op_proto = protos_[op_type];
+    OpAttrChecker& op_checker = op_checkers_[op_type];
+    ProtoMakerType(&op_proto, &op_checker);
+    PADDLE_ENFORCE(op_proto.IsInitialized() == true,
+                   "Fail to initialize %s's OpProto !", op_type);
+  }
+
+  // Instantiates the operator named by op_desc.type(), copies the inputs,
+  // outputs and attributes from the description into it and runs the
+  // registered attribute checker on the resulting attribute map.
+  // std::unordered_map::at throws std::out_of_range for an unregistered type.
+  // Returns a raw pointer obtained from `new`; nothing in this class deletes
+  // it.
+  // NOTE(review): if GetAttrValue or Check throws, `op` is not freed on the
+  // way out -- looks like a leak on the error path; confirm and fix.
+  static OpBase* CreateOp(const OpDesc& op_desc) {
+    std::string op_type = op_desc.type();
+    OpBase* op = (creators_.at(op_type))();
+    (op->inputs_).resize(op_desc.inputs_size());
+    for (int i = 0; i < op_desc.inputs_size(); ++i) {
+      (op->inputs_)[i] = op_desc.inputs(i);
+    }
+    (op->outputs_).resize(op_desc.outputs_size());
+    for (int i = 0; i < op_desc.outputs_size(); ++i) {
+      (op->outputs_)[i] = op_desc.outputs(i);
+    }
+    for (int i = 0; i < op_desc.attrs_size(); ++i) {
+      const AttrDesc& ith_attr = op_desc.attrs(i);
+      std::string name = ith_attr.name();
+      (op->attr_map_)[name] = AttrTypeHelper::GetAttrValue(ith_attr);
+    }
+    const OpAttrChecker& op_checker = op_checkers_.at(op_type);
+    op_checker.Check(op->attr_map_);
+    return op;
+  }
+
+ private:
+  static std::unordered_map<std::string, OpCreator> creators_;
+  static std::unordered_map<std::string, OpProto> protos_;
+  static std::unordered_map<std::string, OpAttrChecker> op_checkers_;
+};
+
+// Out-of-class definitions of OpRegistry's static maps.
+// NOTE(review): if this file is a header included from more than one
+// translation unit, these non-inline definitions would be defined multiple
+// times at link time -- confirm, and move them to a .cc file if so.
+std::unordered_map<std::string, std::function<OpBase*()>> OpRegistry::creators_;
+std::unordered_map<std::string, OpProto> OpRegistry::protos_;
+std::unordered_map<std::string, OpAttrChecker> OpRegistry::op_checkers_;
+
+// Helper whose constructor performs the registration; creating an instance
+// with static storage duration therefore registers the operator at program
+// start-up (this is how REGISTER_OP uses it).
+template <typename OpType, typename ProtoMakerType>
+class OpRegisterHelper {
+ public:
+  OpRegisterHelper(std::string op_type) {
+    OpRegistry::RegisterOp<OpType, ProtoMakerType>(op_type);
+  }
+};
+
+// Registers operator class `__op_class` (described by `__op_maker_class`)
+// under the stringized name `__op_type`. Expands to a class named
+// <__op_class>Register holding a private static OpRegisterHelper member `reg`,
+// followed by the definition of that member; constructing `reg` calls
+// OpRegistry::RegisterOp. Because the generated class name is derived from
+// `__op_class`, the macro can be used only once per operator class.
+#define REGISTER_OP(__op_class, __op_maker_class, __op_type)         \
+  class __op_class##Register {                                       \
+   private:                                                          \
+    const static OpRegisterHelper<__op_class, __op_maker_class> reg; \
+  };                                                                 \
+  const OpRegisterHelper<__op_class, __op_maker_class>               \
+      __op_class##Register::reg(#__op_type);
+
+// Demos
+
+// Demo operator: Run() only reports the value of its float attribute "scale".
+class CosineOp : public OpBase {
+ public:
+  // Returns "CosineOp runs! scale = <scale>". attr_map_.at throws if "scale"
+  // is absent and boost::get<float> throws if it holds another type.
+  virtual std::string Run() const {
+    std::string msg = "CosineOp runs! scale = " +
+                      std::to_string(boost::get<float>(attr_map_.at("scale")));
+    return msg;
+  }
+};
+
+// Maker for CosineOp: one input, one output and a float attribute "scale"
+// that defaults to 1.0 and must be larger than 0.0.
+class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+ public:
+  CosineOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("input", "input of cosine op");
+    AddOutput("output", "output of cosine op");
+    AddAttr<float>("scale", "scale of cosine op")
+        .SetDefault(1.0)
+        .LargerThan(0.0);
+    // NOTE(review): the proto type is "cos" while the REGISTER_OP call below
+    // registers the op under "cos_sim" -- confirm which name is intended.
+    AddType("cos");
+    AddComment("This is cos op");
+  }
+};
+
+// Makes CosineOp creatable through OpRegistry under the name "cos_sim".
+REGISTER_OP(CosineOp, CosineOpProtoAndCheckerMaker, cos_sim)
+
+// Demo operator: Run() only reports the value of its int attribute
+// "test_attr".
+class MyTestOp : public OpBase {
+ public:
+  // Returns "MyTestOp runs! test_attr = <test_attr>".
+  virtual std::string Run() const {
+    std::string msg =
+        "MyTestOp runs! test_attr = " +
+        std::to_string(boost::get<int>(attr_map_.at("test_attr")));
+    return msg;
+  }
+};
+
+// Maker for MyTestOp: one input, one output and an int attribute "test_attr"
+// with no default, validated by a custom checker that accepts only even
+// values.
+class MyTestOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+ public:
+  MyTestOpProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    // The comments still say "cosine op"; they are runtime strings stored in
+    // the proto and are left untouched here.
+    AddInput("input", "input of cosine op");
+    AddOutput("output", "output of cosine op");
+    // Custom validation hook: enforce that the attribute value is even.
+    auto my_checker = [](int i) {
+      PADDLE_ENFORCE(i % 2 == 0, "'test_attr' must be even!");
+    };
+    AddAttr<int>("test_attr", "a simple test attribute")
+        .AddCustomChecker(my_checker);
+    AddType("my_test_op");
+    AddComment("This is my_test op");
+  }
+};
+
+// Makes MyTestOp creatable through OpRegistry under the name "my_test_op".
+REGISTER_OP(MyTestOp, MyTestOpProtoAndCheckerMaker, my_test_op)
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
+#include "paddle/framework/op_registry.h"
+#include <gtest/gtest.h>
+
+// Creating a registered op ("cos_sim") from an OpDesc carries the explicitly
+// set "scale" attribute through to the operator, as observed via Run().
+TEST(OpRegistry, CreateOp) {
+  paddle::framework::OpDesc op_desc;
+  op_desc.set_type("cos_sim");
+  op_desc.add_inputs("aa");
+  op_desc.add_outputs("bb");
+
+  auto attr = op_desc.mutable_attrs()->Add();
+  attr->set_name("scale");
+  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_f(3.3);
+
+  // NOTE(review): the returned op is never deleted in this test.
+  paddle::framework::OpBase* op =
+      paddle::framework::OpRegistry::CreateOp(op_desc);
+  std::string debug_str = op->Run();
+  std::string str = "CosineOp runs! scale = " + std::to_string(3.3);
+  // Compare length first, then character by character.
+  ASSERT_EQ(str.size(), debug_str.size());
+  for (size_t i = 0; i < debug_str.length(); ++i) {
+    ASSERT_EQ(debug_str[i], str[i]);
+  }
+}
+
+// A "scale" value that violates the LargerThan(0.0) constraint makes CreateOp
+// throw EnforceNotMet with a message starting with "larger_than check fail".
+TEST(OpRegistry, IllegalAttr) {
+  paddle::framework::OpDesc op_desc;
+  op_desc.set_type("cos_sim");
+  op_desc.add_inputs("aa");
+  op_desc.add_outputs("bb");
+
+  auto attr = op_desc.mutable_attrs()->Add();
+  attr->set_name("scale");
+  attr->set_type(paddle::framework::AttrType::FLOAT);
+  attr->set_f(-2.0);
+
+  bool caught = false;
+  try {
+    paddle::framework::OpBase* op __attribute__((unused)) =
+        paddle::framework::OpRegistry::CreateOp(op_desc);
+  } catch (paddle::framework::EnforceNotMet err) {
+    caught = true;
+    std::string msg = "larger_than check fail";
+    const char* err_msg = err.what();
+    // Only the first msg.length() characters are compared, i.e. this is a
+    // prefix match on the error text.
+    for (size_t i = 0; i < msg.length(); ++i) {
+      ASSERT_EQ(err_msg[i], msg[i]);
+    }
+  }
+  ASSERT_TRUE(caught);
+}
+
+// When the OpDesc carries no "scale" attribute, the created op reports the
+// default value 1.0 declared by CosineOpProtoAndCheckerMaker.
+TEST(OpRegistry, DefaultValue) {
+  paddle::framework::OpDesc op_desc;
+  op_desc.set_type("cos_sim");
+  op_desc.add_inputs("aa");
+  op_desc.add_outputs("bb");
+
+  // No attributes are added on purpose.
+  paddle::framework::OpBase* op =
+      paddle::framework::OpRegistry::CreateOp(op_desc);
+  std::string debug_str = op->Run();
+  float default_value = 1.0;
+  std::string str = "CosineOp runs! scale = " + std::to_string(default_value);
+  ASSERT_EQ(str.size(), debug_str.size());
+  for (size_t i = 0; i < debug_str.length(); ++i) {
+    ASSERT_EQ(debug_str[i], str[i]);
+  }
+}
+
+// Exercises the "my_test_op" attribute checks in three steps: missing
+// attribute, odd (illegal) value, even (legal) value.
+TEST(OpRegistry, CustomChecker) {
+  paddle::framework::OpDesc op_desc;
+  op_desc.set_type("my_test_op");
+  op_desc.add_inputs("ii");
+  op_desc.add_outputs("oo");
+
+  // Step 1: attribute 'test_attr' is not set and has no default, so CreateOp
+  // must fail with a "required" error.
+  bool caught = false;
+  try {
+    paddle::framework::OpBase* op __attribute__((unused)) =
+        paddle::framework::OpRegistry::CreateOp(op_desc);
+  } catch (paddle::framework::EnforceNotMet err) {
+    caught = true;
+    std::string msg = "Attribute 'test_attr' is required!";
+    const char* err_msg = err.what();
+    for (size_t i = 0; i < msg.length(); ++i) {
+      ASSERT_EQ(err_msg[i], msg[i]);
+    }
+  }
+  ASSERT_TRUE(caught);
+
+  // Step 2: set 'test_attr' to an illegal (odd) value; the custom checker
+  // must reject it.
+  auto attr = op_desc.mutable_attrs()->Add();
+  attr->set_name("test_attr");
+  attr->set_type(paddle::framework::AttrType::INT);
+  attr->set_i(3);
+  caught = false;
+  try {
+    paddle::framework::OpBase* op __attribute__((unused)) =
+        paddle::framework::OpRegistry::CreateOp(op_desc);
+  } catch (paddle::framework::EnforceNotMet err) {
+    caught = true;
+    std::string msg = "'test_attr' must be even!";
+    const char* err_msg = err.what();
+    for (size_t i = 0; i < msg.length(); ++i) {
+      ASSERT_EQ(err_msg[i], msg[i]);
+    }
+  }
+  ASSERT_TRUE(caught);
+
+  // Step 3: replace the attribute with a legal (even) value; creation
+  // succeeds and Run() reports it.
+  op_desc.mutable_attrs()->Clear();
+  attr = op_desc.mutable_attrs()->Add();
+  attr->set_name("test_attr");
+  attr->set_type(paddle::framework::AttrType::INT);
+  attr->set_i(4);
+  paddle::framework::OpBase* op =
+      paddle::framework::OpRegistry::CreateOp(op_desc);
+  std::string debug_str = op->Run();
+  std::string str = "MyTestOp runs! test_attr = " + std::to_string(4);
+  ASSERT_EQ(str.size(), debug_str.size());
+  for (size_t i = 0; i < debug_str.length(); ++i) {
+    ASSERT_EQ(debug_str[i], str[i]);
+  }
+}
\ No newline at end of file
--- a/paddle/optimizer/adadelta_optimizer.cc
+++ b/paddle/optimizer/adadelta_optimizer.cc
@@ -27,22 +27,24 @@ void AdadeltaOptimizer::Update(const Tensor* gradient) {

 const char* AdadeltaOptimizer::SerializeState(int* state_len) {
  AdadeltaOptimizerState state;
-  // TODO(zhihong) : add lr_policy serialization
  state.set_num_sample_passed(num_sample_passed_);
+  std::string lr_str = this->lr_policy_->SerializeState(state_len);
+  state.mutable_lr_state()->ParseFromString(lr_str);

  TensorToProto(*parameter_, state.mutable_parameter());
  TensorToProto(*accum_gradient_, state.mutable_accum_gradient());
  TensorToProto(*accum_delta_, state.mutable_accum_delta());
  TensorToProto(*update_delta_, state.mutable_update_delta());
  auto str = state.SerializeAsString();
-  *state_len = str.size();
+  *state_len += str.size();
  return str.c_str();
 }

 void AdadeltaOptimizer::DeserializeState(const std::string& str) {
  AdadeltaOptimizerState state;
  state.ParseFromString(str);
-  // TODO(zhihong) : add lr_policy DeserializeState
+  auto lr_state = state.lr_state();
+  this->lr_policy_->DeserializeState(lr_state.SerializeAsString());
  num_sample_passed_ = state.num_sample_passed();

  ProtoToTensor(state.parameter(), parameter_);

--- a/paddle/optimizer/adagrad_optimizer.cc
+++ b/paddle/optimizer/adagrad_optimizer.cc
@@ -19,20 +19,23 @@ void AdagradOptimizer::Update(const Tensor* gradient) {
 }
 const char* AdagradOptimizer::SerializeState(int* state_len) {
  AdagradOptimizerState state;
-  // TODO(zhihong) : add lr_policy serialization
  state.set_num_sample_passed(num_sample_passed_);
+  std::string lr_str = this->lr_policy_->SerializeState(state_len);
+  state.mutable_lr_state()->ParseFromString(lr_str);

  TensorToProto(*parameter_, state.mutable_parameter());
  TensorToProto(*accum_gradient_, state.mutable_accum_gradient());
  auto str = state.SerializeAsString();
-  *state_len = str.size();
+  *state_len += str.size();
  return str.c_str();
 }

 void AdagradOptimizer::DeserializeState(const std::string& str) {
  AdagradOptimizerState state;
  state.ParseFromString(str);
-  // TODO(zhihong) : add lr_policy DeserializeState
+  auto lr_state = state.lr_state();
+  this->lr_policy_->DeserializeState(lr_state.SerializeAsString());
+
  num_sample_passed_ = state.num_sample_passed();
  ProtoToTensor(state.parameter(), parameter_);
  ProtoToTensor(state.accum_gradient(), accum_gradient_);

--- a/paddle/optimizer/adam_optimizer.cc
+++ b/paddle/optimizer/adam_optimizer.cc
@@ -24,20 +24,23 @@ void AdamOptimizer::Update(const Tensor *gradient) {

 const char *AdamOptimizer::SerializeState(int *state_len) {
  AdamOptimizerState state;
-  // TODO(zhihong) : add lr_policy serialization
+  std::string lr_str = this->lr_policy_->SerializeState(state_len);
+  state.mutable_lr_state()->ParseFromString(lr_str);
  state.set_num_sample_passed(num_sample_passed_);
+
  TensorToProto(*parameter_, state.mutable_parameter());
  TensorToProto(*momentums_, state.mutable_momentums());
  TensorToProto(*velocitys_, state.mutable_velocitys());
  auto str = state.SerializeAsString();
-  *state_len = str.size();
+  *state_len += str.size();
  return str.c_str();
 }

 void AdamOptimizer::DeserializeState(const std::string &str) {
  AdamOptimizerState state;
  state.ParseFromString(str);
-  // TODO(zhihong) : add lr_policy DeserializeState
+  auto lr_state = state.lr_state();
+  this->lr_policy_->DeserializeState(lr_state.SerializeAsString());
  num_sample_passed_ = state.num_sample_passed();

  ProtoToTensor(state.parameter(), parameter_);

--- a/paddle/optimizer/lr_policy.h
+++ b/paddle/optimizer/lr_policy.h
@@ -17,36 +17,56 @@ public:
 // constant learning rate policy
 class ConstLr final : public LrPolicy {
 public:
-  ConstLr(double lr) : learning_rate(lr){};
+  ConstLr(double lr) : learning_rate_(lr){};
  double LearningRate(const uint64_t num_sample_passed) {
-    return learning_rate;
+    return learning_rate_;
+  }
+  const char *SerializeState(int *state_len) {
+    LrPolicyState state;
+    state.set_learning_rate(learning_rate_);
+    auto str = state.SerializeAsString();
+    *state_len = str.size();
+    return str.c_str();
+  }
+  void DeserializeState(const std::string &str) {
+    LrPolicyState state;
+    state.ParseFromString(str);
+    learning_rate_ = state.learning_rate();
  }
-  const char *SerializeState(int *state_len) { return nullptr; }
-  void DeserializeState(const std::string &state) {}

 private:
-  double learning_rate;
+  double learning_rate_;
 };

 class LinearLr final : public LrPolicy {
 public:
  LinearLr(double lr, double lr_decay_a, double lr_decay_b)
-      : learning_rate(lr), lr_decay_a(lr_decay_a), lr_decay_b(lr_decay_b) {}
+      : learning_rate_(lr), lr_decay_a_(lr_decay_a), lr_decay_b_(lr_decay_b) {}
  double LearningRate(const uint64_t num_sample_passed) {
-    return std::max(learning_rate - lr_decay_a * num_sample_passed, lr_decay_b);
+    return std::max(learning_rate_ - lr_decay_a_ * num_sample_passed,
+                    lr_decay_b_);
  }
  const char *SerializeState(int *state_len) {
-    // TODO(zhihong) : add lr_policy serialization
-    return nullptr;
+    LrPolicyState state;
+    state.set_learning_rate(learning_rate_);
+    state.set_lr_decay_a(lr_decay_a_);
+    state.set_lr_decay_b(lr_decay_b_);
+    auto str = state.SerializeAsString();
+    *state_len = str.size();
+    return str.c_str();
  }
-  void DeserializeState(const std::string &state) {
-    // TODO(zhihong) : add lr_policy serialization
+  void DeserializeState(const std::string &str) {
+    LrPolicyState state;
+    state.ParseFromString(str);
+    learning_rate_ = state.learning_rate();
+    lr_decay_a_ = state.lr_decay_a();
+    lr_decay_b_ = state.lr_decay_b();
  }

 private:
-  double learning_rate;
-  double lr_decay_a;
-  double lr_decay_b;
+  double learning_rate_;
+  double lr_decay_a_;
+  double lr_decay_b_;
 };

 }  // namespace optimizer

--- a/paddle/optimizer/sgd_optimizer.cc
+++ b/paddle/optimizer/sgd_optimizer.cc
@@ -30,16 +30,20 @@ void SGDOptimizer::Update(const Tensor *gradient) {
 const char *SGDOptimizer::SerializeState(int *state_len) {
  SGDOptimizerState state;
  state.set_num_sample_passed(num_sample_passed_);
+  std::string lr_str = this->lr_policy_->SerializeState(state_len);
+  state.mutable_lr_state()->ParseFromString(lr_str);
  TensorToProto(*parameter_, state.mutable_parameter());
  if (momentum_ != 0.0) TensorToProto(*momentums_, state.mutable_momentums());
  auto str = state.SerializeAsString();
-  *state_len = str.size();
+  *state_len += str.size();
  return str.c_str();
 }

 void SGDOptimizer::DeserializeState(const std::string &str) {
  SGDOptimizerState state;
  state.ParseFromString(str);
+  auto lr_state = state.lr_state();
+  this->lr_policy_->DeserializeState(lr_state.SerializeAsString());
  num_sample_passed_ = state.num_sample_passed();
  ProtoToTensor(state.parameter(), parameter_);
  if (momentum_ != 0.0) ProtoToTensor(state.parameter(), momentums_);

--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -4,3 +4,5 @@ nv_test(cuda_test SRCS cuda_test.cu)

 cc_library(place SRCS place.cc)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
+
+nv_test(device_context_test SRCS device_context_test.cc DEPS dynamic_loader place eigen3 glog gflags)
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/enforce.h"
+#ifndef PADDLE_ONLY_CPU
+#include "paddle/platform/cuda.h"
+#include "paddle/platform/dynload/cublas.h"
+#include "paddle/platform/dynload/cudnn.h"
+#include "paddle/platform/dynload/curand.h"
+#define EIGEN_USE_GPU
+#endif
+#include "paddle/platform/place.h"
+#include "unsupported/Eigen/CXX11/Tensor"
+
+namespace paddle {
+namespace platform {
+
+// Polymorphic base for device contexts; the virtual destructor lets derived
+// contexts be destroyed through a DeviceContext pointer.
+class DeviceContext {
+ public:
+  virtual ~DeviceContext() {}
+};
+
+// CPU context: adds no state or behavior on top of DeviceContext.
+class CPUDeviceContext : public DeviceContext {};
+
+#ifndef PADDLE_ONLY_CPU
+// Scope guard for the current GPU: the constructor remembers the device
+// reported by GetCurrentDeviceId() and switches to `new_place` when it
+// differs; the destructor switches back to the remembered device.
+class GPUPlaceGuard {
+ public:
+  explicit GPUPlaceGuard(GPUPlace new_place) : previous_(GetCurrentDeviceId()) {
+    if (previous_ != new_place) {
+      paddle::platform::SetDeviceId(new_place.device);
+    }
+  }
+
+  // Restores unconditionally, even when the constructor did not switch.
+  ~GPUPlaceGuard() { paddle::platform::SetDeviceId(previous_.device); }
+
+ private:
+  GPUPlace previous_;  // device that was current when the guard was created
+};
+
+// Per-GPU context. Owns one CUDA stream and the Eigen device wrapped around
+// it, and lazily creates cuBLAS / cuDNN / cuRAND handles that are each bound
+// to that same stream. Handle creation is not synchronized; concurrent first
+// calls to the same accessor are not guarded here.
+class CUDADeviceContext : public DeviceContext {
+ public:
+  // Creates the stream while `gpu_place` is the current device (restored by
+  // GPUPlaceGuard on exit) and wraps it for Eigen.
+  explicit CUDADeviceContext(const GPUPlace gpu_place) : gpu_place_(gpu_place) {
+    GPUPlaceGuard guard(gpu_place_);
+    paddle::platform::throw_on_error(cudaStreamCreate(&stream_),
+                                     "cudaStreamCreate failed");
+    eigen_stream_ = new Eigen::CudaStreamDevice(&stream_);
+    eigen_device_ = new Eigen::GpuDevice(eigen_stream_);
+  }
+
+  // Blocks until all work queued on this context's stream has finished.
+  void Wait() {
+    paddle::platform::throw_on_error(cudaStreamSynchronize(stream_),
+                                     "cudaStreamSynchronize failed");
+  }
+
+  cudaStream_t stream() { return stream_; }
+
+  // Returns a copy of the Eigen device; the copy still refers to the stream
+  // wrapper owned by this context.
+  Eigen::GpuDevice eigen_device() { return *eigen_device_; }
+
+  // Lazily creates the cuBLAS handle and binds it to stream_.
+  cublasHandle_t cublas_handle() {
+    if (!blas_handle_) {
+      GPUPlaceGuard guard(gpu_place_);
+      PADDLE_ENFORCE(paddle::platform::dynload::cublasCreate(&blas_handle_) ==
+                         CUBLAS_STATUS_SUCCESS,
+                     "cublasCreate failed");
+      PADDLE_ENFORCE(paddle::platform::dynload::cublasSetStream(
+                         blas_handle_, stream_) == CUBLAS_STATUS_SUCCESS,
+                     "cublasSetStream failed");
+    }
+    return blas_handle_;
+  }
+
+  // Lazily creates the cuDNN handle and binds it to stream_.
+  cudnnHandle_t cudnn_handle() {
+    if (!dnn_handle_) {
+      GPUPlaceGuard guard(gpu_place_);
+      PADDLE_ENFORCE(paddle::platform::dynload::cudnnCreate(&dnn_handle_) ==
+                         CUDNN_STATUS_SUCCESS,
+                     "cudnnCreate failed");
+      PADDLE_ENFORCE(paddle::platform::dynload::cudnnSetStream(
+                         dnn_handle_, stream_) == CUDNN_STATUS_SUCCESS,
+                     "cudnnSetStream failed");
+    }
+    return dnn_handle_;
+  }
+
+  // Lazily creates the cuRAND generator, seeds it with random_seed_ and binds
+  // it to stream_.
+  curandGenerator_t curand_generator() {
+    if (!rand_generator_) {
+      GPUPlaceGuard guard(gpu_place_);
+      PADDLE_ENFORCE(paddle::platform::dynload::curandCreateGenerator(
+                         &rand_generator_, CURAND_RNG_PSEUDO_DEFAULT) ==
+                         CURAND_STATUS_SUCCESS,
+                     "curandCreateGenerator failed");
+      PADDLE_ENFORCE(
+          paddle::platform::dynload::curandSetPseudoRandomGeneratorSeed(
+              rand_generator_, random_seed_) == CURAND_STATUS_SUCCESS,
+          "curandSetPseudoRandomGeneratorSeed failed");
+      PADDLE_ENFORCE(paddle::platform::dynload::curandSetStream(
+                         rand_generator_, stream_) == CURAND_STATUS_SUCCESS,
+                     "curandSetStream failed");
+    }
+    return rand_generator_;
+  }
+
+  // Drains the stream, destroys whichever handles were created, then the
+  // Eigen wrappers and finally the stream itself.
+  // NOTE(review): Wait(), PADDLE_ENFORCE and throw_on_error can throw from
+  // this destructor -- confirm that terminating on failure is acceptable.
+  ~CUDADeviceContext() {
+    Wait();
+    if (blas_handle_) {
+      PADDLE_ENFORCE(paddle::platform::dynload::cublasDestroy(blas_handle_) ==
+                         CUBLAS_STATUS_SUCCESS,
+                     "cublasDestroy failed");
+    }
+
+    if (dnn_handle_) {
+      PADDLE_ENFORCE(paddle::platform::dynload::cudnnDestroy(dnn_handle_) ==
+                         CUDNN_STATUS_SUCCESS,
+                     "cudnnDestroy failed");
+    }
+
+    if (rand_generator_) {
+      PADDLE_ENFORCE(paddle::platform::dynload::curandDestroyGenerator(
+                         rand_generator_) == CURAND_STATUS_SUCCESS,
+                     "curandDestroyGenerator failed");
+    }
+
+    // eigen_device_ holds a pointer to eigen_stream_, so release the device
+    // first and the stream wrapper second.
+    delete eigen_device_;
+    delete eigen_stream_;
+
+    paddle::platform::throw_on_error(cudaStreamDestroy(stream_),
+                                     "cudaStreamDestroy failed");
+  }
+
+ private:
+  GPUPlace gpu_place_;
+  cudaStream_t stream_;
+
+  Eigen::CudaStreamDevice* eigen_stream_{nullptr};
+  Eigen::GpuDevice* eigen_device_{nullptr};
+
+  cublasHandle_t blas_handle_{nullptr};
+
+  cudnnHandle_t dnn_handle_{nullptr};
+
+  // Nothing in this class assigns the seed, so give it a defined value;
+  // previously curand_generator() read it uninitialized. 0 is a fixed,
+  // reproducible choice -- TODO confirm whether a configurable seed is wanted.
+  int random_seed_{0};
+  curandGenerator_t rand_generator_{nullptr};
+};
+#endif
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/platform/device_context_test.cc
+++ b/paddle/platform/device_context_test.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/device_context.h"
+#include "gtest/gtest.h"
+
+// For every device reported by GetDeviceCount(), builds a CUDADeviceContext
+// and checks that the Eigen stream and each lazily created library handle
+// come back non-null.
+TEST(CUDADeviceContext, Init) {
+  int count = paddle::platform::GetDeviceCount();
+  for (int i = 0; i < count; i++) {
+    paddle::platform::CUDADeviceContext* device_context =
+        new paddle::platform::CUDADeviceContext(i);
+    Eigen::GpuDevice gpu_device = device_context->eigen_device();
+    ASSERT_NE(nullptr, gpu_device.stream());
+    cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
+    ASSERT_NE(nullptr, cudnn_handle);
+    cublasHandle_t cublas_handle = device_context->cublas_handle();
+    ASSERT_NE(nullptr, cublas_handle);
+    curandGenerator_t curand_handle = device_context->curand_generator();
+    ASSERT_NE(nullptr, curand_handle);
+    // NOTE(review): a failing ASSERT_* above returns from the test before
+    // this delete runs, leaking the context.
+    delete device_context;
+  }
+}
--- a/paddle/scripts/travis/build_doc.sh
+++ b/paddle/scripts/travis/build_doc.sh
@@ -5,13 +5,14 @@ set -e
 mkdir -p $TRAVIS_BUILD_DIR/build
 cd $TRAVIS_BUILD_DIR/build

-# Compile Documentation only.
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_STYLE_CHECK=OFF
+# Compile paddle binaries first
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_GOLANG=ON -DWITH_STYLE_CHECK=OFF

 mkdir output
 make -j `nproc`
 find .. -name '*whl' | xargs pip install  # install all wheels.
 rm -rf *
+# Compile Documentation only.
 cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON
 make -j `nproc` paddle_docs paddle_docs_cn


--- a/proto/OptimizerConfig.proto
+++ b/proto/OptimizerConfig.proto
@@ -78,11 +78,15 @@ enum DataType {
  repeated bytes content = 2;
 }

+message LrPolicyState {
+  // learning rate policy state
+  optional double learning_rate = 1 [default = 1.0];
+  optional double lr_decay_a = 2;  // LinearLr: decay per sample passed
+  optional double lr_decay_b = 3;  // LinearLr: lower bound of the rate
+}
+
 message SGDOptimizerState {
-  // learning rate policy
-  optional double learning_rate = 101;
-  optional double lr_decay_a = 102;
-  optional double lr_decay_b = 103;
+  optional LrPolicyState lr_state = 101;
  optional double num_sample_passed = 104;
  // state
  optional TensorProto parameter = 1;
@@ -91,9 +95,7 @@ message SGDOptimizerState {

 message AdadeltaOptimizerState {
  // learning rate policy
-  optional double learning_rate = 101;
-  optional double lr_decay_a = 102;
-  optional double lr_decay_b = 103;
+  optional LrPolicyState lr_state = 101;
  optional double num_sample_passed = 104;
  // state
  optional TensorProto parameter = 1;
@@ -102,11 +104,9 @@ message AdadeltaOptimizerState {
  optional TensorProto update_delta = 4;
 }

+
 message AdagradOptimizerState {
-  // learning rate policy
-  optional double learning_rate = 101;
-  optional double lr_decay_a = 102;
-  optional double lr_decay_b = 103;
+  optional LrPolicyState lr_state = 101;
  optional double num_sample_passed = 104;
  // state
  optional TensorProto parameter = 1;
@@ -114,10 +114,7 @@ message AdagradOptimizerState {
 }

 message AdamOptimizerState {
-  // learning rate policy
-  optional double learning_rate = 101;
-  optional double lr_decay_a = 102;
-  optional double lr_decay_b = 103;
+  optional LrPolicyState lr_state = 101;
  optional double num_sample_passed = 104;
  // state
  optional TensorProto parameter = 1;

--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -29,7 +29,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
 add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
    COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp
-    DEPENDS gen_proto_py ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
+    DEPENDS gen_proto_py framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})

 add_custom_target(paddle_python ALL DEPENDS
    ${OUTPUT_DIR}/.timestamp)
@@ -43,6 +43,7 @@ if (WITH_TESTING)
    add_subdirectory(paddle/v2/tests)
    add_subdirectory(paddle/v2/reader/tests)
    add_subdirectory(paddle/v2/plot/tests)
+    add_subdirectory(paddle/v2/framework/tests)
  endif()
 endif()
 install(DIRECTORY ${PADDLE_PYTHON_PACKAGE_DIR}

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1353,7 +1353,8 @@ class LayerBase(object):
            device=None,
            active_type="",
            drop_rate=0.,
-            coeff=None):
+            coeff=None,
+            error_clipping_threshold=None):
        config_assert('@' not in name,
                      "layer name: %s contain special character @" % name)
        global g_current_submodel
@@ -1387,6 +1388,9 @@ class LayerBase(object):
        elif g_default_device is not None:
            self.config.device = g_default_device

+        if error_clipping_threshold is not None:
+            self.config.error_clipping_threshold = error_clipping_threshold
+
        for input_index in xrange(len(self.inputs)):
            input = self.inputs[input_index]
            input_config = None
@@ -2822,13 +2826,7 @@ class TensorLayer(LayerBase):

 @config_layer('mixed')
 class MixedLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 size=0,
-                 bias=True,
-                 error_clipping_threshold=None,
-                 **xargs):
+    def __init__(self, name, inputs, size=0, bias=True, **xargs):
        config_assert(inputs, 'inputs cannot be empty')
        super(MixedLayer, self).__init__(
            name, 'mixed', size, inputs=inputs, **xargs)
@@ -2910,9 +2908,6 @@ class MixedLayer(LayerBase):
            self.config.bias_size = psize
            self.create_bias_parameter(bias, psize)

-        if error_clipping_threshold is not None:
-            self.config.error_clipping_threshold = error_clipping_threshold
-

 # like MixedLayer, but no bias parameter
 @config_func

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -4806,6 +4806,14 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
    So groups should be larger than 1, and the num of channels should be able
    to devided by groups.

+    .. math::
+       y_{si+j} = \max_k x_{gsi + sk + j}
+       g = groups
+       s = input.size / num_channels
+       0 \le i < num_channels / groups
+       0 \le j < s
+       0 \le k < groups
+
    Please refer to Paper:
      - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf
      - Multi-digit Number Recognition from Street View \

--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -1395,7 +1395,7 @@ def inputs(layers, *args):
    if len(args) != 0:
        layers.extend(args)

-    Inputs(* [l.name for l in layers])
+    Inputs(*[l.name for l in layers])


 def outputs(layers, *args):
@@ -1438,7 +1438,7 @@ def outputs(layers, *args):
    assert len(layers) > 0

    if HasInputsSet():  # input already set
-        Outputs(* [l.name for l in layers])
+        Outputs(*[l.name for l in layers])
        return  # just return outputs.

    if len(layers) != 1:

--- a/python/paddle/v2/dataset/wmt14.py
+++ b/python/paddle/v2/dataset/wmt14.py
@@ -32,9 +32,9 @@ MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
 # this is a small set of data for test. The original data is too large and will be add later.
 URL_TRAIN = 'http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz'
 MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c'
-# this is the pretrained model, whose bleu = 26.92
+# BLEU of this trained model is 26.92
 URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz'
-MD5_MODEL = '4ce14a26607fb8a1cc23bcdedb1895e4'
+MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3'

 START = "<s>"
 END = "<e>"

--- a/python/paddle/v2/framework/__init__.py
+++ b/python/paddle/v2/framework/__init__.py
+__all__ = ['proto']
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
+add_python_test(test_framework test_protobuf.py)
--- a/python/paddle/v2/framework/tests/test_protobuf.py
+++ b/python/paddle/v2/framework/tests/test_protobuf.py
+import paddle.v2.framework.proto.op_proto_pb2
+import paddle.v2.framework.proto.attr_type_pb2
+import unittest
+
+
+class TestFrameworkProto(unittest.TestCase):
+    """Smoke test for the generated framework protobuf Python modules."""
+
+    def test_all(self):
+        """Build an OpProto by hand and check that it is fully initialized."""
+        op_proto_lib = paddle.v2.framework.proto.op_proto_pb2
+        attr_type_lib = paddle.v2.framework.proto.attr_type_pb2
+        op_proto = op_proto_lib.OpProto()
+        # Two inputs and one output, each with a name and a comment.
+        ipt0 = op_proto.inputs.add()
+        ipt0.name = "a"
+        ipt0.comment = "the input of cosine op"
+        ipt1 = op_proto.inputs.add()
+        ipt1.name = "b"
+        ipt1.comment = "the other input of cosine op"
+        opt = op_proto.outputs.add()
+        opt.name = "output"
+        opt.comment = "the output of cosine op"
+        op_proto.comment = "cosine op, output = scale*cos(a, b)"
+        # One FLOAT attribute; the enum value comes from attr_type_pb2.
+        attr = op_proto.attrs.add()
+        attr.name = "scale"
+        attr.comment = "scale of cosine op"
+        attr.type = attr_type_lib.FLOAT
+        op_proto.type = "cos"
+        self.assertTrue(op_proto.IsInitialized())
--- a/python/paddle/v2/master/client.py
+++ b/python/paddle/v2/master/client.py
@@ -26,14 +26,22 @@ class client(object):
            holder[idx] = c_ptr
        lib.paddle_set_dataset(self.c, holder, len(paths))

+    # return format: (record, errno)
+    # errno =  0: ok
+    #       <  0: error
    def next_record(self):
        p = ctypes.c_char_p()
        ret = ctypes.pointer(p)
        size = lib.paddle_next_record(self.c, ret)
+        if size < 0:
+            # Error
+            return None, size
+
        if size == 0:
            # Empty record
-            return ""
+            return "", 0
+
        record = ret.contents.value[:size]
        # Memory created from C should be freed.
        lib.mem_free(ret.contents)
-        return record
+        return record, 0
--- a/python/paddle/v2/reader/creator.py
+++ b/python/paddle/v2/reader/creator.py
@@ -57,17 +57,20 @@ def text_file(path):
    return reader


-def recordio(path):
+def recordio_local(paths, buf_size=100):
    """
-    Creates a data reader that outputs record one one by one from given recordio file
-    :path: path of recordio file
-    :returns: data reader of recordio file
+    Creates a data reader from given RecordIO file paths; glob pattern is supported.
+    :paths: iterable of recordio file paths (joined with "," internally).
+    :buf_size: passed to paddle.v2.reader.decorator.buffered.
+    :returns: data reader of recordio files.
    """

    import recordio as rec
+    import paddle.v2.reader.decorator as dec

    def reader():
-        f = rec.reader(path)
+        a = ','.join(paths)  # the reader is opened on one comma-separated string
+        f = rec.reader(a)
        while True:
            r = f.read()
            if r is None:
@@ -75,4 +78,38 @@ def recordio(path):
            yield r
        f.close()

+    return dec.buffered(reader, buf_size)
+
+
+def recordio(paths, buf_size=100):
+    """
+    Creates a data reader that outputs records one by one
+        from given local or cloud recordio paths.
+    :paths: paths of recordio files.
+    :buf_size: buffer size, forwarded to the local or cloud reader.
+    :returns: data reader of recordio files.
+    """
+    import os
+    import paddle.v2.master.client as cloud
+
+    if "KUBERNETES_SERVICE_HOST" not in os.environ:
+        return recordio_local(paths, buf_size)  # keep the caller's buf_size
+
+    host_name = "MASTER_SERVICE_HOST"
+    if host_name not in os.environ:
+        raise Exception('not find ' + host_name + ' in environ.')
+    addr = os.environ[host_name]  # os.environ is a mapping, not a callable
+
+    def reader():
+        c = cloud(addr, buf_size)  # NOTE(review): confirm `cloud` is the client class
+        c.set_dataset(paths)
+
+        while True:
+            r, err = c.next_record()  # (record, errno); errno < 0 ends the stream
+            if err < 0:
+                break
+            yield r
+
+        c.close()
+
    return reader
--- a/python/paddle/v2/reader/tests/creator_test.py
+++ b/python/paddle/v2/reader/tests/creator_test.py
@@ -38,7 +38,7 @@ class TestRecordIO(unittest.TestCase):
    def test_recordio(self):
        path = os.path.join(
            os.path.dirname(__file__), "test_recordio_creator.dat")
-        reader = paddle.v2.reader.creator.recordio(path)
+        reader = paddle.v2.reader.creator.recordio([path])  # takes a sequence of paths
        for idx, r in enumerate(reader()):
            self.assertSequenceEqual(r, str(idx))


--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -9,7 +9,9 @@ packages=['paddle',
          'paddle.v2.dataset',
          'paddle.v2.reader',
          'paddle.v2.master',
-          'paddle.v2.plot']
+          'paddle.v2.plot',
+          'paddle.v2.framework',
+          'paddle.v2.framework.proto']

 setup_requires=["requests",
                "numpy",
@@ -27,8 +29,11 @@ setup(name='paddle',
      description='Parallel Distributed Deep Learning',
      install_requires=setup_requires,
      packages=packages,
-      package_data={'paddle.v2.master': ['${paddle_master_LIB_NAME}'], },
+      package_data={'paddle.v2.master': ['libpaddle_master.so'], },
      package_dir={
-          '': '${CMAKE_CURRENT_SOURCE_DIR}'
+          '': '${CMAKE_CURRENT_SOURCE_DIR}',
+          # The paddle.v2.framework.proto package is generated at build time,
+          # so it is mapped to the build directory instead of the source tree.
+          'paddle.v2.framework.proto': '${PROJ_BINARY_ROOT}/paddle/framework'
      },
 )