diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5bedbbefa85a730ff2934a12597988a67e73c1a4..15a7c6b07417adfacd461e95c0b92f658e1e11cc 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,6 +16,7 @@ cmake_minimum_required(VERSION 3.0)
 
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 set(PROJ_ROOT ${CMAKE_CURRENT_SOURCE_DIR})
+set(PROJ_BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR})
 
 include(system)
 
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 8f65decda77946878daffda0afffab8e0b74b83d..725cf28037182bfd0a27d49491bc8a479a9b4440 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -88,7 +88,7 @@
 #
 
 # including binary directory for generated headers.
-include_directories(${CMAKE_BINARY_DIR})
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
 
 if(NOT APPLE)
     find_package(Threads REQUIRED)
@@ -106,7 +106,7 @@ function(merge_static_libs TARGET_NAME)
 
   if(APPLE) # Use OSX's libtool to merge archives
     # To produce a library we need at least one source file.
-    # It is created by add_custom_command below and will helps 
+    # It is created by add_custom_command below and will helps
     # also help to track dependencies.
     set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/${TARGET_NAME}_dummy.c)
 
@@ -144,24 +144,24 @@ function(merge_static_libs TARGET_NAME)
         DEPENDS ${lib} ${objdir}
         WORKING_DIRECTORY ${objdir})
 
-      # Empty dummy source file that goes into merged library		
-      set(mergebase ${lib}.mergebase.c)		
-      add_custom_command(OUTPUT ${mergebase}		
-        COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}		
-        DEPENDS ${objlistfile})		
+      # Empty dummy source file that goes into merged library
+      set(mergebase ${lib}.mergebase.c)
+      add_custom_command(OUTPUT ${mergebase}
+        COMMAND ${CMAKE_COMMAND} -E touch ${mergebase}
+        DEPENDS ${objlistfile})
 
       list(APPEND mergebases "${mergebase}")
     endforeach()
 
     add_library(${TARGET_NAME} STATIC ${mergebases})
-    target_link_libraries(${TARGET_NAME} ${libs_deps}) 
+    target_link_libraries(${TARGET_NAME} ${libs_deps})
 
     # Get the file name of the generated library
     set(outlibfile "$<TARGET_FILE:${TARGET_NAME}>")
 
     foreach(lib ${libs})
       add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-        COMMAND ${CMAKE_AR} cr ${outlibfile} *.o  
+        COMMAND ${CMAKE_AR} cr ${outlibfile} *.o
         COMMAND ${CMAKE_RANLIB} ${outlibfile}
         WORKING_DIRECTORY ${lib}.objdir)
     endforeach()
@@ -362,4 +362,4 @@ function(py_proto_compile TARGET_NAME)
   set(py_srcs)
   protobuf_generate_python(py_srcs ${py_proto_compile_SRCS})
   add_custom_target(${TARGET_NAME} ALL DEPENDS ${py_srcs})
-endfunction()
\ No newline at end of file
+endfunction()
diff --git a/go/cmd/pserver/pserver.go b/go/cmd/pserver/pserver.go
index 31ef450f032f756fb32a0444a7e94a18ec2918a0..0ecb1242c3c3d5246125c9ce946001ccf9cbec24 100644
--- a/go/cmd/pserver/pserver.go
+++ b/go/cmd/pserver/pserver.go
@@ -20,6 +20,8 @@ func main() {
 		"comma separated endpoint string for pserver to connect to etcd")
 	etcdTimeout := flag.Int("etcd-timeout", 5, "timeout for etcd calls")
 	numPservers := flag.Int("num-pservers", 1, "total pserver count in a training job")
+	checkpointPath := flag.String("checkpoint-path", "/checkpoints/", "save checkpoint path")
+	checkpointInterval := flag.Int("checkpoint-interval", 600, "save checkpoint per interval seconds")
 	logLevel := flag.String("log-level", "info",
 		"log level, possible values: debug, info, warning, error, fatal, panic")
 	flag.Parse()
@@ -31,18 +33,20 @@ func main() {
 	log.SetLevel(level)
 
 	var idx int
+	var cp pserver.Checkpoint
+	var e *pserver.EtcdClient
 	if *index >= 0 {
 		idx = *index
 	} else {
 		timeout := time.Second * time.Duration((*etcdTimeout))
-		e := pserver.NewEtcdClient(*etcdEndpoint, *numPservers, timeout)
+		e = pserver.NewEtcdClient(*etcdEndpoint, *numPservers, timeout)
 		idx, err = e.Register()
 		if err != nil {
 			panic(err)
 		}
 	}
 
-	s, err := pserver.NewService(idx)
+	s, err := pserver.NewService(idx, *checkpointInterval, *checkpointPath, e, cp)
 	if err != nil {
 		panic(err)
 	}
diff --git a/go/pserver/etcd_client.go b/go/pserver/etcd_client.go
index 37b8d522c1bd07acb41b9515a6d9bc15eae9aa32..1f77787150d16052e3588e9c1795c8d5dafa08e6 100644
--- a/go/pserver/etcd_client.go
+++ b/go/pserver/etcd_client.go
@@ -18,6 +18,8 @@ const (
 	PsDesired = "/ps_desired"
 	// PsAddr is the base dir for pserver to store their addr
 	PsPath = "/ps/"
+	// PsCheckpoint is the etcd path for store checkpoints information
+	PsCheckpoint = "/checkpoints/"
 )
 
 // EtcdClient is the etcd client that the pserver uses for fault
@@ -186,3 +188,14 @@ func (e *EtcdClient) registerPserverEtcd(ctx context.Context) (int, error) {
 
 	return idx, nil
 }
+
+// PutKey put into etcd with value by key specified
+func (e *EtcdClient) PutKey(key string, value []byte, timeout int) error {
+	ctx, cancel := context.WithTimeout(context.Background(), time.Second*time.Duration(timeout))
+	_, err := e.etcdClient.Put(ctx, key, string(value))
+	cancel()
+	if err != nil {
+		return err
+	}
+	return nil
+}
diff --git a/go/pserver/optimizer.go b/go/pserver/optimizer.go
index ee5fe6205b670920b0ce13dd678901e5154c0e7e..dcb871129985bba7fbabdfae7fbebd38caad4c7f 100644
--- a/go/pserver/optimizer.go
+++ b/go/pserver/optimizer.go
@@ -35,22 +35,30 @@ func cArrayToSlice(p unsafe.Pointer, len int) []byte {
 	return (*[1 << 30]byte)(p)[:len:len]
 }
 
-func newOptimizer(paramWithConfigs ParameterWithConfig) *optimizer {
+func newOptimizer(paramWithConfigs ParameterWithConfig, State []byte) *optimizer {
 	o := &optimizer{}
 	o.elementType = paramWithConfigs.Param.ElementType
 	p := paramWithConfigs.Param
 	c := paramWithConfigs.Config
+	s := State
+	paramBufferSize := C.size_t(len(p.Content) / C.sizeof_float)
 	log.WithFields(log.Fields{
 		"ElementType": p.ElementType,
-		"ParamSize":   len(p.Content) / C.sizeof_float,
+		"ParamSize":   paramBufferSize,
 		"ConfigSize":  len(c),
+		"StateSize":   len(s),
 	}).Info("New Optimizer Created with config:")
 	var cbuffer unsafe.Pointer
-	cbuffer = C.malloc(C.size_t(len(p.Content)))
-	C.memcpy(cbuffer, unsafe.Pointer(&p.Content[0]), C.size_t(len(p.Content)/C.sizeof_float))
+	cbuffer = C.malloc(paramBufferSize)
+
+	C.memcpy(cbuffer, unsafe.Pointer(&p.Content[0]), paramBufferSize)
+	var cstate unsafe.Pointer
+	if len(s) != 0 {
+		cstate = unsafe.Pointer(&s[0])
+	}
+
 	o.opt = C.paddle_create_optimizer((*C.uchar)(&c[0]), C.int(len(c)),
-		C.paddle_element_type(p.ElementType), cbuffer, C.int(len(p.Content)/C.sizeof_float),
-		(*C.char)(nullPtr), 0)
+		C.paddle_element_type(p.ElementType), cbuffer, C.int(paramBufferSize), (*C.char)(cstate), C.int(len(s)))
 	return o
 }
 
@@ -60,6 +68,12 @@ func (o *optimizer) GetWeights() []byte {
 	return cArrayToSlice(buffer, int(bufferLen)*C.sizeof_float)
 }
 
+func (o *optimizer) GetStates() []byte {
+	var cbuffer *C.char
+	cbufferLen := C.paddle_optimizer_get_state(o.opt, &cbuffer)
+	return cArrayToSlice(unsafe.Pointer(cbuffer), int(cbufferLen))
+}
+
 func (o *optimizer) UpdateParameter(g Gradient) error {
 	if o.elementType != g.ElementType {
 		return fmt.Errorf("Name: %s, parameter and gradient element type not match, parameter: %v, gradient: %v", g.Name, o.elementType, g.ElementType)
diff --git a/go/pserver/optimizer_test.go b/go/pserver/optimizer_test.go
index 0b2f4cfa41a630645c128ac13826de9d8b1d521b..d19e9de92e0b33b1d9619adb615a24884097a38f 100644
--- a/go/pserver/optimizer_test.go
+++ b/go/pserver/optimizer_test.go
@@ -19,6 +19,6 @@ func TestOptimizerCreateRelease(t *testing.T) {
 		Param:  p,
 		Config: config,
 	}
-	o := newOptimizer(param)
+	o := newOptimizer(param, nil)
 	o.Cleanup()
 }
diff --git a/go/pserver/service.go b/go/pserver/service.go
index ad16a5708d10bdcb5189a1e1e8abf13c54a72265..6b52d0d896f8bc04fab6c9b68911523cbb7ac8b9 100644
--- a/go/pserver/service.go
+++ b/go/pserver/service.go
@@ -1,9 +1,21 @@
 package pserver
 
 import (
+	"bufio"
+	"bytes"
+	"crypto/md5"
+	"encoding/gob"
+	"encoding/hex"
+	"encoding/json"
 	"errors"
 	"fmt"
+	"os"
+	"path/filepath"
+	"strconv"
 	"sync"
+	"time"
+
+	log "github.com/sirupsen/logrus"
 )
 
 // ElementType is the type of elements of a Parameter.
@@ -39,26 +51,55 @@ type ParameterWithConfig struct {
 	Config []byte // parameter configuration in Proto Buffer format
 }
 
+// ParameterCheckpoint is Parameter and State checkpoint
+type ParameterCheckpoint struct {
+	ParamConfig ParameterWithConfig
+	State       []byte
+}
+
+// checkpoint signature
+type checkpointMeta struct {
+	UUID      string `json:"uuid"`
+	Md5sum    string `json:"md5sum"`
+	Timestamp string `json:"timestamp"`
+}
+
+// Checkpoint is the pserver shard persist in file
+type Checkpoint []ParameterCheckpoint
+
 // Gradient is the gradient of the parameter.
 type Gradient Parameter
 
 // Service is the RPC service for pserver.
 type Service struct {
-	initialized chan struct{}
-	idx         int
-
-	mu     sync.Mutex
-	optMap map[string]*optimizer
+	initialized        chan struct{}
+	idx                int
+	checkpointInterval time.Duration
+	checkpointPath     string
+	client             *EtcdClient
+	mu                 sync.Mutex
+	optMap             map[string]*optimizer
 }
 
 // NewService creates a new service, will bypass etcd registration if no
 // endpoints specified.
-func NewService(idx int) (*Service, error) {
+func NewService(idx int, seconds int, path string, client *EtcdClient, cp Checkpoint) (*Service, error) {
 	s := &Service{
-		idx: idx,
+		idx:                idx,
+		checkpointInterval: time.Second * time.Duration(seconds),
+		checkpointPath:     path,
+		client:             client,
 	}
 	s.optMap = make(map[string]*optimizer)
 	s.initialized = make(chan struct{})
+
+	if cp != nil {
+		for _, item := range cp {
+			p := item.ParamConfig
+			st := item.State
+			s.optMap[p.Param.Name] = newOptimizer(p, st)
+		}
+	}
 	return s, nil
 }
 
@@ -78,7 +119,7 @@ func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) er
 	// TODO(helin): check if paramWithConfigs.Param.Content is
 	// properly memory aligned, if not, make copy to a memory
 	// aligned region.
-	s.optMap[paramWithConfigs.Param.Name] = newOptimizer(paramWithConfigs)
+	s.optMap[paramWithConfigs.Param.Name] = newOptimizer(paramWithConfigs, nil)
 	return nil
 }
 
@@ -139,10 +180,57 @@ func (s *Service) GetParam(name string, parameter *Parameter) error {
 	return nil
 }
 
-// Save tells the parameter server to save parameters.
-func (s *Service) Save(path string, dummy *int) error {
+// pserver save checkpoint
+func (s *Service) doCheckpoint() error {
 	<-s.initialized
+	s.mu.Lock()
+	defer s.mu.Unlock()
+
+	cp := make([]ParameterCheckpoint, 0, len(s.optMap))
+	index := 0
+	for name, opt := range s.optMap {
+		var pc ParameterCheckpoint
+		pc.ParamConfig.Param.Name = name
+		pc.ParamConfig.Param.ElementType = opt.elementType
+		pc.ParamConfig.Param.Content = opt.GetWeights()
+		pc.State = opt.GetStates()
+		cp[index] = pc
+		index++
+	}
+	var buf bytes.Buffer
+	encoder := gob.NewEncoder(&buf)
+	err := encoder.Encode(cp)
+	if err != nil {
+		return err
+	}
+
+	cpMeta := checkpointMeta{}
+	cpMeta.UUID = s.checkpointPath + strconv.Itoa(s.idx)
+	cpMeta.Timestamp = time.Now().String()
+	h := md5.New()
+	cpMeta.Md5sum = hex.EncodeToString(h.Sum(buf.Bytes()))
 
-	// TODO
+	cpMetajson, _ := json.Marshal(cpMeta)
+	err = s.client.PutKey(filepath.Join(PsCheckpoint, strconv.Itoa(s.idx)), cpMetajson, 3)
+	if err != nil {
+		return err
+	}
+	if _, err = os.Stat(cpMeta.UUID); os.IsNotExist(err) {
+		log.Info("checkpoint does not exists.")
+	} else {
+		err = os.Remove(cpMeta.UUID)
+		log.Infof("checkpoint %s already exsits, removing ", cpMeta.UUID)
+	}
+	f, err := os.Create(cpMeta.UUID)
+	defer f.Close()
+	if err != nil {
+		return err
+	}
+	writer := bufio.NewWriter(f)
+	_, err = writer.Write(buf.Bytes())
+	writer.Flush()
+	if err != nil {
+		return err
+	}
 	return nil
 }
diff --git a/go/pserver/service_test.go b/go/pserver/service_test.go
index b6d20d2c8b7ba0ccd7ab46669a597a21dc11c381..9bf1a48a596f3e3e73a2e4df651855fd5f4e775f 100644
--- a/go/pserver/service_test.go
+++ b/go/pserver/service_test.go
@@ -15,7 +15,8 @@ const (
 )
 
 func TestServiceFull(t *testing.T) {
-	s, err := pserver.NewService(0)
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, 1, "", nil, cp)
 	if err != nil {
 		t.Error(err)
 	}
@@ -86,7 +87,8 @@ func TestServiceFull(t *testing.T) {
 }
 
 func TestMultipleInit(t *testing.T) {
-	s, err := pserver.NewService(0)
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, 1, "", nil, cp)
 	if err != nil {
 		t.Error(err)
 	}
@@ -102,7 +104,8 @@ func TestMultipleInit(t *testing.T) {
 }
 
 func TestUninitialized(t *testing.T) {
-	s, err := pserver.NewService(0)
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, 1, "", nil, cp)
 	err = s.SendGrad(pserver.Gradient{}, nil)
 	if err.Error() != pserver.Uninitialized {
 		t.FailNow()
@@ -110,7 +113,8 @@ func TestUninitialized(t *testing.T) {
 }
 
 func TestBlockUntilInitialized(t *testing.T) {
-	s, err := pserver.NewService(0)
+	var cp pserver.Checkpoint
+	s, err := pserver.NewService(0, 1, "", nil, cp)
 	if err != nil {
 		t.Error(err)
 	}
@@ -128,16 +132,6 @@ func TestBlockUntilInitialized(t *testing.T) {
 		ch <- struct{}{}
 	}()
 
-	wg.Add(1)
-	go func() {
-		err := s.Save("", nil)
-		if err != nil {
-			errCh <- err
-		}
-		wg.Done()
-		ch <- struct{}{}
-	}()
-
 	time.Sleep(50 * time.Millisecond)
 
 	select {
@@ -170,3 +164,7 @@ func TestBlockUntilInitialized(t *testing.T) {
 
 	wg.Wait()
 }
+
+func TestCheckpointSpeed(t *testing.T) {
+	//TODO(zhihong): test speed
+}
diff --git a/paddle/optimizer/adadelta_optimizer.cc b/paddle/optimizer/adadelta_optimizer.cc
index 465ad5e0d2089121a0f11ab916afe0420cbcfab7..6eec5d846fa5ef6b25e7646200dad1d452dda806 100644
--- a/paddle/optimizer/adadelta_optimizer.cc
+++ b/paddle/optimizer/adadelta_optimizer.cc
@@ -27,22 +27,24 @@ void AdadeltaOptimizer::Update(const Tensor* gradient) {
 
 const char* AdadeltaOptimizer::SerializeState(int* state_len) {
   AdadeltaOptimizerState state;
-  // TODO(zhihong) : add lr_policy serialization
   state.set_num_sample_passed(num_sample_passed_);
+  std::string lr_str = this->lr_policy_->SerializeState(state_len);
+  state.mutable_lr_state()->ParseFromString(lr_str);
 
   TensorToProto(*parameter_, state.mutable_parameter());
   TensorToProto(*accum_gradient_, state.mutable_accum_gradient());
   TensorToProto(*accum_delta_, state.mutable_accum_delta());
   TensorToProto(*update_delta_, state.mutable_update_delta());
   auto str = state.SerializeAsString();
-  *state_len = str.size();
+  *state_len += str.size();
   return str.c_str();
 }
 
 void AdadeltaOptimizer::DeserializeState(const std::string& str) {
   AdadeltaOptimizerState state;
   state.ParseFromString(str);
-  // TODO(zhihong) : add lr_policy DeserializeState
+  auto lr_state = state.lr_state();
+  this->lr_policy_->DeserializeState(lr_state.SerializeAsString());
   num_sample_passed_ = state.num_sample_passed();
 
   ProtoToTensor(state.parameter(), parameter_);
diff --git a/paddle/optimizer/adagrad_optimizer.cc b/paddle/optimizer/adagrad_optimizer.cc
index bdaa7877d2bc58c17c51b977852d4b6fec511ed2..5b92610ac547ee11cedf2e49e4d7f1db4b2da646 100644
--- a/paddle/optimizer/adagrad_optimizer.cc
+++ b/paddle/optimizer/adagrad_optimizer.cc
@@ -19,20 +19,23 @@ void AdagradOptimizer::Update(const Tensor* gradient) {
 }
 const char* AdagradOptimizer::SerializeState(int* state_len) {
   AdagradOptimizerState state;
-  // TODO(zhihong) : add lr_policy serialization
   state.set_num_sample_passed(num_sample_passed_);
+  std::string lr_str = this->lr_policy_->SerializeState(state_len);
+  state.mutable_lr_state()->ParseFromString(lr_str);
 
   TensorToProto(*parameter_, state.mutable_parameter());
   TensorToProto(*accum_gradient_, state.mutable_accum_gradient());
   auto str = state.SerializeAsString();
-  *state_len = str.size();
+  *state_len += str.size();
   return str.c_str();
 }
 
 void AdagradOptimizer::DeserializeState(const std::string& str) {
   AdagradOptimizerState state;
   state.ParseFromString(str);
-  // TODO(zhihong) : add lr_policy DeserializeState
+  auto lr_state = state.lr_state();
+  this->lr_policy_->DeserializeState(lr_state.SerializeAsString());
+
   num_sample_passed_ = state.num_sample_passed();
   ProtoToTensor(state.parameter(), parameter_);
   ProtoToTensor(state.accum_gradient(), accum_gradient_);
diff --git a/paddle/optimizer/adam_optimizer.cc b/paddle/optimizer/adam_optimizer.cc
index ceab7397d87349c64ca9e5d11990cb38068421be..1ebb6b1e0f7b4edcbac1b28319fd4de576f85f6a 100644
--- a/paddle/optimizer/adam_optimizer.cc
+++ b/paddle/optimizer/adam_optimizer.cc
@@ -24,20 +24,23 @@ void AdamOptimizer::Update(const Tensor *gradient) {
 
 const char *AdamOptimizer::SerializeState(int *state_len) {
   AdamOptimizerState state;
-  // TODO(zhihong) : add lr_policy serialization
+  std::string lr_str = this->lr_policy_->SerializeState(state_len);
+  state.mutable_lr_state()->ParseFromString(lr_str);
   state.set_num_sample_passed(num_sample_passed_);
+
   TensorToProto(*parameter_, state.mutable_parameter());
   TensorToProto(*momentums_, state.mutable_momentums());
   TensorToProto(*velocitys_, state.mutable_velocitys());
   auto str = state.SerializeAsString();
-  *state_len = str.size();
+  *state_len += str.size();
   return str.c_str();
 }
 
 void AdamOptimizer::DeserializeState(const std::string &str) {
   AdamOptimizerState state;
   state.ParseFromString(str);
-  // TODO(zhihong) : add lr_policy DeserializeState
+  auto lr_state = state.lr_state();
+  this->lr_policy_->DeserializeState(lr_state.SerializeAsString());
   num_sample_passed_ = state.num_sample_passed();
 
   ProtoToTensor(state.parameter(), parameter_);
diff --git a/paddle/optimizer/lr_policy.h b/paddle/optimizer/lr_policy.h
index d8e33ad37ab4c019a36f63f34babe65cf8c8fb16..036c376e10f465c2866a230caf9224f4af5478bc 100644
--- a/paddle/optimizer/lr_policy.h
+++ b/paddle/optimizer/lr_policy.h
@@ -17,36 +17,56 @@ public:
 // constant learning rate policy
 class ConstLr final : public LrPolicy {
 public:
-  ConstLr(double lr) : learning_rate(lr){};
+  ConstLr(double lr) : learning_rate_(lr){};
   double LearningRate(const uint64_t num_sample_passed) {
-    return learning_rate;
+    return learning_rate_;
+  }
+  const char *SerializeState(int *state_len) {
+    LrPolicyState state;
+    state.set_learning_rate(learning_rate_);
+    auto str = state.SerializeAsString();
+    *state_len = str.size();
+    return str.c_str();
+  }
+  void DeserializeState(const std::string &str) {
+    LrPolicyState state;
+    state.ParseFromString(str);
+    learning_rate_ = state.learning_rate();
   }
-  const char *SerializeState(int *state_len) { return nullptr; }
-  void DeserializeState(const std::string &state) {}
 
 private:
-  double learning_rate;
+  double learning_rate_;
 };
 
 class LinearLr final : public LrPolicy {
 public:
   LinearLr(double lr, double lr_decay_a, double lr_decay_b)
-      : learning_rate(lr), lr_decay_a(lr_decay_a), lr_decay_b(lr_decay_b) {}
+      : learning_rate_(lr), lr_decay_a_(lr_decay_a), lr_decay_b_(lr_decay_b) {}
   double LearningRate(const uint64_t num_sample_passed) {
-    return std::max(learning_rate - lr_decay_a * num_sample_passed, lr_decay_b);
+    return std::max(learning_rate_ - lr_decay_a_ * num_sample_passed,
+                    lr_decay_b_);
   }
   const char *SerializeState(int *state_len) {
-    // TODO(zhihong) : add lr_policy serialization
-    return nullptr;
+    LrPolicyState state;
+    state.set_learning_rate(learning_rate_);
+    state.set_lr_decay_a(lr_decay_a_);
+    state.set_lr_decay_b(lr_decay_b_);
+    auto str = state.SerializeAsString();
+    *state_len = str.size();
+    return str.c_str();
   }
-  void DeserializeState(const std::string &state) {
-    // TODO(zhihong) : add lr_policy serialization
+  void DeserializeState(const std::string &str) {
+    LrPolicyState state;
+    state.ParseFromString(str);
+    learning_rate_ = state.learning_rate();
+    lr_decay_a_ = state.lr_decay_a();
+    lr_decay_b_ = state.lr_decay_b();
   }
 
 private:
-  double learning_rate;
-  double lr_decay_a;
-  double lr_decay_b;
+  double learning_rate_;
+  double lr_decay_a_;
+  double lr_decay_b_;
 };
 
 }  // namespace optimizer
diff --git a/paddle/optimizer/sgd_optimizer.cc b/paddle/optimizer/sgd_optimizer.cc
index 34e051003fa83f11b1f4a39c46856e0372836a1a..15418faa840c19e776f293700ee886991754fb04 100644
--- a/paddle/optimizer/sgd_optimizer.cc
+++ b/paddle/optimizer/sgd_optimizer.cc
@@ -30,16 +30,20 @@ void SGDOptimizer::Update(const Tensor *gradient) {
 const char *SGDOptimizer::SerializeState(int *state_len) {
   SGDOptimizerState state;
   state.set_num_sample_passed(num_sample_passed_);
+  std::string lr_str = this->lr_policy_->SerializeState(state_len);
+  state.mutable_lr_state()->ParseFromString(lr_str);
   TensorToProto(*parameter_, state.mutable_parameter());
   if (momentum_ != 0.0) TensorToProto(*momentums_, state.mutable_momentums());
   auto str = state.SerializeAsString();
-  *state_len = str.size();
+  *state_len += str.size();
   return str.c_str();
 }
 
 void SGDOptimizer::DeserializeState(const std::string &str) {
   SGDOptimizerState state;
   state.ParseFromString(str);
+  auto lr_state = state.lr_state();
+  this->lr_policy_->DeserializeState(lr_state.SerializeAsString());
   num_sample_passed_ = state.num_sample_passed();
   ProtoToTensor(state.parameter(), parameter_);
   if (momentum_ != 0.0) ProtoToTensor(state.parameter(), momentums_);
diff --git a/paddle/platform/CMakeLists.txt b/paddle/platform/CMakeLists.txt
index cc6b52e9271ff00e91f2ca172815c543eb99261d..ebacd5d6dc850ebde5212bba39aea4dde8e4eb03 100644
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -4,3 +4,5 @@ nv_test(cuda_test SRCS cuda_test.cu)
 
 cc_library(place SRCS place.cc)
 cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
+
+nv_test(device_context_test SRCS device_context_test.cc DEPS dynamic_loader place eigen3 glog gflags)
diff --git a/paddle/platform/device_context.h b/paddle/platform/device_context.h
new file mode 100644
index 0000000000000000000000000000000000000000..fcef0a5e3058f1c9d54f9e06a54a09286e2454fd
--- /dev/null
+++ b/paddle/platform/device_context.h
@@ -0,0 +1,159 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/enforce.h"
+#ifndef PADDLE_ONLY_CPU
+#include "paddle/platform/cuda.h"
+#include "paddle/platform/dynload/cublas.h"
+#include "paddle/platform/dynload/cudnn.h"
+#include "paddle/platform/dynload/curand.h"
+#define EIGEN_USE_GPU
+#endif
+#include "paddle/platform/place.h"
+#include "unsupported/Eigen/CXX11/Tensor"
+
+namespace paddle {
+namespace platform {
+
+class DeviceContext {
+ public:
+  virtual ~DeviceContext() {}
+};
+
+class CPUDeviceContext : public DeviceContext {};
+
+#ifndef PADDLE_ONLY_CPU
+class GPUPlaceGuard {
+ public:
+  explicit GPUPlaceGuard(GPUPlace new_place) : previous_(GetCurrentDeviceId()) {
+    if (previous_ != new_place) {
+      paddle::platform::SetDeviceId(new_place.device);
+    }
+  }
+
+  ~GPUPlaceGuard() { paddle::platform::SetDeviceId(previous_.device); }
+
+ private:
+  GPUPlace previous_;
+};
+
+class CUDADeviceContext : public DeviceContext {
+ public:
+  explicit CUDADeviceContext(const GPUPlace gpu_place) : gpu_place_(gpu_place) {
+    GPUPlaceGuard guard(gpu_place_);
+    paddle::platform::throw_on_error(cudaStreamCreate(&stream_),
+                                     "cudaStreamCreate failed");
+    eigen_stream_ = new Eigen::CudaStreamDevice(&stream_);
+    eigen_device_ = new Eigen::GpuDevice(eigen_stream_);
+  }
+
+  void Wait() {
+    paddle::platform::throw_on_error(cudaStreamSynchronize(stream_),
+                                     "cudaStreamSynchronize failed");
+  }
+
+  cudaStream_t stream() { return stream_; }
+
+  Eigen::GpuDevice eigen_device() { return *eigen_device_; }
+
+  cublasHandle_t cublas_handle() {
+    if (!blas_handle_) {
+      GPUPlaceGuard guard(gpu_place_);
+      PADDLE_ENFORCE(paddle::platform::dynload::cublasCreate(&blas_handle_) ==
+                         CUBLAS_STATUS_SUCCESS,
+                     "cublasCreate failed");
+      PADDLE_ENFORCE(paddle::platform::dynload::cublasSetStream(
+                         blas_handle_, stream_) == CUBLAS_STATUS_SUCCESS,
+                     "cublasSetStream failed");
+    }
+    return blas_handle_;
+  }
+
+  cudnnHandle_t cudnn_handle() {
+    if (!dnn_handle_) {
+      GPUPlaceGuard guard(gpu_place_);
+      PADDLE_ENFORCE(paddle::platform::dynload::cudnnCreate(&dnn_handle_) ==
+                         CUDNN_STATUS_SUCCESS,
+                     "cudnnCreate failed");
+      PADDLE_ENFORCE(paddle::platform::dynload::cudnnSetStream(
+                         dnn_handle_, stream_) == CUDNN_STATUS_SUCCESS,
+                     "cudnnSetStream failed");
+    }
+    return dnn_handle_;
+  }
+
+  curandGenerator_t curand_generator() {
+    if (!rand_generator_) {
+      GPUPlaceGuard guard(gpu_place_);
+      PADDLE_ENFORCE(paddle::platform::dynload::curandCreateGenerator(
+                         &rand_generator_, CURAND_RNG_PSEUDO_DEFAULT) ==
+                         CURAND_STATUS_SUCCESS,
+                     "curandCreateGenerator failed");
+      PADDLE_ENFORCE(
+          paddle::platform::dynload::curandSetPseudoRandomGeneratorSeed(
+              rand_generator_, random_seed_) == CURAND_STATUS_SUCCESS,
+          "curandSetPseudoRandomGeneratorSeed failed");
+      PADDLE_ENFORCE(paddle::platform::dynload::curandSetStream(
+                         rand_generator_, stream_) == CURAND_STATUS_SUCCESS,
+                     "curandSetStream failed");
+    }
+    return rand_generator_;
+  }
+
+  ~CUDADeviceContext() {
+    Wait();
+    if (blas_handle_) {
+      PADDLE_ENFORCE(paddle::platform::dynload::cublasDestroy(blas_handle_) ==
+                         CUBLAS_STATUS_SUCCESS,
+                     "cublasDestroy failed");
+    }
+
+    if (dnn_handle_) {
+      PADDLE_ENFORCE(paddle::platform::dynload::cudnnDestroy(dnn_handle_) ==
+                         CUDNN_STATUS_SUCCESS,
+                     "cudnnDestroy failed");
+    }
+
+    if (rand_generator_) {
+      PADDLE_ENFORCE(paddle::platform::dynload::curandDestroyGenerator(
+                         rand_generator_) == CURAND_STATUS_SUCCESS,
+                     "curandDestroyGenerator failed");
+    }
+
+    delete eigen_stream_;
+    delete eigen_device_;
+
+    paddle::platform::throw_on_error(cudaStreamDestroy(stream_),
+                                     "cudaStreamDestroy failed");
+  }
+
+ private:
+  GPUPlace gpu_place_;
+  cudaStream_t stream_;
+
+  Eigen::CudaStreamDevice* eigen_stream_;
+  Eigen::GpuDevice* eigen_device_;
+
+  cublasHandle_t blas_handle_{nullptr};
+
+  cudnnHandle_t dnn_handle_{nullptr};
+
+  int random_seed_;
+  curandGenerator_t rand_generator_{nullptr};
+};
+#endif
+}  // namespace platform
+}  // namespace paddle
diff --git a/paddle/platform/device_context_test.cc b/paddle/platform/device_context_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..61be4a307dbf073be7dff4564183240834cc7df6
--- /dev/null
+++ b/paddle/platform/device_context_test.cc
@@ -0,0 +1,33 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/platform/device_context.h"
+#include "gtest/gtest.h"
+
+TEST(CUDADeviceContext, Init) {
+  int count = paddle::platform::GetDeviceCount();
+  for (int i = 0; i < count; i++) {
+    paddle::platform::CUDADeviceContext* device_context =
+        new paddle::platform::CUDADeviceContext(i);
+    Eigen::GpuDevice gpu_device = device_context->eigen_device();
+    ASSERT_NE(nullptr, gpu_device.stream());
+    cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
+    ASSERT_NE(nullptr, cudnn_handle);
+    cublasHandle_t cublas_handle = device_context->cublas_handle();
+    ASSERT_NE(nullptr, cublas_handle);
+    curandGenerator_t curand_handle = device_context->curand_generator();
+    ASSERT_NE(nullptr, curand_handle);
+    delete device_context;
+  }
+}
diff --git a/proto/OptimizerConfig.proto b/proto/OptimizerConfig.proto
index c698d3c2ddbf58a41ac6ee960af83a257325d1f9..2a87e293f64d3398dea2641c3ff292eceec7e154 100644
--- a/proto/OptimizerConfig.proto
+++ b/proto/OptimizerConfig.proto
@@ -78,11 +78,15 @@ enum DataType {
   repeated bytes content = 2;
 }
 
+message LrPolicyState {
+  // learninRate Policy
+  optional double learning_rate = 1 [default = 1.0];
+  optional double lr_decay_a = 2;
+  optional double lr_decay_b = 3;
+}
+
 message SGDOptimizerState {
-  // learning rate policy
-  optional double learning_rate = 101;
-  optional double lr_decay_a = 102;
-  optional double lr_decay_b = 103;
+  optional LrPolicyState lr_state = 101;
   optional double num_sample_passed = 104;
   // state
   optional TensorProto parameter = 1;
@@ -91,9 +95,7 @@ message SGDOptimizerState {
 
 message AdadeltaOptimizerState {
   // learning rate policy
-  optional double learning_rate = 101;
-  optional double lr_decay_a = 102;
-  optional double lr_decay_b = 103;
+  optional LrPolicyState lr_state = 101;
   optional double num_sample_passed = 104;
   // state
   optional TensorProto parameter = 1;
@@ -102,11 +104,9 @@ message AdadeltaOptimizerState {
   optional TensorProto update_delta = 4;
 }
 
+
 message AdagradOptimizerState {
-  // learning rate policy
-  optional double learning_rate = 101;
-  optional double lr_decay_a = 102;
-  optional double lr_decay_b = 103;
+  optional LrPolicyState lr_state = 101;
   optional double num_sample_passed = 104;
   // state
   optional TensorProto parameter = 1;
@@ -114,10 +114,7 @@ message AdagradOptimizerState {
 }
 
 message AdamOptimizerState {
-  // learning rate policy
-  optional double learning_rate = 101;
-  optional double lr_decay_a = 102;
-  optional double lr_decay_b = 103;
+  optional LrPolicyState lr_state = 101;
   optional double num_sample_passed = 104;
   // state
   optional TensorProto parameter = 1;
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 0a5dd49bb48c25f268aa273314f92c092305664a..b0524a507bacec6768424045e58bf91305de2d08 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -1253,9 +1253,9 @@ def pooling_layer(input,
 
     If stride > 0, this layer slides a window whose size is determined by stride,
     and return the pooling value of the window as the output. Thus, a long sequence
-    will be shorten. 
-    
-    The parameter stride specifies the intervals at which to apply the pooling 
+    will be shorten.
+
+    The parameter stride specifies the intervals at which to apply the pooling
     operation. Note that for sequence with sub-sequence, the default value
     of stride is -1.
 
@@ -4805,6 +4805,14 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
     So groups should be larger than 1, and the num of channels should be able
     to devided by groups.
 
+    .. math::
+       y_{si+j} = \max_k x_{gsi + sk + j}
+       g = groups
+       s = input.size / num_channels
+       0 \le i < num_channels / groups
+       0 \le j < s
+       0 \le k < groups
+
     Please refer to Paper:
       - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf
       - Multi-digit Number Recognition from Street View \
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index b77932ce5f09470329a97cc0a6273942a9155c6a..f0b6625dc3736852980c889232d17853290863ac 100755
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -1395,7 +1395,7 @@ def inputs(layers, *args):
     if len(args) != 0:
         layers.extend(args)
 
-    Inputs(* [l.name for l in layers])
+    Inputs(*[l.name for l in layers])
 
 
 def outputs(layers, *args):
@@ -1438,7 +1438,7 @@ def outputs(layers, *args):
     assert len(layers) > 0
 
     if HasInputsSet():  # input already set
-        Outputs(* [l.name for l in layers])
+        Outputs(*[l.name for l in layers])
         return  # just return outputs.
 
     if len(layers) != 1:
diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py
index e1dc4f4c30051202e8fd077087679c4fd6cbd7a0..2a631c365f27a6039021a56268a62017638c2739 100644
--- a/python/paddle/v2/dataset/wmt14.py
+++ b/python/paddle/v2/dataset/wmt14.py
@@ -32,9 +32,9 @@ MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
 # this is a small set of data for test. The original data is too large and will be add later.
 URL_TRAIN = 'http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz'
 MD5_TRAIN = '0791583d57d5beb693b9414c5b36798c'
-# this is the pretrained model, whose bleu = 26.92
+# BLEU of this trained model is 26.92
 URL_MODEL = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_14/wmt14_model.tar.gz'
-MD5_MODEL = '4ce14a26607fb8a1cc23bcdedb1895e4'
+MD5_MODEL = '0cb4a5366189b6acba876491c8724fa3'
 
 START = "<s>"
 END = "<e>"
diff --git a/python/setup.py.in b/python/setup.py.in
index eeffbfe80e3b4b5473b06f7372addd9870034e77..a422b3832f4c9c60bc5406277f9ada7032f85f51 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -34,6 +34,6 @@ setup(name='paddle',
           '': '${CMAKE_CURRENT_SOURCE_DIR}',
           # The paddle.v2.framework.proto will be generated while compiling.
           # So that package points to other directory.
-          'paddle.v2.framework.proto': '${CMAKE_BINARY_DIR}/paddle/framework'
+          'paddle.v2.framework.proto': '${PROJ_BINARY_ROOT}/paddle/framework'
       },
 )