diff --git a/.travis.yml b/.travis.yml index 44b755ee32d204c883f0d74e7ad0f78380918954..f9b4a7e08315a42a61a58d6c61c45771df962c4d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -50,6 +50,7 @@ before_install: # protobuf version. - pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker - pip install rarfile + - eval "$(GIMME_GO_VERSION=1.8.3 gimme)" - | function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; } script: diff --git a/CMakeLists.txt b/CMakeLists.txt index 79210d043648de5d493f0b998eeb885c993a6106..c2218be5efb90142ef7f16b788e141e025105771 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -127,6 +127,7 @@ endif(WITH_GPU) add_subdirectory(proto) add_subdirectory(paddle) add_subdirectory(python) +add_subdirectory(go/pserver/cclient) if(WITH_DOC) add_subdirectory(doc) diff --git a/doc/design/cluster_train/pserver_client.md b/doc/design/cluster_train/pserver_client.md index b3e4079010490b69db1de28157f0cab80cad2381..474b8c572cd92fc87e9f7f3f2b19d12cccd158de 100644 --- a/doc/design/cluster_train/pserver_client.md +++ b/doc/design/cluster_train/pserver_client.md @@ -74,14 +74,25 @@ typedef enum { typedef struct { char* name; paddle_element_type element_type; - void* content; + unsigned char* content; int content_len; } paddle_parameter, paddle_gradient; -typedef struct paddle_pserver_client paddle_pserver_client; +typedef int paddle_pserver_client; -paddle_pserver_client* paddle_new_pserver_client(); -void paddle_pserver_client_release(paddle_pserver_client* client); +/** + * @brief creates a pserver client that talks to etcd for coordination. + */ +paddle_pserver_client paddle_new_etcd_pserver_client(char* etcd_addr); + +/** + * @brief creates a pserver client given pserver addresses. + * + * @param pserver_addrs comma-separated pserver addresses. + * @param selected if current pserver client is selected to initialize all parameter servers. + */ +paddle_pserver_client paddle_new_pserver_client(char* pserver_addrs, int selected); +void paddle_pserver_client_release(paddle_pserver_client c); /** * @brief paddle_begin_init_params begins to initialize parameters on @@ -95,7 +106,7 @@ void paddle_pserver_client_release(paddle_pserver_client* client); * @return 1 if the trainer is selected to initialize parameter * servers, otherwise 0. */ -int paddle_begin_init_params(paddle_pserver_client* client); +int paddle_begin_init_params(paddle_pserver_client client); /** * @brief paddle_init_param initializes the parameter on parameter @@ -109,7 +120,7 @@ int paddle_begin_init_params(paddle_pserver_client* client); * @paddle_begin_init_param). Or simply exit the program and wait for * the cluster management system to restart the trainer. */ -int paddle_init_param(paddle_pserver_client* client, paddle_parameter param, const unsigned char* param_config_proto, int config_len); +int paddle_init_param(paddle_pserver_client client, paddle_parameter param, const unsigned char* param_config_proto, int config_len); /** * @brief paddle_finish_init_params tells parameter servers client has @@ -120,7 +131,7 @@ int paddle_init_param(paddle_pserver_client* client, paddle_parameter param, con * @paddle_begin_init_param). Or simply exit the program and wait for * the cluster management system to restart the trainer. 
*/ -int paddle_finish_init_params(paddle_pserver_client* client); +int paddle_finish_init_params(paddle_pserver_client client); /** * @brief paddle_send_grads sends gradients to parameter servers for * @@ -131,7 +142,7 @@ int paddle_finish_init_params(paddle_pserver_client* client); * @param learning_rate the learning rate for the gradients. * @return 0 if successful, otherwise -1. */ -int paddle_send_grads(paddle_pserver_client* client, const paddle_gradient* grads, int len); +int paddle_send_grads(paddle_pserver_client client, const paddle_gradient* grads, int len); /** * @brief paddle_get_params gets parameters from parameter servers. * @@ -139,13 +150,15 @@ int paddle_send_grads(paddle_pserver_client* client, const paddle_gradient* grad * paddle_get_params will block until parameters are initialized on * the parameter servers. * - * @param names the array of names of the parameters to get. - * @param dst the destination array of parameters to save to. + * @param dst the destination array of parameter pointers to save to. + * Each parameter pointer must be pre-populated with the required parameter name, + * and the parameter content must be pre-allocated to the size of the requested + * parameter on the pserver. * @param len the length of the names array and the paddle_parameter * array. * @return 0 if successful, otherwise -1. */ -int paddle_get_params(paddle_pserver_client* client, const char** names, paddle_parameter* dst, int len); +int paddle_get_params(paddle_pserver_client client, paddle_parameter** dst, int len); /** * @brief paddle_save_model indicates parameters to save the parameter * @@ -154,5 +167,5 @@ int paddle_save_model(paddle_pserver_client* client, const char* path); * @param path the path to save parameters. * @return 0 if successful, otherwise -1. */ -int paddle_save_model(paddle_pserver_client* client, const char* path); +int paddle_save_model(paddle_pserver_client client, const char* path); ``` diff --git a/doc/design/cluster_train/remote_parameter_updater.md b/doc/design/cluster_train/remote_parameter_updater.md new file mode 100644 index 0000000000000000000000000000000000000000..6e8e5938455b869e0f3367794c41250340b37f77 --- /dev/null +++ b/doc/design/cluster_train/remote_parameter_updater.md @@ -0,0 +1,21 @@ +# Design Doc: Remote Parameter Updater for Cluster Train + +For an overview of distributed training, please refer to the [distributed training design doc](README.md). In this design doc, we will discuss the parameter updater that uses the parameter server cclient ([The Client Library of Parameter Server Design Doc](pserver_client.md)) to manage and update parameters. + +## Parameter Updater + +The parameter updater is used by the trainer to manage and update parameters. There are mainly two kinds of parameter updaters: local and remote. Since this design is for cluster training, we will only discuss the remote parameter updater here. + +### Remote Parameter Updater + +The remote parameter updater manages parameters through remote parameter servers, using the client that communicates with the pservers ([The Client Library of Parameter Server Design Doc](pserver_client.md)). + +In the PaddlePaddle Python V2 API, the trainer is implemented in Python; it holds an instance of the parameter updater and calls its functions directly. In this design, we will also expose the API of RemoteParameterUpdater to Python with SWIG. + +#### Sparse Remote Parameter Updater + +Since we will only implement dense parameter management for now, the mechanism for sparse parameters will be discussed in a later stage. 
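As a rough sketch of the dense update cycle described above (illustrative only, not a settled interface: it uses the Go `pserver.Client` from this patch, and the loop bounds, parameter name, and function name are made up):

```go
package updater

import (
	"log"

	"github.com/PaddlePaddle/Paddle/go/pserver"
)

// denseUpdateCycle sketches one training run: optionally initialize
// parameters on the pservers, then repeatedly push gradients and pull
// fresh parameter values.
func denseUpdateCycle(c *pserver.Client, grads []pserver.Gradient, names []string) error {
	if c.BeginInitParams() {
		// This trainer was selected to initialize the parameter servers.
		p := pserver.Parameter{Name: "param_a", ElementType: pserver.Float32}
		if err := c.InitParam(pserver.ParameterWithConfig{Param: p}); err != nil {
			return err
		}
		if err := c.FinishInitParams(); err != nil {
			return err
		}
	}

	for pass := 0; pass < 10; pass++ {
		if err := c.SendGrads(grads); err != nil {
			return err
		}
		params, err := c.GetParams(names)
		if err != nil {
			return err
		}
		log.Printf("pass %d: received %d updated parameters", pass, len(params))
	}
	return nil
}
```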
+ +### Interface Design + +TBD diff --git a/go/cmake/golang.cmake b/go/cmake/golang.cmake index d38d06de2348821b21109f7dc708314da81111c5..7c85fb6298d02b85ea87af9b6f6e960463443b7d 100644 --- a/go/cmake/golang.cmake +++ b/go/cmake/golang.cmake @@ -17,7 +17,7 @@ function(GO_LIBRARY NAME BUILD_TYPE) endif() file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go") - file(RELATIVE_PATH rel ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}) + file(RELATIVE_PATH rel ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}) # find Paddle directory. get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) @@ -32,12 +32,14 @@ function(GO_LIBRARY NAME BUILD_TYPE) # will use the local changes in Paddle rather than checkout Paddle # in github. add_custom_target(copyPaddle - COMMAND ln -sf ${PADDLE_DIR} ${PADDLE_IN_GOPATH}) + COMMAND rm -rf ${PADDLE_IN_GOPATH}/Paddle + COMMAND ln -sf ${PADDLE_DIR} ${PADDLE_IN_GOPATH}/Paddle) add_dependencies(goGet copyPaddle) add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE} - -o "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}" + -gcflags=-shared -asmflags=-shared -installsuffix=_shared -a + -o "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}" ${CMAKE_GO_FLAGS} ${GO_SOURCE} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) diff --git a/go/cmd/master/master.go b/go/cmd/master/master.go index d1f3d7d76c438670faf6677b01e790c5ebe1f2cb..25cd1cafcdf328094a019638f37f908591f5f374 100644 --- a/go/cmd/master/master.go +++ b/go/cmd/master/master.go @@ -1,80 +1,32 @@ package main import ( - "fmt" "net" "net/http" "net/rpc" - "os" - "path/filepath" "strconv" - "strings" "time" "github.com/namsral/flag" "github.com/PaddlePaddle/Paddle/go/master" - "github.com/PaddlePaddle/recordio" ) func main() { port := flag.Int("port", 8080, "port of the master server.") - dataset := flag.String("training_dataset", "", "dataset: comma separated path to RecordIO paths, supports golb patterns.") + faultTolerance := flag.Bool("fault_tolerance", false, "enable fault tolerance (requires etcd).") taskTimeoutDur := flag.Duration("task_timout_dur", 20*time.Minute, "task timout duration.") taskTimeoutMax := flag.Int("task_timeout_max", 3, "max timtout count for each task before it being declared failed task.") chunkPerTask := flag.Int("chunk_per_task", 10, "chunk per task.") flag.Parse() - if *dataset == "" { - panic("no dataset specified.") - } - if *faultTolerance { panic("fault tolernance not implemented.") - } - - var chunks []master.Chunk - var paths []string - ss := strings.Split(*dataset, ",") - fmt.Println(ss) - for _, s := range ss { - match, err := filepath.Glob(s) - if err != nil { - panic(err) - } - paths = append(paths, match...) 
- } - - if len(paths) == 0 { - panic("no valid datset specified.") - } - - idx := 0 - for _, path := range paths { - f, err := os.Open(path) - if err != nil { - panic(err) - } - - index, err := recordio.LoadIndex(f) - if err != nil { - panic(err) - } - f.Close() - count := index.NumChunks() - for i := 0; i < count; i++ { - chunk := master.Chunk{ - Idx: idx, - Path: path, - Index: *index.ChunkIndex(i), - } - chunks = append(chunks, chunk) - } } - s := master.NewService(chunks, *chunkPerTask, *taskTimeoutDur, *taskTimeoutMax) + s := master.NewService(*chunkPerTask, *taskTimeoutDur, *taskTimeoutMax) err := rpc.Register(s) if err != nil { panic(err) diff --git a/go/pserver/internal/connection/conn.go b/go/connection/conn.go similarity index 82% rename from go/pserver/internal/connection/conn.go rename to go/connection/conn.go index 1c04f117254054741b7d45fb16462b5ce84a2aea..bc9b5f0617e35f049c3e14f0b441aca2033f9645 100644 --- a/go/pserver/internal/connection/conn.go +++ b/go/connection/conn.go @@ -2,6 +2,7 @@ package connection import ( "errors" + "log" "net/rpc" "sync" ) @@ -21,6 +22,18 @@ func New() *Conn { return c } +// Close closes the connection. +func (c *Conn) Close() error { + c.mu.Lock() + defer c.mu.Unlock() + + if c.client == nil { + return nil + } + + return c.client.Close() +} + // Connect connects the connection to a address. func (c *Conn) Connect(addr string) error { c.mu.Lock() @@ -50,12 +63,20 @@ func (c *Conn) Connect(addr string) error { c.waitConn = nil } } else { + err := client.Close() + if err != nil { + log.Println(err) + } + return errors.New("client already set from a concurrent goroutine") } return nil } +// TODO(helin): refactor Call to be able to perform given retry +// policy. + // Call make a RPC call. // // Call will be blocked until the connection to remote RPC service diff --git a/go/master/client.go b/go/master/client.go new file mode 100644 index 0000000000000000000000000000000000000000..20c66340dc28bc514b6c51583dd94830c42a41bf --- /dev/null +++ b/go/master/client.go @@ -0,0 +1,82 @@ +package master + +import ( + "log" + "time" + + "github.com/PaddlePaddle/Paddle/go/connection" +) + +// Addresser provides the address of the master server. +type Addresser interface { + Address() string +} + +// Client is the client of the master server. +type Client struct { + conn *connection.Conn +} + +// NewClient creates a new Client. +func NewClient(addr Addresser) *Client { + c := &Client{} + c.conn = connection.New() + go c.monitorMaster(addr) + return c +} + +func (c *Client) monitorMaster(addr Addresser) { + lastMaster := "" + monitor := func() { + // get the latest address of the master server, and + // connect to the new address once the address changes. + curMaster := addr.Address() + if curMaster != lastMaster { + if curMaster == "" { + err := c.conn.Close() + if err != nil { + log.Println(err) + } + } else { + err := c.conn.Connect(curMaster) + if err != nil { + log.Println(err) + + // connect to addr failed, set + // to last known addr in order + // to retry next time. + curMaster = lastMaster + } + + } + } + + lastMaster = curMaster + } + + monitor() + ticker := time.NewTicker(10 * time.Second) + for range ticker.C { + monitor() + } +} + +// SetDataset sets the dataset for the master server to dispatch. +// +// SetDataset can be called multiple times from different nodes. But +// only the first call will be honored. 
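For illustration only (the fixed-address `addresser` mirrors the helper in client_test.go further below, and the glob path is made up), a trainer would drive this client roughly as follows:

```go
package main

import (
	"log"

	"github.com/PaddlePaddle/Paddle/go/master"
)

// addresser satisfies master.Addresser with a fixed address; a real
// deployment would discover the master address, e.g. via etcd.
type addresser string

func (a addresser) Address() string { return string(a) }

func main() {
	c := master.NewClient(addresser("localhost:8080"))

	// Every trainer may call SetDataset; only the first call is honored.
	if err := c.SetDataset([]string{"/data/train-*.recordio"}); err != nil {
		log.Fatal(err)
	}

	for {
		t, err := c.GetTask()
		if err != nil {
			break // no task available right now
		}
		// ... train on the chunks in t ...
		if err := c.TaskFinished(t.ID); err != nil {
			log.Println(err)
		}
	}
}
```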
+func (c *Client) SetDataset(globPaths []string) error { + return c.conn.Call("Service.SetDataset", globPaths, nil) +} + +// GetTask gets a new task from the master server. +func (c *Client) GetTask() (Task, error) { + var t Task + err := c.conn.Call("Service.GetTask", 0, &t) + return t, err +} + +// TaskFinished tells the master server a task is finished. +func (c *Client) TaskFinished(taskID int) error { + return c.conn.Call("Service.TaskFinished", taskID, nil) +} diff --git a/go/master/client_test.go b/go/master/client_test.go new file mode 100644 index 0000000000000000000000000000000000000000..df708ad7912c07205ae0cee8d2ab1c06d65223cc --- /dev/null +++ b/go/master/client_test.go @@ -0,0 +1,120 @@ +package master_test + +import ( + "fmt" + "net" + "net/http" + "net/rpc" + "os" + "strconv" + "strings" + "testing" + "time" + + log "github.com/sirupsen/logrus" + + "github.com/PaddlePaddle/Paddle/go/master" + "github.com/PaddlePaddle/recordio" +) + +const ( + totalTask = 20 + chunkPerTask = 10 +) + +var port int + +func init() { + log.SetLevel(log.ErrorLevel) + + l, err := net.Listen("tcp", ":0") + if err != nil { + panic(err) + } + + ss := strings.Split(l.Addr().String(), ":") + p, err := strconv.Atoi(ss[len(ss)-1]) + if err != nil { + panic(err) + } + port = p + + go func(l net.Listener) { + s := master.NewService(chunkPerTask, time.Second, 1) + server := rpc.NewServer() + err := server.Register(s) + if err != nil { + panic(err) + } + + mux := http.NewServeMux() + mux.Handle(rpc.DefaultRPCPath, server) + err = http.Serve(l, mux) + if err != nil { + panic(err) + } + }(l) +} + +type addresser string + +func (a addresser) Address() string { + return string(a) +} + +func TestClientFull(t *testing.T) { + const p = "/tmp/master_client_test_0" + f, err := os.Create(p) + if err != nil { + panic(err) + } + + for i := 0; i < totalTask*chunkPerTask; i++ { + w := recordio.NewWriter(f, -1, -1) + w.Write(nil) + // call Close to force RecordIO writing a chunk. + w.Close() + } + f.Close() + + c := master.NewClient(addresser(fmt.Sprintf(":%d", port))) + c.SetDataset([]string{p}) + + checkOnePass := func(i int) { + var tasks []master.Task + for i := 0; i < totalTask; i++ { + task, err := c.GetTask() + if err != nil { + t.Fatal(i, err) + } + tasks = append(tasks, task) + } + + _, err = c.GetTask() + if err == nil { + t.Fatal(i, "should get error.") + } + + err = c.TaskFinished(tasks[0].ID) + if err != nil { + t.Fatal(err) + } + tasks = tasks[1:] + task, err := c.GetTask() + if err != nil { + t.Fatal(err) + } + tasks = append(tasks, task) + + for _, task := range tasks { + err = c.TaskFinished(task.ID) + if err != nil { + t.Fatal(i, err) + } + } + } + + for i := 0; i < 10; i++ { + checkOnePass(i) + } +} diff --git a/go/master/service.go b/go/master/service.go index ab17a62f3854c1e32d731037fcc9857260d03781..1e2a34972bb4763688d7df97d768320cd6f9e9d4 100644 --- a/go/master/service.go +++ b/go/master/service.go @@ -2,29 +2,25 @@ package master import ( "errors" - "log" + "os" + "path/filepath" "sync" "time" - "github.com/PaddlePaddle/recordio" -) + log "github.com/sirupsen/logrus" -const ( - targetTaskCount = 300 -) - -// errors -var ( - ErrNoMoreTask = errors.New("no more task for current pass") - ErrPendingTaskNotFound = errors.New("pending task not found") + "github.com/PaddlePaddle/recordio" ) // Service is the master server service. 
type Service struct { - timeoutDur time.Duration - timeoutMax int + chunksPerTask int + timeoutDur time.Duration + timeoutMax int + ready chan struct{} mu sync.Mutex + initDone bool taskQueues taskQueues } @@ -55,7 +51,6 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry { if len(cur.Task.Chunks) > 0 { cur.Task.ID = id - id++ result = append(result, cur) } @@ -63,21 +58,21 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry { } // NewService creates a new service. -func NewService(chunks []Chunk, chunksPerTask int, timeoutDur time.Duration, timeoutMax int) *Service { +func NewService(chunksPerTask int, timeoutDur time.Duration, timeoutMax int) *Service { s := &Service{} + s.chunksPerTask = chunksPerTask s.timeoutDur = timeoutDur s.timeoutMax = timeoutMax s.taskQueues = taskQueues{} s.taskQueues.Pending = make(map[int]taskEntry) - s.taskQueues.Todo = partition(chunks, chunksPerTask) + s.ready = make(chan struct{}) return s } // Chunk is a chunk of data consisted of several data instances. type Chunk struct { - Idx int // index of the chunk within the file Path string - Index recordio.Index // block index + Index recordio.Index // chunk index } // Task is the basic unit of data instances assigned to trainers. @@ -105,74 +100,205 @@ func (s *Service) snapshot() error { return nil } -// GetTask gets a new task from the service. -func (s *Service) GetTask(dummy int, task *Task) error { +func readChunks(globPaths []string) ([]Chunk, error) { + var chunks []Chunk + var paths []string + + for _, s := range globPaths { + match, err := filepath.Glob(s) + if err != nil { + return nil, err + } + paths = append(paths, match...) + } + + if len(paths) == 0 { + return nil, errors.New("no valid dataset specified") + } + + for _, path := range paths { + f, err := os.Open(path) + if err != nil { + return nil, err + } + + index, err := recordio.LoadIndex(f) + if err != nil { + return nil, err + } + err = f.Close() + if err != nil { + return nil, err + } + + count := index.NumChunks() + for i := 0; i < count; i++ { + chunk := Chunk{ + Path: path, + Index: *index.ChunkIndex(i), + } + chunks = append(chunks, chunk) + } + } + + return chunks, nil +} + +// SetDataset sets the dataset to dispatch for the master server. +// +// SetDataset can be called multiple times. But only the first call will +// be honored. +func (s *Service) SetDataset(globPaths []string, dummy *int) error { + if len(globPaths) == 0 { + return errors.New("no dataset specified") + } + + s.mu.Lock() + defer s.mu.Unlock() + if s.initDone { + // Already initialized. All trainers will call + // SetDataset, but we only handle the first one. Treat + // other calls as successful but do nothing. 
+ return nil + } - if len(s.taskQueues.Todo) == 0 { - return ErrNoMoreTask + chunks, err := readChunks(globPaths) + if err != nil { + return err } - t := s.taskQueues.Todo[0] - t.Epoch++ - s.taskQueues.Todo = s.taskQueues.Todo[1:] - s.taskQueues.Pending[t.Task.ID] = t - err := s.snapshot() + s.taskQueues.Todo = partition(chunks, s.chunksPerTask) + + err = s.snapshot() if err != nil { + log.Errorln(err) return err } - time.AfterFunc(s.timeoutDur, func(taskID int, epoch int) func() { - return func() { - s.mu.Lock() - defer s.mu.Unlock() + close(s.ready) + s.initDone = true + return nil +} - t, ok := s.taskQueues.Pending[taskID] - if !ok { - return - } +func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() { + return func() { + s.mu.Lock() + defer s.mu.Unlock() - if t.Epoch != epoch { - // new epoch, task launched after the - // schedule of this timeout check. - return + t, ok := s.taskQueues.Pending[taskID] + if !ok { + return + } + + if t.Epoch != epoch { + // new epoch, task launched after the + // schedule of this timeout check. + return + } + + defer func() { + err := s.snapshot() + if err != nil { + log.Errorln(err) } + }() + + delete(s.taskQueues.Pending, t.Task.ID) - defer func() { - err := s.snapshot() - if err != nil { - log.Println(err) - } - }() + t.NumTimeout++ + if t.NumTimeout > s.timeoutMax { + log.Warningf("Task %v failed %d times, discard.\n", t.Task, t.NumTimeout) + s.taskQueues.Failed = append(s.taskQueues.Failed, t.Task) + return + } - delete(s.taskQueues.Pending, t.Task.ID) + log.Warningf("Task %v failed %d times, retry.\n", t.Task, t.NumTimeout) + s.taskQueues.Todo = append(s.taskQueues.Todo, t) + } +} - t.NumTimeout++ - if t.NumTimeout > s.timeoutMax { - s.taskQueues.Failed = append(s.taskQueues.Failed, t.Task) - return +// GetTask gets a new task from the service. +func (s *Service) GetTask(dummy int, task *Task) error { + select { + case <-s.ready: + } + + s.mu.Lock() + defer s.mu.Unlock() + + if len(s.taskQueues.Todo) == 0 { + if len(s.taskQueues.Done) == 0 { + if len(s.taskQueues.Pending) == 0 { + err := errors.New("all tasks failed") + log.Warningln(err) + return err } - s.taskQueues.Todo = append(s.taskQueues.Todo, t) + // TODO(helin): the client needs to retry in this + // error case. Gotcha: RPC client can't + // compare returned error with predefined + // errors like io.EOF, because the error + // instance deserialized from RPC is a + // different instance than the error defined + // in package. So we need to figure out a way + // for the client to check this error correctly. + err := errors.New("no more available task") + log.Warningln(err) + return err } - }(t.Task.ID, t.Epoch)) + s.taskQueues.Todo = s.taskQueues.Done + s.taskQueues.Done = nil + log.Infoln("No more todo tasks, but a trainer is requesting a task. Moving all done tasks to todo.") + } + + t := s.taskQueues.Todo[0] + t.Epoch++ + s.taskQueues.Todo = s.taskQueues.Todo[1:] + s.taskQueues.Pending[t.Task.ID] = t + err := s.snapshot() + if err != nil { + return err + } + + *task = t.Task + log.Infof("Task #%d dispatched\n", task.ID) + + time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.ID, t.Epoch)) return nil } // TaskFinished tell the service that a task is finished. 
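Tying back to the TODO in GetTask above: since an error deserialized from net/rpc arrives as a plain string, a client-side retry would have to match on the message for now. A sketch under that assumption (the helper name and the backoff are made up, not part of this patch):

```go
package retry

import (
	"time"

	"github.com/PaddlePaddle/Paddle/go/master"
)

// getTaskWithRetry keeps asking the master for a task while it reports
// "no more available task", backing off between attempts. Matching the
// error string is a stopgap until a proper retry protocol exists.
func getTaskWithRetry(c *master.Client, attempts int) (master.Task, error) {
	var t master.Task
	var err error
	for i := 0; i < attempts; i++ {
		t, err = c.GetTask()
		if err == nil || err.Error() != "no more available task" {
			return t, err
		}
		time.Sleep(time.Second) // back off before asking again
	}
	return t, err
}
```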
func (s *Service) TaskFinished(taskID int, dummy *int) error { + select { + case <-s.ready: + } + s.mu.Lock() defer s.mu.Unlock() + log.Infof("Task %d finished\n", taskID) + t, ok := s.taskQueues.Pending[taskID] if !ok { - return ErrPendingTaskNotFound + err := errors.New("pending task not found") + log.Warningln(err) + return err } // task finished, reset timeout t.NumTimeout = 0 s.taskQueues.Done = append(s.taskQueues.Done, t) delete(s.taskQueues.Pending, taskID) - return s.snapshot() + + if len(s.taskQueues.Pending) == 0 && len(s.taskQueues.Todo) == 0 { + log.Infoln("No more todo and pending task, start a new pass.") + s.taskQueues.Todo = append(s.taskQueues.Todo, s.taskQueues.Done...) + s.taskQueues.Done = nil + } + + err := s.snapshot() + if err != nil { + log.Errorln(err) + } + return err } diff --git a/go/pserver/cclient/CMakeLists.txt b/go/pserver/cclient/CMakeLists.txt index c017d7465611373309c6c60141fed864f5ccfb5d..7967af51ee9a94c9e40bf6403fe819ff462d9219 100644 --- a/go/pserver/cclient/CMakeLists.txt +++ b/go/pserver/cclient/CMakeLists.txt @@ -9,5 +9,15 @@ project(cxx_go C Go) include(golang) include(flags) -go_library(client STATIC) +go_library(paddle_pserver_cclient STATIC) + +if(PROJ_ROOT) + add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/trainer/libpaddle_pserver_cclient.a + COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_pserver_cclient.h ${PROJ_ROOT}/paddle/trainer/ + COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/libpaddle_pserver_cclient.a ${PROJ_ROOT}/paddle/trainer/ + WORKING_DIRECTORY ${PROJ_ROOT}/paddle + DEPENDS paddle_pserver_cclient) + add_custom_target(paddle_pserver_cclient_lib ALL DEPENDS ${PROJ_ROOT}/paddle/trainer/libpaddle_pserver_cclient.a) +endif(PROJ_ROOT) + add_subdirectory(test) diff --git a/go/pserver/cclient/cclient.go b/go/pserver/cclient/cclient.go index 0b4aa79806b72f4608230d2216d1741389913d95..4476e762dad04009833421056aa5a49efd44ddaa 100644 --- a/go/pserver/cclient/cclient.go +++ b/go/pserver/cclient/cclient.go @@ -19,21 +19,9 @@ typedef struct { int content_len; } paddle_parameter, paddle_gradient; -static inline void paddle_release_param(paddle_parameter* param) { - if (param != NULL) { - if (param->name != NULL) { - free(param->name); - } - - if (param->content != NULL) { - free(param->content); - } - - free(param); - } -} - -typedef int client; +typedef int paddle_pserver_client; +#define PSERVER_ERROR -1 +#define PSERVER_OK 0 */ import "C" @@ -48,10 +36,10 @@ import ( var nullPtr = unsafe.Pointer(uintptr(0)) var mu sync.Mutex -var handleMap = make(map[C.client]*pserver.Client) -var curHandle C.client +var handleMap = make(map[C.paddle_pserver_client]*pserver.Client) +var curHandle C.paddle_pserver_client -func add(c *pserver.Client) C.client { +func add(c *pserver.Client) C.paddle_pserver_client { mu.Lock() defer mu.Unlock() client := curHandle @@ -60,13 +48,13 @@ func add(c *pserver.Client) C.client { return client } -func get(client C.client) *pserver.Client { +func get(client C.paddle_pserver_client) *pserver.Client { mu.Lock() defer mu.Unlock() return handleMap[client] } -func remove(client C.client) *pserver.Client { +func remove(client C.paddle_pserver_client) *pserver.Client { mu.Lock() defer mu.Unlock() h := handleMap[client] @@ -100,7 +88,7 @@ func (l lister) List() []pserver.Server { } //export paddle_new_pserver_client -func paddle_new_pserver_client(addrs *C.char, selected int) C.client { +func paddle_new_pserver_client(addrs *C.char, selected int) C.paddle_pserver_client { a := C.GoString(addrs) as := strings.Split(a, ",") servers := 
make([]pserver.Server, len(as)) @@ -113,27 +101,27 @@ func paddle_new_pserver_client(addrs *C.char, selected int) C.client { } //export paddle_new_etcd_pserver_client -func paddle_new_etcd_pserver_client(etcd_addr *C.char) C.client { +func paddle_new_etcd_pserver_client(etcd_addr *C.char) C.paddle_pserver_client { // TODO(helin): fault tolerant pserver client using etcd. panic("not implemented.") } //export paddle_pserver_client_release -func paddle_pserver_client_release(client C.client) { +func paddle_pserver_client_release(client C.paddle_pserver_client) { remove(client) } //export paddle_begin_init_params -func paddle_begin_init_params(client C.client) C.int { +func paddle_begin_init_params(client C.paddle_pserver_client) C.int { c := get(client) if selected := c.BeginInitParams(); selected { return 1 } - return 0 + return C.PSERVER_OK } //export paddle_init_param -func paddle_init_param(client C.client, param C.paddle_parameter, param_config unsafe.Pointer, config_len C.int) C.int { +func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, param_config unsafe.Pointer, config_len C.int) C.int { et := pserver.ElementType(param.element_type) name := C.GoString(param.name) content := cArrayToSlice(unsafe.Pointer(param.content), int(param.content_len)) @@ -143,31 +131,41 @@ func paddle_init_param(client C.client, param C.paddle_parameter, param_config u } c := get(client) err := c.InitParam(pc) + if err != nil { + if err.Error() == pserver.AlreadyInitialized { + log.Printf("parameter %s already initialized, treat paddle_init_param as successful.\n", name) + return C.PSERVER_OK + } log.Println(err) - return -1 + return C.PSERVER_ERROR } - return 0 + return C.PSERVER_OK } //export paddle_finish_init_params -func paddle_finish_init_params(client C.client) C.int { +func paddle_finish_init_params(client C.paddle_pserver_client) C.int { c := get(client) err := c.FinishInitParams() if err != nil { + if err.Error() == pserver.AlreadyInitialized { + log.Println("parameters already initialized, treat paddle_finish_init_params as successful.") + return C.PSERVER_OK + } + log.Println(err) - return -1 + return C.PSERVER_ERROR } - return 0 + return C.PSERVER_OK } //export paddle_send_grads -func paddle_send_grads(client C.client, grads *C.paddle_gradient, total C.int) C.int { +func paddle_send_grads(client C.paddle_pserver_client, grads **C.paddle_gradient, total C.int) C.int { var gs []pserver.Gradient for i := 0; i < int(total); i++ { - grad := (*C.paddle_gradient)(unsafe.Pointer((uintptr(unsafe.Pointer(grads)) + uintptr(i)*unsafe.Sizeof(*grads)))) + grad := *(**C.paddle_gradient)(unsafe.Pointer((uintptr(unsafe.Pointer(grads)) + uintptr(i)*unsafe.Sizeof(*grads)))) et := pserver.ElementType(grad.element_type) name := C.GoString(grad.name) content := cArrayToSlice(unsafe.Pointer(grad.content), int(grad.content_len)) @@ -178,83 +176,81 @@ func paddle_send_grads(client C.client, grads *C.paddle_gradient, total C.int) C err := c.SendGrads(gs) if err != nil { log.Println(err) - return -1 + return C.PSERVER_ERROR } - return 0 + return C.PSERVER_OK } //export paddle_get_params -func paddle_get_params(client C.client, names **C.char, dst **C.paddle_parameter, total C.int) C.int { +func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter, total C.int) C.int { var ns []string for i := 0; i < int(total); i++ { - name := *(**C.char)(unsafe.Pointer((uintptr(unsafe.Pointer(names)) + uintptr(i)*unsafe.Sizeof(*names)))) - ns = append(ns, C.GoString(name)) + param := 
*(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst)))) + ns = append(ns, C.GoString(param.name)) } c := get(client) ps, err := c.GetParams(ns) if err != nil { log.Println(err) - return -1 + return C.PSERVER_ERROR } - for i := 0; i < int(total); i++ { - if i >= len(ps) { - break + if len(ps) != len(ns) { + pn := make([]string, len(ps)) + for i, p := range ps { + pn[i] = p.Name } + log.Printf("pserver returned wrong number of parameters. Requested: %s, returned: %s.\n", strings.Join(ns, ", "), strings.Join(pn, ", ")) + return C.PSERVER_ERROR + } + for i := range ps { + if ns[i] != ps[i].Name { + pn := make([]string, len(ps)) + for i, p := range ps { + pn[i] = p.Name + } + log.Printf("pserver returned wrong parameters, or not in requested order. Requested: %s, returned: %s.\n", strings.Join(ns, ", "), strings.Join(pn, ", ")) + return C.PSERVER_ERROR + } + } + + for i := 0; i < int(total); i++ { p := ps[i] param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst)))) - nameReady := false - contentAllocated := false if unsafe.Pointer(param) == nullPtr { - param = (*C.paddle_parameter)(C.calloc(1, C.size_t(unsafe.Sizeof(*param)))) + log.Println("must pre-allocate parameter.") + return C.PSERVER_ERROR } else { - if unsafe.Pointer(param.name) != nullPtr { - if n := C.GoString(param.name); n != p.Name { - log.Println("Warning: the pre-allocated parameter name does not match the parameter name, it will be freed.", n, p.Name) - C.free(unsafe.Pointer(param.name)) - } else { - nameReady = true - } - } - if unsafe.Pointer(param.content) != nullPtr { - if int(param.content_len) == len(p.Content) { - contentAllocated = true - } else { - log.Println("Warning: the pre-allocated content len does not match parameter content len, the pre-allocated content will be freed.", param.content_len, len(p.Content)) - C.free(unsafe.Pointer(param.content)) + if int(param.content_len) != len(p.Content) { + log.Printf("the pre-allocated content len does not match parameter content len. Pre-allocated len: %d, returned len: %d", param.content_len, len(p.Content)) + return C.PSERVER_ERROR } } } - if !nameReady { - param.name = C.CString(p.Name) - } - if !contentAllocated { - param.content = (*C.uchar)(C.malloc(C.size_t(len(p.Content)))) - } C.memcpy(unsafe.Pointer(param.content), unsafe.Pointer(&p.Content[0]), C.size_t(len(p.Content))) param.content_len = C.int(len(p.Content)) param.element_type = C.paddle_element_type(p.ElementType) } - return 0 + return C.PSERVER_OK } //export paddle_save_model -func paddle_save_model(client C.client, path *C.char) C.int { +func paddle_save_model(client C.paddle_pserver_client, path *C.char) C.int { p := C.GoString(path) c := get(client) err := c.Save(p) if err != nil { log.Println(err) - return -1 + return C.PSERVER_ERROR } - return 0 + return C.PSERVER_OK } func main() {} // Required but ignored diff --git a/go/pserver/cclient/test/CMakeLists.txt b/go/pserver/cclient/test/CMakeLists.txt index 16f84648c1de3a8fdb4595c00bdb7608a152ded2..1a3dd7e5e9e0ff3273fc2be67c48461797b4a6b3 100644 --- a/go/pserver/cclient/test/CMakeLists.txt +++ b/go/pserver/cclient/test/CMakeLists.txt @@ -1,11 +1,22 @@ cmake_minimum_required(VERSION 3.0) -include_directories(${CMAKE_BINARY_DIR}) - add_executable(main main.c) -add_dependencies(main client) +add_dependencies(main paddle_pserver_cclient) +add_executable(test_cclient test_cclient.c) +add_dependencies(test_cclient paddle_pserver_cclient) if(APPLE) set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security") +else() + set(CMAKE_EXE_LINKER_FLAGS "-pthread") endif() -target_link_libraries(main ${CMAKE_BINARY_DIR}/libclient.a) + +if(PROJ_ROOT) + include_directories(${CMAKE_CURRENT_BINARY_DIR}/..) + target_link_libraries(main ${CMAKE_CURRENT_BINARY_DIR}/../libpaddle_pserver_cclient.a pthread) + target_link_libraries(test_cclient ${CMAKE_CURRENT_BINARY_DIR}/../libpaddle_pserver_cclient.a pthread) +else(PROJ_ROOT) + include_directories(${CMAKE_BINARY_DIR}) + target_link_libraries(main ${CMAKE_BINARY_DIR}/libpaddle_pserver_cclient.a pthread) + target_link_libraries(test_cclient ${CMAKE_BINARY_DIR}/libpaddle_pserver_cclient.a pthread) +endif(PROJ_ROOT) diff --git a/go/pserver/cclient/test/main.c b/go/pserver/cclient/test/main.c index f75a2110b947520dfec1265e56eaf2ba7ac3b51b..07e1b86b43299f72e24bb83621847980eaaaf06e 100644 --- a/go/pserver/cclient/test/main.c +++ b/go/pserver/cclient/test/main.c @@ -1,68 +1,90 @@ #include <stdio.h> -#include "libclient.h" +#include "libpaddle_pserver_cclient.h" -void fail() { - // TODO(helin): fix: gtest using cmake is not working, using this - // hacky way for now. - printf("test failed.\n"); +// TODO(helin): Fix: gtest using cmake is not working, using this +// hacky way for now. +#define fail() \ + fprintf(stderr, "info: %s:%d: ", __FILE__, __LINE__); \ exit(-1); + +void sendGrads(paddle_pserver_client c) { + unsigned char grad_a[2000] = {2}; + unsigned char grad_b[3000] = {3}; + paddle_gradient grad1 = { + "param_a", PADDLE_ELEMENT_TYPE_FLOAT32, grad_a, 2000}; + paddle_gradient grad2 = { + "param_b", PADDLE_ELEMENT_TYPE_FLOAT32, grad_b, 3000}; + paddle_gradient* grads[2] = {&grad1, &grad2}; + if (paddle_send_grads(c, grads, 2)) { + fail(); + } +} + +void getParams(paddle_pserver_client c) { + paddle_parameter param_a; + paddle_parameter param_b; + char name_a[] = "param_a"; + char name_b[] = "param_b"; + // Must pre-allocate the parameter content before calling paddle_get_params. 
+ unsigned char content_a[2000] = {}; + unsigned char content_b[3000] = {}; + param_a.element_type = PADDLE_ELEMENT_TYPE_FLOAT32; + param_a.name = name_a; + param_a.content = content_a; + param_a.content_len = 2000; + param_b.element_type = PADDLE_ELEMENT_TYPE_FLOAT32; + param_b.name = name_b; + param_b.content = content_b; + param_b.content_len = 3000; + + paddle_parameter* params[2] = {&param_a, &param_b}; + if (paddle_get_params(c, params, 2)) { + fail(); + } } int main() { char addr[] = "localhost:3000"; - client c = paddle_new_pserver_client(addr, 1); + paddle_pserver_client c = paddle_new_pserver_client(addr, 1); retry: if (paddle_begin_init_params(c)) { paddle_parameter param; char name_a[] = "param_a"; char name_b[] = "param_b"; - unsigned char content[] = {0x00, 0x11, 0x22}; + unsigned char content_a[2000] = {1}; + unsigned char content_b[3000] = {0}; param.element_type = PADDLE_ELEMENT_TYPE_FLOAT32; param.name = name_a; - param.content = content; - param.content_len = 3; - if (paddle_init_param(c, param, NULL, 0) != 0) { + param.content = content_a; + param.content_len = 2000; + int error = paddle_init_param(c, param, NULL, 0); + if (error != 0) { goto retry; } - param.element_type = PADDLE_ELEMENT_TYPE_INT32; + + param.element_type = PADDLE_ELEMENT_TYPE_FLOAT32; param.name = name_b; - param.content = content; - param.content_len = 3; - if (paddle_init_param(c, param, NULL, 0) != 0) { + param.content = content_b; + param.content_len = 3000; + error = paddle_init_param(c, param, NULL, 0); + if (error != 0) { goto retry; } - if (paddle_finish_init_params(c) != 0) { + + error = paddle_finish_init_params(c); + if (error != 0) { goto retry; } - } else { - fail(); - } - - unsigned char content[] = {0x00, 0x11, 0x22}; - paddle_gradient grads[2] = { - {"param_a", PADDLE_ELEMENT_TYPE_INT32, content, 3}, - {"param_b", PADDLE_ELEMENT_TYPE_FLOAT32, content, 3}}; - - if (!paddle_send_grads(c, grads, 2)) { - fail(); } - paddle_parameter* params[2] = {NULL, NULL}; - char* names[] = {"param_a", "param_b"}; - if (!paddle_get_params(c, names, params, 2)) { - fail(); + int i; + for (i = 0; i < 100; i++) { + sendGrads(c); + getParams(c); } - // get parameters again by reusing the allocated parameter buffers. - if (!paddle_get_params(c, names, params, 2)) { - fail(); - } - - paddle_release_param(params[0]); - paddle_release_param(params[1]); - - if (!paddle_save_model(c, "/tmp/")) { + if (paddle_save_model(c, "/tmp/")) { fail(); } diff --git a/go/pserver/cclient/test/test_cclient.c b/go/pserver/cclient/test/test_cclient.c new file mode 100644 index 0000000000000000000000000000000000000000..0f9c2ef80114d4c5cd887117952f5b7b5d9355f6 --- /dev/null +++ b/go/pserver/cclient/test/test_cclient.c @@ -0,0 +1,117 @@ +#include <stdio.h> +#include <stdlib.h> + +#include "libpaddle_pserver_cclient.h" + +typedef float real; + +void fail() { + // TODO(helin): fix: gtest using cmake is not working, using this + // hacky way for now. 
+ printf("test failed.\n"); + exit(-1); +} + +void print_parameter(paddle_gradient* param) { + if (param == NULL) { + printf("param is NULL!!\n"); + } else { + printf("==== parameter ====\n"); + printf("name: %s\n", param->name); + printf("content_len: %d\n", param->content_len); + printf("content_type: %d\n", param->element_type); + int i; + for (i = 0; i < param->content_len / (int)sizeof(real); ++i) { + printf("%f ", ((float*)param->content)[i]); + } + printf("\n\n"); + } +} + +int main() { + char addr[] = "localhost:3000"; + paddle_pserver_client c = paddle_new_pserver_client(addr, 1); + + char* names[] = {"param_a", "param_b"}; + +retry: + printf("init parameter to pserver:\n"); + + real param_content1[] = {0.1, 0.2, 0.3}; + real param_content2[] = {0.4, 0.5, 0.6}; + paddle_parameter** params = + (paddle_parameter**)malloc(sizeof(paddle_parameter*) * 2); + params[0] = (paddle_parameter*)malloc(sizeof(paddle_parameter)); + params[0]->name = names[0]; + params[0]->content = (unsigned char*)param_content1; + params[0]->content_len = 3 * sizeof(real); + params[0]->element_type = PADDLE_ELEMENT_TYPE_FLOAT32; + + params[1] = (paddle_parameter*)malloc(sizeof(paddle_parameter)); + params[1]->name = names[1]; + params[1]->content = (unsigned char*)param_content2; + params[1]->content_len = 3 * sizeof(real); + params[1]->element_type = PADDLE_ELEMENT_TYPE_INT32; + + if (paddle_begin_init_params(c)) { + if (paddle_init_param(c, *params[0], NULL, 0) != 0) { + goto retry; + } + if (paddle_init_param(c, *params[1], NULL, 0) != 0) { + goto retry; + } + if (paddle_finish_init_params(c) != 0) { + goto retry; + } + } else { + fail(); + } + + printf("get inited parameters from pserver:\n"); + // get parameters again by reusing the allocated parameter buffers. + if (paddle_get_params(c, params, 2) != 0) { + fail(); + } + print_parameter(params[0]); + print_parameter(params[1]); + + printf("send gradient to pserver:\n"); + real gradient_content1[] = {0.01, 0.02, 0.03}; + real gradinet_content2[] = {0.04, 0.05, 0.06}; + + paddle_gradient** grads = + (paddle_gradient**)malloc(sizeof(paddle_gradient*) * 2); + grads[0] = (paddle_gradient*)malloc(sizeof(paddle_gradient)); + grads[0]->name = names[0]; + grads[0]->content = (unsigned char*)gradient_content1; + grads[0]->content_len = 3 * sizeof(real); + grads[0]->element_type = PADDLE_ELEMENT_TYPE_FLOAT32; + + grads[1] = (paddle_gradient*)malloc(sizeof(paddle_gradient)); + grads[1]->name = names[1]; + grads[1]->content = (unsigned char*)gradinet_content2; + grads[1]->content_len = 3 * sizeof(real); + grads[1]->element_type = PADDLE_ELEMENT_TYPE_INT32; + + printf("print gradient sent to pserver:\n"); + print_parameter(grads[0]); + print_parameter(grads[1]); + + if (paddle_send_grads(c, grads, 2) != 0) { + fail(); + } + + printf("get updated parameters from pserver:\n"); + // get parameters again by reusing the allocated parameter buffers. 
+ if (paddle_get_params(c, params, 2) != 0) { + fail(); + } + print_parameter(params[0]); + print_parameter(params[1]); + + if (paddle_save_model(c, "/tmp/") != 0) { + fail(); + } + + return 0; +} diff --git a/go/pserver/cclient/test/test_mnist.py b/go/pserver/cclient/test/test_mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..c3a3af55e2812fa0c965d22ddaba198f43f3c4ad --- /dev/null +++ b/go/pserver/cclient/test/test_mnist.py @@ -0,0 +1,131 @@ +import paddle.v2 as paddle +import gzip + + +def softmax_regression(img): + predict = paddle.layer.fc(input=img, + size=10, + act=paddle.activation.Softmax()) + return predict + + +def multilayer_perceptron(img): + # The first fully-connected layer + hidden1 = paddle.layer.fc(input=img, size=128, act=paddle.activation.Relu()) + # The second fully-connected layer and the corresponding activation function + hidden2 = paddle.layer.fc(input=hidden1, + size=64, + act=paddle.activation.Relu()) + # The third fully-connected layer, note that the hidden size should be 10, + # which is the number of unique digits + predict = paddle.layer.fc(input=hidden2, + size=10, + act=paddle.activation.Softmax()) + return predict + + +def convolutional_neural_network(img): + # first conv layer + conv_pool_1 = paddle.networks.simple_img_conv_pool( + input=img, + filter_size=5, + num_filters=20, + num_channel=1, + pool_size=2, + pool_stride=2, + act=paddle.activation.Tanh()) + # second conv layer + conv_pool_2 = paddle.networks.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + num_channel=20, + pool_size=2, + pool_stride=2, + act=paddle.activation.Tanh()) + # The first fully-connected layer + fc1 = paddle.layer.fc(input=conv_pool_2, + size=128, + act=paddle.activation.Tanh()) + # The softmax layer, note that the hidden size should be 10, + # which is the number of unique digits + predict = paddle.layer.fc(input=fc1, + size=10, + act=paddle.activation.Softmax()) + return predict + + +def main(): + paddle.init(use_gpu=False, trainer_count=1) + + # define network topology + images = paddle.layer.data( + name='pixel', type=paddle.data_type.dense_vector(784)) + label = paddle.layer.data( + name='label', type=paddle.data_type.integer_value(10)) + + # Here we can build the prediction network in different ways. Please + # choose one by uncommenting the corresponding line. 
+ predict = softmax_regression(images) + #predict = multilayer_perceptron(images) + #predict = convolutional_neural_network(images) + + cost = paddle.layer.classification_cost(input=predict, label=label) + parameters = paddle.parameters.create(cost) + + optimizer = paddle.optimizer.Momentum( + learning_rate=0.1 / 128.0, + momentum=0.9, + regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128)) + + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=optimizer, + is_local=False, + pserver_spec="localhost:3000") + + lists = [] + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 1000 == 0: + print "Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + + elif isinstance(event, paddle.event.EndPass): + result = trainer.test(reader=paddle.batch( + paddle.dataset.mnist.test(), batch_size=128)) + print "Test with Pass %d, Cost %f, %s\n" % ( + event.pass_id, result.cost, result.metrics) + lists.append((event.pass_id, result.cost, + result.metrics['classification_error_evaluator'])) + + trainer.train( + reader=paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=8192), + batch_size=128), + event_handler=event_handler, + num_passes=100) + + # find the best pass + best = sorted(lists, key=lambda list: float(list[1]))[0] + print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1]) + print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100) + + test_creator = paddle.dataset.mnist.test() + test_data = [] + for item in test_creator(): + test_data.append((item[0], )) + if len(test_data) == 100: + break + + # output is a softmax layer. It returns probabilities. + # Shape should be (100, 10) + probs = paddle.infer( + output_layer=predict, parameters=parameters, input=test_data) + print probs.shape + + +if __name__ == '__main__': + main() diff --git a/go/pserver/cclient/test/test_train.py b/go/pserver/cclient/test/test_train.py new file mode 100644 index 0000000000000000000000000000000000000000..3f8d5d793bdeb687c9d234005d9e2eae760cc3a7 --- /dev/null +++ b/go/pserver/cclient/test/test_train.py @@ -0,0 +1,60 @@ +import paddle.v2 as paddle +import paddle.v2.dataset.uci_housing as uci_housing + + +def main(): + # init + paddle.init(use_gpu=False, trainer_count=1) + + # network config + x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) + y_predict = paddle.layer.fc(input=x, + param_attr=paddle.attr.Param(name='w'), + size=1, + act=paddle.activation.Linear(), + bias_attr=paddle.attr.Param(name='b')) + y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1)) + cost = paddle.layer.mse_cost(input=y_predict, label=y) + + # create parameters + parameters = paddle.parameters.create(cost) + + # create optimizer + optimizer = paddle.optimizer.Momentum(momentum=0) + + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=optimizer, + is_local=False, + pserver_spec="localhost:3000") + + # event_handler to print training and testing info + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "Pass %d, Batch %d, Cost %f" % ( + event.pass_id, event.batch_id, event.cost) + + if isinstance(event, paddle.event.EndPass): + if (event.pass_id + 1) % 10 == 0: + result = trainer.test( + reader=paddle.batch( + uci_housing.test(), batch_size=2), + feeding={'x': 0, + 'y': 1}) + print "Test %d, %.2f" % (event.pass_id, 
result.cost) + + # training + trainer.train( + reader=paddle.batch( + paddle.reader.shuffle( + uci_housing.train(), buf_size=500), + batch_size=2), + feeding={'x': 0, + 'y': 1}, + event_handler=event_handler, + num_passes=30) + + +if __name__ == '__main__': + main() diff --git a/go/pserver/client.go b/go/pserver/client.go index f8bd0aa59f30ec7e2b2d318929af96135d3128ed..afe1eecd015b84684329c0e624f3753852d7a8ce 100644 --- a/go/pserver/client.go +++ b/go/pserver/client.go @@ -6,7 +6,7 @@ import ( "sort" "time" - "github.com/PaddlePaddle/Paddle/go/pserver/internal/connection" + "github.com/PaddlePaddle/Paddle/go/connection" ) // TODO(helin): add RPC call retry logic @@ -47,7 +47,7 @@ func NewClient(l Lister, pserverNum int, sel Selector) *Client { // monitorPservers monitors pserver addresses, and updates connection // when the address changes. func (c *Client) monitorPservers(l Lister, pserverNum int) { - knownServers := make([]Server, pserverNum) + lastServers := make([]Server, pserverNum) ticker := time.NewTicker(10 * time.Second) monitor := func() { curServers := make([]Server, pserverNum) @@ -56,25 +56,37 @@ func (c *Client) monitorPservers(l Lister, pserverNum int) { curServers[l.Index] = l } - for i := range knownServers { - if knownServers[i].Addr != curServers[i].Addr { - err := c.pservers[i].Connect(curServers[i].Addr) + for i := range lastServers { + if lastServers[i].Addr == curServers[i].Addr { + continue + } + + if curServers[i].Addr == "" { + err := c.pservers[i].Close() if err != nil { log.Println(err) - - // connect to addr failed, set - // to last known addr in order - // to retry next time. - curServers[i].Addr = knownServers[i].Addr } + + continue } + + err := c.pservers[i].Connect(curServers[i].Addr) + if err != nil { + log.Println(err) + + // connect to addr failed, set + // to last known addr in order + // to retry next time. + curServers[i].Addr = lastServers[i].Addr + } + } - knownServers = curServers + lastServers = curServers } monitor() - for _ = range ticker.C { + for range ticker.C { monitor() } } @@ -93,16 +105,14 @@ func (c *Client) BeginInitParams() bool { // InitParam initializes the parameter on parameter servers. func (c *Client) InitParam(paramWithConfigs ParameterWithConfig) error { - var dummy int - return c.pservers[c.partition(paramWithConfigs.Param.Name)].Call("Service.InitParam", paramWithConfigs, &dummy) + return c.pservers[c.partition(paramWithConfigs.Param.Name)].Call("Service.InitParam", paramWithConfigs, nil) } // FinishInitParams tells parameter servers client has sent all // parameters to parameter servers as initialization. 
func (c *Client) FinishInitParams() error { for _, p := range c.pservers { - var dummy int - err := p.Call("Service.FinishInitParams", dummy, &dummy) + err := p.Call("Service.FinishInitParams", 0, nil) if err != nil { return err } @@ -116,8 +126,7 @@ func (c *Client) SendGrads(grads []Gradient) error { errCh := make(chan error, len(grads)) for _, g := range grads { go func(g Gradient) { - var dummy int - err := c.pservers[c.partition(g.Name)].Call("Service.SendGrad", g, &dummy) + err := c.pservers[c.partition(g.Name)].Call("Service.SendGrad", g, nil) errCh <- err }(g) } @@ -196,8 +205,7 @@ func (c *Client) Save(path string) error { errCh := make(chan error, len(c.pservers)) for _, p := range c.pservers { - var dummy int - err := p.Call("Service.Save", path, &dummy) + err := p.Call("Service.Save", path, nil) errCh <- err } diff --git a/go/pserver/client_test.go b/go/pserver/client_test.go index a9a0948a51a31a1c7393f716e3dfc436dbf919af..d0371a26a13fac9daecacd0b6a271caa6d830651 100644 --- a/go/pserver/client_test.go +++ b/go/pserver/client_test.go @@ -117,7 +117,7 @@ func TestClientFull(t *testing.T) { for i := range params { if names[i] != params[i].Name { - t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", names[i], params[i]) + t.Fatalf("returned parameter order does not match the request: requested name: %s, returned name: %s", names[i], params[i].Name) } } } diff --git a/go/pserver/optimizer.c b/go/pserver/optimizer.c index b8da3ec9592053e3efe00e69d73a8ae259a30a2f..f16ba2cbf8e168a434fdcdb4f1e0ba1e98d91c6b 100644 --- a/go/pserver/optimizer.c +++ b/go/pserver/optimizer.c @@ -32,7 +32,13 @@ int update_SGD(void* optimizer, const void* gradient, int num_bytes) { SGD_optimizer* o = (SGD_optimizer*)optimizer; - // TODO + float* parameter = (float*)buffer; + float* grad = (float*)gradient; + + int i; + for (i = 0; i < num_bytes / sizeof(float); ++i) { + parameter[i] -= o->learning_rate * grad[i]; + } return 0; } diff --git a/go/pserver/service.go b/go/pserver/service.go index d5787b9708bb15629a6e6290ffc97ee9885bc8b8..78a2bfaf6347019333bf1c7ee6cdc04d93ab1370 100644 --- a/go/pserver/service.go +++ b/go/pserver/service.go @@ -9,8 +9,10 @@ import ( // ElementType is the type of elements of a Parameter. type ElementType int -var ErrAlreadyInitialized = errors.New("pserver already initialized") -var ErrUninitialized = errors.New("pserver not fully initialized") +const ( + AlreadyInitialized = "pserver already initialized" + Uninitialized = "pserver not fully initialized" +) // Supported element types const ( @@ -49,7 +51,7 @@ type Service struct { // NewService creates a new service. 
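The switch above from exported error values to exported message strings exists because an error that crosses net/rpc is deserialized into a new instance, so identity comparison against a predefined error fails; callers compare messages instead. A minimal sketch of the caller side (the helper name is illustrative, not part of this patch):

```go
package check

import "github.com/PaddlePaddle/Paddle/go/pserver"

// isAlreadyInitialized reports whether err is the pserver's "already
// initialized" condition. err may have been deserialized from net/rpc,
// so compare the message string rather than the error value.
func isAlreadyInitialized(err error) bool {
	return err != nil && err.Error() == pserver.AlreadyInitialized
}
```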
func NewService() *Service { - s := &Service{opt: newOptimizer(sgd, 0.01)} + s := &Service{opt: newOptimizer(sgd, 0.005)} s.paramMap = make(map[string]Parameter) s.initialized = make(chan struct{}) return s @@ -59,7 +61,7 @@ func NewService() *Service { func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) error { select { case <-s.initialized: - return ErrAlreadyInitialized + return errors.New(AlreadyInitialized) default: } @@ -80,7 +82,7 @@ func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) er func (s *Service) FinishInitParams(dummy0 int, dummy1 *int) error { select { case <-s.initialized: - return ErrAlreadyInitialized + return errors.New(AlreadyInitialized) default: } @@ -94,7 +96,7 @@ func (s *Service) SendGrad(g Gradient, dummy *int) error { select { case <-s.initialized: default: - return ErrUninitialized + return errors.New(Uninitialized) } s.mu.Lock() diff --git a/go/pserver/service_test.go b/go/pserver/service_test.go index 4c9fac4536e09013916aadb26af3a86a5a775b4f..b746d13e1ca71e697c464f84d915af029d37120c 100644 --- a/go/pserver/service_test.go +++ b/go/pserver/service_test.go @@ -15,8 +15,7 @@ func TestFull(t *testing.T) { p.Name = "param_a" p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0} p.ElementType = pserver.Int32 - var dummy int - err := s.InitParam(pserver.ParameterWithConfig{p, nil}, &dummy) + err := s.InitParam(pserver.ParameterWithConfig{Param: p, Config: nil}, nil) if err != nil { t.FailNow() } @@ -25,12 +24,12 @@ func TestFull(t *testing.T) { p1.Name = "param_b" p1.Content = []byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0} p1.ElementType = pserver.Float32 - err = s.InitParam(pserver.ParameterWithConfig{p1, nil}, &dummy) + err = s.InitParam(pserver.ParameterWithConfig{Param: p1, Config: nil}, nil) if err != nil { t.FailNow() } - err = s.FinishInitParams(0, &dummy) + err = s.FinishInitParams(0, nil) if err != nil { t.FailNow() } @@ -46,11 +45,11 @@ func TestFull(t *testing.T) { } g1, g2 := pserver.Gradient(p1), pserver.Gradient(p) - err = s.SendGrad(g1, &dummy) + err = s.SendGrad(g1, nil) if err != nil { t.FailNow() } - err = s.SendGrad(g2, &dummy) + err = s.SendGrad(g2, nil) if err != nil { t.FailNow() @@ -74,23 +73,21 @@ func TestFull(t *testing.T) { func TestMultipleInit(t *testing.T) { s := pserver.NewService() - var dummy int - err := s.FinishInitParams(0, &dummy) + err := s.FinishInitParams(0, nil) if err != nil { t.FailNow() } - err = s.FinishInitParams(0, &dummy) - if err != pserver.ErrAlreadyInitialized { + err = s.FinishInitParams(0, nil) + if err.Error() != pserver.AlreadyInitialized { t.FailNow() } } func TestUninitialized(t *testing.T) { s := pserver.NewService() - var dummy int - err := s.SendGrad(pserver.Gradient{}, &dummy) - if err != pserver.ErrUninitialized { + err := s.SendGrad(pserver.Gradient{}, nil) + if err.Error() != pserver.Uninitialized { t.FailNow() } } @@ -98,13 +95,14 @@ func TestUninitialized(t *testing.T) { func TestBlockUntilInitialized(t *testing.T) { s := pserver.NewService() ch := make(chan struct{}, 2) + errCh := make(chan error, 2) var wg sync.WaitGroup wg.Add(1) go func() { var param pserver.Parameter err := s.GetParam("param_a", ¶m) if err != nil { - t.FailNow() + errCh <- err } wg.Done() ch <- struct{}{} @@ -112,10 +110,9 @@ func TestBlockUntilInitialized(t *testing.T) { wg.Add(1) go func() { - var dummy int - err := s.Save("", &dummy) + err := s.Save("", nil) if err != nil { - t.FailNow() + errCh <- err } wg.Done() ch <- struct{}{} @@ -127,6 +124,8 @@ func 
TestBlockUntilInitialized(t *testing.T) { case <-ch: // some function returned before initialization is completed. t.FailNow() + case <-errCh: + t.FailNow() default: } @@ -134,13 +133,12 @@ func TestBlockUntilInitialized(t *testing.T) { p.Name = "param_a" p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0} p.ElementType = pserver.Int32 - var dummy int - err := s.InitParam(pserver.ParameterWithConfig{p, nil}, &dummy) + err := s.InitParam(pserver.ParameterWithConfig{Param: p, Config: nil}, nil) if err != nil { t.FailNow() } - err = s.FinishInitParams(0, &dummy) + err = s.FinishInitParams(0, nil) if err != nil { t.FailNow() } diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt index 071bc36c2ded51ba977350aeae15f6d244cea5be..c9433a38de4d005ebe229c55916401a5f82e9ef3 100644 --- a/paddle/api/CMakeLists.txt +++ b/paddle/api/CMakeLists.txt @@ -16,7 +16,7 @@ set(API_HEADER Internal.h) add_library(paddle_api STATIC ${API_SOURCES}) -add_dependencies(paddle_api gen_proto_cpp) +add_dependencies(paddle_api gen_proto_cpp paddle_pserver_cclient_lib) INCLUDE(${SWIG_USE_FILE}) INCLUDE_DIRECTORIES(${PROJ_ROOT}/paddle) @@ -45,7 +45,7 @@ SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS ) IF(APPLE) - SET(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load") + SET(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load -framework CoreFoundation -framework Security") ELSE(APPLE) SET(START_GROUP "-Xlinker -start-group") SET(END_GROUP "-Xlinker -end-group") diff --git a/paddle/api/Paddle.i b/paddle/api/Paddle.i index 068ba286c07d8854a1a7c7042224a679b50b4957..3237e73745dca58bed923b20851f0f0039a3487c 100644 --- a/paddle/api/Paddle.i +++ b/paddle/api/Paddle.i @@ -179,6 +179,7 @@ namespace std { %newobject ParameterOptimizer::needSpecialTraversal; %newobject ParameterUpdater::createLocalUpdater; %newobject ParameterUpdater::createRemoteUpdater; +%newobject ParameterUpdater::createNewRemoteUpdater; %feature("director") UpdateCallback; %feature("autodoc", 1); // To generate method stub, for code hint in ide diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h index da0f157abd68c73c45f498cf9ef2726aac67c95b..7565ea51fe3e71bf81a28e6e4b5a2bbdd085798c 100644 --- a/paddle/api/PaddleAPI.h +++ b/paddle/api/PaddleAPI.h @@ -841,6 +841,8 @@ public: static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config, int passCount, bool useSparseUpdater); + static ParameterUpdater* createNewRemoteUpdater( + OptimizationConfig* config, const std::string pserverSpec); ~ParameterUpdater(); /** diff --git a/paddle/api/ParameterUpdater.cpp b/paddle/api/ParameterUpdater.cpp index 79921ea6e787f3c0ebecaad6a9a54bac92211320..eaf8518ae2beaa93bc40ee944c984d142d2bb951 100644 --- a/paddle/api/ParameterUpdater.cpp +++ b/paddle/api/ParameterUpdater.cpp @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include "PaddleAPI.h" #include "PaddleAPIPrivate.h" +#include "paddle/trainer/NewRemoteParameterUpdater.h" #include "paddle/trainer/RemoteParameterUpdater.h" #include "paddle/trainer/ThreadParameterUpdater.h" @@ -28,6 +29,14 @@ ParameterUpdater *ParameterUpdater::createLocalUpdater( return updater; } +ParameterUpdater *ParameterUpdater::createNewRemoteUpdater( + OptimizationConfig *config, const std::string pserverSpec) { + auto updater = new ParameterUpdater(); + updater->m->updater.reset(new paddle::NewRemoteParameterUpdater( + config->m->getConfig(), pserverSpec)); + return updater; +} + ParameterUpdater *ParameterUpdater::createRemoteUpdater( OptimizationConfig *config, int passCount, bool useSparseUpdater) { auto updater = new ParameterUpdater(); diff --git a/paddle/gserver/layers/DetectionUtil.cpp b/paddle/gserver/layers/DetectionUtil.cpp new file mode 100644 index 0000000000000000000000000000000000000000..3e61adc66e60c54250e4f323452aa13045310879 --- /dev/null +++ b/paddle/gserver/layers/DetectionUtil.cpp @@ -0,0 +1,576 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "DetectionUtil.h" + +namespace paddle { + +size_t appendWithPermute(const Matrix& inMatrix, + size_t height, + size_t width, + size_t outTotalSize, + size_t outOffset, + size_t batchSize, + Matrix& outMatrix, + PermMode permMode) { + CHECK_EQ(inMatrix.useGpu(), outMatrix.useGpu()); + bool useGpu = inMatrix.useGpu(); + if (permMode == kNCHWToNHWC) { + size_t inElementCnt = inMatrix.getElementCnt(); + size_t channels = inElementCnt / (height * width * batchSize); + size_t imgSize = height * width; + for (size_t i = 0; i < batchSize; ++i) { + size_t offset = i * (outTotalSize / batchSize) + outOffset; + const MatrixPtr inTmp = Matrix::create( + const_cast(inMatrix.getData()) + i * channels * imgSize, + channels, + imgSize, + false, + useGpu); + MatrixPtr outTmp = + Matrix::create(const_cast(outMatrix.getData()) + offset, + imgSize, + channels, + false, + useGpu); + inTmp->transpose(outTmp, false); + } + return channels * imgSize; + } else { + LOG(FATAL) << "Unkown permute mode"; + } +} + +size_t decomposeWithPermute(const Matrix& inMatrix, + size_t height, + size_t width, + size_t inTotalSize, + size_t inOffset, + size_t batchSize, + Matrix& outMatrix, + PermMode permMode) { + CHECK_EQ(inMatrix.useGpu(), outMatrix.useGpu()); + bool useGpu = inMatrix.useGpu(); + if (permMode == kNHWCToNCHW) { + size_t outElementCnt = outMatrix.getElementCnt(); + size_t channels = outElementCnt / (height * width * batchSize); + size_t imgSize = height * width; + for (size_t i = 0; i < batchSize; ++i) { + size_t offset = i * (inTotalSize / batchSize) + inOffset; + const MatrixPtr inTmp = + Matrix::create(const_cast(inMatrix.getData()) + offset, + imgSize, + channels, + false, + useGpu); + MatrixPtr outTmp = Matrix::create( + const_cast(outMatrix.getData()) + i * channels * imgSize, + channels, + imgSize, + false, + useGpu); + inTmp->transpose(outTmp, false); + } + return channels * 
imgSize; + } else { + LOG(FATAL) << "Unkown permute mode"; + } +} + +real jaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2) { + if (bbox2.xMin > bbox1.xMax || bbox2.xMax < bbox1.xMin || + bbox2.yMin > bbox1.yMax || bbox2.yMax < bbox1.yMin) { + return 0.0; + } else { + real interXMin = std::max(bbox1.xMin, bbox2.xMin); + real interYMin = std::max(bbox1.yMin, bbox2.yMin); + real interXMax = std::min(bbox1.xMax, bbox2.xMax); + real interYMax = std::min(bbox1.yMax, bbox2.yMax); + + real interWidth = interXMax - interXMin; + real interHeight = interYMax - interYMin; + real interArea = interWidth * interHeight; + + real bboxArea1 = bbox1.getArea(); + real bboxArea2 = bbox2.getArea(); + + return interArea / (bboxArea1 + bboxArea2 - interArea); + } +} + +void encodeBBoxWithVar(const NormalizedBBox& priorBBox, + const vector& priorBBoxVar, + const NormalizedBBox& gtBBox, + vector& outVec) { + real priorBBoxWidth = priorBBox.getWidth(); + real priorBBoxHeight = priorBBox.getHeight(); + real priorBBoxCenterX = priorBBox.getCenterX(); + real priorBBoxCenterY = priorBBox.getCenterY(); + + real gtBBoxWidth = gtBBox.getWidth(); + real gtBBoxHeight = gtBBox.getHeight(); + real gtBBoxCenterX = gtBBox.getCenterX(); + real gtBBoxCenterY = gtBBox.getCenterY(); + + outVec.clear(); + outVec.push_back((gtBBoxCenterX - priorBBoxCenterX) / priorBBoxWidth / + priorBBoxVar[0]); + outVec.push_back((gtBBoxCenterY - priorBBoxCenterY) / priorBBoxHeight / + priorBBoxVar[1]); + outVec.push_back(std::log(std::fabs(gtBBoxWidth / priorBBoxWidth)) / + priorBBoxVar[2]); + outVec.push_back(std::log(std::fabs(gtBBoxHeight / priorBBoxHeight)) / + priorBBoxVar[3]); +} + +NormalizedBBox decodeBBoxWithVar(const NormalizedBBox& priorBBox, + const vector& priorBBoxVar, + const vector& locPredData) { + real priorBBoxWidth = priorBBox.getWidth(); + real priorBBoxHeight = priorBBox.getHeight(); + real priorBBoxCenterX = priorBBox.getCenterX(); + real priorBBoxCenterY = priorBBox.getCenterY(); + + real decodedBBoxCenterX = + priorBBoxVar[0] * locPredData[0] * priorBBoxWidth + priorBBoxCenterX; + real decodedBBoxCenterY = + priorBBoxVar[1] * locPredData[1] * priorBBoxHeight + priorBBoxCenterY; + real decodedBBoxWidth = + std::exp(priorBBoxVar[2] * locPredData[2]) * priorBBoxWidth; + real decodedBBoxHeight = + std::exp(priorBBoxVar[3] * locPredData[3]) * priorBBoxHeight; + + NormalizedBBox decodedBBox; + decodedBBox.xMin = decodedBBoxCenterX - decodedBBoxWidth / 2; + decodedBBox.yMin = decodedBBoxCenterY - decodedBBoxHeight / 2; + decodedBBox.xMax = decodedBBoxCenterX + decodedBBoxWidth / 2; + decodedBBox.yMax = decodedBBoxCenterY + decodedBBoxHeight / 2; + + return decodedBBox; +} + +void getBBoxFromPriorData(const real* priorData, + const size_t numBBoxes, + vector& bboxVec) { + size_t outOffset = bboxVec.size(); + bboxVec.resize(bboxVec.size() + numBBoxes); + for (size_t i = 0; i < numBBoxes; ++i) { + NormalizedBBox bbox; + bbox.xMin = *(priorData + i * 8); + bbox.yMin = *(priorData + i * 8 + 1); + bbox.xMax = *(priorData + i * 8 + 2); + bbox.yMax = *(priorData + i * 8 + 3); + bboxVec[outOffset + i] = bbox; + } +} + +void getBBoxVarFromPriorData(const real* priorData, + const size_t num, + vector>& varVec) { + size_t outOffset = varVec.size(); + varVec.resize(varVec.size() + num); + for (size_t i = 0; i < num; ++i) { + vector var; + var.push_back(*(priorData + i * 8 + 4)); + var.push_back(*(priorData + i * 8 + 5)); + var.push_back(*(priorData + i * 8 + 6)); + var.push_back(*(priorData + i * 8 + 7)); + 
varVec[outOffset + i] = var; + } +} + +void getBBoxFromLabelData(const real* labelData, + const size_t numBBoxes, + vector<NormalizedBBox>& bboxVec) { + size_t outOffset = bboxVec.size(); + bboxVec.resize(bboxVec.size() + numBBoxes); + for (size_t i = 0; i < numBBoxes; ++i) { + NormalizedBBox bbox; + bbox.xMin = *(labelData + i * 6 + 1); + bbox.yMin = *(labelData + i * 6 + 2); + bbox.xMax = *(labelData + i * 6 + 3); + bbox.yMax = *(labelData + i * 6 + 4); + real isDifficult = *(labelData + i * 6 + 5); + if (std::abs(isDifficult - 0.0) < 1e-6) + bbox.isDifficult = false; + else + bbox.isDifficult = true; + bboxVec[outOffset + i] = bbox; + } +} + +void getBBoxFromDetectData(const real* detectData, + const size_t numBBoxes, + vector<real>& labelVec, + vector<real>& scoreVec, + vector<NormalizedBBox>& bboxVec) { + size_t outOffset = bboxVec.size(); + labelVec.resize(outOffset + numBBoxes); + scoreVec.resize(outOffset + numBBoxes); + bboxVec.resize(outOffset + numBBoxes); + for (size_t i = 0; i < numBBoxes; ++i) { + labelVec[outOffset + i] = *(detectData + i * 7 + 1); + scoreVec[outOffset + i] = *(detectData + i * 7 + 2); + NormalizedBBox bbox; + bbox.xMin = *(detectData + i * 7 + 3); + bbox.yMin = *(detectData + i * 7 + 4); + bbox.xMax = *(detectData + i * 7 + 5); + bbox.yMax = *(detectData + i * 7 + 6); + bboxVec[outOffset + i] = bbox; + } +} + +void matchBBox(const vector<NormalizedBBox>& priorBBoxes, + const vector<NormalizedBBox>& gtBBoxes, + real overlapThreshold, + vector<int>* matchIndices, + vector<real>* matchOverlaps) { + map<size_t, map<size_t, real>> overlaps; + size_t numPriors = priorBBoxes.size(); + size_t numGTs = gtBBoxes.size(); + + matchIndices->clear(); + matchIndices->resize(numPriors, -1); + matchOverlaps->clear(); + matchOverlaps->resize(numPriors, 0.0); + + // Store the positive overlap between predictions and ground truth + for (size_t i = 0; i < numPriors; ++i) { + for (size_t j = 0; j < numGTs; ++j) { + real overlap = jaccardOverlap(priorBBoxes[i], gtBBoxes[j]); + if (overlap > 1e-6) { + (*matchOverlaps)[i] = std::max((*matchOverlaps)[i], overlap); + overlaps[i][j] = overlap; + } + } + } + // Bipartite matching + vector<int> gtPool; + for (size_t i = 0; i < numGTs; ++i) { + gtPool.push_back(i); + } + while (gtPool.size() > 0) { + // Find the most overlapped gt and corresponding predictions + int maxPriorIdx = -1; + int maxGTIdx = -1; + real maxOverlap = -1.0; + for (map<size_t, map<size_t, real>>::iterator it = overlaps.begin(); + it != overlaps.end(); + ++it) { + size_t i = it->first; + if ((*matchIndices)[i] != -1) { + // The prediction already has matched ground truth or is ignored + continue; + } + for (size_t p = 0; p < gtPool.size(); ++p) { + int j = gtPool[p]; + if (it->second.find(j) == it->second.end()) { + // No overlap between the i-th prediction and j-th ground truth + continue; + } + // Find the maximum overlapped pair + if (it->second[j] > maxOverlap) { + maxPriorIdx = (int)i; + maxGTIdx = (int)j; + maxOverlap = it->second[j]; + } + } + } + if (maxPriorIdx == -1) { + break; + } else { + (*matchIndices)[maxPriorIdx] = maxGTIdx; + (*matchOverlaps)[maxPriorIdx] = maxOverlap; + gtPool.erase(std::find(gtPool.begin(), gtPool.end(), maxGTIdx)); + } + } + + // Get the most overlapped ground truth for the rest of the prediction bboxes + for (map<size_t, map<size_t, real>>::iterator it = overlaps.begin(); + it != overlaps.end(); + ++it) { + size_t i = it->first; + if ((*matchIndices)[i] != -1) { + // The prediction already has matched ground truth or is ignored + continue; + } + int maxGTIdx = -1; + real maxOverlap = -1; + for (size_t j = 0; j < numGTs; ++j) { + if (it->second.find(j) == it->second.end()) { + // No overlap between the i-th prediction and j-th ground truth + continue; + } + // Find the maximum overlapped pair + real overlap = it->second[j]; + if (overlap > maxOverlap && overlap >= overlapThreshold) { + maxGTIdx = j; + maxOverlap = overlap; + } + } + if (maxGTIdx != -1) { + (*matchIndices)[i] = maxGTIdx; + (*matchOverlaps)[i] = maxOverlap; + } + } +} + +pair<size_t, size_t> generateMatchIndices( + const Matrix& priorValue, + const size_t numPriorBBoxes, + const Matrix& gtValue, + const int* gtStartPosPtr, + const size_t seqNum, + const vector<vector<real>>& maxConfScore, + const size_t batchSize, + const real overlapThreshold, + const real negOverlapThreshold, + const size_t negPosRatio, + vector<vector<int>>* matchIndicesVecPtr, + vector<vector<int>>* negIndicesVecPtr) { + vector<NormalizedBBox> priorBBoxes; // share same prior bboxes + getBBoxFromPriorData(priorValue.getData(), numPriorBBoxes, priorBBoxes); + size_t totalPos = 0; + size_t totalNeg = 0; + for (size_t n = 0; n < batchSize; ++n) { + vector<int> matchIndices; + vector<int> negIndices; + vector<real> matchOverlaps; + matchIndices.resize(numPriorBBoxes, -1); + matchOverlaps.resize(numPriorBBoxes, 0.0); + size_t numGTBBoxes = 0; + if (n < seqNum) numGTBBoxes = gtStartPosPtr[n + 1] - gtStartPosPtr[n]; + if (!numGTBBoxes) { + matchIndicesVecPtr->push_back(matchIndices); + negIndicesVecPtr->push_back(negIndices); + continue; + } + vector<NormalizedBBox> gtBBoxes; + getBBoxFromLabelData( + gtValue.getData() + gtStartPosPtr[n] * 6, numGTBBoxes, gtBBoxes); + + matchBBox( + priorBBoxes, gtBBoxes, overlapThreshold, &matchIndices, &matchOverlaps); + + size_t numPos = 0; + size_t numNeg = 0; + for (size_t i = 0; i < matchIndices.size(); ++i) + if (matchIndices[i] != -1) ++numPos; + totalPos += numPos; + vector<pair<real, size_t>> scoresIndices; + for (size_t i = 0; i < matchIndices.size(); ++i) + if (matchIndices[i] == -1 && matchOverlaps[i] < negOverlapThreshold) { + scoresIndices.push_back(std::make_pair(maxConfScore[n][i], i)); + ++numNeg; + } + numNeg = std::min(static_cast<size_t>(numPos * negPosRatio), numNeg); + std::sort(scoresIndices.begin(), + scoresIndices.end(), + sortScorePairDescend<size_t>); + for (size_t i = 0; i < numNeg; ++i) + negIndices.push_back(scoresIndices[i].second); + totalNeg += numNeg; + matchIndicesVecPtr->push_back(matchIndices); + negIndicesVecPtr->push_back(negIndices); + } + return std::make_pair(totalPos, totalNeg); +} + +void getMaxConfidenceScores(const real* confData, + const size_t batchSize, + const size_t numPriorBBoxes, + const size_t numClasses, + const size_t backgroundId, + vector<vector<real>>* maxConfScoreVecPtr) { + maxConfScoreVecPtr->clear(); + for (size_t i = 0; i < batchSize; ++i) { + vector<real> maxConfScore; + for (size_t j = 0; j < numPriorBBoxes; ++j) { + int offset = j * numClasses; + real maxVal = -FLT_MAX; + real maxPosVal = -FLT_MAX; + real maxScore = 0.0; + for (size_t c = 0; c < numClasses; ++c) { + maxVal = std::max(confData[offset + c], maxVal); + if (c != backgroundId) + maxPosVal = std::max(confData[offset + c], maxPosVal); + } + real sum = 0.0; + for (size_t c = 0; c < numClasses; ++c) + sum += std::exp(confData[offset + c] - maxVal); + maxScore = std::exp(maxPosVal - maxVal) / sum; + maxConfScore.push_back(maxScore); + } + confData += numPriorBBoxes * numClasses; + maxConfScoreVecPtr->push_back(maxConfScore); + } +} + +template <typename T> +bool sortScorePairDescend(const pair<real, T>& pair1, + const pair<real, T>& pair2) { + return pair1.first > pair2.first; +} + +template <> +bool sortScorePairDescend(const pair<real, NormalizedBBox>& pair1, + const pair<real, NormalizedBBox>& pair2) { + return pair1.first > pair2.first; +} + +void applyNMSFast(const vector<NormalizedBBox>& bboxes, + const real* confScoreData, + size_t classIdx, + size_t topK,
+ real confThreshold, + real nmsThreshold, + size_t numPriorBBoxes, + size_t numClasses, + vector<size_t>* indices) { + vector<pair<real, size_t>> scores; + for (size_t i = 0; i < numPriorBBoxes; ++i) { + size_t confOffset = i * numClasses + classIdx; + if (confScoreData[confOffset] > confThreshold) + scores.push_back(std::make_pair(confScoreData[confOffset], i)); + } + std::stable_sort(scores.begin(), scores.end(), sortScorePairDescend<size_t>); + if (topK > 0 && topK < scores.size()) scores.resize(topK); + while (scores.size() > 0) { + const size_t idx = scores.front().second; + bool keep = true; + for (size_t i = 0; i < indices->size(); ++i) { + if (keep) { + const size_t savedIdx = (*indices)[i]; + real overlap = jaccardOverlap(bboxes[idx], bboxes[savedIdx]); + keep = overlap <= nmsThreshold; + } else { + break; + } + } + if (keep) indices->push_back(idx); + scores.erase(scores.begin()); + } +} + +size_t getDetectionIndices( + const real* confData, + const size_t numPriorBBoxes, + const size_t numClasses, + const size_t backgroundId, + const size_t batchSize, + const size_t confThreshold, + const size_t nmsTopK, + const real nmsThreshold, + const size_t keepTopK, + const vector<vector<NormalizedBBox>>& allDecodedBBoxes, + vector<map<size_t, vector<size_t>>>* allDetectionIndices) { + size_t totalKeepNum = 0; + for (size_t n = 0; n < batchSize; ++n) { + const vector<NormalizedBBox>& decodedBBoxes = allDecodedBBoxes[n]; + size_t numDetected = 0; + map<size_t, vector<size_t>> indices; + size_t confOffset = n * numPriorBBoxes * numClasses; + for (size_t c = 0; c < numClasses; ++c) { + if (c == backgroundId) continue; + applyNMSFast(decodedBBoxes, + confData + confOffset, + c, + nmsTopK, + confThreshold, + nmsThreshold, + numPriorBBoxes, + numClasses, + &(indices[c])); + numDetected += indices[c].size(); + } + if (keepTopK > 0 && numDetected > keepTopK) { + vector<pair<real, pair<size_t, size_t>>> scoreIndexPairs; + for (size_t c = 0; c < numClasses; ++c) { + const vector<size_t>& labelIndices = indices[c]; + for (size_t i = 0; i < labelIndices.size(); ++i) { + size_t idx = labelIndices[i]; + scoreIndexPairs.push_back( + std::make_pair((confData + confOffset)[idx * numClasses + c], + std::make_pair(c, idx))); + } + } + std::sort(scoreIndexPairs.begin(), + scoreIndexPairs.end(), + sortScorePairDescend<pair<size_t, size_t>>); + scoreIndexPairs.resize(keepTopK); + map<size_t, vector<size_t>> newIndices; + for (size_t i = 0; i < scoreIndexPairs.size(); ++i) { + size_t label = scoreIndexPairs[i].second.first; + size_t idx = scoreIndexPairs[i].second.second; + newIndices[label].push_back(idx); + } + allDetectionIndices->push_back(newIndices); + totalKeepNum += keepTopK; + } else { + allDetectionIndices->push_back(indices); + totalKeepNum += numDetected; + } + } + return totalKeepNum; +} + +void getDetectionOutput(const real* confData, + const size_t numKept, + const size_t numPriorBBoxes, + const size_t numClasses, + const size_t batchSize, + const vector<map<size_t, vector<size_t>>>& allIndices, + const vector<vector<NormalizedBBox>>& allDecodedBBoxes, + Matrix& out) { + MatrixPtr outBuffer; + Matrix::resizeOrCreate(outBuffer, numKept, 7, false, false); + real* bufferData = outBuffer->getData(); + size_t count = 0; + for (size_t n = 0; n < batchSize; ++n) { + for (map<size_t, vector<size_t>>::const_iterator it = allIndices[n].begin(); + it != allIndices[n].end(); + ++it) { + size_t label = it->first; + const vector<size_t>& indices = it->second; + const vector<NormalizedBBox>& decodedBBoxes = allDecodedBBoxes[n]; + for (size_t i = 0; i < indices.size(); ++i) { + size_t idx = indices[i]; + size_t confOffset = n * numPriorBBoxes * numClasses + idx * numClasses; + bufferData[count * 7] = n; + bufferData[count * 7 + 1] = label; + bufferData[count * 7 + 2] = (confData + confOffset)[label]; + NormalizedBBox clippedBBox = clipBBox(decodedBBoxes[idx]); + bufferData[count * 7 + 3] = clippedBBox.xMin; + bufferData[count * 7 + 4] = clippedBBox.yMin; + bufferData[count * 7 + 5] = clippedBBox.xMax; + bufferData[count * 7 + 6] = clippedBBox.yMax; + ++count; + } + } + } + out.copyFrom(bufferData, numKept * 7); +} + +NormalizedBBox clipBBox(const NormalizedBBox& bbox) { + real realOne = static_cast<real>(1.0); + real realZero = static_cast<real>(0.0); + NormalizedBBox clippedBBox; + clippedBBox.xMin = std::max(std::min(bbox.xMin, realOne), realZero); + clippedBBox.yMin = std::max(std::min(bbox.yMin, realOne), realZero); + clippedBBox.xMax = std::max(std::min(bbox.xMax, realOne), realZero); + clippedBBox.yMax = std::max(std::min(bbox.yMax, realOne), realZero); + return clippedBBox; +} + +} // namespace paddle diff --git a/paddle/gserver/layers/DetectionUtil.h b/paddle/gserver/layers/DetectionUtil.h new file mode 100644 index 0000000000000000000000000000000000000000..fe4f9f075e4cf011c97f68f49598a828d62327b3 --- /dev/null +++ b/paddle/gserver/layers/DetectionUtil.h @@ -0,0 +1,307 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include <map> +#include <utility> +#include <vector> +#include "paddle/math/Matrix.h" + +using std::vector; +using std::pair; +using std::map; + +namespace paddle { + +template <typename T> +struct BBoxBase { + BBoxBase(T xMin, T yMin, T xMax, T yMax) + : xMin(xMin), yMin(yMin), xMax(xMax), yMax(yMax), isDifficult(false) {} + + BBoxBase() {} + + T getWidth() const { return xMax - xMin; } + + T getHeight() const { return yMax - yMin; } + + T getCenterX() const { return (xMin + xMax) / 2; } + + T getCenterY() const { return (yMin + yMax) / 2; } + + T getArea() const { return getWidth() * getHeight(); } + + // coordinate of bounding box + T xMin; + T yMin; + T xMax; + T yMax; + // whether difficult object (e.g. object with heavy occlusion is difficult) + bool isDifficult; +}; + +struct NormalizedBBox : BBoxBase<real> { + NormalizedBBox() : BBoxBase<real>() {} +}; + +enum PermMode { kNCHWToNHWC, kNHWCToNCHW }; + +/** + * @brief First permute input matrix then append to output matrix + */ +size_t appendWithPermute(const Matrix& inMatrix, + size_t height, + size_t width, + size_t outTotalSize, + size_t outOffset, + size_t batchSize, + Matrix& outMatrix, + PermMode permMode); + +/** + * @brief First permute input matrix then decompose to output + */ +size_t decomposeWithPermute(const Matrix& inMatrix, + size_t height, + size_t width, + size_t totalSize, + size_t offset, + size_t batchSize, + Matrix& outMatrix, + PermMode permMode); + +/** + * @brief Compute Jaccard overlap between two bboxes.
+ * @param bbox1 The first bbox + * @param bbox2 The second bbox + */ +real jaccardOverlap(const NormalizedBBox& bbox1, const NormalizedBBox& bbox2); + +/** + * @brief Compute offset parameters between prior bbox and ground truth bbox, + * taking the variances of the prior bbox into account + * @param priorBBox Input prior bbox + * @param priorBBoxVar Variance parameters of prior bbox + * @param gtBBox Groundtruth bbox + * @param outVec Output vector + */ +void encodeBBoxWithVar(const NormalizedBBox& priorBBox, + const vector<real>& priorBBoxVar, + const NormalizedBBox& gtBBox, + vector<real>& outVec); + +/** + * @brief Decode prior bbox with offset parameters, + * taking the variances of the prior bbox into account + * @param priorBBox Prior bbox to be decoded + * @param priorBBoxVar Variance parameters of prior bbox + * @param locPredData Offset parameters + */ +NormalizedBBox decodeBBoxWithVar(const NormalizedBBox& priorBBox, + const vector<real>& priorBBoxVar, + const vector<real>& locPredData); + +/** + * @brief Extract bboxes from prior matrix, the layout is + * xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var ... + * @param priorData Matrix of prior value + * @param numBBoxes Number of bbox to be extracted + * @param bboxVec Append to the vector + */ +void getBBoxFromPriorData(const real* priorData, + const size_t numBBoxes, + vector<NormalizedBBox>& bboxVec); + +/** + * @brief Extract labels, scores and bboxes from detection matrix, the layout is + * imageId | label | score | xmin | ymin | xmax | ymax + * @param detectData Matrix of detection value + * @param numBBoxes Number of bbox to be extracted + * @param labelVec Label of bbox + * @param scoreVec Score of bbox + * @param bboxVec Append to the vector + */ +void getBBoxFromDetectData(const real* detectData, + const size_t numBBoxes, + vector<real>& labelVec, + vector<real>& scoreVec, + vector<NormalizedBBox>& bboxVec); + +/** + * @brief Extract variances from prior matrix, the layout is + * xmin1 | ymin1 | xmax1 | ymax1 | xmin1Var | ymin1Var | xmax1Var | ymax1Var ... + * @param priorData Matrix of prior value + * @param num Number to be extracted + * @param varVec Append to the vector + */ +void getBBoxVarFromPriorData(const real* priorData, + const size_t num, + vector<vector<real>>& varVec); + +/** + * @brief Extract bboxes from label matrix, the layout is + * class1_1 | xmin1_1 | ymin1_1 | xmax1_1 | ymax1_1 | difficult1_1 | ... + * @param labelData Matrix of label value + * @param numBBoxes Number to be extracted + * @param bboxVec Append to the vector + */ +void getBBoxFromLabelData(const real* labelData, + const size_t numBBoxes, + vector<NormalizedBBox>& bboxVec); + +/** +* @brief Match prior bbox to groundtruth bbox, the strategy is: +1. Find the most overlapped bbox pair (prior and groundtruth) +2. For the rest of the prior bboxes, find the most overlapped groundtruth bbox +* @param priorBBoxes prior bbox +* @param gtBBoxes groundtruth bbox +* @param overlapThreshold Low boundary of overlap (judge whether matched) +* @param matchIndices For each prior bbox, groundtruth bbox index if matched +otherwise -1 +* @param matchOverlaps For each prior bbox, overlap with all groundtruth bboxes +*/ +void matchBBox(const vector<NormalizedBBox>& priorBBoxes, + const vector<NormalizedBBox>& gtBBoxes, + real overlapThreshold, + vector<int>* matchIndices, + vector<real>* matchOverlaps); + +/** +* @brief Generate positive bboxes and negative bboxes, +|negative bboxes| / |positive bboxes| is at most negPosRatio +* @param priorValue Prior value +* @param numPriorBBoxes Number of prior bbox +* @param gtValue Groundtruth value +* @param gtStartPosPtr Since groundtruth value stored as sequence type, +this parameter indicates start position of each record +* @param seqNum Number of sequence +* @param maxConfScore Classification score for prior bbox, used to mine +negative examples +* @param batchSize Image number +* @param overlapThreshold Low boundary of overlap +* @param negOverlapThreshold Upper boundary of overlap (judge negative example) +* @param negPosRatio Control number of negative bboxes +* @param matchIndicesVecPtr Save indices of matched prior bbox +* @param negIndicesVecPtr Save indices of negative prior bbox +*/ +pair<size_t, size_t> generateMatchIndices( + const Matrix& priorValue, + const size_t numPriorBBoxes, + const Matrix& gtValue, + const int* gtStartPosPtr, + const size_t seqNum, + const vector<vector<real>>& maxConfScore, + const size_t batchSize, + const real overlapThreshold, + const real negOverlapThreshold, + const size_t negPosRatio, + vector<vector<int>>* matchIndicesVecPtr, + vector<vector<int>>* negIndicesVecPtr); + +/** + * @brief Get max confidence score for each prior bbox + * @param confData Confidence scores, layout is + * class1 score | class2 score | ... | classN score ... + * @param batchSize Image number + * @param numPriorBBoxes Prior bbox number + * @param numClasses Classes number + * @param backgroundId Background id + * @param maxConfScoreVecPtr Output + */ +void getMaxConfidenceScores(const real* confData, + const size_t batchSize, + const size_t numPriorBBoxes, + const size_t numClasses, + const size_t backgroundId, + vector<vector<real>>* maxConfScoreVecPtr); + +template <typename T> +bool sortScorePairDescend(const pair<real, T>& pair1, + const pair<real, T>& pair2); + +template <> +bool sortScorePairDescend(const pair<real, NormalizedBBox>& pair1, + const pair<real, NormalizedBBox>& pair2); + +/** + * @brief Do NMS for bboxes to remove duplicated bboxes + * @param bboxes BBoxes to apply NMS + * @param confScoreData Confidence scores + * @param classIdx Class to do NMS + * @param topK Number to keep + * @param confThreshold Low boundary of confidence score + * @param nmsThreshold Threshold of overlap + * @param numPriorBBoxes Total number of prior bboxes + * @param numClasses Total class number + * @param indices Indices of high quality bboxes + */ +void applyNMSFast(const vector<NormalizedBBox>& bboxes, + const real* confScoreData, + size_t classIdx, + size_t topK, + real confThreshold, + real nmsThreshold, + size_t numPriorBBoxes, + size_t numClasses, + vector<size_t>* indices); + +/** + * @brief Get detection results which satisfy the requirements + * @param numPriorBBoxes Prior bbox number + * @param numClasses Class number + * @param backgroundId Background class + * @param batchSize Image number + * @param confThreshold Threshold of class confidence + * @param nmsTopK Used in NMS operation to keep top k bbox + * @param nmsThreshold Used in NMS, threshold of overlap + * @param keepTopK How many bboxes are kept in an image + * @param allDecodedBBoxes Decoded bboxes for all images + * @param allDetectionIndices Save detection bbox indices + */ +size_t getDetectionIndices( + const real* confData, + const size_t numPriorBBoxes, + const size_t numClasses, + const size_t backgroundId, + const size_t batchSize, + const size_t confThreshold, + const size_t nmsTopK, + const real nmsThreshold, + const size_t keepTopK, + const vector<vector<NormalizedBBox>>& allDecodedBBoxes, + vector<map<size_t, vector<size_t>>>* allDetectionIndices); + +/** + * @brief Get detection results + * @param confData Confidence scores + * @param numPriorBBoxes Prior bbox number + * @param numClasses Class number + * @param batchSize Image number + * @param allIndices Indices of predicted bboxes + * @param allDecodedBBoxes BBoxes decoded + * @param out Output matrix + * image number | label | confidence score | xMin | yMin | xMax | yMax + */ +void getDetectionOutput(const real* confData, + const size_t numKept, + const size_t numPriorBBoxes, + const size_t numClasses, + const size_t batchSize, + const vector<map<size_t, vector<size_t>>>& allIndices, + const vector<vector<NormalizedBBox>>& allDecodedBBoxes, + Matrix& out); + +NormalizedBBox clipBBox(const NormalizedBBox& bbox); + +} // namespace paddle diff --git a/paddle/trainer/CMakeLists.txt b/paddle/trainer/CMakeLists.txt index 06c019f0a97757b658d1bc3405246d8f47632aad..9d246b6690134d96e9a262c6ac64d998536128a9 100644 --- a/paddle/trainer/CMakeLists.txt +++ b/paddle/trainer/CMakeLists.txt @@ -4,6 +4,7 @@ set(TRAINER_SOURCES ParameterUpdater.cpp ParamUtil.cpp RemoteParameterUpdater.cpp + NewRemoteParameterUpdater.cpp Tester.cpp Trainer.cpp TrainerInternal.cpp @@ -16,6 +17,7 @@ set(TRAINER_HEADERS ParameterUpdater.h ParamUtil.h RemoteParameterUpdater.h + NewRemoteParameterUpdater.h Tester.h TesterConfig.h Trainer.h @@ -32,7 +34,7 @@ add_style_check_target(paddle_trainer_lib add_style_check_target(paddle_trainer_lib ${TRAINER_HEADERS})
add_dependencies(paddle_trainer_lib - gen_proto_cpp) + gen_proto_cpp paddle_pserver_cclient_lib) macro(add_paddle_exe TARGET_NAME) add_executable(${TARGET_NAME} ${ARGN}) @@ -56,3 +58,10 @@ install(TARGETS paddle_trainer paddle_merge_model set_target_properties(paddle_trainer PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE) set_target_properties(paddle_merge_model PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE) + +if(APPLE) + set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security") +endif() + +target_link_libraries(paddle_trainer ${CMAKE_CURRENT_SOURCE_DIR}/libpaddle_pserver_cclient.a) +target_link_libraries(paddle_trainer_lib ${CMAKE_CURRENT_SOURCE_DIR}/libpaddle_pserver_cclient.a) diff --git a/paddle/trainer/NewRemoteParameterUpdater.cpp b/paddle/trainer/NewRemoteParameterUpdater.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f25ce2f7f06f6da0feab27da61b8e49689cbe213 --- /dev/null +++ b/paddle/trainer/NewRemoteParameterUpdater.cpp @@ -0,0 +1,86 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "NewRemoteParameterUpdater.h" +#include "Trainer.h" +#include "paddle/utils/Stat.h" + +DECLARE_int32(trainer_id); +DECLARE_string(save_dir); + +namespace paddle { +NewRemoteParameterUpdater::NewRemoteParameterUpdater( + const OptimizationConfig &config, const std::string pserverSpec) + : parameterClient_(-1), + newParameters_(nullptr), + newGradients_(nullptr), + pserverSpec_(pserverSpec) {} + +void NewRemoteParameterUpdater::init( + const std::vector<ParameterPtr> &parameters) { + ParameterUpdater::init(parameters); + + for (auto &para : parameters_) { + para->getBuf(PARAMETER_VALUE)->zeroMem(); + para->getBuf(PARAMETER_GRADIENT)->zeroMem(); + } + + // create parameter server client. + parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(), + FLAGS_trainer_id == 0); + + // init new parameter and gradient. + newParameters_ = initNewParameter(PARAMETER_VALUE); + newGradients_ = initNewParameter(PARAMETER_GRADIENT); + + // init parameters: one trainer gets the opportunity to initialize the + // parameters and send them to the parameter servers; the others get the + // initialized parameters from the parameter servers. + if (paddle_begin_init_params(parameterClient_)) { + LOG(INFO) << "paddle_begin_init_params start"; + for (int i = 0; i < parameterSize(); ++i) { + auto paramConfig = parameters_[i]->getConfig(); + std::string bytes = paramConfig.SerializeAsString(); + const char *array = bytes.data(); + int size = (int)bytes.size(); + paddle_init_param( + parameterClient_, *newParameters_[i], (void *)array, size); + } + paddle_finish_init_params(parameterClient_); + LOG(INFO) << "paddle_begin_init_params done"; + } else { + paddle_get_params(parameterClient_, newParameters_, parameterSize()); + } + + LOG(INFO) << "NewRemoteParameterUpdater initialized"; +} + +void NewRemoteParameterUpdater::updateImpl(Parameter *para) {} + +void NewRemoteParameterUpdater::finishBatch(real cost) { + // send gradients to the parameter servers. + paddle_send_grads(parameterClient_, newGradients_, parameterSize()); + // get the updated parameters from parameterClient. + paddle_get_params(parameterClient_, newParameters_, parameterSize()); + + // clear gradients after the parameter update. + for (auto &para : parameters_) { + para->getBuf(PARAMETER_GRADIENT)->zeroMem(); + } +} + +void NewRemoteParameterUpdater::startPass() {} + +bool NewRemoteParameterUpdater::finishPass() { return true; } +} diff --git a/paddle/trainer/NewRemoteParameterUpdater.h b/paddle/trainer/NewRemoteParameterUpdater.h new file mode 100644 index 0000000000000000000000000000000000000000..f735185f62b3491a63e34cfc4a2ef73dae12243e --- /dev/null +++ b/paddle/trainer/NewRemoteParameterUpdater.h @@ -0,0 +1,114 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include <functional> +#include <thread> +#include "ParameterUpdater.h" +#include "libpaddle_pserver_cclient.h" +#include "paddle/pserver/ParameterClient2.h" +#include "paddle/utils/Queue.h" +#include "paddle/utils/Util.h" + +namespace paddle { + +/** + * New remote parameter updater for dense parameters, which uses the Go + * pserver cclient. + */ +class NewRemoteParameterUpdater : public ParameterUpdater { +public: + NewRemoteParameterUpdater(const OptimizationConfig& config, + const std::string pserverSpec); + ~NewRemoteParameterUpdater() { + releaseNewParameter(newParameters_); + releaseNewParameter(newGradients_); + if (parameterClient_ >= 0) paddle_pserver_client_release(parameterClient_); + } + + /** + * initialize the internal parameter client and itself. + */ + virtual void init(const std::vector<ParameterPtr>& parameters); + /** + * @brief start batch + * + * @note one-batch training is stateful, which helps with performance + * tuning and sgd optimization if necessary. + */ + virtual PassType startBatch(int64_t batchSize) { return PASS_TRAIN; } + + /** + * send parameters to pservers and get returned parameters + * from all pservers if necessary. + */ + virtual void finishBatch(real cost); + virtual void startPass(); + virtual bool finishPass(); + +protected: + /** + * work that needs to be done after finishBatch + */ + virtual void updateImpl(Parameter* para); + +private: + int parameterSize() { return (int)parameters_.size(); } + + /** + * initialize the parameter array for the Go paddle pserver cclient. + * @param new_params + * @param type + */ + paddle_parameter** initNewParameter(ParameterType type) { + paddle_parameter** new_params = + (paddle_parameter**)malloc(sizeof(paddle_parameter*) * parameterSize()); + for (int i = 0; i < parameterSize(); ++i) { + new_params[i] = (paddle_parameter*)malloc(sizeof(paddle_parameter)); + memset(new_params[i], 0, sizeof(paddle_parameter)); + } + + for (int i = 0; i < parameterSize(); ++i) { + ParameterPtr param = parameters_[i]; + new_params[i]->element_type = PADDLE_ELEMENT_TYPE_FLOAT32; + new_params[i]->name = (char*)param->getName().c_str(); + new_params[i]->content = + (unsigned char*)(param->getBuf(type).get()->getData()); + new_params[i]->content_len = + (int)param->getBuf(type).get()->getSize() * sizeof(real); + } + return new_params; + } + + void releaseNewParameter(paddle_parameter** newParams) { + if (newParams != nullptr) { + for (int i = 0; i < parameterSize(); ++i) { + free(newParams[i]); + } + free(newParams); + } + } + +protected: + /// internal parameter client object for exchanging data with pserver + paddle_pserver_client parameterClient_; + /// the parameters for new pserver client + paddle_parameter** newParameters_; + /// the gradients for new pserver client + paddle_parameter** newGradients_; + /// the specification of parameter servers, e.g. "host1:port,host2:port" + std::string pserverSpec_; +}; + +} // namespace paddle diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index e78dc4f3b4d8501b9864ed6c7e38afa98720deb2..2825ceb255dac325a5e6259e33c649587a37e88b 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -126,6 +126,7 @@ def init_config_environment( g_config=TrainerConfig(), g_layer_map={}, g_parameter_map={}, + g_parameter_initializer_map={}, g_extended_config_funcs={}, # store command args of paddle_trainer @@ -439,22 +440,22 @@ def model_type(name): @config_class class Bias(Cfg): - def __init__( - self, - parameter_name=None, - learning_rate=None, - momentum=None, - decay_rate=None, - decay_rate_l1=None, - initial_mean=None, - initial_std=None, - initial_strategy=None, - initial_smart=None, - num_batches_regularization=None, - sparse_remote_update=None, - gradient_clipping_threshold=None, - is_static=None, - is_shared=None, ): + def __init__(self, + parameter_name=None, + learning_rate=None, + momentum=None, + decay_rate=None, + decay_rate_l1=None, + initial_mean=None, + initial_std=None, + initial_strategy=None, + initial_smart=None, + num_batches_regularization=None, + sparse_remote_update=None, + gradient_clipping_threshold=None, + is_static=None, + is_shared=None, + initializer=None): self.add_keys(locals()) @@ -465,6 +466,7 @@ class Input(Cfg): def __init__( self, input_layer_name, parameter_name=None, + initializer=None, learning_rate=None, momentum=None, decay_rate=None, @@ -521,6 +523,7 @@ class Projection(Input): initial_std=None, initial_strategy=None, initial_smart=None, + initializer=None, num_batches_regularization=None, sparse_remote_update=None, sparse_update=None, @@ -1494,7 +1497,8 @@ class LayerBase(object): gradient_clipping_threshold=bias.
gradient_clipping_threshold, is_static=bias.is_static, - is_shared=bias.is_shared, ) + is_shared=bias.is_shared, + initializer=bias.initializer) if for_self: self.config.bias_parameter_name = bias.parameter_name else: @@ -1551,7 +1555,8 @@ class LayerBase(object): format=format, is_static=input_config.is_static, is_shared=input_config.is_shared, - update_hooks=input_config.update_hooks) + update_hooks=input_config.update_hooks, + initializer=input_config.initializer) def set_layer_size(self, size): if self.config.size == 0: @@ -3236,7 +3241,8 @@ def Parameter(name, need_compact=None, is_static=None, is_shared=None, - update_hooks=None): + update_hooks=None, + initializer=None): config_assert(name not in g_parameter_map, 'Duplicated parameter name: ' + name) @@ -3324,6 +3330,11 @@ def Parameter(name, para.update_hooks.extend(update_hooks) g_parameter_map[name] = para + if initializer is not None: + config_assert( + callable(initializer), + "parameter initializer should be a callable object") + g_parameter_initializer_map[name] = initializer @config_func diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py index d1167a234caed3753c6beedfc89b01054e3688e1..4100697c9c3770f1b748ea630d5f8193167fe7fc 100644 --- a/python/paddle/trainer_config_helpers/attrs.py +++ b/python/paddle/trainer_config_helpers/attrs.py @@ -95,6 +95,10 @@ class ParameterAttribute(object): :param sparse_update: Enable sparse update for this parameter. It will enable both local and remote sparse update. :type sparse_update: bool + :param initializer: If not None, it should be a callable object which accepts + a parameter name and returns a numpy array for the initial + value of the parameter + :type initializer: callable object """ def __init__(self, @@ -109,7 +113,8 @@ learning_rate=None, momentum=None, gradient_clipping_threshold=None, - sparse_update=False): + sparse_update=False, + initializer=None): self.attr = {} if is_static: @@ -161,6 +166,8 @@ is_compatible_with(gradient_clipping_threshold, float): self.attr['gradient_clipping_threshold'] = \ gradient_clipping_threshold + if initializer is not None: + self.attr['initializer'] = initializer def set_default_parameter_name(self, name): """ diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py index 5e99d4a241b7fe2b0f9ff4ba191db4b341c4d30e..1ef2dceca910e806bddf17c95d1c345a144d9e31 100644 --- a/python/paddle/v2/optimizer.py +++ b/python/paddle/v2/optimizer.py @@ -45,7 +45,12 @@ class Optimizer(object): return swig_api.ParameterUpdater.createRemoteUpdater( self.__opt_conf__, pass_num, use_sparse_updater) - def create_updater(self, is_local, num_passes, use_sparse_updater): + def __create_new_remote_updater__(self, pserver_spec): + return swig_api.ParameterUpdater.createNewRemoteUpdater( + self.__opt_conf__, pserver_spec) + + def create_updater(self, is_local, num_passes, use_sparse_updater, + pserver_spec): """ create proper parameter_updater by configuration.
:param is_local: create local or remote parameter updater @@ -64,8 +69,12 @@ class Optimizer(object): if is_local: parameter_updater = self.__create_local_updater__() else: - parameter_updater = self.__create_remote_updater__( - num_passes, use_sparse_updater) + if pserver_spec is None: + parameter_updater = self.__create_remote_updater__( + num_passes, use_sparse_updater) + else: + parameter_updater = self.__create_new_remote_updater__( + pserver_spec) return parameter_updater diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py index 64805d0c504b876f4d1f6657fe94457534a0b278..ad20241b98302f136326ae491c6723a6c12ae284 100644 --- a/python/paddle/v2/parameters.py +++ b/python/paddle/v2/parameters.py @@ -1,6 +1,7 @@ import numpy as np import py_paddle.swig_paddle as api from paddle.proto.ParameterConfig_pb2 import ParameterConfig +import paddle.trainer.config_parser as cp import struct import tarfile import cStringIO @@ -18,8 +19,11 @@ def create(layers): """ topology = Topology(layers) pool = Parameters() + initializers = cp.g_parameter_initializer_map for param in topology.proto().parameters: pool.__append_config__(param) + if param.name in initializers: + pool[param.name] = initializers[param.name](param.name) return pool diff --git a/python/paddle/v2/tests/test_parameters.py b/python/paddle/v2/tests/test_parameters.py index ebb182caab6430862a8e4da2ae4ea6b1e72f726c..45372e7dd0ec7cbdd6a2eb5c0397ef7e74284cd0 100644 --- a/python/paddle/v2/tests/test_parameters.py +++ b/python/paddle/v2/tests/test_parameters.py @@ -11,6 +11,9 @@ except ImportError: sys.exit(0) import paddle.v2.parameters as parameters +import paddle.v2.data_type as data_type +import paddle.v2.layer as layer +from paddle.v2.attr import ParamAttr from paddle.proto.ParameterConfig_pb2 import ParameterConfig import random import cStringIO @@ -55,6 +58,25 @@ class TestParameters(unittest.TestCase): p1 = params_dup.get(name) self.assertTrue(numpy.isclose(p0, p1).all()) + def test_initializer(self): + def initializer(name): + assert name == "fc.w" + mat = numpy.ones((3, 2), dtype=numpy.float32) + mat[1, 1] = 2 + return mat + + x = layer.data(name="x", type=data_type.dense_vector(3)) + y = layer.fc(x, + size=2, + bias_attr=False, + param_attr=ParamAttr( + name="fc.w", initializer=initializer)) + params = parameters.create(y) + val = params["fc.w"] + assert val.shape == (3, 2) + expected = numpy.array([[1, 1], [1, 2], [1, 1]], numpy.float32) + assert numpy.logical_and.reduce(numpy.reshape(val == expected, 6)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py index 8fdb67cc2688a67ed815af396b214e339195c73f..f9658a8c5df9562073c8a187074a6cb3459ac5d9 100644 --- a/python/paddle/v2/trainer.py +++ b/python/paddle/v2/trainer.py @@ -49,7 +49,8 @@ class SGD(object): parameters, update_equation, extra_layers=None, - is_local=True): + is_local=True, + pserver_spec=None): if not isinstance(parameters, v2_parameters.Parameters): raise TypeError('parameters should be parameters') @@ -63,6 +64,7 @@ class SGD(object): self.__parameters__ = parameters self.__topology_in_proto__ = topology.proto() self.__is_local__ = is_local + self.__pserver_spec__ = pserver_spec self.__use_sparse_updater__ = self.__topology__.use_sparse_updater() # # In local mode, disable sparse_remote_update. 
@@ -126,7 +128,8 @@ class SGD(object): __check_train_args__(**locals()) self.__parameter_updater__ = self.__optimizer__.create_updater( - self.__is_local__, num_passes, self.__use_sparse_updater__) + self.__is_local__, num_passes, self.__use_sparse_updater__, + self.__pserver_spec__) self.__parameter_updater__.init(self.__gradient_machine__) self.__gradient_machine__.start()
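A quick usage sketch of the parameter `initializer` hook introduced in this change (mirroring the new `test_initializer` case in `test_parameters.py`; the layer and parameter names are illustrative only):

```python
import numpy

import paddle.v2.data_type as data_type
import paddle.v2.layer as layer
import paddle.v2.parameters as parameters
from paddle.v2.attr import ParamAttr


def normal_initializer(name):
    # Called once per parameter when parameters.create() runs; it must
    # return a numpy array matching the parameter's shape.
    assert name == "fc.w"
    return numpy.random.normal(0, 0.01, size=(3, 2)).astype(numpy.float32)


x = layer.data(name="x", type=data_type.dense_vector(3))
y = layer.fc(x,
             size=2,
             bias_attr=False,
             param_attr=ParamAttr(name="fc.w", initializer=normal_initializer))

params = parameters.create(y)  # "fc.w" now starts from the custom values
```

`parameters.create` consults `g_parameter_initializer_map`, which `Parameter()` fills from the `initializer` field threaded through `ParameterAttribute`, `Bias`, and `Input` above.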
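Likewise, a minimal sketch of driving the new remote updater end to end (the pserver addresses and the `mse_cost` topology are placeholders; the `SGD(..., is_local=False, pserver_spec=...)` path is what the `trainer.py` and `optimizer.py` hunks above add):

```python
import paddle.v2 as paddle

paddle.init(use_gpu=False, trainer_count=1)

# A toy regression topology; any cost layer works here.
x = paddle.layer.data(name="x", type=paddle.data_type.dense_vector(3))
y_predict = paddle.layer.fc(input=x, size=1)
y = paddle.layer.data(name="y", type=paddle.data_type.dense_vector(1))
cost = paddle.layer.mse_cost(input=y_predict, label=y)

params = paddle.parameters.create(cost)
optimizer = paddle.optimizer.Momentum(momentum=0.9, learning_rate=1e-3)

# With pserver_spec set and is_local=False, create_updater() picks
# createNewRemoteUpdater(), i.e. the NewRemoteParameterUpdater backed by
# the Go pserver cclient, instead of the old ParameterClient2 path.
trainer = paddle.trainer.SGD(cost=cost,
                             parameters=params,
                             update_equation=optimizer,
                             is_local=False,
                             pserver_spec="127.0.0.1:3000,127.0.0.1:3001")
```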