Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into improve_pruning

23b1a274 · xzl · 997cef2e · 91f82aba · 23b1a274 · 23b1a274
49 changed file
--- a/.travis.yml
+++ b/.travis.yml
@@ -50,6 +50,7 @@ before_install:
  # protobuf version.
  - pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker
  - pip install rarfile
+  - eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
  - |
    function timeout() { perl -e 'alarm shift; exec @ARGV' "$@"; }
 script:

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -127,6 +127,7 @@ endif(WITH_GPU)
 add_subdirectory(proto)
 add_subdirectory(paddle)
 add_subdirectory(python)
+add_subdirectory(go/pserver/cclient)
 if(WITH_DOC)
    add_subdirectory(doc)

--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -59,6 +59,11 @@ context_projection
 ..  autoclass:: paddle.v2.layer.context_projection
    :noindex:
+row_conv
+--------
+..  autoclass:: paddle.v2.layer.row_conv
+    :noindex:
 Image Pooling Layer
 ===================
@@ -346,6 +351,12 @@ sampling_id
 ..  autoclass:: paddle.v2.layer.sampling_id
    :noindex:
+multiplex
+---------
+..  autoclass:: paddle.v2.layer.multiplex
+    :noindex:
 Slicing and Joining Layers
 ==========================

--- a/doc/design/cluster_train/pserver_client.md
+++ b/doc/design/cluster_train/pserver_client.md
@@ -74,14 +74,25 @@ typedef enum {
 typedef struct {
  char*               name;
  paddle_element_type element_type;
-  void*               content;
+  unsigned char*      content;
  int                 content_len;
 } paddle_parameter, paddle_gradient;
-typedef struct paddle_pserver_client paddle_pserver_client;
+typedef int paddle_pserver_client;
-paddle_pserver_client* paddle_new_pserver_client();
+/**
-void paddle_pserver_client_release(paddle_pserver_client* client);
+ * @brief creates a pserver client that talks to etcd for coordination.
+ */
+paddle_pserver_client paddle_new_etcd_pserver_client(char* etcd_addr);
+/**
+ * @brief creates a pserver client given pserver addresses.
+ *
+ * @param pserver_addrs comma-separated pserver addresses.
+ * @param selected if current pserver client is selected to initialize all parameter servers.
+ */
+paddle_pserver_client paddle_new_pserver_client(char* pserver_addrs, int selected);
+void paddle_pserver_client_release(paddle_pserver_client c);
 /**
 * @brief paddle_begin_init_params begins to initialize parameters on
@@ -95,7 +106,7 @@ void paddle_pserver_client_release(paddle_pserver_client* client);
 * @return 1 if the trainer is selected to initialize parameter
 * servers, otherwise 0.
 */
-int paddle_begin_init_params(paddle_pserver_client* client);
+int paddle_begin_init_params(paddle_pserver_client client);
 /**
 * @brief paddle_init_param initializes the parameter on parameter
@@ -109,7 +120,7 @@ int paddle_begin_init_params(paddle_pserver_client* client);
 * @paddle_begin_init_param). Or simply exit the program and wait for
 * the cluster management system to restart the trainer.
 */
-int paddle_init_param(paddle_pserver_client* client, paddle_parameter param, const unsigned char* param_config_proto, int config_len);
+int paddle_init_param(paddle_pserver_client client, paddle_parameter param, const unsigned char* param_config_proto, int config_len);
 /**
 * @brief paddle_finish_init_params tells parameter servers client has
@@ -120,7 +131,7 @@ int paddle_init_param(paddle_pserver_client* client, paddle_parameter param, con
 * @paddle_begin_init_param). Or simply exit the program and wait for
 * the cluster management system to restart the trainer.
 */
-int paddle_finish_init_params(paddle_pserver_client* client);
+int paddle_finish_init_params(paddle_pserver_client client);
 /**
 * @brief paddle_send_grads sends gradients to parameter servers for
@@ -131,7 +142,7 @@ int paddle_finish_init_params(paddle_pserver_client* client);
 * @param learning_rate the learning rate for the gradients.
 * @return 0 if successful, otherwise -1.
 */
-int paddle_send_grads(paddle_pserver_client* client, const paddle_gradient* grads, int len);
+int paddle_send_grads(paddle_pserver_client client, const paddle_gradient* grads, int len);
 /**
 * @brief paddle_get_params gets parameters from parameter servers.
@@ -139,13 +150,15 @@ int paddle_send_grads(paddle_pserver_client* client, const paddle_gradient* grad
 * paddle_get_params will block until parameters are initialized on
 * the parameter servers.
 *
- * @param names the array of names of the parameters to get.
+ * @param dst the destination array of parameter pointers to save to.
- * @param dst the destination array of parameters to save to.
+ * The parameter pointer must be pre-popullated with required parameter name,
+ * and the content of parameter must be pre-allocated of the size of required
+ * parameter on pserver.
 * @param len the length of the names array and the paddle_parameter
 * array.
 * @return 0 if successful, otherwise -1.
 */
-int paddle_get_params(paddle_pserver_client* client, const char** names, paddle_parameter* dst, int len);
+int paddle_get_params(paddle_pserver_client client, paddle_parameter** dst, int len);
 /**
 * @brief paddle_save_model indicates parameters to save the parameter
@@ -154,5 +167,5 @@ int paddle_get_params(paddle_pserver_client* client, const char** names, paddle_
 * @param path the path to save parameters.
 * @return 0 if successful, otherwise -1.
 */
-int paddle_save_model(paddle_pserver_client* client, const char* path);
+int paddle_save_model(paddle_pserver_client client, const char* path);
 ```
--- a/doc/design/cluster_train/remote_parameter_updater.md
+++ b/doc/design/cluster_train/remote_parameter_updater.md
+# Design Doc: Remote Parameter Updater for Cluster Train
+For an overview of distribute training, please refer to [distributed training design doc](README.md). In this design doc, we will discuss the parameter updater that will use parameter server cclient [The Client Library of Parameter Server Design Doc](pserver_client.md) to manage and update parameters.
+## Parameter Updater
+Parameter Updater is used by trainer to manage and update parameter, there are mainly two kind of parameter updater: local and remote, since this design is for cluster train, we will only discuss remote parameter updater here.
+### Remote Parameter Updater
+Remote Parameter Updater manage parameters through remote parameter server with the client that communicate with pserver([The Client Library of Parameter Server Design Doc](pserver_client.md))
+In PaddlePaddle Python V2 API, trainer is implemented in python, and the trainer will hold a instance of parameter updater and call it's functions directly. In this design, we will also expose the api of RemoteParameterUpdater to python with swig.
+#### Sparse Remote Parameter Updater
+Since we will only implement dense parameter management new, the mechanism for sparse parameter will be discussed in next stage.
+### Interface Design
+TBD
--- a/go/cmake/golang.cmake
+++ b/go/cmake/golang.cmake
@@ -17,7 +17,7 @@ function(GO_LIBRARY NAME BUILD_TYPE)
  endif()
  file(GLOB GO_SOURCE RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.go")
-  file(RELATIVE_PATH rel ${CMAKE_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
+  file(RELATIVE_PATH rel ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR})
  # find Paddle directory.
  get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
@@ -32,12 +32,14 @@ function(GO_LIBRARY NAME BUILD_TYPE)
  # will use the local changes in Paddle rather than checkout Paddle
  # in github.
  add_custom_target(copyPaddle
-    COMMAND ln -sf ${PADDLE_DIR} ${PADDLE_IN_GOPATH})
+    COMMAND rm -rf ${PADDLE_IN_GOPATH}/Paddle
+    COMMAND ln -sf ${PADDLE_DIR} ${PADDLE_IN_GOPATH}/Paddle)
  add_dependencies(goGet copyPaddle)
  add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
    COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
-    -o "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}"
+     -gcflags=-shared -asmflags=-shared -installsuffix=_shared -a
+     -o "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}"
    ${CMAKE_GO_FLAGS} ${GO_SOURCE}
    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})

--- a/go/cmd/master/master.go
+++ b/go/cmd/master/master.go
 package main
 import (
-	"fmt"
 	"net"
 	"net/http"
 	"net/rpc"
-	"os"
-	"path/filepath"
 	"strconv"
-	"strings"
 	"time"
 	"github.com/namsral/flag"
 	"github.com/PaddlePaddle/Paddle/go/master"
-	"github.com/PaddlePaddle/recordio"
 )
 func main() {
 	port := flag.Int("port", 8080, "port of the master server.")
-	dataset := flag.String("training_dataset", "", "dataset: comma separated path to RecordIO paths, supports golb patterns.")
 	faultTolerance := flag.Bool("fault_tolerance", false, "enable fault tolerance (requires etcd).")
 	taskTimeoutDur := flag.Duration("task_timout_dur", 20*time.Minute, "task timout duration.")
 	taskTimeoutMax := flag.Int("task_timeout_max", 3, "max timtout count for each task before it being declared failed task.")
 	chunkPerTask := flag.Int("chunk_per_task", 10, "chunk per task.")
 	flag.Parse()
-	if *dataset == "" {
-		panic("no dataset specified.")
-	}
 	if *faultTolerance {
 		panic("fault tolernance not implemented.")
-	}
-	var chunks []master.Chunk
-	var paths []string
-	ss := strings.Split(*dataset, ",")
-	fmt.Println(ss)
-	for _, s := range ss {
-		match, err := filepath.Glob(s)
-		if err != nil {
-			panic(err)
-		}
-		paths = append(paths, match...)
-	}
-	if len(paths) == 0 {
-		panic("no valid datset specified.")
-	}
-	idx := 0
-	for _, path := range paths {
-		f, err := os.Open(path)
-		if err != nil {
-			panic(err)
-		}
-		index, err := recordio.LoadIndex(f)
-		if err != nil {
-			panic(err)
-		}
-		f.Close()
-		count := index.NumChunks()
-		for i := 0; i < count; i++ {
-			chunk := master.Chunk{
-				Idx:   idx,
-				Path:  path,
-				Index: *index.ChunkIndex(i),
-			}
-			chunks = append(chunks, chunk)
-		}
 	}
-	s := master.NewService(chunks, *chunkPerTask, *taskTimeoutDur, *taskTimeoutMax)
+	s := master.NewService(*chunkPerTask, *taskTimeoutDur, *taskTimeoutMax)
 	err := rpc.Register(s)
 	if err != nil {
 		panic(err)

--- a/go/pserver/internal/connection/conn.go
+++ b/go/pserver/internal/connection/conn.go
@@ -2,6 +2,7 @@ package connection
 import (
 	"errors"
+	"log"
 	"net/rpc"
 	"sync"
 )
@@ -21,6 +22,18 @@ func New() *Conn {
 	return c
 }
+// Close closes the connection.
+func (c *Conn) Close() error {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	if c.client == nil {
+		return nil
+	}
+	return c.client.Close()
+}
 // Connect connects the connection to a address.
 func (c *Conn) Connect(addr string) error {
 	c.mu.Lock()
@@ -50,12 +63,20 @@ func (c *Conn) Connect(addr string) error {
 			c.waitConn = nil
 		}
 	} else {
+		err := client.Close()
+		if err != nil {
+			log.Println(err)
+		}
 		return errors.New("client already set from a concurrent goroutine")
 	}
 	return nil
 }
+// TODO(helin): refactor Call to be able to perform given retry
+// policy.
 // Call make a RPC call.
 //
 // Call will be blocked until the connection to remote RPC service

--- a/go/master/client.go
+++ b/go/master/client.go
+package master
+import (
+	"log"
+	"time"
+	"github.com/PaddlePaddle/Paddle/go/connection"
+)
+// Addresser provide the address of the master server.
+type Addresser interface {
+	Address() string
+}
+// Client is the client of the master server.
+type Client struct {
+	conn *connection.Conn
+}
+// NewClient creates a new Client.
+func NewClient(addr Addresser) *Client {
+	c := &Client{}
+	c.conn = connection.New()
+	go c.monitorMaster(addr)
+	return c
+}
+func (c *Client) monitorMaster(addr Addresser) {
+	lastMaster := ""
+	monitor := func() {
+		// get the lastest address of the master server,
+		// connect to the new address once address changed.
+		curMaster := addr.Address()
+		if curMaster != lastMaster {
+			if curMaster == "" {
+				err := c.conn.Close()
+				if err != nil {
+					log.Println(err)
+				}
+			} else {
+				err := c.conn.Connect(curMaster)
+				if err != nil {
+					log.Println(err)
+					// connect to addr failed, set
+					// to last known addr in order
+					// to retry next time.
+					curMaster = lastMaster
+				}
+			}
+		}
+		lastMaster = curMaster
+	}
+	monitor()
+	ticker := time.NewTicker(10 * time.Second)
+	for _ = range ticker.C {
+		monitor()
+	}
+}
+// SetDataset set dataset for the master server to dispatch.
+//
+// SetDataset can be call multiple times from different nodes. But
+// only the first call will be honored.
+func (c *Client) SetDataset(globPaths []string) error {
+	return c.conn.Call("Service.SetDataset", globPaths, nil)
+}
+// GetTask gets a new task from the master server.
+func (c *Client) GetTask() (Task, error) {
+	var t Task
+	err := c.conn.Call("Service.GetTask", 0, &t)
+	return t, err
+}
+// TaskFinished tells the master server a task is finished.
+func (c *Client) TaskFinished(taskID int) error {
+	return c.conn.Call("Service.TaskFinished", taskID, nil)
+}
--- a/go/master/client_test.go
+++ b/go/master/client_test.go
+package master_test
+import (
+	"fmt"
+	"net"
+	"net/http"
+	"net/rpc"
+	"os"
+	"strconv"
+	"strings"
+	"testing"
+	"time"
+	log "github.com/sirupsen/logrus"
+	"github.com/PaddlePaddle/Paddle/go/master"
+	"github.com/PaddlePaddle/recordio"
+)
+const (
+	totalTask    = 20
+	chunkPerTask = 10
+)
+var port int
+func init() {
+	log.SetLevel(log.ErrorLevel)
+	l, err := net.Listen("tcp", ":0")
+	if err != nil {
+		panic(err)
+	}
+	ss := strings.Split(l.Addr().String(), ":")
+	p, err := strconv.Atoi(ss[len(ss)-1])
+	if err != nil {
+		panic(err)
+	}
+	port = p
+	go func(l net.Listener) {
+		s := master.NewService(chunkPerTask, time.Second, 1)
+		server := rpc.NewServer()
+		err := server.Register(s)
+		if err != nil {
+			panic(err)
+		}
+		mux := http.NewServeMux()
+		mux.Handle(rpc.DefaultRPCPath, server)
+		err = http.Serve(l, mux)
+		if err != nil {
+			panic(err)
+		}
+	}(l)
+}
+type addresser string
+func (a addresser) Address() string {
+	return string(a)
+}
+func TestClientFull(t *testing.T) {
+	const p = "/tmp/master_client_test_0"
+	f, err := os.Create(p)
+	if err != nil {
+		panic(err)
+	}
+	for i := 0; i < totalTask*chunkPerTask; i++ {
+		w := recordio.NewWriter(f, -1, -1)
+		w.Write(nil)
+		// call Close to force RecordIO writing a chunk.
+		w.Close()
+	}
+	f.Close()
+	c := master.NewClient(addresser(fmt.Sprintf(":%d", port)))
+	c.SetDataset([]string{p})
+	checkOnePass := func(i int) {
+		var tasks []master.Task
+		for i := 0; i < totalTask; i++ {
+			task, err := c.GetTask()
+			if err != nil {
+				t.Fatal(i, err)
+			}
+			tasks = append(tasks, task)
+		}
+		_, err = c.GetTask()
+		if err == nil {
+			t.Fatal(i, "should get error.")
+		}
+		err = c.TaskFinished(tasks[0].ID)
+		if err != nil {
+			t.Fatal(err)
+		}
+		tasks = tasks[1:]
+		task, err := c.GetTask()
+		if err != nil {
+			t.Fatal(err)
+		}
+		tasks = append(tasks, task)
+		for _, task := range tasks {
+			err = c.TaskFinished(task.ID)
+			if err != nil {
+				t.Fatal(i, err)
+			}
+		}
+	}
+	for i := 0; i < 10; i++ {
+		checkOnePass(i)
+	}
+}
--- a/go/master/service.go
+++ b/go/master/service.go
@@ -2,29 +2,25 @@ package master
 import (
 	"errors"
-	"log"
+	"os"
+	"path/filepath"
 	"sync"
 	"time"
-	"github.com/PaddlePaddle/recordio"
+	log "github.com/sirupsen/logrus"
-)
-const (
+	"github.com/PaddlePaddle/recordio"
-	targetTaskCount = 300
-)
-// errors
-var (
-	ErrNoMoreTask          = errors.New("no more task for current pass")
-	ErrPendingTaskNotFound = errors.New("pending task not found")
 )
 // Service is the master server service.
 type Service struct {
-	timeoutDur time.Duration
+	chunksPerTask int
-	timeoutMax int
+	timeoutDur    time.Duration
+	timeoutMax    int
+	ready         chan struct{}
 	mu         sync.Mutex
+	initDone   bool
 	taskQueues taskQueues
 }
@@ -55,7 +51,6 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
 	if len(cur.Task.Chunks) > 0 {
 		cur.Task.ID = id
-		id++
 		result = append(result, cur)
 	}
@@ -63,21 +58,21 @@ func partition(chunks []Chunk, chunksPerTask int) []taskEntry {
 }
 // NewService creates a new service.
-func NewService(chunks []Chunk, chunksPerTask int, timeoutDur time.Duration, timeoutMax int) *Service {
+func NewService(chunksPerTask int, timeoutDur time.Duration, timeoutMax int) *Service {
 	s := &Service{}
+	s.chunksPerTask = chunksPerTask
 	s.timeoutDur = timeoutDur
 	s.timeoutMax = timeoutMax
 	s.taskQueues = taskQueues{}
 	s.taskQueues.Pending = make(map[int]taskEntry)
-	s.taskQueues.Todo = partition(chunks, chunksPerTask)
+	s.ready = make(chan struct{})
 	return s
 }
 // Chunk is a chunk of data consisted of several data instances.
 type Chunk struct {
-	Idx   int // index of the chunk within the file
 	Path  string
-	Index recordio.Index // block index
+	Index recordio.Index // chunk index
 }
 // Task is the basic unit of data instances assigned to trainers.
@@ -105,74 +100,205 @@ func (s *Service) snapshot() error {
 	return nil
 }
-// GetTask gets a new task from the service.
+func readChunks(globPaths []string) ([]Chunk, error) {
-func (s *Service) GetTask(dummy int, task *Task) error {
+	var chunks []Chunk
+	var paths []string
+	for _, s := range globPaths {
+		match, err := filepath.Glob(s)
+		if err != nil {
+			return nil, err
+		}
+		paths = append(paths, match...)
+	}
+	if len(paths) == 0 {
+		return nil, errors.New("no valid dataset specified")
+	}
+	for _, path := range paths {
+		f, err := os.Open(path)
+		if err != nil {
+			return nil, err
+		}
+		index, err := recordio.LoadIndex(f)
+		if err != nil {
+			return nil, err
+		}
+		err = f.Close()
+		if err != nil {
+			return nil, err
+		}
+		count := index.NumChunks()
+		for i := 0; i < count; i++ {
+			chunk := Chunk{
+				Path:  path,
+				Index: *index.ChunkIndex(i),
+			}
+			chunks = append(chunks, chunk)
+		}
+	}
+	return chunks, nil
+}
+// SetDataset sets dataset to dispatch for the master server.
+//
+// SetDataset can be call multiple times. But only the first call will
+// be honored.
+func (s *Service) SetDataset(globPaths []string, dummy *int) error {
+	if len(globPaths) == 0 {
+		return errors.New("no dataset specified")
+	}
 	s.mu.Lock()
 	defer s.mu.Unlock()
+	if s.initDone {
+		// Already initialized. All trainer will call
+		// SetDataset, but we only handle the first one. Treat
+		// other calls as successful but do nothing.
+		return nil
+	}
-	if len(s.taskQueues.Todo) == 0 {
+	chunks, err := readChunks(globPaths)
-		return ErrNoMoreTask
+	if err != nil {
+		return err
 	}
-	t := s.taskQueues.Todo[0]
+	s.taskQueues.Todo = partition(chunks, s.chunksPerTask)
-	t.Epoch++
-	s.taskQueues.Todo = s.taskQueues.Todo[1:]
+	err = s.snapshot()
-	s.taskQueues.Pending[t.Task.ID] = t
-	err := s.snapshot()
 	if err != nil {
+		log.Errorln(err)
 		return err
 	}
-	time.AfterFunc(s.timeoutDur, func(taskID int, epoch int) func() {
+	close(s.ready)
-		return func() {
+	s.initDone = true
-			s.mu.Lock()
+	return nil
-			defer s.mu.Unlock()
+}
-			t, ok := s.taskQueues.Pending[taskID]
+func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
-			if !ok {
+	return func() {
-				return
+		s.mu.Lock()
-			}
+		defer s.mu.Unlock()
-			if t.Epoch != epoch {
+		t, ok := s.taskQueues.Pending[taskID]
-				// new epoch, task launched after the
+		if !ok {
-				// schedule of this timeout check.
+			return
-				return
+		}
+		if t.Epoch != epoch {
+			// new epoch, task launched after the
+			// schedule of this timeout check.
+			return
+		}
+		defer func() {
+			err := s.snapshot()
+			if err != nil {
+				log.Errorln(err)
 			}
+		}()
+		delete(s.taskQueues.Pending, t.Task.ID)
-			defer func() {
+		t.NumTimeout++
-				err := s.snapshot()
+		if t.NumTimeout > s.timeoutMax {
-				if err != nil {
+			log.Warningf("Task %v failed %d times, discard.\n", t.Task, t.NumTimeout)
-					log.Println(err)
+			s.taskQueues.Failed = append(s.taskQueues.Failed, t.Task)
-				}
+			return
-			}()
+		}
-			delete(s.taskQueues.Pending, t.Task.ID)
+		log.Warningf("Task %v failed %d times, retry.\n", t.Task, t.NumTimeout)
+		s.taskQueues.Todo = append(s.taskQueues.Todo, t)
+	}
+}
-			t.NumTimeout++
+// GetTask gets a new task from the service.
-			if t.NumTimeout > s.timeoutMax {
+func (s *Service) GetTask(dummy int, task *Task) error {
-				s.taskQueues.Failed = append(s.taskQueues.Failed, t.Task)
+	select {
-				return
+	case <-s.ready:
+	}
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if len(s.taskQueues.Todo) == 0 {
+		if len(s.taskQueues.Done) == 0 {
+			if len(s.taskQueues.Pending) == 0 {
+				err := errors.New("all task failed")
+				log.Warningln(err)
+				return err
 			}
-			s.taskQueues.Todo = append(s.taskQueues.Todo, t)
+			// TODO(helin): client need to retry in this
+			// error case. Gotcha: RPC client can't
+			// compare returned error with predefined
+			// errors like io.EOF, because the error
+			// instance deserialized from RPC is a
+			// different instance than the error defined
+			// in package. So we need to figure out a way
+			// for client to check this error correctly.
+			err := errors.New("no more available task")
+			log.Warningln(err)
+			return err
 		}
-	}(t.Task.ID, t.Epoch))
+		s.taskQueues.Todo = s.taskQueues.Done
+		s.taskQueues.Done = nil
+		log.Infoln("No more todo task, but trainer is requesting task to do. Move all done task to todo.")
+	}
+	t := s.taskQueues.Todo[0]
+	t.Epoch++
+	s.taskQueues.Todo = s.taskQueues.Todo[1:]
+	s.taskQueues.Pending[t.Task.ID] = t
+	err := s.snapshot()
+	if err != nil {
+		return err
+	}
+	*task = t.Task
+	log.Infof("Task #%d dispatched\n", task.ID)
+	time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.ID, t.Epoch))
 	return nil
 }
 // TaskFinished tell the service that a task is finished.
 func (s *Service) TaskFinished(taskID int, dummy *int) error {
+	select {
+	case <-s.ready:
+	}
 	s.mu.Lock()
 	defer s.mu.Unlock()
+	log.Infof("Task %d finished\n", taskID)
 	t, ok := s.taskQueues.Pending[taskID]
 	if !ok {
-		return ErrPendingTaskNotFound
+		err := errors.New("pending task not found")
+		log.Warningln(err)
+		return err
 	}
 	// task finished, reset timeout
 	t.NumTimeout = 0
 	s.taskQueues.Done = append(s.taskQueues.Done, t)
 	delete(s.taskQueues.Pending, taskID)
-	return s.snapshot()
+	if len(s.taskQueues.Pending) == 0 && len(s.taskQueues.Todo) == 0 {
+		log.Infoln("No more todo and pending task, start a new pass.")
+		s.taskQueues.Todo = append(s.taskQueues.Todo, s.taskQueues.Done...)
+		s.taskQueues.Done = nil
+	}
+	err := s.snapshot()
+	if err != nil {
+		log.Errorln(err)
+	}
+	return err
 }
--- a/go/pserver/cclient/CMakeLists.txt
+++ b/go/pserver/cclient/CMakeLists.txt
@@ -9,5 +9,15 @@ project(cxx_go C Go)
 include(golang)
 include(flags)
-go_library(client STATIC)
+go_library(paddle_pserver_cclient STATIC)
+if(PROJ_ROOT)
+  add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/trainer/libpaddle_pserver_cclient.a
+          COMMAND cp ${CMAKE_BINARY_DIR}/go/pserver/cclient/libpaddle_pserver_cclient.h ${PROJ_ROOT}/paddle/trainer/
+          COMMAND cp ${CMAKE_BINARY_DIR}/go/pserver/cclient/libpaddle_pserver_cclient.a ${PROJ_ROOT}/paddle/trainer/
+          WORKING_DIRECTORY ${PROJ_ROOT}/paddle
+          DEPENDS paddle_pserver_cclient)
+  add_custom_target(paddle_pserver_cclient_lib ALL DEPENDS ${PROJ_ROOT}/paddle/trainer/libpaddle_pserver_cclient.a)
+endif(PROJ_ROOT)
 add_subdirectory(test)
--- a/go/pserver/cclient/cclient.go
+++ b/go/pserver/cclient/cclient.go
@@ -19,21 +19,9 @@ typedef struct {
  int                 content_len;
 } paddle_parameter, paddle_gradient;
-static inline void paddle_release_param(paddle_parameter* param) {
+typedef int paddle_pserver_client;
-  if (param != NULL) {
+#define PSERVER_ERROR -1
-    if (param->name != NULL) {
+#define PSERVER_OK 0
-      free(param->name);
-    }
-    if (param->content != NULL) {
-      free(param->content);
-    }
-    free(param);
-  }
-}
-typedef int client;
 */
 import "C"
@@ -48,10 +36,10 @@ import (
 var nullPtr = unsafe.Pointer(uintptr(0))
 var mu sync.Mutex
-var handleMap = make(map[C.client]*pserver.Client)
+var handleMap = make(map[C.paddle_pserver_client]*pserver.Client)
-var curHandle C.client
+var curHandle C.paddle_pserver_client
-func add(c *pserver.Client) C.client {
+func add(c *pserver.Client) C.paddle_pserver_client {
 	mu.Lock()
 	defer mu.Unlock()
 	client := curHandle
@@ -60,13 +48,13 @@ func add(c *pserver.Client) C.client {
 	return client
 }
-func get(client C.client) *pserver.Client {
+func get(client C.paddle_pserver_client) *pserver.Client {
 	mu.Lock()
 	defer mu.Unlock()
 	return handleMap[client]
 }
-func remove(client C.client) *pserver.Client {
+func remove(client C.paddle_pserver_client) *pserver.Client {
 	mu.Lock()
 	defer mu.Unlock()
 	h := handleMap[client]
@@ -100,7 +88,7 @@ func (l lister) List() []pserver.Server {
 }
 //export paddle_new_pserver_client
-func paddle_new_pserver_client(addrs *C.char, selected int) C.client {
+func paddle_new_pserver_client(addrs *C.char, selected int) C.paddle_pserver_client {
 	a := C.GoString(addrs)
 	as := strings.Split(a, ",")
 	servers := make([]pserver.Server, len(as))
@@ -113,27 +101,27 @@ func paddle_new_pserver_client(addrs *C.char, selected int) C.client {
 }
 //export paddle_new_etcd_pserver_client
-func paddle_new_etcd_pserver_client(etcd_addr *C.char) C.client {
+func paddle_new_etcd_pserver_client(etcd_addr *C.char) C.paddle_pserver_client {
 	// TODO(helin): fault tolerant pserver client using etcd.
 	panic("not implemented.")
 }
 //export paddle_pserver_client_release
-func paddle_pserver_client_release(client C.client) {
+func paddle_pserver_client_release(client C.paddle_pserver_client) {
 	remove(client)
 }
 //export paddle_begin_init_params
-func paddle_begin_init_params(client C.client) C.int {
+func paddle_begin_init_params(client C.paddle_pserver_client) C.int {
 	c := get(client)
 	if selected := c.BeginInitParams(); selected {
 		return 1
 	}
-	return 0
+	return C.PSERVER_OK
 }
 //export paddle_init_param
-func paddle_init_param(client C.client, param C.paddle_parameter, param_config unsafe.Pointer, config_len C.int) C.int {
+func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, param_config unsafe.Pointer, config_len C.int) C.int {
 	et := pserver.ElementType(param.element_type)
 	name := C.GoString(param.name)
 	content := cArrayToSlice(unsafe.Pointer(param.content), int(param.content_len))
@@ -143,31 +131,41 @@ func paddle_init_param(client C.client, param C.paddle_parameter, param_config u
 	}
 	c := get(client)
 	err := c.InitParam(pc)
 	if err != nil {
+		if err.Error() == pserver.AlreadyInitialized {
+			log.Printf("parameter %s already initialized, treat paddle_init_param as sucessful.\n", name)
+			return C.PSERVER_OK
+		}
 		log.Println(err)
-		return -1
+		return C.PSERVER_ERROR
 	}
-	return 0
+	return C.PSERVER_OK
 }
 //export paddle_finish_init_params
-func paddle_finish_init_params(client C.client) C.int {
+func paddle_finish_init_params(client C.paddle_pserver_client) C.int {
 	c := get(client)
 	err := c.FinishInitParams()
 	if err != nil {
+		if err.Error() == pserver.AlreadyInitialized {
+			log.Println("parameters already initialized, treat paddle_finish_init_params as sucessful.")
+			return C.PSERVER_OK
+		}
 		log.Println(err)
-		return -1
+		return C.PSERVER_ERROR
 	}
-	return 0
+	return C.PSERVER_OK
 }
 //export paddle_send_grads
-func paddle_send_grads(client C.client, grads *C.paddle_gradient, total C.int) C.int {
+func paddle_send_grads(client C.paddle_pserver_client, grads **C.paddle_gradient, total C.int) C.int {
 	var gs []pserver.Gradient
 	for i := 0; i < int(total); i++ {
-		grad := (*C.paddle_gradient)(unsafe.Pointer((uintptr(unsafe.Pointer(grads)) + uintptr(i)*unsafe.Sizeof(*grads))))
+		grad := *(**C.paddle_gradient)(unsafe.Pointer((uintptr(unsafe.Pointer(grads)) + uintptr(i)*unsafe.Sizeof(*grads))))
 		et := pserver.ElementType(grad.element_type)
 		name := C.GoString(grad.name)
 		content := cArrayToSlice(unsafe.Pointer(grad.content), int(grad.content_len))
@@ -178,83 +176,81 @@ func paddle_send_grads(client C.client, grads *C.paddle_gradient, total C.int) C
 	err := c.SendGrads(gs)
 	if err != nil {
 		log.Println(err)
-		return -1
+		return C.PSERVER_ERROR
 	}
-	return 0
+	return C.PSERVER_OK
 }
 //export paddle_get_params
-func paddle_get_params(client C.client, names **C.char, dst **C.paddle_parameter, total C.int) C.int {
+func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter, total C.int) C.int {
 	var ns []string
 	for i := 0; i < int(total); i++ {
-		name := *(**C.char)(unsafe.Pointer((uintptr(unsafe.Pointer(names)) + uintptr(i)*unsafe.Sizeof(*names))))
+		param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))
-		ns = append(ns, C.GoString(name))
+		ns = append(ns, C.GoString(param.name))
 	}
 	c := get(client)
 	ps, err := c.GetParams(ns)
 	if err != nil {
 		log.Println(err)
-		return -1
+		return C.PSERVER_ERROR
 	}
-	for i := 0; i < int(total); i++ {
+	if len(ps) != len(ns) {
-		if i >= len(ps) {
+		pn := make([]string, len(ps))
-			break
+		for i, p := range ps {
+			pn[i] = p.Name
 		}
+		log.Printf("pserver returned wrong number of parameters. Requested: %s, returned: %s.\n", strings.Join(pn, ", "), strings.Join(ns, ", "))
+		return C.PSERVER_ERROR
+	}
+	for i := range ps {
+		if ns[i] != ps[i].Name {
+			pn := make([]string, len(ps))
+			for i, p := range ps {
+				pn[i] = p.Name
+			}
+			log.Printf("pserver returned wrong parameters, or not in requested order. Requested: %s, returned: %s.\n", strings.Join(pn, ", "), strings.Join(ns, ", "))
+			return C.PSERVER_ERROR
+		}
+	}
+	for i := 0; i < int(total); i++ {
 		p := ps[i]
 		param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))
-		nameReady := false
-		contentAllocated := false
 		if unsafe.Pointer(param) == nullPtr {
-			param = (*C.paddle_parameter)(C.calloc(1, C.size_t(unsafe.Sizeof(*param))))
+			log.Println("must pre-allocate parameter.")
+			return C.PSERVER_ERROR
 		} else {
-			if unsafe.Pointer(param.name) != nullPtr {
-				if n := C.GoString(param.name); n != p.Name {
-					log.Println("Warning: the pre-allocated parameter name does not match the parameter name, it will be freed.", n, p.Name)
-					C.free(unsafe.Pointer(param.name))
-				} else {
-					nameReady = true
-				}
-			}
 			if unsafe.Pointer(param.content) != nullPtr {
-				if int(param.content_len) == len(p.Content) {
+				if int(param.content_len) != len(p.Content) {
-					contentAllocated = true
+					log.Printf("the pre-allocated content len does not match parameter content len. Pre-allocated len: %d, returned len: %d", param.content_len, len(p.Content))
-				} else {
+					return C.PSERVER_ERROR
-					log.Println("Warning: the pre-allocated content len does not match parameter content len, the pre-allocated content will be freed.", param.content_len, len(p.Content))
-					C.free(unsafe.Pointer(param.content))
 				}
 			}
 		}
-		if !nameReady {
-			param.name = C.CString(p.Name)
-		}
-		if !contentAllocated {
-			param.content = (*C.uchar)(C.malloc(C.size_t(len(p.Content))))
-		}
 		C.memcpy(unsafe.Pointer(param.content), unsafe.Pointer(&p.Content[0]), C.size_t(len(p.Content)))
 		param.content_len = C.int(len(p.Content))
 		param.element_type = C.paddle_element_type(p.ElementType)
 	}
-	return 0
+	return C.PSERVER_OK
 }
 //export paddle_save_model
-func paddle_save_model(client C.client, path *C.char) C.int {
+func paddle_save_model(client C.paddle_pserver_client, path *C.char) C.int {
 	p := C.GoString(path)
 	c := get(client)
 	err := c.Save(p)
 	if err != nil {
 		log.Println(err)
-		return -1
+		return C.PSERVER_ERROR
 	}
-	return 0
+	return C.PSERVER_OK
 }
 func main() {} // Required but ignored
--- a/go/pserver/cclient/test/CMakeLists.txt
+++ b/go/pserver/cclient/test/CMakeLists.txt
 cmake_minimum_required(VERSION 3.0)
-include_directories(${CMAKE_BINARY_DIR})
 add_executable(main main.c)
-add_dependencies(main client)
+add_dependencies(main paddle_pserver_cclient)
+add_executable(test_cclient test_cclient.c)
+add_dependencies(test_cclient paddle_pserver_cclient)
 if(APPLE)
  set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security")
+else()
+  set(CMAKE_EXE_LINKER_FLAGS "-pthread")  
 endif()
-target_link_libraries(main ${CMAKE_BINARY_DIR}/libclient.a)
+if(PROJ_ROOT)
+  include_directories(${CMAKE_BINARY_DIR}/go/pserver/cclient/)
+  target_link_libraries(main ${CMAKE_BINARY_DIR}/go/pserver/cclient/libpaddle_pserver_cclient.a pthread)
+  target_link_libraries(test_cclient ${CMAKE_BINARY_DIR}/go/pserver/cclient/libpaddle_pserver_cclient.a pthread)
+else(PROJ_ROOT)
+  include_directories(${CMAKE_BINARY_DIR})
+  target_link_libraries(main ${CMAKE_BINARY_DIR}/libpaddle_pserver_cclient.a pthread)
+  target_link_libraries(test_cclient ${CMAKE_BINARY_DIR}/libpaddle_pserver_cclient.a pthread)
+endif(PROJ_ROOT)
--- a/go/pserver/cclient/test/main.c
+++ b/go/pserver/cclient/test/main.c
 #include <stdio.h>
-#include "libclient.h"
+#include "libpaddle_pserver_cclient.h"
-void fail() {
+// TODO(helin): Fix: gtest using cmake is not working, using this
-  // TODO(helin): fix: gtest using cmake is not working, using this
+// hacky way for now.
-  // hacky way for now.
+#define fail()                                          \
-  printf("test failed.\n");
+  fprintf(stderr, "info: %s:%d: ", __FILE__, __LINE__); \
  exit(-1);
+void sendGrads(paddle_pserver_client c) {
+  unsigned char grad_a[2000] = {2};
+  unsigned char grad_b[3000] = {3};
+  paddle_gradient grad1 = {
+      "param_a", PADDLE_ELEMENT_TYPE_FLOAT32, grad_a, 2000};
+  paddle_gradient grad2 = {
+      "param_b", PADDLE_ELEMENT_TYPE_FLOAT32, grad_b, 3000};
+  paddle_gradient* grads[2] = {&grad1, &grad2};
+  if (paddle_send_grads(c, grads, 2)) {
+    fail();
+  }
+}
+void getParams(paddle_pserver_client c) {
+  paddle_parameter param_a;
+  paddle_parameter param_b;
+  char name_a[] = "param_a";
+  char name_b[] = "param_b";
+  // Must pre-allocate the prameter content before calling paddle_get_params.
+  unsigned char content_a[2000] = {};
+  unsigned char content_b[3000] = {};
+  param_a.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
+  param_a.name = name_a;
+  param_a.content = content_a;
+  param_a.content_len = 2000;
+  param_b.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
+  param_b.name = name_b;
+  param_b.content = content_b;
+  param_b.content_len = 3000;
+  paddle_parameter* params[2] = {&param_a, &param_b};
+  if (paddle_get_params(c, params, 2)) {
+    fail();
+  }
 }
 int main() {
  char addr[] = "localhost:3000";
-  client c = paddle_new_pserver_client(addr, 1);
+  paddle_pserver_client c = paddle_new_pserver_client(addr, 1);
 retry:
  if (paddle_begin_init_params(c)) {
    paddle_parameter param;
    char name_a[] = "param_a";
    char name_b[] = "param_b";
-    unsigned char content[] = {0x00, 0x11, 0x22};
+    unsigned char content_a[2000] = {1};
+    unsigned char content_b[3000] = {0};
    param.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
    param.name = name_a;
-    param.content = content;
+    param.content = content_a;
-    param.content_len = 3;
+    param.content_len = 2000;
-    if (paddle_init_param(c, param, NULL, 0) != 0) {
+    int error = paddle_init_param(c, param, NULL, 0);
+    if (error != 0) {
      goto retry;
    }
-    param.element_type = PADDLE_ELEMENT_TYPE_INT32;
+    param.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
    param.name = name_b;
-    param.content = content;
+    param.content = content_b;
-    param.content_len = 3;
+    param.content_len = 3000;
-    if (paddle_init_param(c, param, NULL, 0) != 0) {
+    error = paddle_init_param(c, param, NULL, 0);
+    if (error != 0) {
      goto retry;
    }
-    if (paddle_finish_init_params(c) != 0) {
+    error = paddle_finish_init_params(c);
+    if (error != 0) {
      goto retry;
    }
-  } else {
-    fail();
-  }
-  unsigned char content[] = {0x00, 0x11, 0x22};
-  paddle_gradient grads[2] = {
-      {"param_a", PADDLE_ELEMENT_TYPE_INT32, content, 3},
-      {"param_b", PADDLE_ELEMENT_TYPE_FLOAT32, content, 3}};
-  if (!paddle_send_grads(c, grads, 2)) {
-    fail();
  }
-  paddle_parameter* params[2] = {NULL, NULL};
+  int i;
-  char* names[] = {"param_a", "param_b"};
+  for (i = 0; i < 100; i++) {
-  if (!paddle_get_params(c, names, params, 2)) {
+    sendGrads(c);
-    fail();
+    getParams(c);
  }
-  // get parameters again by reusing the allocated parameter buffers.
+  if (paddle_save_model(c, "/tmp/")) {
-  if (!paddle_get_params(c, names, params, 2)) {
-    fail();
-  }
-  paddle_release_param(params[0]);
-  paddle_release_param(params[1]);
-  if (!paddle_save_model(c, "/tmp/")) {
    fail();
  }

--- a/go/pserver/cclient/test/test_cclient.c
+++ b/go/pserver/cclient/test/test_cclient.c
+#include <stdio.h>
+#include <stdlib.h>
+#include "libpaddle_pserver_cclient.h"
+typedef float real;
+void fail() {
+  // TODO(helin): fix: gtest using cmake is not working, using this
+  // hacky way for now.
+  printf("test failed.\n");
+  exit(-1);
+}
+void print_parameter(paddle_gradient* param) {
+  if (param == NULL) {
+    printf("param is NULL!!\n");
+  } else {
+    printf("==== parameter ====\n");
+    printf("name: %s\n", param->name);
+    printf("content_len: %d\n", param->content_len);
+    printf("content_type: %d\n", param->element_type);
+    int i;
+    for (i = 0; i < param->content_len / (int)sizeof(real); ++i) {
+      printf("%f ", ((float*)param->content)[i]);
+    }
+    printf("\n\n");
+  }
+}
+int main() {
+  char addr[] = "localhost:3000";
+  paddle_pserver_client c = paddle_new_pserver_client(addr, 1);
+  char* names[] = {"param_a", "param_b"};
+retry:
+  printf("init parameter to pserver:\n");
+  real param_content1[] = {0.1, 0.2, 0.3};
+  real param_content2[] = {0.4, 0.5, 0.6};
+  paddle_parameter** params =
+      (paddle_parameter**)malloc(sizeof(paddle_parameter*) * 2);
+  params[0] = (paddle_parameter*)malloc(sizeof(paddle_parameter));
+  params[0]->name = names[0];
+  params[0]->content = (unsigned char*)param_content1;
+  params[0]->content_len = 3 * sizeof(real);
+  params[0]->element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
+  params[1] = (paddle_parameter*)malloc(sizeof(paddle_parameter));
+  params[1]->name = names[1];
+  params[1]->content = (unsigned char*)param_content2;
+  params[1]->content_len = 3 * sizeof(real);
+  params[1]->element_type = PADDLE_ELEMENT_TYPE_INT32;
+  if (paddle_begin_init_params(c)) {
+    if (paddle_init_param(c, *params[0], NULL, 0) != 0) {
+      goto retry;
+    }
+    if (paddle_init_param(c, *params[1], NULL, 0) != 0) {
+      goto retry;
+    }
+    if (paddle_finish_init_params(c) != 0) {
+      goto retry;
+    }
+  } else {
+    fail();
+  }
+  printf("get inited parameters from pserver:\n");
+  // get parameters again by reusing the allocated parameter buffers.
+  if (paddle_get_params(c, params, 2) != 0) {
+    fail();
+  }
+  print_parameter(params[0]);
+  print_parameter(params[1]);
+  printf("send gradient to pserver:\n");
+  real gradient_content1[] = {0.01, 0.02, 0.03};
+  real gradinet_content2[] = {0.04, 0.05, 0.06};
+  paddle_gradient** grads =
+      (paddle_gradient**)malloc(sizeof(paddle_gradient*) * 2);
+  grads[0] = (paddle_gradient*)malloc(sizeof(paddle_gradient));
+  grads[0]->name = names[0];
+  grads[0]->content = (unsigned char*)gradient_content1;
+  grads[0]->content_len = 3 * sizeof(real);
+  grads[0]->element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
+  grads[1] = (paddle_gradient*)malloc(sizeof(paddle_gradient));
+  grads[1]->name = names[1];
+  grads[1]->content = (unsigned char*)gradinet_content2;
+  grads[1]->content_len = 3 * sizeof(real);
+  grads[1]->element_type = PADDLE_ELEMENT_TYPE_INT32;
+  printf("print gradient sent to pserver:\n");
+  print_parameter(grads[0]);
+  print_parameter(grads[1]);
+  if (paddle_send_grads(c, grads, 2) != 0) {
+    fail();
+  }
+  printf("get updated parameters from pserver:\n");
+  // get parameters again by reusing the allocated parameter buffers.
+  if (paddle_get_params(c, params, 2) != 0) {
+    fail();
+  }
+  print_parameter(params[0]);
+  print_parameter(params[1]);
+  if (paddle_save_model(c, "/tmp/") != 0) {
+    fail();
+  }
+  return 0;
+}
--- a/go/pserver/cclient/test/test_mnist.py
+++ b/go/pserver/cclient/test/test_mnist.py
+import paddle.v2 as paddle
+import gzip
+def softmax_regression(img):
+    predict = paddle.layer.fc(input=img,
+                              size=10,
+                              act=paddle.activation.Softmax())
+    return predict
+def multilayer_perceptron(img):
+    # The first fully-connected layer
+    hidden1 = paddle.layer.fc(input=img, size=128, act=paddle.activation.Relu())
+    # The second fully-connected layer and the according activation function
+    hidden2 = paddle.layer.fc(input=hidden1,
+                              size=64,
+                              act=paddle.activation.Relu())
+    # The thrid fully-connected layer, note that the hidden size should be 10,
+    # which is the number of unique digits
+    predict = paddle.layer.fc(input=hidden2,
+                              size=10,
+                              act=paddle.activation.Softmax())
+    return predict
+def convolutional_neural_network(img):
+    # first conv layer
+    conv_pool_1 = paddle.networks.simple_img_conv_pool(
+        input=img,
+        filter_size=5,
+        num_filters=20,
+        num_channel=1,
+        pool_size=2,
+        pool_stride=2,
+        act=paddle.activation.Tanh())
+    # second conv layer
+    conv_pool_2 = paddle.networks.simple_img_conv_pool(
+        input=conv_pool_1,
+        filter_size=5,
+        num_filters=50,
+        num_channel=20,
+        pool_size=2,
+        pool_stride=2,
+        act=paddle.activation.Tanh())
+    # The first fully-connected layer
+    fc1 = paddle.layer.fc(input=conv_pool_2,
+                          size=128,
+                          act=paddle.activation.Tanh())
+    # The softmax layer, note that the hidden size should be 10,
+    # which is the number of unique digits
+    predict = paddle.layer.fc(input=fc1,
+                              size=10,
+                              act=paddle.activation.Softmax())
+    return predict
+def main():
+    paddle.init(use_gpu=False, trainer_count=1)
+    # define network topology
+    images = paddle.layer.data(
+        name='pixel', type=paddle.data_type.dense_vector(784))
+    label = paddle.layer.data(
+        name='label', type=paddle.data_type.integer_value(10))
+    # Here we can build the prediction network in different ways. Please
+    # choose one by uncomment corresponding line.
+    predict = softmax_regression(images)
+    #predict = multilayer_perceptron(images)
+    #predict = convolutional_neural_network(images)
+    cost = paddle.layer.classification_cost(input=predict, label=label)
+    parameters = paddle.parameters.create(cost)
+    optimizer = paddle.optimizer.Momentum(
+        learning_rate=0.1 / 128.0,
+        momentum=0.9,
+        regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128))
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=optimizer,
+                                 is_local=False,
+                                 pserver_spec="localhost:3000")
+    lists = []
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 1000 == 0:
+                print "Pass %d, Batch %d, Cost %f, %s" % (
+                    event.pass_id, event.batch_id, event.cost, event.metrics)
+        elif isinstance(event, paddle.event.EndPass):
+            result = trainer.test(reader=paddle.batch(
+                paddle.dataset.mnist.test(), batch_size=128))
+            print "Test with Pass %d, Cost %f, %s\n" % (
+                event.pass_id, result.cost, result.metrics)
+            lists.append((event.pass_id, result.cost,
+                          result.metrics['classification_error_evaluator']))
+    trainer.train(
+        reader=paddle.batch(
+            paddle.reader.shuffle(
+                paddle.dataset.mnist.train(), buf_size=8192),
+            batch_size=128),
+        event_handler=event_handler,
+        num_passes=100)
+    # find the best pass
+    best = sorted(lists, key=lambda list: float(list[1]))[0]
+    print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1])
+    print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100)
+    test_creator = paddle.dataset.mnist.test()
+    test_data = []
+    for item in test_creator():
+        test_data.append((item[0], ))
+        if len(test_data) == 100:
+            break
+    # output is a softmax layer. It returns probabilities.
+    # Shape should be (100, 10)
+    probs = paddle.infer(
+        output_layer=predict, parameters=parameters, input=test_data)
+    print probs.shape
+if __name__ == '__main__':
+    main()
--- a/go/pserver/cclient/test/test_train.py
+++ b/go/pserver/cclient/test/test_train.py
+import paddle.v2 as paddle
+import paddle.v2.dataset.uci_housing as uci_housing
+def main():
+    # init
+    paddle.init(use_gpu=False, trainer_count=1)
+    # network config
+    x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+    y_predict = paddle.layer.fc(input=x,
+                                param_attr=paddle.attr.Param(name='w'),
+                                size=1,
+                                act=paddle.activation.Linear(),
+                                bias_attr=paddle.attr.Param(name='b'))
+    y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
+    cost = paddle.layer.mse_cost(input=y_predict, label=y)
+    # create parameters
+    parameters = paddle.parameters.create(cost)
+    # create optimizer
+    optimizer = paddle.optimizer.Momentum(momentum=0)
+    trainer = paddle.trainer.SGD(cost=cost,
+                                 parameters=parameters,
+                                 update_equation=optimizer,
+                                 is_local=False,
+                                 pserver_spec="localhost:3000")
+    # event_handler to print training and testing info
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 100 == 0:
+                print "Pass %d, Batch %d, Cost %f" % (
+                    event.pass_id, event.batch_id, event.cost)
+        if isinstance(event, paddle.event.EndPass):
+            if (event.pass_id + 1) % 10 == 0:
+                result = trainer.test(
+                    reader=paddle.batch(
+                        uci_housing.test(), batch_size=2),
+                    feeding={'x': 0,
+                             'y': 1})
+                print "Test %d, %.2f" % (event.pass_id, result.cost)
+    # training
+    trainer.train(
+        reader=paddle.batch(
+            paddle.reader.shuffle(
+                uci_housing.train(), buf_size=500),
+            batch_size=2),
+        feeding={'x': 0,
+                 'y': 1},
+        event_handler=event_handler,
+        num_passes=30)
+if __name__ == '__main__':
+    main()
--- a/go/pserver/client.go
+++ b/go/pserver/client.go
@@ -6,7 +6,7 @@ import (
 	"sort"
 	"time"
-	"github.com/PaddlePaddle/Paddle/go/pserver/internal/connection"
+	"github.com/PaddlePaddle/Paddle/go/connection"
 )
 // TODO(helin): add RPC call retry logic
@@ -47,7 +47,7 @@ func NewClient(l Lister, pserverNum int, sel Selector) *Client {
 // monitorPservers monitors pserver addresses, and updates connection
 // when the address changes.
 func (c *Client) monitorPservers(l Lister, pserverNum int) {
-	knownServers := make([]Server, pserverNum)
+	lastServers := make([]Server, pserverNum)
 	ticker := time.NewTicker(10 * time.Second)
 	monitor := func() {
 		curServers := make([]Server, pserverNum)
@@ -56,25 +56,37 @@ func (c *Client) monitorPservers(l Lister, pserverNum int) {
 			curServers[l.Index] = l
 		}
-		for i := range knownServers {
+		for i := range lastServers {
-			if knownServers[i].Addr != curServers[i].Addr {
+			if lastServers[i].Addr == curServers[i].Addr {
-				err := c.pservers[i].Connect(curServers[i].Addr)
+				continue
+			}
+			if curServers[i].Addr == "" {
+				err := c.pservers[i].Close()
 				if err != nil {
 					log.Println(err)
-					// connect to addr failed, set
-					// to last known addr in order
-					// to retry next time.
-					curServers[i].Addr = knownServers[i].Addr
 				}
+				continue
 			}
+			err := c.pservers[i].Connect(curServers[i].Addr)
+			if err != nil {
+				log.Println(err)
+				// connect to addr failed, set
+				// to last known addr in order
+				// to retry next time.
+				curServers[i].Addr = lastServers[i].Addr
+			}
 		}
-		knownServers = curServers
+		lastServers = curServers
 	}
 	monitor()
-	for _ = range ticker.C {
+	for range ticker.C {
 		monitor()
 	}
 }
@@ -93,16 +105,14 @@ func (c *Client) BeginInitParams() bool {
 // InitParam initializes the parameter on parameter servers.
 func (c *Client) InitParam(paramWithConfigs ParameterWithConfig) error {
-	var dummy int
+	return c.pservers[c.partition(paramWithConfigs.Param.Name)].Call("Service.InitParam", paramWithConfigs, nil)
-	return c.pservers[c.partition(paramWithConfigs.Param.Name)].Call("Service.InitParam", paramWithConfigs, &dummy)
 }
 // FinishInitParams tells parameter servers client has sent all
 // parameters to parameter servers as initialization.
 func (c *Client) FinishInitParams() error {
 	for _, p := range c.pservers {
-		var dummy int
+		err := p.Call("Service.FinishInitParams", 0, nil)
-		err := p.Call("Service.FinishInitParams", dummy, &dummy)
 		if err != nil {
 			return err
 		}
@@ -116,8 +126,7 @@ func (c *Client) SendGrads(grads []Gradient) error {
 	errCh := make(chan error, len(grads))
 	for _, g := range grads {
 		go func(g Gradient) {
-			var dummy int
+			err := c.pservers[c.partition(g.Name)].Call("Service.SendGrad", g, nil)
-			err := c.pservers[c.partition(g.Name)].Call("Service.SendGrad", g, &dummy)
 			errCh <- err
 		}(g)
 	}
@@ -196,8 +205,7 @@ func (c *Client) Save(path string) error {
 	errCh := make(chan error, len(c.pservers))
 	for _, p := range c.pservers {
-		var dummy int
+		err := p.Call("Service.Save", path, nil)
-		err := p.Call("Service.Save", path, &dummy)
 		errCh <- err
 	}

--- a/go/pserver/client_test.go
+++ b/go/pserver/client_test.go
@@ -117,7 +117,7 @@ func TestClientFull(t *testing.T) {
 	for i := range params {
 		if names[i] != params[i].Name {
-			t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", names[i], params[i])
+			t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", names[i], params[i].Name)
 		}
 	}
 }
--- a/go/pserver/optimizer.c
+++ b/go/pserver/optimizer.c
@@ -32,7 +32,13 @@ int update_SGD(void* optimizer,
               const void* gradient,
               int num_bytes) {
  SGD_optimizer* o = (SGD_optimizer*)optimizer;
-  // TODO
+  float* parameter = (float*)buffer;
+  float* grad = (float*)gradient;
+  int i;
+  for (i = 0; i < num_bytes / sizeof(float); ++i) {
+    parameter[i] -= o->learning_rate * grad[i];
+  }
  return 0;
 }

--- a/go/pserver/service.go
+++ b/go/pserver/service.go
@@ -9,8 +9,10 @@ import (
 // ElementType is the type of elements of a Parameter.
 type ElementType int
-var ErrAlreadyInitialized = errors.New("pserver already initialized")
+const (
-var ErrUninitialized = errors.New("pserver not fully initialized")
+	AlreadyInitialized = "pserver already initialized"
+	Uninitialized      = "pserver not fully initialized"
+)
 // Supported element types
 const (
@@ -49,7 +51,7 @@ type Service struct {
 // NewService creates a new service.
 func NewService() *Service {
-	s := &Service{opt: newOptimizer(sgd, 0.01)}
+	s := &Service{opt: newOptimizer(sgd, 0.005)}
 	s.paramMap = make(map[string]Parameter)
 	s.initialized = make(chan struct{})
 	return s
@@ -59,7 +61,7 @@ func NewService() *Service {
 func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) error {
 	select {
 	case <-s.initialized:
-		return ErrAlreadyInitialized
+		return errors.New(AlreadyInitialized)
 	default:
 	}
@@ -80,7 +82,7 @@ func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) er
 func (s *Service) FinishInitParams(dummy0 int, dummy1 *int) error {
 	select {
 	case <-s.initialized:
-		return ErrAlreadyInitialized
+		return errors.New(AlreadyInitialized)
 	default:
 	}
@@ -94,7 +96,7 @@ func (s *Service) SendGrad(g Gradient, dummy *int) error {
 	select {
 	case <-s.initialized:
 	default:
-		return ErrUninitialized
+		return errors.New(Uninitialized)
 	}
 	s.mu.Lock()

--- a/go/pserver/service_test.go
+++ b/go/pserver/service_test.go
@@ -15,8 +15,7 @@ func TestFull(t *testing.T) {
 	p.Name = "param_a"
 	p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}
 	p.ElementType = pserver.Int32
-	var dummy int
+	err := s.InitParam(pserver.ParameterWithConfig{Param: p, Config: nil}, nil)
-	err := s.InitParam(pserver.ParameterWithConfig{p, nil}, &dummy)
 	if err != nil {
 		t.FailNow()
 	}
@@ -25,12 +24,12 @@ func TestFull(t *testing.T) {
 	p1.Name = "param_b"
 	p1.Content = []byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
 	p1.ElementType = pserver.Float32
-	err = s.InitParam(pserver.ParameterWithConfig{p1, nil}, &dummy)
+	err = s.InitParam(pserver.ParameterWithConfig{Param: p1, Config: nil}, nil)
 	if err != nil {
 		t.FailNow()
 	}
-	err = s.FinishInitParams(0, &dummy)
+	err = s.FinishInitParams(0, nil)
 	if err != nil {
 		t.FailNow()
 	}
@@ -46,11 +45,11 @@ func TestFull(t *testing.T) {
 	}
 	g1, g2 := pserver.Gradient(p1), pserver.Gradient(p)
-	err = s.SendGrad(g1, &dummy)
+	err = s.SendGrad(g1, nil)
 	if err != nil {
 		t.FailNow()
 	}
-	err = s.SendGrad(g2, &dummy)
+	err = s.SendGrad(g2, nil)
 	if err != nil {
 		t.FailNow()
@@ -74,23 +73,21 @@ func TestFull(t *testing.T) {
 func TestMultipleInit(t *testing.T) {
 	s := pserver.NewService()
-	var dummy int
+	err := s.FinishInitParams(0, nil)
-	err := s.FinishInitParams(0, &dummy)
 	if err != nil {
 		t.FailNow()
 	}
-	err = s.FinishInitParams(0, &dummy)
+	err = s.FinishInitParams(0, nil)
-	if err != pserver.ErrAlreadyInitialized {
+	if err.Error() != pserver.AlreadyInitialized {
 		t.FailNow()
 	}
 }
 func TestUninitialized(t *testing.T) {
 	s := pserver.NewService()
-	var dummy int
+	err := s.SendGrad(pserver.Gradient{}, nil)
-	err := s.SendGrad(pserver.Gradient{}, &dummy)
+	if err.Error() != pserver.Uninitialized {
-	if err != pserver.ErrUninitialized {
 		t.FailNow()
 	}
 }
@@ -98,13 +95,14 @@ func TestUninitialized(t *testing.T) {
 func TestBlockUntilInitialized(t *testing.T) {
 	s := pserver.NewService()
 	ch := make(chan struct{}, 2)
+	errCh := make(chan error, 2)
 	var wg sync.WaitGroup
 	wg.Add(1)
 	go func() {
 		var param pserver.Parameter
 		err := s.GetParam("param_a", &param)
 		if err != nil {
-			t.FailNow()
+			errCh <- err
 		}
 		wg.Done()
 		ch <- struct{}{}
@@ -112,10 +110,9 @@ func TestBlockUntilInitialized(t *testing.T) {
 	wg.Add(1)
 	go func() {
-		var dummy int
+		err := s.Save("", nil)
-		err := s.Save("", &dummy)
 		if err != nil {
-			t.FailNow()
+			errCh <- err
 		}
 		wg.Done()
 		ch <- struct{}{}
@@ -127,6 +124,8 @@ func TestBlockUntilInitialized(t *testing.T) {
 	case <-ch:
 		// some function returned before initialization is completed.
 		t.FailNow()
+	case <-errCh:
+		t.FailNow()
 	default:
 	}
@@ -134,13 +133,12 @@ func TestBlockUntilInitialized(t *testing.T) {
 	p.Name = "param_a"
 	p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}
 	p.ElementType = pserver.Int32
-	var dummy int
+	err := s.InitParam(pserver.ParameterWithConfig{Param: p, Config: nil}, nil)
-	err := s.InitParam(pserver.ParameterWithConfig{p, nil}, &dummy)
 	if err != nil {
 		t.FailNow()
 	}
-	err = s.FinishInitParams(0, &dummy)
+	err = s.FinishInitParams(0, nil)
 	if err != nil {
 		t.FailNow()
 	}

--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -16,7 +16,7 @@ set(API_HEADER
    Internal.h)
 add_library(paddle_api STATIC ${API_SOURCES})
-add_dependencies(paddle_api gen_proto_cpp)
+add_dependencies(paddle_api gen_proto_cpp paddle_pserver_cclient_lib)
 INCLUDE(${SWIG_USE_FILE})
 INCLUDE_DIRECTORIES(${PROJ_ROOT}/paddle)
@@ -45,7 +45,7 @@ SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS
 )
 IF(APPLE)
-    SET(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load")
+    SET(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load -framework CoreFoundation -framework Security")
 ELSE(APPLE)
    SET(START_GROUP "-Xlinker -start-group")
    SET(END_GROUP "-Xlinker -end-group")

--- a/paddle/api/Paddle.i
+++ b/paddle/api/Paddle.i
@@ -179,6 +179,7 @@ namespace std {
 %newobject ParameterOptimizer::needSpecialTraversal;
 %newobject ParameterUpdater::createLocalUpdater;
 %newobject ParameterUpdater::createRemoteUpdater;
+%newobject ParameterUpdater::createNewRemoteUpdater;
 %feature("director") UpdateCallback;
 %feature("autodoc", 1); // To generate method stub, for code hint in ide

--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -841,6 +841,8 @@ public:
  static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config,
                                               int passCount,
                                               bool useSparseUpdater);
+  static ParameterUpdater* createNewRemoteUpdater(
+      OptimizationConfig* config, const std::string pserverSpec);
  ~ParameterUpdater();
  /**

--- a/paddle/api/ParameterUpdater.cpp
+++ b/paddle/api/ParameterUpdater.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #include "PaddleAPI.h"
 #include "PaddleAPIPrivate.h"
+#include "paddle/trainer/NewRemoteParameterUpdater.h"
 #include "paddle/trainer/RemoteParameterUpdater.h"
 #include "paddle/trainer/ThreadParameterUpdater.h"
@@ -28,6 +29,14 @@ ParameterUpdater *ParameterUpdater::createLocalUpdater(
  return updater;
 }
+ParameterUpdater *ParameterUpdater::createNewRemoteUpdater(
+    OptimizationConfig *config, const std::string pserverSpec) {
+  auto updater = new ParameterUpdater();
+  updater->m->updater.reset(new paddle::NewRemoteParameterUpdater(
+      config->m->getConfig(), pserverSpec));
+  return updater;
+}
 ParameterUpdater *ParameterUpdater::createRemoteUpdater(
    OptimizationConfig *config, int passCount, bool useSparseUpdater) {
  auto updater = new ParameterUpdater();

--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -28,6 +28,7 @@ if(WITH_TESTING)
    add_simple_unittest(PadOpTest)
    add_simple_unittest(MulOpTest)
    add_simple_unittest(CosSimOpTest)
+    add_simple_unittest(RowConvOpTest)
 endif()
 endif()

--- a/paddle/function/RowConvOp.cpp
+++ b/paddle/function/RowConvOp.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "RowConvOp.h"
+#include <iostream>
+#include "paddle/math/Vector.h"
+namespace paddle {
+template <>
+void RowConv<DEVICE_TYPE_CPU>(CpuMatrix& out,
+                              const CpuMatrix& in,
+                              const CpuMatrix& filter,
+                              const CpuIVector& seq) {
+  const int* starts = seq.getData();
+  const size_t numSeq = seq.getSize() - 1;
+  const size_t contextLength = filter.getHeight();
+  for (size_t i = 0; i < numSeq; ++i) {
+    size_t begin = starts[i];
+    size_t end = starts[i + 1];
+    for (size_t j = begin; j < end; ++j) {
+      MatrixPtr x;
+      MatrixPtr w;
+      if ((j + contextLength) < end) {
+        x = (const_cast<CpuMatrix&>(in)).subMatrix(j, contextLength);
+        w = (const_cast<CpuMatrix&>(filter)).subMatrix(0, contextLength);
+      } else {
+        x = (const_cast<CpuMatrix&>(in)).subMatrix(j, end - j);
+        w = (const_cast<CpuMatrix&>(filter)).subMatrix(0, end - j);
+      }
+      MatrixPtr y = out.subMatrix(j, 1);
+      y->addDotMulVMM(*x, *w);
+    }
+  }
+}
+template <>
+void RowConvGrad<DEVICE_TYPE_CPU>(const CpuMatrix& outG,
+                                  const CpuMatrix& in,
+                                  const CpuMatrix& filter,
+                                  CpuMatrix& inG,
+                                  CpuMatrix& filterG,
+                                  const CpuIVector& seq) {
+  // gradient w.r.t filter
+  const int* starts = seq.getData();
+  const size_t numSeq = seq.getSize() - 1;
+  const size_t contextLength = filter.getHeight();
+  if (filterG) {
+    for (size_t i = 0; i < numSeq; ++i) {
+      size_t begin = starts[i];
+      size_t end = starts[i + 1];
+      size_t steps = end - begin;
+      for (size_t j = 0; j < contextLength && (begin + j) < end; ++j) {
+        MatrixPtr x =
+            (const_cast<CpuMatrix&>(in)).subMatrix(begin + j, steps - j);
+        MatrixPtr dy =
+            (const_cast<CpuMatrix&>(outG)).subMatrix(begin, steps - j);
+        MatrixPtr dw = filterG.subMatrix(j, 1);
+        dw->addDotMulVMM(*dy, *x);
+      }
+    }
+  }
+  // gradient w.r.t input feature
+  if (inG) {
+    for (size_t i = 0; i < numSeq; ++i) {
+      size_t begin = starts[i];
+      size_t end = starts[i + 1];
+      size_t steps = end - begin;
+      for (size_t j = 0; j < steps; ++j) {
+        MatrixPtr dx = inG.subMatrix(begin + j, 1);
+        for (size_t t = 0; t < contextLength; ++t) {
+          if (int(j - t) >= 0) {
+            MatrixPtr dy =
+                (const_cast<CpuMatrix&>(outG)).subMatrix(begin + j - t, 1);
+            MatrixPtr w = (const_cast<CpuMatrix&>(filter)).subMatrix(t, 1);
+            dx->addDotMul(*dy, *w, 1.0, 1.0);
+          }
+        }
+      }
+    }
+  }
+}
+/**
+ * \brief The row convolution is called lookahead convolution. It is firstly
+ * introduced in deep-speech2 system. The bidirectional RNN that learns
+ * representation for a sequence by performing a forward and a backward pass
+ * through the entire sequence. However, unlike unidirectional RNNs,
+ * bidirectional RNNs are challenging to deploy in an online and low-latency
+ * setting. The lookahead convolution incorporates information from future
+ * subsequences in a computationally efficient manner to improve unidirectional
+ * recurrent neural networks.
+ *
+ * The connection of row convolution is different form the 1D sequence
+ * convolution. Assumed that, the future context-length is k, that is to say,
+ * it can get the output at timestep t by using the the input feature from t-th
+ * timestep to (t+k)-th timestep. Assumed that the hidden dim of input
+ * activations are d, the activations r_t for the new layer at time-step t are:
+ *
+ *
+ *            -- k + 1
+ *  r(t,i) =  >       W(i,j) * h(t+j-1, i),  for (1 <= i <= d)
+ *            -- j = 1
+ *
+ *
+ * The weight shape is: (k + 1) x d
+ * Function Arguments:
+ *
+ * \param inputs[0]  The input activations.
+ * \param inputs[0]  The filter (or weight) and shape is (k+1) x d.
+ * \param outputs[1] The output activations.
+ *
+ * [1] Dario Amodei, etc. Deep Speech 2 : End-to-End Speech Recognition in
+ * English
+ *     and Mandarin. https://arxiv.org/abs/1512.02595
+ */
+template <DeviceType Device>
+class RowConvFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {}
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    // check
+    CHECK_EQ(2UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    // TODO(qingqing): support ASSIGN_TO.
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
+        << "SequenceArg required here.";
+    const auto in = dynamic_cast<const SequenceArg&>(inputs[0]);
+    auto out = dynamic_cast<const SequenceArg&>(outputs[0]);
+    auto w = inputs[1];
+    CHECK(in.data() && out.data() && in.getSequenceId().data());
+    CHECK_EQ(in.shape().ndims(), 2UL);
+    CHECK(in.shape() == out.shape());
+    CHECK_EQ(w.shape()[1], in.shape()[1]);
+    auto outMat = out.matrix<Device>();
+    const auto inMat = in.matrix<Device>();
+    const auto wMat = w.matrix<Device>();
+    const auto seqId = in.getSequenceId().vector<int, Device>();
+    RowConv<Device>(outMat, inMat, wMat, seqId);
+  }
+};
+/**
+ * \brief The backward of row convolution function. This function calculated
+ * the gradient w.r.t filter and the gradient w.r.t input activations(or data).
+ *
+ * Argument in this Function:
+ *
+ * \param inputs[0]  The gradient w.r.t output activations.
+ * \param inputs[1]  The input activations.
+ * \param inputs[2]  The filter (or weight) and shape is (k+1) x d.
+ * \param outputs[0] The gradient w.r.t input activations.
+ * \param outputs[1] The gradient w.r.r filter.
+ *
+ * Abbreviation:
+ * w.r.t: with respect to.
+ */
+template <DeviceType Device>
+class RowConvGradFunc : public FunctionBase {
+  // TODO(qingqing): split into RowConvDataFunc and RowConvWeightFunc
+public:
+  void init(const FuncConfig& config) override {}
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    // check
+    CHECK_EQ(3UL, inputs.size());
+    CHECK_EQ(2UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    CHECK_EQ(outputs[1].getArgType(), ADD_TO);
+    CHECK(inputs[0].isSequenceArg() && inputs[1].isSequenceArg() &&
+          outputs[0].isSequenceArg())
+        << "SequenceArg required here.";
+    const auto outGrad = dynamic_cast<const SequenceArg&>(inputs[0]);
+    const auto in = dynamic_cast<const SequenceArg&>(inputs[1]);
+    const auto w = inputs[2];
+    auto inGrad = dynamic_cast<const SequenceArg&>(outputs[0]);
+    auto wGrad = outputs[1];
+    CHECK_EQ(in.shape().ndims(), 2UL);
+    CHECK(in.shape() == inGrad.shape());
+    CHECK(in.shape() == outGrad.shape());
+    CHECK_EQ(wGrad.shape()[1], in.shape()[1]);
+    const auto outGMat = outGrad.matrix<Device>();
+    const auto inMat = in.matrix<Device>();
+    const auto wMat = w.matrix<Device>();
+    auto inGMat = inGrad.data()
+                      ? inGrad.matrix<Device>()
+                      : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
+    auto wGMat = wGrad.data()
+                     ? wGrad.matrix<Device>()
+                     : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
+    const auto seqId = in.getSequenceId().vector<int, Device>();
+    RowConvGrad<Device>(outGMat, inMat, wMat, inGMat, wGMat, seqId);
+  }
+};
+REGISTER_TYPED_FUNC(RowConv, CPU, RowConvFunc);
+REGISTER_TYPED_FUNC(RowConvGrad, CPU, RowConvGradFunc);
+#ifndef PADDLE_ONLY_CPU
+REGISTER_TYPED_FUNC(RowConv, GPU, RowConvFunc);
+REGISTER_TYPED_FUNC(RowConvGrad, GPU, RowConvGradFunc);
+#endif
+}  // namespace paddle
--- a/paddle/function/RowConvOp.h
+++ b/paddle/function/RowConvOp.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "Function.h"
+namespace paddle {
+/**
+ * \brief The forward of row convolution.
+ *
+ * \param[out] out      The output data and shape is h x d. h is the sum of
+ *                      time steps of all samples in one mini-batch.
+ * \param[in]  in       The input data and shape is h x d.
+ * \param[in]  filter   The filter and shape is k x d. The lookahead step
+ *                      number plus one equals k.
+ * \param[in]  seq      The sequence start positions.
+ *
+ */
+template <DeviceType DType>
+void RowConv(typename Tensor<real, DType>::Matrix& out,
+             const typename Tensor<real, DType>::Matrix& in,
+             const typename Tensor<real, DType>::Matrix& filter,
+             const typename Tensor<int, DType>::Vector& seq);
+/**
+ * \brief The backward of row convolution.
+ *
+ * \param[in]  outG     The gradient w.r.t output data.
+ * \param[in]  in       The input data.
+ * \param[in]  filter   The filter.
+ * \param[out] inG      The gradient w.r.t input data.
+ * \param[out] filterG  The gradient w.r.t filter.
+ * \param[in]  seq      The sequence start positions.
+ *
+ */
+template <DeviceType DType>
+void RowConvGrad(const typename Tensor<real, DType>::Matrix& outG,
+                 const typename Tensor<real, DType>::Matrix& in,
+                 const typename Tensor<real, DType>::Matrix& filter,
+                 typename Tensor<real, DType>::Matrix& inG,
+                 typename Tensor<real, DType>::Matrix& filterG,
+                 const typename Tensor<int, DType>::Vector& seq);
+}  // namespace paddle
--- a/paddle/function/RowConvOpGpu.cu
+++ b/paddle/function/RowConvOpGpu.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "hl_base.h"
+#include "RowConvOp.h"
+namespace paddle {
+template<int BLOCK_H, int BLOCK_W>
+__global__ void KeRowConv(real* y, const real* x,  const real* w,
+    const int* starts, const int height, const int width,
+    const int numSeq, const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+  __shared__ real sw[BLOCK_H][BLOCK_W];
+  for (int i = tidy; i < context; i += blky) {
+    sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0;
+  }
+  __syncthreads();
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      real sum = 0;
+      int off = (start + j) * width;
+      for (int t = 0; t < context; ++t) {
+        if ((start + j + t) < end) {
+          int xoff = off + t * width;
+          real xVal = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0;
+          sum += sw[t][tidx] * xVal;
+        }
+      }
+      if (gidx + tidx < width) {
+        y[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+__global__ void KeRowConv2(real* y, const real* x,  const real* w,
+    const int* starts, const int height, const int width,
+    const int numSeq, const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      int off = (start + j) * width;
+      real sum = 0;
+      for (int t = 0; t < context && (start + j + t) < end; ++t) {
+        int xoff = off + t * width;
+        real xd = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0;
+        real wd = gidx + tidx < width ? w[t * width + gidx + tidx] : 0.0;
+        sum += wd * xd;
+      }
+      if (gidx + tidx < width) {
+        y[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+template <>
+void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,
+                              const GpuMatrix& in,
+                              const GpuMatrix& filter,
+                              const GpuIVector& seq) {
+  const size_t numSeq = seq.getSize() - 1;
+  const size_t contextLength = filter.getHeight();
+  const size_t height = in.getHeight();
+  const size_t width = in.getWidth();
+  real* y = out.getData();
+  const real* x = in.getData();
+  const real* w = filter.getData();
+  const int* starts = seq.getData();
+  dim3 dimBlock(32, 32);
+  dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
+  if (contextLength <= 32) {
+    KeRowConv<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
+      (y, x, w, starts, height, width, numSeq, contextLength);
+  } else {
+    KeRowConv2<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
+      (y, x, w, starts, height, width, numSeq, contextLength);
+  }
+  CHECK_SYNC("RowConv");
+}
+template<int BLOCK_H, int BLOCK_W, int CONTEXT>
+__global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
+    const int* starts, const int height, const int width, const int numSeq,
+    const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+  __shared__ real sh_x[BLOCK_W][BLOCK_H];
+  __shared__ real sh_dy[BLOCK_W][BLOCK_H + CONTEXT - 1];
+  __shared__ real sh_dw[CONTEXT][BLOCK_W];
+  if (tidy < context) {
+    sh_dw[tidy][tidx] = 0.0;
+  }
+  __syncthreads();
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    const int size = ((steps + BLOCK_H - 1)/BLOCK_H) * BLOCK_H;
+    for (int j = tidy; j < size; j += BLOCK_H) {
+      int xoff = gidx + tidx;
+      int yoff = start + j;
+      // transpose
+      sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
+      sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0;
+      __syncthreads();
+      if (tidy < (context - 1)) {
+        yoff = yoff - context + 1;
+        sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0;
+      }
+      __syncthreads();
+      for (int t = 0; t < context; t++) {
+        real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx + context - 1 - t];
+        __syncthreads();
+        // warp size and blockDim.x is 32.
+        val += __shfl_down(val, 16);
+        val += __shfl_down(val, 8);
+        val += __shfl_down(val, 4);
+        val += __shfl_down(val, 2);
+        val += __shfl_down(val, 1);
+        __syncthreads();
+        if (tidx == 0) {
+          sh_dw[t][tidy] += val;
+        }
+        __syncthreads();
+      }
+    }
+  }
+  for (int t = tidy; (t < context) && ((gidx + tidx) < width); t += blky) {
+    dw[t * width + gidx + tidx] += sh_dw[t][tidx];
+  }
+}
+template<int BLOCK_H, int BLOCK_W>
+__global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
+    const int* starts, const int height, const int width, const int numSeq,
+    const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int gidx = blockIdx.x * blockDim.x;
+  __shared__ real sh_x[BLOCK_H][BLOCK_W];
+  __shared__ real sh_dy[BLOCK_H][BLOCK_W];
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    const int size = ((steps + BLOCK_H - 1)/BLOCK_H) * BLOCK_H;
+    for (int j = tidy; j < size; j += BLOCK_H) {
+      int xoff = gidx + tidx;
+      int yoff = start + j;
+      // transpose
+      sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
+      __syncthreads();
+      for (int t = 0; t < context; t++) {
+        sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start && yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0;
+        __syncthreads();
+        real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx];
+        __syncthreads();
+        // warp size and blockDim.x is 32.
+        val += __shfl_down(val, 16);
+        val += __shfl_down(val, 8);
+        val += __shfl_down(val, 4);
+        val += __shfl_down(val, 2);
+        val += __shfl_down(val, 1);
+        __syncthreads();
+        if (tidx == 0 && (gidx + tidy) < width) {
+          dw[t*width + gidx + tidy] += val;
+        }
+      }
+    }
+  }
+}
+template<int BLOCK_H, int BLOCK_W>
+__global__ void KeRowConvBwData(real* dx, const real* w, const real* dy,
+    const int* starts, const int height, const int width, const int numSeq,
+    const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+  __shared__ real sw[BLOCK_H][BLOCK_W];
+  for (int i = tidy; i < context; i += blky) {
+    sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0;
+  }
+  __syncthreads();
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      real sum = 0;
+      int off = (start + j) * width;
+      for (int t = 0; t < context && (j - t) >= 0; ++t) {
+        int dyOff = off - t * width;
+        real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0;
+        sum += sw[t][tidx] * dyVal;
+      }
+      if (gidx + tidx < width) {
+        dx[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+__global__ void KeRowConvBwData2(real* dx, const real* w, const real* dy,
+    const int* starts, const int height, const int width, const int numSeq,
+    const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      real sum = 0;
+      int off = (start + j) * width;
+      for (int t = 0; t < context && (j - t) >= 0; ++t) {
+        int dyOff = off - t * width;
+        real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0;
+        real wVal = gidx + tidx < width ? w[t * width + gidx + tidx] : 0.0;
+        sum += wVal * dyVal;
+      }
+      if (gidx + tidx < width) {
+        dx[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+template <>
+void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
+                              const GpuMatrix& in,
+                              const GpuMatrix& filter,
+                              GpuMatrix& inG,
+                              GpuMatrix& filterG,
+                              const GpuIVector& seq) {
+  const size_t numSeq = seq.getSize() - 1;
+  const size_t contextLength = filter.getHeight();
+  const size_t height = in.getHeight();
+  const size_t width = in.getWidth();
+  const real* dy = outG.getData();
+  const real* x = in.getData();
+  const real* w = filter.getData();
+  const int* starts = seq.getData();
+  if (filterG) {
+    dim3 dimBlock(32, 32);
+    dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
+    real* dw = filterG.getData();
+    if (contextLength <= 32) { 
+      KeRowConvBwWeight<32, 32, 32>
+        <<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
+        (dw, x, dy, starts, height, width, numSeq, contextLength);
+    } else {
+      KeRowConvBwWeight2<32, 32>
+        <<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
+        (dw, x, dy, starts, height, width, numSeq, contextLength);
+    }
+  }
+  if (inG) {
+    real* dx = inG.getData();
+    dim3 dimBlock2(32, 32);
+    dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1);
+    if (contextLength <= 64) {
+      KeRowConvBwData<32, 64>
+        <<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>
+        (dx, w, dy, starts, height, width, numSeq, contextLength);
+    } else {
+      KeRowConvBwData2
+        <<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>
+        (dx, w, dy, starts, height, width, numSeq, contextLength);
+    }
+  }
+  CHECK_SYNC("RowConvGrad");
+}
+}  // namespace paddle
--- a/paddle/function/RowConvOpTest.cpp
+++ b/paddle/function/RowConvOpTest.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+namespace paddle {
+void testRowConvFw(size_t batchSize, size_t dim, size_t contextLength) {
+  FunctionCompare test("RowConv", FuncConfig());
+  test.addSequence(SequenceIdArg(TensorShape{batchSize}));
+  test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}));
+  test.addOutputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}),
+                  ADD_TO);
+  test.run();
+}
+void testRowConvBw(size_t batchSize, size_t dim, size_t contextLength) {
+  FunctionCompare test("RowConvGrad", FuncConfig());
+  test.addSequence(SequenceIdArg(TensorShape{batchSize}));
+  test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
+  test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}));
+  test.addOutputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}),
+                  ADD_TO);
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}),
+                  ADD_TO);
+  test.run();
+}
+TEST(RowConv, real) {
+  for (size_t numSamples : {17, 129, 2020}) {
+    for (size_t dim : {16, 512, 2560}) {
+      for (size_t context : {3, 19, 65}) {
+        VLOG(3) << " numSamples=" << numSamples << " dim=" << dim
+                << " context length=" << context;
+        testRowConvFw(numSamples, dim, context);
+        testRowConvBw(numSamples, dim, context);
+      }
+    }
+  }
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/RowConvLayer.cpp
+++ b/paddle/gserver/layers/RowConvLayer.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "RowConvLayer.h"
+#include "paddle/utils/Stat.h"
+namespace paddle {
+REGISTER_LAYER(row_conv, RowConvLayer);
+bool RowConvLayer::init(const LayerMap& layerMap,
+                        const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+  contexLength_ = config_.inputs(0).row_conv_conf().context_length();
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  weight_.reset(new Weight(contexLength_, getSize(), parameters_[0]));
+  createFunction(forward_, "RowConv", FuncConfig());
+  createFunction(backward_, "RowConvGrad", FuncConfig());
+  return true;
+}
+void RowConvLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  MatrixPtr input = getInputValue(0);
+  size_t height = input->getHeight();
+  size_t width = input->getWidth();
+  CHECK_EQ(width, getSize());
+  resetOutput(height, width);
+  const auto startPos = getInput(0).sequenceStartPositions->getVector(useGpu_);
+  MatrixPtr w = weight_->getW();
+  wDims_ = TensorShape({w->getHeight(), w->getWidth()});
+  MatrixPtr outV = getOutputValue();
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), *startPos);
+  inputs.addArg(*w, wDims_);
+  outputs.addArg(*getOutputValue(), *startPos, ADD_TO);
+  {
+    REGISTER_TIMER_INFO("RowConvForward", getName().c_str());
+    forward_[0]->calc(inputs, outputs);
+  }
+  /* activation */ {
+    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+void RowConvLayer::backward(const UpdateCallback& callback) {
+  /* Do derivation */ {
+    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
+    backwardActivation();
+  }
+  const auto startPos = getInput(0).sequenceStartPositions->getVector(useGpu_);
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getOutputGrad(), *startPos);
+  inputs.addArg(*getInputValue(0), *startPos);
+  inputs.addArg(*weight_->getW(), wDims_);
+  MatrixPtr inGrad = getInputGrad(0);
+  MatrixPtr wGrad = weight_->getWGrad();
+  size_t h = getInputValue(0)->getHeight();
+  size_t w = getInputValue(0)->getWidth();
+  outputs.addArg(
+      inGrad ? (*inGrad) : *(Matrix::create(nullptr, h, w, false, useGpu_)),
+      *startPos,
+      ADD_TO);
+  outputs.addArg(
+      wGrad ? (*wGrad)
+            : *(Matrix::create(nullptr, contexLength_, w, false, useGpu_)),
+      wDims_,
+      ADD_TO);
+  {
+    REGISTER_TIMER_INFO("RowConvBackward", getName().c_str());
+    backward_[0]->calc(inputs, outputs);
+  }
+  {
+    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
+    weight_->getParameterPtr()->incUpdate(callback);
+  }
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/RowConvLayer.h
+++ b/paddle/gserver/layers/RowConvLayer.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "Layer.h"
+namespace paddle {
+/**
+ * \brief Row Convolution Layer.
+ */
+class RowConvLayer : public Layer {
+public:
+  explicit RowConvLayer(const LayerConfig& config) : Layer(config) {}
+  ~RowConvLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+protected:
+  // Row convolution weight, context_lenght_ * fan_out.
+  // fan_out is the size of output feature.
+  std::unique_ptr<Weight> weight_;
+  // The step number to look ahead plus one equals contexLength_.
+  size_t contexLength_;
+  TensorShape wDims_;
+};
+}  // namespace paddle
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1705,6 +1705,26 @@ TEST(Layer, TransLayer) {
  }
 }
+TEST(Layer, RowConvLayer) {
+  const int context = 3;
+  const int size = 512;
+  TestConfig config;
+  config.layerConfig.set_type("row_conv");
+  config.layerConfig.set_size(size);
+  config.layerConfig.set_active_type("sigmoid");
+  config.inputDefs.push_back(
+      {INPUT_SEQUENCE_DATA, "layer_0", size, context * size});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  RowConvConfig* conv = input->mutable_row_conv_conf();
+  conv->set_context_length(context);
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "row_conv", 100, false, useGpu, false);
+  }
+}
 int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  initMain(argc, argv);

--- a/paddle/parameter/tests/test_argument.cpp
+++ b/paddle/parameter/tests/test_argument.cpp
@@ -42,7 +42,7 @@ TEST(Argument, poolSequenceWithStride) {
    CHECK_EQ(outStart[3], 4);
    CHECK_EQ(outStart[4], 7);
-    CHECK_EQ(stridePositions->getSize(), 8);
+    CHECK_EQ(stridePositions->getSize(), 8UL);
    auto result = reversed ? strideResultReversed : strideResult;
    for (int i = 0; i < 8; i++) {
      CHECK_EQ(stridePositions->getData()[i], result[i]);

--- a/paddle/trainer/CMakeLists.txt
+++ b/paddle/trainer/CMakeLists.txt
@@ -4,6 +4,7 @@ set(TRAINER_SOURCES
        ParameterUpdater.cpp
        ParamUtil.cpp
        RemoteParameterUpdater.cpp
+        NewRemoteParameterUpdater.cpp
        Tester.cpp
        Trainer.cpp
        TrainerInternal.cpp
@@ -16,6 +17,7 @@ set(TRAINER_HEADERS
        ParameterUpdater.h
        ParamUtil.h
        RemoteParameterUpdater.h
+        NewRemoteParameterUpdater.h
        Tester.h
        TesterConfig.h
        Trainer.h
@@ -32,7 +34,7 @@ add_style_check_target(paddle_trainer_lib
 add_style_check_target(paddle_trainer_lib
    ${TRAINER_HEADERS})
 add_dependencies(paddle_trainer_lib
-    gen_proto_cpp)
+    gen_proto_cpp paddle_pserver_cclient_lib)
 macro(add_paddle_exe TARGET_NAME)
  add_executable(${TARGET_NAME} ${ARGN})
@@ -56,3 +58,10 @@ install(TARGETS paddle_trainer paddle_merge_model
 set_target_properties(paddle_trainer PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
 set_target_properties(paddle_merge_model PROPERTIES INSTALL_RPATH_USE_LINK_PATH TRUE)
+if(APPLE)
+  set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security")
+endif()
+target_link_libraries(paddle_trainer ${CMAKE_CURRENT_SOURCE_DIR}/libpaddle_pserver_cclient.a)
+target_link_libraries(paddle_trainer_lib ${CMAKE_CURRENT_SOURCE_DIR}/libpaddle_pserver_cclient.a)
--- a/paddle/trainer/NewRemoteParameterUpdater.cpp
+++ b/paddle/trainer/NewRemoteParameterUpdater.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "NewRemoteParameterUpdater.h"
+#include "Trainer.h"
+#include "paddle/utils/Stat.h"
+DECLARE_int32(trainer_id);
+DECLARE_string(save_dir);
+namespace paddle {
+NewRemoteParameterUpdater::NewRemoteParameterUpdater(
+    const OptimizationConfig &config, const std::string pserverSpec)
+    : parameterClient_(-1),
+      newParameters_(nullptr),
+      newGradients_(nullptr),
+      pserverSpec_(pserverSpec) {}
+void NewRemoteParameterUpdater::init(
+    const std::vector<ParameterPtr> &parameters) {
+  ParameterUpdater::init(parameters);
+  for (auto &para : parameters_) {
+    para->getBuf(PARAMETER_VALUE)->zeroMem();
+    para->getBuf(PARAMETER_GRADIENT)->zeroMem();
+  }
+  // create parameter server client.
+  parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(),
+                                               FLAGS_trainer_id == 0);
+  // init new parameter and gradient.
+  newParameters_ = initNewParameter(PARAMETER_VALUE);
+  newGradients_ = initNewParameter(PARAMETER_GRADIENT);
+  // init parameter, one trainer will get the opportunity to int parameter and
+  // send them to parameter server. Others will get the initialized parameter
+  // from parameter server
+  if (paddle_begin_init_params(parameterClient_)) {
+    LOG(INFO) << "paddle_begin_init_params start";
+    for (int i = 0; i < parameterSize(); ++i) {
+      auto paramConfig = parameters_[i]->getConfig();
+      std::string bytes = paramConfig.SerializeAsString();
+      const char *array = bytes.data();
+      int size = (int)bytes.size();
+      paddle_init_param(
+          parameterClient_, *newParameters_[i], (void *)array, size);
+    }
+    paddle_finish_init_params(parameterClient_);
+    LOG(INFO) << "paddle_begin_init_params done";
+  } else {
+    paddle_get_params(parameterClient_, newParameters_, parameterSize());
+  }
+  LOG(INFO) << "NewRemoteParameterUpdater initialized";
+}
+void NewRemoteParameterUpdater::updateImpl(Parameter *para) {}
+void NewRemoteParameterUpdater::finishBatch(real cost) {
+  // send gradient to parameter server.
+  paddle_send_grads(parameterClient_, newGradients_, parameterSize());
+  // get the updated parameter from parameterClient.
+  paddle_get_params(parameterClient_, newParameters_, parameterSize());
+  // clear gradient after update parameter.
+  for (auto &para : parameters_) {
+    para->getBuf(PARAMETER_GRADIENT)->zeroMem();
+  }
+}
+void NewRemoteParameterUpdater::startPass() {}
+bool NewRemoteParameterUpdater::finishPass() { return true; }
+}
--- a/paddle/trainer/NewRemoteParameterUpdater.h
+++ b/paddle/trainer/NewRemoteParameterUpdater.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <functional>
+#include <thread>
+#include "ParameterUpdater.h"
+#include "libpaddle_pserver_cclient.h"
+#include "paddle/pserver/ParameterClient2.h"
+#include "paddle/utils/Queue.h"
+#include "paddle/utils/Util.h"
+namespace paddle {
+/**
+ * New remote parameter updater for dense parameters that use cclient of go.
+ */
+class NewRemoteParameterUpdater : public ParameterUpdater {
+public:
+  NewRemoteParameterUpdater(const OptimizationConfig& config,
+                            const std::string pserverSpec);
+  ~NewRemoteParameterUpdater() {
+    releaseNewParameter(newParameters_);
+    releaseNewParameter(newGradients_);
+    if (parameterClient_ >= 0) paddle_pserver_client_release(parameterClient_);
+  }
+  /**
+   * initialize the internal parameter client and itself.
+   */
+  virtual void init(const std::vector<ParameterPtr>& parameters);
+  /**
+   * @brief start batch
+   *
+   * @note  one batch training exhibits stateful feature to help
+   *        to do performance tuning, sgd optimization if necessary.
+   */
+  virtual PassType startBatch(int64_t batchSize) { return PASS_TRAIN; }
+  /**
+   * send parameters to pservers and get returned parameters
+   * from all pservers if necessary.
+   */
+  virtual void finishBatch(real cost);
+  virtual void startPass();
+  virtual bool finishPass();
+protected:
+  /**
+   * work need to do after finishBatch
+   */
+  virtual void updateImpl(Parameter* para);
+private:
+  int parameterSize() { return (int)parameters_.size(); }
+  /**
+   * init parameter of go paddle pserver cclient.
+   * @param new_params
+   * @param type
+   */
+  paddle_parameter** initNewParameter(ParameterType type) {
+    paddle_parameter** new_params =
+        (paddle_parameter**)malloc(sizeof(paddle_parameter*) * parameterSize());
+    for (int i = 0; i < parameterSize(); ++i) {
+      new_params[i] = (paddle_parameter*)malloc(sizeof(paddle_parameter));
+      memset(new_params[i], 0, sizeof(paddle_parameter));
+    }
+    for (int i = 0; i < parameterSize(); ++i) {
+      ParameterPtr param = parameters_[i];
+      new_params[i]->element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
+      new_params[i]->name = (char*)param->getName().c_str();
+      new_params[i]->content =
+          (unsigned char*)(param->getBuf(type).get()->getData());
+      new_params[i]->content_len =
+          (int)param->getBuf(type).get()->getSize() * sizeof(real);
+    }
+    return new_params;
+  }
+  void releaseNewParameter(paddle_parameter** newParams) {
+    if (newParams != nullptr) {
+      for (int i = 0; i < parameterSize(); ++i) {
+        free(newParams[i]);
+      }
+      free(newParams);
+    }
+  }
+protected:
+  /// internal parameter client object for exchanging data with pserver
+  paddle_pserver_client parameterClient_;
+  /// the parameters for new pserver client
+  paddle_parameter** newParameters_;
+  /// the gradinets for new pserver client
+  paddle_parameter** newGradients_;
+  /// the specification of parameter server "host1:port,host1:port"
+  std::string pserverSpec_;
+};
+}  // namespace paddle
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -194,6 +194,10 @@ message MaxOutConfig {
  required uint32 groups = 2;
 }
+message RowConvConfig {
+  required uint32 context_length = 1;
+}
 message ProjectionConfig {
  required string type = 1;
  required string name = 2;
@@ -279,6 +283,7 @@ message LayerInputConfig {
  optional SppConfig spp_conf = 12;
  optional PriorBoxConfig priorbox_conf = 13;
  optional PadConfig pad_conf = 14;
+  optional RowConvConfig row_conv_conf = 15;
 }
 message LayerConfig {

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2081,6 +2081,23 @@ class MaxOutLayer(LayerBase):
                           g_layer_map[input_layer.name].width, out_channels)
+@config_layer('row_conv')
+class RowConvLayer(LayerBase):
+    def __init__(self, name, inputs, context_length, **xargs):
+        super(RowConvLayer, self).__init__(
+            name, 'maxout', 0, inputs=inputs, **xargs)
+        config_assert(
+            len(self.inputs) == 1,
+            'TransLayer must have one and only one input')
+        input_layer = self.get_input_layer(0)
+        row_conv_conf = self.config.inputs[0].row_conv_conf
+        row_conv_conf.context_length = context_length
+        self.set_layer_size(input_layer.size)
+        psize = context_length * input_layer.size
+        dims = [context_length, input_layer.size]
+        self.create_input_parameter(0, psize, dims)
 # key: cost type
 # value: cost class
 g_cost_map = {}

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -121,6 +121,7 @@ __all__ = [
    'smooth_l1_cost',
    'layer_support',
    'multiplex_layer',
+    'row_conv_layer',
    'dropout_layer',
    'prelu_layer',
 ]
@@ -179,17 +180,18 @@ class LayerType(object):
    EOSID_LAYER = 'eos_id'
    RECURRENT_LAYER = 'recurrent'
-    CONV_SHIFT_LAYER = 'conv_shift'
+    CONV_SHIFT_LAYER = "conv_shift"
-    TENSOR_LAYER = 'tensor'
+    TENSOR_LAYER = "tensor"
-    SEL_FC_LAYER = 'selective_fc'
+    SEL_FC_LAYER = "selective_fc"
-    SAMPLING_ID_LAYER = 'sampling_id'
+    SAMPLING_ID_LAYER = "sampling_id"
-    SLOPE_INTERCEPT_LAYER = 'slope_intercept'
+    SLOPE_INTERCEPT_LAYER = "slope_intercept"
-    LINEAR_COMBINATION_LAYER = 'convex_comb'
+    LINEAR_COMBINATION_LAYER = "convex_comb"
-    BLOCK_EXPAND = 'blockexpand'
+    BLOCK_EXPAND = "blockexpand"
-    MAXOUT = 'maxout'
+    MAXOUT = "maxout"
-    SPP_LAYER = 'spp'
+    SPP_LAYER = "spp"
-    PAD_LAYER = 'pad'
+    PAD_LAYER = "pad"
-    MULTIPLEX_LAYER = 'multiplex'
+    MULTIPLEX_LAYER = "multiplex"
+    ROW_CONV_LAYER = "row_conv"
    PRINT_LAYER = 'print'
    PRIORBOX_LAYER = 'priorbox'
@@ -5585,6 +5587,79 @@ def dropout_layer(input, dropout_rate, name=None):
 @wrap_name_default()
+@wrap_act_default(act=LinearActivation())
+@wrap_param_attr_default()
+@layer_support(DROPOUT)
+def row_conv_layer(input,
+                   context_len,
+                   act=None,
+                   name=None,
+                   param_attr=None,
+                   layer_attr=None):
+    """
+    The row convolution is called lookahead convolution. It is firstly
+    introduced in paper of `Deep Speech 2: End-toEnd Speech Recognition
+    in English and Mandarin <https://arxiv.org/pdf/1512.02595v1.pdf>`_ .
+    The bidirectional RNN that learns representation for a sequence by
+    performing a forward and a backward pass through the entire sequence.
+    However, unlike unidirectional RNNs, bidirectional RNNs are challenging
+    to deploy in an online and low-latency setting. The lookahead convolution
+    incorporates information from future subsequences in a computationally
+    efficient manner to improve unidirectional recurrent neural networks.
+    The connection of row convolution is different form the 1D sequence
+    convolution. Assumed that, the future context-length is k, that is to say,
+    it can get the output at timestep t by using the the input feature from t-th
+    timestep to (t+k+1)-th timestep. Assumed that the hidden dim of input
+    activations are d, the activations r_t for the new layer at time-step t are:
+    .. math::
+        r_{t,r} = \sum_{j=1}^{k + 1} {w_{i,j}h_{t+j-1, i}}
+                  \quad \text{for} \quad  (1 \leq i \leq d)
+    Note:
+        The `context_len` is `k + 1`. That is to say, the lookahead step
+        number plus one equals context_len.
+    .. code-block:: python
+       row_conv = row_conv_layer(input=input_layer, context_len=3)
+    :param input: The input layer.
+    :type input: LayerOutput
+    :param context_len: The context length equals the lookahead step number
+                        plus one.
+    :type context_len: int
+    :param act: Activation Type. Default is linear activation.
+    :type act: BaseActivation
+    :param param_attr: The Parameter Attribute. If None, the parameter will be
+                       initialized smartly. It's better set it by yourself.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput)
+    assert context_len > 0, "the context_len must be greatet than 0."
+    Layer(
+        inputs=[Input(input.name, **param_attr.attr)],
+        name=name,
+        context_length=context_len,
+        type=LayerType.ROW_CONV_LAYER,
+        active_type=act.name,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.ROW_CONV_LAYER, input, activation=act, size=input.size)
 @layer_support()
 @wrap_name_default()
 @wrap_param_attr_default()

--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -6,6 +6,6 @@ img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cos
 test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
 test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
 test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
-test_prelu_layer)
+test_prelu_layer test_row_conv)
 export whole_configs=(test_split_datasource)
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 2560
+  active_type: ""
+}
+layers {
+  name: "__row_conv_layer_0__"
+  type: "maxout"
+  size: 2560
+  active_type: "relu"
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "___row_conv_layer_0__.w0"
+    row_conv_conf {
+      context_length: 19
+    }
+  }
+}
+parameters {
+  name: "___row_conv_layer_0__.w0"
+  size: 48640
+  initial_mean: 0.0
+  initial_std: 0.229415733871
+  dims: 19
+  dims: 2560
+  initial_strategy: 0
+  initial_smart: true
+}
+input_layer_names: "data"
+output_layer_names: "__row_conv_layer_0__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "__row_conv_layer_0__"
+  input_layer_names: "data"
+  output_layer_names: "__row_conv_layer_0__"
+  is_recurrent_layer_group: false
+}
--- a/python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py
+from paddle.trainer_config_helpers import *
+settings(batch_size=1000, learning_rate=1e-5)
+data = data_layer(name='data', size=2560)
+row_conv = row_conv_layer(input=data, context_len=19, act=ReluActivation())
+outputs(row_conv)
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
@@ -149,3 +149,57 @@ def cluster_files_reader(files_pattern,
                    yield line
    return reader
+def convert(output_path,
+            reader,
+            num_shards,
+            name_prefix,
+            max_lines_to_shuffle=1000):
+    import recordio
+    import cPickle as pickle
+    import random
+    """
+    Convert data from reader to recordio format files.
+    :param output_path: directory in which output files will be saved.
+    :param reader: a data reader, from which the convert program will read data instances.
+    :param num_shards: the number of shards that the dataset will be partitioned into.
+    :param name_prefix: the name prefix of generated files.
+    :param max_lines_to_shuffle: the max lines numbers to shuffle before writing.
+    """
+    assert num_shards >= 1
+    assert max_lines_to_shuffle >= 1
+    def open_writers():
+        w = []
+        for i in range(0, num_shards):
+            n = "%s/%s-%05d-of-%05d" % (output_path, name_prefix, i,
+                                        num_shards - 1)
+            w.append(recordio.writer(n))
+        return w
+    def close_writers(w):
+        for i in range(0, num_shards):
+            w[i].close()
+    def write_data(w, lines):
+        random.shuffle(lines)
+        for i, d in enumerate(lines):
+            d = pickle.dumps(d, pickle.HIGHEST_PROTOCOL)
+            w[i % num_shards].write(d)
+    w = open_writers()
+    lines = []
+    for i, d in enumerate(reader()):
+        lines.append(d)
+        if i % max_lines_to_shuffle == 0 and i >= max_lines_to_shuffle:
+            write_data(w, lines)
+            lines = []
+            continue
+    write_data(w, lines)
+    close_writers(w)
--- a/python/paddle/v2/dataset/tests/common_test.py
+++ b/python/paddle/v2/dataset/tests/common_test.py
@@ -57,6 +57,38 @@ class TestCommon(unittest.TestCase):
        for idx, e in enumerate(reader()):
            self.assertEqual(e, str("0"))
+    def test_convert(self):
+        record_num = 10
+        num_shards = 4
+        def test_reader():
+            def reader():
+                for x in xrange(record_num):
+                    yield x
+            return reader
+        path = tempfile.mkdtemp()
+        paddle.v2.dataset.common.convert(path,
+                                         test_reader(), num_shards,
+                                         'random_images')
+        files = glob.glob(path + '/random_images-*')
+        self.assertEqual(len(files), num_shards)
+        recs = []
+        for i in range(0, num_shards):
+            n = "%s/random_images-%05d-of-%05d" % (path, i, num_shards - 1)
+            r = recordio.reader(n)
+            while True:
+                d = r.read()
+                if d is None:
+                    break
+                recs.append(d)
+        recs.sort()
+        self.assertEqual(total, record_num)
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
@@ -45,7 +45,12 @@ class Optimizer(object):
        return swig_api.ParameterUpdater.createRemoteUpdater(
            self.__opt_conf__, pass_num, use_sparse_updater)
-    def create_updater(self, is_local, num_passes, use_sparse_updater):
+    def __create_new_remote_updater__(self, pserver_spec):
+        return swig_api.ParameterUpdater.createNewRemoteUpdater(
+            self.__opt_conf__, pserver_spec)
+    def create_updater(self, is_local, num_passes, use_sparse_updater,
+                       pserver_spec):
        """
        create proper parameter_updater by configuration.
        :param is_local: create local or remote parameter updater
@@ -64,8 +69,12 @@ class Optimizer(object):
        if is_local:
            parameter_updater = self.__create_local_updater__()
        else:
-            parameter_updater = self.__create_remote_updater__(
+            if pserver_spec is None:
-                num_passes, use_sparse_updater)
+                parameter_updater = self.__create_remote_updater__(
+                    num_passes, use_sparse_updater)
+            else:
+                parameter_updater = self.__create_new_remote_updater__(
+                    pserver_spec)
        return parameter_updater

--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -49,7 +49,8 @@ class SGD(object):
                 parameters,
                 update_equation,
                 extra_layers=None,
-                 is_local=True):
+                 is_local=True,
+                 pserver_spec=None):
        if not isinstance(parameters, v2_parameters.Parameters):
            raise TypeError('parameters should be parameters')
@@ -63,6 +64,7 @@ class SGD(object):
        self.__parameters__ = parameters
        self.__topology_in_proto__ = topology.proto()
        self.__is_local__ = is_local
+        self.__pserver_spec__ = pserver_spec
        self.__use_sparse_updater__ = self.__topology__.use_sparse_updater()
        # # In local mode, disable sparse_remote_update.
@@ -126,7 +128,8 @@ class SGD(object):
        __check_train_args__(**locals())
        self.__parameter_updater__ = self.__optimizer__.create_updater(
-            self.__is_local__, num_passes, self.__use_sparse_updater__)
+            self.__is_local__, num_passes, self.__use_sparse_updater__,
+            self.__pserver_spec__)
        self.__parameter_updater__.init(self.__gradient_machine__)
        self.__gradient_machine__.start()