diff --git a/Dockerfile b/Dockerfile
index b6f99ca539d077164c71d797a5ccda7b1b5c44ba..39af60966b6cab7d8b9e644f4ea658613f8ba518 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -30,7 +30,8 @@ RUN apt-get update && \
     python-numpy python-matplotlib gcc g++ \
     automake locales clang-format-3.8 swig doxygen cmake  \
     liblapack-dev liblapacke-dev libboost-dev \
-    clang-3.8 llvm-3.8 libclang-3.8-dev && \
+    clang-3.8 llvm-3.8 libclang-3.8-dev \
+    net-tools && \
     apt-get clean -y
 
 # Install Go
diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst
index 1efa74ecda4170332d96603ca2253c68468474f9..c7b017bc07b25bc606fd838a5fb9d3715f4faecb 100644
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -59,6 +59,11 @@ context_projection
 ..  autoclass:: paddle.v2.layer.context_projection
     :noindex:
 
+row_conv
+--------
+..  autoclass:: paddle.v2.layer.row_conv
+    :noindex:
+
 Image Pooling Layer
 ===================
 
@@ -130,7 +135,7 @@ recurrent_group
 ---------------
 ..  autoclass:: paddle.v2.layer.recurrent_group
     :noindex:
-    
+
 lstm_step
 ---------
 ..  autoclass:: paddle.v2.layer.lstm_step
@@ -145,12 +150,12 @@ beam_search
 ------------
 ..  autoclass:: paddle.v2.layer.beam_search
     :noindex:
-    
+
 get_output
 ----------
 ..  autoclass:: paddle.v2.layer.get_output
     :noindex:
-    
+
 Mixed Layer
 ===========
 
@@ -203,7 +208,7 @@ trans_full_matrix_projection
 ----------------------------
 ..  autoclass:: paddle.v2.layer.trans_full_matrix_projection
     :noindex:
-    
+
 Aggregate Layers
 ================
 
@@ -346,6 +351,12 @@ sampling_id
 ..  autoclass:: paddle.v2.layer.sampling_id
     :noindex:
 
+multiplex
+---------
+..  autoclass:: paddle.v2.layer.multiplex
+    :noindex:
+
+
 Slicing and Joining Layers
 ==========================
 
@@ -434,10 +445,26 @@ smooth_l1_cost
 ..  autoclass:: paddle.v2.layer.smooth_l1_cost
     :noindex:
 
-Check Layer 
+Check Layer
 ============
 
 eos
 ---
 ..  autoclass:: paddle.v2.layer.eos
     :noindex:
+
+Miscs
+=====
+
+dropout
+--------------
+..  autoclass:: paddle.v2.layer.dropout
+    :noindex:
+
+Activation with learnable parameter
+===================================
+
+prelu
+--------
+..  autoclass:: paddle.v2.layer.prelu
+    :noindex:
diff --git a/doc/api/v2/config/networks.rst b/doc/api/v2/config/networks.rst
index b2a617fff134035c04eeabbbaf6d9cbe2a525f1c..6e813ab1a820d068ea3e54cad6178f1cf928eadc 100644
--- a/doc/api/v2/config/networks.rst
+++ b/doc/api/v2/config/networks.rst
@@ -125,11 +125,3 @@ simple_attention
     :members: simple_attention
     :noindex:
 
-Miscs
-=====
-
-dropout_layer
---------------
-..  automodule:: paddle.v2.networks
-    :members: dropout_layer
-    :noindex:
diff --git a/doc/design/cluster_train/pserver_client.md b/doc/design/cluster_train/pserver_client.md
index b3e4079010490b69db1de28157f0cab80cad2381..474b8c572cd92fc87e9f7f3f2b19d12cccd158de 100644
--- a/doc/design/cluster_train/pserver_client.md
+++ b/doc/design/cluster_train/pserver_client.md
@@ -74,14 +74,25 @@ typedef enum {
 typedef struct {
   char*               name;
   paddle_element_type element_type;
-  void*               content;
+  unsigned char*      content;
   int                 content_len;
 } paddle_parameter, paddle_gradient;
 
-typedef struct paddle_pserver_client paddle_pserver_client;
+typedef int paddle_pserver_client;
 
-paddle_pserver_client* paddle_new_pserver_client();
-void paddle_pserver_client_release(paddle_pserver_client* client);
+/**
+ * @brief creates a pserver client that talks to etcd for coordination.
+ */
+paddle_pserver_client paddle_new_etcd_pserver_client(char* etcd_addr);
+
+/**
+ * @brief creates a pserver client given pserver addresses.
+ *
+ * @param pserver_addrs comma-separated pserver addresses.
+ * @param selected if current pserver client is selected to initialize all parameter servers.
+ */
+paddle_pserver_client paddle_new_pserver_client(char* pserver_addrs, int selected);
+void paddle_pserver_client_release(paddle_pserver_client c);
 
 /**
  * @brief paddle_begin_init_params begins to initialize parameters on
@@ -95,7 +106,7 @@ void paddle_pserver_client_release(paddle_pserver_client* client);
  * @return 1 if the trainer is selected to initialize parameter
  * servers, otherwise 0.
  */
-int paddle_begin_init_params(paddle_pserver_client* client);
+int paddle_begin_init_params(paddle_pserver_client client);
 
 /**
  * @brief paddle_init_param initializes the parameter on parameter
@@ -109,7 +120,7 @@ int paddle_begin_init_params(paddle_pserver_client* client);
  * @paddle_begin_init_param). Or simply exit the program and wait for
  * the cluster management system to restart the trainer.
  */
-int paddle_init_param(paddle_pserver_client* client, paddle_parameter param, const unsigned char* param_config_proto, int config_len);
+int paddle_init_param(paddle_pserver_client client, paddle_parameter param, const unsigned char* param_config_proto, int config_len);
 
 /**
  * @brief paddle_finish_init_params tells parameter servers client has
@@ -120,7 +131,7 @@ int paddle_init_param(paddle_pserver_client* client, paddle_parameter param, con
  * @paddle_begin_init_param). Or simply exit the program and wait for
  * the cluster management system to restart the trainer.
  */
-int paddle_finish_init_params(paddle_pserver_client* client);
+int paddle_finish_init_params(paddle_pserver_client client);
 
 /**
  * @brief paddle_send_grads sends gradients to parameter servers for
@@ -131,7 +142,7 @@ int paddle_finish_init_params(paddle_pserver_client* client);
  * @param learning_rate the learning rate for the gradients.
  * @return 0 if successful, otherwise -1.
  */
-int paddle_send_grads(paddle_pserver_client* client, const paddle_gradient* grads, int len);
+int paddle_send_grads(paddle_pserver_client client, const paddle_gradient* grads, int len);
 
 /**
  * @brief paddle_get_params gets parameters from parameter servers.
@@ -139,13 +150,15 @@ int paddle_send_grads(paddle_pserver_client* client, const paddle_gradient* grad
  * paddle_get_params will block until parameters are initialized on
  * the parameter servers.
  *
- * @param names the array of names of the parameters to get.
- * @param dst the destination array of parameters to save to.
+ * @param dst the destination array of parameter pointers to save to.
+ * Each parameter pointer must be pre-populated with the required parameter
+ * name, and its content must be pre-allocated with the size of the required
+ * parameter on the pserver.
  * @param len the length of the names array and the paddle_parameter
  * array.
  * @return 0 if successful, otherwise -1.
  */
-int paddle_get_params(paddle_pserver_client* client, const char** names, paddle_parameter* dst, int len);
+int paddle_get_params(paddle_pserver_client client, paddle_parameter** dst, int len);
 
 /**
  * @brief paddle_save_model indicates parameters to save the parameter
@@ -154,5 +167,5 @@ int paddle_get_params(paddle_pserver_client* client, const char** names, paddle_
  * @param path the path to save parameters.
  * @return 0 if successful, otherwise -1.
  */
-int paddle_save_model(paddle_pserver_client* client, const char* path);
+int paddle_save_model(paddle_pserver_client client, const char* path);
 ```
diff --git a/go/pserver/cclient/cclient.go b/go/pserver/cclient/cclient.go
index 7fdf9f0ec20b1e65a86845c03cd9e46e4847941f..4476e762dad04009833421056aa5a49efd44ddaa 100644
--- a/go/pserver/cclient/cclient.go
+++ b/go/pserver/cclient/cclient.go
@@ -19,21 +19,9 @@ typedef struct {
   int                 content_len;
 } paddle_parameter, paddle_gradient;
 
-static inline void paddle_release_param(paddle_parameter* param) {
-  if (param != NULL) {
-    if (param->name != NULL) {
-      free(param->name);
-    }
-
-    if (param->content != NULL) {
-      free(param->content);
-    }
-
-    free(param);
-  }
-}
-
-typedef int client;
+typedef int paddle_pserver_client;
+#define PSERVER_ERROR -1
+#define PSERVER_OK 0
 */
 import "C"
 
@@ -48,10 +36,10 @@ import (
 
 var nullPtr = unsafe.Pointer(uintptr(0))
 var mu sync.Mutex
-var handleMap = make(map[C.client]*pserver.Client)
-var curHandle C.client
+var handleMap = make(map[C.paddle_pserver_client]*pserver.Client)
+var curHandle C.paddle_pserver_client
 
-func add(c *pserver.Client) C.client {
+func add(c *pserver.Client) C.paddle_pserver_client {
 	mu.Lock()
 	defer mu.Unlock()
 	client := curHandle
@@ -60,13 +48,13 @@ func add(c *pserver.Client) C.client {
 	return client
 }
 
-func get(client C.client) *pserver.Client {
+func get(client C.paddle_pserver_client) *pserver.Client {
 	mu.Lock()
 	defer mu.Unlock()
 	return handleMap[client]
 }
 
-func remove(client C.client) *pserver.Client {
+func remove(client C.paddle_pserver_client) *pserver.Client {
 	mu.Lock()
 	defer mu.Unlock()
 	h := handleMap[client]
@@ -100,7 +88,7 @@ func (l lister) List() []pserver.Server {
 }
 
 //export paddle_new_pserver_client
-func paddle_new_pserver_client(addrs *C.char, selected int) C.client {
+func paddle_new_pserver_client(addrs *C.char, selected int) C.paddle_pserver_client {
 	a := C.GoString(addrs)
 	as := strings.Split(a, ",")
 	servers := make([]pserver.Server, len(as))
@@ -113,27 +101,27 @@ func paddle_new_pserver_client(addrs *C.char, selected int) C.client {
 }
 
 //export paddle_new_etcd_pserver_client
-func paddle_new_etcd_pserver_client(etcd_addr *C.char) C.client {
+func paddle_new_etcd_pserver_client(etcd_addr *C.char) C.paddle_pserver_client {
 	// TODO(helin): fault tolerant pserver client using etcd.
 	panic("not implemented.")
 }
 
 //export paddle_pserver_client_release
-func paddle_pserver_client_release(client C.client) {
+func paddle_pserver_client_release(client C.paddle_pserver_client) {
 	remove(client)
 }
 
 //export paddle_begin_init_params
-func paddle_begin_init_params(client C.client) C.int {
+func paddle_begin_init_params(client C.paddle_pserver_client) C.int {
 	c := get(client)
 	if selected := c.BeginInitParams(); selected {
 		return 1
 	}
-	return 0
+	return C.PSERVER_OK
 }
 
 //export paddle_init_param
-func paddle_init_param(client C.client, param C.paddle_parameter, param_config unsafe.Pointer, config_len C.int) C.int {
+func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter, param_config unsafe.Pointer, config_len C.int) C.int {
 	et := pserver.ElementType(param.element_type)
 	name := C.GoString(param.name)
 	content := cArrayToSlice(unsafe.Pointer(param.content), int(param.content_len))
@@ -143,28 +131,38 @@ func paddle_init_param(client C.client, param C.paddle_parameter, param_config u
 	}
 	c := get(client)
 	err := c.InitParam(pc)
+
 	if err != nil {
+		if err.Error() == pserver.AlreadyInitialized {
+			log.Printf("parameter %s already initialized, treat paddle_init_param as sucessful.\n", name)
+			return C.PSERVER_OK
+		}
 		log.Println(err)
-		return -1
+		return C.PSERVER_ERROR
 	}
 
-	return 0
+	return C.PSERVER_OK
 }
 
 //export paddle_finish_init_params
-func paddle_finish_init_params(client C.client) C.int {
+func paddle_finish_init_params(client C.paddle_pserver_client) C.int {
 	c := get(client)
 	err := c.FinishInitParams()
 	if err != nil {
+		if err.Error() == pserver.AlreadyInitialized {
+			log.Println("parameters already initialized, treat paddle_finish_init_params as sucessful.")
+			return C.PSERVER_OK
+		}
+
 		log.Println(err)
-		return -1
+		return C.PSERVER_ERROR
 	}
 
-	return 0
+	return C.PSERVER_OK
 }
 
 //export paddle_send_grads
-func paddle_send_grads(client C.client, grads **C.paddle_gradient, total C.int) C.int {
+func paddle_send_grads(client C.paddle_pserver_client, grads **C.paddle_gradient, total C.int) C.int {
 	var gs []pserver.Gradient
 	for i := 0; i < int(total); i++ {
 		grad := *(**C.paddle_gradient)(unsafe.Pointer((uintptr(unsafe.Pointer(grads)) + uintptr(i)*unsafe.Sizeof(*grads))))
@@ -178,85 +176,81 @@ func paddle_send_grads(client C.client, grads **C.paddle_gradient, total C.int)
 	err := c.SendGrads(gs)
 	if err != nil {
 		log.Println(err)
-		return -1
+		return C.PSERVER_ERROR
 	}
 
-	return 0
+	return C.PSERVER_OK
 }
 
 //export paddle_get_params
-func paddle_get_params(client C.client, names **C.char, dst **C.paddle_parameter, total C.int) C.int {
+func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter, total C.int) C.int {
 	var ns []string
 	for i := 0; i < int(total); i++ {
-		name := *(**C.char)(unsafe.Pointer((uintptr(unsafe.Pointer(names)) + uintptr(i)*unsafe.Sizeof(*names))))
-		ns = append(ns, C.GoString(name))
+		param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))
+		ns = append(ns, C.GoString(param.name))
 	}
 	c := get(client)
 	ps, err := c.GetParams(ns)
 	if err != nil {
 		log.Println(err)
-		return -1
+		return C.PSERVER_ERROR
 	}
 
-	for i := 0; i < int(total); i++ {
-		if i >= len(ps) {
-			break
+	if len(ps) != len(ns) {
+		pn := make([]string, len(ps))
+		for i, p := range ps {
+			pn[i] = p.Name
+		}
+		log.Printf("pserver returned wrong number of parameters. Requested: %s, returned: %s.\n", strings.Join(ns, ", "), strings.Join(pn, ", "))
+		return C.PSERVER_ERROR
+	}
+
+	for i := range ps {
+		if ns[i] != ps[i].Name {
+			pn := make([]string, len(ps))
+			for i, p := range ps {
+				pn[i] = p.Name
+			}
+			log.Printf("pserver returned wrong parameters, or not in requested order. Requested: %s, returned: %s.\n", strings.Join(ns, ", "), strings.Join(pn, ", "))
+			return C.PSERVER_ERROR
+		}
+	}
 
+	for i := 0; i < int(total); i++ {
 		p := ps[i]
-		paramPtr := (**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))
-		param := *paramPtr
-		nameReady := false
-		contentAllocated := false
+		param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))
 
 		if unsafe.Pointer(param) == nullPtr {
-			*paramPtr = (*C.paddle_parameter)(C.calloc(1, C.size_t(unsafe.Sizeof(*param))))
-			param = *paramPtr
+			log.Println("must pre-allocate parameter.")
+			return C.PSERVER_ERROR
 		} else {
-			if unsafe.Pointer(param.name) != nullPtr {
-				if n := C.GoString(param.name); n != p.Name {
-					log.Println("Warning: the pre-allocated parameter name does not match the parameter name, it will be freed.", n, p.Name)
-					C.free(unsafe.Pointer(param.name))
-				} else {
-					nameReady = true
-				}
-			}
-
 			if unsafe.Pointer(param.content) != nullPtr {
-				if int(param.content_len) == len(p.Content) {
-					contentAllocated = true
-				} else {
-					log.Println("Warning: the pre-allocated content len does not match parameter content len, the pre-allocated content will be freed.", param.content_len, len(p.Content))
-					C.free(unsafe.Pointer(param.content))
+				if int(param.content_len) != len(p.Content) {
+					log.Printf("the pre-allocated content len does not match parameter content len. Pre-allocated len: %d, returned len: %d", param.content_len, len(p.Content))
+					return C.PSERVER_ERROR
 				}
 			}
 		}
 
-		if !nameReady {
-			param.name = C.CString(p.Name)
-		}
-		if !contentAllocated {
-			param.content = (*C.uchar)(C.malloc(C.size_t(len(p.Content))))
-		}
 		C.memcpy(unsafe.Pointer(param.content), unsafe.Pointer(&p.Content[0]), C.size_t(len(p.Content)))
 		param.content_len = C.int(len(p.Content))
 		param.element_type = C.paddle_element_type(p.ElementType)
 	}
 
-	return 0
+	return C.PSERVER_OK
 }
 
 //export paddle_save_model
-func paddle_save_model(client C.client, path *C.char) C.int {
+func paddle_save_model(client C.paddle_pserver_client, path *C.char) C.int {
 	p := C.GoString(path)
 	c := get(client)
 	err := c.Save(p)
 	if err != nil {
 		log.Println(err)
-		return -1
+		return C.PSERVER_ERROR
 	}
 
-	return 0
+	return C.PSERVER_OK
 }
 
 func main() {} // Required but ignored
diff --git a/go/pserver/cclient/test/CMakeLists.txt b/go/pserver/cclient/test/CMakeLists.txt
index e7d0a74237e9cd049cff96ff0589c53cfb329e7f..882a894ef2b6d24aed42062a667b69a18204406f 100644
--- a/go/pserver/cclient/test/CMakeLists.txt
+++ b/go/pserver/cclient/test/CMakeLists.txt
@@ -7,6 +7,8 @@ add_dependencies(test_cclient paddle_pserver_cclient)
 
 if(APPLE)
   set(CMAKE_EXE_LINKER_FLAGS "-framework CoreFoundation -framework Security")
+else()
+  set(CMAKE_EXE_LINKER_FLAGS "-pthread")  
 endif()
 
 if(PROJ_ROOT)
diff --git a/go/pserver/cclient/test/main.c b/go/pserver/cclient/test/main.c
index 6adc3c9b533f54b343f46ab688799878888fabfd..c8aed0f2e8e515ace88f03578d117b8faede9790 100644
--- a/go/pserver/cclient/test/main.c
+++ b/go/pserver/cclient/test/main.c
@@ -2,77 +2,87 @@
 
 #include "libpaddle_pserver_cclient.h"
 
-void fail() {
-  // TODO(helin): fix: gtest using cmake is not working, using this
-  // hacky way for now.
-  printf("test failed.\n");
+// TODO(helin): Fix: gtest using cmake is not working, using this
+// hacky way for now.
+#define fail()                                          \
+  fprintf(stderr, "info: %s:%d: ", __FILE__, __LINE__); \
   exit(-1);
+
+void sendGrads(paddle_pserver_client c) {
+  unsigned char grad_a[2000] = {2};
+  unsigned char grad_b[3000] = {3};
+  paddle_gradient grads[2] = {
+      {"param_a", PADDLE_ELEMENT_TYPE_FLOAT32, grad_a, 2000},
+      {"param_b", PADDLE_ELEMENT_TYPE_FLOAT32, grad_b, 3000}};
+
+  if (paddle_send_grads(c, grads, 2)) {
+    fail();
+  }
+}
+
+void getParams(paddle_pserver_client c) {
+  paddle_parameter param_a;
+  paddle_parameter param_b;
+  char name_a[] = "param_a";
+  char name_b[] = "param_b";
+  // Must pre-allocate the parameter content before calling paddle_get_params.
+  unsigned char content_a[2000] = {};
+  unsigned char content_b[3000] = {};
+  param_a.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
+  param_a.name = name_a;
+  param_a.content = content_a;
+  param_a.content_len = 2000;
+  param_b.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
+  param_b.name = name_b;
+  param_b.content = content_b;
+  param_b.content_len = 3000;
+
+  paddle_parameter* params[2] = {&param_a, &param_b};
+  if (paddle_get_params(c, params, 2)) {
+    fail();
+  }
 }
 
 int main() {
   char addr[] = "localhost:3000";
-  client c = paddle_new_pserver_client(addr, 1);
+  paddle_pserver_client c = paddle_new_pserver_client(addr, 1);
 retry:
   if (paddle_begin_init_params(c)) {
     paddle_parameter param;
     char name_a[] = "param_a";
     char name_b[] = "param_b";
-    unsigned char content[] = {0x00, 0x11, 0x22};
+    unsigned char content_a[2000] = {1};
+    unsigned char content_b[3000] = {0};
     param.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
     param.name = name_a;
-    param.content = content;
-    param.content_len = 3;
-    if (paddle_init_param(c, param, NULL, 0) != 0) {
+    param.content = content_a;
+    param.content_len = 2000;
+    int error = paddle_init_param(c, param, NULL, 0);
+    if (error != 0) {
       goto retry;
     }
-    param.element_type = PADDLE_ELEMENT_TYPE_INT32;
+
+    param.element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
     param.name = name_b;
-    param.content = content;
-    param.content_len = 3;
-    if (paddle_init_param(c, param, NULL, 0) != 0) {
+    param.content = content_b;
+    param.content_len = 3000;
+    error = paddle_init_param(c, param, NULL, 0);
+    if (error != 0) {
       goto retry;
     }
-    if (paddle_finish_init_params(c) != 0) {
+
+    error = paddle_finish_init_params(c);
+    if (error != 0) {
       goto retry;
     }
-  } else {
-    fail();
   }
 
-  unsigned char content[] = {0x00, 0x11, 0x22};
-  paddle_gradient** grads =
-      (paddle_gradient**)malloc(sizeof(paddle_gradient*) * 2);
-  grads[0] = (paddle_gradient*)malloc(sizeof(paddle_gradient));
-  grads[0]->name = "param_a";
-  grads[0]->content = content;
-  grads[0]->content_len = 3;
-  grads[0]->element_type = PADDLE_ELEMENT_TYPE_FLOAT32;
-
-  grads[1] = (paddle_gradient*)malloc(sizeof(paddle_gradient));
-  grads[1]->name = "param_b";
-  grads[1]->content = content;
-  grads[1]->content_len = 3;
-  grads[1]->element_type = PADDLE_ELEMENT_TYPE_INT32;
-
-  if (paddle_send_grads(c, grads, 2) != 0) {
-    fail();
+  for (int i = 0; i < 100; i++) {
+    sendGrads(c);
+    getParams(c);
   }
 
-  paddle_parameter* params[2] = {NULL, NULL};
-  char* names[] = {"param_a", "param_b"};
-  if (paddle_get_params(c, names, params, 2) != 0) {
-    fail();
-  }
-
-  // get parameters again by reusing the allocated parameter buffers.
-  if (paddle_get_params(c, names, params, 2) != 0) {
-    fail();
-  }
-
-  paddle_release_param(params[0]);
-  paddle_release_param(params[1]);
-
-  if (paddle_save_model(c, "/tmp/") != 0) {
+  if (paddle_save_model(c, "/tmp/")) {
     fail();
   }
 
diff --git a/go/pserver/cclient/test/mnist_test.py b/go/pserver/cclient/test/test_mnist.py
similarity index 100%
rename from go/pserver/cclient/test/mnist_test.py
rename to go/pserver/cclient/test/test_mnist.py
diff --git a/go/pserver/client_test.go b/go/pserver/client_test.go
index a9a0948a51a31a1c7393f716e3dfc436dbf919af..d0371a26a13fac9daecacd0b6a271caa6d830651 100644
--- a/go/pserver/client_test.go
+++ b/go/pserver/client_test.go
@@ -117,7 +117,7 @@ func TestClientFull(t *testing.T) {
 
 	for i := range params {
 		if names[i] != params[i].Name {
-			t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", names[i], params[i])
+			t.Fatalf("order of returned parameter does not required: parameter name: %s, required name: %s", names[i], params[i].Name)
 		}
 	}
 }
diff --git a/go/pserver/service.go b/go/pserver/service.go
index 7d2a1fea865091edb2802e1c9f8f57e398559562..78a2bfaf6347019333bf1c7ee6cdc04d93ab1370 100644
--- a/go/pserver/service.go
+++ b/go/pserver/service.go
@@ -9,8 +9,10 @@ import (
 // ElementType is the type of elements of a Parameter.
 type ElementType int
 
-var ErrAlreadyInitialized = errors.New("pserver already initialized")
-var ErrUninitialized = errors.New("pserver not fully initialized")
+const (
+	AlreadyInitialized = "pserver already initialized"
+	Uninitialized      = "pserver not fully initialized"
+)
 
 // Supported element types
 const (
@@ -59,7 +61,7 @@ func NewService() *Service {
 func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) error {
 	select {
 	case <-s.initialized:
-		return ErrAlreadyInitialized
+		return errors.New(AlreadyInitialized)
 	default:
 	}
 
@@ -80,7 +82,7 @@ func (s *Service) InitParam(paramWithConfigs ParameterWithConfig, dummy *int) er
 func (s *Service) FinishInitParams(dummy0 int, dummy1 *int) error {
 	select {
 	case <-s.initialized:
-		return ErrAlreadyInitialized
+		return errors.New(AlreadyInitialized)
 	default:
 	}
 
@@ -94,7 +96,7 @@ func (s *Service) SendGrad(g Gradient, dummy *int) error {
 	select {
 	case <-s.initialized:
 	default:
-		return ErrUninitialized
+		return errors.New(Uninitialized)
 	}
 
 	s.mu.Lock()
diff --git a/go/pserver/service_test.go b/go/pserver/service_test.go
index 4c9fac4536e09013916aadb26af3a86a5a775b4f..796492ffb47f109b1d47101712195903b8dc8457 100644
--- a/go/pserver/service_test.go
+++ b/go/pserver/service_test.go
@@ -16,7 +16,7 @@ func TestFull(t *testing.T) {
 	p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}
 	p.ElementType = pserver.Int32
 	var dummy int
-	err := s.InitParam(pserver.ParameterWithConfig{p, nil}, &dummy)
+	err := s.InitParam(pserver.ParameterWithConfig{Param: p, Config: nil}, &dummy)
 	if err != nil {
 		t.FailNow()
 	}
@@ -25,7 +25,7 @@ func TestFull(t *testing.T) {
 	p1.Name = "param_b"
 	p1.Content = []byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}
 	p1.ElementType = pserver.Float32
-	err = s.InitParam(pserver.ParameterWithConfig{p1, nil}, &dummy)
+	err = s.InitParam(pserver.ParameterWithConfig{Param: p1, Config: nil}, &dummy)
 	if err != nil {
 		t.FailNow()
 	}
@@ -81,7 +81,7 @@ func TestMultipleInit(t *testing.T) {
 	}
 
 	err = s.FinishInitParams(0, &dummy)
-	if err != pserver.ErrAlreadyInitialized {
+	if err.Error() != pserver.AlreadyInitialized {
 		t.FailNow()
 	}
 }
@@ -90,7 +90,7 @@ func TestUninitialized(t *testing.T) {
 	s := pserver.NewService()
 	var dummy int
 	err := s.SendGrad(pserver.Gradient{}, &dummy)
-	if err != pserver.ErrUninitialized {
+	if err.Error() != pserver.Uninitialized {
 		t.FailNow()
 	}
 }
@@ -135,7 +135,7 @@ func TestBlockUntilInitialized(t *testing.T) {
 	p.Content = []byte{1, 0, 0, 0, 2, 0, 0, 0, 3, 0, 0, 0}
 	p.ElementType = pserver.Int32
 	var dummy int
-	err := s.InitParam(pserver.ParameterWithConfig{p, nil}, &dummy)
+	err := s.InitParam(pserver.ParameterWithConfig{Param: p, Config: nil}, &dummy)
 	if err != nil {
 		t.FailNow()
 	}
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 9898dc083ebb1783a0e2ddd12afaa9c3d5a79e98..47ca1833967ee705d6558b1dad06a6335b30f03a 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -8,6 +8,7 @@ add_subdirectory(gserver)
 add_subdirectory(pserver)
 add_subdirectory(trainer)
 add_subdirectory(scripts)
+add_subdirectory(strings)
 
 # Do not build go directory until go cmake is working smoothly.
 # if(CMAKE_Go_COMPILER)
diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt
index c258a152406c1e965bb1129e74c1d289861b881d..c9433a38de4d005ebe229c55916401a5f82e9ef3 100644
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -41,6 +41,7 @@ SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS
     paddle_network
     paddle_proto
     ${external_project_dependencies}
+    ${RDMA_LIBS}
 )
 
 IF(APPLE)
@@ -73,6 +74,7 @@ SWIG_LINK_LIBRARIES(swig_paddle
     ${CMAKE_DL_LIBS}
     ${EXTERNAL_LIBS}
     ${CMAKE_THREAD_LIBS_INIT}
+    ${RDMA_LD_FLAGS}
     ${START_END}
 )
 
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 233a53709a80f06dd2a06995b159c1aef10e2788..1f54ac1231c6ac2e19b25bb336292194c63c11e9 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -28,6 +28,7 @@ if(WITH_TESTING)
     add_simple_unittest(PadOpTest)
     add_simple_unittest(MulOpTest)
     add_simple_unittest(CosSimOpTest)
+    add_simple_unittest(RowConvOpTest)
 endif()
 endif()
 
diff --git a/paddle/function/RowConvOp.cpp b/paddle/function/RowConvOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b6501e8f4db7fd33891cd80e07a6f36dd0b34532
--- /dev/null
+++ b/paddle/function/RowConvOp.cpp
@@ -0,0 +1,225 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "RowConvOp.h"
+#include <iostream>
+#include "paddle/math/Vector.h"
+
+namespace paddle {
+
+template <>
+void RowConv<DEVICE_TYPE_CPU>(CpuMatrix& out,
+                              const CpuMatrix& in,
+                              const CpuMatrix& filter,
+                              const CpuIVector& seq) {
+  const int* starts = seq.getData();
+  const size_t numSeq = seq.getSize() - 1;
+  const size_t contextLength = filter.getHeight();
+  for (size_t i = 0; i < numSeq; ++i) {
+    size_t begin = starts[i];
+    size_t end = starts[i + 1];
+    for (size_t j = begin; j < end; ++j) {
+      MatrixPtr x;
+      MatrixPtr w;
+      if ((j + contextLength) < end) {
+        x = (const_cast<CpuMatrix&>(in)).subMatrix(j, contextLength);
+        w = (const_cast<CpuMatrix&>(filter)).subMatrix(0, contextLength);
+      } else {
+        x = (const_cast<CpuMatrix&>(in)).subMatrix(j, end - j);
+        w = (const_cast<CpuMatrix&>(filter)).subMatrix(0, end - j);
+      }
+      MatrixPtr y = out.subMatrix(j, 1);
+      y->addDotMulVMM(*x, *w);
+    }
+  }
+}
+
+template <>
+void RowConvGrad<DEVICE_TYPE_CPU>(const CpuMatrix& outG,
+                                  const CpuMatrix& in,
+                                  const CpuMatrix& filter,
+                                  CpuMatrix& inG,
+                                  CpuMatrix& filterG,
+                                  const CpuIVector& seq) {
+  // gradient w.r.t filter
+  const int* starts = seq.getData();
+  const size_t numSeq = seq.getSize() - 1;
+  const size_t contextLength = filter.getHeight();
+  if (filterG) {
+    for (size_t i = 0; i < numSeq; ++i) {
+      size_t begin = starts[i];
+      size_t end = starts[i + 1];
+      size_t steps = end - begin;
+      for (size_t j = 0; j < contextLength && (begin + j) < end; ++j) {
+        MatrixPtr x =
+            (const_cast<CpuMatrix&>(in)).subMatrix(begin + j, steps - j);
+        MatrixPtr dy =
+            (const_cast<CpuMatrix&>(outG)).subMatrix(begin, steps - j);
+        MatrixPtr dw = filterG.subMatrix(j, 1);
+        dw->addDotMulVMM(*dy, *x);
+      }
+    }
+  }
+
+  // gradient w.r.t input feature
+  if (inG) {
+    for (size_t i = 0; i < numSeq; ++i) {
+      size_t begin = starts[i];
+      size_t end = starts[i + 1];
+      size_t steps = end - begin;
+      for (size_t j = 0; j < steps; ++j) {
+        MatrixPtr dx = inG.subMatrix(begin + j, 1);
+        for (size_t t = 0; t < contextLength; ++t) {
+          if (int(j - t) >= 0) {
+            MatrixPtr dy =
+                (const_cast<CpuMatrix&>(outG)).subMatrix(begin + j - t, 1);
+            MatrixPtr w = (const_cast<CpuMatrix&>(filter)).subMatrix(t, 1);
+            dx->addDotMul(*dy, *w, 1.0, 1.0);
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * \brief Row convolution is also called lookahead convolution. It was first
+ * introduced in the Deep Speech 2 system. A bidirectional RNN learns a
+ * representation for a sequence by performing a forward and a backward pass
+ * through the entire sequence. However, unlike unidirectional RNNs,
+ * bidirectional RNNs are challenging to deploy in an online and low-latency
+ * setting. The lookahead convolution incorporates information from future
+ * subsequences in a computationally efficient manner to improve unidirectional
+ * recurrent neural networks.
+ *
+ * The connection of row convolution is different from the 1D sequence
+ * convolution. Assume that the future context length is k, that is to say,
+ * the output at timestep t is computed from the input features from the t-th
+ * timestep to the (t+k)-th timestep. Assume that the hidden dim of the input
+ * activations is d; the activations r_t for the new layer at time-step t are:
+ *
+ *
+ *            -- k + 1
+ *  r(t,i) =  >       W(i,j) * h(t+j-1, i),  for (1 <= i <= d)
+ *            -- j = 1
+ *
+ *
+ * The weight shape is: (k + 1) x d
+ * Function Arguments:
+ *
+ * \param inputs[0]  The input activations.
+ * \param inputs[1]  The filter (or weight) and shape is (k+1) x d.
+ * \param outputs[0] The output activations.
+ *
+ * [1] Dario Amodei, et al. Deep Speech 2: End-to-End Speech Recognition in
+ *     English and Mandarin.
+ *     https://arxiv.org/abs/1512.02595
+ */
+
+template <DeviceType Device>
+class RowConvFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {}
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    // check
+    CHECK_EQ(2UL, inputs.size());
+    CHECK_EQ(1UL, outputs.size());
+    // TODO(qingqing): support ASSIGN_TO.
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
+        << "SequenceArg required here.";
+    const auto in = dynamic_cast<const SequenceArg&>(inputs[0]);
+    auto out = dynamic_cast<const SequenceArg&>(outputs[0]);
+    auto w = inputs[1];
+    CHECK(in.data() && out.data() && in.getSequenceId().data());
+    CHECK_EQ(in.shape().ndims(), 2UL);
+    CHECK(in.shape() == out.shape());
+    CHECK_EQ(w.shape()[1], in.shape()[1]);
+
+    auto outMat = out.matrix<Device>();
+    const auto inMat = in.matrix<Device>();
+    const auto wMat = w.matrix<Device>();
+    const auto seqId = in.getSequenceId().vector<int, Device>();
+
+    RowConv<Device>(outMat, inMat, wMat, seqId);
+  }
+};
+
+/**
+ * \brief The backward of row convolution function. This function calculates
+ * the gradient w.r.t filter and the gradient w.r.t input activations(or data).
+ *
+ * Argument in this Function:
+ *
+ * \param inputs[0]  The gradient w.r.t output activations.
+ * \param inputs[1]  The input activations.
+ * \param inputs[2]  The filter (or weight) and shape is (k+1) x d.
+ * \param outputs[0] The gradient w.r.t input activations.
+ * \param outputs[1] The gradient w.r.t filter.
+ *
+ * Abbreviation:
+ * w.r.t: with respect to.
+ */
+
+template <DeviceType Device>
+class RowConvGradFunc : public FunctionBase {
+  // TODO(qingqing): split into RowConvDataFunc and RowConvWeightFunc
+public:
+  void init(const FuncConfig& config) override {}
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    // check
+    CHECK_EQ(3UL, inputs.size());
+    CHECK_EQ(2UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    CHECK_EQ(outputs[1].getArgType(), ADD_TO);
+    CHECK(inputs[0].isSequenceArg() && inputs[1].isSequenceArg() &&
+          outputs[0].isSequenceArg())
+        << "SequenceArg required here.";
+
+    const auto outGrad = dynamic_cast<const SequenceArg&>(inputs[0]);
+    const auto in = dynamic_cast<const SequenceArg&>(inputs[1]);
+    const auto w = inputs[2];
+    auto inGrad = dynamic_cast<const SequenceArg&>(outputs[0]);
+    auto wGrad = outputs[1];
+
+    CHECK_EQ(in.shape().ndims(), 2UL);
+    CHECK(in.shape() == inGrad.shape());
+    CHECK(in.shape() == outGrad.shape());
+    CHECK_EQ(wGrad.shape()[1], in.shape()[1]);
+
+    const auto outGMat = outGrad.matrix<Device>();
+    const auto inMat = in.matrix<Device>();
+    const auto wMat = w.matrix<Device>();
+    auto inGMat = inGrad.data()
+                      ? inGrad.matrix<Device>()
+                      : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
+    auto wGMat = wGrad.data()
+                     ? wGrad.matrix<Device>()
+                     : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
+    const auto seqId = in.getSequenceId().vector<int, Device>();
+
+    RowConvGrad<Device>(outGMat, inMat, wMat, inGMat, wGMat, seqId);
+  }
+};
+
+REGISTER_TYPED_FUNC(RowConv, CPU, RowConvFunc);
+REGISTER_TYPED_FUNC(RowConvGrad, CPU, RowConvGradFunc);
+#ifndef PADDLE_ONLY_CPU
+REGISTER_TYPED_FUNC(RowConv, GPU, RowConvFunc);
+REGISTER_TYPED_FUNC(RowConvGrad, GPU, RowConvGradFunc);
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/RowConvOp.h b/paddle/function/RowConvOp.h
new file mode 100644
index 0000000000000000000000000000000000000000..2c5de6151aa2ed73ace1569d880b1c3677c195c4
--- /dev/null
+++ b/paddle/function/RowConvOp.h
@@ -0,0 +1,56 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/**
+ * \brief The forward pass of row convolution.
+ *
+ * \param[out] out      The output data, of shape h x d, where h is the sum
+ *                      of the time steps of all samples in one mini-batch.
+ * \param[in]  in       The input data, of shape h x d.
+ * \param[in]  filter   The filter, of shape k x d, where k is the number of
+ *                      lookahead steps plus one.
+ * \param[in]  seq      The sequence start positions.
+ *
+ */
+template <DeviceType DType>
+void RowConv(typename Tensor<real, DType>::Matrix& out,
+             const typename Tensor<real, DType>::Matrix& in,
+             const typename Tensor<real, DType>::Matrix& filter,
+             const typename Tensor<int, DType>::Vector& seq);
+
+/**
+ * \brief The backward pass of row convolution.
+ *
+ * \param[in]  outG     The gradient w.r.t. the output data.
+ * \param[in]  in       The input data.
+ * \param[in]  filter   The filter.
+ * \param[out] inG      The gradient w.r.t. the input data.
+ * \param[out] filterG  The gradient w.r.t. the filter.
+ * \param[in]  seq      The sequence start positions.
+ *
+ */
+template <DeviceType DType>
+void RowConvGrad(const typename Tensor<real, DType>::Matrix& outG,
+                 const typename Tensor<real, DType>::Matrix& in,
+                 const typename Tensor<real, DType>::Matrix& filter,
+                 typename Tensor<real, DType>::Matrix& inG,
+                 typename Tensor<real, DType>::Matrix& filterG,
+                 const typename Tensor<int, DType>::Vector& seq);
+}  // namespace paddle
diff --git a/paddle/function/RowConvOpGpu.cu b/paddle/function/RowConvOpGpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c0b947e224313abaf4fadfb8293dc78ca085ff84
--- /dev/null
+++ b/paddle/function/RowConvOpGpu.cu
@@ -0,0 +1,344 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_base.h"
+#include "RowConvOp.h"
+
+namespace paddle {
+
+template<int BLOCK_H, int BLOCK_W>
+__global__ void KeRowConv(real* y, const real* x,  const real* w,
+    const int* starts, const int height, const int width,
+    const int numSeq, const int context) {
+  // y[r] += sum over t < context of w[t] * x[r + t], stopping at sequence end.
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+  // Filter tile for this block's columns; indexed [filter row][threadIdx.x].
+  __shared__ real sw[BLOCK_H][BLOCK_W];
+
+  for (int i = tidy; i < context; i += blky) {
+    sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0;
+  }
+  // Barrier: every filter row is staged before the reads below.
+  __syncthreads();
+  // Sequences are walked one by one; threadIdx.y strides over time steps.
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      real sum = 0;
+      int off = (start + j) * width;
+      for (int t = 0; t < context; ++t) {
+        if ((start + j + t) < end) {
+          int xoff = off + t * width;
+          real xVal = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0;
+          sum += sw[t][tidx] * xVal;
+        }
+      }
+      if (gidx + tidx < width) {
+        y[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+
+__global__ void KeRowConv2(real* y, const real* x,  const real* w,
+    const int* starts, const int height, const int width,
+    const int numSeq, const int context) {
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+  // Forward pass reading the filter directly through w (no shared tile).
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      int off = (start + j) * width;
+      real sum = 0;
+      for (int t = 0; t < context && (start + j + t) < end; ++t) {
+        int xoff = off + t * width;
+        real xd = gidx + tidx < width ? x[xoff + gidx + tidx] : 0.0;
+        real wd = gidx + tidx < width ? w[t * width + gidx + tidx] : 0.0;
+        sum += wd * xd;
+      }
+      if (gidx + tidx < width) {
+        y[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+
+
+
+template <>
+void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,
+                              const GpuMatrix& in,
+                              const GpuMatrix& filter,
+                              const GpuIVector& seq) {
+  const size_t numSeq = seq.getSize() - 1;
+  const size_t contextLength = filter.getHeight();
+  const size_t height = in.getHeight();
+  const size_t width = in.getWidth();
+  // Raw data pointers passed to the kernels.
+  real* y = out.getData();
+  const real* x = in.getData();
+  const real* w = filter.getData();
+  const int* starts = seq.getData();
+  // 32 x 32 threads per block; one block per 32 columns of the input.
+  dim3 dimBlock(32, 32);
+  dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
+  // KeRowConv<32, 32> stages at most 32 filter rows; else use KeRowConv2.
+  if (contextLength <= 32) {
+    KeRowConv<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
+      (y, x, w, starts, height, width, numSeq, contextLength);
+  } else {
+    KeRowConv2<<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
+      (y, x, w, starts, height, width, numSeq, contextLength);
+  }
+  CHECK_SYNC("RowConv");
+}
+
+
+template<int BLOCK_H, int BLOCK_W, int CONTEXT>
+__global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
+    const int* starts, const int height, const int width, const int numSeq,
+    const int context) {
+  // dw[t] += sum over rows r of x[r] * dy[r - t], per column, within sequences.
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+  // Tiles are stored transposed; sh_dy has CONTEXT - 1 extra leading slots.
+  __shared__ real sh_x[BLOCK_W][BLOCK_H];
+  __shared__ real sh_dy[BLOCK_W][BLOCK_H + CONTEXT - 1];
+  __shared__ real sh_dw[CONTEXT][BLOCK_W];
+  // Clear the block-local accumulator for the filter gradient.
+  if (tidy < context) {
+    sh_dw[tidy][tidx] = 0.0;
+  }
+  __syncthreads();
+  // Walk each sequence in tiles of BLOCK_H time steps (zero padded).
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    const int size = ((steps + BLOCK_H - 1)/BLOCK_H) * BLOCK_H;
+    for (int j = tidy; j < size; j += BLOCK_H) {
+      int xoff = gidx + tidx;
+      int yoff = start + j;
+
+      // transpose: [column][time step], zero outside the sequence or width
+      sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
+      sh_dy[tidx][tidy + context - 1] = (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0;
+      __syncthreads();
+      if (tidy < (context - 1)) {
+        yoff = yoff - context + 1;
+        sh_dy[tidx][tidy] = (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0;
+      }
+      __syncthreads();
+      // For each filter row t: multiply, reduce with shuffles, accumulate.
+      for (int t = 0; t < context; t++) {
+        real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx + context - 1 - t];
+        __syncthreads();
+        // warp size and blockDim.x is 32.
+        val += __shfl_down(val, 16);
+        val += __shfl_down(val, 8);
+        val += __shfl_down(val, 4);
+        val += __shfl_down(val, 2);
+        val += __shfl_down(val, 1);
+        __syncthreads();
+        if (tidx == 0) {
+          sh_dw[t][tidy] += val;
+        }
+        __syncthreads();
+      }
+    }
+  }
+  // Add the block-local sums into dw for the columns inside width.
+  for (int t = tidy; (t < context) && ((gidx + tidx) < width); t += blky) {
+    dw[t * width + gidx + tidx] += sh_dw[t][tidx];
+  }
+}
+
+template<int BLOCK_H, int BLOCK_W>
+__global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
+    const int* starts, const int height, const int width, const int numSeq,
+    const int context) {
+  // Filter gradient without a shared accumulator: sums go straight into dw.
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int gidx = blockIdx.x * blockDim.x;
+  // Tiles are written as [threadIdx.x][threadIdx.y] and read back transposed.
+  __shared__ real sh_x[BLOCK_H][BLOCK_W];
+  __shared__ real sh_dy[BLOCK_H][BLOCK_W];
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+
+    const int size = ((steps + BLOCK_H - 1)/BLOCK_H) * BLOCK_H;
+    for (int j = tidy; j < size; j += BLOCK_H) {
+      int xoff = gidx + tidx;
+      int yoff = start + j;
+
+      // transpose
+      sh_x[tidx][tidy] = (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
+      __syncthreads();
+      // Reload dy shifted back by t rows for every filter row t.
+      for (int t = 0; t < context; t++) {
+        sh_dy[tidx][tidy] = (xoff < width && (yoff - t) >= start && yoff - t < end) ? dy[(yoff - t) * width + xoff] : 0.0;
+        __syncthreads();
+
+        real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx];
+        __syncthreads();
+        // warp size and blockDim.x is 32.
+        val += __shfl_down(val, 16);
+        val += __shfl_down(val, 8);
+        val += __shfl_down(val, 4);
+        val += __shfl_down(val, 2);
+        val += __shfl_down(val, 1);
+        __syncthreads();
+        // tidx == 0 adds the reduced sum for column gidx + tidy.
+        if (tidx == 0 && (gidx + tidy) < width) {
+          dw[t*width + gidx + tidy] += val;
+        }
+      }
+    }
+  }
+}
+
+template<int BLOCK_H, int BLOCK_W>
+__global__ void KeRowConvBwData(real* dx, const real* w, const real* dy,
+    const int* starts, const int height, const int width, const int numSeq,
+    const int context) {
+  // dx[r] += sum over t < context of w[t] * dy[r - t], within each sequence.
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+  // sw[filter row][tidx]: requires BLOCK_H >= context, BLOCK_W >= blockDim.x.
+  __shared__ real sw[BLOCK_H][BLOCK_W];
+
+  for (int i = tidy; i < context; i += blky) {
+    sw[i][tidx] = gidx + tidx < width ? w[i*width + gidx + tidx] : 0.0;
+  }
+  // Barrier: the filter tile is complete before it is read.
+  __syncthreads();
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      real sum = 0;
+      int off = (start + j) * width;
+      for (int t = 0; t < context && (j - t) >= 0; ++t) {
+        int dyOff = off - t * width;
+        real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0;
+        sum += sw[t][tidx] * dyVal;
+      }
+      if (gidx + tidx < width) {
+        dx[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+
+__global__ void KeRowConvBwData2(real* dx, const real* w, const real* dy,
+    const int* starts, const int height, const int width, const int numSeq,
+    const int context) {
+  // Input gradient reading the filter directly through w (no shared tile).
+  const int tidx = threadIdx.x;
+  const int tidy = threadIdx.y;
+  const int blky = blockDim.y;
+  const int gidx = blockIdx.x * blockDim.x;
+
+  for (int i = 0; i < numSeq; ++i) {
+    const int start = starts[i];
+    const int end = starts[i + 1];
+    const int steps = end - start;
+    for (int j = tidy; j < steps; j += blky) {
+      real sum = 0;
+      int off = (start + j) * width;
+      for (int t = 0; t < context && (j - t) >= 0; ++t) {
+        int dyOff = off - t * width;
+        real dyVal = gidx + tidx < width ? dy[dyOff + gidx + tidx] : 0.0;
+        real wVal = gidx + tidx < width ? w[t * width + gidx + tidx] : 0.0;
+        sum += wVal * dyVal;
+      }
+      if (gidx + tidx < width) {
+        dx[off + gidx + tidx] += sum;
+      }
+    }
+  }
+}
+
+
+template <>
+void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
+                              const GpuMatrix& in,
+                              const GpuMatrix& filter,
+                              GpuMatrix& inG,
+                              GpuMatrix& filterG,
+                              const GpuIVector& seq) {
+  const size_t numSeq = seq.getSize() - 1;
+  const size_t contextLength = filter.getHeight();
+  const size_t height = in.getHeight();
+  const size_t width = in.getWidth();
+  // GPU backward pass; a gradient is skipped when its matrix tests false.
+  const real* dy = outG.getData();
+  const real* x = in.getData();
+  const real* w = filter.getData();
+  const int* starts = seq.getData();
+  // Filter gradient: the shared-memory kernel fits up to 32 filter rows.
+  if (filterG) {
+    dim3 dimBlock(32, 32);
+    dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
+    real* dw = filterG.getData();
+    if (contextLength <= 32) {
+      KeRowConvBwWeight<32, 32, 32>
+        <<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
+        (dw, x, dy, starts, height, width, numSeq, contextLength);
+    } else {
+      KeRowConvBwWeight2<32, 32>
+        <<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
+        (dw, x, dy, starts, height, width, numSeq, contextLength);
+    }
+  }
+  // Input gradient: <64, 32> gives sw 64 filter rows x 32 thread columns.
+  if (inG) {
+    real* dx = inG.getData();
+    dim3 dimBlock2(32, 32);
+    dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1);
+    if (contextLength <= 64) {
+      KeRowConvBwData<64, 32>
+        <<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>
+        (dx, w, dy, starts, height, width, numSeq, contextLength);
+    } else {
+      KeRowConvBwData2
+        <<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>
+        (dx, w, dy, starts, height, width, numSeq, contextLength);
+    }
+  }
+
+  CHECK_SYNC("RowConvGrad");
+}
+
+}  // namespace paddle
diff --git a/paddle/function/RowConvOpTest.cpp b/paddle/function/RowConvOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1c95d3ff2cccbf33f4c5f91f6daf340871a8f7b0
--- /dev/null
+++ b/paddle/function/RowConvOpTest.cpp
@@ -0,0 +1,62 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+
+namespace paddle {
+
+void testRowConvFw(size_t batchSize, size_t dim, size_t contextLength) {
+  FunctionCompare test("RowConv", FuncConfig());
+  // Inputs: a batchSize x dim sequence and a contextLength x dim filter.
+  test.addSequence(SequenceIdArg(TensorShape{batchSize}));
+  test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}));
+  // Output: a batchSize x dim sequence registered with ADD_TO.
+  test.addOutputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}),
+                  ADD_TO);
+
+  test.run();
+}
+
+void testRowConvBw(size_t batchSize, size_t dim, size_t contextLength) {
+  FunctionCompare test("RowConvGrad", FuncConfig());
+  // Inputs, in the order RowConvGrad reads them: output grad, input, filter.
+  test.addSequence(SequenceIdArg(TensorShape{batchSize}));
+  test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
+  test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}));
+  // Outputs: input gradient and filter gradient, both ADD_TO.
+  test.addOutputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}),
+                  ADD_TO);
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{contextLength, dim}),
+                  ADD_TO);
+
+  test.run();
+}
+
+TEST(RowConv, real) {
+  for (size_t numSamples : {17, 129, 2020}) {
+    for (size_t dim : {16, 512, 2560}) {
+      for (size_t context : {3, 19, 65}) {  // NOTE(review): 33..64 untested
+        VLOG(3) << " numSamples=" << numSamples << " dim=" << dim
+                << " context length=" << context;
+        testRowConvFw(numSamples, dim, context);
+        testRowConvBw(numSamples, dim, context);
+      }
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/RowConvLayer.cpp b/paddle/gserver/layers/RowConvLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..54d77999ad5b30a8d9f4feaa02d81417957544a7
--- /dev/null
+++ b/paddle/gserver/layers/RowConvLayer.cpp
@@ -0,0 +1,106 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "RowConvLayer.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(row_conv, RowConvLayer);
+
+bool RowConvLayer::init(const LayerMap& layerMap,
+                        const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  // Validate the input count before config_.inputs(0) is dereferenced.
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  contexLength_ = config_.inputs(0).row_conv_conf().context_length();
+  weight_.reset(new Weight(contexLength_, getSize(), parameters_[0]));
+  createFunction(forward_, "RowConv", FuncConfig());
+  createFunction(backward_, "RowConvGrad", FuncConfig());
+
+  return true;
+}
+
+void RowConvLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  MatrixPtr input = getInputValue(0);
+  size_t height = input->getHeight();
+  size_t width = input->getWidth();
+  CHECK_EQ(width, getSize());
+  resetOutput(height, width);
+  CHECK(getInput(0).sequenceStartPositions) << "row_conv needs sequence input.";
+  const auto startPos = getInput(0).sequenceStartPositions->getVector(useGpu_);
+  MatrixPtr w = weight_->getW();
+  wDims_ = TensorShape({w->getHeight(), w->getWidth()});
+  // Reuse the handles fetched above instead of looking them up again.
+  MatrixPtr outV = getOutputValue();
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*input, *startPos);
+  inputs.addArg(*w, wDims_);
+  outputs.addArg(*outV, *startPos, ADD_TO);
+
+  {
+    REGISTER_TIMER_INFO("RowConvForward", getName().c_str());
+    forward_[0]->calc(inputs, outputs);
+  }
+
+  /* activation */ {
+    REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+
+void RowConvLayer::backward(const UpdateCallback& callback) {
+  /* Do derivation */ {
+    REGISTER_TIMER_INFO("BpAvtTimer", getName().c_str());
+    backwardActivation();
+  }
+
+  const auto startPos = getInput(0).sequenceStartPositions->getVector(useGpu_);
+  // Inputs, in order: output gradient, input value, filter weights.
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getOutputGrad(), *startPos);
+  inputs.addArg(*getInputValue(0), *startPos);
+  inputs.addArg(*weight_->getW(), wDims_);
+  // A missing gradient is replaced by a matrix created with a null data pointer.
+  MatrixPtr inGrad = getInputGrad(0);
+  MatrixPtr wGrad = weight_->getWGrad();
+  size_t h = getInputValue(0)->getHeight();
+  size_t w = getInputValue(0)->getWidth();
+  outputs.addArg(
+      inGrad ? (*inGrad) : *(Matrix::create(nullptr, h, w, false, useGpu_)),
+      *startPos,
+      ADD_TO);
+  outputs.addArg(
+      wGrad ? (*wGrad)
+            : *(Matrix::create(nullptr, contexLength_, w, false, useGpu_)),
+      wDims_,
+      ADD_TO);
+
+  {
+    REGISTER_TIMER_INFO("RowConvBackward", getName().c_str());
+    backward_[0]->calc(inputs, outputs);
+  }
+
+  {
+    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
+    weight_->getParameterPtr()->incUpdate(callback);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/RowConvLayer.h b/paddle/gserver/layers/RowConvLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..b3bdda2f3542b312eda531695c975ff2dfc29c93
--- /dev/null
+++ b/paddle/gserver/layers/RowConvLayer.h
@@ -0,0 +1,44 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+
+namespace paddle {
+
+/**
+ * \brief Row Convolution Layer.
+ */
+class RowConvLayer : public Layer {
+public:
+  explicit RowConvLayer(const LayerConfig& config) : Layer(config) {}
+
+  ~RowConvLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+
+protected:
+  // Row convolution weight, contexLength_ x fan_out,
+  // where fan_out is the size of the output feature.
+  std::unique_ptr<Weight> weight_;
+
+  // The step number to look ahead plus one equals contexLength_.
+  size_t contexLength_;
+  TensorShape wDims_;
+};
+}  // namespace paddle
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index e1e8e7fae7ca4c96206d60703db1f35aa1196875..6adffcf53b7966bd6f3d02970e5f07cc9802f469 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1705,6 +1705,26 @@ TEST(Layer, TransLayer) {
   }
 }
 
+TEST(Layer, RowConvLayer) {
+  const int context = 3;
+  const int size = 512;
+  // Configure a row_conv layer of width `size` with sigmoid activation.
+  TestConfig config;
+  config.layerConfig.set_type("row_conv");
+  config.layerConfig.set_size(size);
+  config.layerConfig.set_active_type("sigmoid");
+  // One sequence-data input named layer_0, carrying the row_conv config.
+  config.inputDefs.push_back(
+      {INPUT_SEQUENCE_DATA, "layer_0", size, context * size});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  RowConvConfig* conv = input->mutable_row_conv_conf();
+  conv->set_context_length(context);
+  // Run testLayerGrad once with useGpu false and once with useGpu true.
+  for (auto useGpu : {false, true}) {
+    testLayerGrad(config, "row_conv", 100, false, useGpu, false);
+  }
+}
+
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
   initMain(argc, argv);
diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp
index 6d9365af2d14673146d9e427138bf6dd5f5b41b6..5beced3bb5a1050078f88dfd4350a2df71d27f35 100644
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -632,7 +632,7 @@ void Argument::printValueString(std::ostream& stream,
                                 const std::string& prefix) const {
   std::unordered_map<std::string, std::string> out;
   getValueString(&out);
-  for (auto field : {"value", "id", "sequence pos", "sub-sequence pos"}) {
+  for (auto field : {"value", "ids", "sequence pos", "sub-sequence pos"}) {
     auto it = out.find(field);
     if (it != out.end()) {
       stream << prefix << field << ":\n" << it->second;
diff --git a/paddle/parameter/tests/test_argument.cpp b/paddle/parameter/tests/test_argument.cpp
index 81fe4ee397351a013c8616ad08fb8cb4b8dae4d0..98ab013548734059060eb06ce1a7cec23dbf1b72 100644
--- a/paddle/parameter/tests/test_argument.cpp
+++ b/paddle/parameter/tests/test_argument.cpp
@@ -42,7 +42,7 @@ TEST(Argument, poolSequenceWithStride) {
     CHECK_EQ(outStart[3], 4);
     CHECK_EQ(outStart[4], 7);
 
-    CHECK_EQ(stridePositions->getSize(), 8);
+    CHECK_EQ(stridePositions->getSize(), 8UL);
     auto result = reversed ? strideResultReversed : strideResult;
     for (int i = 0; i < 8; i++) {
       CHECK_EQ(stridePositions->getData()[i], result[i]);
diff --git a/paddle/pserver/LightNetwork.cpp b/paddle/pserver/LightNetwork.cpp
index 8c8ba0a2e51b85bde0544c6780b07130336a6bdd..922f25734dee0a6db7fbcfcef3d29d2bad5b7858 100644
--- a/paddle/pserver/LightNetwork.cpp
+++ b/paddle/pserver/LightNetwork.cpp
@@ -383,20 +383,23 @@ void SocketClient::TcpClient(const std::string &serverAddr, int serverPort) {
   setOption(sockfd);
 
   /// Now connect to the server
-  int retry_second = 0;
-  int error = 0;
+  int retry_count = 0;
   do {
-    error = connect(sockfd, (sockaddr *)&serv_addr, sizeof(serv_addr));
-    if (error == ECONNREFUSED) {
+    if (connect(sockfd, (sockaddr *)&serv_addr, sizeof(serv_addr)) == 0) {
+      break;
+    }
+
+    if (errno == ECONNREFUSED) {
       LOG(WARNING) << "connection refused by pserver, try again!";
-      if (retry_second++ >= 7) {
+      if (retry_count++ >= 7) {
         LOG(FATAL) << "connection refused by pserver, maybe pserver failed!";
       }
       std::this_thread::sleep_for(std::chrono::seconds(1));
     } else {
-      PCHECK(error >= 0) << "ERROR connecting to " << serverAddr;
+      // connect() failed for a reason other than ECONNREFUSED: always fatal.
+      PLOG(FATAL) << "ERROR connecting to " << serverAddr << ":" << serverPort;
     }
-  } while (error == ECONNREFUSED);
+  } while (errno == ECONNREFUSED);
 
   channel_.reset(new SocketChannel(sockfd, serverAddr));
   tcpRdma_ = F_TCP;
diff --git a/paddle/strings/CMakeLists.txt b/paddle/strings/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4e55eecd484c0e218ecd51bbd19b3eb4f6f92a25
--- /dev/null
+++ b/paddle/strings/CMakeLists.txt
@@ -0,0 +1,2 @@
+cc_library(stringpiece SRCS stringpiece.cc)
+cc_test(stringpiece_test SRCS stringpiece_test.cc DEPS stringpiece glog gflags)
diff --git a/paddle/strings/stringpiece.cc b/paddle/strings/stringpiece.cc
new file mode 100644
index 0000000000000000000000000000000000000000..415b3558d5dfffde26275bcb16ea3922424ca9f3
--- /dev/null
+++ b/paddle/strings/stringpiece.cc
@@ -0,0 +1,141 @@
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
+#include "paddle/strings/stringpiece.h"
+
+#include <string.h>
+
+#include <algorithm>
+#include <iosfwd>
+#include <stdexcept>
+
+namespace paddle {
+
+StringPiece::StringPiece() : data_(NULL), size_(0) {}  // the empty piece
+
+StringPiece::StringPiece(const char* d, size_t n) : data_(d), size_(n) {
+  static const char kMsg[] = "StringPiece requires len to be 0 for NULL data";
+  // A NULL pointer is only acceptable for the empty piece.
+  if (n != 0 && d == NULL) throw std::invalid_argument(kMsg);
+}
+
+// Wraps a C string; a NULL pointer produces the empty piece.
+StringPiece::StringPiece(const char* s)
+    : data_(s), size_(s != NULL ? strlen(s) : 0) {}
+
+StringPiece::StringPiece(const std::string& s)
+    : data_(s.data()), size_(s.size()) {}  // points at s's buffer, no copy
+
+char StringPiece::operator[](size_t n) const {
+  // Bounds-checked element access; valid indices are 0 .. len() - 1.
+  if (n < len()) return data_[n];
+  throw std::invalid_argument("index out of StringPiece length");
+}
+
+int Compare(StringPiece a, StringPiece b) {
+  // Order by the common prefix first; on a tie the shorter piece sorts first.
+  const size_t common = std::min(a.len(), b.len());
+  const int diff = memcmp(a.data(), b.data(), common);
+  if (diff != 0) {
+    return diff;
+  }
+  // The common prefix is identical: decide by length alone.
+  if (a.len() == b.len()) return 0;
+  return a.len() < b.len() ? -1 : 1;
+}
+
+bool operator==(StringPiece x, StringPiece y) {
+  if (x.len() != y.len()) return false;  // different lengths never match
+  return x.data() == y.data() || memcmp(x.data(), y.data(), x.len()) == 0;
+}
+
+bool operator!=(StringPiece x, StringPiece y) { return !(x == y); }
+// Ordering operators: each one tests the sign of Compare(x, y).
+bool operator<(StringPiece x, StringPiece y) { return Compare(x, y) < 0; }
+bool operator>(StringPiece x, StringPiece y) { return Compare(x, y) > 0; }
+
+bool operator<=(StringPiece x, StringPiece y) { return Compare(x, y) <= 0; }
+bool operator>=(StringPiece x, StringPiece y) { return Compare(x, y) >= 0; }
+
+bool HasPrefix(StringPiece s, StringPiece x) {  // does s start with x?
+  return x.len() <= s.len() && memcmp(s.data(), x.data(), x.len()) == 0;
+}
+
+bool HasSuffix(StringPiece s, StringPiece x) {
+  if (s.len() < x.len()) return false;  // x cannot fit inside s
+  return memcmp(s.data() + (s.len() - x.len()), x.data(), x.len()) == 0;
+}
+
+StringPiece SkipPrefix(StringPiece s, size_t n) {
+  const char* too_far = "Skip distance larger than StringPiece length";
+  if (s.len() < n) throw std::invalid_argument(too_far);  // n exceeds s
+  return StringPiece(s.data() + n, s.len() - n);  // drop the first n chars
+}
+
+StringPiece SkipSuffix(StringPiece s, size_t n) {
+  const char* too_far = "Skip distance larger than StringPiece length";
+  if (s.len() < n) throw std::invalid_argument(too_far);  // n exceeds s
+  return StringPiece(s.data(), s.len() - n);  // drop the last n chars
+}
+
+StringPiece TrimPrefix(StringPiece s, StringPiece x) {
+  return HasPrefix(s, x) ? SkipPrefix(s, x.len()) : s;  // s as is if no match
+}
+
+StringPiece TrimSuffix(StringPiece s, StringPiece x) {
+  return HasSuffix(s, x) ? SkipSuffix(s, x.len()) : s;  // s as is if no match
+}
+
+bool Contains(StringPiece s, StringPiece sub) {  // is sub found inside s?
+  return s.end() != std::search(s.begin(), s.end(), sub.begin(), sub.end());
+}
+
+size_t Index(StringPiece s, StringPiece sub) {
+  const char* hit = std::search(s.begin(), s.end(), sub.begin(), sub.end());
+  return hit == s.end() ? StringPiece::npos : hit - s.data();
+}
+
+size_t Find(StringPiece s, char c, size_t pos) {
+  // Nothing to scan when pos is at or past the end.
+  if (pos >= s.len()) return StringPiece::npos;
+  const void* hit = memchr(s.data() + pos, c, s.len() - pos);
+  if (hit == nullptr) return StringPiece::npos;
+  // Report the offset from the start of s, not from pos.
+  return static_cast<const char*>(hit) - s.data();
+}
+
+size_t RFind(StringPiece s, char c, size_t pos) {
+  if (s.len() == 0) return StringPiece::npos;
+  // Index-based scan: a pointer loop would step below s.data(), which is UB.
+  for (size_t i = std::min(pos, s.len() - 1) + 1; i > 0; i--) {
+    if (s.data()[i - 1] == c) {
+      return i - 1;
+    }
+  }
+  return StringPiece::npos;
+}
+
+StringPiece SubStr(StringPiece s, size_t pos, size_t n) {
+  const size_t start = std::min(pos, s.len());  // clamp the start into s
+  const size_t count = std::min(n, s.len() - start);  // and the length too
+  return StringPiece(s.data() + start, count);
+}
+
+std::ostream& operator<<(std::ostream& o, StringPiece piece) {
+  return o << piece.ToString();  // streams the ToString() result
+}
+
+}  // namespace paddle
diff --git a/paddle/strings/stringpiece.h b/paddle/strings/stringpiece.h
new file mode 100644
index 0000000000000000000000000000000000000000..adff713e86f49349b8f189c1d24584bfc1bb8aa7
--- /dev/null
+++ b/paddle/strings/stringpiece.h
@@ -0,0 +1,105 @@
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
+#pragma once
+
+#include <ostream>
+#include <string>
+
+namespace paddle {
+
+// StringPiece points into a std::string object but doesn't own the
+// string.  It is for efficient access to strings, like Go's string
+// type.  Note that StringPiece doesn't mutate the underlying string,
+// so it is thread-safe given that the underlying string doesn't
+// change.  Because StringPiece has only a few data members, and
+// its semantics are simple as it doesn't own/manage the string, it
+// is cheap to construct StringPieces and pass them around.
+class StringPiece {
+public:
+  static const size_t npos = static_cast<size_t>(-1);
+
+  // We provide non-explicit single-argument constructors so users can
+  // pass in a "const char*" or a "string" wherever a "StringPiece"
+  // is expected.  These constructors ensure that if data_ is NULL,
+  // size_ is 0.
+  StringPiece();
+  StringPiece(const char* d, size_t n);
+  StringPiece(const char* d);
+  StringPiece(const std::string& s);
+
+  const char* data() const { return data_; }
+  size_t len() const { return size_; }
+
+  char operator[](size_t n) const;
+
+  // StringPiece doesn't own the string, so both iterator and
+  // const_iterator are plain const char*.
+  typedef const char* const_iterator;
+  typedef const char* iterator;
+  iterator begin() const { return data_; }
+  iterator end() const { return data_ + size_; }
+
+  // Return a string that contains a copy of the referenced data.
+  std::string ToString() const { return std::string(data_, size_); }
+
+private:
+  const char* data_;
+  size_t size_;
+
+  // Intentionally copyable
+};
+
+int Compare(StringPiece a, StringPiece b);
+
+bool operator==(StringPiece x, StringPiece y);
+bool operator!=(StringPiece x, StringPiece y);
+bool operator<(StringPiece x, StringPiece y);
+bool operator>(StringPiece x, StringPiece y);
+bool operator<=(StringPiece x, StringPiece y);
+bool operator>=(StringPiece x, StringPiece y);
+
+bool HasPrefix(StringPiece s, StringPiece prefix);
+bool HasSuffix(StringPiece s, StringPiece suffix);
+
+StringPiece SkipPrefix(StringPiece s, size_t n);
+StringPiece SkipSuffix(StringPiece s, size_t n);
+
+// Skip the prefix (or suffix) if it matches with the string.
+StringPiece TrimPrefix(StringPiece s, StringPiece prefix);
+StringPiece TrimSuffix(StringPiece s, StringPiece suffix);
+
+// Returns whether s contains sub.  Any s except for empty s contains
+// an empty sub.
+bool Contains(StringPiece s, StringPiece sub);
+
+// Return the first occurrence of sub in s, or npos.  If both s and
+// sub are empty, it returns npos; otherwise, if only sub is empty, it
+// returns 0.
+size_t Index(StringPiece s, StringPiece sub);
+
+// Return the first occurrence of c in s[pos:end], or npos.
+size_t Find(StringPiece s, char c, size_t pos);
+
+// Search range is [0..pos] inclusive.  If pos == npos, search everything.
+size_t RFind(StringPiece s, char c, size_t pos);
+
+StringPiece SubStr(StringPiece s, size_t pos, size_t n);
+
+// allow StringPiece to be logged
+std::ostream& operator<<(std::ostream& o, StringPiece piece);
+
+}  // namespace paddle
diff --git a/paddle/strings/stringpiece_test.cc b/paddle/strings/stringpiece_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..2ba66a04f641c3457efa713383484491a213668f
--- /dev/null
+++ b/paddle/strings/stringpiece_test.cc
@@ -0,0 +1,293 @@
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
+#include "paddle/strings/stringpiece.h"
+
+#include <sstream>
+
+#include "gtest/gtest.h"
+
+TEST(StringPiece, Construct) {
+  {
+    paddle::StringPiece s;
+    EXPECT_EQ(NULL, s.data());
+    EXPECT_EQ(0U, s.len());
+  }
+  { EXPECT_THROW(paddle::StringPiece s(NULL, 10000U), std::invalid_argument); }
+  {
+    paddle::StringPiece s(NULL);
+    EXPECT_EQ(0U, s.len());
+  }
+  {
+    std::string a;
+    EXPECT_EQ(0U, a.size());
+    paddle::StringPiece s(a);
+    EXPECT_EQ(0U, s.len());
+  }
+}
+
+TEST(StringPiece, CopyAndAssign) {
+  paddle::StringPiece empty;
+  EXPECT_EQ(0U, empty.len());
+
+  paddle::StringPiece a("hello");
+  paddle::StringPiece b = a;
+  EXPECT_EQ(b.len(), strlen("hello"));
+  EXPECT_EQ(a, b);
+
+  std::string storage("hello");
+  paddle::StringPiece c(storage);
+  EXPECT_EQ(a, c);
+  EXPECT_NE(a.data(), c.data());
+}
+
+TEST(StringPiece, Compare) {
+  {
+    paddle::StringPiece a("hello");
+    paddle::StringPiece b("world");
+    EXPECT_TRUE(a != b);
+    EXPECT_FALSE(a == b);
+    EXPECT_TRUE(a < b);
+    EXPECT_TRUE(a <= b);
+    EXPECT_FALSE(a > b);
+    EXPECT_FALSE(a >= b);
+    EXPECT_LT(Compare(a, b), 0);
+    EXPECT_GT(Compare(b, a), 0);
+  }
+  {
+    paddle::StringPiece a, b;
+    EXPECT_TRUE(a == b);
+    EXPECT_FALSE(a != b);
+    EXPECT_FALSE(a < b);
+    EXPECT_FALSE(a > b);
+    EXPECT_TRUE(a <= b);
+    EXPECT_TRUE(a >= b);
+    EXPECT_EQ(0, Compare(a, b));
+    EXPECT_EQ(0, Compare(b, a));
+  }
+}
+
+TEST(StringPiece, ToString) {
+  {
+    paddle::StringPiece s;
+    EXPECT_EQ(std::string(""), s.ToString());
+  }
+  {
+    paddle::StringPiece s(NULL);
+    EXPECT_EQ(std::string(""), s.ToString());
+  }
+  {
+    paddle::StringPiece s("hello");
+    EXPECT_EQ(std::string("hello"), s.ToString());
+  }
+}
+
+TEST(StringPiece, HasPrefixSuffix) {
+  using paddle::HasPrefix;
+  using paddle::HasSuffix;
+  {
+    paddle::StringPiece s;
+    EXPECT_FALSE(HasPrefix(s, "something"));
+    EXPECT_TRUE(HasPrefix(s, ""));
+    EXPECT_FALSE(HasSuffix(s, "something"));
+    EXPECT_TRUE(HasSuffix(s, ""));
+  }
+  {
+    paddle::StringPiece s("app");
+    EXPECT_TRUE(HasPrefix(s, ""));
+    EXPECT_TRUE(HasPrefix(s, "a"));
+    EXPECT_TRUE(HasPrefix(s, "ap"));
+    EXPECT_TRUE(HasPrefix(s, "app"));
+
+    EXPECT_TRUE(HasSuffix(s, ""));
+    EXPECT_TRUE(HasSuffix(s, "p"));
+    EXPECT_TRUE(HasSuffix(s, "pp"));
+    EXPECT_TRUE(HasSuffix(s, "app"));
+  }
+}
+
+TEST(StringPiece, SkipPrefixSuffix) {
+  using paddle::SkipPrefix;
+  using paddle::SkipSuffix;
+  {
+    paddle::StringPiece s;
+    EXPECT_EQ("", SkipPrefix(s, 0));
+    EXPECT_THROW(SkipPrefix(s, 1), std::invalid_argument);
+
+    EXPECT_EQ("", SkipSuffix(s, 0));
+    EXPECT_THROW(SkipSuffix(s, 1), std::invalid_argument);
+  }
+  {
+    paddle::StringPiece s("app");
+    EXPECT_EQ("app", SkipPrefix(s, 0));
+    EXPECT_EQ("pp", SkipPrefix(s, 1));
+    EXPECT_EQ("p", SkipPrefix(s, 2));
+    EXPECT_EQ("", SkipPrefix(s, 3));
+    EXPECT_THROW(SkipPrefix(s, 4), std::invalid_argument);
+
+    EXPECT_EQ("app", SkipSuffix(s, 0));
+    EXPECT_EQ("ap", SkipSuffix(s, 1));
+    EXPECT_EQ("a", SkipSuffix(s, 2));
+    EXPECT_EQ("", SkipSuffix(s, 3));
+    EXPECT_THROW(SkipSuffix(s, 4), std::invalid_argument);
+  }
+}
+
+TEST(StringPiece, TrimPrefixSuffix) {
+  using paddle::TrimPrefix;
+  using paddle::TrimSuffix;
+  {
+    paddle::StringPiece s;
+    EXPECT_EQ("", TrimPrefix(s, ""));
+    EXPECT_EQ("", TrimPrefix(s, "something"));
+
+    EXPECT_EQ("", TrimSuffix(s, ""));
+    EXPECT_EQ("", TrimSuffix(s, "something"));
+  }
+  {
+    paddle::StringPiece s("app");
+    EXPECT_EQ("app", TrimPrefix(s, ""));
+    EXPECT_EQ("pp", TrimPrefix(s, "a"));
+    EXPECT_EQ("p", TrimPrefix(s, "ap"));
+    EXPECT_EQ("", TrimPrefix(s, "app"));
+    EXPECT_EQ("app", TrimPrefix(s, "something"));
+
+    EXPECT_EQ("app", TrimSuffix(s, ""));
+    EXPECT_EQ("ap", TrimSuffix(s, "p"));
+    EXPECT_EQ("a", TrimSuffix(s, "pp"));
+    EXPECT_EQ("", TrimSuffix(s, "app"));
+    EXPECT_EQ("app", TrimSuffix(s, "something"));
+  }
+}
+
+TEST(StringPiece, Contains) {
+  using paddle::Contains;
+  {
+    paddle::StringPiece s;
+    EXPECT_FALSE(Contains(s, ""));
+    EXPECT_FALSE(Contains(s, "something"));
+  }
+  {
+    paddle::StringPiece s("app");
+    EXPECT_TRUE(Contains(s, ""));
+    EXPECT_TRUE(Contains(s, "a"));
+    EXPECT_TRUE(Contains(s, "p"));
+    EXPECT_TRUE(Contains(s, "ap"));
+    EXPECT_TRUE(Contains(s, "pp"));
+    EXPECT_TRUE(Contains(s, "app"));
+    EXPECT_FALSE(Contains(s, "something"));
+  }
+}
+
+TEST(StringPiece, Index) {
+  using paddle::Index;
+  auto npos = paddle::StringPiece::npos;
+  {
+    paddle::StringPiece s;
+    EXPECT_EQ(npos, Index(s, ""));
+    EXPECT_EQ(npos, Index(s, "something"));
+  }
+  {
+    paddle::StringPiece s("app");
+    EXPECT_EQ(0U, Index(s, ""));
+    EXPECT_EQ(0U, Index(s, "a"));
+    EXPECT_EQ(1U, Index(s, "p"));
+    EXPECT_EQ(0U, Index(s, "ap"));
+    EXPECT_EQ(1U, Index(s, "pp"));
+    EXPECT_EQ(0U, Index(s, "app"));
+    EXPECT_EQ(npos, Index(s, "something"));
+  }
+}
+
+TEST(StringPiece, Find) {
+  using paddle::Find;
+  auto npos = paddle::StringPiece::npos;
+  {
+    paddle::StringPiece s;
+    EXPECT_EQ(npos, Find(s, 'a', 0U));
+  }
+  {
+    paddle::StringPiece s("app");
+    EXPECT_EQ(0U, Find(s, 'a', 0U));
+    EXPECT_EQ(1U, Find(s, 'p', 0U));
+    EXPECT_EQ(1U, Find(s, 'p', 1U));
+    EXPECT_EQ(2U, Find(s, 'p', 2U));
+    EXPECT_EQ(npos, Find(s, 'z', 2U));
+  }
+}
+
+TEST(StringPiece, RFind) {
+  using paddle::RFind;
+  auto npos = paddle::StringPiece::npos;
+  {
+    paddle::StringPiece s;
+    EXPECT_EQ(npos, RFind(s, 'a', 0U));
+  }
+  {
+    paddle::StringPiece s("app");
+    EXPECT_EQ(2U, RFind(s, 'p', 2U));
+    EXPECT_EQ(0U, RFind(s, 'a', 2U));
+    EXPECT_EQ(1U, RFind(s, 'p', 1U));
+    EXPECT_EQ(0U, RFind(s, 'a', 0));
+    EXPECT_EQ(npos, RFind(s, 'z', 2U));
+  }
+}
+
+TEST(StringPiece, SubStr) {
+  using paddle::SubStr;
+  {
+    paddle::StringPiece s;
+    EXPECT_EQ("", SubStr(s, 0, 0));
+    EXPECT_EQ("", SubStr(s, 0, 1));
+    EXPECT_EQ("", SubStr(s, 1, 0));
+  }
+  {
+    paddle::StringPiece s("app");
+    EXPECT_EQ("", SubStr(s, 0, 0));
+    EXPECT_EQ("", SubStr(s, 1, 0));
+    EXPECT_EQ("", SubStr(s, 2, 0));
+    EXPECT_EQ("", SubStr(s, 3, 0));
+
+    EXPECT_EQ("a", SubStr(s, 0, 1));
+    EXPECT_EQ("p", SubStr(s, 1, 1));
+    EXPECT_EQ("p", SubStr(s, 2, 1));
+    EXPECT_EQ("", SubStr(s, 3, 1));
+
+    EXPECT_EQ("ap", SubStr(s, 0, 2));
+    EXPECT_EQ("pp", SubStr(s, 1, 2));
+    EXPECT_EQ("p", SubStr(s, 2, 2));
+    EXPECT_EQ("", SubStr(s, 3, 2));
+
+    EXPECT_EQ("app", SubStr(s, 0, 3));
+    EXPECT_EQ("pp", SubStr(s, 1, 3));
+    EXPECT_EQ("p", SubStr(s, 2, 3));
+    EXPECT_EQ("", SubStr(s, 3, 3));
+  }
+}
+
+TEST(StringPiece, StreamOutput) {
+  using paddle::StringPiece;
+
+  std::stringstream o;
+  o << StringPiece();
+  EXPECT_EQ("", o.str());
+
+  o << StringPiece("hello");
+  EXPECT_EQ("hello", o.str());
+
+  o << StringPiece();
+  EXPECT_EQ("hello", o.str());
+}
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index 4f9b53d6f6553e55406dd000029a598a92fd2fb6..29270829bbc3af6990aaf03a5228ef7f6a892a5c 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -194,6 +194,10 @@ message MaxOutConfig {
   required uint32 groups = 2;
 }
 
+message RowConvConfig {
+  required uint32 context_length = 1;
+}
+
 message ProjectionConfig {
   required string type = 1;
   required string name = 2;
@@ -279,6 +283,7 @@ message LayerInputConfig {
   optional SppConfig spp_conf = 12;
   optional PriorBoxConfig priorbox_conf = 13;
   optional PadConfig pad_conf = 14;
+  optional RowConvConfig row_conv_conf = 15;
 }
 
 message LayerConfig {
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 5d540664a7f56b4fc27ecd5dc46bf36b0268eb98..0792e2d40b43f5fb2de8d6bb43a62cfa23f77082 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -73,7 +73,6 @@ To use this from paddle_trainer, paddle_trainer should be called with
 --config_args=extension_module_name=[MODULE_NAME]
 
 '''
-
 import copy
 import logging
 import os
@@ -1731,9 +1730,10 @@ class ParameterReluLayer(LayerBase):
     def __init__(self, name, inputs, partial_sum=1, **args):
         super(ParameterReluLayer, self).__init__(
             name, self.layer_type, 0, inputs=inputs, **args)
-        config_assert(len(self.inputs) == 1)
-        config_assert(self.input_layer.size % partial_sum == 0)
         input_layer = self.get_input_layer(0)
+        config_assert(len(self.inputs) == 1, "prelu layer has only one input.")
+        config_assert(input_layer.size % partial_sum == 0,
+                      "a wrong setting for partial_sum")
         self.set_layer_size(input_layer.size)
         self.create_input_parameter(0, input_layer.size / partial_sum)
 
@@ -2081,6 +2081,23 @@ class MaxOutLayer(LayerBase):
                            g_layer_map[input_layer.name].width, out_channels)
 
 
+@config_layer('row_conv')
+class RowConvLayer(LayerBase):
+    # Config for row (lookahead) convolution; takes exactly one input.
+    def __init__(self, name, inputs, context_length, **xargs):
+        super(RowConvLayer, self).__init__(
+            name, 'row_conv', 0, inputs=inputs, **xargs)
+        config_assert(len(self.inputs) == 1,
+                      'RowConvLayer must have one and only one input')
+        input_layer = self.get_input_layer(0)
+        row_conv_conf = self.config.inputs[0].row_conv_conf
+        row_conv_conf.context_length = context_length
+        self.set_layer_size(input_layer.size)  # output size == input size
+        # Parameter dims are [context_length, input size].
+        dims = [context_length, input_layer.size]
+        self.create_input_parameter(0, context_length * input_layer.size, dims)
+
+
 # key: cost type
 # value: cost class
 g_cost_map = {}
@@ -3546,11 +3563,7 @@ def update_g_config():
     return g_config
 
 
-def begin_parse(config_arg_str=''):
-    '''
-    @param config_arg_str: a string of the form var1=val1,var2=val2. It will be
-    passed to config script as a dictionary CONFIG_ARGS
-    '''
+def begin_parse():
     init_config_environment()
     for hook in _parse_config_hooks:
         hook()
@@ -3568,8 +3581,12 @@ def begin_parse(config_arg_str=''):
 
 
 def parse_config(trainer_config, config_arg_str):
-    begin_parse(config_arg_str)
+    '''
+    @param config_arg_str: a string of the form var1=val1,var2=val2. It will be
+    passed to config script as a dictionary CONFIG_ARGS
+    '''
 
+    begin_parse()
     config_args = {}
 
     if config_arg_str:
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 5667e5ff2bccd38f2da00a3b17ea8bc8e3a6fb8e..2d8ddbb9007b241eb1986887d8ea6c2de8235c29 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -31,31 +31,31 @@ except ImportError:
 import copy
 
 __all__ = [
-    "full_matrix_projection",
-    "AggregateLevel",
-    "ExpandLevel",
-    "identity_projection",
-    "dotmul_projection",
-    "dotmul_operator",
-    "repeat_layer",
-    "seq_reshape_layer",
-    "table_projection",
-    "mixed_layer",
-    "data_layer",
-    "embedding_layer",
-    "fc_layer",
-    "grumemory",
-    "pooling_layer",
-    "lstmemory",
-    "last_seq",
-    "first_seq",
-    "cos_sim",
-    "hsigmoid",
-    "conv_projection",
-    "mse_cost",
-    "regression_cost",
+    'full_matrix_projection',
+    'AggregateLevel',
+    'ExpandLevel',
+    'identity_projection',
+    'dotmul_projection',
+    'dotmul_operator',
+    'repeat_layer',
+    'seq_reshape_layer',
+    'table_projection',
+    'mixed_layer',
+    'data_layer',
+    'embedding_layer',
+    'fc_layer',
+    'grumemory',
+    'pooling_layer',
+    'lstmemory',
+    'last_seq',
+    'first_seq',
+    'cos_sim',
+    'hsigmoid',
+    'conv_projection',
+    'mse_cost',
+    'regression_cost',
     'classification_cost',
-    "LayerOutput",
+    'LayerOutput',
     'img_conv_layer',
     'img_pool_layer',
     'batch_norm_layer',
@@ -121,6 +121,9 @@ __all__ = [
     'smooth_l1_cost',
     'layer_support',
     'multiplex_layer',
+    'row_conv_layer',
+    'dropout_layer',
+    'prelu_layer',
 ]
 
 
@@ -129,26 +132,26 @@ class LayerType(object):
     Layer type enumerations.
     """
 
-    DATA = "data"
-    MIXED_LAYER = "mixed"
-    LSTMEMORY = "lstmemory"
-    GRUMEMORY = "gated_recurrent"
-    SEQUENCE_LAST_INSTANCE = "seqlastins"
-    SEQUENCE_FIRST_INSTANCE = "seqfirstins"
-    SEQUENCE_RESHAPE = "seqreshape"
-    POOLING_MAX = "max"
+    DATA = 'data'
+    MIXED_LAYER = 'mixed'
+    LSTMEMORY = 'lstmemory'
+    GRUMEMORY = 'gated_recurrent'
+    SEQUENCE_LAST_INSTANCE = 'seqlastins'
+    SEQUENCE_FIRST_INSTANCE = 'seqfirstins'
+    SEQUENCE_RESHAPE = 'seqreshape'
+    POOLING_MAX = 'max'
     POOLING_AVG = 'average'
-    FC_LAYER = "fc"
+    FC_LAYER = 'fc'
     COST = 'cost'
     COSINE_SIM_VEC = 'cos_vm'
     COSINE_SIM = 'cos'
     HSIGMOID = 'hsigmoid'
-    CONV_LAYER = "conv"
-    CONVTRANS_LAYER = "convt"
-    EXCONV_LAYER = "exconv"
-    EXCONVTRANS_LAYER = "exconvt"
-    CUDNNCONV_LAYER = "cudnn_conv"
-    POOL_LAYER = "pool"
+    CONV_LAYER = 'conv'
+    CONVTRANS_LAYER = 'convt'
+    EXCONV_LAYER = 'exconv'
+    EXCONVTRANS_LAYER = 'exconvt'
+    CUDNNCONV_LAYER = 'cudnn_conv'
+    POOL_LAYER = 'pool'
     BATCH_NORM_LAYER = 'batch_norm'
     NORM_LAYER = 'norm'
     SUM_TO_ONE_NORM_LAYER = 'sum_to_one_norm'
@@ -188,25 +191,28 @@ class LayerType(object):
     SPP_LAYER = "spp"
     PAD_LAYER = "pad"
     MULTIPLEX_LAYER = "multiplex"
+    ROW_CONV_LAYER = "row_conv"
 
-    PRINT_LAYER = "print"
-    PRIORBOX_LAYER = "priorbox"
+    PRINT_LAYER = 'print'
+    PRIORBOX_LAYER = 'priorbox'
 
-    CTC_LAYER = "ctc"
-    WARP_CTC_LAYER = "warp_ctc"
-    CRF_LAYER = "crf"
-    CRF_DECODING_LAYER = "crf_decoding"
+    CTC_LAYER = 'ctc'
+    WARP_CTC_LAYER = 'warp_ctc'
+    CRF_LAYER = 'crf'
+    CRF_DECODING_LAYER = 'crf_decoding'
     NCE_LAYER = 'nce'
 
-    RANK_COST = "rank-cost"
-    LAMBDA_COST = "lambda_cost"
-    HUBER = "huber"
-    CROSS_ENTROPY = "multi-class-cross-entropy"
-    CROSS_ENTROPY_WITH_SELFNORM = "multi_class_cross_entropy_with_selfnorm"
-    SOFT_BIN_CLASS_CROSS_ENTROPY = "soft_binary_class_cross_entropy"
-    MULTI_BIN_LABEL_CROSS_ENTROPY = "multi_binary_label_cross_entropy"
-    SUM_COST = "sum_cost"
-    SMOOTH_L1 = "smooth_l1"
+    RANK_COST = 'rank-cost'
+    LAMBDA_COST = 'lambda_cost'
+    HUBER = 'huber'
+    CROSS_ENTROPY = 'multi-class-cross-entropy'
+    CROSS_ENTROPY_WITH_SELFNORM = 'multi_class_cross_entropy_with_selfnorm'
+    SOFT_BIN_CLASS_CROSS_ENTROPY = 'soft_binary_class_cross_entropy'
+    MULTI_BIN_LABEL_CROSS_ENTROPY = 'multi_binary_label_cross_entropy'
+    SUM_COST = 'sum_cost'
+    SMOOTH_L1 = 'smooth_l1'
+
+    PRELU = 'prelu'
 
     @staticmethod
     def is_layer_type(type_name):
@@ -3768,7 +3774,6 @@ def beam_search(step,
     assert generated_input_index != -1
 
     gipt = input[generated_input_index]
-    assert isinstance(gipt, BaseGeneratedInput)
 
     gipt.bos_id = bos_id
     gipt.eos_id = eos_id
@@ -3788,7 +3793,6 @@ def beam_search(step,
         predict = gipt.after_real_step(step(*args))
 
         eos_layer(input=predict, eos_id=eos_id, name=eos_name)
-
         return predict
 
     tmp = recurrent_group(
@@ -3860,7 +3864,6 @@ def classification_cost(input,
                         label,
                         weight=None,
                         name=None,
-                        top_k=None,
                         evaluator=classification_error_evaluator,
                         layer_attr=None):
     """
@@ -3875,8 +3878,6 @@ def classification_cost(input,
     :param weight: The weight affects the cost, namely the scale of cost.
                    It is an optional argument.
     :type weight: LayerOutput
-    :param top_k: number k in top-k error rate
-    :type top_k: int
     :param evaluator: Evaluator method.
     :param layer_attr: layer's extra attribute.
     :type layer_attr: ExtraLayerAttribute
@@ -3904,7 +3905,7 @@ def classification_cost(input,
         assert isinstance(e.for_classification, bool)
         assert e.for_classification
 
-        e(name=e.__name__, input=input, label=label, weight=weight, top_k=top_k)
+        e(name=e.__name__, input=input, label=label, weight=weight)
 
     if not isinstance(evaluator, collections.Sequence):
         evaluator = [evaluator]
@@ -4725,7 +4726,7 @@ def ctc_layer(input,
         fc_layer with softmax activation, should be num_classes + 1. The size of ctc_layer
         should also be num_classes + 1.
 
-    The simple usage:
+    The example usage is:
 
     .. code-block:: python
 
@@ -4812,7 +4813,7 @@ def warp_ctc_layer(input,
         - As a native 'softmax' activation is interated to the warp-ctc library,
           'linear' activation is expected instead in the 'input' layer.
 
-    The simple usage:
+    The example usage is:
 
     .. code-block:: python
 
@@ -4873,7 +4874,7 @@ def crf_layer(input,
     A layer for calculating the cost of sequential conditional random
     field model.
 
-    The simple usage:
+    The example usage is:
 
     .. code-block:: python
 
@@ -4947,7 +4948,7 @@ def crf_decoding_layer(input,
     this layer will also calculate error. output.value[i] is 1 for incorrect
     decoding or 0 for correct decoding.
 
-    The simple usage:
+    The example usage is:
 
     .. code-block:: python
 
@@ -5140,7 +5141,7 @@ def rank_cost(left,
       - :math:`o_i` and :math:`o_j`: the left output and right output.
         Their dimension is one.
 
-    The simple usage:
+    The example usage is:
 
     .. code-block:: python
 
@@ -5197,7 +5198,7 @@ def lambda_cost(input,
     """
     lambdaCost for lambdaRank LTR approach.
 
-    The simple usage:
+    The example usage is:
 
     .. code-block:: python
 
@@ -5255,6 +5256,8 @@ def cross_entropy(input,
     """
     A loss layer for multi class entropy.
 
+    The example usage is:
+
     .. code-block:: python
 
        cost = cross_entropy(input=input_layer,
@@ -5301,6 +5304,8 @@ def cross_entropy_with_selfnorm(input,
     A loss layer for multi class entropy with selfnorm.
     Input should be a vector of positive numbers, without normalization.
 
+    The example usage is:
+
     .. code-block:: python
 
        cost = cross_entropy_with_selfnorm(input=input_layer,
@@ -5342,6 +5347,8 @@ def sum_cost(input, name=None, layer_attr=None):
     """
     A loss layer which calculate the sum of the input as loss
 
+    The example usage is:
+
     .. code-block:: python
 
        cost = sum_cost(input=input_layer)
@@ -5371,6 +5378,8 @@ def huber_cost(input, label, name=None, coeff=1.0, layer_attr=None):
     """
     A loss layer for huber loss.
 
+    The example usage is:
+
     .. code-block:: python
 
        cost = huber_cost(input=input_layer,
@@ -5411,6 +5420,8 @@ def multi_binary_label_cross_entropy(input,
     """
     A loss layer for multi binary label cross entropy.
 
+    The example usage is:
+
     .. code-block:: python
 
        cost = multi_binary_label_cross_entropy(input=input_layer,
@@ -5470,6 +5481,8 @@ def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):
     More details can be found by referring to `Fast R-CNN
     <https://arxiv.org/pdf/1504.08083v2.pdf>`_
 
+    The example usage is:
+
     .. code-block:: python
 
        cost = smooth_l1_cost(input=input_layer,
@@ -5519,6 +5532,8 @@ def multiplex_layer(input, name=None, layer_attr=None):
     where, y is output. :math:`x_{k}` is the k-th input layer and
     :math:`k = x_{0}[i] + 1`.
 
+    The example usage is:
+
     .. code-block:: python
 
        maxid = multiplex_layer(input=layers)
@@ -5551,3 +5566,155 @@ def multiplex_layer(input, name=None, layer_attr=None):
         layer_type=LayerType.MULTIPLEX_LAYER,
         parents=input,
         size=l.config.size)
+
+
+@wrap_name_default("dropout")
+def dropout_layer(input, dropout_rate, name=None):
+    """
+    Apply dropout to input via a bias-free, linearly activated addto_layer.
+
+    :param name: Name passed to addto_layer for the created layer.
+    :param input: The layer (or layers) forwarded to addto_layer as input.
+    :param dropout_rate: Value set as drop_rate in the layer's ExtraAttr.
+    :return: Whatever the addto_layer call returns.
+    """
+    return addto_layer(
+        name=name,
+        input=input,
+        act=LinearActivation(),
+        bias_attr=False,
+        layer_attr=ExtraAttr(drop_rate=dropout_rate))
+
+
+@wrap_name_default()
+@wrap_act_default(act=LinearActivation())
+@wrap_param_attr_default()
+@layer_support(DROPOUT)
+def row_conv_layer(input,
+                   context_len,
+                   act=None,
+                   name=None,
+                   param_attr=None,
+                   layer_attr=None):
+    """
+
+    The row convolution is also called lookahead convolution. It was first
+    introduced in the paper `Deep Speech 2: End-to-End Speech Recognition
+    in English and Mandarin <https://arxiv.org/pdf/1512.02595v1.pdf>`_ .
+
+    A bidirectional RNN learns a representation for a sequence by
+    performing a forward and a backward pass through the entire sequence.
+    However, unlike unidirectional RNNs, bidirectional RNNs are challenging
+    to deploy in an online and low-latency setting. The lookahead convolution
+    incorporates information from future subsequences in a computationally
+    efficient manner to improve unidirectional recurrent neural networks.
+
+    The connection of row convolution is different from the 1D sequence
+    convolution. Assuming the future context length is k, the output at
+    timestep t is computed from the input features of the t-th timestep
+    through the (t+k)-th timestep. Assuming the hidden dim of the input
+    activations is d, the activations r_t for the new layer at timestep t are:
+
+    .. math::
+
+        r_{t,i} = \\sum_{j=1}^{k + 1} {w_{i,j}h_{t+j-1, i}}
+                  \\quad \\text{for} \\quad  (1 \\leq i \\leq d)
+
+    Note:
+        The `context_len` is `k + 1`. That is to say, the lookahead step
+        number plus one equals context_len.
+
+    The example usage is:
+
+    .. code-block:: python
+
+       row_conv = row_conv_layer(input=input_layer, context_len=3)
+
+    :param input: The input layer.
+    :type input: LayerOutput
+    :param context_len: The lookahead step number plus one; must be positive.
+    :type context_len: int
+    :param act: Activation Type. Default is linear activation.
+    :type act: BaseActivation
+    :param name: Name of this layer.
+    :type name: basestring
+    :param param_attr: The Parameter Attribute. If None, the parameter will be
+                       initialized smartly. It's better to set it by yourself.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput)
+    assert context_len > 0, "the context_len must be greater than 0."
+
+    Layer(
+        inputs=[Input(input.name, **param_attr.attr)],
+        name=name,
+        context_length=context_len,
+        type=LayerType.ROW_CONV_LAYER,
+        active_type=act.name,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.ROW_CONV_LAYER, input, activation=act, size=input.size)
+
+
+@layer_support()
+@wrap_name_default()
+@wrap_param_attr_default()
+def prelu_layer(input,
+                name=None,
+                partial_sum=1,
+                param_attr=None,
+                layer_attr=None):
+    """
+    The Parameter Relu layer activates its outputs with a learnable weight.
+
+    Reference:
+        Delving Deep into Rectifiers: Surpassing Human-Level Performance on
+        ImageNet Classification http://arxiv.org/pdf/1502.01852v1.pdf
+
+    .. math::
+       z_i &\\quad if \\quad z_i > 0 \\\\
+       a_i * z_i  &\\quad \\mathrm{otherwise}
+
+    The example usage is:
+
+    .. code-block:: python
+
+       prelu = prelu_layer(input=layers, partial_sum=1)
+
+    :param name: Name of this layer.
+    :type name: basestring
+    :param input: The input layer.
+    :type input: LayerOutput
+    :param partial_sum: this parameter makes a group of inputs share the same weight.
+
+        - partial_sum = 1, indicates the element-wise activation: each element has a weight.
+        - partial_sum = number of elements in one channel, indicates the channel-wise activation, elements in a channel share the same weight.
+        - partial_sum = number of outputs, indicates all elements share the same weight.
+
+    :type partial_sum: int
+    :param param_attr: The parameter attribute. See ParameterAttribute for details.
+    :type param_attr: ParameterAttribute|None
+    :param layer_attr: Extra layer configurations. Default is None.
+    :type layer_attr: ExtraLayerAttribute|None
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    assert isinstance(input, LayerOutput), 'prelu_layer only accepts one input'
+    assert isinstance(param_attr, ParameterAttribute)
+
+    l = Layer(
+        name=name,
+        type=LayerType.PRELU,
+        inputs=Input(input.name, **param_attr.attr),
+        partial_sum=partial_sum,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name=name,
+        layer_type=LayerType.PRELU,
+        parents=input,
+        size=l.config.size)
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index fb533a47e0b0585be6f0e019086993f8b3aa7f38..1bf59ed4840ae69afc5bce49c86a08b60e9603ee 100755
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -26,10 +26,10 @@ from paddle.trainer.config_parser import *
 
 __all__ = [
     'sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool",
-    "img_conv_bn_pool", 'dropout_layer', 'lstmemory_group', 'lstmemory_unit',
-    'small_vgg', 'img_conv_group', 'vgg_16_network', 'gru_unit', 'gru_group',
-    'simple_gru', 'simple_attention', 'simple_gru2', 'bidirectional_gru',
-    'text_conv_pool', 'bidirectional_lstm', 'inputs', 'outputs'
+    "img_conv_bn_pool", 'lstmemory_group', 'lstmemory_unit', 'small_vgg',
+    'img_conv_group', 'vgg_16_network', 'gru_unit', 'gru_group', 'simple_gru',
+    'simple_attention', 'simple_gru2', 'bidirectional_gru', 'text_conv_pool',
+    'bidirectional_lstm', 'inputs', 'outputs'
 ]
 
 ######################################################
@@ -1366,29 +1366,6 @@ def simple_attention(encoded_sequence,
         input=scaled, pooling_type=SumPooling(), name="%s_pooling" % name)
 
 
-############################################################################
-#                         Miscs                                            #
-############################################################################
-
-
-@wrap_name_default("dropout")
-def dropout_layer(input, dropout_rate, name=None):
-    """
-    @TODO(yuyang18): Add comments.
-
-    :param name:
-    :param input:
-    :param dropout_rate:
-    :return:
-    """
-    return addto_layer(
-        name=name,
-        input=input,
-        act=LinearActivation(),
-        bias_attr=False,
-        layer_attr=ExtraAttr(drop_rate=dropout_rate))
-
-
 def inputs(layers, *args):
     """
     Declare the inputs of network. The order of input should be as same as
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index 981ccbf248391b5db4339570d918404df6033f3d..c24102255f5bbed0f551b2dbfec20be7daf5f5b4 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -5,6 +5,7 @@ last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
 img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
 test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
 test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
-test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer)
+test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
+test_prelu_layer test_row_conv)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..64d227565f2b21ff43d4391c682ca90c0f47908e
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_prelu_layer.protostr
@@ -0,0 +1,36 @@
+type: "nn"
+layers {
+  name: "input"
+  type: "data"
+  size: 300
+  active_type: ""
+}
+layers {
+  name: "__prelu_layer_0__"
+  type: "prelu"
+  size: 300
+  active_type: ""
+  inputs {
+    input_layer_name: "input"
+    input_parameter_name: "___prelu_layer_0__.w0"
+  }
+}
+parameters {
+  name: "___prelu_layer_0__.w0"
+  size: 300
+  initial_mean: 0.0
+  initial_std: 0.057735026919
+  initial_strategy: 0
+  initial_smart: true
+}
+input_layer_names: "input"
+output_layer_names: "__prelu_layer_0__"
+sub_models {
+  name: "root"
+  layer_names: "input"
+  layer_names: "__prelu_layer_0__"
+  input_layer_names: "input"
+  output_layer_names: "__prelu_layer_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..9ec15d2a19ec50a1729f9eeaa6dce8b1153c776b
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr
@@ -0,0 +1,41 @@
+type: "nn"
+layers {
+  name: "data"
+  type: "data"
+  size: 2560
+  active_type: ""
+}
+layers {
+  name: "__row_conv_layer_0__"
+  type: "row_conv"
+  size: 2560
+  active_type: "relu"
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "___row_conv_layer_0__.w0"
+    row_conv_conf {
+      context_length: 19
+    }
+  }
+}
+parameters {
+  name: "___row_conv_layer_0__.w0"
+  size: 48640
+  initial_mean: 0.0
+  initial_std: 0.229415733871
+  dims: 19
+  dims: 2560
+  initial_strategy: 0
+  initial_smart: true
+}
+input_layer_names: "data"
+output_layer_names: "__row_conv_layer_0__"
+sub_models {
+  name: "root"
+  layer_names: "data"
+  layer_names: "__row_conv_layer_0__"
+  input_layer_names: "data"
+  output_layer_names: "__row_conv_layer_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py b/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e3057f323db22ffc3911cce30ec2e8bb95e3dbe
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_prelu_layer.py
@@ -0,0 +1,6 @@
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='input', size=300)
+prelu = prelu_layer(input=data)
+
+outputs(prelu)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py b/python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py
new file mode 100644
index 0000000000000000000000000000000000000000..ab33c496b0663d8472ce4b272be6c5cecbcfc978
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py
@@ -0,0 +1,9 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+data = data_layer(name='data', size=2560)
+
+row_conv = row_conv_layer(input=data, context_len=19, act=ReluActivation())
+
+outputs(row_conv)
diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py
index 418b592a5ac638cc61b86a9b3fbdcee1e3a0bcaf..9c614914b5e372e8e5e3c3c072b18b83edf51e87 100644
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
@@ -149,3 +149,57 @@ def cluster_files_reader(files_pattern,
                     yield line
 
     return reader
+
+
+def convert(output_path,
+            reader,
+            num_shards,
+            name_prefix,
+            max_lines_to_shuffle=1000):
+    """
+    Convert data from reader to recordio format files.
+
+    :param output_path: directory in which output files will be saved.
+    :param reader: a data reader, from which the convert program will read data instances.
+    :param num_shards: the number of shards that the dataset will be partitioned into.
+    :param name_prefix: the name prefix of generated files.
+    :param max_lines_to_shuffle: the max number of records shuffled together per write.
+    """
+    import recordio
+    import cPickle as pickle
+    import random
+
+    assert num_shards >= 1
+    assert max_lines_to_shuffle >= 1
+
+    def open_writers():
+        w = []
+        for i in range(0, num_shards):
+            n = "%s/%s-%05d-of-%05d" % (output_path, name_prefix, i,
+                                        num_shards - 1)
+            w.append(recordio.writer(n))
+
+        return w
+
+    def close_writers(w):
+        for i in range(0, num_shards):
+            w[i].close()
+
+    def write_data(w, lines):
+        random.shuffle(lines)
+        for i, d in enumerate(lines):
+            d = pickle.dumps(d, pickle.HIGHEST_PROTOCOL)
+            w[i % num_shards].write(d)
+
+    w = open_writers()
+    lines = []
+    try:
+        for d in reader():
+            lines.append(d)
+            # Flush once the buffer holds max_lines_to_shuffle records.
+            if len(lines) >= max_lines_to_shuffle:
+                write_data(w, lines)
+                lines = []
+        write_data(w, lines)
+    finally:
+        close_writers(w)
diff --git a/python/paddle/v2/dataset/tests/common_test.py b/python/paddle/v2/dataset/tests/common_test.py
index f9815d4f9e1ee3bbe9ccf2dae588c51c262468c1..cfa194eba38ea70311c4deeac2635dc0a0103576 100644
--- a/python/paddle/v2/dataset/tests/common_test.py
+++ b/python/paddle/v2/dataset/tests/common_test.py
@@ -57,6 +57,38 @@ class TestCommon(unittest.TestCase):
         for idx, e in enumerate(reader()):
             self.assertEqual(e, str("0"))
 
+    def test_convert(self):
+        # convert() must create num_shards files that hold every record.
+        record_num = 10
+        num_shards = 4
+
+        def test_reader():
+            def reader():
+                for x in xrange(record_num):
+                    yield x
+
+            return reader
+
+        path = tempfile.mkdtemp()
+        paddle.v2.dataset.common.convert(path,
+                                         test_reader(), num_shards,
+                                         'random_images')
+
+        files = glob.glob(path + '/random_images-*')
+        self.assertEqual(len(files), num_shards)
+
+        recs = []
+        for i in range(0, num_shards):
+            n = "%s/random_images-%05d-of-%05d" % (path, i, num_shards - 1)
+            r = recordio.reader(n)
+            while True:
+                d = r.read()
+                if d is None:
+                    break
+                recs.append(d)
+
+        self.assertEqual(len(recs), record_num)
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py
index 815635f5dd4654fe3a31a9244e6e4473c397dd2f..aeed9ebd7d4d64efa5d0bf1638742a485c0fa44a 100644
--- a/python/paddle/v2/layer.py
+++ b/python/paddle/v2/layer.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 """
 `paddle.v2.layer` is a part of model config packages in paddle.v2. In API v2,
-we want to make Paddle a plain Python package. The model config package defined
+we want to make Paddle a plain Python package. The model config package defines
 the way how to configure a neural network topology in Paddle Python code.
 
 The primary usage shows below.
@@ -30,7 +30,6 @@ The primary usage shows below.
     # use prediction instance where needed.
     parameters = paddle.parameters.create(cost)
 """
-
 import collections
 import copy
 import re
@@ -44,9 +43,10 @@ __all__ = ['data', 'parse_network']
 
 
 def __need_to_keep__(name):
-    if name in ['StaticInput', 'LayerType', 'layer_support']:
-        return False
-    return True
+    # These v1 names are returned unchanged by __convert_name__.
+    return name in (
+        'StaticInput', 'SubsequenceInput', 'GeneratedInput', 'LayerType',
+        'layer_support')
 
 
 def __need_to_wrap__(name):
@@ -54,6 +54,8 @@ def __need_to_wrap__(name):
 
 
 def __convert_name__(inname):
+    if __need_to_keep__(inname):
+        return inname
     if inname == 'maxid_layer':
         return 'max_id'
     elif inname.endswith('memory') or inname.endswith(
@@ -74,8 +76,6 @@ def __convert_name__(inname):
 
 for name in v1_layers.__all__:
     obj = getattr(v1_layers, name)
-    if not __need_to_keep__(name):
-        continue
     new_name = __convert_name__(name)
     if callable(obj) and __need_to_wrap__(name):
         globals()[new_name] = __convert_to_v2__(obj, new_name, __name__)
@@ -107,7 +107,7 @@ __data_layer__.__doc__ = __map_data_docstr__(v1_layers.data_layer.__doc__)
 data = __convert_to_v2__(__data_layer__, 'name', __name__)
 
 
-def __get_used_layers__(output_layers, extra_layers=None):
+def __get_used_layers__(output_layers):
     layer_names = set()
     parents = {}
 
@@ -132,6 +132,13 @@ def __get_used_layers__(output_layers, extra_layers=None):
                     add_parent(mem.layer_name, mem.boot_layer_name)
                 add_parent(mem.link_name, mem.layer_name)
 
+            if sub_model.HasField('generator'):
+                # according to the implementation of text generation
+                # in recurrent layer group, the generated word must be
+                # the first out link
+                add_parent(sub_model.out_links[0].layer_name,
+                           sub_model.generator.eos_layer_name)
+
     def dfs_travel(layer_name):
         if layer_name in layer_names:
             return
@@ -247,9 +254,9 @@ def __trim_submodel__(old_submodel, layer_names, input_layer_names,
 def parse_network(output_layers, extra_layers=None):
     if not isinstance(output_layers, collections.Sequence):
         output_layers = [output_layers]
-    if extra_layers is not None and not isinstance(extra_layers,
-                                                   collections.Sequence):
-        extra_layers = [extra_layers]
+    if extra_layers is not None:
+        if not isinstance(extra_layers, collections.Sequence):
+            extra_layers = [extra_layers]
     else:
         extra_layers = []
 
@@ -262,18 +269,29 @@ def parse_network(output_layers, extra_layers=None):
 
     model_config = ModelConfig()
     model_config.type = cp.g_config.model_config.type
+
+    for layer in output_layers:
+        model_config.output_layer_names.append(layer.full_name)
+        output_layer_names.add(layer.full_name)
+
     for l in cp.g_config.model_config.layers:
         if l.name not in layer_names:
             continue
         model_config.layers.extend([l])
         if l.type == 'data':
+            if l.name in model_config.output_layer_names:
+                # In text generation, the outlink to save the generated word
+                # indices is a data_layer defined in recurrent_group.
+                #
+                # This data_layer is sure to be the output of the network in
+                # a text generation task, so this statement excludes such a
+                # special data_layer from being inputs of the network.
+                #
+                # Otherwise an error will occur during data feeding.
+                continue
             model_config.input_layer_names.append(l.name)
             input_layer_names.add(l.name)
 
-    for layer in output_layers:
-        model_config.output_layer_names.append(layer.full_name)
-        output_layer_names.add(layer.full_name)
-
     for e in cp.g_config.model_config.evaluators:
         if e.name in evaluator_names:
             model_config.evaluators.extend([e])
diff --git a/python/paddle/v2/topology.py b/python/paddle/v2/topology.py
index f3bb4d5f10dd6c5b220161e32dfc3a94642ac7a2..a20e878d0817d0a75e9c47a44f8765deca99225c 100644
--- a/python/paddle/v2/topology.py
+++ b/python/paddle/v2/topology.py
@@ -31,7 +31,6 @@ class Topology(object):
     def __init__(self, layers, extra_layers=None):
         def __check__(layers):
             if not isinstance(layers, collections.Sequence):
-                __check_layer_type__(layers)
                 layers = [layers]
             for layer in layers:
                 __check_layer_type__(layer)
@@ -91,6 +90,7 @@ class Topology(object):
         [('image', dense_vector(768)), ('label', integer_value(10))]
         """
         data_layers = self.data_layers()
+
         return [(nm, data_layers[nm].data_type)
                 for nm in self.proto().input_layer_names]