PaddlePaddle / Paddle
Commit 18cd1f25
Authored Jun 04, 2017 by dangqingqing

Fix bug and Python API.

Parent commit: b783e08e
Showing 11 changed files with 295 additions and 107 deletions (+295 −107)
paddle/function/RowConvOp.cpp                                                          +59 −34
paddle/function/RowConvOp.h                                                            +16 −2
paddle/function/RowConvOpGpu.cu                                                        +64 −49
paddle/function/RowConvOpTest.cpp                                                      +10 −17
paddle/gserver/layers/RowConvLayer.cpp                                                 +1 −1
paddle/gserver/layers/RowConvLayer.h                                                   +1 −3
python/paddle/trainer/config_parser.py                                                 +17 −0
python/paddle/trainer_config_helpers/layers.py                                         +76 −0
python/paddle/trainer_config_helpers/tests/configs/file_list.sh                        +1 −1
python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr     +41 −0
python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py                    +9 −0
paddle/function/RowConvOp.cpp

@@ -61,7 +61,7 @@ void RowConvGrad<DEVICE_TYPE_CPU>(const CpuMatrix& outG,
     size_t begin = starts[i];
     size_t end = starts[i + 1];
     size_t steps = end - begin;
-    for (size_t j = 0; j < contextLength; ++j) {
+    for (size_t j = 0; j < contextLength && (begin + j) < end; ++j) {
       MatrixPtr x =
           (const_cast<CpuMatrix&>(in)).subMatrix(begin + j, steps - j);
       MatrixPtr dy =
@@ -81,7 +81,7 @@ void RowConvGrad<DEVICE_TYPE_CPU>(const CpuMatrix& outG,
     for (size_t j = 0; j < steps; ++j) {
       MatrixPtr dx = inG.subMatrix(begin + j, 1);
       for (size_t t = 0; t < contextLength; ++t) {
-        if ((int(j) - int(t)) >= 0) {
+        if (int(j - t) >= 0) {
           MatrixPtr dy =
               (const_cast<CpuMatrix&>(outG)).subMatrix(begin + j - t, 1);
           MatrixPtr w = (const_cast<CpuMatrix&>(filter)).subMatrix(t, 1);
@@ -94,8 +94,37 @@ void RowConvGrad<DEVICE_TYPE_CPU>(const CpuMatrix& outG,
   }
 }

 /**
- * \brief TODO(qingqing)
+ * \brief The row convolution is called lookahead convolution. It was first
+ * introduced in the DeepSpeech2 system. A bidirectional RNN learns a
+ * representation for a sequence by performing a forward and a backward pass
+ * through the entire sequence. However, unlike unidirectional RNNs,
+ * bidirectional RNNs are challenging to deploy in an online, low-latency
+ * setting. The lookahead convolution incorporates information from future
+ * subsequences in a computationally efficient manner to improve unidirectional
+ * recurrent neural networks.
+ *
+ * The connection pattern of row convolution differs from 1D sequence
+ * convolution. Assume the future context length is k, i.e. the output at
+ * timestep t is computed from the input features of timesteps t through
+ * (t + k). Assume the hidden dimension of the input activations is d; the
+ * activations r(t, i) of the new layer at timestep t are:
+ *
+ *             -- k + 1
+ *  r(t, i) =  >        W(i, j) * h(t + j - 1, i),   for (1 <= i <= d)
+ *             -- j = 1
+ *
+ * The weight shape is (k + 1) x d.
+ *
+ * Function Arguments:
+ *
+ * \param inputs[0]  The input activations.
+ * \param inputs[1]  The filter (or weight), with shape (k + 1) x d.
+ * \param outputs[0] The output activations.
+ *
+ * [1] Dario Amodei, et al. Deep Speech 2: End-to-End Speech Recognition in
+ *     English and Mandarin. https://arxiv.org/abs/1512.02595
  */

 template <DeviceType Device>
@@ -128,10 +157,21 @@ public:
     RowConv<Device>(outMat, inMat, wMat, seqId);
   }
 };

 /**
- * \brief TODO(qingqing)
+ * \brief The backward of the row convolution function. This function computes
+ * the gradient w.r.t. the filter and the gradient w.r.t. the input
+ * activations (or data).
  *
  * Argument in this Function:
  *
+ * \param inputs[0]  The gradient w.r.t. the output activations.
+ * \param inputs[1]  The input activations.
+ * \param inputs[2]  The filter (or weight), with shape (k + 1) x d.
+ * \param outputs[0] The gradient w.r.t. the input activations.
+ * \param outputs[1] The gradient w.r.t. the filter.
+ *
+ * Abbreviation:
+ *     w.r.t.: with respect to.
  */

 template <DeviceType Device>
@@ -140,12 +180,27 @@ public:
   void init(const FuncConfig& config) override {}

   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    // check
+    CHECK_EQ(3UL, inputs.size());
+    CHECK_EQ(2UL, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    CHECK_EQ(outputs[1].getArgType(), ADD_TO);
+    CHECK(inputs[0].isSequenceArg() && inputs[1].isSequenceArg() &&
+          outputs[0].isSequenceArg())
+        << "SequenceArg required here.";
+
     const auto outGrad = dynamic_cast<const SequenceArg&>(inputs[0]);
     const auto in = dynamic_cast<const SequenceArg&>(inputs[1]);
     const auto w = inputs[2];
     auto inGrad = dynamic_cast<const SequenceArg&>(outputs[0]);
     auto wGrad = outputs[1];

+    CHECK_EQ(in.shape().ndims(), 2UL);
+    CHECK_EQ(outGrad.shape().ndims(), 2UL);
+    CHECK_EQ(in.shape()[1], outGrad.shape()[1]);
+    CHECK_EQ(in.shape()[0], outGrad.shape()[0]);
+    CHECK_EQ(wGrad.shape()[1], in.shape()[1]);
+
     const auto outGMat = outGrad.matrix<Device>();
     const auto inMat = in.matrix<Device>();
     const auto wMat = w.matrix<Device>();
@@ -157,37 +212,7 @@ public:
             : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);

     const auto seqId = in.getSequenceId().vector<int, Device>();
-    std::cout << "in:" << std::endl;
-    for (int i = 0; i < inMat.getHeight(); ++i) {
-      for (int j = 0; j < inMat.getWidth(); ++j) {
-        std::cout << outGMat.getElement(i, j) << " ";
-      }
-      std::cout << std::endl;
-    }
-    std::cout << "w:" << std::endl;
-    for (int i = 0; i < wMat.getHeight(); ++i) {
-      for (int j = 0; j < wMat.getWidth(); ++j) {
-        std::cout << wMat.getElement(i, j) << " ";
-      }
-      std::cout << std::endl;
-    }
-    std::cout << "w:" << std::endl;
-    for (int i = 0; i < seqId.getSize(); ++i) {
-      std::cout << seqId.getElement(i) << " ";
-    }
-    std::cout << std::endl;
-
     RowConvGrad<Device>(outGMat, inMat, wMat, inGMat, wGMat, seqId);
-    std::cout << std::endl << "out:" << std::endl;
-    for (int i = 0; i < inGMat.getHeight(); ++i) {
-      for (int j = 0; j < inGMat.getWidth(); ++j) {
-        std::cout << inGMat.getElement(i, j) << " ";
-      }
-      std::cout << std::endl;
-    }
   }
 };
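To make the forward rule in the new doc comment concrete, here is a minimal NumPy sketch of the same computation (a hypothetical reference implementation written for this page, not code from the commit). It assumes the h x d input layout and the sequence start offsets described above:

    import numpy as np

    def row_conv_forward(x, w, starts):
        # x: (h, d) input activations, h = total time steps over all sequences
        # w: (k + 1, d) filter, k = number of lookahead steps
        # starts: sequence start offsets, e.g. [0, len0, len0 + len1, ...]
        out = np.zeros_like(x)
        context = w.shape[0]
        for begin, end in zip(starts[:-1], starts[1:]):
            for t in range(begin, end):
                for j in range(context):
                    if t + j < end:                # stay inside the sequence,
                        out[t] += w[j] * x[t + j]  # as the fixed CPU loop does
        return out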
paddle/function/RowConvOp.h

@@ -19,7 +19,14 @@ limitations under the License. */
 namespace paddle {

 /**
- * \brief TODO(qingqing)
+ * \brief The forward of row convolution.
+ *
+ * \param[out] out     The output data, with shape h x d. h is the sum of the
+ *                     time steps of all samples in one mini-batch.
+ * \param[in]  in      The input data, with shape h x d.
+ * \param[in]  filter  The filter, with shape k x d; the number of lookahead
+ *                     steps plus one equals k.
+ * \param[in]  seq     The sequence start positions.
  *
  */
 template <DeviceType DType>
@@ -29,7 +36,14 @@ void RowConv(typename Tensor<real, DType>::Matrix& out,
              const typename Tensor<int, DType>::Vector& seq);

 /**
- * \brief TODO(qingqing)
+ * \brief The backward of row convolution.
+ *
+ * \param[in]  outG     The gradient w.r.t. the output data.
+ * \param[in]  in       The input data.
+ * \param[in]  filter   The filter.
+ * \param[out] inG      The gradient w.r.t. the input data.
+ * \param[out] filterG  The gradient w.r.t. the filter.
+ * \param[in]  seq      The sequence start positions.
  *
  */
 template <DeviceType DType>
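The backward signature above pairs naturally with the forward sketch given earlier. Below is a hypothetical NumPy transcription of those gradients (same assumed layout; illustration only, not the registered function):

    import numpy as np

    def row_conv_backward(dy, x, w, starts):
        # dy: (h, d) gradient w.r.t. the output
        # x:  (h, d) input activations
        # w:  (k + 1, d) filter
        # returns (dx, dw), the gradients w.r.t. the input and the filter
        dx = np.zeros_like(x)
        dw = np.zeros_like(w)
        context = w.shape[0]
        for begin, end in zip(starts[:-1], starts[1:]):
            for t in range(begin, end):
                for j in range(context):
                    if t + j < end:
                        # forward: out[t] += w[j] * x[t + j], hence:
                        dw[j] += dy[t] * x[t + j]
                        dx[t + j] += dy[t] * w[j]
        return dx, dw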
paddle/function/RowConvOpGpu.cu

@@ -96,11 +96,6 @@ void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,
   const size_t height = in.getHeight();
   const size_t width = in.getWidth();
-  LOG(INFO) << numSeq;
-  LOG(INFO) << contextLength;
-  LOG(INFO) << height;
-  LOG(INFO) << width;
   real* y = out.getData();
   const real* x = in.getData();
   const real* w = filter.getData();
@@ -108,7 +103,6 @@ void RowConv<DEVICE_TYPE_GPU>(GpuMatrix& out,
   dim3 dimBlock(32, 32);
   dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
-  LOG(INFO) << dimGrid.x;
   if (contextLength <= 32) {
     KeRowConv<32, 32><<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
@@ -131,12 +125,12 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
   const int blky = blockDim.y;
   const int gidx = blockIdx.x * blockDim.x;

-  __shared__ real sh_x[BLOCK_H][BLOCK_W];
-  __shared__ real sh_dy[BLOCK_H][BLOCK_W];
+  __shared__ real sh_x[BLOCK_W][BLOCK_H];
+  __shared__ real sh_dy[BLOCK_W][BLOCK_H + CONTEXT - 1];
   __shared__ real sh_dw[CONTEXT][BLOCK_W];

-  for (int t = tidy; t < context; t += blky) {
-    sh_dw[t][tidx] = 0.0;
+  if (tidy < context) {
+    sh_dw[tidy][tidx] = 0.0;
   }
   __syncthreads();
@@ -144,21 +138,31 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
     const int start = starts[i];
     const int end = starts[i + 1];
     const int steps = end - start;
-    for (int j = tidy; j < steps; j += BLOCK_H) {
+    const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H;
+    for (int j = tidy; j < size; j += BLOCK_H) {
       int xoff = gidx + tidx;
       int yoff = start + j;

       // transpose
-      sh_x[tidx][tidy] = xoff < width && yoff < end ? x[yoff * width + xoff] : 0.0;
-      sh_dy[tidx][tidy] = xoff < width && yoff < end ? dy[yoff * width + xoff] : 0.0;
+      sh_x[tidx][tidy] =
+          (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
+      sh_dy[tidx][tidy + context - 1] =
+          (xoff < width && yoff < end) ? dy[yoff * width + xoff] : 0.0;
+      __syncthreads();
+      if (tidy < (context - 1)) {
+        yoff = yoff - context + 1;
+        sh_dy[tidx][tidy] =
+            (xoff < width && yoff >= start) ? dy[yoff * width + xoff] : 0.0;
+      }
       __syncthreads();

       for (int t = 0; t < context; t++) {
-        real val = tidx + t < blockDim.x ? sh_x[tidy][tidx + t] * sh_dy[tidy][tidx] : 0.0;
+        real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx + context - 1 - t];
+        __syncthreads();
         // warp size and blockDim.x is 32.
-        val += __shfl_down(val, 16);
-        val += __shfl_down(val, 8);
-        val += __shfl_down(val, 4);
-        val += __shfl_down(val, 2);
-        val += __shfl_down(val, 1);
+        for (int offset = 16; offset > 0; offset /= 2) {
+          val += __shfl_down(val, offset);
+        }
+        __syncthreads();
         if (tidx == 0) {
           sh_dw[t][tidy] += val;
         }
@@ -167,7 +171,7 @@ __global__ void KeRowConvBwWeight(real* dw, const real* x, const real* dy,
     }
   }

-  for (int t = tidy; t < context && (gidx + tidx) < width; t += blky) {
+  for (int t = tidy; (t < context) && ((gidx + tidx) < width); t += blky) {
     dw[t * width + gidx + tidx] += sh_dw[t][tidx];
   }
 }
@@ -188,21 +192,30 @@ __global__ void KeRowConvBwWeight2(real* dw, const real* x, const real* dy,
     const int start = starts[i];
     const int end = starts[i + 1];
     const int steps = end - start;
-    for (int j = 0; j < steps; j += BLOCK_H) {
+    const int size = ((steps + BLOCK_H - 1) / BLOCK_H) * BLOCK_H;
+    for (int j = tidy; j < size; j += BLOCK_H) {
       int xoff = gidx + tidx;
       int yoff = start + j;

       // transpose
-      sh_x[tidx][tidy] = xoff < width && yoff < end ? x[yoff * width + xoff] : 0.0;
-      sh_dy[tidx][tidy] = xoff < width && yoff < end ? dy[yoff * width + xoff] : 0.0;
+      sh_x[tidx][tidy] =
+          (xoff < width && yoff < end) ? x[yoff * width + xoff] : 0.0;
       __syncthreads();

       for (int t = 0; t < context; t++) {
-        real val = tidx + t < blockDim.x ? sh_x[tidy][tidx + t] * sh_dy[tidy][tidx] : 0.0;
+        sh_dy[tidx][tidy] =
+            (xoff < width && (yoff - t) >= start && yoff - t < end)
+                ? dy[(yoff - t) * width + xoff]
+                : 0.0;
+        __syncthreads();
+
+        real val = sh_x[tidy][tidx] * sh_dy[tidy][tidx];
+        __syncthreads();
         // warp size and blockDim.x is 32.
-        val += __shfl_down(val, 16);
-        val += __shfl_down(val, 8);
-        val += __shfl_down(val, 4);
-        val += __shfl_down(val, 2);
-        val += __shfl_down(val, 1);
+        for (int offset = 16; offset > 0; offset /= 2) {
+          val += __shfl_down(val, offset);
+        }
+        __syncthreads();
         if (tidx == 0 && (gidx + tidy) < width) {
           dw[t * width + gidx + tidy] += val;
         }
@@ -293,34 +306,36 @@ void RowConvGrad<DEVICE_TYPE_GPU>(const GpuMatrix& outG,
   const real* dy = outG.getData();
   const real* x = in.getData();
   const real* w = filter.getData();
-  real* dx = inG.getData();
-  real* dw = filterG.getData();
   const int* starts = seq.getData();

-  dim3 dimBlock(32, 32);
-  dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
-  if (contextLength <= 16) {
-    KeRowConvBwWeight<32, 32, 16>
-        <<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
-        (dw, x, dy, starts, height, width, numSeq, contextLength);
-  } else {
-    KeRowConvBwWeight2<32, 32>
-        <<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
-        (dw, x, dy, starts, height, width, numSeq, contextLength);
+  if (filterG) {
+    dim3 dimBlock(32, 32);
+    dim3 dimGrid(DIVUP(width, dimBlock.x), 1);
+    real* dw = filterG.getData();
+    if (contextLength <= 16) {
+      KeRowConvBwWeight<32, 32, 16>
+          <<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
+          (dw, x, dy, starts, height, width, numSeq, contextLength);
+    } else {
+      KeRowConvBwWeight2<32, 32>
+          <<<dimGrid, dimBlock, 0, STREAM_DEFAULT>>>
+          (dw, x, dy, starts, height, width, numSeq, contextLength);
+    }
   }

-  dim3 dimBlock2(32, 32);
-  dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1);
-  if (contextLength <= 64) {
-    KeRowConvBwData<32, 64>
-        <<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>
-        (dx, w, dy, starts, height, width, numSeq, contextLength);
-  } else {
-    KeRowConvBwData2
-        <<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>
-        (dx, w, dy, starts, height, width, numSeq, contextLength);
+  if (inG) {
+    real* dx = inG.getData();
+    dim3 dimBlock2(32, 32);
+    dim3 dimGrid2(DIVUP(width, dimBlock2.x), 1);
+    if (contextLength <= 64) {
+      KeRowConvBwData<32, 64>
+          <<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>
+          (dx, w, dy, starts, height, width, numSeq, contextLength);
+    } else {
+      KeRowConvBwData2
+          <<<dimGrid2, dimBlock2, 0, STREAM_DEFAULT>>>
+          (dx, w, dy, starts, height, width, numSeq, contextLength);
+    }
   }
   CHECK_SYNC("RowConvGrad");
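The weight-gradient kernels above replace the hardcoded sequence __shfl_down(val, 16) ... __shfl_down(val, 1) with a halving-offset loop. As a rough illustration of the reduction that loop performs (plain Python standing in for a 32-lane warp; this is not CUDA semantics and not code from the commit):

    def warp_reduce_sum(vals, width=32):
        # Mimics: for (int offset = 16; offset > 0; offset /= 2)
        #             val += __shfl_down(val, offset);
        # Each lane adds the value held by the lane `offset` positions to its
        # right; after log2(width) rounds, lane 0 holds the sum of all lanes.
        vals = list(vals)
        offset = width // 2
        while offset > 0:
            vals = [v + (vals[i + offset] if i + offset < width else v)
                    for i, v in enumerate(vals)]
            offset //= 2
        return vals[0]

    # warp_reduce_sum(range(32)) == sum(range(32)) == 496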
paddle/function/RowConvOpTest.cpp

@@ -47,23 +47,16 @@ void testRowConvBw(size_t batchSize, size_t dim, size_t contextLength) {
 }

 TEST(RowConv, real) {
-  // for (size_t numSamples : {17, 129}) {
-  // for (size_t dim : {16, 248}) {
-  // for (size_t context: {3, 7, 65}) {
-  LOG(INFO) << "===========";
-  // for (size_t numSamples : {17}) {
-  // for (size_t dim : {16}) {
-  // for (size_t context: {3}) {
-  size_t numSamples = 17;
-  size_t dim = 16;
-  size_t context = 3;
-  LOG(INFO) << " numSamples=" << numSamples << " dim=" << dim
-            << " context length=" << context;
-  testRowConvFw(numSamples, dim, context);
-  // testRowConvBw(numSamples, dim, context);
-  // }
-  // }
-  // }
+  for (size_t numSamples : {17, 129, 2020}) {
+    for (size_t dim : {16, 512, 2560}) {
+      for (size_t context : {3, 19, 65}) {
+        VLOG(3) << " numSamples=" << numSamples << " dim=" << dim
+                << " context length=" << context;
+        testRowConvFw(numSamples, dim, context);
+        testRowConvBw(numSamples, dim, context);
+      }
+    }
+  }
 }

 }  // namespace paddle
paddle/gserver/layers/RowConvLayer.cpp

@@ -75,7 +75,7 @@ void RowConvLayer::backward(const UpdateCallback& callback) {
   BufferArgs outputs;
   inputs.addArg(*getOutputGrad(), *startPos);
   inputs.addArg(*getInputValue(0), *startPos);
-  inputs.addArg(*weight_->getW(), *startPos);
+  inputs.addArg(*weight_->getW(), wDims_);
   MatrixPtr inGrad = getInputGrad(0);
   MatrixPtr wGrad = weight_->getWGrad();
paddle/gserver/layers/RowConvLayer.h

@@ -37,9 +37,7 @@ protected:
   // fan_out is the size of output feature.
   std::unique_ptr<Weight> weight_;
-  // std::unique_ptr<Weight> biases_;
-
-  // how many steps to look ahead
+  // The step number to look ahead plus one equals contexLength_.
   size_t contexLength_;
   TensorShape wDims_;
 };
python/paddle/trainer/config_parser.py

@@ -2081,6 +2081,23 @@ class MaxOutLayer(LayerBase):
             g_layer_map[input_layer.name].width, out_channels)


+@config_layer('row_conv')
+class RowConvLayer(LayerBase):
+    def __init__(self, name, inputs, context_length, **xargs):
+        super(RowConvLayer, self).__init__(
+            name, 'maxout', 0, inputs=inputs, **xargs)
+        config_assert(
+            len(self.inputs) == 1,
+            'TransLayer must have one and only one input')
+        input_layer = self.get_input_layer(0)
+        row_conv_conf = self.config.inputs[0].row_conv_conf
+        row_conv_conf.context_length = context_length
+        self.set_layer_size(input_layer.size)
+        psize = context_length * input_layer.size
+        dims = [context_length, input_layer.size]
+        self.create_input_parameter(0, psize, dims)
+
+
 # key: cost type
 # value: cost class
 g_cost_map = {}
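As a quick check of the parameter bookkeeping added above: RowConvLayer creates its weight with dims = [context_length, input_layer.size], so the parameter size is context_length * input_size. For the test configuration added later in this commit (context_length 19, input size 2560) that gives 19 * 2560 = 48640, matching the size: 48640 recorded in test_row_conv.protostr.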
python/paddle/trainer_config_helpers/layers.py

@@ -120,6 +120,7 @@ __all__ = [
     'smooth_l1_cost',
     'layer_support',
     'multiplex_layer',
+    'row_conv_layer',
 ]
@@ -187,6 +188,7 @@ class LayerType(object):
     SPP_LAYER = "spp"
     PAD_LAYER = "pad"
     MULTIPLEX_LAYER = "multiplex"
+    ROW_CONV_LAYER = "row_conv"

     PRINT_LAYER = "print"
     PRIORBOX_LAYER = "priorbox"
@@ -5528,3 +5530,77 @@ def multiplex_layer(input, name=None, layer_attr=None):
         layer_type=LayerType.MULTIPLEX_LAYER,
         parents=input,
         size=l.config.size)
+
+
+@wrap_name_default()
+@wrap_act_default(act=LinearActivation())
+@wrap_param_attr_default()
+@layer_support(DROPOUT)
+def row_conv_layer(input,
+                   context_len,
+                   act=None,
+                   name=None,
+                   param_attr=None,
+                   layer_attr=None):
+    """
+    The row convolution is called lookahead convolution. It was first
+    introduced in the paper `Deep Speech 2: End-to-End Speech Recognition
+    in English and Mandarin <https://arxiv.org/pdf/1512.02595v1.pdf>`_ .
+
+    A bidirectional RNN learns a representation for a sequence by performing
+    a forward and a backward pass through the entire sequence. However,
+    unlike unidirectional RNNs, bidirectional RNNs are challenging to deploy
+    in an online, low-latency setting. The lookahead convolution incorporates
+    information from future subsequences in a computationally efficient
+    manner to improve unidirectional recurrent neural networks.
+
+    The connection pattern of row convolution differs from 1D sequence
+    convolution. Assume the future context length is k, i.e. the output at
+    timestep t is computed from the input features of timesteps t through
+    (t + k). Assume the hidden dimension of the input activations is d; the
+    activations r_t of the new layer at timestep t are:
+
+    .. math::
+        r_{t,i} = \sum_{j=1}^{k + 1} {w_{i,j}h_{t+j-1, i}}
+                  \quad \text{for} \quad (1 \leq i \leq d)
+
+    Note:
+        The `context_len` is `k + 1`, that is, the lookahead step number
+        plus one equals context_len.
+
+    .. code-block:: python
+
+       row_conv = row_conv_layer(input=input_layer, context_len=3)
+
+    :param input: The input layer.
+    :type input: LayerOutput
+    :param context_len: The context length equals the lookahead step number
+                        plus one.
+    :type context_len: int
+    :param act: Activation Type. Default is linear activation.
+    :type act: BaseActivation
+    :param param_attr: The Parameter Attribute. If None, the parameter will be
+                       initialized smartly. It's better to set it yourself.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput)
+    assert context_len > 0, "the context_len must be greater than 0."
+
+    Layer(
+        inputs=[Input(input.name, **param_attr.attr)],
+        name=name,
+        context_length=context_len,
+        type=LayerType.ROW_CONV_LAYER,
+        active_type=act.name,
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(
+        name, LayerType.ROW_CONV_LAYER, input, activation=act, size=input.size)
python/paddle/trainer_config_helpers/tests/configs/file_list.sh

@@ -5,6 +5,6 @@ last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
 img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
 test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
 test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
-test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer)
+test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer test_row_conv)
 export whole_configs=(test_split_datasource)
python/paddle/trainer_config_helpers/tests/configs/protostr/test_row_conv.protostr (new file, mode 100644)

type: "nn"
layers {
  name: "data"
  type: "data"
  size: 2560
  active_type: ""
}
layers {
  name: "__row_conv_layer_0__"
  type: "maxout"
  size: 2560
  active_type: "relu"
  inputs {
    input_layer_name: "data"
    input_parameter_name: "___row_conv_layer_0__.w0"
    row_conv_conf {
      context_length: 19
    }
  }
}
parameters {
  name: "___row_conv_layer_0__.w0"
  size: 48640
  initial_mean: 0.0
  initial_std: 0.229415733871
  dims: 19
  dims: 2560
  initial_strategy: 0
  initial_smart: true
}
input_layer_names: "data"
output_layer_names: "__row_conv_layer_0__"
sub_models {
  name: "root"
  layer_names: "data"
  layer_names: "__row_conv_layer_0__"
  input_layer_names: "data"
  output_layer_names: "__row_conv_layer_0__"
  is_recurrent_layer_group: false
}
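A small aside on this generated protostr: initial_std 0.229415733871 equals 1 / sqrt(19), i.e. 1 / sqrt(context_length), which suggests the "smart" initialization here uses the first weight dimension (the context length) as the fan-in.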
python/paddle/trainer_config_helpers/tests/configs/test_row_conv.py (new file, mode 100644)

from paddle.trainer_config_helpers import *

settings(batch_size=1000, learning_rate=1e-5)

data = data_layer(name='data', size=2560)

row_conv = row_conv_layer(input=data, context_len=19, act=ReluActivation())

outputs(row_conv)