Merge remote-tracking branch 'upstream/master'

23e47bb6 · liaogang · a8df4111 · d130d181 · 23e47bb6 · 23e47bb6
44 changed files
--- a/doc_cn/demo/quick_start/index.md
+++ b/doc_cn/demo/quick_start/index.md
@@ -4,7 +4,7 @@

 ## 安装(Install)

-首先请参考<a href = "../../build_and_install/install/index.html">安装教程</a>安装PaddlePaddle。
+首先请参考<a href = "../../build_and_install/index.html">安装教程</a>安装PaddlePaddle。

 ## 使用概述(Overview)


--- a/paddle/cuda/src/hl_cuda_cublas.cc
+++ b/paddle/cuda/src/hl_cuda_cublas.cc
@@ -217,7 +217,7 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
  } else {
    LOG(FATAL) << "parameter transa error!";
  }
-  CHECK_EQ(stat, CUBLAS_STATUS_SUCCESS);
+  CHECK_EQ(stat, CUBLAS_STATUS_SUCCESS) << hl_cublas_get_error_string(stat);
  CHECK_SYNC("hl_matrix_mul failed");
 }

@@ -266,7 +266,7 @@ void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
    LOG(FATAL) << "parameter transa error!";
  }

-  CHECK_EQ(stat, CUBLAS_STATUS_SUCCESS);
+  CHECK_EQ(stat, CUBLAS_STATUS_SUCCESS) << hl_cublas_get_error_string(stat);
  CHECK_SYNC("hl_matrix_mul_vector");
 }


--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -497,20 +497,21 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
    int idSize = 0;
    // connect in_links
    for (size_t j = 0; j < inFrameLines_.size(); ++j) {
+      Info& info = info_[shareInlinkInfo ? 0 : j];
      // idSize denotes the sum number of tokens in each length i
-      idSize = info_[j].idIndex[i + 1] - info_[j].idIndex[i];
+      idSize = info.idIndex[i + 1] - info.idIndex[i];
      InFrameLine inFrameLine = inFrameLines_[j];
      auto scatterAgent =
          dynamic_cast<ScatterAgentLayer*>(inFrameLine.agents[i].get());
      scatterAgent->setRealLayerAndOutput(inFrameLine.inLayer,
-                                          inFrameLine.outArg, info_[j].allIds,
-                                          info_[j].idIndex[i], idSize);
+                                          inFrameLine.outArg, info.allIds,
+                                          info.idIndex[i], idSize);
      if (hasSubseq) {
        // size: the length of subsequence
        int size =
-            info_[j].seqStartPosIndex[i + 1] - info_[j].seqStartPosIndex[i];
-        scatterAgent->setSequenceStartPositions(info_[j].sequenceStartPositions,
-                                                info_[j].seqStartPosIndex[i],
+            info.seqStartPosIndex[i + 1] - info.seqStartPosIndex[i];
+        scatterAgent->setSequenceStartPositions(info.sequenceStartPositions,
+                                                info.seqStartPosIndex[i],
                                                size);
      }
    }
@@ -744,10 +745,13 @@ void RecurrentGradientMachine::selectRowsOneTime(LayerPtr layer,
                                                 const IVectorPtr& allIds,
                                                 Argument* arg,
                                                 PassType passType) {
-  const MatrixPtr& realV = layer->getOutputValue();
+  Argument& src = layer->getOutput();
+  if (src.value) {
+    const MatrixPtr& realV = src.value;
    int height = realV->getHeight();
    int width = realV->getWidth();
-  Matrix::resizeOrCreate(arg->value, height, width, /* trans */ false, useGpu_);
+    Matrix::resizeOrCreate(
+      arg->value, height, width, /* trans */ false, useGpu_);
    arg->value->zeroMem();
    arg->value->selectRows(*realV, *allIds);
    if (passType != PASS_TEST) {
@@ -755,6 +759,11 @@ void RecurrentGradientMachine::selectRowsOneTime(LayerPtr layer,
                             useGpu_);
      arg->grad->zeroMem();
    }
+  }
+  if (src.ids) {
+    IVector::resizeOrCreate(arg->ids, src.ids->getSize(), useGpu_);
+    arg->ids->selectFrom(*src.ids, *allIds);
+  }
 }

 void RecurrentGradientMachine::createSeqPos(

--- a/paddle/gserver/layers/AgentLayer.cpp
+++ b/paddle/gserver/layers/AgentLayer.cpp
@@ -139,15 +139,16 @@ void ScatterAgentLayer::forward(PassType passType) {
  Layer::forward(passType);
  CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId());

-  if (realLayer_->getOutput().ids) {  // ids scatter
-    IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_);
-    output_.ids->selectFrom(*realLayer_->getOutput().ids, *ids_);
-  } else {  // value scatter
  int width = this->getSize();
-    if (realOutArg_.value) {
-      output_.subArgFrom(realOutArg_, /* offset */ idIndex_ * width, idSize_,
+  if (realOutArg_.value || realOutArg_.ids) {
+    output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_,
                       width, useGpu_);
  } else {  // used in generation
+    if (realLayer_->getOutput().ids) {
+      IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_);
+      output_.ids->selectFrom(*realLayer_->getOutput().ids, *ids_);
+    }
+    if (realLayer_->getOutput().value) {
      int height = ids_->getSize();
      resetOutput(height, width);

@@ -213,18 +214,17 @@ void SequenceGatherAgentLayer::forward(PassType passType) {
 void SequenceScatterAgentLayer::forward(PassType passType) {
  Layer::forward(passType);
  CHECK_EQ(realLayer_->getDeviceId(), this->getDeviceId());
-  CHECK(!realLayer_->getOutput().ids) << "Not supported";

  const Argument& input = realLayer_->getOutput();
-  CHECK_EQ(input.value->getWidth(), this->getSize());
+  CHECK_EQ(realLayer_->getSize(), this->getSize());
  int width = this->getSize();

  AsyncGpuBlock asyncGpuBlock;
  REGISTER_TIMER_INFO("SequenceAgentLayerForward", getName().c_str());

-  if (realOutArg_.value) {
+  if (realOutArg_.value || realOutArg_.ids) {
    CHECK(realOutArg_.sequenceStartPositions);
-    output_.subArgFrom(realOutArg_, /* offset */ idIndex_ * width, idSize_,
+    output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_,
                       width, useGpu_, /* trans */ false, /* seqFlag */ true,
                       /* seqStart */ seqStartPosIndex_,
                       /* seqSize */ numSequences_);

--- a/paddle/gserver/layers/CRFLayer.h
+++ b/paddle/gserver/layers/CRFLayer.h
@@ -25,7 +25,7 @@ namespace paddle {
 /**
 * A layer for calculating the cost of sequential conditional random field
 * model.
- * See LinearChainCRF.h for the detail of the CRF formulation.
+ * See class LinearChainCRF for the detail of the CRF formulation.
 */
 class CRFLayer : public Layer {
 public:

--- a/paddle/gserver/layers/LinearChainCRF.h
+++ b/paddle/gserver/layers/LinearChainCRF.h
@@ -21,39 +21,39 @@ namespace paddle {

 class LinearChainCRF {
 public:
-  /*
-    The size of para and grad must be (numClasses + 2) * numClasses.
-    The first numClasses values of para are for starting weights (a).
-    The next numClasses values of para are for ending weights (b),
-    The remaning values are for transition weights (w).
-
-    The probability of a state sequence s of length L is defined as:
-    P(s) = (1/Z) exp(a_{s_1} + b_{s_L}
-                     + \sum_{l=1}^L x_{s_l}
-                     + \sum_{l=2}^L w_{s_{l-1},s_l})
-    where Z is a normalization value so that the sum of P(s) over all possible
-    sequences is 1, and x is the input feature to the CRF.
+  /**
+   * The size of para and grad must be \f$(numClasses + 2) * numClasses\f$.
+   * The first numClasses values of para are for starting weights (\f$a\f$).
+   * The next numClasses values of para are for ending weights (\f$b\f$).
+   * The remaining values are for transition weights (\f$w\f$).
+   *
+   * The probability of a state sequence s of length \f$L\f$ is defined as:
+   * \f$P(s) = (1/Z) exp(a_{s_1} + b_{s_L}
+   *                  + \sum_{l=1}^L x_{s_l}
+   *                  + \sum_{l=2}^L w_{s_{l-1},s_l})\f$
+   * where \f$Z\f$ is a normalization value so that the sum of \f$P(s)\f$ over all possible
+   * sequences is \f$1\f$, and \f$x\f$ is the input feature to the CRF.
   */
  LinearChainCRF(int numClasses, real* para, real* grad);

-  /*
-    Calculate the negative log likelihood of s given x.
-    The size of x must be length * numClasses. Each consecutive numClasses
-    values are the features for one time step.
+  /**
+   * Calculate the negative log likelihood of s given x.
+   * The size of x must be length * numClasses. Each consecutive numClasses
+   * values are the features for one time step.
   */
  real forward(real* x, int* s, int length);

-  /*
-    Calculate the gradient with respect to x, a, b, and w.
-    The gradient of x will be stored in dx.
-    backward() can only be called after a corresponding call to forward() with
-    the same x, s and length.
-    NOTE: The gradient is added to dx and grad (provided at constructor).
+  /**
+   * Calculate the gradient with respect to x, a, b, and w.
+   * The gradient of x will be stored in dx.
+   * backward() can only be called after a corresponding call to forward() with
+   * the same x, s and length.
+   * @note The gradient is added to dx and grad (provided at constructor).
   */
  void backward(real* x, real* dx, int* s, int length);

-  /*
-    Find the most probable sequence given x. The result will be stored in s.
+  /**
+   * Find the most probable sequence given x. The result will be stored in s.
   */
  void decode(real* x, int* s, int length);


--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -56,7 +56,6 @@ add_test(NAME test_RecurrentGradientMachine
    COMMAND .set_python_path.sh -d
            ${PROJ_ROOT}/python:${PROJ_ROOT}/paddle/gserver/tests
            ${CMAKE_CURRENT_BINARY_DIR}/test_RecurrentGradientMachine
-            --use_gpu=false
    WORKING_DIRECTORY ${PROJ_ROOT}/paddle)

 add_unittest_without_exec(test_NetworkCompare

--- a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf
+++ b/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
+                        test_list=None,
+                        module='rnn_data_provider',
+                        obj='process_subseq')
+
+
+settings(batch_size=2, learning_rate=0.01)
+######################## network configure ################################
+dict_dim = 10
+word_dim = 8
+hidden_dim = 8
+label_dim = 3
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(input=data, size=word_dim)
+
+# This hierarchical RNN is designed to be equivalent to the simple RNN in
+# sequence_rnn.conf
+
+def outer_step(wid, x):
+    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
+    def inner_step(y, wid):
+        z = embedding_layer(input=wid, size=word_dim)
+        inner_mem = memory(name="inner_rnn_state",
+                           size=hidden_dim,
+                           boot_layer=outer_mem)
+        out = fc_layer(input=[y, z, inner_mem],
+                        size=hidden_dim,
+                        act=TanhActivation(),
+                        bias_attr=True,
+                        name="inner_rnn_state")
+        return out
+
+    inner_rnn_output = recurrent_group(
+        step=inner_step,
+        name="inner",
+        input=[x, wid])
+    last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
+
+    # "return last" should also work. But currently RecurrentGradientMachine
+    # does not handle it correctly. Current implementation requires that
+    # all the out links are from sequences. However, it does not report error
+    # when the out links are not sequences.
+    return inner_rnn_output
+
+out = recurrent_group(
+    name="outer",
+    step=outer_step,
+    input=[SubsequenceInput(data), SubsequenceInput(emb)])
+
+rep = last_seq(input=out)
+prob = fc_layer(size=label_dim,
+                input=rep,
+                act=SoftmaxActivation(),
+                bias_attr=True)
+
+outputs(classification_cost(input=prob,
+                            label=data_layer(name="label", size=label_dim)))
--- a/paddle/gserver/tests/sequence_rnn_multi_input.conf
+++ b/paddle/gserver/tests/sequence_rnn_multi_input.conf
+#edit-mode: -*- python -*-
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+######################## data source ################################
+define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
+                        test_list=None,
+                        module='rnn_data_provider',
+                        obj='process_seq')
+
+
+settings(batch_size=2, learning_rate=0.01)
+######################## network configure ################################
+dict_dim = 10
+word_dim = 8
+hidden_dim = 8
+label_dim = 3
+
+data = data_layer(name="word", size=dict_dim)
+
+emb = embedding_layer(input=data, size=word_dim)
+
+def step(y, wid):
+    z = embedding_layer(input=wid, size=word_dim)
+    mem = memory(name="rnn_state", size=hidden_dim)
+    out = fc_layer(input=[y, z, mem],
+                    size=hidden_dim,
+                    act=TanhActivation(),
+                    bias_attr=True,
+                    name="rnn_state")
+    return out
+
+out = recurrent_group(
+    name="rnn",
+    step=step,
+    input=[emb, data])
+
+rep = last_seq(input=out)
+prob = fc_layer(size=label_dim,
+                input=rep,
+                act=SoftmaxActivation(),
+                bias_attr=True)
+
+outputs(classification_cost(input=prob,
+                            label=data_layer(name="label", size=label_dim)))
--- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
+++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
@@ -92,7 +92,11 @@ void CalCost(const string& conf, const string& dir, real* cost,
  rmDir(dir.c_str());
 }

-void test(const string& conf1, const string& conf2, double eps) {
+void test(const string& conf1, const string& conf2, double eps, bool useGpu) {
+  if (!paddle::version::isWithGpu() && useGpu) {
+    return;
+  }
+  FLAGS_use_gpu = useGpu;
  int num_passes = 5;
  real* cost1 = new real[num_passes];
  const string dir1 = "gserver/tests/t1";
@@ -113,17 +117,28 @@ void test(const string& conf1, const string& conf2, double eps) {
 }

 TEST(RecurrentGradientMachine, HasSubSequence) {
+  for (bool useGpu : {false, true}) {
    test("gserver/tests/sequence_layer_group.conf",
         "gserver/tests/sequence_nest_layer_group.conf",
-       1e-5);
+         1e-5, useGpu);
+  }
 }

 TEST(RecurrentGradientMachine, rnn) {
+  for (bool useGpu : {false, true}) {
    test("gserver/tests/sequence_rnn.conf",
         "gserver/tests/sequence_nest_rnn.conf",
-       0);
+         1e-6, useGpu);
+  }
 }

+TEST(RecurrentGradientMachine, rnn_multi_input) {
+  for (bool useGpu : {false, true}) {
+    test("gserver/tests/sequence_rnn_multi_input.conf",
+         "gserver/tests/sequence_nest_rnn_multi_input.conf",
+         1e-6, useGpu);
+  }
+}

 int main(int argc, char** argv) {
  if (paddle::version::isWithPyDataProvider()) {

--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -554,11 +554,16 @@ void Argument::degradeSequence(const Argument& input, bool useGpu) {
 void Argument::subArgFrom(const Argument& input, size_t offset, size_t height,
                          size_t width, bool useGpu, bool trans, bool seqFlag,
                          size_t seqStart, size_t seqSize) {
-  value = Matrix::create(input.value->getData() + offset, height, width, trans,
-                         useGpu);
+  if (input.value) {
+    value = Matrix::create(input.value->getData() + offset * width,
+                           height, width, trans, useGpu);
+  }
+  if (input.ids) {
+    ids = IVector::create(input.ids->getData() + offset, height, useGpu);
+  }
  if (input.grad) {
-    grad = Matrix::create(input.grad->getData() + offset, height, width, trans,
-                          useGpu);
+    grad = Matrix::create(input.grad->getData() + offset * width,
+                          height, width, trans, useGpu);
  }
  if (seqFlag) {
    sequenceStartPositions = std::make_shared<ICpuGpuVector>(

--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -177,11 +177,11 @@ struct Argument {
  }

  /**
-   * @brief (value, grad, sequenceStartPositions) of output are subset of
+   * @brief (value, ids, grad, sequenceStartPositions) of output are subset of
   *        input. Note that, output share the same memory of input.
   *
   * @param input[in]       input
-   * @param offset[in]      offset of input.value
+   * @param offset[in]      offset in terms of rows
   * @param height[in]      height of output.value
   * @param width[in]       width of output.value
   * @param useGpu[in]

--- a/paddle/trainer/ThreadParameterUpdater.cpp
+++ b/paddle/trainer/ThreadParameterUpdater.cpp
@@ -141,7 +141,7 @@ void SgdThreadUpdater::traverse(GetTraverseCallback getTraverseCallback) {
  } else if (hasCpuPara) {
    getGlobalSyncThreadPool()->exec(cpuTraverse);
  } else if (hasGpuPara) {
-    cpuTraverse(0, 0);
+      gpuTraverse(0, 0);
  }
 }


--- a/paddle/trainer/TrainerInternal.cpp
+++ b/paddle/trainer/TrainerInternal.cpp
@@ -101,6 +101,7 @@ void TrainerInternal::trainOneBatch(int64_t batchId,
      // it
      //! to ParameterHook.
      auto& grad = para->getBuf(PARAMETER_GRADIENT);
+      SetDevice device(para->getDeviceId());
      paraStats[para->getID()].avgAbsGrad = grad->getAbsSum() / para->getSize();
      paraStats[para->getID()].maxAbsGrad = grad->getAbsMax();
    }

--- a/paddle/trainer/tests/sample_trainer_config_parallel.conf
+++ b/paddle/trainer/tests/sample_trainer_config_parallel.conf
@@ -13,137 +13,74 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.
+from paddle.trainer_config_helpers import *

-TrainData(
-    SimpleData(
+TrainData(SimpleData(
            files = "trainer/tests/sample_filelist.txt",
            feat_dim = 3,
            context_len = 0,
-        buffer_capacity = 1000000,
-    )
-)
+            buffer_capacity = 1000000))

-TestData(
-    SimpleData(
+TestData(SimpleData(
           files = "trainer/tests/sample_filelist.txt",
           feat_dim = 3,
           context_len = 0,
-        buffer_capacity = 1000000,
-    )
-)
+           buffer_capacity = 1000000))

-Settings(
-    algorithm = "sgd",
-    num_batches_per_send_parameter = 1,
-    num_batches_per_get_parameter = 1,
-    batch_size = 100,
-    learning_rate = 0.001,
-    learning_rate_decay_a = 1e-5,
-    learning_rate_decay_b = 0.5,
-)
+settings(batch_size = 100)

-default_initial_std(0.2)
 # Output layer, label layer, cost layer, preferably set to the same environment.
 output_device = 0

-model_type("nn")
-
 # Input Layer does not need to specify the device number.
-Layer(
-    name = "input",
-    type = "data",
-    size = 3,
-)
+data = data_layer(name='input', size=3)

 # Calculate in the CPU.
-Layer(
-    name = "layer1_1",
-    type = "fc",
-    size = 5,
-    active_type = "sigmoid",
-    device = -1,
-    inputs = "input",
-)
+fc1 = fc_layer(input=data, size=5,
+               bias_attr=True,
+               layer_attr=ExtraAttr(device=-1),
+               act=SigmoidActivation())

 # Calculate in the GPU 0.
-Layer(
-    name = "layer2_1",
-    type = "fc",
-    size = 10,
-    active_type = "sigmoid",
-    device = 0,
-    inputs = "layer1_1",
-)
+fc2 = fc_layer(input=fc1, size=10,
+               bias_attr=True,
+               layer_attr=ExtraAttr(device=0),
+               act=SigmoidActivation())

 # Calculate in the GPU 1.
-Layer(
-    name = "layer2_2",
-    type = "fc",
-    size = 10,
-    active_type = "sigmoid",
-    device = 1,
-    inputs = "layer1_1",
-)
+fc3 = fc_layer(input=fc1, size=10,
+               bias_attr=True,
+               layer_attr=ExtraAttr(device=1),
+               act=SigmoidActivation())

 # Calculate in the GPU 0.
-Layer(
-    name = "layer3_1",
-    type = "fc",
-    size = 10,
-    device = 0,
-    active_type = "sigmoid",
-    inputs = ["layer2_1", "layer2_2"],
-)
+fc4 = fc_layer(input=[fc2,fc3], size=10,
+               bias_attr=True,
+               layer_attr=ExtraAttr(device=0),
+               act=SigmoidActivation())

 # Calculate in the GPU 1.
-Layer(
-    name = "layer3_2",
-    type = "fc",
-    size = 10,
-    device = 1,
-    active_type = "sigmoid",
-    inputs = ["layer2_1", "layer2_2"],
-)
-
+fc5 = fc_layer(input=[fc2,fc3], size=10,
+               bias_attr=True,
+               layer_attr=ExtraAttr(device=1),
+               act=SigmoidActivation())

-Layer(
-    name = "output",
-    type = "fc",
-    size = 10,
-    device = output_device,
-    active_type = "sigmoid",
-    inputs = ["layer3_1", "layer3_2"],
-)
+output = fc_layer(input=[fc4,fc5], size=10,
+                  bias_attr=True,
+                  layer_attr=ExtraAttr(device=output_device),
+                  act=SoftmaxActivation())

 if get_config_arg('with_cost', bool, True):
    # This is for training the neural network.
    # We need to have another data layer for label
    # and a layer for calculating cost
-    Layer(
-        name = "label",
-        type = "data",
-        device = output_device,
-        size = 1,
-    )
-
-    Layer(
-        name = "cost",
-        type = "multi-class-cross-entropy",
-        device = output_device,
-        inputs = ["output", "label"],
-    )
-
-    Evaluator(
-        name = "error",
-        type = "classification_error",
-        inputs = ["output", "label"])
-
-    Inputs("input", "label")
-    Outputs("cost")
+    lbl = data_layer(name='label', size=1,
+                    layer_attr=ExtraAttr(device=output_device))
                    
+    outputs(classification_cost(input=output, 
+                                label=lbl,
+                                layer_attr=ExtraAttr(device=output_device)))
 else:
    # This is for prediction where we don't have label
    # and don't need to calculate cost
-    Inputs("input")
-    Outputs("output")
+    outputs(output)
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1279,7 +1279,7 @@ class LayerBase(object):
            size,
            dims=None,
            sparse = None,
-            format = "csr"):
+            format = None):
        if dims is None:
            # TODO(yuyang18): print warning and callstack here!
            dims = list()
@@ -2074,7 +2074,7 @@ class MaxLayer(LayerBase):
            active_type='linear',
            device=None,
            bias=False,
-            output_max_index=False):
+            output_max_index=None):
        super(MaxLayer, self).__init__(name, 'max', 0, inputs=inputs, device=device)
        config_assert(len(self.inputs) == 1, 'MaxLayer must have 1 input')
        self.config.trans_type =  trans_type
@@ -2083,7 +2083,8 @@ class MaxLayer(LayerBase):
            input_layer = self.get_input_layer(input_index)
            self.set_layer_size(input_layer.size)
        self.create_bias_parameter(bias, self.config.size)
-        self.config.output_max_index=output_max_index
+        if output_max_index is not None:
+            self.config.output_max_index = output_max_index


 @config_layer('maxid')
@@ -2440,7 +2441,7 @@ class MixedLayer(LayerBase):
            inputs,
            size=0,
            bias=True,
-            error_clipping_threshold=0.0,
+            error_clipping_threshold=None,
            **xargs):
        config_assert(inputs, 'inputs cannot be empty')
        super(MixedLayer, self).__init__(
@@ -2510,6 +2511,7 @@ class MixedLayer(LayerBase):

        self.create_bias_parameter(bias, self.config.size)

+        if error_clipping_threshold is not None:
            self.config.error_clipping_threshold = error_clipping_threshold

 # like MixedLayer, but no bias parameter

--- a/python/paddle/trainer_config_helpers/activations.py
+++ b/python/paddle/trainer_config_helpers/activations.py
@@ -15,8 +15,10 @@
 __all__ = ["TanhActivation", "SigmoidActivation",
           "SoftmaxActivation", "IdentityActivation", "LinearActivation",
           'SequenceSoftmaxActivation', 'ExpActivation',
-           "ReluActivation", "BReluActivation", "SoftReluActivation", "STanhActivation",
-           "AbsActivation", "SquareActivation", "BaseActivation"]
+           "ReluActivation", "BReluActivation", "SoftReluActivation",
+           "STanhActivation",
+           "AbsActivation", "SquareActivation",
+           "BaseActivation"]


 class BaseActivation(object):
@@ -36,6 +38,9 @@ class BaseActivation(object):
        self.name = name
        self.support_hppl = support_hppl

+    def __repr__(self):
+        return self.name
+

 class TanhActivation(BaseActivation):
    """

--- a/python/paddle/trainer_config_helpers/attrs.py
+++ b/python/paddle/trainer_config_helpers/attrs.py
@@ -17,6 +17,42 @@ __all__ = ['ParamAttr', 'ExtraAttr', 'ParameterAttribute',
           'ExtraLayerAttribute']


+def convert_and_compare(x, Type):
+    """
+    Convert x to be the same type as Type and then convert back to
+    check whether there is a loss of information.
+    :param x: object to be checked
+    :param Type: target type to check x over
+
+    """
+    return type(x)(Type(x))==x
+
+def is_compatible_with(x, Type):
+    """
+    Check if x has a type compatible with Type.
+    :param x: object to be checked
+    :param Type: target type to check x over
+
+    """
+    if type(x) == Type:
+        return True
+    try:
+        if float == Type or int == Type:
+            # Reject types that can be converted to float/int but where the
+            # conversion is not meaningful and could lead to errors, i.e.
+            # str and bool values must not initialize a float/int variable.
+            if not isinstance(x, str) and not isinstance(x, bool):
+                return convert_and_compare(x, Type)
+        elif bool == Type:
+            # a string must not be used to initialize a bool variable
+            if not isinstance(x, str):
+                return convert_and_compare(x, Type)
+        else:
+            return False
+    except:
+        return False
+
+
 class ParameterAttribute(object):
    """
    Parameter Attributes object. To fine-tuning network training process, user
@@ -65,14 +101,18 @@ class ParameterAttribute(object):
        elif initial_std is None and initial_mean is None and initial_max \
                is None and initial_min is None:
            self.attr = {'initial_smart': True}
-        elif isinstance(initial_std, float) or isinstance(initial_mean, float):
+        elif is_compatible_with(initial_std, float) or \
+             is_compatible_with(initial_mean, float):
            self.attr = dict()
            if initial_std is not None:
                self.attr['initial_std'] = initial_std
            if initial_mean is not None:
                self.attr['initial_mean'] = initial_mean
            self.attr['initial_strategy'] = 0  # Gauss Random
-        elif isinstance(initial_max, float) and isinstance(initial_min, float):
+        elif is_compatible_with(initial_max, float) and \
+             is_compatible_with(initial_min, float):
+            initial_max = initial_max
+            initial_min = initial_min
            assert initial_min < initial_max
            initial_mean = (initial_max + initial_min) / 2
            initial_std = initial_mean - initial_min
@@ -83,16 +123,16 @@ class ParameterAttribute(object):
        else:
            raise RuntimeError("Unexpected branch.")

-        if not is_static and isinstance(l1_rate, float):
+        if not is_static and is_compatible_with(l1_rate, float):
            self.attr['decay_rate_l1'] = l1_rate

-        if not is_static and isinstance(l2_rate, float):
+        if not is_static and is_compatible_with(l2_rate, float):
            self.attr['decay_rate'] = l2_rate

-        if not is_static and isinstance(learning_rate, float):
+        if not is_static and is_compatible_with(learning_rate, float):
            self.attr['learning_rate'] = learning_rate

-        if not is_static and isinstance(momentum, float):
+        if not is_static and is_compatible_with(momentum, float):
            self.attr['momentum'] = momentum

        if name is not None:
@@ -134,12 +174,16 @@ class ExtraLayerAttribute(object):
                      The dropout rate is the zero rate of this mask. The
                      details of what dropout is please refer to `here
                      <https://www.cs.toronto.edu/~hinton/absps/
-                      JMLRdropout.pdf>`_
+                      JMLRdropout.pdf>`_.
    :type drop_rate: float
-
+    :param device: Device ID of layer. device=-1 uses CPU; device>0 uses GPU.
+                   For allocation details in parallel_nn, please refer to `here
+                   <http://www.paddlepaddle.org/doc/ui/cmd_argument/
+                   use_case.html#case-2-specify-layers-in-different-devices>`_.
+    :type device: int
    """

-    def __init__(self, error_clipping_threshold=None, drop_rate=None):
+    def __init__(self, error_clipping_threshold=None, drop_rate=None, device=None):
        self.attr = dict()
        if isinstance(error_clipping_threshold, float):
            assert error_clipping_threshold > 0
@@ -149,6 +193,9 @@ class ExtraLayerAttribute(object):
            assert drop_rate > 0
            self.attr["drop_rate"] = drop_rate

+        if isinstance(device, int):
+            self.attr["device"] = device
+
    def check(self, layer_name):
        for key in self.attr:
            if not hasattr(self, 'can_%s' % key) or \

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -616,7 +616,7 @@ def lstmemory_group(input, size=None, name=None,
    cell states, or hidden states in every time step are accessible to for the
    user. This is especially useful in attention model. If you do not need to
    access to the internal states of the lstm, but merely use its outputs,
-    it is recommanded to use the lstmemory, which is relatively faster than
+    it is recommended to use the lstmemory, which is relatively faster than
    lstmemory_group.

    NOTE: In PaddlePaddle's implementation, the following input-to-hidden
@@ -1052,7 +1052,7 @@ def dropout_layer(input, dropout_rate, name=None):
                       layer_attr=ExtraAttr(drop_rate=dropout_rate))


-def outputs(layers):
+def outputs(layers, *args):
    """
    Declare the end of network. Currently it will only calculate the
    input/output order of network. It will calculate the predict network or
@@ -1089,9 +1089,12 @@ def outputs(layers):
    if isinstance(layers, LayerOutput):
        layers = [layers]

+    if len(args) != 0:
+        layers.extend(args)
+
    assert len(layers) > 0
    if len(layers) != 1:
-        logger.warning("EndOfNetwork routine try to calculate network's"
+        logger.warning("`outputs` routine try to calculate network's"
                       " inputs and outputs order. It might not work well."
                       "Please see follow log carefully.")
    inputs = []

--- a/python/paddle/trainer_config_helpers/poolings.py
+++ b/python/paddle/trainer_config_helpers/poolings.py
@@ -47,9 +47,14 @@ class MaxPooling(BasePoolingType):
    ..  math::

        max(samples\\_of\\_a\\_sequence)
+
+    :param output_max_index: True to output the sequence max index instead
+                             of the max value. None means use the proto default.
+    :type output_max_index: bool|None
    """
-    def __init__(self):
+    def __init__(self, output_max_index=None):
        BasePoolingType.__init__(self, "max")
+        self.output_max_index = output_max_index
        

 class AvgPooling(BasePoolingType):

--- a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
+++ b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
@@ -3,3 +3,8 @@ add_test(NAME layers_test
  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
        python ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/layers_test.py
    WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
+
+add_test(NAME test_layerHelpers
+  COMMAND
+  ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
+)
--- a/python/paddle/trainer_config_helpers/tests/configs/.gitignore
+++ b/python/paddle/trainer_config_helpers/tests/configs/.gitignore
+*protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/check.md5
+++ b/python/paddle/trainer_config_helpers/tests/configs/check.md5
+7e6919d17562516e9a1d9a88de1fb3b9  img_layers.protostr
+a5d9259ff1fd7ca23d0ef090052cb1f2  last_first_seq.protostr
+9c038249ec8ff719753a746cdb04c026  layer_activations.protostr
+5913f87b39cee3b2701fa158270aca26  projections.protostr
+6b39e34beea8dfb782bee9bd3dea9eb5  simple_rnn_layers.protostr
+0fc1409600f1a3301da994ab9d28b0bf  test_cost_layers.protostr
+144bc6d3a509de74115fa623741797ed  test_expand_layer.protostr
+2378518bdb71e8c6e888b1842923df58  test_fc.protostr
+8bb44e1e5072d0c261572307e7672bda  test_grumemory_layer.protostr
+1f3510672dce7a9ed25317fc58579ac7  test_hsigmoid.protostr
+d350bd91a0dc13e854b1364c3d9339c6  test_lstmemory_layer.protostr
+251a948ba41c1071afcd3d9cf9c233f7  test_ntm_layers.protostr
+e6ff04e70aea27c7b06d808cc49c9497  test_print_layer.protostr
+2a75dd33b640c49a8821c2da6e574577  test_rnn_group.protostr
+67d6fde3afb54f389d0ce4ff14726fe1  test_sequence_pooling.protostr
+f586a548ef4350ba1ed47a81859a64cb  unused_layers.protostr
+8122477f4f65244580cec09edc590041  util_layers.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
+#!/bin/bash
+#
+# Dump every trainer_config_helpers test config listed in `configs` to
+# <config>.protostr using paddle.utils.dump_config.
+# run_tests.sh runs this script and then checks the results with check.md5.
+
+set -e
+# Run from the script's own directory so the relative paths below resolve
+# regardless of the caller's working directory. Quoted so that a checkout
+# path containing whitespace does not get word-split.
+cd "$(dirname "$0")"
+# Four levels up from tests/configs, so that `paddle` is importable.
+export PYTHONPATH="$PWD/../../../../"
+
+# Config module names (without the .py suffix) to dump.
+configs=(test_fc layer_activations projections test_print_layer
+test_sequence_pooling test_lstmemory_layer test_grumemory_layer
+last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
+img_layers util_layers simple_rnn_layers unused_layers test_cost_layers
+test_rnn_group)
+
+
+# "${configs[@]}" expands to exactly one word per array element.
+for conf in "${configs[@]}"
+do
+    echo "Generating " "$conf"
+    python -m paddle.utils.dump_config "$conf.py" > "$conf.protostr"
+done
--- a/python/paddle/trainer_config_helpers/tests/configs/img_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/img_layers.py
+# Test config for the image layers: img_conv_layer, batch_norm_layer,
+# img_cmrnorm_layer and img_pool_layer.
+# generate_protostr.sh dumps this config to img_layers.protostr, which
+# run_tests.sh verifies against check.md5.
+from paddle.trainer_config_helpers import *
+
+settings(
+    learning_rate=1e-3,
+    batch_size=1000
+)
+
+# Input data layer; consumed below with num_channels=1.
+img = data_layer(name='image', size=256*256)
+
+# filter_size, padding and stride are all passed as 2-tuples here.
+img_conv = img_conv_layer(input=img, num_channels=1, num_filters=64,
+                          filter_size=(32, 64), padding=(1, 0), stride=(1, 1),
+                          act=LinearActivation())
+img_bn = batch_norm_layer(input=img_conv, act=ReluActivation())
+
+img_norm = img_cmrnorm_layer(input=img_bn, size=32)
+
+# Note: pooling reads img_conv directly, not the img_bn/img_norm branch.
+img_pool = img_pool_layer(input=img_conv, pool_size=32, pool_type=MaxPooling())
+
+
+# Uses the variadic form of outputs(): two layers passed positionally.
+outputs(img_pool, img_norm)
\ No newline at end of file
--- a/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/last_first_seq.py
+# Test config for first_seq / last_seq at both aggregate levels.
+# generate_protostr.sh dumps this config to last_first_seq.protostr, which
+# run_tests.sh verifies against check.md5.
+from paddle.trainer_config_helpers import *
+
+settings(
+    batch_size=1000,
+    learning_rate=1e-5
+)
+
+din = data_layer(name='data', size=30)
+
+# Layer constructors under test.
+seq_op = [
+    first_seq,
+    last_seq
+]
+
+# Aggregate levels each constructor is invoked with.
+agg_level = [
+    AggregateLevel.EACH_SEQUENCE,
+    AggregateLevel.EACH_TIMESTEP
+]
+
+opts = []
+
+# Cartesian product, op-major: 2 ops x 2 levels = 4 layers, all reading din.
+for op in seq_op:
+    for al in agg_level:
+        opts.append(op(input=din, agg_level=al))
+
+# All four layers are declared as network outputs (list form of outputs()).
+outputs(opts)
\ No newline at end of file
--- a/python/paddle/trainer_config_helpers/tests/configs/layer_activations.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/layer_activations.py
+'''
+Test all activations.
+'''
+# generate_protostr.sh dumps this config to layer_activations.protostr, which
+# run_tests.sh verifies against check.md5.
+
+from paddle.trainer_config_helpers import *
+
+settings(
+    learning_rate=1e-4,
+    batch_size=1000
+)
+
+din = data_layer(name='input', size=100)
+
+# Activation classes (not instances); each is instantiated with no arguments
+# in the comprehension below.
+acts = [
+    TanhActivation, SigmoidActivation, SoftmaxActivation, IdentityActivation,
+    LinearActivation, ExpActivation, ReluActivation, BReluActivation,
+    SoftReluActivation, STanhActivation, AbsActivation, SquareActivation]
+
+# One fc_layer per activation, explicitly named layer_0 .. layer_11 by its
+# position in `acts`; every layer reads `din` and is declared as an output.
+outputs(
+    [fc_layer(input=din, size=100, act=act(), name="layer_%d" % i) for i, act in
+     enumerate(acts)])
--- a/python/paddle/trainer_config_helpers/tests/configs/projections.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/projections.py
+'''
+Test mixed layer, projections and operators.
+'''
+# generate_protostr.sh dumps this config to projections.protostr, which
+# run_tests.sh verifies against check.md5.
+from paddle.trainer_config_helpers import *
+
+settings(
+    batch_size=1000,
+    learning_rate=1e-4
+)
+
+din = data_layer(name='test', size=100)
+
+# `din` is rebound: from here on it refers to the size-256 embedding layer.
+din = embedding_layer(input=din, size=256)
+
+# m1..m4 form a chain: each mixed_layer consumes the previous one through a
+# different projection type.
+with mixed_layer(size=100) as m1:
+    m1 += full_matrix_projection(input=din)
+
+with mixed_layer(size=100) as m2:
+    m2 += table_projection(input=m1)
+
+with mixed_layer(size=100) as m3:
+    m3 += identity_projection(input=m2)
+
+with mixed_layer(size=100) as m4:
+    m4 += dotmul_projection(input=m3)
+
+# m5..m7 are created without an explicit size (presumably inferred from the
+# projection/operator -- confirm against mixed_layer).
+with mixed_layer() as m5:
+    m5 += context_projection(input=m4, context_len=3)
+
+with mixed_layer() as m6:
+    m6 += dotmul_operator(a=m3, b=m4)
+
+img = data_layer(name='img', size=32*32)
+# 3*3*1*64 matches filter_size**2 * num_channel * num_filters used below.
+flt = data_layer(name='filter', size=3*3*1*64)
+
+with mixed_layer() as m7:
+    m7 += conv_operator(img=img, filter=flt, num_filters=64,
+                        num_channel=1, filter_size=3)
+
+# Final layer: projections passed via `input=` instead of the `with`/`+=` form.
+# NOTE(review): error_clipping_threshold=40 is an int, while
+# ExtraLayerAttribute.__init__ in attrs.py only records the value when
+# isinstance(error_clipping_threshold, float); if ExtraAttr is that class the
+# threshold may be silently dropped -- confirm. Changing it to 40.0 would
+# change the dumped protostr and therefore its entry in check.md5.
+end = mixed_layer(input=[full_matrix_projection(input=m5),
+                         trans_full_matrix_projection(input=m6),
+                         full_matrix_projection(input=m7)],
+                  size=100,
+                  layer_attr=ExtraAttr(drop_rate=0.5,
+                                       error_clipping_threshold=40))
+
+outputs(end)
--- a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
+#!/bin/bash
+#
+# Regenerate the .protostr dumps with generate_protostr.sh and verify them
+# against the checksums recorded in check.md5.
+
+# Enable errexit before the cd: otherwise a failed cd would go unnoticed and
+# the commands below would run in the caller's working directory.
+set -e
+# Quoted so that a checkout path containing whitespace is not word-split.
+cd "$(dirname "$0")"
+./generate_protostr.sh
+md5sum -c check.md5
--- a/python/paddle/trainer_config_helpers/tests/configs/simple_rnn_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/simple_rnn_layers.py
+# Test config for recurrent_layer, lstmemory and grumemory, each built once in
+# the default direction and once with reverse=True.
+# generate_protostr.sh dumps this config to simple_rnn_layers.protostr, which
+# run_tests.sh verifies against check.md5.
+from paddle.trainer_config_helpers import *
+
+settings(
+    batch_size=1000,
+    learning_rate=1e-4
+)
+
+din = data_layer(name='data', size=200)
+
+# Shared size-200 hidden layer feeding every recurrent branch below.
+hidden = fc_layer(input=din, size=200, act=SigmoidActivation())
+
+rnn = recurrent_layer(input=hidden, act=SigmoidActivation())
+
+rnn2 = recurrent_layer(input=hidden, act=SigmoidActivation(), reverse=True)
+
+# lstmemory inputs are bias-free linear fc layers of size 200*4 (presumably
+# lstmemory expects an input 4x its output size -- confirm; compare
+# test_lstmemory_layer.py, which feeds a size-128 input with size=32).
+lstm1_param = fc_layer(input=hidden, size=200*4, act=LinearActivation(),
+                       bias_attr=False)
+
+lstm1 = lstmemory(input=lstm1_param, act=SigmoidActivation())
+
+lstm2_param = fc_layer(input=hidden, size=200*4, act=LinearActivation(),
+                       bias_attr=False)
+
+lstm2 = lstmemory(input=lstm2_param, act=SigmoidActivation(), reverse=True)
+
+# grumemory inputs use size 200*3 (presumably 3x the output size -- confirm;
+# compare test_grumemory_layer.py, which feeds a size-120 input with size=40).
+gru1_param = fc_layer(input=hidden, size=200*3, act=LinearActivation(),
+                      bias_attr=False)
+gru1 = grumemory(input=gru1_param, act=SigmoidActivation())
+
+gru2_param = fc_layer(input=hidden, size=200*3, act=LinearActivation(),
+                      bias_attr=False)
+gru2 = grumemory(input=gru2_param, act=SigmoidActivation(), reverse=True)
+
+# Non-reversed layers are reduced with last_seq, reversed ones with first_seq.
+# Note that the final first_seq call passes gru2 positionally rather than via
+# the input= keyword used everywhere else.
+outputs(last_seq(input=rnn), first_seq(input=rnn2),
+        last_seq(input=lstm1), first_seq(input=lstm2),
+        last_seq(input=gru1), first_seq(gru2))
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers.py
+# Test config for cost layers: ctc_layer, crf_layer, rank_cost, lambda_cost,
+# cross_entropy, cross_entropy_with_selfnorm, huber_cost and
+# multi_binary_label_cross_entropy.
+# generate_protostr.sh dumps this config to test_cost_layers.protostr, which
+# run_tests.sh verifies against check.md5.
+from paddle.trainer_config_helpers import *
+
+settings(
+    learning_rate=1e-4,
+    batch_size=1000
+)
+
+# Inputs used by ctc_layer.
+seq_in = data_layer(name='input', size=200)
+labels = data_layer(name='labels', size=5000)
+
+# probs / xe_label are shared by the three cross-entropy style costs below.
+probs = data_layer(name='probs', size=10)
+xe_label = data_layer(name='xe-label', size=10)
+
+# Every cost layer is declared as an output; the remaining data layers are
+# created inline as arguments.
+outputs(ctc_layer(input=seq_in, label=labels),
+        crf_layer(input=fc_layer(input=seq_in, size=4),
+                  label=data_layer(name='crf_label', size=4)),
+        rank_cost(left=data_layer(name='left', size=1),
+                  right=data_layer(name='right', size=1),
+                  label=data_layer(name='label', size=1)),
+        lambda_cost(input=data_layer(name='list_feature', size=100),
+                    score=data_layer(name='list_scores', size=1)),
+        cross_entropy(input=probs, label=xe_label),
+        cross_entropy_with_selfnorm(input=probs, label=xe_label),
+        huber_cost(input=data_layer(name='huber_probs', size=1),
+                   label=data_layer(name='huber_label', size=1)),
+        multi_binary_label_cross_entropy(input=probs, label=xe_label))
--- a/python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_expand_layer.py
+# Test config for expand_layer with both ExpandLevel values.
+# generate_protostr.sh dumps this config to test_expand_layer.protostr, which
+# run_tests.sh verifies against check.md5.
+from paddle.trainer_config_helpers import *
+
+settings(
+    batch_size=1000,
+    learning_rate=1e-5
+)
+
+# `din` is the layer being expanded; `data_seq` is passed as expand_as.
+din = data_layer(name='data', size=30)
+data_seq = data_layer(name='data_seq', size=30)
+
+# Same input/expand_as pair, once per expand level.
+outputs(expand_layer(input=din, expand_as=data_seq,
+                     expand_level=ExpandLevel.FROM_SEQUENCE),
+        expand_layer(input=din, expand_as=data_seq,
+                     expand_level=ExpandLevel.FROM_TIMESTEP))
--- a/python/paddle/trainer_config_helpers/tests/configs/test_fc.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_fc.py
+# Test config for trans_layer, fc_layer and selective_fc_layer.
+# generate_protostr.sh dumps this config to test_fc.protostr, which
+# run_tests.sh verifies against check.md5.
+from paddle.trainer_config_helpers import *
+
+settings(
+    batch_size=1000,
+    learning_rate=1e-5
+)
+
+din = data_layer(name='data', size=100)
+
+trans = trans_layer(input=din)
+
+# fc_layer on the trans_layer output, with the bias disabled.
+hidden = fc_layer(input=trans, size=100,
+                  bias_attr=False)
+
+# Passed as `select` to selective_fc_layer below.
+mask = data_layer(name='mask', size=100)
+
+# Note: reads `din` directly, not `trans`.
+hidden_sel = selective_fc_layer(input=din, select=mask, size=100,
+                                act=SigmoidActivation())
+
+outputs(hidden, hidden_sel)
--- a/python/paddle/trainer_config_helpers/tests/configs/test_grumemory_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_grumemory_layer.py
+# Test config for grumemory with explicit size, reverse and activations.
+# generate_protostr.sh dumps this config to test_grumemory_layer.protostr,
+# which run_tests.sh verifies against check.md5.
+from paddle.trainer_config_helpers import *
+
+settings(
+    batch_size=1000,
+    learning_rate=1e-4
+)
+
+# Input size 120 is 3 * the grumemory size (40) passed below.
+din = data_layer(name='data', size=120)
+
+outputs(grumemory(input=din, size=40, reverse=True, gate_act=TanhActivation(),
+                  act=SigmoidActivation()))
--- a/python/paddle/trainer_config_helpers/tests/configs/test_hsigmoid.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_hsigmoid.py
+# Test config for hsigmoid.
+# generate_protostr.sh dumps this config to test_hsigmoid.protostr, which
+# run_tests.sh verifies against check.md5.
+from paddle.trainer_config_helpers import *
+
+settings(
+    learning_rate=1e-4,
+    batch_size=1000
+)
+
+din = data_layer(name='data', size=100)
+# Label layer size equals the num_classes value passed to hsigmoid below.
+label = data_layer(name='label', size=10)
+
+outputs(hsigmoid(input=din, label=label, num_classes=10))
\ No newline at end of file
--- a/python/paddle/trainer_config_helpers/tests/configs/test_lstmemory_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_lstmemory_layer.py
+# Test config for lstmemory with explicit size, reverse and activations.
+# generate_protostr.sh dumps this config to test_lstmemory_layer.protostr,
+# which run_tests.sh verifies against check.md5.
+from paddle.trainer_config_helpers import *
+
+settings(
+    batch_size=1000,
+    learning_rate=1e-5
+)
+
+# Input size 128 is 4 * the lstmemory size (32) passed below.
+din = data_layer(name='data', size=128)
+
+outputs(lstmemory(input=din, reverse=True, gate_act=TanhActivation(),
+                  act=TanhActivation(), size=32))
--- a/python/paddle/trainer_config_helpers/tests/configs/test_ntm_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_ntm_layers.py
+# Test config for interpolation_layer, power_layer, scaling_layer, cos_sim,
+# sum_to_one_norm_layer, conv_shift_layer, tensor_layer,
+# slope_intercept_layer and linear_comb_layer.
+# generate_protostr.sh dumps this config to test_ntm_layers.protostr, which
+# run_tests.sh verifies against check.md5.
+from paddle.trainer_config_helpers import *
+
+settings(
+    batch_size=1000,
+    learning_rate=1e-5
+)
+
+# Size-1 layer passed as `weight` to the interpolation/power/scaling layers.
+weight = data_layer(name='w', size=1)
+a = data_layer(name='a', size=100)
+b = data_layer(name='b', size=100)
+# c is twice as wide as a/b; used with cos_sim(size=2) and linear_comb_layer.
+c = data_layer(name='c', size=200)
+# Size-31 layer passed as `b` to conv_shift_layer.
+d = data_layer(name='d', size=31)
+
+outputs(interpolation_layer(input=[a, b], weight=weight),
+        power_layer(input=a, weight=weight),
+        scaling_layer(input=a, weight=weight),
+        cos_sim(a=a, b=b),
+        cos_sim(a=a, b=c, size=2),
+        sum_to_one_norm_layer(input=a),
+        conv_shift_layer(a=a, b=d),
+        tensor_layer(a=a, b=b, size=1000),
+        slope_intercept_layer(input=a, slope=0.7, intercept=0.9),
+        linear_comb_layer(weights=b, vectors=c))
--- a/python/paddle/trainer_config_helpers/tests/configs/test_print_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_print_layer.py
+# Test config for print_layer.
+# generate_protostr.sh dumps this config to test_print_layer.protostr, which
+# run_tests.sh verifies against check.md5.
+from paddle.trainer_config_helpers import *
+
+settings(
+    learning_rate=1e-4,
+    batch_size=1000
+)
+
+din = data_layer(name='input', size=100)
+
+# The return value of print_layer is not used; the call is made only for its
+# effect on the generated config.
+print_layer(input=din)
+
+# The data layer itself is declared as the output.
+outputs(din)
--- a/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py
+# Test config for recurrent_group (forward, reversed and SubsequenceInput),
+# lstmemory_group and gru_group.
+# generate_protostr.sh dumps this config to test_rnn_group.protostr, which
+# run_tests.sh verifies against check.md5.
+from paddle.trainer_config_helpers import *
+
+settings(
+    learning_rate=1e-4,
+    batch_size=1000
+)
+
+seq = data_layer(name='seq_input', size=100)
+sub_seq = data_layer(name='sub_seq_input', size=100)
+# Not referenced again in this file, but the call still creates the layer.
+lbl = data_layer(name='label', size=1)
+
+
+# Returns a step function for recurrent_group, parameterized by `name`.
+def generate_rnn_simple(name):
+    # Step function: takes the step input `s` and returns a size-200 fc_layer
+    # over [s, m]. The memory and the fc_layer share the same `name`
+    # (presumably so the memory refers to this layer's previous-step output --
+    # confirm against memory()).
+    def rnn_simple(s):
+        m = memory(name=name, size=200)
+        fc = fc_layer(input=[s, m], size=200, name=name)
+        return fc
+
+    return rnn_simple
+
+
+# Projection sizes are 4x / 3x the size=100 given to lstmemory_group /
+# gru_group in the outputs() call below.
+with mixed_layer() as lstm_param:  # test lstm unit, rnn group
+    lstm_param += full_matrix_projection(input=seq, size=100 * 4)
+
+with mixed_layer() as gru_param:
+    gru_param += full_matrix_projection(input=seq, size=100 * 3)
+
+# Each recurrent_group gets its own step function with a distinct name.
+# The reversed group is reduced with first_seq, all others with last_seq.
+outputs(last_seq(input=recurrent_group(step=generate_rnn_simple('rnn_forward'),
+                                       input=seq)),
+        first_seq(input=recurrent_group(step=generate_rnn_simple('rnn_back'),
+                                        input=seq, reverse=True)),
+        last_seq(input=recurrent_group(step=generate_rnn_simple(
+            'rnn_subseq_forward'), input=SubsequenceInput(input=sub_seq))),
+        last_seq(input=lstmemory_group(input=lstm_param, size=100)),
+        last_seq(input=gru_group(input=gru_param, size=100)))
--- a/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_sequence_pooling.py
+# Test config for pooling_layer with every pooling type and aggregate level,
+# plus MaxPooling(output_max_index=True).
+# generate_protostr.sh dumps this config to test_sequence_pooling.protostr,
+# which run_tests.sh verifies against check.md5.
+from paddle.trainer_config_helpers import *
+
+settings(
+    learning_rate=1e-4,
+    batch_size=1000
+)
+
+din = data_layer(name='dat_in', size=100)
+
+# Pooling classes (not instances); each is instantiated in the loop below.
+POOL_TYPE = [
+    MaxPooling,
+    AvgPooling,
+    SumPooling
+]
+
+AGG_LEVEL = [
+    AggregateLevel.EACH_SEQUENCE,
+    AggregateLevel.EACH_TIMESTEP
+]
+
+opts = []
+
+# Cartesian product, pooling-type-major: 3 types x 2 levels = 6 layers.
+for pt in POOL_TYPE:
+    for al in AGG_LEVEL:
+        opts.append(pooling_layer(input=din, agg_level=al, pooling_type=pt()))
+
+# Seventh layer: max pooling with output_max_index=True and no explicit
+# agg_level.
+opts.append(pooling_layer(input=din,
+                          pooling_type=MaxPooling(output_max_index=True)))
+
+outputs(opts)
--- a/python/paddle/trainer_config_helpers/tests/configs/unused_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/unused_layers.py
+# Test config for sampling_id_layer; block_expand_layer is left disabled.
+# generate_protostr.sh dumps this config to unused_layers.protostr, which
+# run_tests.sh verifies against check.md5.
+from paddle.trainer_config_helpers import *
+settings(
+    batch_size=1000,
+    learning_rate=1e-4
+)
+
+probs = data_layer(name='probs', size=100)
+
+outputs(
+    sampling_id_layer(input=probs),  # It seems this does not support training.
+
+    # It seems this layer is not correct and should be rewritten.
+    # block_expand_layer(input=probs, channel=1, block_x=1, block_y=3),
+)
\ No newline at end of file
--- a/python/paddle/trainer_config_helpers/tests/configs/util_layers.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/util_layers.py
+# Test config for addto_layer and concat_layer.
+# generate_protostr.sh dumps this config to util_layers.protostr, which
+# run_tests.sh verifies against check.md5.
+from paddle.trainer_config_helpers import *
+
+settings(learning_rate=1e-4, batch_size=1000)
+
+a = data_layer(name='a', size=10)
+b = data_layer(name='b', size=10)
+
+result = addto_layer(input=[a, b])
+# concat_layer is exercised twice: once with layers as input (concat1) and
+# once with identity projections of the same layers (concat2).
+concat1 = concat_layer(input=[a, b])
+concat2 = concat_layer(input=[
+    identity_projection(input=a),
+    identity_projection(input=b)
+])
+
+outputs(result, concat1, concat2)
\ No newline at end of file
--- a/python/paddle/trainer_config_helpers/tests/layers_test_config.py
+++ b/python/paddle/trainer_config_helpers/tests/layers_test_config.py
@@ -23,6 +23,15 @@ z = out_prod_layer(input1=x, input2=y)

 x1 = fc_layer(input=x, size=5)
 y1 = fc_layer(input=y, size=5)
+
+z1 = mixed_layer(act=LinearActivation(),
+                 input=[conv_operator(img=x1,
+                                      filter=y1,
+                                      filter_size=1,
+                                      num_filters=5,
+                                      num_channel=5,
+                                      stride=1)])
+
 y2 = fc_layer(input=y, size=15)

 cos1 = cos_sim(a=x1, b=y1)
@@ -30,7 +39,7 @@ cos3 = cos_sim(a=x1, b=y2, size=3)

 linear_comb = linear_comb_layer(weights=x1, vectors=y2, size=3)

-out = fc_layer(input=[cos1, cos3, linear_comb, z],
+out = fc_layer(input=[cos1, cos3, linear_comb, z, z1],
               size=num_classes,
               act=SoftmaxActivation())

@@ -38,11 +47,21 @@ print_layer(input=[out])

 outputs(classification_cost(out, data_layer(name="label", size=num_classes)))

-dotmul = mixed_layer(input=[dotmul_operator(x=x1, y=y1),
+dotmul = mixed_layer(input=[dotmul_operator(a=x1, b=x1),
                            dotmul_projection(input=y1)])

+proj_with_attr_init = mixed_layer(input=full_matrix_projection(input=y1,
+                                                               param_attr=ParamAttr(learning_rate = 0,
+                                                                                 initial_mean = 0,
+                                                                                 initial_std = 0)),
+                               bias_attr = ParamAttr(initial_mean=0, initial_std=0, learning_rate=0),
+                               act = LinearActivation(),
+                               size = 5,
+                               name='proj_with_attr_init')
+
+
 # for ctc
-tmp = fc_layer(input=[x1, dotmul],
+tmp = fc_layer(input=[x1, dotmul, proj_with_attr_init],
               size=num_classes + 1,
               act=SoftmaxActivation())
 ctc = ctc_layer(input=tmp,