Merge conflict with maxout layer

fd4eeaf5 · liaogang · ddfff3a7 · 46bd5f53 · fd4eeaf5 · fd4eeaf5
122 changed file
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,6 @@ build/
 *.user
 .vscode
 .idea
\ No newline at end of file
+.project
+.pydevproject
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,9 +2,17 @@ language: cpp
 cache: ccache
 sudo: required
 dist: trusty
+os:
+  - linux
+  - osx
 env:
  - JOB=DOCS
  - JOB=BUILD_AND_TEST
+matrix:
+  exclude:
+    - os: osx
+      env: JOB=DOCS  # Only generate documentation in linux
 addons:
  apt:
    packages:
@@ -27,9 +35,11 @@ addons:
      - libgoogle-glog-dev
      - libgflags-dev
      - libgtest-dev
+      - graphviz
 before_install:
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
+  - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
  - pip install wheel protobuf sphinx breathe recommonmark
-  - sudo paddle/scripts/travis/before_install.sh
 script:
  - paddle/scripts/travis/main.sh
 notifications:

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8)
 project(paddle CXX C)
 set(PADDLE_MAJOR_VERSION 0)
 set(PADDLE_MINOR_VERSION 8)
-set(PADDLE_PATCH_VERSION 0b1)
+set(PADDLE_PATCH_VERSION 0b2)
 set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
@@ -104,7 +104,7 @@ else()
 endif(NOT WITH_GPU)
 if(WITH_DOUBLE)
-    add_definitions(-DPADDLE_TYPE_DOUBLE -DHPPL_TYPE_DOUBLE)
+    add_definitions(-DPADDLE_TYPE_DOUBLE)
    set(ACCURACY double)
 else(WITH_DOUBLE)
    set(ACCURACY float)

--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -17,10 +17,17 @@
 ## Find MKL First.
 set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains MKL")
-find_path(MKL_INCLUDE_DIR mkl.h PATHS ${MKL_ROOT}/include)
+find_path(MKL_INCLUDE_DIR mkl.h PATHS
-find_library(MKL_CORE_LIB NAMES mkl_core PATHS ${MKL_ROOT}/lib)
+  ${MKL_ROOT}/include)
-find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS ${MKL_ROOT}/lib)
+find_library(MKL_CORE_LIB NAMES mkl_core PATHS
-find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS ${MKL_ROOT}/lib)
+  ${MKL_ROOT}/lib
+  ${MKL_ROOT}/lib/intel64)
+find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS
+  ${MKL_ROOT}/lib
+  ${MKL_ROOT}/lib/intel64)
+find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
+  ${MKL_ROOT}/lib
+  ${MKL_ROOT}/lib/intel64)
 if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)

--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -64,7 +64,9 @@ set(COMMON_FLAGS
    -Wdelete-non-virtual-dtor
    -Wno-unused-parameter
    -Wno-error=literal-suffix
-    -Wno-error=unused-local-typedefs)
+    -Wno-error=unused-local-typedefs
+    -Wno-error=unused-function  # Warnings in Numpy Header.
+)
 foreach(flag ${COMMON_FLAGS})
    safe_set_cflag(CMAKE_C_FLAGS ${flag})

--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -184,3 +184,20 @@ macro(add_paddle_culib TARGET_NAME)
    cuda_add_library(${TARGET_NAME} STATIC ${ARGN})
    set(CUDA_NVCC_FLAGS ${NVCC_FLAG})
 endmacro()
+# Creates a C resource file that embeds the bytes of a given file.
+#
+# Arguments:
+#   res_file - path of the file whose contents are embedded.
+#   output   - path of the C source file to write (overwritten if present).
+#
+# The generated file defines `const unsigned char <name>[]` holding the bytes
+# of res_file and `const unsigned <name>_size`, where <name> is the short file
+# name of res_file with every '.', ' ' and '-' replaced by '_'.
+#
+# Example:
+#   create_resources(data/logo.png ${CMAKE_CURRENT_BINARY_DIR}/logo.c)
+function(create_resources res_file output)
+    # Create empty output file
+    file(WRITE "${output}" "")
+    # Get short filename (quoted so paths containing spaces or ';' stay whole)
+    string(REGEX MATCH "([^/]+)$" filename "${res_file}")
+    # Replace filename spaces & extension separator for C compatibility
+    string(REGEX REPLACE "\\.| |-" "_" filename "${filename}")
+    # Read hex data from file
+    file(READ "${res_file}" filedata HEX)
+    # Convert hex data for C compatibility. The expansion is quoted so that an
+    # empty resource file still supplies the (empty) input argument that
+    # string(REGEX REPLACE) requires.
+    string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," filedata "${filedata}")
+    # Append data to output file
+    file(APPEND "${output}" "const unsigned char ${filename}[] = {${filedata}};\nconst unsigned ${filename}_size = sizeof(${filename});\n")
+endfunction()
--- a/demo/mnist/.gitignore
+++ b/demo/mnist/.gitignore
+data/raw_data
+data/*.list
+mnist_vgg_model
+plot.png
+train.log
+*pyc
--- a/demo/mnist/data/generate_list.py
+++ b/demo/mnist/data/generate_list.py
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+o = open("./" + "train.list", "w")
+o.write("./data/raw_data/train" +"\n")
+o.close()
+o = open("./" + "test.list", "w")
+o.write("./data/raw_data/t10k" +"\n")
+o.close()
\ No newline at end of file
--- a/demo/mnist/data/get_mnist_data.sh
+++ b/demo/mnist/data/get_mnist_data.sh
+#!/usr/bin/env sh
+# This script downloads the MNIST data and unzips it.
+set -e
+# Absolute directory of this script, so it can be run from anywhere.
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+# Start from a clean raw_data directory on every run.
+rm -rf "$DIR/raw_data"
+mkdir "$DIR/raw_data"
+cd "$DIR/raw_data"
+echo "Downloading..."
+for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
+do
+    if [ ! -e "$fname" ]; then
+        wget --no-check-certificate "http://yann.lecun.com/exdb/mnist/${fname}.gz"
+        gunzip "${fname}.gz"
+    fi
+done
+# Quoted like the uses above so a checkout path containing spaces still works.
+cd "$DIR"
+# Regenerate the train/test list files next to this script.
+rm -f ./*.list
+python generate_list.py
--- a/demo/mnist/mnist_provider.py
+++ b/demo/mnist/mnist_provider.py
+from paddle.trainer.PyDataProvider2 import *
+# Define a py data provider
+@provider(input_types={
+    'pixel': dense_vector(28 * 28),
+    'label': integer_value(10)
+})
+def process(settings, filename):  # settings is not used currently.
+    """Yield one {"pixel": [...], "label": int} sample per image.
+
+    filename is a path prefix: images are read from
+    filename + "-images-idx3-ubyte" and labels from
+    filename + "-labels-idx1-ubyte". Each pixel byte is scaled to [0, 1] by
+    dividing by 255.0.
+    """
+    imgf = filename + "-images-idx3-ubyte"
+    labelf = filename + "-labels-idx1-ubyte"
+    # Define number of samples for train/test
+    if "train" in filename:
+        n = 60000
+    else:
+        n = 10000
+    # `with` closes both files even if the consumer stops iterating early or a
+    # read fails part-way through.
+    with open(imgf, "rb") as image_file, open(labelf, "rb") as label_file:
+        # Skip the leading bytes of each file
+        # (presumably the IDX headers -- TODO confirm).
+        image_file.read(16)
+        label_file.read(8)
+        for i in range(n):
+            label = ord(label_file.read(1))
+            pixels = []
+            for j in range(28 * 28):
+                pixels.append(float(ord(image_file.read(1))) / 255.0)
+            yield {"pixel": pixels, 'label': label}
--- a/demo/mnist/train.sh
+++ b/demo/mnist/train.sh
+#!/bin/bash
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# Abort on the first failing command. pipefail is needed because the training
+# command is piped through `tee`: without it the pipeline reports tee's exit
+# status and a failed training run would go unnoticed.
+set -e
+set -o pipefail
+config=vgg_16_mnist.py
+output=./mnist_vgg_model
+log=train.log
+# Train on CPU with a single trainer; all output is shown and saved to $log.
+paddle train \
+--config="$config" \
+--dot_period=10 \
+--log_period=100 \
+--test_all_data_in_one_period=1 \
+--use_gpu=0 \
+--trainer_count=1 \
+--num_passes=100 \
+--save_dir="$output" \
+2>&1 | tee "$log"
+# Render the curve from the training log.
+python -m paddle.utils.plotcurve -i "$log" > plot.png
--- a/demo/mnist/vgg_16_mnist.py
+++ b/demo/mnist/vgg_16_mnist.py
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer_config_helpers import *
+is_predict = get_config_arg("is_predict", bool, False)
+####################Data Configuration ##################
+if not is_predict:
+  data_dir='./data/'
+  define_py_data_sources2(train_list= data_dir + 'train.list',
+                        test_list= data_dir + 'test.list',
+                        module='mnist_provider',
+                        obj='process')
+######################Algorithm Configuration #############
+settings(
+    batch_size = 128,
+    learning_rate = 0.1 / 128.0,
+    learning_method = MomentumOptimizer(0.9),
+    regularization = L2Regularization(0.0005 * 128)
+)
+#######################Network Configuration #############
+data_size=1*28*28
+label_size=10
+img = data_layer(name='pixel', size=data_size)
+# small_vgg is predefined in trainer_config_helpers.network
+predict = small_vgg(input_image=img,
+                    num_channels=1,
+                    num_classes=label_size)
+if not is_predict:
+    lbl = data_layer(name="label", size=label_size)
+    inputs(img, lbl)
+    outputs(classification_cost(input=predict, label=lbl))
+else:
+    outputs(predict)
--- a/demo/quick_start/preprocess.sh
+++ b/demo/quick_start/preprocess.sh
@@ -20,6 +20,8 @@
 set -e
+export LC_ALL=C
 mkdir -p data/tmp
 python preprocess.py -i data/reviews_Electronics_5.json.gz
 # uniq and shuffle

--- a/demo/quick_start/train.sh
+++ b/demo/quick_start/train.sh
@@ -18,6 +18,8 @@ cfg=trainer_config.lr.py
 #cfg=trainer_config.emb.py
 #cfg=trainer_config.cnn.py
 #cfg=trainer_config.lstm.py
+#cfg=trainer_config.bidi-lstm.py
+#cfg=trainer_config.db-lstm.py
 paddle train \
  --config=$cfg \
  --save_dir=./output \

--- a/demo/quick_start/trainer_config.bidi-lstm.py
+++ b/demo/quick_start/trainer_config.bidi-lstm.py
+# edit-mode: -*- python -*-
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer_config_helpers import *
+dict_file = "./data/dict.txt"
+word_dict = dict()
+with open(dict_file, 'r') as f:
+    for i, line in enumerate(f):
+        w = line.strip().split()[0]
+        word_dict[w] = i
+is_predict = get_config_arg('is_predict', bool, False)
+trn = 'data/train.list' if not is_predict else None
+tst = 'data/test.list' if not is_predict else 'data/pred.list'
+process = 'process' if not is_predict else 'process_predict'
+define_py_data_sources2(train_list=trn,
+                        test_list=tst,
+                        module="dataprovider_emb",
+                        obj=process,
+                        args={"dictionary": word_dict})
+batch_size = 128 if not is_predict else 1
+settings(
+    batch_size=batch_size,
+    learning_rate=2e-3,
+    learning_method=AdamOptimizer(),
+    regularization=L2Regularization(8e-4),
+    gradient_clipping_threshold=25
+)
+bias_attr = ParamAttr(initial_std=0.,l2_rate=0.)
+data = data_layer(name="word", size=len(word_dict))
+emb = embedding_layer(input=data, size=128)
+bi_lstm = bidirectional_lstm(input=emb, size=128)
+dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5)
+output = fc_layer(input=dropout, size=2,
+                  bias_attr=bias_attr,
+                  act=SoftmaxActivation())
+if is_predict:
+    maxid = maxid_layer(output)
+    outputs([maxid, output])
+else:
+    label = data_layer(name="label", size=2)
+    cls = classification_cost(input=output, label=label)
+    outputs(cls)
--- a/demo/quick_start/trainer_config.db-lstm.py
+++ b/demo/quick_start/trainer_config.db-lstm.py
+# edit-mode: -*- python -*-
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer_config_helpers import *
+dict_file = "./data/dict.txt"
+word_dict = dict()
+with open(dict_file, 'r') as f:
+    for i, line in enumerate(f):
+        w = line.strip().split()[0]
+        word_dict[w] = i
+is_predict = get_config_arg('is_predict', bool, False)
+trn = 'data/train.list' if not is_predict else None
+tst = 'data/test.list' if not is_predict else 'data/pred.list'
+process = 'process' if not is_predict else 'process_predict'
+define_py_data_sources2(train_list=trn,
+                        test_list=tst,
+                        module="dataprovider_emb",
+                        obj=process,
+                        args={"dictionary": word_dict})
+batch_size = 128 if not is_predict else 1
+settings(
+    batch_size=batch_size,
+    learning_rate=2e-3,
+    learning_method=AdamOptimizer(),
+    regularization=L2Regularization(8e-4),
+    gradient_clipping_threshold=25
+)
+bias_attr = ParamAttr(initial_std=0.,l2_rate=0.)
+data = data_layer(name="word", size=len(word_dict))
+emb = embedding_layer(input=data, size=128)
+hidden_0 = mixed_layer(size=128, input=[full_matrix_projection(input=emb)])
+lstm_0 = lstmemory(input=hidden_0, layer_attr=ExtraAttr(drop_rate=0.1))
+input_layers = [hidden_0, lstm_0]
+for i in range(1,8):
+    fc = fc_layer(input=input_layers, size=128)
+    lstm = lstmemory(input=fc, layer_attr=ExtraAttr(drop_rate=0.1),
+                    reverse=(i % 2) == 1,)
+    input_layers = [fc, lstm]
+lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling())
+output = fc_layer(input=lstm_last, size=2,
+                  bias_attr=bias_attr,
+                  act=SoftmaxActivation())
+if is_predict:
+    maxid = maxid_layer(output)
+    outputs([maxid, output])
+else:
+    label = data_layer(name="label", size=2)
+    cls = classification_cost(input=output, label=label)
+    outputs(cls)
--- a/demo/seqToseq/seqToseq_net.py
+++ b/demo/seqToseq/seqToseq_net.py
@@ -96,12 +96,12 @@ def gru_encoder_decoder(data_conf,
    encoded_vector = concat_layer(input=[src_forward, src_backward])
    with mixed_layer(size=decoder_size) as encoded_proj:
-        encoded_proj += full_matrix_projection(encoded_vector)
+        encoded_proj += full_matrix_projection(input=encoded_vector)
    backward_first = first_seq(input=src_backward)
    with mixed_layer(size=decoder_size,
                     act=TanhActivation(), ) as decoder_boot:
-        decoder_boot += full_matrix_projection(backward_first)
+        decoder_boot += full_matrix_projection(input=backward_first)
    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
        decoder_mem = memory(name='gru_decoder',
@@ -113,8 +113,8 @@ def gru_encoder_decoder(data_conf,
                                   decoder_state=decoder_mem, )
        with mixed_layer(size=decoder_size * 3) as decoder_inputs:
-            decoder_inputs += full_matrix_projection(context)
+            decoder_inputs += full_matrix_projection(input=context)
-            decoder_inputs += full_matrix_projection(current_word)
+            decoder_inputs += full_matrix_projection(input=current_word)
        gru_step = gru_step_layer(name='gru_decoder',
                                  input=decoder_inputs,

--- a/demo/sequence_tagging/data/get_data.sh
+++ b/demo/sequence_tagging/data/get_data.sh
+#!/bin/bash
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+# Absolute directory of this script; the archives are downloaded next to it.
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+# Quoted so a checkout path containing spaces still works.
+cd "$DIR"
+wget http://www.cnts.ua.ac.be/conll2000/chunking/train.txt.gz
+wget http://www.cnts.ua.ac.be/conll2000/chunking/test.txt.gz
--- a/demo/sequence_tagging/data/test.list
+++ b/demo/sequence_tagging/data/test.list
+data/test.txt.gz
--- a/demo/sequence_tagging/data/train.list
+++ b/demo/sequence_tagging/data/train.list
+data/train.txt.gz
--- a/demo/sequence_tagging/dataprovider.py
+++ b/demo/sequence_tagging/dataprovider.py
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer.PyDataProvider2 import *
+import gzip
+import logging
+logging.basicConfig(
+    format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s',
+)
+logger = logging.getLogger('paddle')
+logger.setLevel(logging.INFO)
+OOV_POLICY_IGNORE = 0
+OOV_POLICY_USE = 1
+OOV_POLICY_ERROR = 2
+num_original_columns = 3
+# Feature combination patterns.
+# [[-1,0], [0,0]]  means previous token at column 0 and current token at 
+# column 0 are combined as one feature.
+patterns = [
+    [[-2,0]],
+    [[-1,0]],
+    [[0,0]],
+    [[1,0]],
+    [[2,0]],
+    [[-1,0], [0,0]],
+    [[0,0], [1,0]],
+    [[-2,1]],
+    [[-1,1]],
+    [[0,1]],
+    [[1,1]],
+    [[2,1]],
+    [[-2,1], [-1,1]],
+    [[-1,1], [0,1]],
+    [[0,1], [1,1]],
+    [[1,1], [2,1]],
+    [[-2,1], [-1,1], [0,1]],
+    [[-1,1], [0,1], [1,1]],
+    [[0,1], [1,1], [2,1]],
+]
+dict_label = {
+ 'B-ADJP': 0,
+ 'I-ADJP': 1,
+ 'B-ADVP': 2,
+ 'I-ADVP': 3,
+ 'B-CONJP': 4,
+ 'I-CONJP': 5,
+ 'B-INTJ': 6,
+ 'I-INTJ': 7,
+ 'B-LST': 8,
+ 'I-LST': 9,
+ 'B-NP': 10,
+ 'I-NP': 11,
+ 'B-PP': 12,
+ 'I-PP': 13,
+ 'B-PRT': 14,
+ 'I-PRT': 15,
+ 'B-SBAR': 16,
+ 'I-SBAR': 17,
+ 'B-UCP': 18,
+ 'I-UCP': 19,
+ 'B-VP': 20,
+ 'I-VP': 21,
+ 'O': 22
+}
+def make_features(sequence):
+    # Appends, in place, one combined feature string per entry of `patterns`
+    # to every token of `sequence` (a list of per-token feature lists).
+    # Raises IndexError on an empty sequence because of sequence[0] below.
+    length = len(sequence)
+    # Column count of the first token, taken before anything is appended.
+    num_features = len(sequence[0])
+    def get_features(pos):
+        # Positions outside the sequence get a padding row: '#B<k>' before the
+        # start and '#E<k>' past the end, k being the distance outside.
+        if pos < 0:
+            return ['#B%s' % -pos] * num_features
+        if pos >= length:
+            return ['#E%s' % (pos - length + 1)] * num_features
+        return sequence[pos]
+    for i in xrange(length):
+        for pattern in patterns:
+            # Each pattern item is [relative offset, column]; the selected
+            # values are joined with '/' into a single feature name.
+            fname = '/'.join([get_features(i+pos)[f] for pos, f in pattern])
+            sequence[i].append(fname)
+'''
+Source file format:
+Each line is for one timestep. The features are separated by space.
+An empty line indicates end of a sequence.
+cutoff: a list of numbers. If count of a feature is smaller than this,
+ it will be ignored.
+if oov_policy[i] is OOV_POLICY_USE, id 0 is reserved for OOV features of
+i-th column.
+return a list of dict for each column
+'''
+def create_dictionaries(filename, cutoff, oov_policy):
+    # Reads the gzip file `filename`, counts every feature value per column
+    # (after make_features has added the pattern columns), then replaces the
+    # counts with ids and drops values seen fewer than cutoff[i] times.
+    # Returns a list with one dict per column (len(cutoff) columns).
+    def add_to_dict(sequence, dicts):
+        # Increment the occurrence count of each token's value in each column.
+        num_features = len(dicts)
+        for features in sequence:
+            l = len(features)
+            # `line` is the loop variable of the enclosing function.
+            assert l == num_features, "Wrong number of features " + line
+            for i in xrange(l):
+                if features[i] in dicts[i]:
+                    dicts[i][features[i]] += 1
+                else:
+                    dicts[i][features[i]] = 1
+    num_features = len(cutoff)
+    dicts = []
+    for i in xrange(num_features):
+        dicts.append(dict())
+    f = gzip.open(filename, 'rb')
+    sequence = []
+    for line in f:
+        line = line.strip()
+        # An empty line ends the current sequence; a trailing sequence that is
+        # not followed by an empty line is never counted.
+        if not line:
+            make_features(sequence)
+            add_to_dict(sequence, dicts)
+            sequence = []
+            continue
+        features = line.split(' ')
+        sequence.append(features)
+    for i in xrange(num_features):
+        dct = dicts[i]
+        # Ids start at 1 when id 0 is reserved for OOV, otherwise at 0.
+        n = 1 if oov_policy[i] == OOV_POLICY_USE else 0
+        # Keys below the cutoff; deleted after the loop so the dict does not
+        # change size while it is being iterated.
+        todo = []
+        for k, v in dct.iteritems():
+            if v < cutoff[i]:
+                todo.append(k)
+            else:
+                # Overwrite the count with the feature's id.
+                dct[k] = n
+                n += 1
+        if oov_policy[i] == OOV_POLICY_USE:
+            # placeholder so that len(dct) will be the number of features
+            # including OOV
+            dct['#OOV#'] = 0
+        logger.info('column %d dict size=%d, ignored %d' % (i, n, len(todo)))
+        for k in todo:
+            del dct[k]
+    f.close()
+    return dicts
+def initializer(settings, **xargs):
+    # Init hook passed to @provider: builds the dictionaries from
+    # data/train.txt.gz and stores dicts, oov_policy and input_types on
+    # `settings`. **xargs is accepted but not used here.
+    # Cutoffs and OOV policies for the 3 original columns, followed by one
+    # entry per feature pattern.
+    cutoff = [3, 1, 0]
+    cutoff += [3] * len(patterns)
+    oov_policy = [OOV_POLICY_IGNORE, OOV_POLICY_ERROR, OOV_POLICY_ERROR]
+    oov_policy += [OOV_POLICY_IGNORE] * len(patterns)
+    dicts = create_dictionaries('data/train.txt.gz', cutoff, oov_policy)
+    # Column 2 uses the fixed label table instead of the learned dictionary.
+    dicts[2] = dict_label
+    settings.dicts = dicts
+    settings.oov_policy = oov_policy
+    input_types = []
+    num_features = len(dicts)
+    # One integer sequence slot per original column.
+    for i in xrange(num_original_columns):
+        input_types.append(integer_sequence(len(dicts[i])))
+        logger.info("slot %s size=%s" % (i, len(dicts[i])))
+    if patterns:
+        # All pattern columns share one sparse binary slot whose dimension is
+        # the sum of their dictionary sizes.
+        dim = 0
+        for i in xrange(num_original_columns, num_features):
+            dim += len(dicts[i])
+        input_types.append(sparse_binary_vector_sequence(dim))
+        logger.info("feature size=%s" % dim)
+    settings.input_types = input_types
+'''
+if oov_policy[i] == OOV_POLICY_USE, features in i-th column which are not
+existed in dicts[i] will be assigned to id 0.
+if oov_policy[i] == OOV_POLICY_ERROR, all features in i-th column MUST exist
+in dicts[i].
+'''
+@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
+def process(settings, filename):
+    """Yield one sample per sequence read from the gzip file `filename`.
+
+    A sample holds one id list per original column and, when `patterns` is
+    non-empty, a final list with one sparse index vector per token.
+    Sequences are separated by empty lines; a trailing sequence that is not
+    followed by an empty line is not yielded.
+    """
+    input_file = filename
+    dicts = settings.dicts
+    oov_policy = settings.oov_policy
+    def gen_sample(sequence):
+        # Convert the string features of one sequence into ids.
+        num_features = len(dicts)
+        sample = [list() for i in xrange(num_original_columns)]
+        if patterns:
+            sample.append([])
+        for features in sequence:
+            # `line` is the loop variable of the enclosing function.
+            assert len(features) == num_features, \
+                "Wrong number of features: " + line
+            for i in xrange(num_original_columns):
+                id = dicts[i].get(features[i], -1)
+                if id != -1:
+                    sample[i].append(id)
+                elif oov_policy[i] == OOV_POLICY_IGNORE:
+                    sample[i].append(0xffffffff)
+                elif oov_policy[i] == OOV_POLICY_ERROR:
+                    # NOTE(review): logger.fatal only logs; nothing is
+                    # appended for this token, so the column ends up shorter
+                    # than the others -- confirm this is intended.
+                    logger.fatal("Unknown token: %s" % features[i])
+                else:
+                    sample[i].append(0)
+            if patterns:
+                # `dim` is the running offset of the current pattern column
+                # inside the shared sparse vector.
+                dim = 0
+                vec = []
+                for i in xrange(num_original_columns, num_features):
+                    id = dicts[i].get(features[i], -1)
+                    if id != -1:
+                        vec.append(dim + id)
+                    elif oov_policy[i] == OOV_POLICY_IGNORE:
+                        pass
+                    elif oov_policy[i] == OOV_POLICY_ERROR:
+                        logger.fatal("Unknown token: %s" % features[i])
+                    else:
+                        # OOV_POLICY_USE: id 0 of this column stands for OOV.
+                        # `vec` is a plain list, so append to it directly
+                        # (the former `vec.ids.append` raised AttributeError).
+                        vec.append(dim + 0)
+                    dim += len(dicts[i])
+                sample[-1].append(vec)
+        return sample
+    num_features = len(dicts)
+    f = gzip.open(input_file, 'rb')
+    num_sequences = 0
+    sequence = []
+    for line in f:
+        line = line.strip()
+        # An empty line ends the current sequence.
+        if not line:
+            make_features(sequence)
+            yield gen_sample(sequence)
+            sequence = []
+            num_sequences += 1
+            continue
+        features = line.split(' ')
+        sequence.append(features)
+    f.close()
+    logger.info("num_sequences=%s" % num_sequences)
--- a/demo/sequence_tagging/linear_crf.py
+++ b/demo/sequence_tagging/linear_crf.py
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer_config_helpers import *
+import math
+define_py_data_sources2(train_list="data/train.list",
+                        test_list="data/test.list",
+                        module="dataprovider",
+                        obj="process")
+batch_size = 1
+settings(
+    learning_method=MomentumOptimizer(),
+    batch_size=batch_size,
+    regularization=L2Regularization(batch_size * 1e-4),
+    average_window=0.5,
+    learning_rate=1e-1,
+    learning_rate_decay_a=1e-5,
+    learning_rate_decay_b=0.25,
+)
+num_label_types=23
+def get_simd_size(size):
+    # Round `size` up to the nearest multiple of 8.
+    return int(math.ceil(float(size) / 8)) * 8
+# Currently, in order to use sparse_update=True,
+# the size has to be aligned.
+num_label_types = get_simd_size(num_label_types)
+features = data_layer(name="features", size=76328)
+word = data_layer(name="word", size=6778)
+pos = data_layer(name="pos", size=44)
+chunk = data_layer(name="chunk",
+                   size=num_label_types)
+crf_input = fc_layer(
+    input=features,
+    size=num_label_types,
+    act=LinearActivation(),
+    bias_attr=False,
+    param_attr=ParamAttr(initial_std=0, sparse_update=True))
+crf=crf_layer(
+    input=crf_input,
+    label=chunk,
+    param_attr=ParamAttr(name="crfw", initial_std=0),
+)
+crf_decoding=crf_decoding_layer(
+    size=num_label_types,
+    input=crf_input,
+    label=chunk,
+    param_attr=ParamAttr(name="crfw"),
+)
+sum_evaluator(
+    name="error",
+    input=crf_decoding,
+)
+chunk_evaluator(
+    name="chunk_f1",
+    input =[crf_decoding, chunk],
+    chunk_scheme="IOB",
+    num_chunk_types=11,
+)
+inputs(word, pos, chunk, features)
+outputs(crf)
--- a/demo/sequence_tagging/readme.md
+++ b/demo/sequence_tagging/readme.md
+# Sequence Tagging
+This demo is a sequence model for assigning tags to each token in a sentence. The task is described at <a href = "http://www.cnts.ua.ac.be/conll2000/chunking">CONLL2000 Text Chunking</a>.
+## Download data
+```bash
+cd demo/sequence_tagging
+./data/get_data.sh
+```
+## Train model
+```bash
+cd demo/sequence_tagging
+./train.sh
+```
+## Model description
+We provide two models. One is a linear CRF model (linear_crf.py) which is equivalent to the one at <a href="http://leon.bottou.org/projects/sgd#stochastic_gradient_crfs">leon.bottou.org/projects/sgd</a>. The second one is a stacked bidirectional RNN and CRF model (rnn_crf.py).
+<center>
+<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
+<thead>
+<th scope="col" class="left">Model name</th>
+<th scope="col" class="left">Number of parameters</th>
+<th scope="col" class="left">F1 score</th>
+</thead>
+<tbody>
+<tr>
+<td class="left">linear_crf</td>
+<td class="left"> 1.8M </td>
+<td class="left"> 0.937</td>
+</tr>
+<tr>
+<td class="left">rnn_crf</td>
+<td class="left"> 960K </td>
+<td class="left">0.941</td>
+</tr>
+</tbody>
+</table>
+</center>
+<br>
--- a/demo/sequence_tagging/rnn_crf.py
+++ b/demo/sequence_tagging/rnn_crf.py
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer_config_helpers import *
+import math
+define_py_data_sources2(train_list="data/train.list",
+                        test_list="data/test.list",
+                        module="dataprovider",
+                        obj="process")
+batch_size = 16
+settings(
+    learning_method=MomentumOptimizer(),
+    batch_size=batch_size,
+    regularization=L2Regularization(batch_size * 1e-5),
+    average_window=0.5,
+    learning_rate = 2e-3,
+    learning_rate_decay_a = 5e-7,
+    learning_rate_decay_b = 0.5,
+)
+word_dim=128
+hidden_dim = 128
+with_rnn = True
+initial_std=1/math.sqrt(hidden_dim)
+param_attr=ParamAttr(initial_std=initial_std)
+cpu_layer_attr=ExtraLayerAttribute(device=-1)
+default_device(0)
+num_label_types=23
+features = data_layer(name="features", size=76328)
+word = data_layer(name="word", size=6778)
+pos = data_layer(name="pos", size=44)
+chunk = data_layer(name="chunk",
+                   size=num_label_types,
+                   layer_attr=cpu_layer_attr)
+emb = embedding_layer(
+    input=word, size=word_dim, param_attr=ParamAttr(initial_std=0))
+hidden1 = mixed_layer(
+    size=hidden_dim,
+    act=STanhActivation(),
+    bias_attr=True,
+    input=[full_matrix_projection(emb),
+           table_projection(pos, param_attr=param_attr)]
+)
+if with_rnn:
+    rnn1 = recurrent_layer(
+        act=ReluActivation(),
+        bias_attr=True,
+        input=hidden1,
+        param_attr=ParamAttr(initial_std=0),
+    )
+hidden2 = mixed_layer(
+    size=hidden_dim,
+    act=STanhActivation(),
+    bias_attr=True,
+    input=[full_matrix_projection(hidden1)
+    ] + ([
+        full_matrix_projection(rnn1, param_attr=ParamAttr(initial_std=0))
+    ] if with_rnn else []),
+)
+if with_rnn:
+    rnn2=recurrent_layer(
+        reverse=True,
+        act=ReluActivation(),
+        bias_attr=True,
+        input=hidden2,
+        param_attr=ParamAttr(initial_std=0),
+    )
+crf_input = mixed_layer(
+    size=num_label_types,
+    bias_attr=False,
+    input=[
+        full_matrix_projection(hidden2),
+    ] + ([
+        full_matrix_projection(rnn2, param_attr=ParamAttr(initial_std=0))
+    ] if with_rnn else []),
+)
+crf = crf_layer(
+    input=crf_input,
+    label=chunk,
+    param_attr=ParamAttr(name="crfw", initial_std=0),
+    layer_attr=cpu_layer_attr,
+)
+crf_decoding = crf_decoding_layer(
+    size=num_label_types,
+    input=crf_input,
+    label=chunk,
+    param_attr=ParamAttr(name="crfw"),
+    layer_attr=cpu_layer_attr,
+)
+sum_evaluator(
+    name="error",
+    input=crf_decoding,
+)
+chunk_evaluator(
+    name="chunk_f1",
+    input =[crf_decoding, chunk],
+    chunk_scheme="IOB",
+    num_chunk_types=11,
+)
+inputs(word, pos, chunk, features)
+outputs(crf)
--- a/demo/sequence_tagging/train.sh
+++ b/demo/sequence_tagging/train.sh
+#!/bin/bash
+paddle train \
+       --config rnn_crf.py \
+       --parallel_nn=1 \
+       --use_gpu=1 \
+       --dot_period=10 \
+       --log_period=1000 \
+       --test_period=0 \
+       --num_passes=10
--- a/demo/sequence_tagging/train_linear.sh
+++ b/demo/sequence_tagging/train_linear.sh
+#!/bin/bash
+paddle train \
+       --config linear_crf.py \
+       --use_gpu=0 \
+       --dot_period=100 \
+       --log_period=10000 \
+       --test_period=0 \
+       --num_passes=10
--- a/doc/build/contribute_to_paddle.md
+++ b/doc/build/contribute_to_paddle.md
@@ -99,3 +99,7 @@ git pull --rebase upstream HEAD
 git push -f origin HEAD
 ```
 Now your Pull Request is updated with the latest version.
+## Revise your pull request
+When you revise your pull request according to the reviewers' comments, please use 'git commit' instead of 'git commit --amend' to commit your changes so that the reviewers can see the difference between the new pull request and the old pull request.
--- a/doc/build/docker_install.rst
+++ b/doc/build/docker_install.rst
@@ -69,7 +69,7 @@ If you want to launch container with GPU support, you need to set some environme
 ..  code-block:: bash
-    export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}"
+    export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
    export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
    docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:gpu-latest

--- a/doc/demo/quick_start/index_en.md
+++ b/doc/demo/quick_start/index_en.md
@@ -134,7 +134,7 @@ def process(settings, file_name):
 You need to add a data provider definition `define_py_data_sources2` in our network configuration. This definition specifies:
 - The path of the training and testing data (`data/train.list`, `data/test.list`).
- The location of the data provider file (`dataprovider_pow`).
+- The location of the data provider file (`dataprovider_bow`).
 - The function to call to get data. (`process`).
 - Additional arguments or data. Here it passes the path of word dictionary.

--- a/doc/ui/api/trainer_config_helpers/layers.rst
+++ b/doc/ui/api/trainer_config_helpers/layers.rst
@@ -73,6 +73,12 @@ img_pool_layer
    :members: img_pool_layer
    :noindex:
+maxout_layer
+------------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: maxout_layer
+    :noindex:
 Norm Layer
 ==========
@@ -130,6 +136,12 @@ gru_step_layer
 Recurrent Layer Group
 =====================
+memory
+------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: memory
+    :noindex:
 recurrent_group
 ---------------
 ..  automodule:: paddle.trainer_config_helpers.layers
@@ -377,6 +389,12 @@ ctc_layer
    :members: ctc_layer
    :noindex:
+nce_layer
+-----------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: nce_layer
+    :noindex:
 hsigmoid
 ---------
 ..  automodule:: paddle.trainer_config_helpers.layers

--- a/doc_cn/algorithm/rnn/hierarchical-layer.md
+++ b/doc_cn/algorithm/rnn/hierarchical-layer.md
+# 支持双层序列作为输入的Layer
+## 概述
+在自然语言处理任务中，序列是一种常见的数据类型。一个独立的词语，可以看作是一个非序列输入，或者，我们称之为一个0层的序列；由词语构成的句子，是一个单层序列；若干个句子构成一个段落，是一个双层的序列。
+双层序列是一个嵌套的序列，它的每一个元素，又是一个单层的序列。这是一种非常灵活的数据组织方式，帮助我们构造一些复杂的输入信息。
+我们可以按照如下层次定义非序列，单层序列，以及双层序列。
+ 0层序列：一个独立的元素，类型可以是PaddlePaddle支持的任意输入数据类型
+ 单层序列：排成一列的多个元素，每个元素是一个0层序列，元素之间的顺序是重要的输入信息
+ 双层序列：排成一列的多个元素，每个元素是一个单层序列，称之为双层序列的一个子序列（subseq），subseq的每个元素是一个0层序列
+在 PaddlePaddle中，下面这些Layer能够接受双层序列作为输入，完成相应的计算。
+## pooling_layer
+pooling_layer的使用示例如下，详细见<a href = "../../../doc/ui/api/trainer_config_helpers/layers.html#pooling-layer">配置API</a>。
+```python
+seq_pool = pooling_layer(input=layer,
+                         pooling_type=AvgPooling(),
+                         agg_level=AggregateLevel.EACH_SEQUENCE)
+```
+- `pooling_type` 目前支持两种，分别是：MaxPooling()和AvgPooling()。
+- `agg_level=AggregateLevel.TIMESTEP`时（默认值）：
+  - 作用：双层序列经过运算变成一个0层序列，或单层序列经过运算变成一个0层序列
+  - 输入：一个双层序列，或一个单层序列
+  - 输出：一个0层序列，即整个输入序列（单层或双层）的平均值（或最大值）
+- `agg_level=AggregateLevel.EACH_SEQUENCE`时：
+  - 作用：一个双层序列经过运算变成一个单层序列
+  - 输入：必须是一个双层序列
+  - 输出：一个单层序列，序列的每个元素是原来双层序列每个subseq元素的平均值（或最大值）
+## last_seq 和 first_seq
+last_seq的使用示例如下（first_seq类似），详细见<a href = "../../../doc/ui/api/trainer_config_helpers/layers.html#last-seq">配置API</a>。
+```python
+last = last_seq(input=layer,
+                agg_level=AggregateLevel.EACH_SEQUENCE)
+```
+- `agg_level=AggregateLevel.TIMESTEP`时（默认值）：
+  - 作用：一个双层序列经过运算变成一个0层序列，或一个单层序列经过运算变成一个0层序列
+  - 输入：一个双层序列或一个单层序列
+  - 输出：一个0层序列，即整个输入序列（双层或者单层）最后一个，或第一个元素。
+- `agg_level=AggregateLevel.EACH_SEQUENCE`时：
+  - 作用：一个双层序列经过运算变成一个单层序列
+  - 输入：必须是一个双层序列
+  - 输出：一个单层序列，其中每个元素是双层序列中每个subseq最后一个（或第一个）元素。
+## expand_layer
+expand_layer的使用示例如下，详细见<a href = "../../../doc/ui/api/trainer_config_helpers/layers.html#expand-layer">配置API</a>。
+```python
+expand = expand_layer(input=layer1,
+                      expand_as=layer2,
+                      expand_level=ExpandLevel.FROM_TIMESTEP)
+```
+- `expand_level=ExpandLevel.FROM_TIMESTEP`时（默认值）：
+  - 作用：一个0层序列经过运算扩展成一个单层序列，或者一个双层序列
+  - 输入：layer1必须是一个0层序列，是待扩展的数据；layer2可以是一个单层序列，或者是一个双层序列，提供扩展的长度信息
+  - 输出：一个单层序列，或一个双层序列，输出序列的类型（双层序列，或单层序列）和序列中含有元素的数目同 layer2一致。若输出是单层序列，单层序列的每个元素（0层序列），都是对layer1元素的拷贝；若输出是双层序列，双层序列每个subseq中每个元素（0层序列），都是对layer1元素的拷贝
+- `expand_level=ExpandLevel.FROM_SEQUENCE`时：
+  - 作用：一个单层序列经过运算扩展成一个双层序列
+  - 输入：layer1必须是一个单层序列，是待扩展的数据；layer2必须是一个双层序列，提供扩展的长度信息
+  - 输出：一个双层序列，序列中含有元素的数目同layer2一致。要求单层序列含有元素的数目（0层序列），和双层序列含有subseq 的数目一致。单层序列第i个元素（0层序列），被扩展为一个单层序列，构成了输出双层序列的第i个subseq。
\ No newline at end of file
--- a/doc_cn/algorithm/rnn/hierarchical-rnn.md
+++ b/doc_cn/algorithm/rnn/hierarchical-rnn.md
+# 双层RNN配置与示例
+我们在`paddle/gserver/tests/test_RecurrentGradientMachine`单测中，通过多组语义相同的单双层RNN配置，讲解如何使用双层RNN。
+## 示例1：双进双出，subseq间无memory
+配置：单层RNN（`sequence_layer_group`）和双层RNN（`sequence_nest_layer_group`），语义完全相同。
+### 读取双层序列的方法
+首先，我们看一下单双层序列的不同数据组织形式（您也可以采用别的组织形式）：
+- 单层序列的数据（`Sequence/tour_train_wdseg`）如下，一共有10个样本。每个样本由两部分组成，一个label（此处都为2）和一个已经分词后的句子。
+```text
+2  	酒店 有 很 舒适 的 床垫 子 ， 床上用品 也 应该 是 一人 一 换 ， 感觉 很 利落 对 卫生 很 放心 呀 。
+2  	很 温馨 ， 也 挺 干净 的 * 地段 不错 ， 出来 就 有 全家 ， 离 地铁站 也 近 ， 交通 很方便 * 就是 都 不 给 刷牙 的 杯子 啊 ， 就 第一天 给 了 一次性杯子 *
+2  	位置 方便 ， 强烈推荐 ， 十一 出去玩 的 时候 选 的 ， 对面 就是 华润万家 ， 周围 吃饭 的 也 不少 。
+2  	交通便利 ， 吃 很 便利 ， 乾 浄 、 安静 ， 商务 房 有 电脑 、 上网 快 ， 价格 可以 ， 就 早餐 不 好吃 。 整体 是 不错 的 。 適 合 出差 來 住 。
+2  	本来 准备 住 两 晚 ， 第 2 天 一早 居然 停电 ， 且 无 通知 ， 只有 口头 道歉 。 总体来说 性价比 尚可 ， 房间 较 新 ， 还是 推荐 .
+2  	这个 酒店 去过 很多 次 了 ， 选择 的 主要原因 是 离 客户 最 便宜 相对 又 近 的 酒店
+2  	挺好 的 汉庭 ， 前台 服务 很 热情 ， 卫生 很 整洁 ， 房间 安静 ， 水温 适中 ， 挺好 ！
+2  	HowardJohnson 的 品质 ， 服务 相当 好 的 一 家 五星级 。 房间 不错 、 泳池 不错 、 楼层 安排 很 合理 。 还有 就是 地理位置 ， 简直 一 流 。 就 在 天一阁 、 月湖 旁边 ， 离 天一广场 也 不远 。 下次 来 宁波 还会 住 。
+2  	酒店 很干净 ， 很安静 ， 很 温馨 ， 服务员 服务 好 ， 各方面 都 不错 *
+2  	挺好 的 ， 就是 没 窗户 ， 不过 对 得 起 这 价格
+```
+- 双层序列的数据（`Sequence/tour_train_wdseg.nest`）如下，一共有4个样本。样本间用空行分开，代表不同的双层序列，序列数据和上面的完全一样。每个样本的子句数分别为2,3,2,3。
+```text
+2  	酒店 有 很 舒适 的 床垫 子 ， 床上用品 也 应该 是 一人 一 换 ， 感觉 很 利落 对 卫生 很 放心 呀 。
+2  	很 温馨 ， 也 挺 干净 的 * 地段 不错 ， 出来 就 有 全家 ， 离 地铁站 也 近 ， 交通 很方便 * 就是 都 不 给 刷牙 的 杯子 啊 ， 就 第一天 给 了 一次性杯子 *
+2  	位置 方便 ， 强烈推荐 ， 十一 出去玩 的 时候 选 的 ， 对面 就是 华润万家 ， 周围 吃饭 的 也 不少 。
+2  	交通便利 ， 吃 很 便利 ， 乾 浄 、 安静 ， 商务 房 有 电脑 、 上网 快 ， 价格 可以 ， 就 早餐 不 好吃 。 整体 是 不错 的 。 適 合 出差 來 住 。
+2  	本来 准备 住 两 晚 ， 第 2 天 一早 居然 停电 ， 且 无 通知 ， 只有 口头 道歉 。 总体来说 性价比 尚可 ， 房间 较 新 ， 还是 推荐 .
+2  	这个 酒店 去过 很多 次 了 ， 选择 的 主要原因 是 离 客户 最 便宜 相对 又 近 的 酒店
+2  	挺好 的 汉庭 ， 前台 服务 很 热情 ， 卫生 很 整洁 ， 房间 安静 ， 水温 适中 ， 挺好 ！
+2  	HowardJohnson 的 品质 ， 服务 相当 好 的 一 家 五星级 。 房间 不错 、 泳池 不错 、 楼层 安排 很 合理 。 还有 就是 地理位置 ， 简直 一 流 。 就 在 天一阁 、 月湖 旁边 ， 离 天一广场 也 不远 。 下次 来 宁波 还会 住 。
+2  	酒店 很干净 ， 很安静 ， 很 温馨 ， 服务员 服务 好 ， 各方面 都 不错 *
+2  	挺好 的 ， 就是 没 窗户 ， 不过 对 得 起 这 价格
+```
+其次，我们看一下单双层序列的不同dataprovider（见`sequenceGen.py`）：
+- 单层序列的dataprovider如下：
+  - word_slot是integer_value_sequence类型，代表单层序列。
+  - label是integer_value类型，代表一个向量。
+```python
+def hook(settings, dict_file, **kwargs):
+    settings.word_dict = dict_file
+    settings.input_types = [integer_value_sequence(len(settings.word_dict)), 
+                            integer_value(3)]
+@provider(init_hook=hook)
+def process(settings, file_name):
+    with open(file_name, 'r') as fdata:
+        for line in fdata:
+            label, comment = line.strip().split('\t')
+            label = int(''.join(label.split()))
+            words = comment.split()
+            word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict]
+            yield word_slot, label
+```
+- 双层序列的dataprovider如下：
+  - word_slot是integer_value_sub_sequence类型，代表双层序列。
+  - label是integer_value_sequence类型，代表单层序列，即一个子句一个label。注意：也可以为integer_value类型，代表一个向量，即一个句子一个label。通常根据任务需求进行不同设置。
+  - 关于dataprovider中input_types的详细用法，参见PyDataProvider2。
+```python
+def hook2(settings, dict_file, **kwargs):
+    settings.word_dict = dict_file
+    settings.input_types = [integer_value_sub_sequence(len(settings.word_dict)),
+                            integer_value_sequence(3)]
+@provider(init_hook=hook2)
+def process2(settings, file_name):  # yields (list of word-id lists, list of labels) per nested sample
+    with open(file_name) as fdata:
+        label_list = []
+        word_slot_list = []
+        for line in fdata:
+            if (len(line)) > 1:  # line holds more than a bare newline: one "label<TAB>sentence" sub-sequence
+                label,comment = line.strip().split('\t')
+                label = int(''.join(label.split()))
+                words = comment.split()
+                word_slot = [settings.word_dict[w] for w in words if w in settings.word_dict]  # words not in the dict are dropped
+                label_list.append(label)
+                word_slot_list.append(word_slot)
+            else:  # a blank line closes the current nested sample
+                yield word_slot_list, label_list
+                label_list = []
+                word_slot_list = []  # NOTE: a trailing sample is only yielded if the file ends with a blank line
+```
+### 模型中的配置
+首先，我们看一下单层序列的配置（见`sequence_layer_group.conf`）。注意：batchsize=5表示一次过5句单层序列，因此2个batch就可以完成1个pass。
+```python
+settings(batch_size=5)
+data = data_layer(name="word", size=dict_dim)
+emb = embedding_layer(input=data, size=word_dim)
+# (lstm_input + lstm) is equal to lstmemory 
+with mixed_layer(size=hidden_dim*4) as lstm_input:
+    lstm_input += full_matrix_projection(input=emb)
+lstm = lstmemory_group(input=lstm_input,
+                       size=hidden_dim,
+                       act=TanhActivation(),
+                       gate_act=SigmoidActivation(),
+                       state_act=TanhActivation(),
+                       lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))
+lstm_last = last_seq(input=lstm)
+with mixed_layer(size=label_dim, 
+                 act=SoftmaxActivation(), 
+                 bias_attr=True) as output:
+    output += full_matrix_projection(input=lstm_last)
+outputs(classification_cost(input=output, label=data_layer(name="label", size=1)))
+```
+其次，我们看一下语义相同的双层序列配置（见`sequence_nest_layer_group.conf`），并对其详细分析：
+- batchsize=2表示一次过2句双层序列。但从上面的数据格式可知，2句双层序列和5句单层序列的数据完全一样。
+- data_layer和embedding_layer不关心数据是否是序列格式，因此两个配置在这两层上的输出是一样的。
+- lstmemory:
+  - 单层序列过了一个mixed_layer和lstmemory_group。
+  - 双层序列在同样的mixed_layer和lstmemory_group外，直接加了一层group。由于这个外层group里面没有memory，表示subseq间不存在联系，即起到的作用仅仅是把双层seq拆成单层，因此双层序列过完lstmemory的输出和单层的一样。
+- last_seq：
+  - 单层序列直接取了最后一个元素
+  - 双层序列首先（last_seq层）取了每个subseq的最后一个元素，将其拼接成一个新的单层序列；接着（expand_layer层）将其扩展成一个新的双层序列，其中第i个subseq中的所有向量均为输入的单层序列中的第i个向量；最后（average_layer层）取了每个subseq的平均值。
+  - 分析得出：第一个last_seq后，每个subseq的最后一个元素就等于单层序列的最后一个元素，而expand_layer和average_layer后，依然保持每个subseq最后一个元素的值不变（这两层仅是为了展示它们的用法，实际中并不需要）。因此单双层序列的输出是一样的。
+```python
+settings(batch_size=2)
+data = data_layer(name="word", size=dict_dim)
+emb_group = embedding_layer(input=data, size=word_dim)
+# (lstm_input + lstm) is equal to lstmemory 
+def lstm_group(lstm_group_input):
+    with mixed_layer(size=hidden_dim*4) as group_input:
+      group_input += full_matrix_projection(input=lstm_group_input)
+    lstm_output = lstmemory_group(input=group_input,
+                                  name="lstm_group",
+                                  size=hidden_dim,
+                                  act=TanhActivation(),
+                                  gate_act=SigmoidActivation(),
+                                  state_act=TanhActivation(),
+                                  lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))
+    return lstm_output
+lstm_nest_group = recurrent_group(input=SubsequenceInput(emb_group),
+                                  step=lstm_group,
+                                  name="lstm_nest_group")
+# hasSubseq ->(seqlastins) seq
+lstm_last = last_seq(input=lstm_nest_group, agg_level=AggregateLevel.EACH_SEQUENCE)
+# seq ->(expand) hasSubseq
+lstm_expand = expand_layer(input=lstm_last, expand_as=emb_group, expand_level=ExpandLevel.FROM_SEQUENCE)
+# hasSubseq ->(average) seq
+lstm_average = pooling_layer(input=lstm_expand,
+                             pooling_type=AvgPooling(),
+                             agg_level=AggregateLevel.EACH_SEQUENCE)
+with mixed_layer(size=label_dim, 
+                 act=SoftmaxActivation(), 
+                 bias_attr=True) as output:
+    output += full_matrix_projection(input=lstm_average)
+outputs(classification_cost(input=output, label=data_layer(name="label", size=1)))
+```
+## 示例2：双进双出，subseq间有memory
+配置：单层RNN（`sequence_rnn.conf`），双层RNN（`sequence_nest_rnn.conf`和`sequence_nest_rnn_readonly_memory.conf`），语义完全相同。
+### 读取双层序列的方法
+我们看一下单双层序列的不同数据组织形式和dataprovider（见`rnn_data_provider.py`）
+```python
+data = [
+    [[[1, 3, 2], [4, 5, 2]], 0],
+    [[[0, 2], [2, 5], [0, 1, 2]], 1],
+]
+@provider(input_types=[integer_value_sub_sequence(10),
+                       integer_value(3)])
+def process_subseq(settings, file_name):
+    for d in data:
+        yield d
+@provider(input_types=[integer_value_sequence(10),
+                       integer_value(3)])
+def process_seq(settings, file_name):
+    for d in data:
+        seq = []
+```
+- 单层序列：有两句，分别为[1,3,2,4,5,2]和[0,2,2,5,0,1,2]。
+- 双层序列：有两句，分别为[[1,3,2],[4,5,2]]（2个子句）和[[0,2],[2,5],[0,1,2]]（3个子句）。
+- 单双层序列的label都分别是0和1
+### 模型中的配置
+我们选取单双层序列配置中的不同部分，来对比分析两者语义相同的原因。
+- 单层序列：过了一个很简单的recurrent_group。每一个时间步，当前的输入y和上一个时间步的输出rnn_state做了一个全连接。
+```python
+def step(y):
+    mem = memory(name="rnn_state", size=hidden_dim)
+    return fc_layer(input=[y, mem],
+                    size=hidden_dim,
+                    act=TanhActivation(),
+                    bias_attr=True,
+                    name="rnn_state")
+out = recurrent_group(step=step, input=emb)
+```
+- 双层序列，外层memory是一个元素：
+  - 内层inner_step的recurrent_group和单层序列的几乎一样。除了boot_layer=outer_mem，表示将外层的outer_mem作为内层memory的初始状态。外层outer_step中，outer_mem是一个子句的最后一个向量，即整个双层group是将前一个子句的最后一个向量，作为下一个子句memory的初始状态。
+  - 从输入数据上看，单双层序列的句子是一样的，只是双层序列将其又做了子序列划分。因此双层序列的配置中，必须将前一个子句的最后一个元素，作为boot_layer传给下一个子句的memory，才能保证和单层序列的配置中“每一个时间步都用了上一个时间步的输出结果”一致。
+```python
+def outer_step(x):
+    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)
+    def inner_step(y):
+        inner_mem = memory(name="inner_rnn_state",
+                           size=hidden_dim,
+                           boot_layer=outer_mem)
+        return fc_layer(input=[y, inner_mem],
+                        size=hidden_dim,
+                        act=TanhActivation(),
+                        bias_attr=True,
+                        name="inner_rnn_state")
+    inner_rnn_output = recurrent_group(
+        step=inner_step,
+        input=x)
+    last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
+    return inner_rnn_output
+out = recurrent_group(step=outer_step, input=SubsequenceInput(emb))
+```
+- 双层序列，外层memory是单层序列：
+  - 由于外层每个时间步返回的是一个子句，这些子句的长度往往不等长。因此当外层有is_seq=True的memory时，内层是**无法直接使用**它的，即内层memory的boot_layer不能链接外层的这个memory。
+  - 如果内层memory想**间接使用**这个外层memory，只能通过`pooling_layer`、`last_seq`或`first_seq`这三个layer将它先变成一个元素。但这种情况下，外层memory必须有boot_layer，否则在第0个时间步时，由于外层memory没有任何seq信息，因此上述三个layer的前向会报出“**Check failed: input.sequenceStartPositions**”的错误。
+## 示例3：双进双出，输入不等长
+**输入不等长**是指recurrent_group的多个输入在各时刻的长度可以不相等, 但需要指定一个和输出长度一致的input，用<font color="red">targetInlink</font>表示。参考配置：单层RNN（`sequence_rnn_multi_unequalength_inputs.conf`），双层RNN（`sequence_nest_rnn_multi_unequalength_inputs.conf`）
+### 读取双层序列的方法
+我们看一下单双层序列的数据组织形式和dataprovider（见`rnn_data_provider.py`）
+```python
+data2 = [
+    [[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]] ,0],
+    [[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]], 1],
+]
+@provider(input_types=[integer_value_sub_sequence(10),
+                       integer_value_sub_sequence(10),
+                       integer_value(2)],
+          should_shuffle=False)
+def process_unequalength_subseq(settings, file_name): #双层RNN的dataprovider
+    for d in data2:
+        yield d
+@provider(input_types=[integer_value_sequence(10),
+                       integer_value_sequence(10),
+                       integer_value(2)],
+          should_shuffle=False)
+def process_unequalength_seq(settings, file_name): #单层RNN的dataprovider
+    for d in data2:
+        words1=reduce(lambda x,y: x+y, d[0])
+        words2=reduce(lambda x,y: x+y, d[1])
+        yield words1, words2, d[2]
+```
+data2 中有两个样本，每个样本有两个特征, 记fea1, fea2。
+- 单层序列：两个样本分别为[[1, 2, 4, 5, 2], [5, 4, 1, 3, 1]] 和 [[0, 2, 2, 5, 0, 1, 2], [1, 5, 4, 2, 3, 6, 1]]
+- 双层序列：两个样本分别为
+  - **样本1**：[[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]]]。fea1和fea2都分别有2个子句，fea1=[[1, 2], [4, 5, 2]], fea2=[[5, 4, 1], [3, 1]]
+  - **样本2**：[[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]]]。fea1和fea2都分别有3个子句， fea1=[[0, 2], [2, 5], [0, 1, 2]], fea2=[[1, 5], [4], [2, 3, 6, 1]]。<br/>
+  - **注意**：每个样本中，各特征的子句数目需要相等。这里说的“双进双出，输入不等长”是指fea1在i时刻的输入的长度可以不等于fea2在i时刻的输入的长度。如对于第1个样本，时刻i=2, fea1[2]=[4, 5, 2]，fea2[2]=[3, 1]，3≠2。
+- 单双层序列中，两个样本的label都分别是0和1
+### 模型中的配置
+单层RNN（`sequence_rnn_multi_unequalength_inputs.conf`）和双层RNN（`sequence_nest_rnn_multi_unequalength_inputs.conf`）两个模型配置达到的效果完全一样，区别只在于输入为单层还是双层序列，现在我们来看它们内部分别是如何实现的。
+- 单层序列：
+  - 过了一个简单的recurrent_group。每一个时间步，当前的输入y和上一个时间步的输出rnn_state做了一个全连接，功能与示例2中`sequence_rnn.conf`的`step`函数完全相同。这里，两个输入x1,x2分别通过calrnn返回最后时刻的状态。结果得到的encoder1_rep和encoder2_rep分别是单层序列，最后取encoder1_rep的最后一个时刻和encoder2_rep的所有时刻分别相加得到context。
+  - 注意到这里recurrent_group输入的每个样本中，fea1和fea2的长度都分别相等，这并非偶然，而是因为recurrent_group要求输入为单层序列时，所有输入的长度都必须相等。
+```python
+def step(x1, x2):
+	def calrnn(y):
+		mem = memory(name = 'rnn_state_' + y.name, size = hidden_dim)
+        out = fc_layer(input = [y, mem],
+	        size = hidden_dim,
+	        act = TanhActivation(),
+            bias_attr = True,
+            name = 'rnn_state_' + y.name)
+        return out
+	encoder1 = calrnn(x1)
+    encoder2 = calrnn(x2)
+    return [encoder1, encoder2]
+encoder1_rep, encoder2_rep = recurrent_group(
+    name="stepout",                           
+    step=step,
+    input=[emb1, emb2])
+encoder1_last = last_seq(input = encoder1_rep)                           
+encoder1_expandlast = expand_layer(input = encoder1_last,
+                                   expand_as = encoder2_rep)
+context = mixed_layer(input = [identity_projection(encoder1_expandlast),
+                               identity_projection(encoder2_rep)],
+                      size = hidden_dim)
+```
+- 双层序列：
+  - 双层RNN中，对输入的两个特征分别求时序上的连续全连接(`inner_step1`和`inner_step2`分别处理fea1和fea2)，其功能与示例2中`sequence_nest_rnn.conf`的`outer_step`函数完全相同。不同之处是，此时输入`[SubsequenceInput(emb1), SubsequenceInput(emb2)]`在各时刻并不等长。
+  - 函数`outer_step`中可以分别处理这两个特征，但我们需要用<font color=red>targetInlink</font>指定recurrent_group的输出的格式（各子句长度）只能和其中一个保持一致，如这里选择了和emb2的长度一致。
+  - 最后，依然是取encoder1_rep的最后一个时刻和encoder2_rep的所有时刻分别相加得到context。
+```python
+def outer_step(x1, x2):
+    outer_mem1 = memory(name = "outer_rnn_state1", size = hidden_dim)
+    outer_mem2 = memory(name = "outer_rnn_state2", size = hidden_dim)
+    def inner_step1(y):
+        inner_mem = memory(name = 'inner_rnn_state_' + y.name,
+                           size = hidden_dim,
+                           boot_layer = outer_mem1)
+        out = fc_layer(input = [y, inner_mem],
+                       size = hidden_dim,
+                       act = TanhActivation(),
+                       bias_attr = True,
+                       name = 'inner_rnn_state_' + y.name)
+        return out
+    def inner_step2(y):
+        inner_mem = memory(name = 'inner_rnn_state_' + y.name,
+                           size = hidden_dim,
+                           boot_layer = outer_mem2)
+        out = fc_layer(input = [y, inner_mem],
+                       size = hidden_dim,
+                       act = TanhActivation(),
+                       bias_attr = True,
+                       name = 'inner_rnn_state_' + y.name)
+        return out
+    encoder1 = recurrent_group(
+        step = inner_step1,
+        name = 'inner1',
+        input = x1)
+    encoder2 = recurrent_group(
+        step = inner_step2,
+        name = 'inner2',
+        input = x2)
+    sentence_last_state1 = last_seq(input = encoder1, name = 'outer_rnn_state1')
+    sentence_last_state2_ = last_seq(input = encoder2, name = 'outer_rnn_state2')
+    encoder1_expand = expand_layer(input = sentence_last_state1,
+                                   expand_as = encoder2)
+    return [encoder1_expand, encoder2]
+encoder1_rep, encoder2_rep = recurrent_group(
+    name="outer",
+    step=outer_step,
+    input=[SubsequenceInput(emb1), SubsequenceInput(emb2)],
+    targetInlink=emb2)
+encoder1_last = last_seq(input = encoder1_rep)
+encoder1_expandlast = expand_layer(input = encoder1_last,
+                                   expand_as = encoder2_rep)
+context = mixed_layer(input = [identity_projection(encoder1_expandlast),
+                               identity_projection(encoder2_rep)],
+                      size = hidden_dim)
+```
+## 示例4：beam_search的生成
+TBD
\ No newline at end of file
--- a/doc_cn/algorithm/rnn/rnn-tutorial.md
+++ b/doc_cn/algorithm/rnn/rnn-tutorial.md
+# Recurrent Group教程
+## 概述
+序列数据是自然语言处理任务面对的一种主要输入数据类型。
+一句话是由词语构成的序列，多句话进一步构成了段落。因此，段落可以看作是一个嵌套的双层的序列，这个序列的每个元素又是一个序列。
+双层序列是PaddlePaddle支持的一种非常灵活的数据组织方式，帮助我们更好地描述段落、多轮对话等更为复杂的语言数据。基于双层序列输入，我们可以设计搭建一个灵活的、层次化的RNN，分别从词语和句子级别编码输入数据，同时也能够引入更加复杂的记忆机制，更好地完成一些复杂的语言理解任务。
+在PaddlePaddle中，`recurrent_group`是一种任意复杂的RNN单元，用户只需定义RNN在一个时间步内完成的计算，PaddlePaddle负责完成信息和误差在时间序列上的传播。
+更进一步，`recurrent_group`同样可以扩展到双层序列的处理上。通过两个嵌套的`recurrent_group`分别定义子句级别和词语级别上需要完成的运算，最终实现一个层次化的复杂RNN。
+目前，在PaddlePaddle中，能够对双层序列进行处理的有`recurrent_group`和部分Layer，具体可参考文档：<a href = "hierarchical-layer.html">支持双层序列作为输入的Layer</a>。
+## 相关概念
+### 基本原理
+`recurrent_group` 是PaddlePaddle支持的一种任意复杂的RNN单元。使用者只需要关注于设计RNN在一个时间步之内完成的计算，PaddlePaddle负责完成信息和梯度在时间序列上的传播。
+PaddlePaddle中，`recurrent_group`的一个简单调用如下：
+``` python
+recurrent_group(step, input, reverse)
+```
+- step：一个可调用的函数，定义一个时间步之内RNN单元完成的计算
+- input：输入，必须是一个单层序列，或者一个双层序列
+- reverse：是否以逆序处理输入序列
+使用`recurrent_group`的核心是设计step函数的计算逻辑。step函数内部可以自由组合PaddlePaddle支持的各种layer，完成任意的运算逻辑。`recurrent_group` 的输入（即input）会成为step函数的输入，由于step 函数只关注于RNN一个时间步之内的计算，在这里`recurrent_group`替我们完成了原始输入数据的拆分。
+### 输入
+`recurrent_group`处理的输入序列主要分为以下三种类型：
+- **数据输入**：一个双层序列进入`recurrent_group`会被拆解为一个单层序列，一个单层序列进入`recurrent_group`会被拆解为非序列，然后交给step函数，这一过程对用户是完全透明的。可以有以下两种：1）通过data_layer拿到的用户输入；2）其它layer的输出。
+- **只读Memory输入**：`StaticInput` 定义了一个只读的Memory，由`StaticInput`指定的输入不会被`recurrent_group`拆解，`recurrent_group` 循环展开的每个时间步总是能够引用所有输入，可以是一个非序列，或者一个单层序列。
+- **序列生成任务的输入**：`GeneratedInput`只用于在序列生成任务中指定输入数据。
+### 输入示例
+序列生成任务大多遵循encoder-decoder架构，encoder和decoder可以是能够处理序列的任意神经网络单元，而RNN是最流行的选择。
+给定encoder输出和当前词，decoder每次预测产生下一个最可能的词语。在这种结构中，decoder接受两个输入：
+- 要生成的目标序列：是decoder的数据输入，也是decoder循环展开的依据，`recurrent_group`会对这类输入进行拆解。
+- encoder输出，可以是一个非序列，或者一个单层序列：是一个unbounded memory，decoder循环展开的每一个时间步会引用全部结果，不应该被拆解，这种类型的输入必须通过`StaticInput`指定。关于Unbounded Memory的更多讨论请参考论文 [Neural Turing Machines](https://arxiv.org/abs/1410.5401)。
+在序列生成任务中，decoder RNN总是引用上一时刻预测出的词的词向量，作为当前时刻输入。`GeneratedInput`自动完成这一过程。
+### 输出
+`step`函数必须返回一个或多个Layer的输出，这个Layer的输出会作为整个`recurrent_group` 最终的输出结果。在输出的过程中，`recurrent_group` 会将每个时间步的输出拼接，这个过程对用户也是透明的。
+### memory
+memory只能在`recurrent_group`中定义和使用。memory不能独立存在，必须指向一个PaddlePaddle定义的Layer。引用memory得到这个layer上一时刻的输出，因此，可以将memory理解为一个时延操作。
+可以显式地指定一个layer的输出用于初始化memory。不指定时，memory默认初始化为0。
+## 双层RNN介绍
+`recurrent_group`帮助我们完成对输入序列的拆分，对输出的合并，以及计算逻辑在序列上的循环展开。
+利用这种特性，两个嵌套的`recurrent_group`能够处理双层序列，实现词语和句子两个级别的双层RNN结构。
+- 单层（word-level）RNN：每个状态（state）对应一个词（word）。
+- 双层（sequence-level）RNN：一个双层RNN由多个单层RNN组成，每个单层RNN（即双层RNN的每个状态）对应一个子句（subseq）。
+为了描述方便，下文以NLP任务为例，将含有子句（subseq）的段落定义为一个双层序列，将含有词语的句子定义为一个单层序列，那么0层序列即为一个词语。
+## 双层RNN的使用
+### 训练流程的使用方法
+使用 `recurrent_group`需要遵循以下约定：
+- **单进单出**：输入和输出都是单层序列。
+  - 如果有多个输入，不同输入序列含有的词语数必须严格相等。
+  - 输出一个单层序列，输出序列的词语数和输入序列一致。
+  - memory：在step函数中定义 memory指向一个layer，通过引用memory得到这个layer上一个时刻输出，形成recurrent 连接。memory的is_seq参数必须为false。如果没有定义memory，每个时间步之内的运算是独立的。
+  - boot_layer：memory的初始状态，默认初始状态为0，memory的is_seq参数必须为false。
+- **双进双出**：输入和输出都是双层序列。
+  - 如果有多个输入序列，不同输入含有的子句（subseq）数必须严格相等，但子句含有的词语数可以不相等。
+  - 输出一个双层序列，子句（subseq）数、子句的单词数和指定的一个输入序列一致，默认为第一个输入。
+  - memory：在step函数中定义memory，指向一个layer，通过引用memory得到这个layer上一个时刻的输出，形成recurrent连接。定义在外层`recurrent_group` step函数中的memory，能够记录上一个subseq 的状态，可以是一个单层序列（只作为read-only memory），也可以是一个词语。如果没有定义memory，那么 subseq 之间的运算是独立的。
+  - boot_layer：memory 初始状态，可以是一个单层序列（只作为read-only memory）或一个向量。默认不设置，即初始状态为0。
+- **双进单出**：目前还未支持，会报错"In hierachical RNN, all out links should be from sequences now"。
+### 生成流程的使用方法
+使用`beam_search`需要遵循以下约定：
+- 单层RNN：从一个word生成下一个word。
+- 双层RNN：即把单层RNN生成后的subseq给拼接成一个新的双层seq。从语义上看，也不存在一个subseq直接生成下一个subseq的情况。
\ No newline at end of file
--- a/doc_cn/build_and_install/install/docker_install.rst
+++ b/doc_cn/build_and_install/install/docker_install.rst
@@ -23,9 +23,9 @@ PaddlePaddle提供的Docker镜像版本
 +-----------------+------------------+------------------------+-----------------------+
 |       GPU       | gpu-latest       | gpu-devel-latest       | gpu-demo-latest       |
 +-----------------+------------------+------------------------+-----------------------+
-| CPU WITHOUT AVX | cpu-noavx-latest | cpu-devel-noavx-latest | cpu-demo-noavx-latest |
+| CPU WITHOUT AVX | cpu-noavx-latest | cpu-noavx-devel-latest | cpu-noavx-demo-latest |
 +-----------------+------------------+------------------------+-----------------------+
-| GPU WITHOUT AVX | gpu-noavx-latest | gpu-devel-noavx-latest | gpu-demo-noavx-latest |
+| GPU WITHOUT AVX | gpu-noavx-latest | gpu-noavx-devel-latest | gpu-noavx-demo-latest |
 +-----------------+------------------+------------------------+-----------------------+
 其中，横向包括三个版本，normal，devel和demo。

--- a/doc_cn/conf.py.in
+++ b/doc_cn/conf.py.in
@@ -47,6 +47,7 @@ extensions = [
    'sphinx.ext.autosummary',
    'sphinx.ext.mathjax',
    'sphinx.ext.napoleon',
+    'sphinx.ext.graphviz'
 ]
 table_styling_embed_css = True

--- a/doc_cn/faq/index.rst
+++ b/doc_cn/faq/index.rst
+####################
+PaddlePaddle常见问题
+####################
+..  contents::
+1. 如何减少PaddlePaddle的内存占用
+---------------------------------
+神经网络的训练本身是一个非常消耗内存和显存的工作。经常会消耗数十G的内存和数G的显存。
+PaddlePaddle的内存占用主要分为如下几个方面\:
+* DataProvider缓冲池内存 (只针对内存)
+* 神经元激活内存 （针对内存和显存）
+* 参数内存 (针对内存和显存)
+* 其他内存杂项
+这其中，其他内存杂项是指PaddlePaddle本身所用的一些内存，包括字符串分配，临时变量等等，
+这些内存就不考虑如何缩减了。
+其他的内存的减少方法依次为
+减少DataProvider缓冲池内存
++++++++++++++++++++++++++
+PyDataProvider使用的是异步加载，同时在内存里直接随机选取数据来做Shuffle。即
+..  graphviz::
+    digraph {
+        rankdir=LR;
+        数据文件 -> 内存池 -> PaddlePaddle训练
+    }
+所以，减小这个内存池即可减小内存占用，同时也可以加速开始训练前数据载入的过程。但是，这
+个内存池实际上决定了shuffle的粒度。所以，如果将这个内存池减小，又要保证数据是随机的，
+那么最好将数据文件在每次读取之前做一次shuffle。可能的代码为
+..  literalinclude:: reduce_min_pool_size.py
+这样做可以极大的减少内存占用，并且可能会加速训练过程。 详细文档参考 `这里
+<../ui/data_provider/pydataprovider2.html#provider>`_ 。
+神经元激活内存
++++++++++++++
+神经网络在训练的时候，会对每一个激活暂存一些数据，包括激活，残差等等。
+在反向传递的时候，这些数据会被用来更新参数。这些数据使用的内存主要和两个参数有关系，
+一是batch size，另一个是每条序列(Sequence)长度。所以，其实也是和每个mini-batch中包含
+的时间步信息成正比。
+所以，做法可以有两种。他们是
+* 减小batch size。 即在网络配置中 :code:`settings(batch_size=1000)` 设置成一个小一些的值。但是batch size本身是神经网络的超参数，减小batch size可能会对训练结果产生影响。
+* 减小序列的长度，或者直接扔掉非常长的序列。比如，一个数据集大部分序列长度是100-200,
+  但是突然有一个10000长的序列，就很容易导致内存超限。特别是在LSTM等RNN中。
+参数内存
++++++++
+PaddlePaddle支持非常多的优化算法(Optimizer)，不同的优化算法需要使用不同大小的内存。
+例如如果使用 :code:`adadelta` 算法，则需要使用参数规模大约5倍的内存。 如果参数保存下来的
+文件为 :code:`100M`， 那么该优化算法至少需要 :code:`500M` 的内存。
+可以考虑使用一些占用内存较小的优化算法，例如 :code:`momentum`。
+2. 如何加速PaddlePaddle的训练速度
+---------------------------------
+PaddlePaddle是神经网络训练平台，加速PaddlePaddle训练有如下几个方面\：
+* 减少数据载入的耗时
+* 加速训练速度
+* 利用更多的计算资源
+减少数据载入的耗时
++++++++++++++++++
+使用 :code:`pydataprovider` 时，可以减少缓存池的大小，同时设置内存缓存功能，即可以极大的加速数据载入流程。
+:code:`DataProvider` 缓存池的减小，和之前通过减小缓存池来减小内存占用的原理一致。
+..  literalinclude:: reduce_min_pool_size.py
+同时 :code:`@provider` 接口有一个 :code:`cache` 参数来控制缓存方法，将其设置成 :code:`CacheType.CACHE_PASS_IN_MEM` 的话，会将第一个 :code:`pass` (过完所有训练数据即为一个pass)生成的数据缓存在内存里，在之后的 :code:`pass` 中，不会再从 :code:`python` 端读取数据，而是直接从内存的缓存里读取数据。这也会极大减少数据读入的耗时。
+加速训练速度
++++++++++++
+PaddlePaddle支持Sparse的训练，sparse训练需要训练特征是 :code:`sparse_binary_vector` 、 :code:`sparse_vector` 、或者 :code:`integer_value` 的任一一种。同时，与这个训练数据交互的Layer，需要将其Parameter设置成 sparse 更新模式，即设置 :code:`sparse_update=True`
+这里使用简单的 :code:`word2vec` 训练语言模型举例，具体使用方法为\:
+使用一个词前两个词和后两个词，来预测这个中间的词。这个任务的DataProvider为\:
+..  literalinclude:: word2vec_dataprovider.py
+这个任务的配置为\:
+..  literalinclude:: word2vec_config.py
+更多关于sparse训练的内容请参考 `sparse训练的文档 <TBD>`_
+利用更多的计算资源
++++++++++++++++++
+利用更多的计算资源可以分为以下几个方式来进行\:
+* 单机CPU训练
+  * 使用多线程训练。设置命令行参数 :code:`trainer_count`，即可以设置参与训练的线程数量。使用方法为 :code:`paddle train --trainer_count=4`
+* 单机GPU训练
+  * 使用显卡训练。设置命令行参数 :code:`use_gpu`。 使用方法为 :code:`paddle train --use_gpu=true`
+  * 使用多块显卡训练。设置命令行参数 :code:`use_gpu` 和 :code:`trainer_count`。使用 :code:`--use_gpu=True` 开启GPU训练，使用 :code:`trainer_count` 指定显卡数量。使用方法为 :code:`paddle train --use_gpu=true --trainer_count=4`
+* 多机训练
+  * 使用多机训练的方法也比较简单，需要先在每个节点启动 :code:`paddle pserver`，再使用 :code:`paddle train --pservers=192.168.100.1,192.168.100.2` 来指定每个pserver的ip地址
+  * 具体的多机训练方法参考 `多机训练 <TBD>`_ 文档。
+3. 遇到“非法指令”或者是“illegal instruction” 
+--------------------------------------------
+paddle在进行计算的时候为了提升计算性能，使用了avx指令。部分老的cpu型号无法支持这样的指令。通常来说执行下grep avx /proc/cpuinfo看看是否有输出即可知道是否支持。（另：用此方法部分虚拟机可能检测到支持avx指令但是实际运行会挂掉，请当成是不支持，看下面的解决方案）
+解决办法是\:
+* 使用 NO_AVX的 `安装包 <../build_and_install/index.html>`_ 或者 `Docker image <../build_and_install/install/docker_install.html>`_
+* 或者，使用 :code:`-DWITH_AVX=OFF` 重新编译PaddlePaddle。
+4. 如何选择SGD算法的学习率
+--------------------------
+在采用sgd/async_sgd进行训练时，一个重要的问题是选择正确的learning_rate。如果learning_rate太大，那么训练有可能不收敛，如果learning_rate太小，那么收敛可能很慢，导致训练时间过长。
+通常做法是从一个比较大的learning_rate开始试，如果不收敛，那减少学习率10倍继续试验，直到训练收敛为止。那么如何判断训练不收敛呢？可以估计出如果模型采用不变的输出最小的cost0是多少。
+如果训练过程的cost明显高于这个常数输出的cost，那么我们可以判断为训练不收敛。举一个例子，假如我们是三分类问题，采用multi-class-cross-entropy作为cost，数据中0,1,2三类的比例为 :code:`0.2, 0.5, 0.3` , 那么常数输出所能达到的最小cost是 :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03` 。如果训练一个pass（或者更早）后，cost还大于这个数，那么可以认为训练不收敛，应该降低学习率。
+5. 如何初始化参数
+-----------------
+默认情况下，PaddlePaddle使用均值0，标准差为 :math:`\frac{1}{\sqrt{d}}` 来初始化参数。其中 :math:`d` 为参数矩阵的宽度。这种初始化方式在一般情况下不会产生很差的结果。如果用户想要自定义初始化方式，PaddlePaddle目前提供两种参数初始化的方式\:
+* 高斯分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)`
+* 均匀分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)`
+比如设置一个全连接层的参数初始化方式和bias初始化方式，可以使用如下代码。
+..  code-block:: python
+    hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0), 
+                      bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0))
+上述代码将bias全部初始化为1.0, 同时将参数初始化为 :code:`[-1.0, 1.0]` 的均匀分布。
+6. 如何共享参数
+---------------
+PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字的参数，会共享参数。设置参数的名字，可以使用 :code:`ParamAttr(name="YOUR_PARAM_NAME")` 来设置。更方便的设置方式，是想要共享的参数使用同样的 :code:`ParamAttr` 对象。
+简单的全连接网络，参数共享的配置示例为\:
+..  literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
+这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。
--- a/doc_cn/faq/reduce_min_pool_size.py
+++ b/doc_cn/faq/reduce_min_pool_size.py
+@provider(min_pool_size=0, ...)
+def process(settings, filename):
+    os.system('shuf %s > %s.shuf' % (filename, filename))  # shuffle the file on disk into <filename>.shuf before reading it
+    with open('%s.shuf' % filename, 'r') as f:
+        for line in f:
+            yield get_sample_from_line(line)
\ No newline at end of file
--- a/doc_cn/faq/word2vec_config.py
+++ b/doc_cn/faq/word2vec_config.py
+... # the settings and define data provider is omitted.
+DICT_DIM=3000  # dictionary dimension.
+word_ids=data_layer('word_ids', size=DICT_DIM)
+emb = embedding_layer(input=word_ids, size=256, param_attr=ParamAttr(sparse_update=True))
+emb_sum = pooling_layer(input=emb, pooling_type=SumPooling())
+predict = fc_layer(input=emb_sum, size=DICT_DIM, act=Softmax())
+outputs(classification_cost(input=predict, label=data_layer('label', size=DICT_DIM))) 
\ No newline at end of file
--- a/doc_cn/faq/word2vec_dataprovider.py
+++ b/doc_cn/faq/word2vec_dataprovider.py
+DICT_DIM=3000
+@provider(input_types=[integer_sequence(DICT_DIM), integer_value(DICT_DIM)])
+def process(settings, filename):
+	with open(filename) as f:
+		# Yield the surrounding word ids together with the id of the inner word to predict,
+		# such as [28, 29, 10, 4], 4
+		# It means the sentence is 28, 29, 4, 10, 4.
+		yield read_next_from_file(f)
\ No newline at end of file
--- a/doc_cn/index.rst
+++ b/doc_cn/index.rst
@@ -3,6 +3,7 @@ PaddlePaddle文档
 使用指南
 --------
 * `快速入门 <demo/quick_start/index.html>`_
 * `编译与安装 <build_and_install/index.html>`_
 * `用户接口 <ui/index.html>`_
@@ -16,4 +17,13 @@ PaddlePaddle文档
 算法教程
 --------
-* `RNN配置 <../doc/algorithm/rnn/rnn.html>`_
+* `Recurrent Group教程 <algorithm/rnn/rnn-tutorial.html>`_
+* `单层RNN示例 <../doc/algorithm/rnn/rnn.html>`_
+* `双层RNN示例 <algorithm/rnn/hierarchical-rnn.html>`_
+* `支持双层序列作为输入的Layer <algorithm/rnn/hierarchical-layer.html>`_
+常见问题
+--------
+* `常见问题 <faq/index.html>`_
--- a/doc_cn/ui/data_provider/mnist_provider.dict.py
+++ b/doc_cn/ui/data_provider/mnist_provider.dict.py
@@ -2,10 +2,10 @@ from paddle.trainer.PyDataProvider2 import *
 # Define a py data provider
-@provider(input_types=[
+@provider(input_types={
-    dense_vector(28 * 28),
+    'pixel': dense_vector(28 * 28),
-    integer_value(10)
+    'label': integer_value(10)
-])
+})
 def process(settings, filename):  # settings is not used currently.
    f = open(filename, 'r')  # open one of training file
@@ -20,6 +20,6 @@ def process(settings, filename):  # settings is not used currently.
            pixels_float.append(float(each_pixel_str))
        # give data to paddle.
-        yield { "pixel": pixels_float, 'label': int(label) }
+        yield {"pixel": pixels_float, 'label': int(label)}
    f.close()  # close file
--- a/doc_cn/ui/data_provider/pydataprovider2.rst
+++ b/doc_cn/ui/data_provider/pydataprovider2.rst
@@ -141,8 +141,6 @@ DataProvider创建的时候执行。这个初始化函数具有如下参数:
   是一个batch size，但是有时为了计算均衡性，可以将一条数据设置成多个batch size
 *  cache 是数据缓存的策略，参考 `cache`_
 *  init_hook 是初始化时调用的函数，参考 `init_hook`_
-*  use_dynamic_order 如果是true的话，可以返回一个dict，key是data_layer的名字，value是特征值。同时，也可以
-   返回一个list或者tuple。如果是false的话，只能够返回list或者tuple
 *  check 设置成true的话，会根据input_types检查数据的合法性。
 *  check_fail_continue 如果设置成true的话，即使在check中数据不合法，也会扔到这条数据，继续训练。 如果
   check是false的话，没有作用。

--- a/paddle/.set_python_path.sh
+++ b/paddle/.set_python_path.sh
@@ -33,7 +33,7 @@ if ! python -c "import paddle" >/dev/null 2>/dev/null; then
    esac
  done
  shift $(($OPTIND - 1))
-  export PYTHONPATH=$PYPATH
+  export PYTHONPATH=$PYPATH:$PYTHONPATH
  $@
 else
  echo "paddle package is already in your PYTHONPATH. But unittest need a clean environment."

--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -2,10 +2,17 @@ set(AVX_SOURCES
    src/hl_math.cc
    src/hl_avx_functions.cc
 )
-set(CUDA_SOURCES
-    src/hl_time.cc
+if(WITH_AVX)
-    src/hl_cpu_functions.cc
+    set(CUDA_SOURCES
-    ${AVX_SOURCES})
+        src/hl_time.cc
+        src/hl_cpu_functions.cc
+        ${AVX_SOURCES})
+else()
+    set(CUDA_SOURCES
+        src/hl_time.cc
+        src/hl_cpu_functions.cc)
+endif()
 set(CUDA_CXX_WITH_GPU_SOURCES
    src/hl_cuda_cublas.cc

--- a/paddle/cuda/include/hl_base.h
+++ b/paddle/cuda/include/hl_base.h
@@ -185,7 +185,7 @@ typedef struct {
    size_t                  nnz;
 } _hl_sparse_matrix_s, *hl_sparse_matrix_s;
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
 /**
 * HPPL data type: real (float or double)
 *

--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -169,7 +169,7 @@ extern void hl_avgpool_forward(
 * @brief   Maximum pool backward.
 *
 * @param[in]   frameCnt    batch size of input image.
- * @param[in]   outGrad     input data.
+ * @param[in]   outGrad     output grad data.
 * @param[in]   channels    number of channel.
 * @param[in]   height      image height.
 * @param[in]   width       image width.
@@ -296,4 +296,34 @@ extern void hl_bilinear_backward(real* inGrad,
                                 const size_t outputW,
                                 const size_t numChannels);
+/**
+ * @brief   MaxOut forward.
+ *
+ * Each output value is the maximum of `groups` input values that lie
+ * `featLen` apart; the position of the winning value inside its group
+ * is stored in idData.
+ *
+ * @param[in]   inData      input data, size * groups values per sample.
+ * @param[out]  outData     output data, size values per sample.
+ * @param[out]  idData      output maxId, in [0, groups), one per output value.
+ * @param[in]   batchSize   batchSize.
+ * @param[in]   size        number of output channels * image height * image width.
+ * @param[in]   featLen     feature length = image height * image width.
+ * @param[in]   groups      number of input values reduced into one output value.
+ */
+extern void hl_maxout_forward(
+    const real* inData, real* outData, int* idData,
+    size_t batchSize, size_t size, size_t featLen, size_t groups);
+/**
+ * @brief   MaxOut backward.
+ *
+ * Each output gradient is added to the single input position that was
+ * selected in the forward pass, as recorded in idData.
+ *
+ * @param[in,out] inGrad    input grad data, size * groups values per sample;
+ *                          gradients are accumulated (+=), not overwritten.
+ * @param[in]   outGrad     output grad data, size values per sample.
+ * @param[in]   idData      output maxId, in [0, groups), one per output value.
+ * @param[in]   batchSize   batchSize.
+ * @param[in]   size        number of output channels * image height * image width.
+ * @param[in]   featLen     feature length = image height * image width.
+ * @param[in]   groups      number of input values reduced into one output value.
+ */
+extern void hl_maxout_backward(
+    real* inGrad, const real* outGrad, const int* idData,
+    size_t batchSize, size_t size, size_t featLen, size_t groups);
 #endif /* HL_CNN_H_ */
--- a/paddle/cuda/include/hl_cpu_gru.cuh
+++ b/paddle/cuda/include/hl_cpu_gru.cuh
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/math/MathFunctions.h"
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
 #define     CBLAS_GEMM     paddle::gemm<float>
 #else
 #define     CBLAS_GEMM     paddle::gemm<double>

--- a/paddle/cuda/include/hl_gpu_functions.cuh
+++ b/paddle/cuda/include/hl_gpu_functions.cuh
@@ -28,7 +28,7 @@ namespace hppl {
    const real min = SIGMOID_THRESHOLD_MIN;
    const real max = SIGMOID_THRESHOLD_MAX;
    real tmp = (a < min) ? min : ((a > max) ? max : a);
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
    return __fdividef(1.0f, 1.0f + __expf(-tmp));
 #else
    return 1.0 / (1.0 + exp(-tmp));
@@ -36,7 +36,7 @@ namespace hppl {
  }
  __device__ static real tanh(const real a) {
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
    return __fdividef(2.0f, (1.0f + __expf(-2.0f*a))) - 1.0f;
 #else
    return (2.0 / (1.0 + exp(-2.0*a))) - 1.0;

--- a/paddle/cuda/include/hl_matrix_base.cuh
+++ b/paddle/cuda/include/hl_matrix_base.cuh
@@ -30,7 +30,7 @@ limitations under the License. */
 #define INLINE   inline
 #endif
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
 #define     DEVICE_FMAX     fmaxf
 #define     DEVICE_FMIN     fminf
 #else

--- a/paddle/cuda/include/hl_matrix_type.cuh
+++ b/paddle/cuda/include/hl_matrix_type.cuh
@@ -21,7 +21,7 @@ limitations under the License. */
 #ifdef __CUDA_ARCH__
 // typedef void*  vecType;
 #include <vector_types.h>
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
 typedef float4 vecType;
 #else
 typedef double2 vecType;
@@ -30,7 +30,7 @@ typedef double2 vecType;
 #include <mmintrin.h>
 #include <xmmintrin.h>
 #include <emmintrin.h>
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
 typedef __m128  vecType;
 #else
 typedef __m128d vecType;

--- a/paddle/cuda/include/hl_sequence.h
+++ b/paddle/cuda/include/hl_sequence.h
@@ -143,7 +143,7 @@ extern void hl_context_projection_backward_weight(real* outputGrad,
 */
 extern void hl_sequence2batch_copy(real *batch,
                                   real *sequence,
-                                   int *batchIndex,
+                                   const int *batchIndex,
                                   int seqWidth,
                                   int batchCount,
                                   bool seq2batch);

--- a/paddle/cuda/include/hl_sse_matrix_kernel.cuh
+++ b/paddle/cuda/include/hl_sse_matrix_kernel.cuh
@@ -20,7 +20,7 @@ limitations under the License. */
 #define VECTOR_SIZE     16
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
 /* number of float in vector */
 #define     VECTOR_LEN      4
 #define     VECTOR_SET      _mm_set_ps1
@@ -41,7 +41,7 @@ inline bool hl_check_align(void *ptr) {
  return hl_check_align(reinterpret_cast<size_t>(ptr));
 }
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
 template <class Agg>
 inline real hl_agg_op(Agg agg, vecType mm) {
  __m128 lo = _mm_unpacklo_ps(mm, mm);

--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -113,4 +113,12 @@ inline void hl_bilinear_backward(real* inGrad,
                                const size_t outputW,
                                const size_t numChannels) {}
+// No-op stub: empty inline definition with the same parameter list as the
+// hl_maxout_forward declaration in hl_cnn.h.
+inline void hl_maxout_forward(
+    const real* inData, real* outData, int* idData,
+    size_t batchSize, size_t size, size_t featLen, size_t groups) {}
+// No-op stub: empty inline definition with the same parameter list as the
+// hl_maxout_backward declaration in hl_cnn.h.
+inline void hl_maxout_backward(
+    real* inGrad, const real* outGrad, const int* idData,
+    size_t batchSize, size_t size, size_t featLen, size_t groups) {}
 #endif  // HL_CNN_STUB_H_
--- a/paddle/cuda/include/stub/hl_sequence_stub.h
+++ b/paddle/cuda/include/stub/hl_sequence_stub.h
@@ -62,7 +62,7 @@ inline void hl_context_projection_backward_weight(real* outputGrad,
 inline void hl_sequence2batch_copy(real *batch,
                                   real *sequence,
-                                   int *batchIndex,
+                                   const int *batchIndex,
                                   int seqWidth,
                                   int batchCount,
                                   bool seq2batch) {}

--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -662,4 +662,63 @@ void hl_bilinear_backward(real* inGrad,
    threadNum, inGrad, inImgH, inImgW, inputH, inputW, outGrad,
    outImgH, outImgW, outputH, outputW, numChannels, ratioH, ratioW);
  CHECK_SYNC("hl_bilinear_backward failed");
 }
\ No newline at end of file
+/**
+ * MaxOut forward kernel: one thread per output element.
+ *
+ * Thread `index` handles sample index / size at within-sample offset
+ * index % size, which is split into a channel (i / featLen) and a feature
+ * position (i % featLen). It scans the `groups` input values that start at
+ * data_idx and lie featLen apart, writes the largest to outData[index] and
+ * the group offset of that value to idData[index].
+ *
+ * nthreads is the total number of output elements; threads at or beyond it
+ * do nothing.
+ */
+__global__ void maxoutFpCompute(size_t nthreads, const real * inData,
+                                real * outData, int* idData, 
+                                size_t size, size_t featLen, size_t groups) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if(index < nthreads) {
+    size_t batch_idx = index / size;
+    size_t i = index % size;
+    size_t channel_idx = i / featLen;
+    size_t feat_idx = i % featLen;
+    // First candidate of this output's group: the input holds `groups` times
+    // as many values as the output, hence the scaling by groups.
+    size_t data_idx = (batch_idx * size + channel_idx * featLen) * groups + feat_idx;
+    real max = inData[data_idx];
+    int maxId = 0;
+    // Strict '>' keeps the earliest candidate when values tie.
+    for (size_t g = 1; g < groups; ++g) {
+      real tmp = inData[data_idx + g * featLen];
+      if (tmp > max) {
+        max = tmp;
+        maxId = g;
+      }
+    }
+    outData[index] = max;
+    idData[index] = maxId;
+  }
+}
+/**
+ * Launch maxoutFpCompute over all batchSize * size output elements, using
+ * 1024 threads per block on STREAM_DEFAULT.
+ *
+ * Returns without launching when there are no output elements.
+ */
+void hl_maxout_forward(const real* inData, real* outData,
+                       int* idData, size_t batchSize, size_t size,
+                       size_t featLen, size_t groups) {
+  int num_kernels = size * batchSize;
+  // Nothing to compute. A launch with a zero-sized grid is rejected by CUDA
+  // as an invalid configuration, which CHECK_SYNC would turn into a failure.
+  if (num_kernels == 0) {
+    return;
+  }
+  // Round up so the last, possibly partial, block is included.
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+  maxoutFpCompute<<< blocks, 1024, 0, STREAM_DEFAULT>>>(
+    num_kernels, inData, outData, idData, size, featLen, groups);
+  CHECK_SYNC("hl_maxout_forward failed");
+}
+/**
+ * MaxOut backward kernel: one thread per output-gradient element.
+ *
+ * Thread `index` reads the group offset stored in idData for its output
+ * element and adds that element's outGrad to the matching inGrad entry:
+ * inside the sample's inGrad slice (offset batch_idx * size * groups), at
+ * position (channel_idx * groups + id) * featLen + feat_idx.
+ * inGrad is accumulated into (+=), not overwritten.
+ *
+ * nthreads is the total number of output elements; threads at or beyond it
+ * do nothing.
+ */
+__global__ void maxoutBpCompute(size_t nthreads, real* inGrad,
+                                const real* outGrad, const int* idData,
+                                size_t size, size_t featLen, size_t groups) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if(index < nthreads) {
+    size_t batch_idx = index / size;
+    size_t i = index % size;
+    size_t channel_idx = i / featLen;
+    size_t feat_idx = i % featLen;
+    // Start of this sample in the output-sized arrays (outGrad, idData).
+    size_t newIndex = batch_idx * size;
+    // Offset, within the sample's input slice, of the value recorded in idData.
+    size_t gradIdx = (channel_idx * groups + (idData + newIndex)[i]) * featLen + feat_idx;
+    (inGrad + newIndex * groups)[gradIdx] += (outGrad + newIndex)[i];
+  }
+}
+/**
+ * Launch maxoutBpCompute over all batchSize * size output-gradient elements,
+ * using 1024 threads per block on STREAM_DEFAULT.
+ *
+ * Returns without launching when there are no output elements.
+ */
+void hl_maxout_backward(real* inGrad, const real* outGrad,
+                        const int* idData, size_t batchSize, size_t size,
+                        size_t featLen, size_t groups) {
+  int num_kernels = size * batchSize;
+  // Nothing to propagate. A launch with a zero-sized grid is rejected by CUDA
+  // as an invalid configuration, which CHECK_SYNC would turn into a failure.
+  if (num_kernels == 0) {
+    return;
+  }
+  // Round up so the last, possibly partial, block is included.
+  int blocks = (num_kernels + 1024 - 1) / 1024;
+  maxoutBpCompute<<< blocks, 1024, 0, STREAM_DEFAULT >>>(
+    num_kernels, inGrad, outGrad, idData, size, featLen, groups);
+  CHECK_SYNC("hl_maxout_backward failed");
+}
--- a/paddle/cuda/src/hl_cuda_cublas.cc
+++ b/paddle/cuda/src/hl_cuda_cublas.cc
@@ -84,7 +84,7 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
 } /* namespace dynload */
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
 #define     CUBLAS_GEAM     dynload::cublasSgeam
 #define     CUBLAS_GEMV     dynload::cublasSgemv
 #define     CUBLAS_GEMM     dynload::cublasSgemm

--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@@ -340,7 +340,7 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc,
        (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
    CHECK_NOTNULL(hl_desc);
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
    cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
 #else
    cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
@@ -373,7 +373,7 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) {
        (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
    CHECK_NOTNULL(hl_desc);
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
    cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
 #else
    cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
@@ -611,7 +611,7 @@ void hl_create_filter_descriptor(hl_filter_descriptor* filter,
    CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc));
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
    cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
 #else
    cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
@@ -921,7 +921,7 @@ void hl_softmax_forward(real *input,
                        int height,
                        int width)
 {
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
    cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
 #else
    cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
@@ -955,7 +955,7 @@ void hl_softmax_backward(real *output_value,
                         int height,
                         int width)
 {
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
    cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
 #else
    cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;

--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -626,7 +626,7 @@ void hl_specify_devices_start(int* device, int number) {
 void hl_rand(real *dest_d, size_t num) {
  pthread_mutex_lock(t_resource.gen_mutex);
  CHECK_EQ(
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
  dynload::curandGenerateUniform(t_resource.gen, dest_d, num),
 #else
  dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num),

--- a/paddle/cuda/src/hl_cuda_matrix.cu
+++ b/paddle/cuda/src/hl_cuda_matrix.cu
@@ -47,7 +47,7 @@ void hl_matrix_add(real *A_d,
  CHECK_SYNC("hl_matrix_add failed");
 }
-#ifdef HPPL_TYPE_DOUBLE
+#ifdef PADDLE_TYPE_DOUBLE
    #define THRESHOLD   128
 #else
    #define THRESHOLD   64
@@ -102,7 +102,7 @@ void subMaxAndExp(real* I,
      val = -THRESHOLD;
    }
    I[nextIdx] = val;
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
    O[nextIdx] = __expf(val);
 #else
    O[nextIdx] = exp(val);

--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
@@ -374,7 +374,7 @@ template<int blockDimX, int blockDimY, int gridDimX, bool seq2batch, bool isAdd>
 __global__
 void KeSequence2Batch(real *batch,
                      real *sequence,
-                      int *batchIndex,
+                      const int *batchIndex,
                      int seqWidth,
                      int batchCount) {
  int idx = threadIdx.x;
@@ -405,7 +405,7 @@ void KeSequence2Batch(real *batch,
 void hl_sequence2batch_copy(real *batch,
                            real *sequence,
-                            int *batchIndex,
+                            const int *batchIndex,
                            int seqWidth,
                            int batchCount,
                            bool seq2batch) {

--- a/paddle/cuda/src/hl_cuda_sparse.cuh
+++ b/paddle/cuda/src/hl_cuda_sparse.cuh
@@ -355,7 +355,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d,
 }
 /* best perf */
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
 #define CU_CSCMM_THREAD_M_BEST          9
 #else
 #define CU_CSCMM_THREAD_M_BEST          4

--- a/paddle/gserver/dataproviders/DataProvider.cpp
+++ b/paddle/gserver/dataproviders/DataProvider.cpp
@@ -57,7 +57,8 @@ void BufferBatch::clone(DataBatch* srcBatch, bool useGpu) {
  }
 }
-DoubleBuffer::DoubleBuffer(DataProvider* dataPool, bool useGpu,
+DoubleBuffer::DoubleBuffer(DataProvider *dataPool,
+                           bool useGpu,
                           int64_t batchSize) {
  batchSize_ = batchSize;
  dataPool_ = dataPool;
@@ -110,6 +111,9 @@ void DoubleBuffer::removeOneBatch(DataBatch* dataBatch) {
 }
 void DoubleBuffer::insertOneBatch(DataBatch* batch) {
+  while (!bufferQueue_->waitNotEmptyFor(2 /* seconds */)) {  // time out
+    if (stopping_) return;
+  }
  BufferBatch* bufBatch = bufferQueue_->dequeue();
  // clone and copy the data from an Threadlocal Variable
  bufBatch->clone(batch, useGpu_);
@@ -138,7 +142,7 @@ void DoubleBuffer::asyncLoadBatch() {
        actualSize = dataPool_->getNextBatchInternal(batchSize_, &newBatch);
      }
      insertOneBatch(&newBatch);
-    } while (actualSize > 0);
+    } while (actualSize > 0 && !stopping_);
  }
 }

--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
@@ -259,7 +259,9 @@ typedef Queue<BufferBatch*> BufferBatchQueue;
 class DoubleBuffer {
 public:
-  DoubleBuffer(DataProvider* dataPool, bool useGpu, int64_t batchSize = 0);
+  DoubleBuffer(DataProvider* dataPool,
+               bool useGpu,
+               int64_t batchSize = 0);
  virtual ~DoubleBuffer();
  void removeOneBatch(DataBatch* dataBatch);
@@ -308,7 +310,8 @@ public:
  /**
   * @brief create only used for unittest.
   */
-  inline static DataProvider* create(const DataConfig &config, bool useGpu) {
+  inline static DataProvider* create(const DataConfig &config,
+                                     bool useGpu = FLAGS_use_gpu) {
    return create(config, ModelConfig(), useGpu);
  }
@@ -348,7 +351,6 @@ public:
   */
  virtual void reset() {
    if (doubleBuffer_ != nullptr) {
-      LOG(INFO) << "the double-buffer is starting ...";
      doubleBuffer_->startAsyncLoad();
    }
  }

--- a/paddle/gserver/dataproviders/PyDataProvider2.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp
@@ -14,13 +14,20 @@ limitations under the License. */
 #ifndef PADDLE_NO_PYTHON
+#include <Python.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <unordered_set>
 #include <list>
+#include <numpy/numpyconfig.h>
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+#include <numpy/ndarrayobject.h>
 #include "DataProvider.h"
 #include "paddle/utils/PythonUtil.h"
+#include "paddle/utils/Locks.h"
+#include "paddle/utils/Stat.h"
 namespace paddle {
@@ -202,7 +209,10 @@ public:
  PyDataProvider2(const DataConfig& config,
                  const ModelConfig& modelConfig,
                  bool useGpu)
-    :DataProvider(config, useGpu), callingContextCreated_(2) {
+    :DataProvider(config, useGpu),
+      callingContextCreated_(2) {
+    if (PyArray_API == NULL)
+      import_array();
    auto& args = config.load_data_args();
    PyObjectPtr kwargs = PyObjectPtr(PyDict_New());
    if (!args.empty()) {
@@ -246,8 +256,7 @@ private:
                       PyObjectPtr && kwargs) {
    LOG(INFO) << "loading dataprovider " << model <<"::" << className;
-    PyObjectPtr module(PyImport_ImportModule(model.c_str()));
+    PyObjectPtr module = py::import(model);
-    CHECK_PY(module) << "Cannot imort module " << model.c_str();
    PyObjectPtr moduleDict(PyModule_GetDict(module.get()));
    CHECK_PY(moduleDict) << "Invoke module.__dict__ error";
    PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(),
@@ -455,6 +464,7 @@ private:
  std::condition_variable pushCV_;
  std::condition_variable pullCV_;
  std::mutex mtx_;
  ThreadBarrier callingContextCreated_;
  std::unique_ptr<IPyDataProviderCache> cache_;
@@ -497,8 +507,8 @@ public:
   * Resetting the PyDataProvider. May start reading thread here.
   */
  virtual void reset() {
-    DataProvider::reset();
    resetImpl(true);
+    DataProvider::reset();
  }
  /**
@@ -519,6 +529,7 @@ public:
   * Loading a batch of data.
   */
  int64_t getNextBatchInternal(int64_t size_, DataBatch *batch) {
+    REGISTER_TIMER("PyDP2.getNextBatchInternal")
    CHECK_GE(size_, 0);
    size_t size = (size_t) size_;
    if (loadThread_) {  // loading from thread should wait for data pool ready.
@@ -699,10 +710,22 @@ public:
   */
  virtual void fill(Argument &argument, PyObject *obj) {
    real* dat = argument.value->getData() + height_ * headerPtr_->dim;
-    py::SequenceHelper s(obj);
+    if (PyArray_Check(obj)) {
-    // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy.
+        auto dtype = PyArray_DTYPE((PyArrayObject*)obj);
-    for (size_t i=0; i < headerPtr_->dim; ++i) {
+        // dtype->kind is 'f' for every floating-point dtype, whereas
+        // dtype->type is 'f' for float32 only ('d' for float64). Together
+        // with elsize this accepts exactly the dtype that matches `real`,
+        // in both float and double builds.
+        if (dtype->kind == 'f' && dtype->elsize == sizeof(real)) {
-      dat[i] = (real) s.getDouble(i);
+            real * data = (real*)PyArray_DATA((PyArrayObject*)obj);
+            auto sz = PyArray_SIZE((PyArrayObject*)obj);
+            // The destination row holds headerPtr_->dim values; refuse to
+            // write past it or to leave it partially filled.
+            CHECK_EQ(static_cast<size_t>(sz),
+                     static_cast<size_t>(headerPtr_->dim))
+                << "The yielded array size does not match the input dim";
+            // NOTE(review): the buffer is read linearly, which assumes a
+            // C-contiguous array -- TODO confirm against callers.
+            std::copy(data, data + sz, dat);
+        } else {
+            LOG(FATAL) << "You should yield float" << sizeof(real) * 8
+                       << " array";
+        }
+     } else {
+        py::SequenceHelper s(obj);
+        // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy.
+        for (size_t i=0; i < headerPtr_->dim; ++i) {
+          dat[i] = (real) s.getDouble(i);
+        }
    }
    ++height_;
  }

--- a/paddle/gserver/evaluators/ChunkEvaluator.cpp
+++ b/paddle/gserver/evaluators/ChunkEvaluator.cpp
@@ -75,7 +75,6 @@ class ChunkEvaluator : public Evaluator {
 public:
  virtual void init(const EvaluatorConfig& config) {
-    CHECK(!FLAGS_use_gpu) << "Not supported";
    Evaluator::init(config);
    if (config.chunk_scheme() == "IOB") {
      numTagTypes_ = 2;
@@ -137,6 +136,7 @@ public:
    CHECK_EQ(arguments.size(), (size_t)2);
    IVectorPtr& output = arguments[0].ids;
    IVectorPtr& label = arguments[1].ids;
+    CHECK(!output->useGpu() && !label->useGpu()) << "Not supported";
    auto sequenceStartPositions =
        arguments[1].sequenceStartPositions->getVector(false);
    CHECK_EQ(output->getSize(), label->getSize());

--- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
@@ -813,7 +813,6 @@ void TrainerThread::mergeGradSparse(
      para->getMat(PARAMETER_GRADIENT).get());
  std::vector<uint32_t>& ids = mainMat->getIds(threadId_);
-  ids.clear();
  for (auto slaveParams : slaveParameters) {
    SparseRowCpuMatrix* mat =
        dynamic_cast<SparseRowCpuMatrix*>((*slaveParams)[pid]

--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -544,6 +544,12 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
    const std::vector<Argument> inArgs;
    std::vector<Argument> outArgs;
    frames_[i]->forward(inArgs, &outArgs, passType);
+    if (hasSubseq) {
+      for (auto& outFrameLine : outFrameLines_) {
+        CHECK(outFrameLine.frames[i]->getOutput().sequenceStartPositions)
+          << "In hierachical RNN, all out links should be from sequences.";
+      }
+    }
  }
  if (evaluator_ && passType == PASS_TEST) {
    this->eval(evaluator_.get());
@@ -635,16 +641,15 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinkId,
  std::vector<int> sequenceStartPositions;
  const int* subSequenceStartPositions = nullptr;
-  if (hasSubseq) {                    // for sequenceScatterAgentLayer
+  if (hasSubseq) {  // for sequenceScatterAgentLayer
-    subSequenceStartPositions =
+    subSequenceStartPositions = input.subSequenceStartPositions->getData(false);
-        input.subSequenceStartPositions->getData(false);
    inlinkInfo->seqStartPosIndex.clear();
    inlinkInfo->seqStartPosIndex.push_back(0);  // first seqStartPosIndex = 0
  }
  // maxSequenceLength_: max topLevelLength in allsamples
  for (int i = 0; i < maxSequenceLength_; ++i) {
    if (hasSubseq) {
-      sequenceStartPositions.push_back(0);            // first element = 0
+      sequenceStartPositions.push_back(0);  // first element = 0
    }
    int numSeqs = 0;
    for (size_t j = 0; j < numSequences; ++j) {
@@ -676,9 +681,9 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinkId,
  }
  if (hasSubseq) {
    // inFrameLine create sequenceStartPositions one time
-    CHECK_EQ(sequenceStartPositions.size(),
+    CHECK_EQ(
-             static_cast<size_t>(maxSequenceLength_ +
+        sequenceStartPositions.size(),
-                                 input.getNumSubSequences()));
+        static_cast<size_t>(maxSequenceLength_ + input.getNumSubSequences()));
    CHECK_EQ(inlinkInfo->seqStartPosIndex.size(),
             static_cast<size_t>(maxSequenceLength_ + 1));
    createSeqPos(sequenceStartPositions, &inlinkInfo->sequenceStartPositions);
@@ -1102,10 +1107,12 @@ size_t RecurrentGradientMachine::beamShrink(std::vector<Path>& newPaths,
                   newPaths.end(), Path::greaterPath);
  newPaths.resize(totalExpandCount + minNewPathSize);
-  real minPathLogProb = std::min_element(newPaths.end() - minNewPathSize,
+  real minPathLogProb =
-                                         newPaths.end())->logProb;
+      std::min_element(newPaths.end() - minNewPathSize, newPaths.end())
-  real maxPathLogProb = std::max_element(newPaths.end() - minNewPathSize,
+          ->logProb;
-                                         newPaths.end())->logProb;
+  real maxPathLogProb =
+      std::max_element(newPaths.end() - minNewPathSize, newPaths.end())
+          ->logProb;
  // Remove the already formed paths that are relatively short
  finalPaths_[seqId].erase(

--- a/paddle/gserver/layers/AgentLayer.cpp
+++ b/paddle/gserver/layers/AgentLayer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "AgentLayer.h"
 #include "paddle/utils/Logging.h"
@@ -62,8 +61,8 @@ void SequenceAgentLayer::forward(PassType passType) {
  // get Arguments from real layers
  if (numSamples_ > 0 && numSamples_ < realNumSequences) {
-    int numRows = realOutput.sequenceStartPositions->
+    int numRows =
-                  getData(false)[numSamples_];
+        realOutput.sequenceStartPositions->getData(false)[numSamples_];
    CHECK(!realOutput.ids) << "Not supported";
    output_.subArgFrom(realOutput, /* offset */ 0, numRows, getSize(), useGpu_,
                       /* trans */ false, /* seqFlag */ true,
@@ -141,8 +140,8 @@ void ScatterAgentLayer::forward(PassType passType) {
  int width = this->getSize();
  if (realOutArg_.value || realOutArg_.ids) {
-    output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_,
+    output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, width,
-                       width, useGpu_);
+                       useGpu_);
  } else {  // used in generation
    if (realLayer_->getOutput().ids) {
      IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_);
@@ -224,8 +223,8 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
  if (realOutArg_.value || realOutArg_.ids) {
    CHECK(realOutArg_.sequenceStartPositions);
-    output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_,
+    output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, width,
-                       width, useGpu_, /* trans */ false, /* seqFlag */ true,
+                       useGpu_, /* trans */ false, /* seqFlag */ true,
                       /* seqStart */ seqStartPosIndex_,
                       /* seqSize */ numSequences_);
  } else {
@@ -249,11 +248,12 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
    CHECK_NE(input.sequenceStartPositions.get(),
             output_.sequenceStartPositions.get());
    ICpuGpuVector::resizeOrCreate(output_.sequenceStartPositions,
-                                   numSequences + 1, false);
+                                  numSequences + 1, false);
    int* outStarts = output_.sequenceStartPositions->getMutableData(false);
-    IVector::resizeOrCreate(cpuInputStartPos_, height, false);
+    ICpuGpuVector::resizeOrCreate(inputStartPos_, height, false);
-    int* inStarts = cpuInputStartPos_->getData();
+    int* inStarts = inputStartPos_->getMutableData(false);
    size_t offsetOut = 0;
    for (size_t i = 0; i < numSequences; ++i) {
      outStarts[i] = offsetOut;
@@ -266,13 +266,8 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
    }
    outStarts[numSequences] = offsetOut;
-    if (useGpu_) {
+    outputValue->copyByRowIndex(*input.value,
-      IVector::resizeOrCreate(inputStartPos_, height, true);
+                                *inputStartPos_->getVector(useGpu_));
-      inputStartPos_->copyFrom(*cpuInputStartPos_, HPPL_STREAM_DEFAULT);
-    } else {
-      inputStartPos_ = cpuInputStartPos_;
-    }
-    outputValue->copyByRowIndex(*input.value, *inputStartPos_);
  }
 }

--- a/paddle/gserver/layers/AgentLayer.h
+++ b/paddle/gserver/layers/AgentLayer.h
@@ -191,11 +191,7 @@ class SequenceScatterAgentLayer : public ScatterAgentLayer {
 protected:
  // use to store expanded cpuStartPositions or subSequenceStartPositions
  // of real layer.
-  IVectorPtr cpuInputStartPos_;
+  ICpuGpuVectorPtr inputStartPos_;
-  // point to cpuInputStartPos_ when useGpu_ is false
-  // copy from cpuInputStartPos_ when useGpu_ is true
-  IVectorPtr inputStartPos_;
 public:
  explicit SequenceScatterAgentLayer(const LayerConfig& config)

--- a/paddle/gserver/layers/AverageLayer.cpp
+++ b/paddle/gserver/layers/AverageLayer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "AverageLayer.h"
 #include "paddle/utils/Logging.h"
@@ -25,13 +24,8 @@ REGISTER_LAYER(average, AverageLayer);
 bool AverageLayer::init(const LayerMap& layerMap,
                        const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
+  SequencePoolLayer::init(layerMap, parameterMap);
-  Layer::init(layerMap, parameterMap);
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
  dataMtx_ = Matrix::create(nullptr, 1, 1, false, useGpu_);
  outMtx_ = Matrix::create(nullptr, 1, getSize(), false, useGpu_);
  // average strategy
@@ -44,57 +38,15 @@ bool AverageLayer::init(const LayerMap& layerMap,
  } else {
    LOG(FATAL) << "Unknown average strategy: " << config_.average_strategy();
  }
-  // transform to which sequence type
-  if (config_.trans_type() == "non-seq") {
-    type_ = kNonSeq;
-  } else if (config_.trans_type() == "seq") {
-    type_ = kSeq;
-  } else {
-    LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
-  }
-  setNeedSequenceInfo(false);
  return true;
 }
 void AverageLayer::forward(PassType passType) {
-  Layer::forward(passType);
+  SequencePoolLayer::forward(passType);
-  // average layer should have exactly 1 input
-  CHECK_EQ(1U, inputLayers_.size());
-  size_t dim = getSize();
-  const Argument& input = getInput(0);
-  int64_t newBatchSize =
-      type_ ? input.getNumSubSequences() : input.getNumSequences();
-  ICpuGpuVectorPtr startPositions =
-      type_ ? input.subSequenceStartPositions
-            : input.sequenceStartPositions;
-  const int* starts = startPositions->getData(false);
-  size_t numSequences = startPositions->getSize() - 1;
-  // check
-  CHECK_EQ(numSequences, (size_t)newBatchSize);
-  CHECK_EQ(starts[numSequences], input.getBatchSize());
-  if (type_) {
-    // when trans_type = seq, input must hasSubseq
-    CHECK_EQ(input.hasSubseq(), 1UL);
-  }
-  CHECK_EQ(dim, input.value->getWidth());
-  resetOutput(newBatchSize, dim);
-  auto startsPos = startPositions->getVector(useGpu_);
  MatrixPtr inputValue = getInputValue(0);
-  getOutputValue()->sequenceAvgForward(*inputValue, *startsPos, mode_);
+  getOutputValue()->sequenceAvgForward(
+      *inputValue, *startPositions_->getVector(useGpu_), mode_);
-  /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
-   * thus, in this case, output_ has no sequenceStartPositions.
-   * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
-   * case, we should compute the new sequenceStartPositions.
-  */
-  if (type_) {
-    output_.degradeSequence(input, useGpu_);
-  }
  /* add the bias-vector AFTER average operation */
  if (biases_.get() != NULL) {
@@ -106,26 +58,16 @@ void AverageLayer::forward(PassType passType) {
 }
 void AverageLayer::backward(const UpdateCallback& callback) {
-  const Argument& input = getInput(0);
+  SequencePoolLayer::backward(callback);
-  ICpuGpuVectorPtr startPositions =
-      type_ ? input.subSequenceStartPositions
-            : input.sequenceStartPositions;
-  const int* starts = startPositions->getData(false);
-  /* Do derivation */ { backwardActivation(); }
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-    // Increasing the number of gradient
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
+  const int* starts = startPositions_->getData(false);
  MatrixPtr grad = getInputGrad(0);
  if (grad) {
    size_t dim = getSize();
    real* gradientData = getInputGrad(0)->getData();
    real* gradient = getOutputGrad()->getData();
-    size_t numSequences = startPositions->getSize() - 1;
+    size_t numSequences = startPositions_->getSize() - 1;
    for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
      // TODO(Dangqingqing) optimization for GPU
      int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];

--- a/paddle/gserver/layers/AverageLayer.h
+++ b/paddle/gserver/layers/AverageLayer.h
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
-#include "Layer.h"
+#include "SequencePoolLayer.h"
 #include "paddle/math/Matrix.h"
 namespace paddle {
@@ -23,20 +22,21 @@ namespace paddle {
 /**
 * A layer for "internal average" for sequence input.
 * Input: one or more sequences. Each sequence contains some instances.
- * If AverageLevel = kNonSeq:
+ * If SequenceLevel = kNonSeq:
 *    Output: output size is the number of input sequences (NOT input instances)
 *    output[i] = average_{for each instance in this sequence}{input[i]}
- * If AverageLevel = kSeq:
+ * If SequenceLevel = kSeq:
 *    Check input sequence must has sub-sequence
 *    Output: output size is the number of input sub-sequences
 *    output[i] = average_{for each instance in this sub-sequence}{input[i]}
+ *
+ * The config file api is pooling_layer.
 */
+class AverageLayer : public SequencePoolLayer {
-class AverageLayer : public Layer {
 public:
  enum AverageStrategy { kAverage = 0, kSum = 1, kAverageSquareRootN = 2 };
-  enum AverageLevel { kNonSeq = 0, kSeq = 1 };
+  explicit AverageLayer(const LayerConfig& config)
-  explicit AverageLayer(const LayerConfig& config) : Layer(config) {}
+      : SequencePoolLayer(config) {}
  ~AverageLayer() {}
@@ -46,11 +46,8 @@ public:
  void backward(const UpdateCallback& callback = nullptr);
 protected:
-  std::unique_ptr<Weight> biases_;
  MatrixPtr outMtx_;
  MatrixPtr dataMtx_;
  int mode_;
-  int type_;
 };
 }  // namespace paddle
--- a/paddle/gserver/layers/ExpandLayer.cpp
+++ b/paddle/gserver/layers/ExpandLayer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "ExpandLayer.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
@@ -53,9 +52,8 @@ void ExpandLayer::forward(PassType passType) {
  const Argument& shapeInput = getInput(1);
  const Argument& dataInput = getInput(0);
  size_t outputBatchSize = shapeInput.getBatchSize();
-  auto startPositions =
+  auto startPositions = type_ ? shapeInput.subSequenceStartPositions
-      type_ ? shapeInput.subSequenceStartPositions
+                              : shapeInput.sequenceStartPositions;
-            : shapeInput.sequenceStartPositions;
  size_t numSequences = startPositions->getSize() - 1;
  const int* starts = startPositions->getData(false);
@@ -71,8 +69,7 @@ void ExpandLayer::forward(PassType passType) {
  // set output sequence info as shape sequence
  output_.sequenceStartPositions = shapeInput.sequenceStartPositions;
  if (shapeInput.hasSubseq()) {
-    output_.subSequenceStartPositions =
+    output_.subSequenceStartPositions = shapeInput.subSequenceStartPositions;
-        shapeInput.subSequenceStartPositions;
  }
  // reserve output: Expand output to batchsize of sequence data.
@@ -81,8 +78,8 @@ void ExpandLayer::forward(PassType passType) {
  MatrixPtr inputValue = getInputValue(0);
  MatrixPtr outputValue = getOutputValue();
-  IVector::resizeOrCreate(cpuExpandStartsPos_, outputBatchSize, false);
+  ICpuGpuVector::resizeOrCreate(expandStartsPos_, outputBatchSize, false);
-  int* expandStarts = cpuExpandStartsPos_->getData();
+  int* expandStarts = expandStartsPos_->getMutableData(false);
  for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
    int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
    for (int j = 0; j < sequenceLength; j++) {
@@ -90,15 +87,8 @@ void ExpandLayer::forward(PassType passType) {
    }
  }
-  if (useGpu_) {
+  outputValue->copyByRowIndex(*inputValue,
-    // TODO(Dangqingqing) move copyFrom
+                              *expandStartsPos_->getVector(useGpu_));
-    IVector::resizeOrCreate(expandStartsPos_, outputBatchSize, true);
-    expandStartsPos_->copyFrom(*cpuExpandStartsPos_, HPPL_STREAM_DEFAULT);
-  } else {
-    expandStartsPos_ = cpuExpandStartsPos_;
-  }
-  outputValue->copyByRowIndex(*inputValue, *expandStartsPos_);
  if (biases_.get() != NULL) {
    outputValue->addBias(*(biases_->getW()), 1);
@@ -108,16 +98,15 @@ void ExpandLayer::forward(PassType passType) {
 void ExpandLayer::backward(const UpdateCallback& callback) {
  if (biases_ && biases_->getWGrad()) {
    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-     /* Increasing the number of gradient */
+    /* Increasing the number of gradient */
    biases_->getParameterPtr()->incUpdate(callback);
  }
  if (!getInputGrad(0)) return;
  MatrixPtr inputGrad = getInputGrad(0);
  MatrixPtr outputGrad = getOutputGrad();
-  auto cpuSeqStartPos =
+  auto cpuSeqStartPos = type_ ? getInput(1).subSequenceStartPositions
-      type_ ? getInput(1).subSequenceStartPositions
+                              : getInput(1).sequenceStartPositions;
-            : getInput(1).sequenceStartPositions;
  size_t numSequences = cpuSeqStartPos->getSize() - 1;
  const int* starts = cpuSeqStartPos->getData(false);

--- a/paddle/gserver/layers/ExpandLayer.h
+++ b/paddle/gserver/layers/ExpandLayer.h
@@ -44,14 +44,9 @@ protected:
  enum ExpandLevel { kNonSeq = 0, kSeq = 1 };
  /// store the ExpandLevel
  int type_;
-  // TODO(luotao) use ICpuGpuVectorPtr to merge cpuExpandStartsPos_
-  // and expandStartsPos_
  /// expanded sequenceStartPositions or subSequenceStartPositions
  /// of input[1]
-  IVectorPtr cpuExpandStartsPos_;
+  ICpuGpuVectorPtr expandStartsPos_;
-  /// point to cpuExpandStartsPos_ when useGpu_ is false,
-  /// copy from cpuExpandStartsPos_ when useGpu_ is true
-  IVectorPtr expandStartsPos_;
 public:
  explicit ExpandLayer(const LayerConfig& config) : Layer(config) {}

--- a/paddle/gserver/layers/MaxLayer.cpp
+++ b/paddle/gserver/layers/MaxLayer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "MaxLayer.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
@@ -21,55 +20,11 @@ namespace paddle {
 REGISTER_LAYER(max, MaxLayer);
-bool MaxLayer::init(const LayerMap& layerMap,
-                    const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-  // transform to which sequence type
-  if (config_.trans_type() == "non-seq") {
-    type_ = kNonSeq;
-  } else if (config_.trans_type() == "seq") {
-    type_ = kSeq;
-  } else {
-    LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
-  }
-  setNeedSequenceInfo(false);
-  return true;
-}
 void MaxLayer::forward(PassType passType) {
-  Layer::forward(passType);
+  SequencePoolLayer::forward(passType);
-  // max layer should have exactly 1 input
-  CHECK_EQ(1U, inputLayers_.size());
-  size_t dim = getSize();
-  const Argument& input = getInput(0);
-  int64_t newBatchSize =
-      type_ ? input.getNumSubSequences() : input.getNumSequences();
-  ICpuGpuVectorPtr startPositions =
-      type_ ? input.subSequenceStartPositions
-            : input.sequenceStartPositions;
-  auto starts = startPositions->getVector(useGpu_);
-  size_t numSequences = startPositions->getSize() - 1;
-  CHECK_EQ(dim, input.value->getWidth());
+  IVector::resizeOrCreate(maxIndex_, newBatchSize_ * getSize(),
-  CHECK_EQ(numSequences, (size_t)newBatchSize);
+                          useGpu(deviceId_));
-  CHECK_EQ(startPositions->getData(false)[numSequences], input.getBatchSize());
-  if (type_) {
-    // when trans_type = seq, input must hasSubseq
-    CHECK_EQ(input.hasSubseq(), 1UL);
-  }
-  // reset output: resize to "num of sequences", not "batch size".
-  resetOutput(newBatchSize, dim);
-  IVector::resizeOrCreate(maxIndex_, newBatchSize * dim, useGpu(deviceId_));
  maxIndex_->zeroMem();
  MatrixPtr inputValue = getInputValue(0);
@@ -77,16 +32,8 @@ void MaxLayer::forward(PassType passType) {
  {
    REGISTER_TIMER_INFO("MaxLayerForward", getName().c_str());
-    outputValue->maxSequenceForward(*inputValue, *starts, *maxIndex_);
+    outputValue->maxSequenceForward(
-  }
+        *inputValue, *startPositions_->getVector(useGpu_), *maxIndex_);
-  /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
-   * thus, in this case, output_ has no cpuSequenceStartPositions.
-   * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
-   * case, we should compute the new cpuSequenceStartPositions.
-  */
-  if (type_) {
-    output_.degradeSequence(input, useGpu_);
  }
  if (config_.output_max_index()) {
@@ -104,24 +51,14 @@ void MaxLayer::forward(PassType passType) {
 void MaxLayer::backward(const UpdateCallback& callback) {
  CHECK(!config_.output_max_index())
      << "backward is not available when output_max_index is set";
-  /* Do derivation */ { backwardActivation(); }
+  SequencePoolLayer::backward(callback);
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-    // Increasing the number of gradient
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
  MatrixPtr inputGrad = getInputGrad(0);
  MatrixPtr outputGrad = getOutputGrad();
  if (inputGrad) {
-    ICpuGpuVectorPtr starts =
-        type_ ? getInput(0).subSequenceStartPositions
-              : getInput(0).sequenceStartPositions;
    REGISTER_TIMER_INFO("MaxLayerBackward", getName().c_str());
-    inputGrad->maxSequenceBackward(*outputGrad,
+    inputGrad->maxSequenceBackward(
-        *(starts->getVector(useGpu_)), *maxIndex_);
+        *outputGrad, *(startPositions_->getVector(useGpu_)), *maxIndex_);
  }
 }

--- a/paddle/gserver/layers/MaxLayer.h
+++ b/paddle/gserver/layers/MaxLayer.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
-#include "Layer.h"
+#include "SequencePoolLayer.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/utils/ThreadLocal.h"
@@ -24,29 +24,30 @@ namespace paddle {
 /**
 * A layer for "internal max" for sequence input.
 * Input: one or more sequences. Each sequence contains some instances.
- * If MaxLevel = kNonSeq:
+ * If SequenceLevel = kNonSeq:
 *    Output: output size is the number of input sequences (NOT input instances)
 *    output[i] = max_{for each instance in this sequence}{input[i]}
- * If MaxLevel = kSeq:
+ * If SequenceLevel = kSeq:
 *    Check input sequence must has sub-sequence
 *    Output: output size is the number of input sub-sequences
 *    output[i] = max_{for each instance in this sub-sequence}{input[i]}
+ *
+ * The config file api is pooling_layer.
 */
-class MaxLayer : public Layer {
+class MaxLayer : public SequencePoolLayer {
 protected:
-  std::unique_ptr<Weight> biases_;
  // maxIndex_[i][j] = k : the value at (i, j) is from input[k].
  IVectorPtr maxIndex_;
-  int type_;
 public:
-  explicit MaxLayer(const LayerConfig& config) : Layer(config) {}
+  explicit MaxLayer(const LayerConfig& config) : SequencePoolLayer(config) {}
-  enum MaxLevel {kNonSeq = 0, kSeq = 1 };
  ~MaxLayer() {}
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
+    return SequencePoolLayer::init(layerMap, parameterMap);
+  }
  void forward(PassType passType);
  void backward(const UpdateCallback& callback = nullptr);

--- a/paddle/gserver/layers/MaxOutLayer.cpp
+++ b/paddle/gserver/layers/MaxOutLayer.cpp
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "MaxOutLayer.h"
+#include "hl_gpu.h"
+#include "hl_cnn.h"
+namespace paddle {
+REGISTER_LAYER(maxout, MaxOutLayer);
+/**
+ * Refresh the cached image geometry and return the layer's output width.
+ *
+ * The frame height/width reported by the input layer's output are used when
+ * they are non-zero; otherwise the sizes from the maxout config are used.
+ * As a side effect this updates imgSizeH_, imgSizeW_ and featLen_, and
+ * writes the resulting frame size onto this layer's output.
+ *
+ * @return imgSizeH_ * imgSizeW_ * outputChannels_
+ */
+size_t MaxOutLayer::getSize() {
+  const MaxOutConfig& conf = config_.inputs(0).maxout_conf();
+  const Argument& inArg = inputLayers_[0]->getOutput();
+  const size_t frameH = inArg.getFrameHeight();
+  const size_t frameW = inArg.getFrameWidth();
+  // Fall back to the configured size when the input reports a zero dimension.
+  imgSizeH_ = (frameH != 0) ? frameH : conf.img_size_y();
+  imgSizeW_ = (frameW != 0) ? frameW : conf.img_size_x();
+  featLen_ = imgSizeH_ * imgSizeW_;
+  auto& outArg = getOutput();
+  outArg.setFrameHeight(imgSizeH_);
+  outArg.setFrameWidth(imgSizeW_);
+  return featLen_ * outputChannels_;
+}
+/**
+ * Initialize the layer from its config.
+ *
+ * Reads groups and channels from the maxout config of the single input and
+ * derives outputChannels_ = channels_ / groups_.
+ *
+ * @param layerMap      forwarded to Layer::init
+ * @param parameterMap  forwarded to Layer::init
+ * @return false if the base-class initialization fails, true otherwise
+ */
+bool MaxOutLayer::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  if (!Layer::init(layerMap, parameterMap)) {
+    return false;
+  }
+  /* the size of inputs for maxout-layer is 1 */
+  CHECK_EQ(config_.inputs_size(), 1);
+  const MaxOutConfig& conf = config_.inputs(0).maxout_conf();
+  groups_ = conf.groups();
+  channels_ = conf.channels();
+  // Reject groups == 0 up front: the modulo and the division below would
+  // otherwise divide by zero.
+  CHECK(groups_ != 0) << "maxout groups must be positive";
+  CHECK_EQ(channels_ % groups_, 0UL);
+  outputChannels_ = channels_ / groups_;
+  return true;
+}
+/**
+ * Forward pass: size the output to (batch size x getSize()), size the
+ * index buffer to one entry per output element, and run maxoutForward
+ * from the input value into the output value.
+ */
+void MaxOutLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  // getSize() also refreshes the cached image geometry.
+  const size_t numSamples = getInput(0).getBatchSize();
+  const size_t outWidth = getSize();
+  resetOutput(numSamples, outWidth);
+  MatrixPtr src = getInputValue(0);
+  MatrixPtr dst = getOutputValue();
+  // One index slot per output element.
+  IVector::resizeOrCreate(maxoutId_, outWidth * numSamples, useGpu_);
+  dst->maxoutForward(*src, *maxoutId_, outputChannels_, groups_);
+}
+/**
+ * Backward pass: route the output gradient to the input gradient through
+ * maxoutBackward, using the indices stored in maxoutId_. Nothing is done
+ * when the input has no gradient matrix.
+ */
+void MaxOutLayer::backward(const UpdateCallback& callback) {
+  // The callback is not used by this layer.
+  (void)callback;
+  MatrixPtr inGrad = getInputGrad(0);
+  MatrixPtr outGrad = getOutputGrad();
+  if (!inGrad) {
+    return;
+  }
+  inGrad->maxoutBackward(*outGrad, *maxoutId_, outputChannels_, groups_);
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/MaxOutLayer.h
+++ b/paddle/gserver/layers/MaxOutLayer.h
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+namespace paddle {
+/**
+ * A layer to do max out on conv layer output.
+ * Input: output of a conv layer.
+ * Output: feature map size same as input.  Channel is (input channel) / groups.
+ * So the number of channels must be divisible by groups.
+ *
+ * The config file api is maxout_layer.
+ */
+class MaxOutLayer : public Layer {
+protected:
+  /// number of groups, read from the maxout config in init()
+  size_t groups_;
+  /// image height and width, refreshed on every getSize() call
+  size_t imgSizeH_, imgSizeW_;
+  /// outputChannels_ = channels_ / groups_
+  size_t channels_, outputChannels_;
+  /// feature length = imgSizeH_ * imgSizeW_
+  size_t featLen_;
+  /// index buffer filled by maxoutForward and read by maxoutBackward;
+  /// resized in forward() to one entry per output element
+  IVectorPtr maxoutId_;
+public:
+  /// return imgSizeH_ * imgSizeW_ * outputChannels_;
+  /// also updates imgSizeH_, imgSizeW_, featLen_ and the output frame size
+  size_t getSize();
+  explicit MaxOutLayer(const LayerConfig& config) : Layer(config) {}
+  virtual ~MaxOutLayer() {}
+  /// read groups/channels from the config and derive outputChannels_
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  /// resize the output and run maxoutForward
+  void forward(PassType passType);
+  /// run maxoutBackward into the input gradient, if there is one
+  void backward(const UpdateCallback& callback = nullptr);
+};
+}  // namespace paddle
--- a/paddle/gserver/layers/NCELayer.cpp
+++ b/paddle/gserver/layers/NCELayer.cpp
@@ -21,14 +21,18 @@ limitations under the License. */
 namespace paddle {
 /**
- * Noise-contrastive estimation
+ * Noise-contrastive estimation.
 * Implements the method in the following paper:
- * A fast and simple algorithm for training neural probabilistic language models
+ * A fast and simple algorithm for training neural probabilistic language models.
+ *
+ * The config file api is nce_layer.
 */
 class NCELayer : public Layer {
  int numClasses_;
-  int numInputs_;  // number of input layer besides labelLayer and weightLayer
+  /// number of input layer besides labelLayer and weightLayer
+  int numInputs_;
  LayerPtr labelLayer_;
+  /// weight layer, can be None
  LayerPtr weightLayer_;
  WeightList weights_;
  std::unique_ptr<Weight> biases_;
@@ -43,7 +47,8 @@ class NCELayer : public Layer {
    real weight;
  };
  std::vector<Sample> samples_;
-  bool prepared_;  // whether samples_ is prepared
+  /// whether samples_ is prepared
+  bool prepared_;
  Argument sampleOut_;
  IVectorPtr labelIds_;

--- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
+++ b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "paddle/utils/Logging.h"
-#include "Layer.h"
+#include "SequencePoolLayer.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/utils/Stat.h"
@@ -29,20 +29,19 @@ namespace paddle {
 * If SequenceLevel = kSeq:
 *   Check input sequence must has sub-sequence
 *   Output: a sequence containing only the last instance of each sub-sequence
- * of the input sequence
+ *           of the input sequence
+ *
+ * The config file api is last_seq and first_seq.
 */
-class SequenceLastInstanceLayer : public Layer {
+class SequenceLastInstanceLayer : public SequencePoolLayer {
 protected:
-  std::unique_ptr<Weight> biases_;
  MatrixPtr tmpSrc_;
  MatrixPtr tmpDest_;
-  enum SequenceLevel { kNonSeq = 0, kSeq = 1 };
-  int type_;
 public:
  explicit SequenceLastInstanceLayer(const LayerConfig& config)
-      : Layer(config) {}
+      : SequencePoolLayer(config) {}
  ~SequenceLastInstanceLayer() {}
@@ -56,55 +55,20 @@ REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer);
 bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
                                     const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
+  SequencePoolLayer::init(layerMap, parameterMap);
-  Layer::init(layerMap, parameterMap);
-  // seqlastins layer should have exactly 1 input
-  CHECK_EQ(1U, inputLayers_.size());
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
  tmpSrc_ =
      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
  tmpDest_ =
      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
-  // transform to which sequence type
-  if (config_.trans_type() == "non-seq") {
-    type_ = kNonSeq;
-  } else if (config_.trans_type() == "seq") {
-    type_ = kSeq;
-  } else {
-    LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
-  }
-  setNeedSequenceInfo(false);
  return true;
 }
 void SequenceLastInstanceLayer::forward(PassType passType) {
-  Layer::forward(passType);
+  SequencePoolLayer::forward(passType);
-  size_t dim = getSize();
-  const Argument& input = getInput(0);
-  // check
-  auto startPositions =
-      type_ ? input.subSequenceStartPositions->getVector(false)
-            : input.sequenceStartPositions->getVector(false);
-  size_t height = type_ ? input.getNumSubSequences() : input.getNumSequences();
-  CHECK_EQ(dim, input.value->getWidth());
-  CHECK_EQ(startPositions->getData()[height], input.getBatchSize());
-  CHECK_EQ(height, startPositions->getSize() - 1);
-  if (type_) {
-    // when trans_type = seq, input must hasSubseq
-    CHECK_EQ(input.hasSubseq(), 1UL);
-  }
-  reserveOutput(height, dim);
+  const int* starts = startPositions_->getData(false);
-  const int* starts = startPositions->getData();
  MatrixPtr inputValue = getInputValue(0);
  MatrixPtr outputValue = getOutputValue();
@@ -112,21 +76,13 @@ void SequenceLastInstanceLayer::forward(PassType passType) {
    AsyncGpuBlock asyncGpuBlock;
    REGISTER_TIMER_INFO("SequenceLastInstanceLayerForward", getName().c_str());
-    for (size_t seqId = 0; seqId < height; ++seqId) {
+    for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
      int insId =
          config_.select_first() ? starts[seqId] : starts[seqId + 1] - 1;
      outputValue->subMatrix(seqId, 1, tmpDest_)
          ->assign(*(inputValue->subMatrix(insId, 1, tmpSrc_)));
    }
-    /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
-     * thus, in this case, output_ has no sequenceStartPositions.
-     * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
-     * case, we should compute the new sequenceStartPositions.
-    */
-    if (type_) {
-      output_.degradeSequence(input, useGpu_);
-    }
  }
  if (biases_.get() != NULL) {
@@ -138,23 +94,12 @@ void SequenceLastInstanceLayer::forward(PassType passType) {
 }
 void SequenceLastInstanceLayer::backward(const UpdateCallback& callback) {
-  /* activation, should set to 'linear' in most cases */
+  SequencePoolLayer::backward(callback);
-  backwardActivation();
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-    // Increasing the number of gradient
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
  MatrixPtr inputGrad = getInputGrad(0);
  MatrixPtr outputGrad = getOutputGrad();
-  auto startPositions =
+  const int* starts = startPositions_->getData(false);
-      type_ ? getInput(0).subSequenceStartPositions->getVector(false)
+  size_t numSequences = startPositions_->getSize() - 1;
-            : getInput(0).sequenceStartPositions->getVector(false);
-  const int* starts = startPositions->getData();
-  size_t numSequences = startPositions->getSize() - 1;
  if (inputGrad) {
    AsyncGpuBlock asyncGpuBlock;

--- a/paddle/gserver/layers/SequencePoolLayer.cpp
+++ b/paddle/gserver/layers/SequencePoolLayer.cpp
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/utils/Logging.h"
+#include "SequencePoolLayer.h"
+namespace paddle {
+/**
+ * Common initialization for the sequence pooling layers.
+ *
+ * Requires exactly one input, creates the bias weight when a bias
+ * parameter is configured, and maps config trans_type ("non-seq" / "seq")
+ * onto type_. Any other trans_type is a fatal error.
+ *
+ * @return always true
+ */
+bool SequencePoolLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  // Base class first.
+  Layer::init(layerMap, parameterMap);
+  // seqlastins/max/average layers take exactly one input.
+  CHECK_EQ(1U, inputLayers_.size());
+  // The bias is optional: one row, as wide as the layer.
+  if (biasParameter_) {
+    biases_.reset(new Weight(1, getSize(), biasParameter_));
+  }
+  // Decide which sequence level the pooling runs on.
+  const auto& transType = config_.trans_type();
+  if (transType == "seq") {
+    type_ = kSeq;
+  } else if (transType == "non-seq") {
+    type_ = kNonSeq;
+  } else {
+    LOG(FATAL) << "Unknown trans_type: " << transType;
+  }
+  setNeedSequenceInfo(false);
+  return true;
+}
+/**
+ * Shared forward step for the sequence pooling layers.
+ *
+ * Selects the start positions to pool over (sub-sequence positions when
+ * type_ is kSeq, sequence positions otherwise), stores them in
+ * startPositions_, stores the number of pooled outputs in newBatchSize_,
+ * validates them against the input, and resizes the output to
+ * newBatchSize_ x getSize(). When type_ is kSeq the output additionally
+ * receives the degraded sequence info of the input.
+ */
+void SequencePoolLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  const Argument& input = getInput(0);
+  // Validate the required start positions BEFORE they are dereferenced
+  // below, so a mis-shaped input fails with a clear message instead of a
+  // null-pointer crash.
+  if (type_) {
+    CHECK(input.subSequenceStartPositions)
+        << "when trans_type = seq, input must hasSubseq";
+  } else {
+    CHECK(input.sequenceStartPositions)
+        << "when trans_type = non-seq, input must be a sequence";
+  }
+  newBatchSize_ = type_ ? input.getNumSubSequences() : input.getNumSequences();
+  size_t dim = getSize();
+  // check
+  CHECK_EQ(dim, input.value->getWidth());
+  startPositions_ =
+      type_ ? input.subSequenceStartPositions : input.sequenceStartPositions;
+  auto starts = startPositions_->getVector(false);
+  // Check the length first so the indexed read below stays in bounds.
+  CHECK_EQ(newBatchSize_, starts->getSize() - 1);
+  CHECK_EQ(starts->getData()[newBatchSize_], input.getBatchSize());
+  resetOutput(newBatchSize_, dim);
+  /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
+   * thus, in this case, output_ has no sequenceStartPositions.
+   * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
+   * case, we should compute the new sequenceStartPositions.
+  */
+  if (type_) {
+    output_.degradeSequence(input, useGpu_);
+  }
+}
+/**
+ * Shared backward step: run the activation backward pass, then, if a bias
+ * with a gradient matrix exists, collect the bias gradient from the output
+ * gradient and notify the parameter through incUpdate.
+ */
+void SequencePoolLayer::backward(const UpdateCallback& callback) {
+  backwardActivation();
+  // No bias configured: nothing further to do.
+  if (!biases_) {
+    return;
+  }
+  auto biasGrad = biases_->getWGrad();
+  if (!biasGrad) {
+    return;
+  }
+  biasGrad->collectBias(*getOutputGrad(), 1);
+  // Increasing the number of gradient
+  biases_->getParameterPtr()->incUpdate(callback);
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/SequencePoolLayer.h
+++ b/paddle/gserver/layers/SequencePoolLayer.h
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+namespace paddle {
+/**
+ * A base layer for SequenceLastInstanceLayer/AverageLayer/MaxLayer.
+ *
+ * Input: one or more sequences. Each sequence contains some instances.
+ * If SequenceLevel = kNonSeq:
+ *    Output: output size is the number of input sequences (NOT input instances)
+ *    output[i] = seqlastin/average/max_{for each instance in this
+ *                sequence}{input[i]}
+ * If SequenceLevel = kSeq:
+ *    Check input sequence must have sub-sequences
+ *    Output: output size is the number of input sub-sequences
+ *    output[i] = seqlastin/average/max_{for each instance in this
+ *                sub-sequence}{input[i]}
+ *
+ * The config file api is pooling_layer.
+ */
+class SequencePoolLayer : public Layer {
+protected:
+  /// the SequenceLevel, set in init() from config trans_type
+  int type_;
+  /// optional bias weight; created in init() only when a bias parameter
+  /// is configured
+  std::unique_ptr<Weight> biases_;
+  enum SequenceLevel { kNonSeq = 0, kSeq = 1 };
+  /// number of sequences (kNonSeq) or sub-sequences (kSeq) in the current
+  /// input; set in forward() and used as the output height
+  size_t newBatchSize_;
+  /// sequence or sub-sequence start positions of the input, chosen by
+  /// type_; set in forward()
+  ICpuGpuVectorPtr startPositions_;
+public:
+  explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {}
+  virtual ~SequencePoolLayer() {}
+  /// check the input count, create the bias and resolve type_
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  /// select and validate the start positions, then resize the output
+  void forward(PassType passType);
+  /// activation backward plus bias gradient collection
+  void backward(const UpdateCallback& callback = nullptr);
+};
+}  // namespace paddle
--- a/paddle/gserver/tests/rnn_data_provider.py
+++ b/paddle/gserver/tests/rnn_data_provider.py
@@ -14,12 +14,15 @@
 from paddle.trainer.PyDataProvider2 import *
+# Note that each config should have its own independent provider
+# in the current design of PyDataProvider2.
+#######################################################
 data = [
    [[[1, 3, 2], [4, 5, 2]], 0],
    [[[0, 2], [2, 5], [0, 1, 2]], 1],
 ]
+# Used for sequence_nest_rnn.conf
 @provider(input_types=[integer_value_sub_sequence(10),
                       integer_value(3)],
          should_shuffle=False)
@@ -27,7 +30,7 @@ def process_subseq(settings, file_name):
    for d in data:
        yield d
+# Used for sequence_rnn.conf
 @provider(input_types=[integer_value_sequence(10),
                       integer_value(3)],
          should_shuffle=False)
@@ -38,11 +41,32 @@ def process_seq(settings, file_name):
            seq += subseq
        yield seq, d[1]
+# Used for sequence_nest_rnn_multi_input.conf
+@provider(input_types=[integer_value_sub_sequence(10),
+                       integer_value(3)],
+          should_shuffle=False)
+def process_subseq2(settings, file_name):
+    for d in data:
+        yield d
+# Used for sequence_rnn_multi_input.conf
+@provider(input_types=[integer_value_sequence(10),
+                       integer_value(3)],
+          should_shuffle=False)
+def process_seq2(settings, file_name):
+    for d in data:
+        seq = []
+        for subseq in d[0]:
+            seq += subseq
+        yield seq, d[1]
+###########################################################
 data2 = [
    [[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]] ,0],
    [[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]], 1],
 ]
+# Used for sequence_nest_rnn_multi_unequalength_inputs.conf
 @provider(input_types=[integer_value_sub_sequence(10),
                       integer_value_sub_sequence(10),
                       integer_value(2)],
@@ -52,6 +76,7 @@ def process_unequalength_subseq(settings, file_name):
        yield d
+# Used for sequence_rnn_multi_unequalength_inputs.conf
 @provider(input_types=[integer_value_sequence(10),
                       integer_value_sequence(10),
                       integer_value(2)],

--- a/paddle/gserver/tests/sequenceGen.py
+++ b/paddle/gserver/tests/sequenceGen.py
@@ -21,7 +21,7 @@ from paddle.trainer.PyDataProvider2 import *
 def hook(settings, dict_file, **kwargs):
    settings.word_dict = dict_file
    settings.input_types = [integer_value_sequence(len(settings.word_dict)),
-                            integer_value_sequence(3)]
+                            integer_value(3)]
    settings.logger.info('dict len : %d' % (len(settings.word_dict)))
@@ -34,14 +34,14 @@ def process(settings, file_name):
            words = comment.split()
            word_slot = [settings.word_dict[w] for w in words if
                         w in settings.word_dict]
-            yield word_slot, [label]
+            yield word_slot, label
 ## for hierarchical sequence network
 def hook2(settings, dict_file, **kwargs):
    settings.word_dict = dict_file
    settings.input_types = [integer_value_sub_sequence(len(settings.word_dict)),
-                            integer_value_sub_sequence(3)]
+                            integer_value_sequence(3)]
    settings.logger.info('dict len : %d' % (len(settings.word_dict)))
@@ -57,7 +57,7 @@ def process2(settings, file_name):
                words = comment.split()
                word_slot = [settings.word_dict[w] for w in words if
                             w in settings.word_dict]
-                label_list.append([label])
+                label_list.append(label)
                word_slot_list.append(word_slot)
            else:
                yield word_slot_list, label_list

--- a/paddle/gserver/tests/sequence_nest_rnn.conf
+++ b/paddle/gserver/tests/sequence_nest_rnn.conf
@@ -56,9 +56,8 @@ def outer_step(x):
    last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
    # "return last" should also work. But currently RecurrentGradientMachine
-    # does not handle it correctly. Current implementation requires that
+    # does not handle it, and will report error: In hierachical RNN, all out 
-    # all the out links are from sequences. However, it does not report error
+    # links should be from sequences now.
-    # when the out links are not sequences.
    return inner_rnn_output
 out = recurrent_group(

--- a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf
+++ b/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf
@@ -19,7 +19,7 @@ from paddle.trainer_config_helpers import *
 define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
                        test_list=None,
                        module='rnn_data_provider',
-                        obj='process_subseq')
+                        obj='process_subseq2')
 settings(batch_size=2, learning_rate=0.01)
@@ -57,9 +57,8 @@ def outer_step(wid, x):
    last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
    # "return last" should also work. But currently RecurrentGradientMachine
-    # does not handle it correctly. Current implementation requires that
+    # does not handle it, and will report error: In hierachical RNN, all out 
-    # all the out links are from sequences. However, it does not report error
+    # links should be from sequences now.
-    # when the out links are not sequences.
    return inner_rnn_output
 out = recurrent_group(

--- a/paddle/gserver/tests/sequence_rnn_multi_input.conf
+++ b/paddle/gserver/tests/sequence_rnn_multi_input.conf
@@ -19,7 +19,7 @@ from paddle.trainer_config_helpers import *
 define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
                        test_list=None,
                        module='rnn_data_provider',
-                        obj='process_seq')
+                        obj='process_seq2')
 settings(batch_size=2, learning_rate=0.01)

--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -327,6 +327,24 @@ TEST(Layer, blockExpandLayer) {
  }
 }
+// Runs testLayerGrad on a "maxout" layer, once with useGpu=false and once
+// with useGpu=true.
+TEST(Layer, maxoutLayer) {
+  // Input geometry: 32x32 image, 4 channels, maxout over groups of 2.
+  const int kImgSize = 32;
+  const int kChannels = 4;
+  const int kGroups = 2;
+
+  TestConfig config;
+  config.layerConfig.set_type("maxout");
+  config.biasSize = 0;
+  // 32 * 32 * 4 == 4096 values per input sample.
+  config.inputDefs.push_back(
+      {INPUT_DATA, "layer_0", kImgSize * kImgSize * kChannels, 0});
+
+  MaxOutConfig* maxoutConf =
+      config.layerConfig.add_inputs()->mutable_maxout_conf();
+  maxoutConf->set_img_size_x(kImgSize);
+  maxoutConf->set_img_size_y(kImgSize);
+  maxoutConf->set_channels(kChannels);
+  maxoutConf->set_groups(kGroups);
+
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(config, "maxout", 10, false, useGpu);
+  }
+}
 void testFcLayer(string format, size_t nnz) {
  TestConfig config;
  config.biasSize = 4096;

--- a/paddle/gserver/tests/test_PyDataProvider2.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider2.cpp
@@ -117,7 +117,7 @@ TEST(PyDataProvider2, index_no_seq) {
 }
 TEST(PyDataProvider2, init_hook) {
-  paddle::PyObjectPtr pickle(PyImport_ImportModule("pickle"));
+  paddle::PyObjectPtr pickle = paddle::py::import("pickle");
  paddle::PyObjectPtr globals(
      PyModule_GetDict(PyImport_AddModule("__main__")));
  PyDict_SetItemString(globals.get(), "pickle", pickle.get());

--- a/paddle/gserver/tests/test_PyDataProvider2.py
+++ b/paddle/gserver/tests/test_PyDataProvider2.py
@@ -86,7 +86,7 @@ def test_can_over_batch_size(setting, filename):
        yield [random.randint(0, 100 - 1) for _ in xrange(seq_len)]
-@provider(input_types=[index_slot(10), index_slot(10)])
+@provider(input_types={'input1':index_slot(10), 'input2': index_slot(10)})
 def test_input_order(setting, filename):
    for _ in xrange(1000):
        yield {

--- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
+++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <gtest/gtest.h>
 #include <paddle/utils/Util.h>
 #include <paddle/utils/Version.h>
@@ -24,7 +23,7 @@ limitations under the License. */
 P_DECLARE_int32(seed);
 using namespace paddle;  // NOLINT
-using namespace std;  // NOLINT
+using namespace std;     // NOLINT
 class TrainerForTest : public paddle::Trainer {
 public:
  void startTrain() {
@@ -44,11 +43,10 @@ public:
   */
  size_t getTotalParameterSize() const {
    auto p = const_cast<TrainerForTest*>(this);
-    auto & params = p->getGradientMachine()->getParameters();
+    auto& params = p->getGradientMachine()->getParameters();
-    return std::accumulate(params.begin(), params.end(), 0UL,
+    return std::accumulate(
-                           [](size_t a, const ParameterPtr& p){
+        params.begin(), params.end(), 0UL,
-      return a+p->getSize();
+        [](size_t a, const ParameterPtr& p) { return a + p->getSize(); });
-    });
  }
 };

--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -283,13 +283,13 @@ void GpuMatrix::copyFrom(const IVector& src) {
  copyFrom(matrix);
 }
-void GpuMatrix::copyByRowIndex(Matrix& b, IVector& rowIndex) {
+void GpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) {
  size_t height = getHeight();
  size_t width = getWidth();
  CHECK_EQ(b.getWidth(), width);
  real* dst = getData();
  real* src = b.getData();
-  int* index = rowIndex.getData();
+  const int* index = rowIndex.getData();
  hl_sequence2batch_copy(dst, src, index, width, height, true);
 }
@@ -584,6 +584,42 @@ void GpuMatrix::colMax(Matrix& max) {
  max.maxCols(*this);
 }
+void GpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) {
+  LOG(FATAL) << "Is not supported";
+}
+// Maxout forward on the GPU: `a` is the input, this matrix receives the
+// output and `id` is handed to the kernel as the index buffer.
+void GpuMatrix::maxoutForward(Matrix& a, IVector& id, size_t channels,
+                              size_t groups) {
+  // Operands must be GPU-side objects with the same row count as this matrix.
+  CHECK(dynamic_cast<GpuMatrix*>(&a));
+  CHECK(dynamic_cast<GpuIVector*>(&id));
+  CHECK_EQ(a.getHeight(), getHeight());
+
+  const size_t numRows = getHeight();
+  const size_t outWidth = getWidth();
+  const size_t featLen = outWidth / channels;
+  hl_maxout_forward(a.getData(), getData(), id.getData(), numRows, outWidth,
+                    featLen, groups);
+}
+// Maxout backward on the GPU: this matrix is passed to the kernel as the
+// first (writable) buffer, `a` as the second, and `id` as the index buffer.
+void GpuMatrix::maxoutBackward(Matrix& a, IVector& id, size_t channels,
+                               size_t groups) {
+  // Operands must be GPU-side objects with the same row count as this matrix.
+  CHECK(dynamic_cast<GpuMatrix*>(&a));
+  CHECK(dynamic_cast<GpuIVector*>(&id));
+  CHECK_EQ(a.getHeight(), getHeight());
+
+  const size_t numRows = getHeight();
+  // Sizes are derived from `a`, not from this matrix.
+  const size_t outGradWidth = a.getWidth();
+  const size_t featLen = outGradWidth / channels;
+  hl_maxout_backward(getData(), a.getData(), id.getData(), numRows,
+                     outGradWidth, featLen, groups);
+}
 /*calulate the error of classification */
 void GpuMatrix::classificationError(MatrixPtr output, IVectorPtr label) {
  GpuMatrixPtr output_ptr = std::dynamic_pointer_cast<GpuMatrix>(output);
@@ -1329,11 +1365,11 @@ void CpuMatrix::copyFrom(const IVector& src) {
  }
 }
-void CpuMatrix::copyByRowIndex(Matrix& b, IVector& rowIndex) {
+void CpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) {
  size_t height = getHeight();
  size_t width = getWidth();
  CHECK_EQ(b.getWidth(), width);
-  int* index = rowIndex.getData();
+  const int* index = rowIndex.getData();
  for (size_t i = 0; i < height; i++) {
    CHECK_LT(static_cast<size_t>(index[i]), b.getHeight());
    real* src = b.getData() + index[i] * width;
@@ -2799,6 +2835,95 @@ void CpuMatrix::colMax(Matrix& max) {
  max.maxCols(*this);
 }
+/**
+ * For every column of this matrix, pick the `beam` largest entries, where
+ * beam is the number of rows of maxVal.
+ *
+ * The selected values are written to the same column of maxVal in descending
+ * order, and their row indices are written to maxIds using the same
+ * (row * numColumns + column) layout as maxVal.
+ */
+void CpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) {
+  CHECK(isContiguous());
+  CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix type are not equal";
+  size_t numSamples = getWidth();
+  size_t dim = getHeight();
+  size_t beam = maxVal.getHeight();
+  CHECK_EQ(maxIds.getSize(), numSamples * beam);
+  CHECK_EQ(maxVal.getWidth(), numSamples);
+  // std::partial_sort requires its middle iterator to lie within
+  // [begin, end]; asking for more rows than exist would be undefined.
+  CHECK_LE(beam, dim);
+
+  real* a = getData();
+  int* s = maxIds.getData();
+  real* t = maxVal.getData();
+
+  // One scratch buffer reused for every column.
+  std::vector<std::pair<real, size_t>> vec;
+  vec.reserve(dim);
+  for (size_t i = 0; i < numSamples; i++) {
+    vec.clear();
+    for (size_t j = 0; j < dim; j++) {
+      vec.push_back(std::pair<real, size_t>(a[i + j * numSamples], j));
+    }
+
+    // Largest values first.
+    std::partial_sort(
+        vec.begin(), vec.begin() + beam, vec.end(),
+        [](const std::pair<real, size_t>& l, const std::pair<real, size_t>& r) {
+          return l.first > r.first;
+        });
+    for (size_t j = 0; j < beam; j++) {
+      t[i + j * numSamples] = vec[j].first;
+      s[i + j * numSamples] = vec[j].second;
+    }
+  }
+}
+/**
+ * CPU maxout forward pass.
+ *
+ * `a` is the input and this matrix receives the output. Each input row is
+ * read as `channels` blocks of `groups * featLen` values; for every block the
+ * element-wise maximum over its `groups` slices of length featLen is written
+ * to the output row, and the index of the winning slice is written to `id`
+ * (one id per output element, rows stored back to back).
+ */
+void CpuMatrix::maxoutForward(Matrix& a, IVector& id, size_t channels,
+                              size_t groups) {
+  CHECK(dynamic_cast<CpuMatrix*>(&a));
+  CHECK(dynamic_cast<CpuIVector*>(&id));
+  CHECK_EQ(a.getHeight(), getHeight());
+  CHECK_GT(channels, 0UL);
+
+  size_t size = getWidth();
+  size_t batchSize = getHeight();
+  size_t featLen = size / channels;
+  // The raw pointer arithmetic below assumes `size * groups` input values and
+  // `size` ids per row; fail loudly instead of reading or writing out of
+  // bounds.
+  CHECK_EQ(a.getWidth(), size * groups);
+  CHECK_GE(id.getSize(), batchSize * size);
+
+  const real* input  = a.getData();
+  int* idForCpu = id.getData();
+
+  // Scratch: one row per group (maxInMat) and the per-column maximum
+  // (maxOutMat), both as wide as one output row.
+  MatrixPtr maxInMat, maxOutMat;
+  Matrix::resizeOrCreate(maxInMat, groups, size, false, false);
+  Matrix::resizeOrCreate(maxOutMat, 1, size, false, false);
+
+  for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) {
+    size_t newIndex = batch_idx * size;
+    // View over the slice of `id` that belongs to this row.
+    IVectorPtr tmpId = IVector::create(idForCpu + newIndex, size, false);
+
+    // Re-arrange the input row so that slice j of every channel block lands
+    // in row j of maxInMat.
+    for (size_t i = 0; i < channels; ++i) {
+      size_t newFeatLen = i * featLen;
+      for (size_t j = 0; j < groups; ++j) {
+        maxInMat->subMatrix(j, j + 1, newFeatLen, newFeatLen + featLen)
+            ->copyFrom(input + (newIndex + newFeatLen) * groups + j * featLen,
+                       featLen);
+      }
+    }
+    // Column-wise max over the group rows gives the output row and its ids.
+    maxInMat->colMax(*tmpId, *maxOutMat);
+    this->subRowMatrix(batch_idx, batch_idx + 1)->copyFrom(*maxOutMat);
+  }
+}
+/**
+ * CPU maxout backward pass.
+ *
+ * `a` holds the output gradient and this matrix accumulates the input
+ * gradient. For every output element, `id` names the slice (within its
+ * channel block) that the gradient is added to.
+ */
+void CpuMatrix::maxoutBackward(Matrix& a, IVector& id, size_t channels,
+                               size_t groups) {
+  CHECK(dynamic_cast<CpuMatrix*>(&a));
+  CHECK(dynamic_cast<CpuIVector*>(&id));
+  CHECK_EQ(a.getHeight(), getHeight());
+  CHECK_GT(channels, 0UL);
+
+  size_t size = a.getWidth();
+  size_t batchSize = getHeight();
+  size_t featLen = size / channels;
+  size_t newFeatLen = groups * featLen;
+  // Rows of this matrix are indexed as `size * groups` values and `id` as
+  // `size` entries per row; reject shapes that would be indexed out of bounds.
+  CHECK_EQ(getWidth(), size * groups);
+  CHECK_GE(id.getSize(), batchSize * size);
+
+  real* inputG  = getData();
+  const real* outG  = a.getData();
+  const int* idForCpu = id.getData();
+
+  for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) {
+    size_t newIndex = batch_idx * size;
+    const int* idData = idForCpu + newIndex;
+
+    for (size_t i = 0; i < size; ++i) {
+      // Offset inside the input row: slice idData[i] of channel block
+      // (i / featLen), position (i % featLen). Kept in size_t so large rows
+      // cannot overflow an int.
+      size_t gradIdx =
+          idData[i] * featLen + (i / featLen) * newFeatLen + i % featLen;
+      (inputG + newIndex * groups)[gradIdx] += (outG + newIndex)[i];
+    }
+  }
+}
 void CpuMatrix::rowNormalizeL1(Matrix& out) {
  CHECK(!out.useGpu());

--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -253,7 +253,7 @@ public:
    LOG(FATAL) << "copy data from int vector only available on CpuMatrix.";
  }
-  virtual void copyByRowIndex(Matrix& b, IVector& rowIndex) {
+  virtual void copyByRowIndex(Matrix& b, const IVector& rowIndex) {
    LOG(FATAL) << "Not implemented";
  }
@@ -493,16 +493,40 @@ public:
    LOG(FATAL) << "Not implemeted";
  }
+  /**
+   * Store the maximum of each column of this matrix into max.
+   */
  virtual void colMax(Matrix& max) { LOG(FATAL) << "not implemented"; }
+  /**
+   * @brief Get the top k elements of each column of this matrix.
+   *
+   * The row ids and values of these elements are stored in
+   * maxIds and maxVal respectively, where k is the number of rows of maxVal.
+   * Note that the top k elements are not guaranteed to be sorted.
+   */
+  virtual void colMax(IVector& maxIds, Matrix& maxVal) {
+    LOG(FATAL) << "not implemented";
+  }
+  virtual void maxoutForward(Matrix& a, IVector& id, size_t channels,
+                             size_t groups) {
+    LOG(FATAL) << "not implemented";
+  }
+  virtual void maxoutBackward(Matrix& a, IVector& id, size_t channels,
+                              size_t groups) {
+    LOG(FATAL) << "not implemented";
+  }
  virtual void rowMaxId(IVector& maxIds) { LOG(FATAL) << "Not implemented"; }
  /**
   * @brief Get the top k elements of each row of this matrix.
   *
   * The column ids and values of these elements are stored in
-   * maxIds and max respectively. Note that the top k
+   * maxIds and max respectively, where k is the size of maxIds.
-   * elements are not sorted.
+   * And note that the top k elements are not sorted.
   */
  virtual void rowMax(IVector& maxIds, Matrix& max) {
    LOG(FATAL) << "Not implemented";
@@ -995,7 +1019,7 @@ public:
  void copyFrom(const IVector& src);
-  void copyByRowIndex(Matrix& b, IVector& rowIndex);
+  void copyByRowIndex(Matrix& b, const IVector& rowIndex);
  MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
@@ -1101,6 +1125,9 @@ public:
  void rowMax(Matrix& max);
  void rowMax(IVector& maxIds, Matrix& max);
  void colMax(Matrix& max);
+  void colMax(IVector& maxIds, Matrix& max);
+  void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups);
+  void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups);
  void oneHotCrossEntropy(Matrix& output, IVector& label);
  void oneHotCrossEntropyBp(Matrix& outputV, IVector& label);
@@ -1271,7 +1298,7 @@ public:
  void copyFrom(CpuSparseMatrix& src);
-  void copyByRowIndex(Matrix& b, IVector& rowIndex);
+  void copyByRowIndex(Matrix& b, const IVector& rowIndex);
  MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
@@ -1425,6 +1452,9 @@ public:
  void rowMax(Matrix& max);
  void rowMax(IVector& maxIds, Matrix& maxVal);
  void colMax(Matrix& max);
+  void colMax(IVector& maxIds, Matrix& maxVal);
+  void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups);
+  void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups);
  void rowNormalizeL1(Matrix& out);
  void oneHotCrossEntropy(Matrix& output, IVector& label);

--- a/paddle/math/SparseRowMatrix.cpp
+++ b/paddle/math/SparseRowMatrix.cpp
@@ -227,12 +227,18 @@ void CacheRowCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB,
 void SparsePrefetchRowCpuMatrix::addRows(const unsigned int* ids, size_t len) {
  std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
+  // Every id must be a valid row of this matrix; fail before recording any
+  // of them, with a message that keeps the two numbers apart.
+  for (size_t i = 0; i < len; ++i) {
+    CHECK_LT(ids[i], this->getHeight())
+      << "id: " << ids[i] << ", height: " << this->getHeight()
+      << ". Sparse id value exceeds the max input dimension; "
+      << "it could be caused by invalid input data samples";
+  }
  localIndices.insert(localIndices.end(), ids, ids + len);
 }
 void SparsePrefetchRowCpuMatrix::addRows(MatrixPtr input) {
  CpuSparseMatrix* mat = dynamic_cast<CpuSparseMatrix*>(input.get());
-  CHECK(mat) << "only support non value sparse matrix";
+  CHECK(mat) << "only support sparse matrix";
  addRows(reinterpret_cast<const unsigned int*>(mat->getCols()),
          mat->getElementCnt());
 }
@@ -243,7 +249,13 @@ void SparsePrefetchRowCpuMatrix::addRows(IVectorPtr ids) {
  int* index = ids->getData();
  for (size_t i = 0; i < numSamples; ++i) {
    if (index[i] == -1) continue;
-    localIndices.push_back((unsigned int)index[i]);
+    unsigned int id = (unsigned int)index[i];
+    CHECK_LT(id, this->getHeight())
+      << "id:" << id << "Height:" << this->getHeight()
+      << "sparse id value exceeds the max input dimension, "
+      << "it could be caused invalid input data samples";
+    localIndices.push_back(id);
  }
 }

--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -2065,6 +2065,78 @@ TEST(Matrix, PoolFwdBwd) {
  }
 }
+// Runs maxoutForward and maxoutBackward on CPU and GPU buffers of the given
+// shape and compares the results of the two.
+void testMaxOutFwdBwd(int numSamples, int imgSizeH, int imgSizeW,
+                      int channels, int groups) {
+  const int outChannels = channels / groups;
+  const int inWidth = imgSizeH * imgSizeW * channels;
+  const int outWidth = imgSizeH * imgSizeW * outChannels;
+  const int numIds = numSamples * outWidth;
+
+  // ---- forward ----
+  MatrixPtr cpuIn = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr gpuIn = GpuMatrix::create(numSamples, inWidth, false, true);
+  MatrixPtr cpuOut = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr gpuOut = GpuMatrix::create(numSamples, outWidth, false, true);
+  MatrixPtr gpuOutOnCpu = CpuMatrix::create(numSamples, outWidth, false, false);
+
+  IVectorPtr cpuIds = CpuIVector::create(numIds, false);
+  IVectorPtr gpuIds = GpuIVector::create(numIds, true);
+  IVectorPtr gpuIdsOnCpu = CpuIVector::create(numIds, false);
+
+  // Same random input on both devices.
+  cpuIn->randomizeUniform();
+  gpuIn->copyFrom(*cpuIn);
+
+  cpuOut->maxoutForward(*cpuIn, *cpuIds, outChannels, groups);
+  gpuOut->maxoutForward(*gpuIn, *gpuIds, outChannels, groups);
+
+  // Bring the GPU results back and compare values and ids.
+  gpuOutOnCpu->copyFrom(*gpuOut);
+  MatrixCheckErr(*cpuOut, *gpuOutOnCpu);
+  gpuIdsOnCpu->copyFrom(*gpuIds);
+  VectorCheckEqual(*cpuIds, *gpuIdsOnCpu);
+
+  // ---- backward ----
+  MatrixPtr cpuInGrad = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr gpuInGrad = GpuMatrix::create(numSamples, inWidth, false, true);
+  MatrixPtr cpuOutGrad = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr gpuOutGrad = GpuMatrix::create(numSamples, outWidth, false, true);
+  MatrixPtr gpuInGradOnCpu =
+      CpuMatrix::create(numSamples, inWidth, false, false);
+
+  // Same random starting gradients on both devices.
+  cpuInGrad->randomizeUniform();
+  cpuOutGrad->randomizeUniform();
+  gpuInGrad->copyFrom(*cpuInGrad);
+  gpuOutGrad->copyFrom(*cpuOutGrad);
+
+  cpuInGrad->maxoutBackward(*cpuOutGrad, *cpuIds, outChannels, groups);
+  gpuInGrad->maxoutBackward(*gpuOutGrad, *gpuIds, outChannels, groups);
+
+  // Bring the GPU input gradient back and compare.
+  gpuInGradOnCpu->copyFrom(*gpuInGrad);
+  MatrixCheckErr(*cpuInGrad, *gpuInGradOnCpu);
+}
+// Calls testMaxOutFwdBwd for every combination of the batch sizes, channel
+// counts, image heights/widths and group counts listed below.
+TEST(Matrix, MaxOutFwdBwd) {
+  for (int numSamples : {5, 10}) {
+    for (int channels : {8, 16}) {
+      for (int imgSizeH : {14, 28}) {
+        for (int imgSizeW : {16, 30}) {
+          for (int groups : {2, 4}) {
+            VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
+                    << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
+                    << " groups=" << groups;
+            testMaxOutFwdBwd(numSamples, imgSizeH, imgSizeW, channels, groups);
+          }
+        }
+      }
+    }
+  }
+}
 int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  initMain(argc, argv);

--- a/paddle/parameter/Parameter.h
+++ b/paddle/parameter/Parameter.h
@@ -146,6 +146,12 @@ public:
    }
  }
+  // Creates a zero-filled buffer of config_.size() elements for `type`;
+  // does nothing when a buffer for that type already exists.
+  void enableBufType(ParameterType type) {
+    if (!bufs_[type]) {
+      bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_);
+      bufs_[type]->zeroMem();
+    }
+  }
  void enableIntType(ParameterType type, size_t intStoreSize = 0) {
    if (!intBufs_[type]) {
      SetDevice device(deviceId_);

--- a/paddle/pserver/PserverForPython.h
+++ b/paddle/pserver/PserverForPython.h
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include "paddle/pserver/ParameterClient.h"
-#include "paddle/pserver/ParameterServer.h"
-#include "paddle/parameter/Parameter.h"
-#include <Python.h>
-namespace paddle {
-struct PyObjectDeleter {
-  void operator()(PyObject* obj) {
-    if (obj) {
-      Py_DECREF(obj);
-    }
-  }
-};
-class ParameterClientPy : public ParameterClient {
-protected:
-  typedef std::unique_ptr<PyObject, PyObjectDeleter> PyObjectPtr;
-  std::vector<ParameterPtr> parameter_;
-  int initArgc_;
-  char** initArgv_;
-public:
-  ParameterClientPy(std::vector<std::string> configs, int argc,
-                    std::vector<std::string> argv, bool useGpu) {
-    initArgc_ = argc;
-    initArgv_ = new char* [argc];
-    for (int i = 0; i < argc; i++) {
-      initArgv_[i] = new char[argv[i].size()];
-      strcpy(initArgv_[i],      // NOLINT
-             argv[i].c_str());  // NOLINT TODO(yuyang18): use snprintf instead.
-    }
-    ParameterConfig pyConfig;
-    ParameterPtr param;
-    for (auto& config : configs) {
-      pyConfig.ParseFromString(config);
-      param.reset(new Parameter(pyConfig, useGpu));
-      parameter_.push_back(param);
-    }
-    Py_Initialize();
-    CHECK(Py_IsInitialized());
-  }
-  ~ParameterClientPy() {
-    delete initArgv_;
-    Py_Finalize();
-  }
-  Parameter getParameter(int idx) { return *(parameter_[idx].get()); }
-  void initClientPy() {
-    initMain(initArgc_, initArgv_);
-    CHECK(init(parameter_)) << "Init Client Failed.";
-  }
-  void setConfigPy(std::string config) {
-    OptimizationConfig optConfig;
-    optConfig.ParseFromString(config);
-    setConfig(optConfig);
-  }
-  bool inStatusPy(int status) { return inStatus(PServerStatus(status)); }
-  void setStatusPy(int status) { setStatus(PServerStatus(status)); }
-  void waitForStatusPy(int status) { waitForStatus(PServerStatus(status)); }
-  void sendParameterPy(int updateMode, int parameterType, int numSamples,
-                       real cost, bool sendBackParameter) {
-    sendParameter(ParameterUpdateMode(updateMode), ParameterType(parameterType),
-                  int64_t(numSamples), real(cost), sendBackParameter);
-  }
-  template <class ProtoIn, class ProtoOut>
-  std::string asyncCallPy(const char* serviceName, const char* funcName,
-                          const std::string in) {
-    ProtoIn protoIn;
-    ProtoOut protoOut;
-    std::mutex waitLock;
-    std::string data;
-    protoIn.ParseFromString(in);
-    waitLock.lock();
-    auto callback = [&](ProtoOut* pOut, bool isSuccessful) {
-      if (isSuccessful) {
-        pOut->SerializeToString(&data);
-      } else {
-        LOG(INFO) << "Async Talk Failed.";
-      }
-      waitLock.unlock();
-    };
-    ubClient_.asyncCall<ProtoIn, ProtoOut>(serviceName, funcName, protoIn,
-                                           &protoOut, callback);
-    waitLock.lock();
-    protoOut.SerializeToString(&data);
-    return data;
-  }
-};
-}  // namespace paddle
--- a/paddle/py_paddle/dataprovider_converter.py
+++ b/paddle/py_paddle/dataprovider_converter.py
@@ -63,7 +63,8 @@ class SparseBinaryScanner(IScanner):
    def scan(self, dat):
        self.extend_cols(dat)
-        self.__rows__.append(len(dat))
+        self.__rows__.append(len(dat) + self.__rows__[-1])
+        self.__height__ += 1
    def extend_cols(self, dat):
        self.__cols__.extend(dat)

--- a/paddle/scripts/travis/before_install.sh
+++ b/paddle/scripts/travis/before_install.sh
--- a/paddle/scripts/travis/before_install.osx.sh
+++ b/paddle/scripts/travis/before_install.osx.sh
+#!/bin/bash
+brew update
+brew tap homebrew/science
+brew install python
+sudo pip install --upgrade protobuf==2.6.0
+brew install homebrew/versions/protobuf260 --without-python
+brew install cmake python glog gflags openblas wget md5sha1sum
+wget https://github.com/google/googletest/archive/release-1.8.0.tar.gz -O gtest.tar.gz
+tar xf gtest.tar.gz
+cd googletest-release-1.8.0/
+cmake .
+make install
--- a/paddle/scripts/travis/build_and_test.sh
+++ b/paddle/scripts/travis/build_and_test.sh
 #!/bin/bash
 source ./common.sh
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_TESTING=ON -DON_TRAVIS=ON
+CMAKE_EXTRA=""
-make -j `nproc`
+if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
-env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="-j `nproc`"
+  CMAKE_EXTRA="-DPYTHON_LIBRARY=/usr/local/Cellar/python/2.7.12_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/config/libpython2.7.dylib"
+fi
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_TESTING=ON -DON_TRAVIS=ON ${CMAKE_EXTRA}
+# Number of parallel jobs for make and ctest; stays at 1 when the OS is
+# neither linux nor osx.
+NPROC=1
+if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
+  # Was misspelled "NRPOC", which left NPROC at 1 on Linux.
+  NPROC=`nproc`
+elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
+  NPROC=`sysctl -n hw.ncpu`
+fi
+make -j $NPROC
+env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="-j $NPROC"
 sudo make install
 sudo paddle version
--- a/paddle/trainer/ThreadParameterUpdater.cpp
+++ b/paddle/trainer/ThreadParameterUpdater.cpp
@@ -20,6 +20,8 @@ limitations under the License. */
 #include "paddle/math/SparseRowMatrix.h"
 #include "paddle/utils/Thread.h"
+P_DECLARE_int32(trainer_count);
 namespace paddle {
 SgdThreadUpdater::SgdThreadUpdater(const OptimizationConfig& optConfig)
@@ -48,6 +50,13 @@ void SgdThreadUpdater::init(std::vector<ParameterPtr>& parameters) {
                                              false /*inPserver*/));
    size_t numRows = para->isGradSparseUpdate() ? para->getConfig().dims(0) : 0;
    optimizers_[pid]->init(numRows, &para->getConfig());
+    if (para->isGradSparseUpdate() && FLAGS_trainer_count == 1) {
+      // For trainer_count=1, the gradient machine is NeuralNetwork, which does
+      // not create parameter buf for PARAMETER_GRADIENT for sparse update in
+      // Parameter::enableType(). But gradient parameter buf is still used
+      // in SgdThreadUpdater. We need to explicitly create it.
+      para->enableBufType(PARAMETER_GRADIENT);
+    }
  }
 }
@@ -211,7 +220,7 @@ void SgdThreadUpdater::threadUpdateSparse(
    // From MultiGradientMachine
    SparseRowIdsCpuMatrix* mainMat = dynamic_cast<SparseRowIdsCpuMatrix*>(
      para->getMat(PARAMETER_GRADIENT).get());
-    const std::vector<uint32_t>& sparseIds = mainMat->getIds(tid);
+    std::vector<uint32_t>& sparseIds = mainMat->getIds(tid);
    for (auto id : sparseIds) {
      // setup sub bufs
@@ -221,6 +230,7 @@ void SgdThreadUpdater::threadUpdateSparse(
      optimizer->update(vecs, para->getConfig(), id);
      vecs[PARAMETER_GRADIENT]->zeroMem();
    }
+    sparseIds.clear();
  } else if (dynamic_cast<SparseRowCpuMatrix*>(
               para->getMat(PARAMETER_GRADIENT).get())) {
    // From NeuralNetwork
@@ -246,6 +256,10 @@ void SgdThreadUpdater::threadUpdateSparse(
      optimizer->update(vecs, para->getConfig(), id);
      vecs[PARAMETER_GRADIENT]->zeroMem();
    }
+    // For numThreads > 1, MultiGradientMachine is used, which goes
+    // to the above branch.
+    CHECK_EQ(numThreads, 1UL);
+    mainMat->clearIndices();
  } else {
    auto & m = *para->getMat(PARAMETER_GRADIENT).get();
    LOG(FATAL) << "Internal error: " << para->getName() << " "

--- a/paddle/trainer/tests/test_config.conf
+++ b/paddle/trainer/tests/test_config.conf
@@ -13,157 +13,71 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#Todo(luotao02) This config is only used for unitest. It is out of date now, and will be updated later.
+from paddle.trainer_config_helpers import *
-default_initial_std(0.5)
+TrainData(ProtoData(
+    files = "dummy_list",
-model_type("nn")
+    constant_slots = [1.0],
+    async_load_data = True))
-DataLayer(
-    name = "input",
+TestData(SimpleData(
-    size = 3,
+    files = "trainer/tests/sample_filelist.txt",
-)
+    feat_dim = 3,
+    context_len = 0,
-DataLayer(
+    buffer_capacity = 1000000,
-    name = "weight",
+    async_load_data = False))
-    size = 1,
-)
+settings(batch_size = 100)
-Layer(
+data = data_layer(name='input', size=3)
-    name = "layer1_1",
-    type = "fc",
+wt = data_layer(name='weight', size=1)
-    size = 5,
-    active_type = "sigmoid",
+fc1 = fc_layer(input=data, size=5,
-    inputs = "input",
+               bias_attr=True,
-)
+               act=SigmoidActivation())
-Layer(
+fc2 = fc_layer(input=data, size=12,
-    name = "layer1_2",
+               bias_attr=True,
-    type = "fc",
+               param_attr=ParamAttr(name='sharew'),
-    size = 12,
+               act=LinearActivation())
-    active_type = "linear",
-    inputs = Input("input", parameter_name='sharew'),
+fc3 = fc_layer(input=data, size=3,
-)
+               bias_attr=True,
+               act=TanhActivation())
-Layer(
-    name = "layer1_3",
+fc4 = fc_layer(input=data, size=5,
-    type = "fc",
+               bias_attr=True,
-    size = 3,
+               layer_attr=ExtraAttr(drop_rate=0.5),
-    active_type = "tanh",
+               act=SquareActivation())
-    inputs = "input",
-)
+pool = img_pool_layer(input=fc2,
+                      pool_size=2,
-Layer(
+                      pool_size_y=3,
-    name = "layer1_5",
+                      num_channels=1,
-    type = "fc",
+                      padding=1,
-    size = 3,
+                      padding_y=2,
-    active_type = "tanh",
+                      stride=2,
-    inputs = Input("input",
+                      stride_y=3,
-              learning_rate=0.01,
+                      img_width=3,
-              momentum=0.9,
+                      pool_type=CudnnAvgPooling())
-              decay_rate=0.05,
-              initial_mean=0.0,
+concat = concat_layer(input=[fc3, fc4])
-              initial_std=0.01,
-              format = "csc",
+with mixed_layer(size=3, act=SoftmaxActivation()) as output:
-              nnz = 4)
+    output += full_matrix_projection(input=fc1)
-)
+    output += trans_full_matrix_projection(input=fc2,
+                                           param_attr=ParamAttr(name='sharew'))
-FCLayer(
+    output += full_matrix_projection(input=concat)
-    name = "layer1_4",
+    output += identity_projection(input=fc3)
-    size = 5,
-    active_type = "square",
+lbl = data_layer(name='label', size=1)
-    inputs = "input",
-    drop_rate = 0.5,
+cost = classification_cost(input=output, label=lbl, weight=wt,
-)
+                           layer_attr=ExtraAttr(device=-1))
-Layer(
+nce = nce_layer(input=fc2, label=lbl, weight=wt,
-    name = "pool",
+                num_classes=3, 
-    type = "pool",
+                neg_distribution=[0.1, 0.3, 0.6])
-    inputs = Input("layer1_2",
-                   pool = Pool(pool_type="cudnn-avg-pool",
+outputs(cost, nce)
-                               channels = 1,
-                               size_x = 2,
-                               size_y = 3,
-                               img_width = 3,
-                               padding = 1,
-                               padding_y = 2,
-                               stride = 2,
-                               stride_y = 3))
-)
-Layer(
-    name = "concat",
-    type = "concat",
-    inputs = ["layer1_3", "layer1_4"],
-)
-MixedLayer(
-    name = "output",
-    size = 3,
-    active_type = "softmax",
-    inputs = [
-        FullMatrixProjection("layer1_1",
-              learning_rate=0.1),
-        TransposedFullMatrixProjection("layer1_2", parameter_name='sharew'),
-        FullMatrixProjection("concat"),
-        IdentityProjection("layer1_3"),
-    ],
-)
-Layer(
-    name = "label",
-    type = "data",
-    size = 1,
-)
-Layer(
-    name = "cost",
-    type = "multi-class-cross-entropy",
-    inputs = ["output", "label", "weight"],
-)
-Layer(
-    name = "cost2",
-    type = "nce",
-    num_classes = 3,
-    active_type = "sigmoid",
-    neg_sampling_dist = [0.1, 0.3, 0.6],
-    inputs = ["layer1_2", "label", "weight"],
-)
-Evaluator(
-    name = "error",
-    type = "classification_error",
-    inputs = ["output", "label", "weight"]
-)
-Inputs("input", "label", "weight")
-Outputs("cost", "cost2")
-TrainData(
-    ProtoData(
-        files = "dummy_list",
-        constant_slots = [1.0],
-        async_load_data = True,
-    )
-)
-TestData(
-    SimpleData(
-        files = "trainer/tests/sample_filelist.txt",
-        feat_dim = 3,
-        context_len = 0,
-        buffer_capacity = 1000000,
-        async_load_data = False,
-    ),
-)
-Settings(
-    algorithm = "sgd",
-    num_batches_per_send_parameter = 1,
-    num_batches_per_get_parameter = 1,
-    batch_size = 100,
-    learning_rate = 0.001,
-    learning_rate_decay_a = 1e-5,
-    learning_rate_decay_b = 0.5,
-)
--- a/paddle/utils/.gitignore
+++ b/paddle/utils/.gitignore
+enable_virtualenv.c
--- a/paddle/utils/CMakeLists.txt
+++ b/paddle/utils/CMakeLists.txt
@@ -2,6 +2,9 @@
 file(GLOB UTIL_HEADERS . *.h)
 file(GLOB UTIL_SOURCES . *.cpp)
+# Turn enable_virtualenv.py into the C source enable_virtualenv.c and add it to
+# the paddle_utils sources via UTIL_RES.
+# NOTE(review): create_resources is defined elsewhere; presumably it emits the
+# script as the `enable_virtualenv_py` char array used by PythonUtil.cpp - confirm.
+create_resources(enable_virtualenv.py enable_virtualenv.c)
+set(UTIL_RES enable_virtualenv.c)
 if(APPLE)
    file(GLOB UTIL_ARCH_SOURCES . arch/osx/*.cpp)
 else()
@@ -9,7 +12,8 @@ else()
 endif()
 add_library(paddle_utils STATIC
        ${UTIL_SOURCES}
-        ${UTIL_ARCH_SOURCES})
+        ${UTIL_ARCH_SOURCES}
+        ${UTIL_RES})
 add_style_check_target(paddle_utils ${UTIL_HEADERS})
 add_style_check_target(paddle_utils ${UTIL_SOURCES}
    ${UTIL_ARCH_SOURCES})

--- a/paddle/utils/Logging.h
+++ b/paddle/utils/Logging.h
@@ -191,7 +191,7 @@ void installFailureWriter(void(*callback)(const char*, int));
 }
 #endif  // PADDLE_USE_GLOG
-#ifdef NDEBUG
+#ifndef NDEBUG
 #define DEBUG_LEVEL 5
 #define DBG VLOG(DEBUG_LEVEL)
 #else

--- a/paddle/utils/PythonUtil.cpp
+++ b/paddle/utils/PythonUtil.cpp
@@ -77,11 +77,18 @@ static std::recursive_mutex g_pyMutex;
 PyGuard::PyGuard() : guard_(g_pyMutex) {}
-static void printPyErrorStack(std::ostream& os, bool withEndl = false) {
+static void printPyErrorStack(std::ostream& os, bool withEndl = false,
+                              bool withPyPath = true) {
  PyObject * ptype, *pvalue, *ptraceback;
  PyErr_Fetch(&ptype, &pvalue, &ptraceback);
  PyErr_NormalizeException(&ptype, &pvalue, &ptraceback);
  PyErr_Clear();
+  if (withPyPath) {
+    os << "Current PYTHONPATH: " << py::repr(PySys_GetObject(strdup("path")));
+    if (withEndl) {
+      os << std::endl;
+    }
+  }
  PyTracebackObject* obj = (PyTracebackObject*)ptraceback;
  os << "Python Error: " << PyString_AsString(PyObject_Str(ptype))
@@ -114,10 +121,7 @@ PyObjectPtr callPythonFuncRetPyObj(const std::string& moduleName,
                                   const std::string& funcName,
                                   const std::vector<std::string>& args) {
  PyGuard guard;
-  PyObjectPtr pyModuleName(PyString_FromString(moduleName.c_str()));
+  PyObjectPtr pyModule = py::import(moduleName);
-  CHECK_PY(pyModuleName) << "Import PyModule failed" << moduleName;
-  PyObjectPtr pyModule(PyImport_Import(pyModuleName.get()));
-  CHECK_PY(pyModule) << "Import Python Module"<< moduleName << " failed.";
  PyObjectPtr pyFunc(PyObject_GetAttrString(pyModule.get(), funcName.c_str()));
  CHECK_PY(pyFunc) << "GetAttrString failed.";
  PyObjectPtr pyArgs(PyTuple_New(args.size()));
@@ -143,7 +147,7 @@ PyObjectPtr createPythonClass(
    const std::vector<std::string>& args,
    const std::map<std::string, std::string>& kwargs) {
  PyGuard guard;
-  PyObjectPtr pyModule(PyImport_ImportModule(moduleName.c_str()));
+  PyObjectPtr pyModule = py::import(moduleName);
  LOG(INFO) << "createPythonClass moduleName.c_str:" << moduleName.c_str();
  CHECK_PY(pyModule) << "Import module " << moduleName << " failed.";
  PyObjectPtr pyDict(PyModule_GetDict(pyModule.get()));
@@ -181,18 +185,29 @@ std::string getPyCallStack() {
  printPyErrorStack(os, true);
  return os.str();
 }
+/**
+ * Import a Python module by name.
+ *
+ * @param moduleName name of the module, passed to PyImport_ImportModule.
+ * @return owning pointer to the imported module object.
+ * @note CHECK_PY fails (reporting the Python call stack) when the import
+ *       returns a null pointer.
+ */
+PyObjectPtr import(const std::string &moduleName) {
+  auto module = PyImport_ImportModule(moduleName.c_str());
+  // Keep a space before "Error" so the module name stays readable in the log.
+  CHECK_PY(module) << "Import " << moduleName << " Error";
+  return PyObjectPtr(module);
+}
 }  // namespace py
 #endif
+extern "C" {
+extern const char enable_virtualenv_py[];
+}
 void initPython(int argc, char** argv) {
 #ifndef PADDLE_NO_PYTHON
  Py_SetProgramName(argv[0]);
  Py_Initialize();
  PySys_SetArgv(argc, argv);
  // python blocks SIGINT. Need to enable it.
  signal(SIGINT, SIG_DFL);
+  // Manually activate virtualenv when user is using virtualenv
+  PyRun_SimpleString(enable_virtualenv_py);
 #endif
 }

--- a/paddle/utils/PythonUtil.h
+++ b/paddle/utils/PythonUtil.h
@@ -87,6 +87,8 @@ PyObjectPtr createPythonClass(const std::string& moduleName,
  CHECK((x) != nullptr) << ::paddle::py::getPyCallStack()
 namespace py {
+PyObjectPtr import(const std::string& moduleName);
 /**
 * Cast a PyLong or PyInt to int type T.
 * @tparam T return type.

--- a/paddle/utils/Queue.h
+++ b/paddle/utils/Queue.h
@@ -135,6 +135,21 @@ public:
    queueCV_.wait(lock, [this]() { return numElements_ == 0; });
  }
+  /**
+   * @brief Block until the queue holds at least one element or the time
+   *        limit elapses, whichever happens first.
+   * @param seconds maximum time to wait, in seconds.
+   * @return true if the queue is not empty; false if the wait timed out
+   *         while the queue was still empty.
+   */
+  bool waitNotEmptyFor(int seconds) {
+    std::unique_lock<std::mutex> guard(queueLock_);
+    const auto limit = std::chrono::seconds(seconds);
+    return queueCV_.wait_for(guard, limit,
+                             [this] { return numElements_ != 0; });
+  }
 private:
  std::deque<T> elements_;
  int numElements_;

--- a/paddle/utils/enable_virtualenv.py
+++ b/paddle/utils/enable_virtualenv.py
+import os
+def __activate_virtual_env__():
+  # Activate the virtualenv named by $VIRTUAL_ENV inside the embedded
+  # interpreter. Does nothing when the variable is unset.
+  __path__ = os.getenv('VIRTUAL_ENV')
+  if __path__ is None:
+    return
+  __script__ = os.path.join(__path__, 'bin', 'activate_this.py')
+  # Not every environment ships activate_this.py; skip quietly instead of
+  # letting execfile raise IOError and print a traceback at startup.
+  if not os.path.isfile(__script__):
+    return
+  execfile(__script__, {'__file__': __script__})
+__activate_virtual_env__()
--- a/proto/ModelConfig.proto.m4
+++ b/proto/ModelConfig.proto.m4
@@ -170,6 +170,15 @@ message BlockExpandConfig {
  required uint32 img_size_y = 11;
 }
+// Settings for the maxout operation; carried per layer input through
+// LayerInputConfig.maxout_conf.
+message MaxOutConfig {
+  // Channel number of the input layer.
+  required uint32 channels = 1;
+  // Group number of the input layer.
+  required uint32 groups = 2;
+  // The size of input feature map.
+  required uint32 img_size_x = 3;
+  required uint32 img_size_y = 4;
+}
 message ProjectionConfig {
  required string type = 1;
  required string name = 2;
@@ -235,6 +244,7 @@ message LayerInputConfig {
  // Set the argument name.
  optional string input_layer_argument = 9;
  optional BilinearInterpConfig bilinear_interp_conf = 10;
+  optional MaxOutConfig maxout_conf = 11;
 }
 message LayerConfig {

--- a/python/paddle/trainer/PyDataProvider2.py
+++ b/python/paddle/trainer/PyDataProvider2.py
@@ -208,7 +208,6 @@ def provider(input_types=None, should_shuffle=None, pool_size=-1,
             calc_batch_size=None,
             cache=CacheType.NO_CACHE,
             check=False, check_fail_continue=False,
-             use_dynamic_order=True,
             init_hook=None, **kwargs):
    """
    Provider decorator. Use it to make a function into PyDataProvider2 object.
@@ -228,9 +227,15 @@ def provider(input_types=None, should_shuffle=None, pool_size=-1,
    The configuration of data provider should be setup by\:
    :param input_types: Specify the input types, can also be set in init_hook.
-                        It is a list of InputType object. For example, input_types= \
+                        It could be a list of InputType object. For example,
-                        [dense_vector(9), integer_value(2)].
+                        input_types=[dense_vector(9), integer_value(2)]. Or user
-    :type input_types: list|tuple
+                        can set a dict of InputType object, which key is
+                        data_layer's name. For example, input_types=\
+                        {'img': img_features, 'label': label}. when using dict of
+                        InputType, user could yield a dict of feature values, which
+                        key is also data_layer's name.
+    :type input_types: list|tuple|dict
    :param should_shuffle: True if data should shuffle. Pass None means shuffle
                           when is training and not to shuffle when is testing.
@@ -281,12 +286,6 @@ def provider(input_types=None, should_shuffle=None, pool_size=-1,
                                drop the wrong format data when it is True. Has
                                no effect when check set to False.
    :type check_fail_continue: bool
-    :param use_dynamic_order: Allow provider to yield a dictionary object, whose
-                              key is a input data layer name, and value is the
-                              feature value. The tuples are still allowed when
-                              use_dynmaic_order is True.
-    :type use_dynamic_order: bool
    """
    def __wrapper__(generator):
@@ -340,6 +339,11 @@ def provider(input_types=None, should_shuffle=None, pool_size=-1,
                assert self.slots is not None
                assert self.generator is not None
+                use_dynamic_order = False
+                if isinstance(self.slots, dict):  # reorder input_types
+                    self.slots = [self.slots[ipt] for ipt in self.input_order]
+                    use_dynamic_order = True
                if len(self.slots) == 1:
                    self.generator = SingleSlotWrapper(self.generator)

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -216,6 +216,10 @@ def Inputs(*args):
        if g_current_submodel is g_root_submodel:
            g_config.model_config.input_layer_names.append(name)
+@config_func
+def HasInputsSet():
+    """
+    Tell whether any input layer name has been recorded in the model config.
+
+    :return: True if g_config.model_config.input_layer_names is non-empty.
+    :rtype: bool
+    """
+    input_names = g_config.model_config.input_layer_names
+    return len(input_names) > 0
 # Define the name of the output layers of the NeuralNetwork.
 # Usually the output is simply the cost layer.
@@ -466,6 +470,7 @@ class Input(Cfg):
            pool=None,
            image=None,
            block_expand=None,
+            maxout=None,
            format=None,
            nnz=None,
            is_static=None,
@@ -794,6 +799,16 @@ class BlockExpand(Cfg):
            output_y = 0):
        self.add_keys(locals())
+# Holder for the maxout settings attached to a layer input (passed as
+# Input(..., maxout=MaxOut(...))). img_size_x and img_size_y default to 0.
+@config_class
+class MaxOut(Cfg):
+    def __init__(
+            self,
+            channels,
+            groups,
+            img_size_x = 0,
+            img_size_y = 0):
+        # locals() here holds exactly the constructor arguments, so no other
+        # local variable may be introduced before this call.
+        # NOTE(review): add_keys presumably stores them as attributes, since
+        # parse_maxout reads them back by name - confirm against Cfg.
+        self.add_keys(locals())
 def DataBase(async_load_data=False,
             constant_slots=None,
             data_ratio=1,
@@ -1098,6 +1113,12 @@ def parse_block_expand(block_expand, input_layer_name, block_expand_conf):
            int(math.ceil((2 * block_expand.padding_y + block_expand.img_size_y \
            - block_expand.block_y) / float(block_expand.stride_y)))
+def parse_maxout(maxout, input_layer_name, maxout_conf):
+    """
+    Copy the maxout settings from the user-side config object into the
+    protobuf message.
+
+    :param maxout: object exposing channels, groups, img_size_x, img_size_y.
+    :param input_layer_name: name of the input layer (not used here).
+    :param maxout_conf: destination config; the four fields are assigned on it.
+    """
+    for field in ('channels', 'groups', 'img_size_x', 'img_size_y'):
+        setattr(maxout_conf, field, getattr(maxout, field))
 # Define an evaluator
 @config_func
 def Evaluator(
@@ -1721,6 +1742,21 @@ class BlockExpandLayer(LayerBase):
            self.set_layer_size(block_expand_conf.block_x * block_expand_conf.block_y
                * block_expand_conf.channels)
+@config_layer('maxout')
+class MaxOutLayer(LayerBase):
+    """
+    Config of the 'maxout' layer. Fills maxout_conf of its first input and
+    sets the layer size to (input layer size) / groups.
+    """
+    def __init__(self, name, inputs, **xargs):
+        super(MaxOutLayer, self).__init__(
+            name, 'maxout', 0, inputs=inputs, **xargs)
+        src_layer = self.get_input_layer(0)
+        conf = self.config.inputs[0].maxout_conf
+        parse_maxout(self.inputs[0].maxout, src_layer.name, conf)
+        self.set_layer_size(g_layer_map[src_layer.name].size / conf.groups)
 # key: cost type
 # value: cost class
 g_cost_map = {}
@@ -1735,7 +1771,6 @@ def define_cost(class_name, cost_type):
    g_cost_map[cost_type] = cls
 define_cost('MultiClassCrossEntropy', 'multi-class-cross-entropy')
-define_cost('ClassificationErrorLayer', 'classification_error')
 define_cost('RankingCost', 'rank-cost')
 define_cost('AucValidation', 'auc-validation')
 define_cost('PnpairValidation', 'pnpair-validation')

--- a/python/paddle/trainer_config_helpers/data_sources.py
+++ b/python/paddle/trainer_config_helpers/data_sources.py
@@ -68,7 +68,7 @@ def define_py_data_source(file_list, cls, module,
        file_list_name = 'train.list'
        if isinstance(cls, TestData):
            file_list_name = 'test.list'
-        with open(file_list_name, 'r') as f:
+        with open(file_list_name, 'w') as f:
            f.writelines(file_list)
        file_list = file_list_name
@@ -84,6 +84,7 @@ def define_py_data_source(file_list, cls, module,
            data.load_data_module = load_data_module
            data.load_data_object = load_data_object
            data.load_data_args = load_data_args
+            data.async_load_data = True
            return data
        data_cls = py_data2

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -50,11 +50,12 @@ __all__ = ["full_matrix_projection", "AggregateLevel", "ExpandLevel",
           'slope_intercept_layer', 'trans_full_matrix_projection',
           'linear_comb_layer',
           'convex_comb_layer', 'ctc_layer', 'crf_layer', 'crf_decoding_layer',
+           'nce_layer',
           'cross_entropy_with_selfnorm', 'cross_entropy',
           'multi_binary_label_cross_entropy',
           'rank_cost', 'lambda_cost', 'huber_cost',
           # 'block_expand_layer',  # TODO(yuyang18): this layer is not correct
-           'out_prod_layer', 'print_layer'
+           'maxout_layer', 'out_prod_layer', 'print_layer'
           ]
@@ -110,12 +111,14 @@ class LayerType(object):
    SLOPE_INTERCEPT_LAYER = "slope_intercept"
    LINEAR_COMBINATION_LAYER = "convex_comb"
    BLOCK_EXPAND = "blockexpand"
+    MAXOUT = "maxout"
    PRINT_LAYER = "print"
    CTC_LAYER = "ctc"
    CRF_LAYER = "crf"
    CRF_DECODING_LAYER = "crf_decoding"
+    NCE_LAYER = 'nce'
    RANK_COST = "rank-cost"
    LAMBDA_COST = "lambda_cost"
@@ -169,7 +172,7 @@ class LayerOutput(object):
    :param activation: Layer Activation.
    :type activation: BaseActivation.
    :param parents: Layer's parents.
-    :type parents: list|tuple|collection.Sequence
+    :type parents: list|tuple|collections.Sequence
    """
    def __init__(self, name, layer_type, parents=None, activation=None,
@@ -1692,7 +1695,7 @@ def img_conv_layer(input, filter_size, num_filters,
 @layer_support()
 def img_pool_layer(input, pool_size, name=None,
                   num_channels=None, pool_type=None,
-                   stride=1, start=None, padding=0, layer_attr=None,
+                   stride=1, padding=0, layer_attr=None,
                   pool_size_y=None, stride_y=None, padding_y=None,
                   img_width=None):
    """
@@ -1723,8 +1726,6 @@ def img_pool_layer(input, pool_size, name=None,
    :type stride: int
    :param stride_y: stride height of pooling. It is equal to stride by default.
    :type stride_y: int|None
-    :param start: start position of pooling operation. Note it is deprecated now.
-    :type start: int|None
    :param layer_attr: Extra Layer attribute.
    :type layer_attr: ExtraLayerAttribute
    :param img_width: the width of input feature map. If it is None, the input feature
@@ -1758,7 +1759,7 @@ def img_pool_layer(input, pool_size, name=None,
                          pool_type=type_name,
                          channels=num_channels,
                          size_x=pool_size,
-                          start=start,
+                          start=None,
                          stride=stride,
                          padding=padding,
                          size_y=pool_size_y,
@@ -2053,10 +2054,16 @@ def concat_layer(input, act=None, name=None, layer_attr=None):
    Concat all input vector into one huge vector.
    Inputs can be list of LayerOutput or list of projection.
+    The example usage is:
+    ..  code-block:: python
+        concat = concat_layer(input=[layer1, layer2])
    :param name: Layer name.
    :type name: basestring
    :param input: input layers or projections
-    :type input: list|tuple|collection.Sequence
+    :type input: list|tuple|collections.Sequence
    :param act: Activation type.
    :type act: BaseActivation
    :param layer_attr: Extra Layer Attribute.
@@ -2842,30 +2849,52 @@ def beam_search(step, input, bos_id, eos_id, beam_size,
    return tmp
+def __cost_input__(input, label, weight=None):
+    """
+    Build the inputs and parents shared by the cost layers.
+
+    :param input: prediction layer.
+    :param label: label layer.
+    :param weight: optional weight layer; must be a data layer when given.
+    :return: (list of Input configs, list of parent layers), in the order
+             input, label and then weight if present.
+    """
+    parents = [input, label]
+    if weight is not None:
+        assert weight.layer_type == LayerType.DATA
+        parents.append(weight)
+    ipts = [Input(each.name) for each in parents]
+    return ipts, parents
 @wrap_name_default()
-def regression_cost(input, label, cost='square_error', name=None):
+@layer_support()
+def regression_cost(input, label, weight=None, name=None,
+                    layer_attr=None):
    """
    Regression Layer.
    TODO(yuyang18): Complete this method.
    :param name: layer name.
+    :type name: basestring
    :param input: Network prediction.
+    :type input: LayerOutput
    :param label: Data label.
-    :param cost: Cost method.
+    :type label: LayerOutput
+    :param weight: The weight affects the cost, namely the scale of cost.
+                   It is an optional argument.
+    :type weight: LayerOutput
+    :param layer_attr: layer's extra attribute.
+    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
+    :rtype: LayerOutput
    """
-    Layer(inputs=[Input(input.name), Input(label.name)], type=cost, name=name)
+    ipts, parents = __cost_input__(input, label, weight)
-    return LayerOutput(
-        name, LayerType.COST, parents=[input, label]
+    Layer(inputs=ipts, type="square_error", name=name,
-    )
+          **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(name, LayerType.COST, parents=parents)
 @wrap_name_default("cost")
 @layer_support()
-def classification_cost(input, label, name=None,
+def classification_cost(input, label, weight=None, name=None,
-                        cost="multi-class-cross-entropy",
                        evaluator=classification_error_evaluator,
                        layer_attr=None):
    """
@@ -2877,8 +2906,9 @@ def classification_cost(input, label, name=None,
    :type input: LayerOutput
    :param label: label layer name. data_layer often.
    :type label: LayerOutput
-    :param cost: cost method.
+    :param weight: The weight affects the cost, namely the scale of cost.
-    :type cost: basestring
+                   It is an optional argument.
+    :type weight: LayerOutput
    :param evaluator: Evaluator method.
    :param layer_attr: layer's extra attribute.
    :type layer_attr: ExtraLayerAttribute
@@ -2888,7 +2918,10 @@ def classification_cost(input, label, name=None,
    assert input.layer_type != LayerType.DATA
    assert isinstance(input.activation, SoftmaxActivation)
    assert label.layer_type == LayerType.DATA
-    Layer(name=name, type=cost, inputs=[Input(input.name), Input(label.name)],
+    ipts, parents = __cost_input__(input, label, weight)
+    Layer(name=name, type="multi-class-cross-entropy", inputs=ipts,
          **ExtraLayerAttribute.to_kwargs(layer_attr))
    def __add_evaluator__(e):
@@ -2900,7 +2933,7 @@ def classification_cost(input, label, name=None,
        assert isinstance(e.for_classification, bool)
        assert e.for_classification
-        e(name=e.__name__, input=input, label=label)
+        e(name=e.__name__, input=input, label=label, weight=weight)
    if not isinstance(evaluator, collections.Sequence):
        evaluator = [evaluator]
@@ -2908,7 +2941,7 @@ def classification_cost(input, label, name=None,
    for each_evaluator in evaluator:
        __add_evaluator__(each_evaluator)
-    return LayerOutput(name, LayerType.COST, parents=[input, label])
+    return LayerOutput(name, LayerType.COST, parents=parents)
 def conv_operator(img, filter, filter_size, num_filters,
@@ -2984,7 +3017,8 @@ def conv_operator(img, filter, filter_size, num_filters,
 @wrap_name_default()
-def conv_shift_layer(a, b, name=None):
+@layer_support()
+def conv_shift_layer(a, b, name=None, layer_attr=None):
    """
    This layer performs cyclic convolution for two input. For example:
      - a[in]: contains M elements.
@@ -3013,6 +3047,8 @@ def conv_shift_layer(a, b, name=None):
    :type a: LayerOutput
    :param b: input layer b
    :type b: LayerOutput
+    :param layer_attr: layer's extra attribute.
+    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -3022,6 +3058,7 @@ def conv_shift_layer(a, b, name=None):
        name=name,
        type=LayerType.CONV_SHIFT_LAYER,
        inputs=[a.name, b.name],
+        **ExtraLayerAttribute.to_kwargs(layer_attr)
    )
    return LayerOutput(name, LayerType.CONV_SHIFT_LAYER, parents=[a, b],
@@ -3095,6 +3132,7 @@ def tensor_layer(a, b, size, act=None, name=None,
 @wrap_param_attr_default()
 @wrap_bias_attr_default()
 @wrap_act_default()
+@layer_support()
 def selective_fc_layer(input, select, size, act=None, name=None,
                       pass_generation=False,
                       has_selected_colums=True,
@@ -3167,7 +3205,8 @@ def selective_fc_layer(input, select, size, act=None, name=None,
 @wrap_name_default()
-def sampling_id_layer(input, name=None):
+@layer_support()
+def sampling_id_layer(input, name=None, layer_attr=None):
    """
    A layer for sampling id from multinomial distribution from the input layer.
    Sampling one id for one sample.
@@ -3182,6 +3221,8 @@ def sampling_id_layer(input, name=None):
    :type input: LayerOutput
    :param name: The Layer Name.
    :type name: basestring
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -3189,12 +3230,15 @@ def sampling_id_layer(input, name=None):
        name=name,
        type=LayerType.SAMPLING_ID_LAYER,
        inputs=[Input(input.name)],
+        **ExtraLayerAttribute.to_kwargs(layer_attr)
    )
    return LayerOutput(name, LayerType.SAMPLING_ID_LAYER, input)
 @wrap_name_default()
-def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0):
+@layer_support()
+def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0,
+                          layer_attr=None):
    """
    This layer for applying a slope and an intercept to the input
    element-wise. There is no activation and weight.
@@ -3216,6 +3260,8 @@ def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0):
    :type slope: float.
    :param intercept: the offset.
    :type intercept: float.
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -3225,12 +3271,15 @@ def slope_intercept_layer(input, name=None, slope=1.0, intercept=0.0):
        slope=slope,
        intercept=intercept,
        inputs=[Input(input.name)],
+        **ExtraLayerAttribute.to_kwargs(layer_attr)
    )
    return LayerOutput(name, LayerType.SLOPE_INTERCEPT_LAYER, input)
 @wrap_name_default()
-def linear_comb_layer(weights, vectors, size=None, name=None):
+@layer_support()
+def linear_comb_layer(weights, vectors, size=None, name=None,
+                      layer_attr=None):
    """
    A layer for weighted sum of vectors takes two inputs.
      - Input: size of weights is M
@@ -3271,6 +3320,8 @@ def linear_comb_layer(weights, vectors, size=None, name=None):
    :type size: int
    :param name: The Layer Name.
    :type name: basestring
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -3286,6 +3337,7 @@ def linear_comb_layer(weights, vectors, size=None, name=None):
        type=LayerType.LINEAR_COMBINATION_LAYER,
        size=size,
        inputs=[Input(weights.name), Input(vectors.name)],
+        **ExtraLayerAttribute.to_kwargs(layer_attr)
    )
    return LayerOutput(name, LayerType.LINEAR_COMBINATION_LAYER,
                       [weights, vectors], size=size)
@@ -3295,6 +3347,7 @@ convex_comb_layer = linear_comb_layer
 @wrap_name_default()
+@layer_support()
 def block_expand_layer(input,
                       channel=0,
                       block_x=0,
@@ -3303,7 +3356,8 @@ def block_expand_layer(input,
                       stride_y=0,
                       padding_x=0,
                       padding_y=0,
-                       name=None):
+                       name=None,
+                       layer_attr=None):
    """
    Expand feature map to minibatch matrix.
       - matrix width is: block_y * block_x * channel
@@ -3350,6 +3404,8 @@ def block_expand_layer(input,
    :type padding_y: int
    :param name: The name of this layer, which can not specify.
    :type name: None|basestring.
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -3364,13 +3420,83 @@ def block_expand_layer(input,
                                               padding_y=padding_y)
                      ),
          type=LayerType.BLOCK_EXPAND,
+          **ExtraLayerAttribute.to_kwargs(layer_attr)
          )
    return LayerOutput(name, LayerType.BLOCK_EXPAND, parents=[input])
 @wrap_name_default()
-def ctc_layer(input, label, size=None, name=None, norm_by_times=False):
+@layer_support()
+def maxout_layer(input,
+                 groups,
+                 num_channels=None,
+                 size_x=None,
+                 size_y=None,
+                 name=None,
+                 layer_attr=None):
+    """
+    A layer to do max out on conv layer output.
+      - Input: output of a conv layer.
+      - Output: feature map size same as input. Channel is (input channel) / groups.
+    So groups should be larger than 1, and the number of channels must be
+    divisible by groups.
+    Please refer to Paper:
+      - Maxout Networks: http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf
+      - Multi-digit Number Recognition from Street View \
+        Imagery using Deep Convolutional Neural Networks: \
+        https://arxiv.org/pdf/1312.6082v4.pdf
+    The simple usage is:
+    .. code-block:: python
+       maxout = maxout_layer(input,
+                             num_channels=128,
+                             groups=4)
+    :param input: The input layer.
+    :type input: LayerOutput
+    :param num_channels: The channel number of input layer. If None will be set
+                     automatically from previous output.
+    :type num_channels: int|None
+    :param groups: The group number of input layer.
+    :type groups: int
+    :param size_x: conv output width. If None will be set
+                   automatically from previous output.
+    :type size_x: int|None
+    :param size_y: conv output height. If None will be set
+                   automatically from previous output.
+    :type size_y: int|None
+    :param name: The name of this layer, which can not specify.
+    :type name: None|basestring.
+    :param layer_attr: Extra Layer attribute.
+    :type layer_attr: ExtraLayerAttribute
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert input.layer_type == LayerType.CONV_LAYER
+    assert isinstance(input.activation, LinearActivation)
+    assert groups > 1
+    if num_channels is None:
+        assert input.num_filters is not None
+        num_channels = input.num_filters
+    assert num_channels % groups == 0
+    # Forward only the sizes the caller supplied; MaxOut keeps its own
+    # defaults for any that were left as None.
+    maxout_args = dict(channels=num_channels, groups=groups)
+    if size_x is not None:
+        maxout_args['img_size_x'] = size_x
+    if size_y is not None:
+        maxout_args['img_size_y'] = size_y
+    Layer(name=name,
+          inputs=Input(input.name, maxout=MaxOut(**maxout_args)),
+          type=LayerType.MAXOUT,
+          **ExtraLayerAttribute.to_kwargs(layer_attr))
+    return LayerOutput(name, LayerType.MAXOUT, parents=[input])
+@wrap_name_default()
+@layer_support()
+def ctc_layer(input, label, size=None, name=None, norm_by_times=False,
+              layer_attr=None):
    """
    Connectionist Temporal Classification (CTC) is designed for temporal
    classication task. That is, for sequence labeling problems where the
@@ -3407,6 +3533,8 @@ def ctc_layer(input, label, size=None, name=None, norm_by_times=False):
    :type name: basestring|None
    :param norm_by_times: Whether to normalization by times. False by default.
    :type norm_by_times: bool
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -3422,14 +3550,17 @@ def ctc_layer(input, label, size=None, name=None, norm_by_times=False):
        type=LayerType.CTC_LAYER,
        size=size,
        norm_by_times=norm_by_times,
-        inputs=[input.name, label.name]
+        inputs=[input.name, label.name],
+        **ExtraLayerAttribute.to_kwargs(layer_attr)
    )
    return LayerOutput(name, LayerType.CTC_LAYER, [input, label], size=size)
 @wrap_name_default()
 @wrap_param_attr_default()
-def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None):
+@layer_support()
+def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None,
+              layer_attr=None):
    """
    A layer for calculating the cost of sequential conditional random
    field model.
@@ -3455,6 +3586,8 @@ def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None):
    :type param_attr: ParameterAttribute
    :param name: The name of this layers. It is not necessary.
    :type name: None|basestring
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -3478,6 +3611,7 @@ def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None):
        type=LayerType.CRF_LAYER,
        size=size,
        inputs=ipts,
+        **ExtraLayerAttribute.to_kwargs(layer_attr)
    )
    parents = [input, label]
    if weight is not None:
@@ -3487,7 +3621,9 @@ def crf_layer(input, label, size=None, weight=None, param_attr=None, name=None):
 @wrap_name_default()
 @wrap_param_attr_default()
-def crf_decoding_layer(input, size, label=None, param_attr=None, name=None):
+@layer_support()
+def crf_decoding_layer(input, size, label=None, param_attr=None, name=None,
+                       layer_attr=None):
    """
    A layer for calculating the decoding sequence of sequential conditional
    random field model. The decoding sequence is stored in output.ids.
@@ -3505,6 +3641,8 @@ def crf_decoding_layer(input, size, label=None, param_attr=None, name=None):
    :type param_attr: ParameterAttribute
    :param name: The name of this layers. It is not necessary.
    :type name: None|basestring
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -3521,12 +3659,90 @@ def crf_decoding_layer(input, size, label=None, param_attr=None, name=None):
        type=LayerType.CRF_DECODING_LAYER,
        size=size,
        inputs=ipts,
+        **ExtraLayerAttribute.to_kwargs(layer_attr)
    )
    parents = [input]
    if label is not None:
        parents.append(label)
    return LayerOutput(name, LayerType.CRF_DECODING_LAYER, parents, size=size)
+@wrap_bias_attr_default(has_bias=True)
+@wrap_name_default()
+@layer_support()
+def nce_layer(input, label, num_classes, weight=None,
+              num_neg_samples=10, neg_distribution=None,
+              name=None, bias_attr=None, layer_attr=None):
+    """
+    Noise-contrastive estimation.
+    Implements the method in the following paper:
+    A fast and simple algorithm for training neural probabilistic language models.
+
+    The example usage is:
+
+    .. code-block:: python
+
+       cost = nce_layer(input=layer1, label=layer2, weight=layer3,
+                        num_classes=3, neg_distribution=[0.1,0.3,0.6])
+
+    :param name: layer name
+    :type name: basestring
+    :param input: input layers. It could be a LayerOutput or a list/tuple of
+                  LayerOutput.
+    :type input: LayerOutput|list|tuple|collections.Sequence
+    :param label: label layer. It must be a data layer.
+    :type label: LayerOutput
+    :param weight: weight layer, can be None(default). If given, it must be
+                   a data layer.
+    :type weight: LayerOutput
+    :param num_classes: number of classes.
+    :type num_classes: int
+    :param num_neg_samples: number of negative samples. Default is 10.
+    :type num_neg_samples: int
+    :param neg_distribution: The distribution for generating the random negative labels.
+                             A uniform distribution will be used if not provided.
+                             If not None, its length must be equal to num_classes
+                             and its elements must sum to 1 (within 1e-5).
+    :type neg_distribution: list|tuple|collections.Sequence|None
+    :param bias_attr: Bias parameter attribute. False if no bias.
+    :type bias_attr: ParameterAttribute|None|False
+    :param layer_attr: Extra Layer Attribute.
+    :type layer_attr: ExtraLayerAttribute
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    if isinstance(input, LayerOutput):
+        input = [input]
+    assert isinstance(input, collections.Sequence)
+    assert isinstance(label, LayerOutput)
+    assert label.layer_type == LayerType.DATA
+    if neg_distribution is not None:
+        assert isinstance(neg_distribution, collections.Sequence)
+        assert len(neg_distribution) == num_classes
+        # Compare with a tolerance: an exact `== 1` check rejects valid
+        # distributions such as [0.1] * 10, whose float sum is not exactly 1.
+        assert abs(sum(neg_distribution) - 1.0) < 1e-5
+
+    # The layer's inputs are ordered as: all feature inputs, then the label,
+    # then the optional weight. `parents` mirrors that order.
+    ipts_for_layer = []
+    parents = []
+    for each_input in input:
+        assert isinstance(each_input, LayerOutput)
+        ipts_for_layer.append(each_input.name)
+        parents.append(each_input)
+    ipts_for_layer.append(label.name)
+    parents.append(label)
+
+    if weight is not None:
+        assert isinstance(weight, LayerOutput)
+        assert weight.layer_type == LayerType.DATA
+        ipts_for_layer.append(weight.name)
+        parents.append(weight)
+
+    Layer(
+        name=name,
+        type=LayerType.NCE_LAYER,
+        num_classes=num_classes,
+        neg_sampling_dist=neg_distribution,
+        num_neg_samples=num_neg_samples,
+        inputs=ipts_for_layer,
+        bias=ParamAttr.to_bias(bias_attr),
+        **ExtraLayerAttribute.to_kwargs(layer_attr)
+    )
+    return LayerOutput(name, LayerType.NCE_LAYER, parents=parents)
 """
 following are cost Layers.
@@ -3534,7 +3750,8 @@ following are cost Layers.
 @wrap_name_default()
-def rank_cost(left, right, label, weight=None, name=None, coeff=1.0):
+@layer_support()
+def rank_cost(left, right, label, weight=None, name=None, coeff=1.0, layer_attr=None):
    """
    A cost Layer for learning to rank using gradient descent. Details can refer
    to `papers <http://research.microsoft.com/en-us/um/people/cburges/papers/
@@ -3578,6 +3795,8 @@ def rank_cost(left, right, label, weight=None, name=None, coeff=1.0):
    :type name: None|basestring
    :param coeff: The coefficient affects the gradient in the backward.
    :type coeff: float
+    :param layer_attr: Extra Layer Attribute.
+    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -3595,13 +3814,15 @@ def rank_cost(left, right, label, weight=None, name=None, coeff=1.0):
          type=LayerType.RANK_COST,
          inputs=ipts,
          coeff=coeff,
+          **ExtraLayerAttribute.to_kwargs(layer_attr)
          )
    return LayerOutput(name, LayerType.RANK_COST, parents=parents)
 @wrap_name_default()
-def lambda_cost(input, score, name, NDCG_num=5, max_sort_size=-1):
+@layer_support()
+def lambda_cost(input, score, name, NDCG_num=5, max_sort_size=-1, layer_attr=None):
    """
    lambdaCost for lambdaRank LTR approach.
@@ -3632,6 +3853,8 @@ def lambda_cost(input, score, name, NDCG_num=5, max_sort_size=-1):
    :type max_sort_size: int
    :param name: The name of this layers. It is not necessary.
    :type name: None|basestring
+    :param layer_attr: Extra Layer Attribute.
+    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -3642,14 +3865,16 @@ def lambda_cost(input, score, name, NDCG_num=5, max_sort_size=-1):
          type=LayerType.LAMBDA_COST,
          inputs=[input.name, score.name],
          NDCG_num=NDCG_num,
-          max_sort_size=max_sort_size
+          max_sort_size=max_sort_size,
+          **ExtraLayerAttribute.to_kwargs(layer_attr)
          )
    return LayerOutput(name, LayerType.LAMBDA_COST, parents=[input, score])
 @wrap_name_default()
-def cross_entropy(input, label, name=None, coeff=1.0):
+@layer_support()
+def cross_entropy(input, label, name=None, coeff=1.0, layer_attr=None):
    """
    A loss layer for multi class entropy.
@@ -3667,6 +3892,8 @@ def cross_entropy(input, label, name=None, coeff=1.0):
    :type name: None|basestring.
    :param coeff: The coefficient affects the gradient in the backward.
    :type coeff: float.
+    :param layer_attr: Extra Layer Attribute.
+    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput.
    """
@@ -3675,13 +3902,16 @@ def cross_entropy(input, label, name=None, coeff=1.0):
          type=LayerType.CROSS_ENTROPY,
          inputs=[input.name, label.name],
          coeff=coeff,
+          **ExtraLayerAttribute.to_kwargs(layer_attr)
          )
    return LayerOutput(name, LayerType.CROSS_ENTROPY, parents=[input, label])
 @wrap_name_default()
+@layer_support()
 def cross_entropy_with_selfnorm(input, label, name=None, coeff=1.0,
-                                softmax_selfnorm_alpha=0.1):
+                                softmax_selfnorm_alpha=0.1,
+                                layer_attr=None):
    """
    A loss layer for multi class entropy with selfnorm.
@@ -3701,6 +3931,8 @@ def cross_entropy_with_selfnorm(input, label, name=None, coeff=1.0,
    :type coeff: float.
    :param softmax_selfnorm_alpha: The scale factor affects the cost.
    :type softmax_selfnorm_alpha: float.
+    :param layer_attr: Extra Layer Attribute.
+    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput.
    """
@@ -3709,6 +3941,7 @@ def cross_entropy_with_selfnorm(input, label, name=None, coeff=1.0,
          inputs=[input.name, label.name],
          coeff=coeff,
          softmax_selfnorm_alpha=softmax_selfnorm_alpha,
+          **ExtraLayerAttribute.to_kwargs(layer_attr)
          )
    return LayerOutput(name,
@@ -3717,7 +3950,8 @@ def cross_entropy_with_selfnorm(input, label, name=None, coeff=1.0,
 @wrap_name_default()
-def huber_cost(input, label, name=None, coeff=1.0):
+@layer_support()
+def huber_cost(input, label, name=None, coeff=1.0, layer_attr=None):
    """
    A loss layer for huber loss.
@@ -3733,6 +3967,8 @@ def huber_cost(input, label, name=None, coeff=1.0):
    :type name: None|basestring.
    :param coeff: The coefficient affects the gradient in the backward.
    :type coeff: float.
+    :param layer_attr: Extra Layer Attribute.
+    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput.
    """
@@ -3743,12 +3979,15 @@ def huber_cost(input, label, name=None, coeff=1.0):
          type=LayerType.HUBER,
          inputs=[input.name, label.name],
          coeff=coeff,
+          **ExtraLayerAttribute.to_kwargs(layer_attr)
          )
    return LayerOutput(name, LayerType.HUBER, parents=[input, label])
 @wrap_name_default()
-def multi_binary_label_cross_entropy(input, label, name=None, coeff=1.0):
+@layer_support()
+def multi_binary_label_cross_entropy(input, label, name=None, coeff=1.0,
+                                     layer_attr=None):
    """
    A loss layer for multi binary label cross entropy.
@@ -3766,6 +4005,8 @@ def multi_binary_label_cross_entropy(input, label, name=None, coeff=1.0):
    :type name: None|basestring
    :param coeff: The coefficient affects the gradient in the backward.
    :type coeff: float
+    :param layer_attr: Extra Layer Attribute.
+    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
@@ -3773,13 +4014,14 @@ def multi_binary_label_cross_entropy(input, label, name=None, coeff=1.0):
    if input.activation is None or \
            not isinstance(input.activation, SigmoidActivation):
        logger.log(logging.WARN,
-                   "%s is not recommend for batch normalization's activation, "
+                   "%s is not recommend for multi_binary_label_cross_entropy's activation, "
-                   "maybe the relu is better" % repr(input.activation))
+                   "maybe the sigmoid is better" % repr(input.activation))
    Layer(name=name,
          type=LayerType.MULTI_BIN_LABEL_CROSS_ENTROPY,
          inputs=[input.name, label.name],
          coeff=coeff,
+          **ExtraLayerAttribute.to_kwargs(layer_attr)
          )
    return LayerOutput(name, LayerType.MULTI_BIN_LABEL_CROSS_ENTROPY,
                       parents=[input, label])
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -20,7 +20,7 @@ from activations import LinearActivation, ReluActivation, SoftmaxActivation, \
    IdentityActivation, TanhActivation, SequenceSoftmaxActivation
 from attrs import ExtraAttr
 from default_decorators import wrap_name_default, wrap_act_default, \
-    wrap_param_default
+    wrap_param_default, wrap_bias_attr_default, wrap_param_attr_default
 from layers import *  # There are too many layers used in network, so import *
 from poolings import MaxPooling, SumPooling
 from paddle.trainer.config_parser import *
@@ -30,7 +30,7 @@ __all__ = ['sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool",
           'lstmemory_unit', 'small_vgg', 'img_conv_group', 'vgg_16_network',
           'gru_unit', 'gru_group', 'simple_gru', 'simple_attention',
           'text_conv_pool',
-           'bidirectional_lstm', 'outputs']
+           'bidirectional_lstm', 'inputs', 'outputs']
 ######################################################
@@ -133,7 +133,7 @@ def simple_img_conv_pool(input, filter_size, num_filters, pool_size, name=None,
                         pool_type=None, act=None, groups=1, conv_stride=1,
                         conv_padding=0, bias_attr=None, num_channel=None,
                         param_attr=None, shared_bias=True,
-                         conv_layer_attr=None, pool_stride=1, pool_start=None,
+                         conv_layer_attr=None, pool_stride=1,
                         pool_padding=0, pool_layer_attr=None):
    """
    Simple image convolution and pooling group.
@@ -172,8 +172,6 @@ def simple_img_conv_pool(input, filter_size, num_filters, pool_size, name=None,
    :type conv_layer_attr: ExtraLayerAttribute
    :param pool_stride: see img_pool_layer for details
    :type pool_stride: int
-    :param pool_start: see img_pool_layer for details. It is deprecated now.
-    :type pool_start: int
    :param pool_padding: see img_pool_layer for details
    :type pool_padding: int
    :param pool_layer_attr: see img_pool_layer for details
@@ -192,7 +190,7 @@ def simple_img_conv_pool(input, filter_size, num_filters, pool_size, name=None,
    return img_pool_layer(name="%s_pool" % name, input=_conv_,
                          pool_size=pool_size,
                          pool_type=pool_type, stride=pool_stride,
-                          start=pool_start, padding=pool_padding,
+                          padding=pool_padding,
                          layer_attr=pool_layer_attr)
@@ -203,7 +201,7 @@ def img_conv_bn_pool(input, filter_size, num_filters, pool_size, name=None,
                     conv_param_attr=None, shared_bias=True,
                     conv_layer_attr=None, bn_param_attr=None,
                     bn_bias_attr=None, bn_layer_attr=None, pool_stride=1,
-                     pool_start=None, pool_padding=0, pool_layer_attr=None):
+                     pool_padding=0, pool_layer_attr=None):
    """
    Convolution, batch normalization, pooling group.
@@ -243,8 +241,6 @@ def img_conv_bn_pool(input, filter_size, num_filters, pool_size, name=None,
    :param bn_layer_attr: ParameterAttribute.
    :param pool_stride: see img_pool_layer's document.
    :type pool_stride: int
-    :param pool_start: see img_pool_layer's document. It is deprecated now.
-    :type pool_start: int
    :param pool_padding: see img_pool_layer's document.
    :type pool_padding: int
    :param pool_layer_attr: see img_pool_layer's document.
@@ -268,7 +264,7 @@ def img_conv_bn_pool(input, filter_size, num_filters, pool_size, name=None,
    return img_pool_layer(name="%s_pool" % name,
                          input=__bn__, pool_type=pool_type,
                          pool_size=pool_size, stride=pool_stride,
-                          start=pool_start, padding=pool_padding,
+                          padding=pool_padding,
                          layer_attr=pool_layer_attr)
@@ -372,8 +368,8 @@ def small_vgg(input_image, num_channels, num_classes):
    tmp = __vgg__(tmp, 128, 2, [0.4, 0])
    tmp = __vgg__(tmp, 256, 3, [0.4, 0.4, 0])
    tmp = __vgg__(tmp, 512, 3, [0.4, 0.4, 0])
-    tmp = img_pool_layer(input = tmp, stride = 2,
+    tmp = img_pool_layer(input=tmp, stride=2,
-                         pool_size = 2, pool_type = MaxPooling())
+                         pool_size=2, pool_type=MaxPooling())
    tmp = dropout_layer(input=tmp, dropout_rate=0.5)
    tmp = fc_layer(input=tmp, size=512, layer_attr=ExtraAttr(drop_rate=0.5),
                   act=LinearActivation())
@@ -505,7 +501,7 @@ def simple_lstm(input, size, name=None, reverse=False, mat_param_attr=None,
 def lstmemory_unit(input, name=None, size=None, param_attr=None,
                   act=None, gate_act=None, state_act=None,
                   mixed_bias_attr=None, lstm_bias_attr=None,
-                   mixed_layer_attr=None,lstm_layer_attr=None,
+                   mixed_layer_attr=None, lstm_layer_attr=None,
                   get_output_layer_attr=None):
    """
    Define calculations that a LSTM unit performs in a single time step.
@@ -745,7 +741,6 @@ def gru_group(input,
              gru_bias_attr=None,
              act=None, gate_act=None,
              gru_layer_attr=None):
    """
    gru_group is a recurrent layer group version Gated Recurrent Unit. It
    does exactly the same calculation as the grumemory layer does. A promising
@@ -919,12 +914,12 @@ def bidirectional_lstm(input, size, name=None, return_seq=False,
    fw = simple_lstm(name='%s_fw' % name, input=input, size=size,
                     **dict((k[len('fwd_'):], v) for k, v in args.iteritems()
-                        if k.startswith('fwd_')))
+                            if k.startswith('fwd_')))
    bw = simple_lstm(name="%s_bw" % name, input=input, size=size,
                     reverse=True,
                     **dict((k[len('bwd_'):], v) for k, v in args.iteritems()
-                        if k.startswith('bwd_')))
+                            if k.startswith('bwd_')))
    if return_seq:
        return concat_layer(name=name, input=[fw, bw], layer_attr=concat_attr,
@@ -1052,14 +1047,30 @@ def dropout_layer(input, dropout_rate, name=None):
                       layer_attr=ExtraAttr(drop_rate=dropout_rate))
-def outputs(layers, *args):
+def inputs(layers, *args):
+    """
+    Declare the inputs of network. The order of input should be the same as
+    the data provider's return order.
+    :param layers: Input Layers. A single layer (or layer name) or a
+                   list/tuple of them.
+    :type layers: list|tuple|LayerOutput|basestring
+    :param args: Additional input layers, appended after `layers`.
+    :return: None
     """
-    Declare the end of network. Currently it will only calculate the
-    input/output order of network. It will calculate the predict network or
-    train network's output automatically.
+    if isinstance(layers, LayerOutput) or isinstance(layers, basestring):
+        layers = [layers]
+    else:
+        # Work on a copy: tuples have no extend(), and the caller's list
+        # must not be modified by the extend() below.
+        layers = list(layers)
+    if len(args) != 0:
+        layers.extend(args)
-    :param layers:
+    # A plain string is taken to be the layer name itself; it has no `.name`.
+    Inputs(*[l if isinstance(l, basestring) else l.name for l in layers])
+def outputs(layers, *args):
+    """
+    Declare the outputs of network. If user have not defined the inputs of
+    network, this method will calculate the input order by dfs travel.
+    :param layers: Output layers.
    :type layers: list|tuple|LayerOutput
    :return:
    """
@@ -1093,6 +1104,11 @@ def outputs(layers, *args):
        layers.extend(args)
    assert len(layers) > 0
+    if HasInputsSet():  # input already set
+        Outputs(*[l.name for l in layers])
+        return  # just return outputs.
    if len(layers) != 1:
        logger.warning("`outputs` routine try to calculate network's"
                       " inputs and outputs order. It might not work well."

--- a/python/paddle/trainer_config_helpers/optimizers.py
+++ b/python/paddle/trainer_config_helpers/optimizers.py
@@ -362,6 +362,13 @@ def __extends__(dict1, dict2):
                    default_factory=lambda _: BaseRegularization())
 def settings(batch_size,
             learning_rate=1e-3,
+             learning_rate_decay_a=0.,
+             learning_rate_decay_b=0.,
+             learning_rate_schedule='poly',
+             learning_rate_args='',
+             average_window=0,
+             do_average_in_cpu=False,
+             max_average_window=None,
             learning_method=None,
             regularization=None,
             is_async=False,
@@ -408,10 +415,14 @@ def settings(batch_size,
    else:
        algorithm = 'owlqn'
+    args=['batch_size', 'learning_rate', 'learning_rate_decay_a',
+          'learning_rate_decay_b', 'learning_rate_schedule',
+          'learning_rate_args', 'average_window', 'do_average_in_cpu',
+          'max_average_window']
    kwargs = dict()
-    kwargs['batch_size'] = batch_size
-    kwargs['learning_rate'] = learning_rate
    kwargs['algorithm'] = algorithm
+    for arg in args:
+        kwargs[arg] = locals()[arg]
    kwargs = __extends__(kwargs, learning_method.to_setting_kwargs())
    learning_method.extra_settings()

--- a/python/paddle/trainer_config_helpers/tests/configs/check.md5
+++ b/python/paddle/trainer_config_helpers/tests/configs/check.md5
@@ -2,13 +2,17 @@
 a5d9259ff1fd7ca23d0ef090052cb1f2  last_first_seq.protostr
 9c038249ec8ff719753a746cdb04c026  layer_activations.protostr
 5913f87b39cee3b2701fa158270aca26  projections.protostr
+7334ba0a4544f0623231330fc51d390d  shared_fc.protostr
+8b8b6bb128a7dfcc937be86145f53e2f  shared_lstm.protostr
 6b39e34beea8dfb782bee9bd3dea9eb5  simple_rnn_layers.protostr
 0fc1409600f1a3301da994ab9d28b0bf  test_cost_layers.protostr
+6cd5f28a3416344f20120698470e0a4c  test_cost_layers_with_weight.protostr
 144bc6d3a509de74115fa623741797ed  test_expand_layer.protostr
 2378518bdb71e8c6e888b1842923df58  test_fc.protostr
 8bb44e1e5072d0c261572307e7672bda  test_grumemory_layer.protostr
 1f3510672dce7a9ed25317fc58579ac7  test_hsigmoid.protostr
 d350bd91a0dc13e854b1364c3d9339c6  test_lstmemory_layer.protostr
+6fa59551808ee7012bbd24f757e782d2  test_maxout.protostr
 251a948ba41c1071afcd3d9cf9c233f7  test_ntm_layers.protostr
 e6ff04e70aea27c7b06d808cc49c9497  test_print_layer.protostr
 2a75dd33b640c49a8821c2da6e574577  test_rnn_group.protostr

--- a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
@@ -8,8 +8,8 @@ configs=(test_fc layer_activations projections test_print_layer
 test_sequence_pooling test_lstmemory_layer test_grumemory_layer
 last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
 img_layers util_layers simple_rnn_layers unused_layers test_cost_layers
-test_rnn_group test_bilinear_interp)
+test_rnn_group shared_fc shared_lstm test_cost_layers_with_weight
+test_bilinear_interp test_maxout)
 for conf in ${configs[*]}
 do

--- a/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
+from paddle.trainer_config_helpers import *
+# Config: two data inputs go through fc layers that are given the same
+# named weight/bias ParamAttr objects; a softmax fc layer then combines them.
+settings(batch_size=1000, learning_rate=1e-4)
+
+feature_a = data_layer(name='feature_a', size=200)
+feature_b = data_layer(name='feature_b', size=200)
+
+shared_weight = ParamAttr(name='fc_param', initial_max=1.0, initial_min=-1.0)
+shared_bias = ParamAttr(name='bias_param', initial_mean=0.0, initial_std=0.0)
+output_weight = ParamAttr(name='softmax_param', initial_max=1.0, initial_min=-1.0)
+
+# Layers are created in the same order as before (branch a, then branch b).
+hidden_layers = [
+    fc_layer(input=feature, size=200,
+             param_attr=shared_weight, bias_attr=shared_bias)
+    for feature in (feature_a, feature_b)
+]
+
+prediction = fc_layer(input=hidden_layers,
+                      size=10,
+                      act=SoftmaxActivation(),
+                      param_attr=[output_weight, output_weight],
+                      bias_attr=False)
+
+label_input = data_layer(name='label', size=10)
+outputs(classification_cost(input=prediction, label=label_input))
--- a/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py
+from paddle.trainer_config_helpers import *
+# Config: two sequence branches (mixed layer -> lstmemory_group) that are
+# handed the same named ParamAttr objects, merged by a softmax fc layer.
+# NOTE(review): reusing a ParamAttr name presumably makes the layers share
+# that parameter - confirm against ParamAttr semantics.
+settings(learning_rate=1e-4, batch_size=1000)
+data_1 = data_layer(name='data_a', size=100)
+data_2 = data_layer(name='data_b', size=100)
+# Same ParamAttr is passed to the projection of both mixed layers.
+mixed_param = ParamAttr(name='mixed_param')
+with mixed_layer(size=400, bias_attr=False) as m1:
+    m1 += full_matrix_projection(input=data_1, param_attr=mixed_param)
+with mixed_layer(size=400, bias_attr=False) as m2:
+    m2 += full_matrix_projection(input=data_2, param_attr=mixed_param)
+# Same weight and bias ParamAttr are passed to both LSTM groups.
+lstm_param = ParamAttr(name='lstm_param')
+lstm_bias = ParamAttr(name='lstm_bias', initial_mean=0., initial_std=0.)
+lstm1 = lstmemory_group(input=m1, param_attr=lstm_param, lstm_bias_attr=lstm_bias, mixed_bias_attr=False)
+lstm2 = lstmemory_group(input=m2, param_attr=lstm_param, lstm_bias_attr=lstm_bias, mixed_bias_attr=False)
+# The classifier takes last_seq() of each LSTM branch; both inputs are given
+# the same 'softmax_param' ParamAttr.
+softmax_param = ParamAttr(name='softmax_param')
+predict = fc_layer(input=[last_seq(input=lstm1), last_seq(input=lstm2)],
+                   size=10,
+                   param_attr=[softmax_param, softmax_param],
+                   bias_attr=False,
+                   act=SoftmaxActivation())
+outputs(classification_cost(input=predict, label=data_layer(name='label', size=10)))
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
+from paddle.trainer_config_helpers import *
+# Config: a softmax fc layer feeding two cost layers, each of which is also
+# given a per-sample 'weight' data layer.
+settings(batch_size=1000, learning_rate=1e-4)
+
+features = data_layer(name='input', size=300)
+target = data_layer(name='label', size=1)
+sample_weight = data_layer(name='weight', size=1)
+
+softmax_fc = fc_layer(input=features, size=10, act=SoftmaxActivation())
+
+# Costs are built in the same order as before: classification, then regression.
+cls_cost = classification_cost(input=softmax_fc, label=target,
+                               weight=sample_weight)
+reg_cost = regression_cost(input=softmax_fc, label=target,
+                           weight=sample_weight)
+outputs(cls_cost, reg_cost)
--- a/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py
+from paddle.trainer_config_helpers import *
+# Config: data -> 3x3 conv (16 filters) -> maxout (groups=2) -> 2x2 max pool
+# -> fc, used to exercise maxout_layer.
+settings(learning_rate=1e-5, batch_size=1000)
+
+image = data_layer(name='data', size=2304)
+
+conv_out = img_conv_layer(input=image,
+                          num_channels=1,
+                          num_filters=16,
+                          filter_size=3,
+                          padding=1,
+                          act=LinearActivation(),
+                          bias_attr=True)
+
+# 16 conv channels with groups=2; the pooling layer below is declared with
+# num_channels=8 accordingly.
+maxout_out = maxout_layer(input=conv_out, num_channels=16, groups=2)
+
+pooled = img_pool_layer(input=maxout_out,
+                        num_channels=8,
+                        pool_type=MaxPooling(),
+                        pool_size=2,
+                        stride=2)
+
+fc_out = fc_layer(input=pooled, size=384, bias_attr=False)
+outputs(fc_out)