Merge conflict with maxout layer

fd4eeaf5 · liaogang · ddfff3a7 · 46bd5f53 · fd4eeaf5 · fd4eeaf5
122 changed file
--- a/.gitignore
+++ b/.gitignore
@@ -3,4 +3,6 @@ build/
 *.user
 .vscode
 .idea
\ No newline at end of file
+.project
+.pydevproject
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,9 +2,17 @@ language: cpp
 cache: ccache
 sudo: required
 dist: trusty
+os:
+  - linux
+  - osx
 env:
  - JOB=DOCS
  - JOB=BUILD_AND_TEST
+matrix:
+  exclude:
+    - os: osx
+      env: JOB=DOCS  # Only generate documentation in linux
 addons:
  apt:
    packages:
@@ -27,9 +35,11 @@ addons:
      - libgoogle-glog-dev
      - libgflags-dev
      - libgtest-dev
+      - graphviz
 before_install:
+  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
+  - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
  - pip install wheel protobuf sphinx breathe recommonmark
-  - sudo paddle/scripts/travis/before_install.sh
 script:
  - paddle/scripts/travis/main.sh
 notifications:

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8)
 project(paddle CXX C)
 set(PADDLE_MAJOR_VERSION 0)
 set(PADDLE_MINOR_VERSION 8)
-set(PADDLE_PATCH_VERSION 0b1)
+set(PADDLE_PATCH_VERSION 0b2)
 set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
@@ -104,7 +104,7 @@ else()
 endif(NOT WITH_GPU)
 if(WITH_DOUBLE)
-    add_definitions(-DPADDLE_TYPE_DOUBLE -DHPPL_TYPE_DOUBLE)
+    add_definitions(-DPADDLE_TYPE_DOUBLE)
    set(ACCURACY double)
 else(WITH_DOUBLE)
    set(ACCURACY float)

--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -17,10 +17,17 @@
 ## Find MKL First.
 set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains MKL")
-find_path(MKL_INCLUDE_DIR mkl.h PATHS ${MKL_ROOT}/include)
+find_path(MKL_INCLUDE_DIR mkl.h PATHS
-find_library(MKL_CORE_LIB NAMES mkl_core PATHS ${MKL_ROOT}/lib)
+  ${MKL_ROOT}/include)
-find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS ${MKL_ROOT}/lib)
+find_library(MKL_CORE_LIB NAMES mkl_core PATHS
-find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS ${MKL_ROOT}/lib)
+  ${MKL_ROOT}/lib
+  ${MKL_ROOT}/lib/intel64)
+find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS
+  ${MKL_ROOT}/lib
+  ${MKL_ROOT}/lib/intel64)
+find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
+  ${MKL_ROOT}/lib
+  ${MKL_ROOT}/lib/intel64)
 if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)

--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -64,7 +64,9 @@ set(COMMON_FLAGS
    -Wdelete-non-virtual-dtor
    -Wno-unused-parameter
    -Wno-error=literal-suffix
-    -Wno-error=unused-local-typedefs)
+    -Wno-error=unused-local-typedefs
+    -Wno-error=unused-function  # Warnings in Numpy Header.
+)
 foreach(flag ${COMMON_FLAGS})
    safe_set_cflag(CMAKE_C_FLAGS ${flag})

--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -184,3 +184,20 @@ macro(add_paddle_culib TARGET_NAME)
    cuda_add_library(${TARGET_NAME} STATIC ${ARGN})
    set(CUDA_NVCC_FLAGS ${NVCC_FLAG})
 endmacro()
+# Creates C resources file from files in given resource file
+#
+# Arguments:
+#   res_file - path of the file whose bytes are embedded.
+#   output   - path of the C source file to (re)write.
+#
+# The generated file defines `const unsigned char <name>[]` holding the bytes
+# of res_file and `const unsigned <name>_size`, where <name> is the last path
+# component of res_file with '.', ' ' and '-' replaced by '_'.
+#
+# All expansions are quoted so that an empty resource file (empty hex data)
+# or a path containing ';' is still passed as exactly one argument.
+function(create_resources res_file output)
+    # Create empty output file
+    file(WRITE "${output}" "")
+    # Get short filename
+    string(REGEX MATCH "([^/]+)$" filename "${res_file}")
+    # Replace filename spaces & extension separator for C compatibility
+    string(REGEX REPLACE "\\.| |-" "_" filename "${filename}")
+    # Read hex data from file
+    file(READ "${res_file}" filedata HEX)
+    # Convert hex data for C compatibility
+    string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," filedata "${filedata}")
+    # Append data to output file
+    file(APPEND "${output}" "const unsigned char ${filename}[] = {${filedata}};\nconst unsigned ${filename}_size = sizeof(${filename});\n")
+endfunction()
--- a/demo/mnist/.gitignore
+++ b/demo/mnist/.gitignore
+data/raw_data
+data/*.list
+mnist_vgg_model
+plot.png
+train.log
+*pyc
--- a/demo/mnist/data/generate_list.py
+++ b/demo/mnist/data/generate_list.py
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+o = open("./" + "train.list", "w")
+o.write("./data/raw_data/train" +"\n")
+o.close()
+o = open("./" + "test.list", "w")
+o.write("./data/raw_data/t10k" +"\n")
+o.close()
\ No newline at end of file
--- a/demo/mnist/data/get_mnist_data.sh
+++ b/demo/mnist/data/get_mnist_data.sh
+#!/usr/bin/env sh
+# This script downloads the mnist data and unzips it.
+set -e
+# Absolute directory of this script, so it can be run from anywhere.
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+rm -rf "$DIR/raw_data"
+mkdir "$DIR/raw_data"
+cd "$DIR/raw_data"
+echo "Downloading..."
+for fname in train-images-idx3-ubyte train-labels-idx1-ubyte t10k-images-idx3-ubyte t10k-labels-idx1-ubyte
+do
+    if [ ! -e "$fname" ]; then
+        wget --no-check-certificate "http://yann.lecun.com/exdb/mnist/${fname}.gz"
+        gunzip "${fname}.gz"
+    fi
+done
+# Quoted like the other uses of DIR so a path containing spaces still works.
+cd "$DIR"
+rm -f *.list
+python generate_list.py
--- a/demo/mnist/mnist_provider.py
+++ b/demo/mnist/mnist_provider.py
+from paddle.trainer.PyDataProvider2 import *
+# Define a py data provider
+@provider(input_types={
+    'pixel': dense_vector(28 * 28),
+    'label': integer_value(10)
+})
+def process(settings, filename):  # settings is not used currently.
+    """
+    Yield one {"pixel": [...], "label": int} dict per sample.
+
+    `filename` is the common prefix of the two raw files; the image file is
+    `filename + "-images-idx3-ubyte"` and the label file is
+    `filename + "-labels-idx1-ubyte"`. Pixel bytes are scaled by 1/255.0.
+    """
+    imgf = filename + "-images-idx3-ubyte"
+    labelf = filename + "-labels-idx1-ubyte"
+    # Define number of samples for train/test
+    if "train" in filename:
+        n = 60000
+    else:
+        n = 10000
+    # `with` closes both files even when the consumer stops iterating early
+    # or a read fails part-way through.
+    with open(imgf, "rb") as f, open(labelf, "rb") as l:
+        # Skip the leading 16 bytes of the image file and 8 bytes of the
+        # label file before the per-sample data.
+        f.read(16)
+        l.read(8)
+        for i in range(n):
+            label = ord(l.read(1))
+            pixels = []
+            for j in range(28 * 28):
+                pixels.append(float(ord(f.read(1))) / 255.0)
+            yield {"pixel": pixels, 'label': label}
--- a/demo/mnist/train.sh
+++ b/demo/mnist/train.sh
+#!/bin/bash
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+# The training command is piped into tee; without pipefail the pipeline's
+# exit status is tee's, so `set -e` would not stop the script when training
+# fails and the plot step would run on an incomplete log.
+set -o pipefail
+config=vgg_16_mnist.py
+output=./mnist_vgg_model
+log=train.log
+paddle train \
+--config=$config \
+--dot_period=10 \
+--log_period=100 \
+--test_all_data_in_one_period=1 \
+--use_gpu=0 \
+--trainer_count=1 \
+--num_passes=100 \
+--save_dir=$output \
+2>&1 | tee $log
+python -m paddle.utils.plotcurve -i $log > plot.png
--- a/demo/mnist/vgg_16_mnist.py
+++ b/demo/mnist/vgg_16_mnist.py
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer_config_helpers import *
+is_predict = get_config_arg("is_predict", bool, False)
+####################Data Configuration ##################
+if not is_predict:
+  data_dir='./data/'
+  define_py_data_sources2(train_list= data_dir + 'train.list',
+                        test_list= data_dir + 'test.list',
+                        module='mnist_provider',
+                        obj='process')
+######################Algorithm Configuration #############
+settings(
+    batch_size = 128,
+    learning_rate = 0.1 / 128.0,
+    learning_method = MomentumOptimizer(0.9),
+    regularization = L2Regularization(0.0005 * 128)
+)
+#######################Network Configuration #############
+data_size=1*28*28
+label_size=10
+img = data_layer(name='pixel', size=data_size)
+# small_vgg is predefined in trainer_config_helpers.network
+predict = small_vgg(input_image=img,
+                    num_channels=1,
+                    num_classes=label_size)
+if not is_predict:
+    lbl = data_layer(name="label", size=label_size)
+    inputs(img, lbl)
+    outputs(classification_cost(input=predict, label=lbl))
+else:
+    outputs(predict)
--- a/demo/quick_start/preprocess.sh
+++ b/demo/quick_start/preprocess.sh
@@ -20,6 +20,8 @@
 set -e
+export LC_ALL=C
 mkdir -p data/tmp
 python preprocess.py -i data/reviews_Electronics_5.json.gz
 # uniq and shuffle

--- a/demo/quick_start/train.sh
+++ b/demo/quick_start/train.sh
@@ -18,6 +18,8 @@ cfg=trainer_config.lr.py
 #cfg=trainer_config.emb.py
 #cfg=trainer_config.cnn.py
 #cfg=trainer_config.lstm.py
+#cfg=trainer_config.bidi-lstm.py
+#cfg=trainer_config.db-lstm.py
 paddle train \
  --config=$cfg \
  --save_dir=./output \

--- a/demo/quick_start/trainer_config.bidi-lstm.py
+++ b/demo/quick_start/trainer_config.bidi-lstm.py
+# edit-mode: -*- python -*-
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer_config_helpers import *
+dict_file = "./data/dict.txt"
+word_dict = dict()
+with open(dict_file, 'r') as f:
+    for i, line in enumerate(f):
+        w = line.strip().split()[0]
+        word_dict[w] = i
+is_predict = get_config_arg('is_predict', bool, False)
+trn = 'data/train.list' if not is_predict else None
+tst = 'data/test.list' if not is_predict else 'data/pred.list'
+process = 'process' if not is_predict else 'process_predict'
+define_py_data_sources2(train_list=trn,
+                        test_list=tst,
+                        module="dataprovider_emb",
+                        obj=process,
+                        args={"dictionary": word_dict})
+batch_size = 128 if not is_predict else 1
+settings(
+    batch_size=batch_size,
+    learning_rate=2e-3,
+    learning_method=AdamOptimizer(),
+    regularization=L2Regularization(8e-4),
+    gradient_clipping_threshold=25
+)
+bias_attr = ParamAttr(initial_std=0.,l2_rate=0.)
+data = data_layer(name="word", size=len(word_dict))
+emb = embedding_layer(input=data, size=128)
+bi_lstm = bidirectional_lstm(input=emb, size=128)
+dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5)
+output = fc_layer(input=dropout, size=2,
+                  bias_attr=bias_attr,
+                  act=SoftmaxActivation())
+if is_predict:
+    maxid = maxid_layer(output)
+    outputs([maxid, output])
+else:
+    label = data_layer(name="label", size=2)
+    cls = classification_cost(input=output, label=label)
+    outputs(cls)
--- a/demo/quick_start/trainer_config.db-lstm.py
+++ b/demo/quick_start/trainer_config.db-lstm.py
+# edit-mode: -*- python -*-
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer_config_helpers import *
+dict_file = "./data/dict.txt"
+word_dict = dict()
+with open(dict_file, 'r') as f:
+    for i, line in enumerate(f):
+        w = line.strip().split()[0]
+        word_dict[w] = i
+is_predict = get_config_arg('is_predict', bool, False)
+trn = 'data/train.list' if not is_predict else None
+tst = 'data/test.list' if not is_predict else 'data/pred.list'
+process = 'process' if not is_predict else 'process_predict'
+define_py_data_sources2(train_list=trn,
+                        test_list=tst,
+                        module="dataprovider_emb",
+                        obj=process,
+                        args={"dictionary": word_dict})
+batch_size = 128 if not is_predict else 1
+settings(
+    batch_size=batch_size,
+    learning_rate=2e-3,
+    learning_method=AdamOptimizer(),
+    regularization=L2Regularization(8e-4),
+    gradient_clipping_threshold=25
+)
+bias_attr = ParamAttr(initial_std=0.,l2_rate=0.)
+data = data_layer(name="word", size=len(word_dict))
+emb = embedding_layer(input=data, size=128)
+hidden_0 = mixed_layer(size=128, input=[full_matrix_projection(input=emb)])
+lstm_0 = lstmemory(input=hidden_0, layer_attr=ExtraAttr(drop_rate=0.1))
+input_layers = [hidden_0, lstm_0]
+for i in range(1,8):
+    fc = fc_layer(input=input_layers, size=128)
+    lstm = lstmemory(input=fc, layer_attr=ExtraAttr(drop_rate=0.1),
+                    reverse=(i % 2) == 1,)
+    input_layers = [fc, lstm]
+lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling())
+output = fc_layer(input=lstm_last, size=2,
+                  bias_attr=bias_attr,
+                  act=SoftmaxActivation())
+if is_predict:
+    maxid = maxid_layer(output)
+    outputs([maxid, output])
+else:
+    label = data_layer(name="label", size=2)
+    cls = classification_cost(input=output, label=label)
+    outputs(cls)
--- a/demo/seqToseq/seqToseq_net.py
+++ b/demo/seqToseq/seqToseq_net.py
@@ -96,12 +96,12 @@ def gru_encoder_decoder(data_conf,
    encoded_vector = concat_layer(input=[src_forward, src_backward])
    with mixed_layer(size=decoder_size) as encoded_proj:
-        encoded_proj += full_matrix_projection(encoded_vector)
+        encoded_proj += full_matrix_projection(input=encoded_vector)
    backward_first = first_seq(input=src_backward)
    with mixed_layer(size=decoder_size,
                     act=TanhActivation(), ) as decoder_boot:
-        decoder_boot += full_matrix_projection(backward_first)
+        decoder_boot += full_matrix_projection(input=backward_first)
    def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
        decoder_mem = memory(name='gru_decoder',
@@ -113,8 +113,8 @@ def gru_encoder_decoder(data_conf,
                                   decoder_state=decoder_mem, )
        with mixed_layer(size=decoder_size * 3) as decoder_inputs:
-            decoder_inputs += full_matrix_projection(context)
+            decoder_inputs += full_matrix_projection(input=context)
-            decoder_inputs += full_matrix_projection(current_word)
+            decoder_inputs += full_matrix_projection(input=current_word)
        gru_step = gru_step_layer(name='gru_decoder',
                                  input=decoder_inputs,

--- a/demo/sequence_tagging/data/get_data.sh
+++ b/demo/sequence_tagging/data/get_data.sh
+#!/bin/bash
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+# Absolute directory of this script; the archives are downloaded next to it.
+DIR="$( cd "$(dirname "$0")" ; pwd -P )"
+# Quoted so that a checkout path containing spaces still works.
+cd "$DIR"
+wget http://www.cnts.ua.ac.be/conll2000/chunking/train.txt.gz
+wget http://www.cnts.ua.ac.be/conll2000/chunking/test.txt.gz
--- a/demo/sequence_tagging/data/test.list
+++ b/demo/sequence_tagging/data/test.list
+data/test.txt.gz
--- a/demo/sequence_tagging/data/train.list
+++ b/demo/sequence_tagging/data/train.list
+data/train.txt.gz
--- a/demo/sequence_tagging/dataprovider.py
+++ b/demo/sequence_tagging/dataprovider.py
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer.PyDataProvider2 import *
+import gzip
+import logging
+logging.basicConfig(
+    format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s',
+)
+logger = logging.getLogger('paddle')
+logger.setLevel(logging.INFO)
+OOV_POLICY_IGNORE = 0
+OOV_POLICY_USE = 1
+OOV_POLICY_ERROR = 2
+num_original_columns = 3
+# Feature combination patterns.
+# [[-1,0], [0,0]]  means previous token at column 0 and current token at 
+# column 0 are combined as one feature.
+patterns = [
+    [[-2,0]],
+    [[-1,0]],
+    [[0,0]],
+    [[1,0]],
+    [[2,0]],
+    [[-1,0], [0,0]],
+    [[0,0], [1,0]],
+    [[-2,1]],
+    [[-1,1]],
+    [[0,1]],
+    [[1,1]],
+    [[2,1]],
+    [[-2,1], [-1,1]],
+    [[-1,1], [0,1]],
+    [[0,1], [1,1]],
+    [[1,1], [2,1]],
+    [[-2,1], [-1,1], [0,1]],
+    [[-1,1], [0,1], [1,1]],
+    [[0,1], [1,1], [2,1]],
+]
+dict_label = {
+ 'B-ADJP': 0,
+ 'I-ADJP': 1,
+ 'B-ADVP': 2,
+ 'I-ADVP': 3,
+ 'B-CONJP': 4,
+ 'I-CONJP': 5,
+ 'B-INTJ': 6,
+ 'I-INTJ': 7,
+ 'B-LST': 8,
+ 'I-LST': 9,
+ 'B-NP': 10,
+ 'I-NP': 11,
+ 'B-PP': 12,
+ 'I-PP': 13,
+ 'B-PRT': 14,
+ 'I-PRT': 15,
+ 'B-SBAR': 16,
+ 'I-SBAR': 17,
+ 'B-UCP': 18,
+ 'I-UCP': 19,
+ 'B-VP': 20,
+ 'I-VP': 21,
+ 'O': 22
+}
+def make_features(sequence):
+    length = len(sequence)
+    num_features = len(sequence[0])
+    def get_features(pos):
+        if pos < 0:
+            return ['#B%s' % -pos] * num_features
+        if pos >= length:
+            return ['#E%s' % (pos - length + 1)] * num_features
+        return sequence[pos]
+    for i in xrange(length):
+        for pattern in patterns:
+            fname = '/'.join([get_features(i+pos)[f] for pos, f in pattern])
+            sequence[i].append(fname)
+'''
+Source file format:
+Each line is for one timestep. The features are separated by space.
+An empty line indicates end of a sequence.
+cutoff: a list of numbers. If count of a feature is smaller than this,
+ it will be ignored.
+if oov_policy[i] is OOV_POLICY_USE, id 0 is reserved for OOV features of
+i-th column.
+return a list of dict for each column
+'''
+def create_dictionaries(filename, cutoff, oov_policy):
+    def add_to_dict(sequence, dicts):
+        num_features = len(dicts)
+        for features in sequence:
+            l = len(features)
+            assert l == num_features, "Wrong number of features " + line
+            for i in xrange(l):
+                if features[i] in dicts[i]:
+                    dicts[i][features[i]] += 1
+                else:
+                    dicts[i][features[i]] = 1
+    num_features = len(cutoff)
+    dicts = []
+    for i in xrange(num_features):
+        dicts.append(dict())
+    f = gzip.open(filename, 'rb')
+    sequence = []
+    for line in f:
+        line = line.strip()
+        if not line:
+            make_features(sequence)
+            add_to_dict(sequence, dicts)
+            sequence = []
+            continue
+        features = line.split(' ')
+        sequence.append(features)
+    for i in xrange(num_features):
+        dct = dicts[i]
+        n = 1 if oov_policy[i] == OOV_POLICY_USE else 0
+        todo = []
+        for k, v in dct.iteritems():
+            if v < cutoff[i]:
+                todo.append(k)
+            else:
+                dct[k] = n
+                n += 1
+        if oov_policy[i] == OOV_POLICY_USE:
+            # placeholder so that len(dct) will be the number of features
+            # including OOV
+            dct['#OOV#'] = 0
+        logger.info('column %d dict size=%d, ignored %d' % (i, n, len(todo)))
+        for k in todo:
+            del dct[k]
+    f.close()
+    return dicts
+def initializer(settings, **xargs):
+    cutoff = [3, 1, 0]
+    cutoff += [3] * len(patterns)
+    oov_policy = [OOV_POLICY_IGNORE, OOV_POLICY_ERROR, OOV_POLICY_ERROR]
+    oov_policy += [OOV_POLICY_IGNORE] * len(patterns)
+    dicts = create_dictionaries('data/train.txt.gz', cutoff, oov_policy)
+    dicts[2] = dict_label
+    settings.dicts = dicts
+    settings.oov_policy = oov_policy
+    input_types = []
+    num_features = len(dicts)
+    for i in xrange(num_original_columns):
+        input_types.append(integer_sequence(len(dicts[i])))
+        logger.info("slot %s size=%s" % (i, len(dicts[i])))
+    if patterns:
+        dim = 0
+        for i in xrange(num_original_columns, num_features):
+            dim += len(dicts[i])
+        input_types.append(sparse_binary_vector_sequence(dim))
+        logger.info("feature size=%s" % dim)
+    settings.input_types = input_types
+'''
+if oov_policy[i] == OOV_POLICY_USE, features in i-th column which are not
+existed in dicts[i] will be assigned to id 0.
+if oov_policy[i] == OOV_POLICY_ERROR, all features in i-th column MUST exist
+in dicts[i].
+'''
+@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
+def process(settings, filename):
+    """
+    Yield one sample per sequence read from the gzip file `filename`.
+
+    Each non-empty line is split on single spaces into feature columns; an
+    empty line ends the current sequence. A sample is a list holding one id
+    list per original column and, when `patterns` is non-empty, one extra
+    list with a sparse index list per timestep for the pattern features.
+    """
+    input_file = filename
+    dicts = settings.dicts
+    oov_policy = settings.oov_policy
+    def gen_sample(sequence):
+        num_features = len(dicts)
+        sample = [list() for i in xrange(num_original_columns)]
+        if patterns:
+            sample.append([])
+        for features in sequence:
+            # Report the offending row itself: the enclosing loop's `line`
+            # is always the empty separator line by the time this runs.
+            assert len(features) == num_features, \
+                "Wrong number of features: " + ' '.join(features)
+            for i in xrange(num_original_columns):
+                id = dicts[i].get(features[i], -1)
+                if id != -1:
+                    sample[i].append(id)
+                elif oov_policy[i] == OOV_POLICY_IGNORE:
+                    sample[i].append(0xffffffff)
+                elif oov_policy[i] == OOV_POLICY_ERROR:
+                    # NOTE(review): logger.fatal only logs; nothing is
+                    # appended for this timestep and processing continues.
+                    # Confirm whether this should abort instead.
+                    logger.fatal("Unknown token: %s" % features[i])
+                else:
+                    sample[i].append(0)
+            if patterns:
+                # `dim` is the running offset of the current pattern column,
+                # so the ids of different columns do not overlap.
+                dim = 0
+                vec = []
+                for i in xrange(num_original_columns, num_features):
+                    id = dicts[i].get(features[i], -1)
+                    if id != -1:
+                        vec.append(dim + id)
+                    elif oov_policy[i] == OOV_POLICY_IGNORE:
+                        pass
+                    elif oov_policy[i] == OOV_POLICY_ERROR:
+                        logger.fatal("Unknown token: %s" % features[i])
+                    else:
+                        # OOV_POLICY_USE: id 0 of this column is the OOV
+                        # slot. `vec` is a plain list, so append to it
+                        # directly (it has no `ids` attribute).
+                        vec.append(dim + 0)
+                    dim += len(dicts[i])
+                sample[-1].append(vec)
+        return sample
+    f = gzip.open(input_file, 'rb')
+    num_sequences = 0
+    sequence = []
+    try:
+        for line in f:
+            line = line.strip()
+            if not line:
+                # An empty line closes the sequence collected so far.
+                make_features(sequence)
+                yield gen_sample(sequence)
+                sequence = []
+                num_sequences += 1
+                continue
+            features = line.split(' ')
+            sequence.append(features)
+    finally:
+        # Release the handle even if the consumer stops early or building a
+        # sample raises.
+        f.close()
+    logger.info("num_sequences=%s" % num_sequences)
--- a/demo/sequence_tagging/linear_crf.py
+++ b/demo/sequence_tagging/linear_crf.py
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer_config_helpers import *
+import math
+define_py_data_sources2(train_list="data/train.list",
+                        test_list="data/test.list",
+                        module="dataprovider",
+                        obj="process")
+batch_size = 1
+settings(
+    learning_method=MomentumOptimizer(),
+    batch_size=batch_size,
+    regularization=L2Regularization(batch_size * 1e-4),
+    average_window=0.5,
+    learning_rate=1e-1,
+    learning_rate_decay_a=1e-5,
+    learning_rate_decay_b=0.25,
+)
+num_label_types=23
+def get_simd_size(size):
+    return int(math.ceil(float(size) / 8)) * 8
+# Currently, in order to use sparse_update=True,
+# the size has to be aligned.
+num_label_types = get_simd_size(num_label_types)
+features = data_layer(name="features", size=76328)
+word = data_layer(name="word", size=6778)
+pos = data_layer(name="pos", size=44)
+chunk = data_layer(name="chunk",
+                   size=num_label_types)
+crf_input = fc_layer(
+    input=features,
+    size=num_label_types,
+    act=LinearActivation(),
+    bias_attr=False,
+    param_attr=ParamAttr(initial_std=0, sparse_update=True))
+crf=crf_layer(
+    input=crf_input,
+    label=chunk,
+    param_attr=ParamAttr(name="crfw", initial_std=0),
+)
+crf_decoding=crf_decoding_layer(
+    size=num_label_types,
+    input=crf_input,
+    label=chunk,
+    param_attr=ParamAttr(name="crfw"),
+)
+sum_evaluator(
+    name="error",
+    input=crf_decoding,
+)
+chunk_evaluator(
+    name="chunk_f1",
+    input =[crf_decoding, chunk],
+    chunk_scheme="IOB",
+    num_chunk_types=11,
+)
+inputs(word, pos, chunk, features)
+outputs(crf)
--- a/demo/sequence_tagging/readme.md
+++ b/demo/sequence_tagging/readme.md
+# Sequence Tagging
+This demo is a sequence model for assigning tags to each token in a sentence. The task is described at <a href = "http://www.cnts.ua.ac.be/conll2000/chunking">CONLL2000 Text Chunking</a> task.
+## Download data
+```bash
+cd demo/sequence_tagging
+./data/get_data.sh
+```
+## Train model
+```bash
+cd demo/sequence_tagging
+./train.sh
+```
+## Model description
+We provide two models. One is a linear CRF model (linear_crf.py) which is equivalent to the one at <a href="http://leon.bottou.org/projects/sgd#stochastic_gradient_crfs">leon.bottou.org/projects/sgd</a>. The second one is a stacked bidirectional RNN and CRF model (rnn_crf.py).
+<center>
+<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
+<thead>
+<th scope="col" class="left">Model name</th>
+<th scope="col" class="left">Number of parameters</th>
+<th scope="col" class="left">F1 score</th>
+</thead>
+<tbody>
+<tr>
+<td class="left">linear_crf</td>
+<td class="left"> 1.8M </td>
+<td class="left"> 0.937</td>
+</tr>
+<tr>
+<td class="left">rnn_crf</td>
+<td class="left"> 960K </td>
+<td class="left">0.941</td>
+</tr>
+</tbody>
+</table>
+</center>
+<br>
--- a/demo/sequence_tagging/rnn_crf.py
+++ b/demo/sequence_tagging/rnn_crf.py
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.trainer_config_helpers import *
+import math
+define_py_data_sources2(train_list="data/train.list",
+                        test_list="data/test.list",
+                        module="dataprovider",
+                        obj="process")
+batch_size = 16
+settings(
+    learning_method=MomentumOptimizer(),
+    batch_size=batch_size,
+    regularization=L2Regularization(batch_size * 1e-5),
+    average_window=0.5,
+    learning_rate = 2e-3,
+    learning_rate_decay_a = 5e-7,
+    learning_rate_decay_b = 0.5,
+)
+word_dim=128
+hidden_dim = 128
+with_rnn = True
+initial_std=1/math.sqrt(hidden_dim)
+param_attr=ParamAttr(initial_std=initial_std)
+cpu_layer_attr=ExtraLayerAttribute(device=-1)
+default_device(0)
+num_label_types=23
+features = data_layer(name="features", size=76328)
+word = data_layer(name="word", size=6778)
+pos = data_layer(name="pos", size=44)
+chunk = data_layer(name="chunk",
+                   size=num_label_types,
+                   layer_attr=cpu_layer_attr)
+emb = embedding_layer(
+    input=word, size=word_dim, param_attr=ParamAttr(initial_std=0))
+hidden1 = mixed_layer(
+    size=hidden_dim,
+    act=STanhActivation(),
+    bias_attr=True,
+    input=[full_matrix_projection(emb),
+           table_projection(pos, param_attr=param_attr)]
+)
+if with_rnn:
+    rnn1 = recurrent_layer(
+        act=ReluActivation(),
+        bias_attr=True,
+        input=hidden1,
+        param_attr=ParamAttr(initial_std=0),
+    )
+hidden2 = mixed_layer(
+    size=hidden_dim,
+    act=STanhActivation(),
+    bias_attr=True,
+    input=[full_matrix_projection(hidden1)
+    ] + ([
+        full_matrix_projection(rnn1, param_attr=ParamAttr(initial_std=0))
+    ] if with_rnn else []),
+)
+if with_rnn:
+    rnn2=recurrent_layer(
+        reverse=True,
+        act=ReluActivation(),
+        bias_attr=True,
+        input=hidden2,
+        param_attr=ParamAttr(initial_std=0),
+    )
+crf_input = mixed_layer(
+    size=num_label_types,
+    bias_attr=False,
+    input=[
+        full_matrix_projection(hidden2),
+    ] + ([
+        full_matrix_projection(rnn2, param_attr=ParamAttr(initial_std=0))
+    ] if with_rnn else []),
+)
+crf = crf_layer(
+    input=crf_input,
+    label=chunk,
+    param_attr=ParamAttr(name="crfw", initial_std=0),
+    layer_attr=cpu_layer_attr,
+)
+crf_decoding = crf_decoding_layer(
+    size=num_label_types,
+    input=crf_input,
+    label=chunk,
+    param_attr=ParamAttr(name="crfw"),
+    layer_attr=cpu_layer_attr,
+)
+sum_evaluator(
+    name="error",
+    input=crf_decoding,
+)
+chunk_evaluator(
+    name="chunk_f1",
+    input =[crf_decoding, chunk],
+    chunk_scheme="IOB",
+    num_chunk_types=11,
+)
+inputs(word, pos, chunk, features)
+outputs(crf)
--- a/demo/sequence_tagging/train.sh
+++ b/demo/sequence_tagging/train.sh
+#!/bin/bash
+paddle train \
+       --config rnn_crf.py \
+       --parallel_nn=1 \
+       --use_gpu=1 \
+       --dot_period=10 \
+       --log_period=1000 \
+       --test_period=0 \
+       --num_passes=10
--- a/demo/sequence_tagging/train_linear.sh
+++ b/demo/sequence_tagging/train_linear.sh
+#!/bin/bash
+paddle train \
+       --config linear_crf.py \
+       --use_gpu=0 \
+       --dot_period=100 \
+       --log_period=10000 \
+       --test_period=0 \
+       --num_passes=10
--- a/doc/build/contribute_to_paddle.md
+++ b/doc/build/contribute_to_paddle.md
@@ -99,3 +99,7 @@ git pull --rebase upstream HEAD
 git push -f origin HEAD
 ```
 Now your Pull Request is updated with the latest version.
+## Revise your pull request
+When you revise your pull request according to reviewer's comments, please use 'git commit' instead of 'git commit --amend' to commit your changes so that the reviewers can see the difference between the new pull request and the old pull request.
--- a/doc/build/docker_install.rst
+++ b/doc/build/docker_install.rst
@@ -69,7 +69,7 @@ If you want to launch container with GPU support, you need to set some environme
 ..  code-block:: bash
-    export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}"
+    export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
    export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
    docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:gpu-latest

--- a/doc/demo/quick_start/index_en.md
+++ b/doc/demo/quick_start/index_en.md
@@ -134,7 +134,7 @@ def process(settings, file_name):
 You need to add a data provider definition `define_py_data_sources2` in our network configuration. This definition specifies:
 - The path of the training and testing data (`data/train.list`, `data/test.list`).
- The location of the data provider file (`dataprovider_pow`).
+- The location of the data provider file (`dataprovider_bow`).
 - The function to call to get data. (`process`).
 - Additional arguments or data. Here it passes the path of word dictionary.

--- a/doc/ui/api/trainer_config_helpers/layers.rst
+++ b/doc/ui/api/trainer_config_helpers/layers.rst
@@ -73,6 +73,12 @@ img_pool_layer
    :members: img_pool_layer
    :noindex:
+maxout_layer
+------------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: maxout_layer
+    :noindex:
 Norm Layer
 ==========
@@ -130,6 +136,12 @@ gru_step_layer
 Recurrent Layer Group
 =====================
+memory
+------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: memory
+    :noindex:
 recurrent_group
 ---------------
 ..  automodule:: paddle.trainer_config_helpers.layers
@@ -377,6 +389,12 @@ ctc_layer
    :members: ctc_layer
    :noindex:
+nce_layer
+-----------
+..  automodule:: paddle.trainer_config_helpers.layers
+    :members: nce_layer
+    :noindex:
 hsigmoid
 ---------
 ..  automodule:: paddle.trainer_config_helpers.layers

--- a/doc_cn/algorithm/rnn/hierarchical-layer.md
+++ b/doc_cn/algorithm/rnn/hierarchical-layer.md
+# 支持双层序列作为输入的Layer
+## 概述
+在自然语言处理任务中，序列是一种常见的数据类型。一个独立的词语，可以看作是一个非序列输入，或者，我们称之为一个0层的序列；由词语构成的句子，是一个单层序列；若干个句子构成一个段落，是一个双层的序列。
+双层序列是一个嵌套的序列，它的每一个元素，又是一个单层的序列。这是一种非常灵活的数据组织方式，帮助我们构造一些复杂的输入信息。
+我们可以按照如下层次定义非序列，单层序列，以及双层序列。
+ 0层序列：一个独立的元素，类型可以是PaddlePaddle支持的任意输入数据类型
+ 单层序列：排成一列的多个元素，每个元素是一个0层序列，元素之间的顺序是重要的输入信息
+ 双层序列：排成一列的多个元素，每个元素是一个单层序列，称之为双层序列的一个子序列（subseq），subseq的每个元素是一个0层序列
+在 PaddlePaddle中，下面这些Layer能够接受双层序列作为输入，完成相应的计算。
+## pooling_layer
+pooling_layer的使用示例如下，详细见<a href = "../../../doc/ui/api/trainer_config_helpers/layers.html#pooling-layer">配置API</a>。
+```python
+seq_pool = pooling_layer(input=layer,
+                         pooling_type=AvgPooling(),
+                         agg_level=AggregateLevel.EACH_SEQUENCE)
+```
+- `pooling_type` 目前支持两种，分别是：MaxPooling()和AvgPooling()。
+- `agg_level=AggregateLevel.TIMESTEP`时（默认值）：
+  - 作用：双层序列经过运算变成一个0层序列，或单层序列经过运算变成一个0层序列
+  - 输入：一个双层序列，或一个单层序列
+  - 输出：一个0层序列，即整个输入序列（单层或双层）的平均值（或最大值）
+- `agg_level=AggregateLevel.EACH_SEQUENCE`时：
+  - 作用：一个双层序列经过运算变成一个单层序列
+  - 输入：必须是一个双层序列
+  - 输出：一个单层序列，序列的每个元素是原来双层序列每个subseq元素的平均值（或最大值）
+## last_seq 和 first_seq
+last_seq的使用示例如下（first_seq类似），详细见<a href = "../../../doc/ui/api/trainer_config_helpers/layers.html#last-seq">配置API</a>。
+```python
+last = last_seq(input=layer,
+                agg_level=AggregateLevel.EACH_SEQUENCE)
+```
+- `agg_level=AggregateLevel.TIMESTEP`时（默认值）：
+  - 作用：一个双层序列经过运算变成一个0层序列，或一个单层序列经过运算变成一个0层序列
+  - 输入：一个双层序列或一个单层序列
+  - 输出：一个0层序列，即整个输入序列（双层或者单层）最后一个，或第一个元素。
+- `agg_level=AggregateLevel.EACH_SEQUENCE`时：
+  - 作用：一个双层序列经过运算变成一个单层序列
+  - 输入：必须是一个双层序列
+  - 输出：一个单层序列，其中每个元素是双层序列中每个subseq最后一个（或第一个）元素。
+## expand_layer
+expand_layer的使用示例如下，详细见<a href = "../../../doc/ui/api/trainer_config_helpers/layers.html#expand-layer">配置API</a>。
+```python
+expand = expand_layer(input=layer1,
+                      expand_as=layer2,
+                      expand_level=ExpandLevel.FROM_TIMESTEP)
+```
+- `expand_level=ExpandLevel.FROM_TIMESTEP`时（默认值）：
+  - 作用：一个0层序列经过运算扩展成一个单层序列，或者一个双层序列
+  - 输入：layer1必须是一个0层序列，是待扩展的数据；layer2可以是一个单层序列，或者是一个双层序列，提供扩展的长度信息
+  - 输出：一个单层序列，或一个双层序列，输出序列的类型（双层序列，或单层序列）和序列中含有元素的数目同 layer2一致。若输出是单层序列，单层序列的每个元素（0层序列），都是对layer1元素的拷贝；若输出是双层序列，双层序列每个subseq中每个元素（0层序列），都是对layer1元素的拷贝
+- `expand_level=ExpandLevel.FROM_SEQUENCE`时：
+  - 作用：一个单层序列经过运算扩展成一个双层序列
+  - 输入：layer1必须是一个单层序列，是待扩展的数据；layer2必须是一个双层序列，提供扩展的长度信息
+  - 输出：一个双层序列，序列中含有元素的数目同layer2一致。要求单层序列含有元素的数目（0层序列），和双层序列含有subseq 的数目一致。单层序列第i个元素（0层序列），被扩展为一个单层序列，构成了输出双层序列的第i个subseq。
\ No newline at end of file
--- a/doc_cn/algorithm/rnn/hierarchical-rnn.md
+++ b/doc_cn/algorithm/rnn/hierarchical-rnn.md
--- a/doc_cn/algorithm/rnn/rnn-tutorial.md
+++ b/doc_cn/algorithm/rnn/rnn-tutorial.md
+# Recurrent Group教程
+## 概述
+序列数据是自然语言处理任务面对的一种主要输入数据类型。
+一句话是由词语构成的序列，多句话进一步构成了段落。因此，段落可以看作是一个嵌套的双层的序列，这个序列的每个元素又是一个序列。
+双层序列是PaddlePaddle支持的一种非常灵活的数据组织方式，帮助我们更好地描述段落、多轮对话等更为复杂的语言数据。基于双层序列输入，我们可以设计搭建一个灵活的、层次化的RNN，分别从词语和句子级别编码输入数据，同时也能够引入更加复杂的记忆机制，更好地完成一些复杂的语言理解任务。
+在PaddlePaddle中，`recurrent_group`是一种任意复杂的RNN单元，用户只需定义RNN在一个时间步内完成的计算，PaddlePaddle负责完成信息和误差在时间序列上的传播。
+更进一步，`recurrent_group`同样可以扩展到双层序列的处理上。通过两个嵌套的`recurrent_group`分别定义子句级别和词语级别上需要完成的运算，最终实现一个层次化的复杂RNN。
+目前，在PaddlePaddle中，能够对双层序列进行处理的有`recurrent_group`和部分Layer，具体可参考文档：<a href = "hierarchical-layer.html">支持双层序列作为输入的Layer</a>。
+## 相关概念
+### 基本原理
+`recurrent_group` 是PaddlePaddle支持的一种任意复杂的RNN单元。使用者只需要关注于设计RNN在一个时间步之内完成的计算，PaddlePaddle负责完成信息和梯度在时间序列上的传播。
+PaddlePaddle中，`recurrent_group`的一个简单调用如下：
+``` python
+recurrent_group(step, input, reverse)
+```
+- step：一个可调用的函数，定义一个时间步之内RNN单元完成的计算
+- input：输入，必须是一个单层序列，或者一个双层序列
+- reverse：是否以逆序处理输入序列
+使用`recurrent_group`的核心是设计step函数的计算逻辑。step函数内部可以自由组合PaddlePaddle支持的各种layer，完成任意的运算逻辑。`recurrent_group` 的输入（即input）会成为step函数的输入，由于step 函数只关注于RNN一个时间步之内的计算，在这里`recurrent_group`替我们完成了原始输入数据的拆分。
+### 输入
+`recurrent_group`处理的输入序列主要分为以下三种类型：
+- **数据输入**：一个双层序列进入`recurrent_group`会被拆解为一个单层序列，一个单层序列进入`recurrent_group`会被拆解为非序列，然后交给step函数，这一过程对用户是完全透明的。可以有以下两种：1）通过data_layer拿到的用户输入；2）其它layer的输出。
+- **只读Memory输入**：`StaticInput` 定义了一个只读的Memory，由`StaticInput`指定的输入不会被`recurrent_group`拆解，`recurrent_group` 循环展开的每个时间步总是能够引用所有输入，可以是一个非序列，或者一个单层序列。
+- **序列生成任务的输入**：`GeneratedInput`只用于在序列生成任务中指定输入数据。
+### 输入示例
+序列生成任务大多遵循encoder-decoder架构，encoder和decoder可以是能够处理序列的任意神经网络单元，而RNN是最流行的选择。
+给定encoder输出和当前词，decoder每次预测产生下一个最可能的词语。在这种结构中，decoder接受两个输入：
+- 要生成的目标序列：是decoder的数据输入，也是decoder循环展开的依据，`recurrent_group`会对这类输入进行拆解。
+- encoder输出，可以是一个非序列，或者一个单层序列：是一个unbounded memory，decoder循环展开的每一个时间步会引用全部结果，不应该被拆解，这种类型的输入必须通过`StaticInput`指定。关于Unbounded Memory的更多讨论请参考论文 [Neural Turing Machines](https://arxiv.org/abs/1410.5401)。
+在序列生成任务中，decoder RNN总是引用上一时刻预测出的词的词向量，作为当前时刻输入。`GeneratedInput`自动完成这一过程。
+### 输出
+`step`函数必须返回一个或多个Layer的输出，这个Layer的输出会作为整个`recurrent_group` 最终的输出结果。在输出的过程中，`recurrent_group` 会将每个时间步的输出拼接，这个过程对用户也是透明的。
+### memory
+memory只能在`recurrent_group`中定义和使用。memory不能独立存在，必须指向一个PaddlePaddle定义的Layer。引用memory得到这个layer上一时刻的输出，因此，可以将memory理解为一个时延操作。
+可以显式地指定一个layer的输出用于初始化memory。不指定时，memory默认初始化为0。
+## 双层RNN介绍
+`recurrent_group`帮助我们完成对输入序列的拆分，对输出的合并，以及计算逻辑在序列上的循环展开。
+利用这种特性，两个嵌套的`recurrent_group`能够处理双层序列，实现词语和句子两个级别的双层RNN结构。
+- 单层（word-level）RNN：每个状态（state）对应一个词（word）。
+- 双层（sequence-level）RNN：一个双层RNN由多个单层RNN组成，每个单层RNN（即双层RNN的每个状态）对应一个子句（subseq）。
+为了描述方便，下文以NLP任务为例，将含有子句（subseq）的段落定义为一个双层序列，将含有词语的句子定义为一个单层序列，那么0层序列即为一个词语。
+## 双层RNN的使用
+### 训练流程的使用方法
+使用 `recurrent_group`需要遵循以下约定：
+- **单进单出**：输入和输出都是单层序列。
+  - 如果有多个输入，不同输入序列含有的词语数必须严格相等。
+  - 输出一个单层序列，输出序列的词语数和输入序列一致。
+  - memory：在step函数中定义 memory指向一个layer，通过引用memory得到这个layer上一个时刻输出，形成recurrent 连接。memory的is_seq参数必须为false。如果没有定义memory，每个时间步之内的运算是独立的。
+  - boot_layer：memory的初始状态，默认初始状态为0，memory的is_seq参数必须为false。
+- **双进双出**：输入和输出都是双层序列。
+  - 如果有多个输入序列，不同输入含有的子句（subseq）数必须严格相等，但子句含有的词语数可以不相等。
+  - 输出一个双层序列，子句（subseq）数、子句的单词数和指定的一个输入序列一致，默认为第一个输入。
+  - memory：在step函数中定义memory，指向一个layer，通过引用memory得到这个layer上一个时刻的输出，形成recurrent连接。定义在外层`recurrent_group` step函数中的memory，能够记录上一个subseq 的状态，可以是一个单层序列（只作为read-only memory），也可以是一个词语。如果没有定义memory，那么 subseq 之间的运算是独立的。
+  - boot_layer：memory 初始状态，可以是一个单层序列（只作为read-only memory）或一个向量。默认不设置，即初始状态为0。
+- **双进单出**：目前还未支持，会报错"In hierachical RNN, all out links should be from sequences now"。
+### 生成流程的使用方法
+使用`beam_search`需要遵循以下约定：
+- 单层RNN：从一个word生成下一个word。
+- 双层RNN：即把单层RNN生成后的subseq给拼接成一个新的双层seq。从语义上看，也不存在一个subseq直接生成下一个subseq的情况。
\ No newline at end of file
--- a/doc_cn/build_and_install/install/docker_install.rst
+++ b/doc_cn/build_and_install/install/docker_install.rst
@@ -23,9 +23,9 @@ PaddlePaddle提供的Docker镜像版本
 +-----------------+------------------+------------------------+-----------------------+
 |       GPU       | gpu-latest       | gpu-devel-latest       | gpu-demo-latest       |
 +-----------------+------------------+------------------------+-----------------------+
-| CPU WITHOUT AVX | cpu-noavx-latest | cpu-devel-noavx-latest | cpu-demo-noavx-latest |
+| CPU WITHOUT AVX | cpu-noavx-latest | cpu-noavx-devel-latest | cpu-noavx-demo-latest |
 +-----------------+------------------+------------------------+-----------------------+
-| GPU WITHOUT AVX | gpu-noavx-latest | gpu-devel-noavx-latest | gpu-demo-noavx-latest |
+| GPU WITHOUT AVX | gpu-noavx-latest | gpu-noavx-devel-latest | gpu-noavx-demo-latest |
 +-----------------+------------------+------------------------+-----------------------+
 其中，横向包括三个版本，normal，devel和demo。

--- a/doc_cn/conf.py.in
+++ b/doc_cn/conf.py.in
@@ -47,6 +47,7 @@ extensions = [
    'sphinx.ext.autosummary',
    'sphinx.ext.mathjax',
    'sphinx.ext.napoleon',
+    'sphinx.ext.graphviz'
 ]
 table_styling_embed_css = True

--- a/doc_cn/faq/index.rst
+++ b/doc_cn/faq/index.rst
+####################
+PaddlePaddle常见问题
+####################
+..  contents::
+1. 如何减少PaddlePaddle的内存占用
+---------------------------------
+神经网络的训练本身是一个非常消耗内存和显存的工作。经常会消耗数十G的内存和数G的显存。
+PaddlePaddle的内存占用主要分为如下几个方面\:
+* DataProvider缓冲池内存 (只针对内存)
+* 神经元激活内存 （针对内存和显存）
+* 参数内存 (针对内存和显存)
+* 其他内存杂项
+这其中，其他内存杂项是指PaddlePaddle本身所用的一些内存，包括字符串分配，临时变量等等，
+这些内存就不考虑如何缩减了。
+其他的内存的减少方法依次为
+减少DataProvider缓冲池内存
++++++++++++++++++++++++++
+PyDataProvider使用的是异步加载，同时在内存里直接随机选取数据来做Shuffle。即
+..  graphviz::
+    digraph {
+        rankdir=LR;
+        数据文件 -> 内存池 -> PaddlePaddle训练
+    }
+所以，减小这个内存池即可减小内存占用，同时也可以加速开始训练前数据载入的过程。但是，这
+个内存池实际上决定了shuffle的粒度。所以，如果将这个内存池减小，又要保证数据是随机的，
+那么最好将数据文件在每次读取之前做一次shuffle。可能的代码为
+..  literalinclude:: reduce_min_pool_size.py
+这样做可以极大的减少内存占用，并且可能会加速训练过程。 详细文档参考 `这里
+<../ui/data_provider/pydataprovider2.html#provider>`_ 。
+神经元激活内存
++++++++++++++
+神经网络在训练的时候，会对每一个激活暂存一些数据，包括激活，残差等等。
+在反向传递的时候，这些数据会被用来更新参数。这些数据使用的内存主要和两个参数有关系，
+一是batch size，另一个是每条序列(Sequence)长度。所以，其实也是和每个mini-batch中包含
+的时间步信息成正比。
+所以，做法可以有两种。他们是
+* 减小batch size。 即在网络配置中 :code:`settings(batch_size=1000)` 设置成一个小一些的值。但是batch size本身是神经网络的超参数，减小batch size可能会对训练结果产生影响。
+* 减小序列的长度，或者直接扔掉非常长的序列。比如，一个数据集大部分序列长度是100-200,
+  但是突然有一个10000长的序列，就很容易导致内存超限。特别是在LSTM等RNN中。
+参数内存
++++++++
+PaddlePaddle支持非常多的优化算法(Optimizer)，不同的优化算法需要使用不同大小的内存。
+例如如果使用 :code:`adadelta` 算法，则需要使用参数规模大约5倍的内存。 如果参数保存下来的
+文件为 :code:`100M`， 那么该优化算法至少需要 :code:`500M` 的内存。
+可以考虑使用一些优化算法，例如 :code:`momentum`。
+2. 如何加速PaddlePaddle的训练速度
+---------------------------------
+PaddlePaddle是神经网络训练平台，加速PaddlePaddle训练有如下几个方面\：
+* 减少数据载入的耗时
+* 加速训练速度
+* 利用更多的计算资源
+减少数据载入的耗时
++++++++++++++++++
+使用 :code:`pydataprovider` 时，可以减少缓存池的大小，同时设置内存缓存功能，即可以极大的加速数据载入流程。
+:code:`DataProvider` 缓存池的减小，和之前通过减小缓存池来减小内存占用的原理一致。
+..  literalinclude:: reduce_min_pool_size.py
+同时 :code:`@provider` 接口有一个 :code:`cache` 参数来控制缓存方法，将其设置成 :code:`CacheType.CACHE_PASS_IN_MEM` 的话，会将第一个 :code:`pass` (过完所有训练数据即为一个pass)生成的数据缓存在内存里，在之后的 :code:`pass` 中，不会再从 :code:`python` 端读取数据，而是直接从内存的缓存里读取数据。这也会极大减少数据读入的耗时。
+加速训练速度
++++++++++++
+PaddlePaddle支持Sparse的训练，sparse训练需要训练特征是 :code:`sparse_binary_vector` 、 :code:`sparse_vector` 、或者 :code:`integer_value` 的任一一种。同时，与这个训练数据交互的Layer，需要将其Parameter设置成 sparse 更新模式，即设置 :code:`sparse_update=True`
+这里使用简单的 :code:`word2vec` 训练语言模型举例，具体使用方法为\:
+使用一个词前两个词和后两个词，来预测这个中间的词。这个任务的DataProvider为\:
+..  literalinclude:: word2vec_dataprovider.py
+这个任务的配置为\:
+..  literalinclude:: word2vec_config.py
+更多关于sparse训练的内容请参考 `sparse训练的文档 <TBD>`_
+利用更多的计算资源
++++++++++++++++++
+利用更多的计算资源可以分为以下几个方式来进行\:
+* 单机CPU训练
+  * 使用多线程训练。设置命令行参数 :code:`trainer_count`，即可以设置参与训练的线程数量。使用方法为 :code:`paddle train --trainer_count=4`
+* 单机GPU训练
+  * 使用显卡训练。设置命令行参数 :code:`use_gpu`。 使用方法为 :code:`paddle train --use_gpu=true`
+  * 使用多块显卡训练。设置命令行参数 :code:`use_gpu` 和 :code:`trainer_count`。使用 :code:`--use_gpu=True` 开启GPU训练，使用 :code:`trainer_count` 指定显卡数量。使用方法为 :code:`paddle train --use_gpu=true --trainer_count=4`
+* 多机训练
+  * 使用多机训练的方法也比较简单，需要先在每个节点启动 :code:`paddle pserver`，再使用 :code:`paddle train --pservers=192.168.100.1,192.168.100.2` 来指定每个pserver的ip地址
+  * 具体的多机训练方法参考 `多机训练 <TBD>`_ 文档。
+3. 遇到“非法指令”或者是“illegal instruction” 
+--------------------------------------------
+paddle在进行计算的时候为了提升计算性能，使用了avx指令。部分老的cpu型号无法支持这样的指令。通常来说执行下grep avx /proc/cpuinfo看看是否有输出即可知道是否支持。（另：用此方法部分虚拟机可能检测到支持avx指令但是实际运行会挂掉，请当成是不支持，看下面的解决方案）
+解决办法是\:
+* 使用 NO_AVX的 `安装包 <../build_and_install/index.html>`_ 或者 `Docker image <../build_and_install/install/docker_install.html>`_
+* 或者，使用 :code:`-DWITH_AVX=OFF` 重新编译PaddlePaddle。
+4. 如何选择SGD算法的学习率
+--------------------------
+在采用sgd/async_sgd进行训练时，一个重要的问题是选择正确的learning_rate。如果learning_rate太大，那么训练有可能不收敛，如果learning_rate太小，那么收敛可能很慢，导致训练时间过长。
+通常做法是从一个比较大的learning_rate开始试，如果不收敛，那减少学习率10倍继续试验，直到训练收敛为止。那么如何判断训练不收敛呢？可以估计出如果模型采用不变的输出最小的cost0是多少。
+如果训练过程的cost明显高于这个常数输出的cost，那么我们可以判断为训练不收敛。举一个例子，假如我们是三分类问题，采用multi-class-cross-entropy作为cost，数据中0,1,2三类的比例为 :code:`0.2, 0.5, 0.3` , 那么常数输出所能达到的最小cost是 :code:`-(0.2*log(0.2)+0.5*log(0.5)+0.3*log(0.3))=1.03` 。如果训练一个pass（或者更早）后，cost还大于这个数，那么可以认为训练不收敛，应该降低学习率。
+5. 如何初始化参数
+-----------------
+默认情况下，PaddlePaddle使用均值0，标准差为 :math:`\frac{1}{\sqrt{d}}` 来初始化参数。其中 :math:`d` 为参数矩阵的宽度。这种初始化方式在一般情况下不会产生很差的结果。如果用户想要自定义初始化方式，PaddlePaddle目前提供两种参数初始化的方式\:
+* 高斯分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_mean=0.0, initial_std=1.0)`
+* 均匀分布。将 :code:`param_attr` 设置成 :code:`param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0)`
+比如设置一个全连接层的参数初始化方式和bias初始化方式，可以使用如下代码。
+..  code-block:: python
+    hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0), 
+                      bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0))
+上述代码将bias全部初始化为1.0, 同时将参数初始化为 :code:`[-1.0, 1.0]` 的均匀分布。
+6. 如何共享参数
+---------------
+PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字的参数，会共享参数。设置参数的名字，可以使用 :code:`ParamAttr(name="YOUR_PARAM_NAME")` 来设置。更方便的设置方式，是想要共享的参数使用同样的 :code:`ParamAttr` 对象。
+简单的全连接网络，参数共享的配置示例为\:
+..  literalinclude:: ../../python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
+这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。
--- a/doc_cn/faq/reduce_min_pool_size.py
+++ b/doc_cn/faq/reduce_min_pool_size.py
+# Documentation snippet (the "..." stands for the remaining provider
+# arguments): with min_pool_size=0 the data file itself is shuffled on disk
+# before it is read.
+@provider(min_pool_size=0, ...)
+def process(settings, filename):
+    # NOTE(review): filename is interpolated into the shell command unquoted;
+    # paths containing spaces or shell metacharacters will break it.
+    os.system('shuf %s > %s.shuf' % (filename, filename))  # write a shuffled copy first.
+    with open('%s.shuf' % filename, 'r') as f:
+        for line in f:
+            yield get_sample_from_line(line)
\ No newline at end of file
--- a/doc_cn/faq/word2vec_config.py
+++ b/doc_cn/faq/word2vec_config.py
+... # the settings and define data provider is omitted.
+DICT_DIM=3000  # dictionary dimension.
+word_ids=data_layer('word_ids', size=DICT_DIM)
+# The embedding parameter is marked sparse_update=True (sparse update mode).
+emb = embedding_layer(input=word_ids, size=256, param_attr=ParamAttr(sparse_update=True))
+# Pool the embeddings with SumPooling, then predict a word out of DICT_DIM.
+emb_sum = pooling_layer(input=emb, pooling_type=SumPooling())
+predict = fc_layer(input=emb_sum, size=DICT_DIM, act=Softmax())
+outputs(classification_cost(input=predict, label=data_layer('label', size=DICT_DIM))) 
\ No newline at end of file
--- a/doc_cn/faq/word2vec_dataprovider.py
+++ b/doc_cn/faq/word2vec_dataprovider.py
+DICT_DIM=3000
+@provider(input_types=[integer_sequence(DICT_DIM), integer_value(DICT_DIM)])
+def process(settings, filename):
+	with open(filename) as f:
+		# Yield the surrounding word ids together with the id of the inner
+		# word to predict, such as [28, 29, 10, 4], 4.
+		# It means the sentence is 28, 29, 4, 10, 4.
+		yield read_next_from_file(f)
\ No newline at end of file
--- a/doc_cn/index.rst
+++ b/doc_cn/index.rst
@@ -3,6 +3,7 @@ PaddlePaddle文档
 使用指南
 --------
 * `快速入门 <demo/quick_start/index.html>`_
 * `编译与安装 <build_and_install/index.html>`_
 * `用户接口 <ui/index.html>`_
@@ -16,4 +17,13 @@ PaddlePaddle文档
 算法教程
 --------
-* `RNN配置 <../doc/algorithm/rnn/rnn.html>`_
+* `Recurrent Group教程 <algorithm/rnn/rnn-tutorial.html>`_
+* `单层RNN示例 <../doc/algorithm/rnn/rnn.html>`_
+* `双层RNN示例 <algorithm/rnn/hierarchical-rnn.html>`_
+* `支持双层序列作为输入的Layer <algorithm/rnn/hierarchical-layer.html>`_
+常见问题
+--------
+* `常见问题 <faq/index.html>`_
--- a/doc_cn/ui/data_provider/mnist_provider.dict.py
+++ b/doc_cn/ui/data_provider/mnist_provider.dict.py
@@ -2,10 +2,10 @@ from paddle.trainer.PyDataProvider2 import *
 # Define a py data provider
-@provider(input_types=[
+@provider(input_types={
-    dense_vector(28 * 28),
+    'pixel': dense_vector(28 * 28),
-    integer_value(10)
+    'label': integer_value(10)
-])
+})
 def process(settings, filename):  # settings is not used currently.
    f = open(filename, 'r')  # open one of training file
@@ -20,6 +20,6 @@ def process(settings, filename):  # settings is not used currently.
            pixels_float.append(float(each_pixel_str))
        # give data to paddle.
-        yield { "pixel": pixels_float, 'label': int(label) }
+        yield {"pixel": pixels_float, 'label': int(label)}
    f.close()  # close file
--- a/doc_cn/ui/data_provider/pydataprovider2.rst
+++ b/doc_cn/ui/data_provider/pydataprovider2.rst
@@ -141,8 +141,6 @@ DataProvider创建的时候执行。这个初始化函数具有如下参数:
   是一个batch size，但是有时为了计算均衡性，可以将一条数据设置成多个batch size
 *  cache 是数据缓存的策略，参考 `cache`_
 *  init_hook 是初始化时调用的函数，参考 `init_hook`_
-*  use_dynamic_order 如果是true的话，可以返回一个dict，key是data_layer的名字，value是特征值。同时，也可以
-   返回一个list或者tuple。如果是false的话，只能够返回list或者tuple
 *  check 设置成true的话，会根据input_types检查数据的合法性。
 *  check_fail_continue 如果设置成true的话，即使在check中数据不合法，也会扔到这条数据，继续训练。 如果
   check是false的话，没有作用。

--- a/paddle/.set_python_path.sh
+++ b/paddle/.set_python_path.sh
@@ -33,7 +33,7 @@ if ! python -c "import paddle" >/dev/null 2>/dev/null; then
    esac
  done
  shift $(($OPTIND - 1))
-  export PYTHONPATH=$PYPATH
+  export PYTHONPATH=$PYPATH:$PYTHONPATH
  $@
 else
  echo "paddle package is already in your PYTHONPATH. But unittest need a clean environment."

--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -2,10 +2,17 @@ set(AVX_SOURCES
    src/hl_math.cc
    src/hl_avx_functions.cc
 )
-set(CUDA_SOURCES
-    src/hl_time.cc
-    src/hl_cpu_functions.cc
-    ${AVX_SOURCES})
+# The common sources are always part of CUDA_SOURCES; the AVX sources are
+# appended only when WITH_AVX is enabled.
+set(CUDA_SOURCES
+    src/hl_time.cc
+    src/hl_cpu_functions.cc)
+if(WITH_AVX)
+    list(APPEND CUDA_SOURCES ${AVX_SOURCES})
+endif()
 set(CUDA_CXX_WITH_GPU_SOURCES
    src/hl_cuda_cublas.cc

--- a/paddle/cuda/include/hl_base.h
+++ b/paddle/cuda/include/hl_base.h
@@ -185,7 +185,7 @@ typedef struct {
    size_t                  nnz;
 } _hl_sparse_matrix_s, *hl_sparse_matrix_s;
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
 /**
 * HPPL data type: real (float or double)
 *

--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -169,7 +169,7 @@ extern void hl_avgpool_forward(
 * @brief   Maximum pool backward.
 *
 * @param[in]   frameCnt    batch size of input image.
- * @param[in]   outGrad     input data.
+ * @param[in]   outGrad     output grad data.
 * @param[in]   channels    number of channel.
 * @param[in]   height      image height.
 * @param[in]   width       image width.
@@ -296,4 +296,34 @@ extern void hl_bilinear_backward(real* inGrad,
                                 const size_t outputW,
                                 const size_t numChannels);
+/**
+ * @brief   MaxOut forward.
+ *
+ * For every output element, takes the maximum over the `groups` input values
+ * that belong to it and records which of them was the maximum.
+ *
+ * @param[in]   inData      input data, batchSize * size * groups elements.
+ * @param[out]  outData     output data, batchSize * size elements.
+ * @param[out]  idData      output maxId: index (0 .. groups - 1) of the
+ *                          maximum inside its group, batchSize * size elements.
+ * @param[in]   batchSize   batchSize.
+ * @param[in]   size        per-sample output size =
+ *                          number of output channels * image height * image width.
+ * @param[in]   featLen     feature length = image height * image width.
+ * @param[in]   groups      number of groups.
+ */
+extern void hl_maxout_forward(
+    const real* inData, real* outData, int* idData,
+    size_t batchSize, size_t size, size_t featLen, size_t groups);
+/**
+ * @brief   MaxOut backward.
+ *
+ * Adds every output gradient to the input gradient element that was selected
+ * as the maximum in the forward pass (as recorded in idData).
+ *
+ * @param[in,out]  inGrad   input grad data, batchSize * size * groups
+ *                          elements; gradients are accumulated into it.
+ * @param[in]   outGrad     output grad data, batchSize * size elements.
+ * @param[in]   idData      output maxId produced by the forward pass.
+ * @param[in]   batchSize   batchSize.
+ * @param[in]   size        per-sample output size =
+ *                          number of output channels * image height * image width.
+ * @param[in]   featLen     feature length = image height * image width.
+ * @param[in]   groups      number of groups.
+ */
+extern void hl_maxout_backward(
+    real* inGrad, const real* outGrad, const int* idData,
+    size_t batchSize, size_t size, size_t featLen, size_t groups);
 #endif /* HL_CNN_H_ */
--- a/paddle/cuda/include/hl_cpu_gru.cuh
+++ b/paddle/cuda/include/hl_cpu_gru.cuh
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/math/MathFunctions.h"
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
 #define     CBLAS_GEMM     paddle::gemm<float>
 #else
 #define     CBLAS_GEMM     paddle::gemm<double>

--- a/paddle/cuda/include/hl_gpu_functions.cuh
+++ b/paddle/cuda/include/hl_gpu_functions.cuh
@@ -28,7 +28,7 @@ namespace hppl {
    const real min = SIGMOID_THRESHOLD_MIN;
    const real max = SIGMOID_THRESHOLD_MAX;
    real tmp = (a < min) ? min : ((a > max) ? max : a);
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
    return __fdividef(1.0f, 1.0f + __expf(-tmp));
 #else
    return 1.0 / (1.0 + exp(-tmp));
@@ -36,7 +36,7 @@ namespace hppl {
  }
  __device__ static real tanh(const real a) {
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
    return __fdividef(2.0f, (1.0f + __expf(-2.0f*a))) - 1.0f;
 #else
    return (2.0 / (1.0 + exp(-2.0*a))) - 1.0;

--- a/paddle/cuda/include/hl_matrix_base.cuh
+++ b/paddle/cuda/include/hl_matrix_base.cuh
@@ -30,7 +30,7 @@ limitations under the License. */
 #define INLINE   inline
 #endif
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
 #define     DEVICE_FMAX     fmaxf
 #define     DEVICE_FMIN     fminf
 #else

--- a/paddle/cuda/include/hl_matrix_type.cuh
+++ b/paddle/cuda/include/hl_matrix_type.cuh
@@ -21,7 +21,7 @@ limitations under the License. */
 #ifdef __CUDA_ARCH__
 // typedef void*  vecType;
 #include <vector_types.h>
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
 typedef float4 vecType;
 #else
 typedef double2 vecType;
@@ -30,7 +30,7 @@ typedef double2 vecType;
 #include <mmintrin.h>
 #include <xmmintrin.h>
 #include <emmintrin.h>
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
 typedef __m128  vecType;
 #else
 typedef __m128d vecType;

--- a/paddle/cuda/include/hl_sequence.h
+++ b/paddle/cuda/include/hl_sequence.h
@@ -143,7 +143,7 @@ extern void hl_context_projection_backward_weight(real* outputGrad,
 */
 extern void hl_sequence2batch_copy(real *batch,
                                   real *sequence,
-                                   int *batchIndex,
+                                   const int *batchIndex,
                                   int seqWidth,
                                   int batchCount,
                                   bool seq2batch);

--- a/paddle/cuda/include/hl_sse_matrix_kernel.cuh
+++ b/paddle/cuda/include/hl_sse_matrix_kernel.cuh
@@ -20,7 +20,7 @@ limitations under the License. */
 #define VECTOR_SIZE     16
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
 /* number of float in vector */
 #define     VECTOR_LEN      4
 #define     VECTOR_SET      _mm_set_ps1
@@ -41,7 +41,7 @@ inline bool hl_check_align(void *ptr) {
  return hl_check_align(reinterpret_cast<size_t>(ptr));
 }
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
 template <class Agg>
 inline real hl_agg_op(Agg agg, vecType mm) {
  __m128 lo = _mm_unpacklo_ps(mm, mm);

--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -113,4 +113,12 @@ inline void hl_bilinear_backward(real* inGrad,
                                const size_t outputW,
                                const size_t numChannels) {}
+// No-op stub of hl_maxout_forward; parameter names match the declaration in
+// hl_cnn.h.
+inline void hl_maxout_forward(
+    const real* inData, real* outData, int* idData,
+    size_t batchSize, size_t size, size_t featLen, size_t groups) {}
+// No-op stub of hl_maxout_backward; parameter names match the declaration in
+// hl_cnn.h.
+inline void hl_maxout_backward(
+    real* inGrad, const real* outGrad, const int* idData,
+    size_t batchSize, size_t size, size_t featLen, size_t groups) {}
 #endif  // HL_CNN_STUB_H_
--- a/paddle/cuda/include/stub/hl_sequence_stub.h
+++ b/paddle/cuda/include/stub/hl_sequence_stub.h
@@ -62,7 +62,7 @@ inline void hl_context_projection_backward_weight(real* outputGrad,
 inline void hl_sequence2batch_copy(real *batch,
                                   real *sequence,
-                                   int *batchIndex,
+                                   const int *batchIndex,
                                   int seqWidth,
                                   int batchCount,
                                   bool seq2batch) {}

--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -662,4 +662,63 @@ void hl_bilinear_backward(real* inGrad,
    threadNum, inGrad, inImgH, inImgW, inputH, inputW, outGrad,
    outImgH, outImgW, outputH, outputW, numChannels, ratioH, ratioW);
  CHECK_SYNC("hl_bilinear_backward failed");
 }
\ No newline at end of file
+/**
+ * MaxOut forward kernel: each thread with index < nthreads produces one
+ * output element.
+ *
+ * The output element (batch, channel, feat) is the maximum of the `groups`
+ * input values located featLen apart, starting at
+ * (batch * size + channel * featLen) * groups + feat. The position of the
+ * maximum inside the group is stored in idData.
+ *
+ * @param nthreads  number of output elements to compute.
+ * @param inData    input values.
+ * @param outData   receives the maximum for every output element.
+ * @param idData    receives the group index of the maximum.
+ * @param size      number of output elements per sample.
+ * @param featLen   stride between the candidates of one group.
+ * @param groups    number of candidates per output element.
+ */
+__global__ void maxoutFpCompute(size_t nthreads, const real * inData,
+                                real * outData, int* idData,
+                                size_t size, size_t featLen, size_t groups) {
+  // Computed in size_t so the index is neither truncated to int nor compared
+  // signed-against-unsigned with nthreads.
+  const size_t index =
+      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  if (index >= nthreads) {
+    return;
+  }
+  const size_t batchIdx = index / size;
+  const size_t offset = index % size;
+  const size_t channelIdx = offset / featLen;
+  const size_t featIdx = offset % featLen;
+  // Position of the first candidate of this output element in the input.
+  const size_t dataIdx =
+      (batchIdx * size + channelIdx * featLen) * groups + featIdx;
+  real maxValue = inData[dataIdx];
+  int maxId = 0;
+  for (size_t g = 1; g < groups; ++g) {
+    const real candidate = inData[dataIdx + g * featLen];
+    if (candidate > maxValue) {
+      maxValue = candidate;
+      maxId = static_cast<int>(g);
+    }
+  }
+  outData[index] = maxValue;
+  idData[index] = maxId;
+}
+/**
+ * Launches maxoutFpCompute with one thread per output element
+ * (batchSize * size threads in blocks of 1024) on STREAM_DEFAULT.
+ *
+ * Does nothing when there are no output elements.
+ */
+void hl_maxout_forward(const real* inData, real* outData,
+                       int* idData, size_t batchSize, size_t size,
+                       size_t featLen, size_t groups) {
+  const size_t threadsPerBlock = 1024;
+  // Kept in size_t: the product must not be truncated to int.
+  const size_t numThreads = size * batchSize;
+  if (numThreads == 0) {
+    // A grid with zero blocks is an invalid launch configuration.
+    return;
+  }
+  const size_t blocks = (numThreads + threadsPerBlock - 1) / threadsPerBlock;
+  maxoutFpCompute<<< static_cast<unsigned int>(blocks),
+                     static_cast<unsigned int>(threadsPerBlock),
+                     0, STREAM_DEFAULT >>>(
+    numThreads, inData, outData, idData, size, featLen, groups);
+  CHECK_SYNC("hl_maxout_forward failed");
+}
+/**
+ * MaxOut backward kernel: each thread with index < nthreads handles one
+ * output gradient element.
+ *
+ * The gradient of output element (batch, channel, feat) is added to the
+ * input gradient at
+ * batch * size * groups + (channel * groups + idData[index]) * featLen + feat,
+ * i.e. to the candidate recorded in idData.
+ *
+ * @param nthreads  number of output gradient elements.
+ * @param inGrad    input gradient, accumulated into.
+ * @param outGrad   output gradient.
+ * @param idData    group index of the selected candidate per output element.
+ * @param size      number of output elements per sample.
+ * @param featLen   stride between the candidates of one group.
+ * @param groups    number of candidates per output element.
+ */
+__global__ void maxoutBpCompute(size_t nthreads, real* inGrad,
+                                const real* outGrad, const int* idData,
+                                size_t size, size_t featLen, size_t groups) {
+  // Computed in size_t so the index is neither truncated to int nor compared
+  // signed-against-unsigned with nthreads.
+  const size_t index =
+      static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  if (index >= nthreads) {
+    return;
+  }
+  const size_t batchIdx = index / size;
+  const size_t offset = index % size;
+  const size_t channelIdx = offset / featLen;
+  const size_t featIdx = offset % featLen;
+  // batchIdx * size + offset == index, so outGrad and idData are read at
+  // index directly.
+  const size_t gradIdx =
+      (channelIdx * groups + idData[index]) * featLen + featIdx;
+  // Assuming every idData value lies in [0, groups), distinct threads map to
+  // distinct inGrad elements.
+  inGrad[batchIdx * size * groups + gradIdx] += outGrad[index];
+}
+/**
+ * Launches maxoutBpCompute with one thread per output gradient element
+ * (batchSize * size threads in blocks of 1024) on STREAM_DEFAULT.
+ *
+ * Does nothing when there are no output elements.
+ */
+void hl_maxout_backward(real* inGrad, const real* outGrad,
+                        const int* idData, size_t batchSize, size_t size,
+                        size_t featLen, size_t groups) {
+  const size_t threadsPerBlock = 1024;
+  // Kept in size_t: the product must not be truncated to int.
+  const size_t numThreads = size * batchSize;
+  if (numThreads == 0) {
+    // A grid with zero blocks is an invalid launch configuration.
+    return;
+  }
+  const size_t blocks = (numThreads + threadsPerBlock - 1) / threadsPerBlock;
+  maxoutBpCompute<<< static_cast<unsigned int>(blocks),
+                     static_cast<unsigned int>(threadsPerBlock),
+                     0, STREAM_DEFAULT >>>(
+    numThreads, inGrad, outGrad, idData, size, featLen, groups);
+  CHECK_SYNC("hl_maxout_backward failed");
+}
--- a/paddle/cuda/src/hl_cuda_cublas.cc
+++ b/paddle/cuda/src/hl_cuda_cublas.cc
@@ -84,7 +84,7 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
 } /* namespace dynload */
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
 #define     CUBLAS_GEAM     dynload::cublasSgeam
 #define     CUBLAS_GEMV     dynload::cublasSgemv
 #define     CUBLAS_GEMM     dynload::cublasSgemm

--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@@ -340,7 +340,7 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc,
        (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
    CHECK_NOTNULL(hl_desc);
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
    cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
 #else
    cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
@@ -373,7 +373,7 @@ void hl_create_tensor_descriptor(hl_tensor_descriptor* image_desc) {
        (cudnn_tensor_descriptor)malloc(sizeof(_cudnn_tensor_descriptor));
    CHECK_NOTNULL(hl_desc);
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
    cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
 #else
    cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
@@ -611,7 +611,7 @@ void hl_create_filter_descriptor(hl_filter_descriptor* filter,
    CHECK_CUDNN(dynload::cudnnCreateFilterDescriptor(&hl_filter->desc));
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
    cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
 #else
    cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
@@ -921,7 +921,7 @@ void hl_softmax_forward(real *input,
                        int height,
                        int width)
 {
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
    cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
 #else
    cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;
@@ -955,7 +955,7 @@ void hl_softmax_backward(real *output_value,
                         int height,
                         int width)
 {
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
    cudnnDataType_t data_type = CUDNN_DATA_FLOAT;
 #else
    cudnnDataType_t data_type = CUDNN_DATA_DOUBLE;

--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -626,7 +626,7 @@ void hl_specify_devices_start(int* device, int number) {
 void hl_rand(real *dest_d, size_t num) {
  pthread_mutex_lock(t_resource.gen_mutex);
  CHECK_EQ(
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
  dynload::curandGenerateUniform(t_resource.gen, dest_d, num),
 #else
  dynload::curandGenerateUniformDouble(t_resource.gen, dest_d, num),

--- a/paddle/cuda/src/hl_cuda_matrix.cu
+++ b/paddle/cuda/src/hl_cuda_matrix.cu
@@ -47,7 +47,7 @@ void hl_matrix_add(real *A_d,
  CHECK_SYNC("hl_matrix_add failed");
 }
-#ifdef HPPL_TYPE_DOUBLE
+#ifdef PADDLE_TYPE_DOUBLE
    #define THRESHOLD   128
 #else
    #define THRESHOLD   64
@@ -102,7 +102,7 @@ void subMaxAndExp(real* I,
      val = -THRESHOLD;
    }
    I[nextIdx] = val;
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
    O[nextIdx] = __expf(val);
 #else
    O[nextIdx] = exp(val);

--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
@@ -374,7 +374,7 @@ template<int blockDimX, int blockDimY, int gridDimX, bool seq2batch, bool isAdd>
 __global__
 void KeSequence2Batch(real *batch,
                      real *sequence,
-                      int *batchIndex,
+                      const int *batchIndex,
                      int seqWidth,
                      int batchCount) {
  int idx = threadIdx.x;
@@ -405,7 +405,7 @@ void KeSequence2Batch(real *batch,
 void hl_sequence2batch_copy(real *batch,
                            real *sequence,
-                            int *batchIndex,
+                            const int *batchIndex,
                            int seqWidth,
                            int batchCount,
                            bool seq2batch) {

--- a/paddle/cuda/src/hl_cuda_sparse.cuh
+++ b/paddle/cuda/src/hl_cuda_sparse.cuh
@@ -355,7 +355,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d,
 }
 /* best perf */
-#ifndef HPPL_TYPE_DOUBLE
+#ifndef PADDLE_TYPE_DOUBLE
 #define CU_CSCMM_THREAD_M_BEST          9
 #else
 #define CU_CSCMM_THREAD_M_BEST          4

--- a/paddle/gserver/dataproviders/DataProvider.cpp
+++ b/paddle/gserver/dataproviders/DataProvider.cpp
@@ -57,7 +57,8 @@ void BufferBatch::clone(DataBatch* srcBatch, bool useGpu) {
  }
 }
-DoubleBuffer::DoubleBuffer(DataProvider* dataPool, bool useGpu,
+DoubleBuffer::DoubleBuffer(DataProvider *dataPool,
+                           bool useGpu,
                           int64_t batchSize) {
  batchSize_ = batchSize;
  dataPool_ = dataPool;
@@ -110,6 +111,9 @@ void DoubleBuffer::removeOneBatch(DataBatch* dataBatch) {
 }
 void DoubleBuffer::insertOneBatch(DataBatch* batch) {
+  while (!bufferQueue_->waitNotEmptyFor(2 /* seconds */)) {  // time out
+    if (stopping_) return;
+  }
  BufferBatch* bufBatch = bufferQueue_->dequeue();
  // clone and copy the data from an Threadlocal Variable
  bufBatch->clone(batch, useGpu_);
@@ -138,7 +142,7 @@ void DoubleBuffer::asyncLoadBatch() {
        actualSize = dataPool_->getNextBatchInternal(batchSize_, &newBatch);
      }
      insertOneBatch(&newBatch);
-    } while (actualSize > 0);
+    } while (actualSize > 0 && !stopping_);
  }
 }

--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
@@ -259,7 +259,9 @@ typedef Queue<BufferBatch*> BufferBatchQueue;
 class DoubleBuffer {
 public:
-  DoubleBuffer(DataProvider* dataPool, bool useGpu, int64_t batchSize = 0);
+  DoubleBuffer(DataProvider* dataPool,
+               bool useGpu,
+               int64_t batchSize = 0);
  virtual ~DoubleBuffer();
  void removeOneBatch(DataBatch* dataBatch);
@@ -308,7 +310,8 @@ public:
  /**
   * @brief create only used for unittest.
   */
-  inline static DataProvider* create(const DataConfig &config, bool useGpu) {
+  inline static DataProvider* create(const DataConfig &config,
+                                     bool useGpu = FLAGS_use_gpu) {
    return create(config, ModelConfig(), useGpu);
  }
@@ -348,7 +351,6 @@ public:
   */
  virtual void reset() {
    if (doubleBuffer_ != nullptr) {
-      LOG(INFO) << "the double-buffer is starting ...";
      doubleBuffer_->startAsyncLoad();
    }
  }

--- a/paddle/gserver/dataproviders/PyDataProvider2.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp
@@ -14,13 +14,20 @@ limitations under the License. */
 #ifndef PADDLE_NO_PYTHON
+#include <Python.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <unordered_set>
 #include <list>
+#include <numpy/numpyconfig.h>
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
+#include <numpy/ndarrayobject.h>
 #include "DataProvider.h"
 #include "paddle/utils/PythonUtil.h"
+#include "paddle/utils/Locks.h"
+#include "paddle/utils/Stat.h"
 namespace paddle {
@@ -202,7 +209,10 @@ public:
  PyDataProvider2(const DataConfig& config,
                  const ModelConfig& modelConfig,
                  bool useGpu)
-    :DataProvider(config, useGpu), callingContextCreated_(2) {
+    :DataProvider(config, useGpu),
+      callingContextCreated_(2) {
+    if (PyArray_API == NULL)
+      import_array();
    auto& args = config.load_data_args();
    PyObjectPtr kwargs = PyObjectPtr(PyDict_New());
    if (!args.empty()) {
@@ -246,8 +256,7 @@ private:
                       PyObjectPtr && kwargs) {
    LOG(INFO) << "loading dataprovider " << model <<"::" << className;
-    PyObjectPtr module(PyImport_ImportModule(model.c_str()));
+    PyObjectPtr module = py::import(model);
-    CHECK_PY(module) << "Cannot imort module " << model.c_str();
    PyObjectPtr moduleDict(PyModule_GetDict(module.get()));
    CHECK_PY(moduleDict) << "Invoke module.__dict__ error";
    PyObjectPtr cls(PyDict_GetItemString(moduleDict.get(),
@@ -455,6 +464,7 @@ private:
  std::condition_variable pushCV_;
  std::condition_variable pullCV_;
  std::mutex mtx_;
  ThreadBarrier callingContextCreated_;
  std::unique_ptr<IPyDataProviderCache> cache_;
@@ -497,8 +507,8 @@ public:
   * Resetting the PyDataProvider. May start reading thread here.
   */
  virtual void reset() {
-    DataProvider::reset();
    resetImpl(true);
+    DataProvider::reset();
  }
  /**
@@ -519,6 +529,7 @@ public:
   * Loading a batch of data.
   */
  int64_t getNextBatchInternal(int64_t size_, DataBatch *batch) {
+    REGISTER_TIMER("PyDP2.getNextBatchInternal")
    CHECK_GE(size_, 0);
    size_t size = (size_t) size_;
    if (loadThread_) {  // loading from thread should wait for data pool ready.
@@ -699,10 +710,22 @@ public:
   */
  virtual void fill(Argument &argument, PyObject *obj) {
    real* dat = argument.value->getData() + height_ * headerPtr_->dim;
-    py::SequenceHelper s(obj);
+    if (PyArray_Check(obj)) {
-    // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy.
+        auto dtype = PyArray_DTYPE((PyArrayObject*)obj);
-    for (size_t i=0; i < headerPtr_->dim; ++i) {
+        if (dtype->type == 'f' && dtype->elsize == sizeof(real)) {
-      dat[i] = (real) s.getDouble(i);
+            real * data = (real*)PyArray_DATA((PyArrayObject*)obj);
+            auto sz = PyArray_SIZE((PyArrayObject*)obj);
+            std::copy(data, data + sz, dat);
+        } else {
+            LOG(FATAL) << "You should yield float" << sizeof(real) * 8
+                       << " array";
+        }
+     } else {
+        py::SequenceHelper s(obj);
+        // TODO(yuyang18): Here we can use AVX or SSE to accelerate memory copy.
+        for (size_t i=0; i < headerPtr_->dim; ++i) {
+          dat[i] = (real) s.getDouble(i);
+        }
    }
    ++height_;
  }

--- a/paddle/gserver/evaluators/ChunkEvaluator.cpp
+++ b/paddle/gserver/evaluators/ChunkEvaluator.cpp
@@ -75,7 +75,6 @@ class ChunkEvaluator : public Evaluator {
 public:
  virtual void init(const EvaluatorConfig& config) {
-    CHECK(!FLAGS_use_gpu) << "Not supported";
    Evaluator::init(config);
    if (config.chunk_scheme() == "IOB") {
      numTagTypes_ = 2;
@@ -137,6 +136,7 @@ public:
    CHECK_EQ(arguments.size(), (size_t)2);
    IVectorPtr& output = arguments[0].ids;
    IVectorPtr& label = arguments[1].ids;
+    CHECK(!output->useGpu() && !label->useGpu()) << "Not supported";
    auto sequenceStartPositions =
        arguments[1].sequenceStartPositions->getVector(false);
    CHECK_EQ(output->getSize(), label->getSize());

--- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
@@ -813,7 +813,6 @@ void TrainerThread::mergeGradSparse(
      para->getMat(PARAMETER_GRADIENT).get());
  std::vector<uint32_t>& ids = mainMat->getIds(threadId_);
-  ids.clear();
  for (auto slaveParams : slaveParameters) {
    SparseRowCpuMatrix* mat =
        dynamic_cast<SparseRowCpuMatrix*>((*slaveParams)[pid]

--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -544,6 +544,12 @@ void RecurrentGradientMachine::forward(const std::vector<Argument>& inArgs,
    const std::vector<Argument> inArgs;
    std::vector<Argument> outArgs;
    frames_[i]->forward(inArgs, &outArgs, passType);
+    if (hasSubseq) {
+      for (auto& outFrameLine : outFrameLines_) {
+        CHECK(outFrameLine.frames[i]->getOutput().sequenceStartPositions)
+          << "In hierachical RNN, all out links should be from sequences.";
+      }
+    }
  }
  if (evaluator_ && passType == PASS_TEST) {
    this->eval(evaluator_.get());
@@ -635,16 +641,15 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinkId,
  std::vector<int> sequenceStartPositions;
  const int* subSequenceStartPositions = nullptr;
-  if (hasSubseq) {                    // for sequenceScatterAgentLayer
+  if (hasSubseq) {  // for sequenceScatterAgentLayer
-    subSequenceStartPositions =
+    subSequenceStartPositions = input.subSequenceStartPositions->getData(false);
-        input.subSequenceStartPositions->getData(false);
    inlinkInfo->seqStartPosIndex.clear();
    inlinkInfo->seqStartPosIndex.push_back(0);  // first seqStartPosIndex = 0
  }
  // maxSequenceLength_: max topLevelLength in allsamples
  for (int i = 0; i < maxSequenceLength_; ++i) {
    if (hasSubseq) {
-      sequenceStartPositions.push_back(0);            // first element = 0
+      sequenceStartPositions.push_back(0);  // first element = 0
    }
    int numSeqs = 0;
    for (size_t j = 0; j < numSequences; ++j) {
@@ -676,9 +681,9 @@ void RecurrentGradientMachine::createInFrameInfo(int inlinkId,
  }
  if (hasSubseq) {
    // inFrameLine create sequenceStartPositions one time
-    CHECK_EQ(sequenceStartPositions.size(),
+    CHECK_EQ(
-             static_cast<size_t>(maxSequenceLength_ +
+        sequenceStartPositions.size(),
-                                 input.getNumSubSequences()));
+        static_cast<size_t>(maxSequenceLength_ + input.getNumSubSequences()));
    CHECK_EQ(inlinkInfo->seqStartPosIndex.size(),
             static_cast<size_t>(maxSequenceLength_ + 1));
    createSeqPos(sequenceStartPositions, &inlinkInfo->sequenceStartPositions);
@@ -1102,10 +1107,12 @@ size_t RecurrentGradientMachine::beamShrink(std::vector<Path>& newPaths,
                   newPaths.end(), Path::greaterPath);
  newPaths.resize(totalExpandCount + minNewPathSize);
-  real minPathLogProb = std::min_element(newPaths.end() - minNewPathSize,
+  real minPathLogProb =
-                                         newPaths.end())->logProb;
+      std::min_element(newPaths.end() - minNewPathSize, newPaths.end())
-  real maxPathLogProb = std::max_element(newPaths.end() - minNewPathSize,
+          ->logProb;
-                                         newPaths.end())->logProb;
+  real maxPathLogProb =
+      std::max_element(newPaths.end() - minNewPathSize, newPaths.end())
+          ->logProb;
  // Remove the already formed paths that are relatively short
  finalPaths_[seqId].erase(

--- a/paddle/gserver/layers/AgentLayer.cpp
+++ b/paddle/gserver/layers/AgentLayer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "AgentLayer.h"
 #include "paddle/utils/Logging.h"
@@ -62,8 +61,8 @@ void SequenceAgentLayer::forward(PassType passType) {
  // get Arguments from real layers
  if (numSamples_ > 0 && numSamples_ < realNumSequences) {
-    int numRows = realOutput.sequenceStartPositions->
+    int numRows =
-                  getData(false)[numSamples_];
+        realOutput.sequenceStartPositions->getData(false)[numSamples_];
    CHECK(!realOutput.ids) << "Not supported";
    output_.subArgFrom(realOutput, /* offset */ 0, numRows, getSize(), useGpu_,
                       /* trans */ false, /* seqFlag */ true,
@@ -141,8 +140,8 @@ void ScatterAgentLayer::forward(PassType passType) {
  int width = this->getSize();
  if (realOutArg_.value || realOutArg_.ids) {
-    output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_,
+    output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, width,
-                       width, useGpu_);
+                       useGpu_);
  } else {  // used in generation
    if (realLayer_->getOutput().ids) {
      IVector::resizeOrCreate(output_.ids, ids_->getSize(), useGpu_);
@@ -224,8 +223,8 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
  if (realOutArg_.value || realOutArg_.ids) {
    CHECK(realOutArg_.sequenceStartPositions);
-    output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_,
+    output_.subArgFrom(realOutArg_, /* offset */ idIndex_, idSize_, width,
-                       width, useGpu_, /* trans */ false, /* seqFlag */ true,
+                       useGpu_, /* trans */ false, /* seqFlag */ true,
                       /* seqStart */ seqStartPosIndex_,
                       /* seqSize */ numSequences_);
  } else {
@@ -249,11 +248,12 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
    CHECK_NE(input.sequenceStartPositions.get(),
             output_.sequenceStartPositions.get());
    ICpuGpuVector::resizeOrCreate(output_.sequenceStartPositions,
-                                   numSequences + 1, false);
+                                  numSequences + 1, false);
    int* outStarts = output_.sequenceStartPositions->getMutableData(false);
-    IVector::resizeOrCreate(cpuInputStartPos_, height, false);
+    ICpuGpuVector::resizeOrCreate(inputStartPos_, height, false);
-    int* inStarts = cpuInputStartPos_->getData();
+    int* inStarts = inputStartPos_->getMutableData(false);
    size_t offsetOut = 0;
    for (size_t i = 0; i < numSequences; ++i) {
      outStarts[i] = offsetOut;
@@ -266,13 +266,8 @@ void SequenceScatterAgentLayer::forward(PassType passType) {
    }
    outStarts[numSequences] = offsetOut;
-    if (useGpu_) {
+    outputValue->copyByRowIndex(*input.value,
-      IVector::resizeOrCreate(inputStartPos_, height, true);
+                                *inputStartPos_->getVector(useGpu_));
-      inputStartPos_->copyFrom(*cpuInputStartPos_, HPPL_STREAM_DEFAULT);
-    } else {
-      inputStartPos_ = cpuInputStartPos_;
-    }
-    outputValue->copyByRowIndex(*input.value, *inputStartPos_);
  }
 }

--- a/paddle/gserver/layers/AgentLayer.h
+++ b/paddle/gserver/layers/AgentLayer.h
@@ -191,11 +191,7 @@ class SequenceScatterAgentLayer : public ScatterAgentLayer {
 protected:
  // use to store expanded cpuStartPositions or subSequenceStartPositions
  // of real layer.
-  IVectorPtr cpuInputStartPos_;
+  ICpuGpuVectorPtr inputStartPos_;
-  // point to cpuInputStartPos_ when useGpu_ is false
-  // copy from cpuInputStartPos_ when useGpu_ is true
-  IVectorPtr inputStartPos_;
 public:
  explicit SequenceScatterAgentLayer(const LayerConfig& config)

--- a/paddle/gserver/layers/AverageLayer.cpp
+++ b/paddle/gserver/layers/AverageLayer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "AverageLayer.h"
 #include "paddle/utils/Logging.h"
@@ -25,13 +24,8 @@ REGISTER_LAYER(average, AverageLayer);
 bool AverageLayer::init(const LayerMap& layerMap,
                        const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
+  SequencePoolLayer::init(layerMap, parameterMap);
-  Layer::init(layerMap, parameterMap);
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
  dataMtx_ = Matrix::create(nullptr, 1, 1, false, useGpu_);
  outMtx_ = Matrix::create(nullptr, 1, getSize(), false, useGpu_);
  // average strategy
@@ -44,57 +38,15 @@ bool AverageLayer::init(const LayerMap& layerMap,
  } else {
    LOG(FATAL) << "Unknown average strategy: " << config_.average_strategy();
  }
-  // transform to which sequence type
-  if (config_.trans_type() == "non-seq") {
-    type_ = kNonSeq;
-  } else if (config_.trans_type() == "seq") {
-    type_ = kSeq;
-  } else {
-    LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
-  }
-  setNeedSequenceInfo(false);
  return true;
 }
 void AverageLayer::forward(PassType passType) {
-  Layer::forward(passType);
+  SequencePoolLayer::forward(passType);
-  // average layer should have exactly 1 input
-  CHECK_EQ(1U, inputLayers_.size());
-  size_t dim = getSize();
-  const Argument& input = getInput(0);
-  int64_t newBatchSize =
-      type_ ? input.getNumSubSequences() : input.getNumSequences();
-  ICpuGpuVectorPtr startPositions =
-      type_ ? input.subSequenceStartPositions
-            : input.sequenceStartPositions;
-  const int* starts = startPositions->getData(false);
-  size_t numSequences = startPositions->getSize() - 1;
-  // check
-  CHECK_EQ(numSequences, (size_t)newBatchSize);
-  CHECK_EQ(starts[numSequences], input.getBatchSize());
-  if (type_) {
-    // when trans_type = seq, input must hasSubseq
-    CHECK_EQ(input.hasSubseq(), 1UL);
-  }
-  CHECK_EQ(dim, input.value->getWidth());
-  resetOutput(newBatchSize, dim);
-  auto startsPos = startPositions->getVector(useGpu_);
  MatrixPtr inputValue = getInputValue(0);
-  getOutputValue()->sequenceAvgForward(*inputValue, *startsPos, mode_);
+  getOutputValue()->sequenceAvgForward(
+      *inputValue, *startPositions_->getVector(useGpu_), mode_);
-  /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
-   * thus, in this case, output_ has no sequenceStartPositions.
-   * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
-   * case, we should compute the new sequenceStartPositions.
-  */
-  if (type_) {
-    output_.degradeSequence(input, useGpu_);
-  }
  /* add the bias-vector AFTER average operation */
  if (biases_.get() != NULL) {
@@ -106,26 +58,16 @@ void AverageLayer::forward(PassType passType) {
 }
 void AverageLayer::backward(const UpdateCallback& callback) {
-  const Argument& input = getInput(0);
+  SequencePoolLayer::backward(callback);
-  ICpuGpuVectorPtr startPositions =
-      type_ ? input.subSequenceStartPositions
-            : input.sequenceStartPositions;
-  const int* starts = startPositions->getData(false);
-  /* Do derivation */ { backwardActivation(); }
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-    // Increasing the number of gradient
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
+  const int* starts = startPositions_->getData(false);
  MatrixPtr grad = getInputGrad(0);
  if (grad) {
    size_t dim = getSize();
    real* gradientData = getInputGrad(0)->getData();
    real* gradient = getOutputGrad()->getData();
-    size_t numSequences = startPositions->getSize() - 1;
+    size_t numSequences = startPositions_->getSize() - 1;
    for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
      // TODO(Dangqingqing) optimization for GPU
      int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];

--- a/paddle/gserver/layers/AverageLayer.h
+++ b/paddle/gserver/layers/AverageLayer.h
@@ -12,10 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
-#include "Layer.h"
+#include "SequencePoolLayer.h"
 #include "paddle/math/Matrix.h"
 namespace paddle {
@@ -23,20 +22,21 @@ namespace paddle {
 /**
 * A layer for "internal average" for sequence input.
 * Input: one or more sequences. Each sequence contains some instances.
- * If AverageLevel = kNonSeq:
+ * If SequenceLevel = kNonSeq:
 *    Output: output size is the number of input sequences (NOT input instances)
 *    output[i] = average_{for each instance in this sequence}{input[i]}
- * If AverageLevel = kSeq:
+ * If SequenceLevel = kSeq:
 *    Check input sequence must has sub-sequence
 *    Output: output size is the number of input sub-sequences
 *    output[i] = average_{for each instance in this sub-sequence}{input[i]}
+ *
+ * The config file api is pooling_layer.
 */
+class AverageLayer : public SequencePoolLayer {
-class AverageLayer : public Layer {
 public:
  enum AverageStrategy { kAverage = 0, kSum = 1, kAverageSquareRootN = 2 };
-  enum AverageLevel { kNonSeq = 0, kSeq = 1 };
+  explicit AverageLayer(const LayerConfig& config)
-  explicit AverageLayer(const LayerConfig& config) : Layer(config) {}
+      : SequencePoolLayer(config) {}
  ~AverageLayer() {}
@@ -46,11 +46,8 @@ public:
  void backward(const UpdateCallback& callback = nullptr);
 protected:
-  std::unique_ptr<Weight> biases_;
  MatrixPtr outMtx_;
  MatrixPtr dataMtx_;
  int mode_;
-  int type_;
 };
 }  // namespace paddle
--- a/paddle/gserver/layers/ExpandLayer.cpp
+++ b/paddle/gserver/layers/ExpandLayer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "ExpandLayer.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
@@ -53,9 +52,8 @@ void ExpandLayer::forward(PassType passType) {
  const Argument& shapeInput = getInput(1);
  const Argument& dataInput = getInput(0);
  size_t outputBatchSize = shapeInput.getBatchSize();
-  auto startPositions =
+  auto startPositions = type_ ? shapeInput.subSequenceStartPositions
-      type_ ? shapeInput.subSequenceStartPositions
+                              : shapeInput.sequenceStartPositions;
-            : shapeInput.sequenceStartPositions;
  size_t numSequences = startPositions->getSize() - 1;
  const int* starts = startPositions->getData(false);
@@ -71,8 +69,7 @@ void ExpandLayer::forward(PassType passType) {
  // set output sequence info as shape sequence
  output_.sequenceStartPositions = shapeInput.sequenceStartPositions;
  if (shapeInput.hasSubseq()) {
-    output_.subSequenceStartPositions =
+    output_.subSequenceStartPositions = shapeInput.subSequenceStartPositions;
-        shapeInput.subSequenceStartPositions;
  }
  // reserve output: Expand output to batchsize of sequence data.
@@ -81,8 +78,8 @@ void ExpandLayer::forward(PassType passType) {
  MatrixPtr inputValue = getInputValue(0);
  MatrixPtr outputValue = getOutputValue();
-  IVector::resizeOrCreate(cpuExpandStartsPos_, outputBatchSize, false);
+  ICpuGpuVector::resizeOrCreate(expandStartsPos_, outputBatchSize, false);
-  int* expandStarts = cpuExpandStartsPos_->getData();
+  int* expandStarts = expandStartsPos_->getMutableData(false);
  for (size_t sequenceId = 0; sequenceId < numSequences; ++sequenceId) {
    int sequenceLength = starts[sequenceId + 1] - starts[sequenceId];
    for (int j = 0; j < sequenceLength; j++) {
@@ -90,15 +87,8 @@ void ExpandLayer::forward(PassType passType) {
    }
  }
-  if (useGpu_) {
+  outputValue->copyByRowIndex(*inputValue,
-    // TODO(Dangqingqing) move copyFrom
+                              *expandStartsPos_->getVector(useGpu_));
-    IVector::resizeOrCreate(expandStartsPos_, outputBatchSize, true);
-    expandStartsPos_->copyFrom(*cpuExpandStartsPos_, HPPL_STREAM_DEFAULT);
-  } else {
-    expandStartsPos_ = cpuExpandStartsPos_;
-  }
-  outputValue->copyByRowIndex(*inputValue, *expandStartsPos_);
  if (biases_.get() != NULL) {
    outputValue->addBias(*(biases_->getW()), 1);
@@ -108,16 +98,15 @@ void ExpandLayer::forward(PassType passType) {
 void ExpandLayer::backward(const UpdateCallback& callback) {
  if (biases_ && biases_->getWGrad()) {
    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-     /* Increasing the number of gradient */
+    /* Increasing the number of gradient */
    biases_->getParameterPtr()->incUpdate(callback);
  }
  if (!getInputGrad(0)) return;
  MatrixPtr inputGrad = getInputGrad(0);
  MatrixPtr outputGrad = getOutputGrad();
-  auto cpuSeqStartPos =
+  auto cpuSeqStartPos = type_ ? getInput(1).subSequenceStartPositions
-      type_ ? getInput(1).subSequenceStartPositions
+                              : getInput(1).sequenceStartPositions;
-            : getInput(1).sequenceStartPositions;
  size_t numSequences = cpuSeqStartPos->getSize() - 1;
  const int* starts = cpuSeqStartPos->getData(false);

--- a/paddle/gserver/layers/ExpandLayer.h
+++ b/paddle/gserver/layers/ExpandLayer.h
@@ -44,14 +44,9 @@ protected:
  enum ExpandLevel { kNonSeq = 0, kSeq = 1 };
  /// store the ExpandLevel
  int type_;
-  // TODO(luotao) use ICpuGpuVectorPtr to merge cpuExpandStartsPos_
-  // and expandStartsPos_
  /// expanded sequenceStartPositions or subSequenceStartPositions
  /// of input[1]
-  IVectorPtr cpuExpandStartsPos_;
+  ICpuGpuVectorPtr expandStartsPos_;
-  /// point to cpuExpandStartsPos_ when useGpu_ is false,
-  /// copy from cpuExpandStartsPos_ when useGpu_ is true
-  IVectorPtr expandStartsPos_;
 public:
  explicit ExpandLayer(const LayerConfig& config) : Layer(config) {}

--- a/paddle/gserver/layers/MaxLayer.cpp
+++ b/paddle/gserver/layers/MaxLayer.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include "MaxLayer.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
@@ -21,55 +20,11 @@ namespace paddle {
 REGISTER_LAYER(max, MaxLayer);
-bool MaxLayer::init(const LayerMap& layerMap,
-                    const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
-  Layer::init(layerMap, parameterMap);
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
-  // transform to which sequence type
-  if (config_.trans_type() == "non-seq") {
-    type_ = kNonSeq;
-  } else if (config_.trans_type() == "seq") {
-    type_ = kSeq;
-  } else {
-    LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
-  }
-  setNeedSequenceInfo(false);
-  return true;
-}
 void MaxLayer::forward(PassType passType) {
-  Layer::forward(passType);
+  SequencePoolLayer::forward(passType);
-  // max layer should have exactly 1 input
-  CHECK_EQ(1U, inputLayers_.size());
-  size_t dim = getSize();
-  const Argument& input = getInput(0);
-  int64_t newBatchSize =
-      type_ ? input.getNumSubSequences() : input.getNumSequences();
-  ICpuGpuVectorPtr startPositions =
-      type_ ? input.subSequenceStartPositions
-            : input.sequenceStartPositions;
-  auto starts = startPositions->getVector(useGpu_);
-  size_t numSequences = startPositions->getSize() - 1;
-  CHECK_EQ(dim, input.value->getWidth());
+  IVector::resizeOrCreate(maxIndex_, newBatchSize_ * getSize(),
-  CHECK_EQ(numSequences, (size_t)newBatchSize);
+                          useGpu(deviceId_));
-  CHECK_EQ(startPositions->getData(false)[numSequences], input.getBatchSize());
-  if (type_) {
-    // when trans_type = seq, input must hasSubseq
-    CHECK_EQ(input.hasSubseq(), 1UL);
-  }
-  // reset output: resize to "num of sequences", not "batch size".
-  resetOutput(newBatchSize, dim);
-  IVector::resizeOrCreate(maxIndex_, newBatchSize * dim, useGpu(deviceId_));
  maxIndex_->zeroMem();
  MatrixPtr inputValue = getInputValue(0);
@@ -77,16 +32,8 @@ void MaxLayer::forward(PassType passType) {
  {
    REGISTER_TIMER_INFO("MaxLayerForward", getName().c_str());
-    outputValue->maxSequenceForward(*inputValue, *starts, *maxIndex_);
+    outputValue->maxSequenceForward(
-  }
+        *inputValue, *startPositions_->getVector(useGpu_), *maxIndex_);
-  /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
-   * thus, in this case, output_ has no cpuSequenceStartPositions.
-   * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
-   * case, we should compute the new cpuSequenceStartPositions.
-  */
-  if (type_) {
-    output_.degradeSequence(input, useGpu_);
  }
  if (config_.output_max_index()) {
@@ -104,24 +51,14 @@ void MaxLayer::forward(PassType passType) {
 void MaxLayer::backward(const UpdateCallback& callback) {
  CHECK(!config_.output_max_index())
      << "backward is not available when output_max_index is set";
-  /* Do derivation */ { backwardActivation(); }
+  SequencePoolLayer::backward(callback);
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-    // Increasing the number of gradient
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
  MatrixPtr inputGrad = getInputGrad(0);
  MatrixPtr outputGrad = getOutputGrad();
  if (inputGrad) {
-    ICpuGpuVectorPtr starts =
-        type_ ? getInput(0).subSequenceStartPositions
-              : getInput(0).sequenceStartPositions;
    REGISTER_TIMER_INFO("MaxLayerBackward", getName().c_str());
-    inputGrad->maxSequenceBackward(*outputGrad,
+    inputGrad->maxSequenceBackward(
-        *(starts->getVector(useGpu_)), *maxIndex_);
+        *outputGrad, *(startPositions_->getVector(useGpu_)), *maxIndex_);
  }
 }

--- a/paddle/gserver/layers/MaxLayer.h
+++ b/paddle/gserver/layers/MaxLayer.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
-#include "Layer.h"
+#include "SequencePoolLayer.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/utils/ThreadLocal.h"
@@ -24,29 +24,30 @@ namespace paddle {
 /**
 * A layer for "internal max" for sequence input.
 * Input: one or more sequences. Each sequence contains some instances.
- * If MaxLevel = kNonSeq:
+ * If SequenceLevel = kNonSeq:
 *    Output: output size is the number of input sequences (NOT input instances)
 *    output[i] = max_{for each instance in this sequence}{input[i]}
- * If MaxLevel = kSeq:
+ * If SequenceLevel = kSeq:
 *    Check input sequence must has sub-sequence
 *    Output: output size is the number of input sub-sequences
 *    output[i] = max_{for each instance in this sub-sequence}{input[i]}
+ *
+ * The config file api is pooling_layer.
 */
-class MaxLayer : public Layer {
+class MaxLayer : public SequencePoolLayer {
 protected:
-  std::unique_ptr<Weight> biases_;
  // maxIndex_[i][j] = k : the value at (i, j) is from input[k].
  IVectorPtr maxIndex_;
-  int type_;
 public:
-  explicit MaxLayer(const LayerConfig& config) : Layer(config) {}
+  explicit MaxLayer(const LayerConfig& config) : SequencePoolLayer(config) {}
-  enum MaxLevel {kNonSeq = 0, kSeq = 1 };
  ~MaxLayer() {}
-  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) {
+    return SequencePoolLayer::init(layerMap, parameterMap);
+  }
  void forward(PassType passType);
  void backward(const UpdateCallback& callback = nullptr);

--- a/paddle/gserver/layers/MaxOutLayer.cpp
+++ b/paddle/gserver/layers/MaxOutLayer.cpp
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "MaxOutLayer.h"
+#include "hl_gpu.h"
+#include "hl_cnn.h"
+namespace paddle {
+REGISTER_LAYER(maxout, MaxOutLayer);
+size_t MaxOutLayer::getSize() {
+  /* Frame geometry is read from the input layer's output; a dimension that
+   * reads back as 0 is replaced by the value stored in the maxout config. */
+  const MaxOutConfig& conf = config_.inputs(0).maxout_conf();
+  auto&& inputArg = inputLayers_[0]->getOutput();
+  imgSizeH_ = inputArg.getFrameHeight();
+  if (imgSizeH_ == 0) imgSizeH_ = conf.img_size_y();
+  imgSizeW_ = inputArg.getFrameWidth();
+  if (imgSizeW_ == 0) imgSizeW_ = conf.img_size_x();
+  /* Cache the per-channel feature length and record the geometry on this
+   * layer's own output argument. */
+  featLen_ = imgSizeH_ * imgSizeW_;
+  auto&& outputArg = getOutput();
+  outputArg.setFrameHeight(imgSizeH_);
+  outputArg.setFrameWidth(imgSizeW_);
+  return featLen_ * outputChannels_;
+}
+bool MaxOutLayer::init(const LayerMap& layerMap,
+                       const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class; propagate its failure to the caller
+   * instead of discarding the result. */
+  if (!Layer::init(layerMap, parameterMap)) {
+    return false;
+  }
+  /* the size of inputs for maxout-layer is 1 */
+  CHECK_EQ(config_.inputs_size(), 1);
+  const MaxOutConfig& conf = config_.inputs(0).maxout_conf();
+  groups_ = conf.groups();
+  channels_ = conf.channels();
+  /* groups_ is the divisor in the two statements below, so a zero value must
+   * be rejected here with a readable message. */
+  CHECK_GT(groups_, 0UL) << "maxout groups must be positive";
+  /* every output channel is built from exactly groups_ input channels */
+  CHECK_EQ(channels_ % groups_, 0UL);
+  outputChannels_ = channels_ / groups_;
+  return true;
+}
+void MaxOutLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  /* size output_ for the current batch; getSize() also refreshes the cached
+   * frame geometry */
+  const size_t numSamples = getInput(0).getBatchSize();
+  const size_t layerSize = getSize();
+  resetOutput(numSamples, layerSize);
+  /* one index entry per output element, handed to the matrix kernel below */
+  IVector::resizeOrCreate(maxoutId_, layerSize * numSamples, useGpu_);
+  getOutputValue()->maxoutForward(
+      *getInputValue(0), *maxoutId_, outputChannels_, groups_);
+}
+void MaxOutLayer::backward(const UpdateCallback& callback) {
+  /* callback is not used by this function; the cast silences the warning */
+  (void)callback;
+  MatrixPtr inGrad = getInputGrad(0);
+  MatrixPtr outGrad = getOutputGrad();
+  /* without an input gradient buffer there is nothing to propagate */
+  if (!inGrad) {
+    return;
+  }
+  inGrad->maxoutBackward(*outGrad, *maxoutId_, outputChannels_, groups_);
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/MaxOutLayer.h
+++ b/paddle/gserver/layers/MaxOutLayer.h
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+namespace paddle {
+/**
+ * A layer to do max out on conv layer output.
+ * Input: output of a conv layer.
+ * Output: feature map size same as input.  Channel is (input channel) / groups.
+ * So the number of input channels must be divisible by groups; init() checks
+ * this with CHECK_EQ(channels_ % groups_, 0UL).
+ *
+ * The config file api is maxout_layer.
+ */
+class MaxOutLayer : public Layer {
+protected:
+  size_t groups_;  // read from MaxOutConfig::groups() in init()
+  size_t imgSizeH_, imgSizeW_;  // frame height / width, refreshed by getSize()
+  /// channels_ comes from MaxOutConfig; outputChannels_ = channels_ / groups_
+  size_t channels_, outputChannels_;
+  /// feature length = imgSizeH_ * imgSizeW_
+  size_t featLen_;
+  IVectorPtr maxoutId_;  // index buffer shared by maxoutForward/maxoutBackward
+public:
+  /// return imgSizeH_ * imgSizeW_ * outputChannels_; also updates the cached
+  /// image size members and the frame height/width of this layer's output
+  size_t getSize();
+  explicit MaxOutLayer(const LayerConfig& config) : Layer(config) {}
+  virtual ~MaxOutLayer() {}
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  void forward(PassType passType);
+  void backward(const UpdateCallback& callback = nullptr);
+};
+}  // namespace paddle
--- a/paddle/gserver/layers/NCELayer.cpp
+++ b/paddle/gserver/layers/NCELayer.cpp
@@ -21,14 +21,18 @@ limitations under the License. */
 namespace paddle {
 /**
- * Noise-contrastive estimation
+ * Noise-contrastive estimation.
 * Implements the method in the following paper:
- * A fast and simple algorithm for training neural probabilistic language models
+ * A fast and simple algorithm for training neural probabilistic language models.
+ *
+ * The config file api is nce_layer.
 */
 class NCELayer : public Layer {
  int numClasses_;
-  int numInputs_;  // number of input layer besides labelLayer and weightLayer
+  /// number of input layer besides labelLayer and weightLayer
+  int numInputs_;
  LayerPtr labelLayer_;
+  /// weight layer, can be None
  LayerPtr weightLayer_;
  WeightList weights_;
  std::unique_ptr<Weight> biases_;
@@ -43,7 +47,8 @@ class NCELayer : public Layer {
    real weight;
  };
  std::vector<Sample> samples_;
-  bool prepared_;  // whether samples_ is prepared
+  /// whether samples_ is prepared
+  bool prepared_;
  Argument sampleOut_;
  IVectorPtr labelIds_;

--- a/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
+++ b/paddle/gserver/layers/SequenceLastInstanceLayer.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "paddle/utils/Logging.h"
-#include "Layer.h"
+#include "SequencePoolLayer.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/utils/Stat.h"
@@ -29,20 +29,19 @@ namespace paddle {
 * If SequenceLevel = kSeq:
 *   Check input sequence must has sub-sequence
 *   Output: a sequence containing only the last instance of each sub-sequence
- * of the input sequence
+ *           of the input sequence
+ *
+ * The config file api is last_seq and first_seq.
 */
-class SequenceLastInstanceLayer : public Layer {
+class SequenceLastInstanceLayer : public SequencePoolLayer {
 protected:
-  std::unique_ptr<Weight> biases_;
  MatrixPtr tmpSrc_;
  MatrixPtr tmpDest_;
-  enum SequenceLevel { kNonSeq = 0, kSeq = 1 };
-  int type_;
 public:
  explicit SequenceLastInstanceLayer(const LayerConfig& config)
-      : Layer(config) {}
+      : SequencePoolLayer(config) {}
  ~SequenceLastInstanceLayer() {}
@@ -56,55 +55,20 @@ REGISTER_LAYER(seqlastins, SequenceLastInstanceLayer);
 bool SequenceLastInstanceLayer::init(const LayerMap& layerMap,
                                     const ParameterMap& parameterMap) {
-  /* Initialize the basic parent class */
+  SequencePoolLayer::init(layerMap, parameterMap);
-  Layer::init(layerMap, parameterMap);
-  // seqlastins layer should have exactly 1 input
-  CHECK_EQ(1U, inputLayers_.size());
-  /* initialize biases_ */
-  if (biasParameter_.get() != NULL) {
-    biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
-  }
  tmpSrc_ =
      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
  tmpDest_ =
      Matrix::create(nullptr, /* height= */ 1, 1, /* trans= */ false, useGpu_);
-  // transform to which sequence type
-  if (config_.trans_type() == "non-seq") {
-    type_ = kNonSeq;
-  } else if (config_.trans_type() == "seq") {
-    type_ = kSeq;
-  } else {
-    LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
-  }
-  setNeedSequenceInfo(false);
  return true;
 }
 void SequenceLastInstanceLayer::forward(PassType passType) {
-  Layer::forward(passType);
+  SequencePoolLayer::forward(passType);
-  size_t dim = getSize();
-  const Argument& input = getInput(0);
-  // check
-  auto startPositions =
-      type_ ? input.subSequenceStartPositions->getVector(false)
-            : input.sequenceStartPositions->getVector(false);
-  size_t height = type_ ? input.getNumSubSequences() : input.getNumSequences();
-  CHECK_EQ(dim, input.value->getWidth());
-  CHECK_EQ(startPositions->getData()[height], input.getBatchSize());
-  CHECK_EQ(height, startPositions->getSize() - 1);
-  if (type_) {
-    // when trans_type = seq, input must hasSubseq
-    CHECK_EQ(input.hasSubseq(), 1UL);
-  }
-  reserveOutput(height, dim);
+  const int* starts = startPositions_->getData(false);
-  const int* starts = startPositions->getData();
  MatrixPtr inputValue = getInputValue(0);
  MatrixPtr outputValue = getOutputValue();
@@ -112,21 +76,13 @@ void SequenceLastInstanceLayer::forward(PassType passType) {
    AsyncGpuBlock asyncGpuBlock;
    REGISTER_TIMER_INFO("SequenceLastInstanceLayerForward", getName().c_str());
-    for (size_t seqId = 0; seqId < height; ++seqId) {
+    for (size_t seqId = 0; seqId < newBatchSize_; ++seqId) {
      int insId =
          config_.select_first() ? starts[seqId] : starts[seqId + 1] - 1;
      outputValue->subMatrix(seqId, 1, tmpDest_)
          ->assign(*(inputValue->subMatrix(insId, 1, tmpSrc_)));
    }
-    /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
-     * thus, in this case, output_ has no sequenceStartPositions.
-     * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
-     * case, we should compute the new sequenceStartPositions.
-    */
-    if (type_) {
-      output_.degradeSequence(input, useGpu_);
-    }
  }
  if (biases_.get() != NULL) {
@@ -138,23 +94,12 @@ void SequenceLastInstanceLayer::forward(PassType passType) {
 }
 void SequenceLastInstanceLayer::backward(const UpdateCallback& callback) {
-  /* activation, should set to 'linear' in most cases */
+  SequencePoolLayer::backward(callback);
-  backwardActivation();
-  if (biases_ && biases_->getWGrad()) {
-    biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
-    // Increasing the number of gradient
-    biases_->getParameterPtr()->incUpdate(callback);
-  }
  MatrixPtr inputGrad = getInputGrad(0);
  MatrixPtr outputGrad = getOutputGrad();
-  auto startPositions =
+  const int* starts = startPositions_->getData(false);
-      type_ ? getInput(0).subSequenceStartPositions->getVector(false)
+  size_t numSequences = startPositions_->getSize() - 1;
-            : getInput(0).sequenceStartPositions->getVector(false);
-  const int* starts = startPositions->getData();
-  size_t numSequences = startPositions->getSize() - 1;
  if (inputGrad) {
    AsyncGpuBlock asyncGpuBlock;

--- a/paddle/gserver/layers/SequencePoolLayer.cpp
+++ b/paddle/gserver/layers/SequencePoolLayer.cpp
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/utils/Logging.h"
+#include "SequencePoolLayer.h"
+namespace paddle {
+bool SequencePoolLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  // Base-class setup always comes first.
+  Layer::init(layerMap, parameterMap);
+  // Every sequence pooling layer (seqlastins/max/average) has a single input.
+  CHECK_EQ(1U, inputLayers_.size());
+  // The bias is optional: build a 1 x getSize() weight only when configured.
+  if (biasParameter_.get() != NULL) {
+    biases_.reset(new Weight(1, getSize(), biasParameter_));
+  }
+  // Map the configured trans_type string onto a SequenceLevel value.
+  const auto& transType = config_.trans_type();
+  if (transType == "seq") {
+    type_ = kSeq;
+  } else if (transType == "non-seq") {
+    type_ = kNonSeq;
+  } else {
+    LOG(FATAL) << "Unknown trans_type: " << config_.trans_type();
+  }
+  setNeedSequenceInfo(false);
+  return true;
+}
+/**
+ * Common forward work for all sequence pooling layers.
+ *
+ * Picks the start-position vector matching type_ (sub-sequence starts for
+ * kSeq, sequence starts for kNonSeq), validates it against the input batch,
+ * sizes the output to one row per (sub-)sequence and, for kSeq, computes the
+ * degraded sequence information of the output.
+ */
+void SequencePoolLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  const Argument& input = getInput(0);
+  size_t dim = getSize();
+  // check
+  CHECK_EQ(dim, input.value->getWidth());
+  // Validate the position vectors BEFORE they are dereferenced below, so a
+  // wrong input type fails with a readable message instead of a null access.
+  if (type_) {
+    CHECK(input.subSequenceStartPositions)
+      << "when trans_type = seq, input must hasSubseq";
+  } else {
+    CHECK(input.sequenceStartPositions)
+      << "when trans_type = non-seq, input must be a sequence";
+  }
+  newBatchSize_ = type_ ? input.getNumSubSequences() : input.getNumSequences();
+  startPositions_ =
+      type_ ? input.subSequenceStartPositions : input.sequenceStartPositions;
+  auto starts = startPositions_->getVector(false);
+  // The last start position must equal the number of input instances, and
+  // there must be exactly one more position than (sub-)sequences.
+  CHECK_EQ(starts->getData()[newBatchSize_], input.getBatchSize());
+  CHECK_EQ(newBatchSize_, starts->getSize() - 1);
+  resetOutput(newBatchSize_, dim);
+  /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
+   * thus, in this case, output_ has no sequenceStartPositions.
+   * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
+   * case, we should compute the new sequenceStartPositions.
+  */
+  if (type_) {
+    output_.degradeSequence(input, useGpu_);
+  }
+}
+void SequencePoolLayer::backward(const UpdateCallback& callback) {
+  // Propagate the gradient through the activation first.
+  backwardActivation();
+  // Without a bias that owns a gradient buffer there is nothing left to do.
+  if (!biases_ || !biases_->getWGrad()) {
+    return;
+  }
+  // Accumulate the bias gradient from the output gradient ...
+  biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
+  // ... and report the update on the bias parameter.
+  biases_->getParameterPtr()->incUpdate(callback);
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/SequencePoolLayer.h
+++ b/paddle/gserver/layers/SequencePoolLayer.h
+/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+namespace paddle {
+/**
+ * A base layer for SequenceLastInstanceLayer/AverageLayer/MaxLayer.
+ *
+ * Input: one or more sequences. Each sequence contains some instances.
+ * If SequenceLevel = kNonSeq:
+ *    Output: output size is the number of input sequences (NOT input instances)
+ *    output[i] = seqlastins/average/max_{for each instance in this
+ * sequence}{input[i]}
+ * If SequenceLevel = kSeq:
+ *    The input sequence must have sub-sequences (this is checked).
+ *    Output: output size is the number of input sub-sequences
+ *    output[i] = seqlastins/average/max_{for each instance in this
+ * sub-sequence}{input[i]}
+ *
+ * The config file api is pooling_layer.
+ */
+class SequencePoolLayer : public Layer {
+protected:
+  /// SequenceLevel chosen in init() from the trans_type config field
+  int type_;
+  /// optional bias, created in init() only when a bias parameter exists
+  std::unique_ptr<Weight> biases_;
+  /// kNonSeq: pool each sequence; kSeq: pool each sub-sequence
+  enum SequenceLevel { kNonSeq = 0, kSeq = 1 };
+  /// number of (sub-)sequences in the current input, set by forward()
+  size_t newBatchSize_;
+  /// start positions of the (sub-)sequences being pooled, set by forward()
+  ICpuGpuVectorPtr startPositions_;
+public:
+  explicit SequencePoolLayer(const LayerConfig& config) : Layer(config) {}
+  virtual ~SequencePoolLayer() {}
+  bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+  void forward(PassType passType);
+  void backward(const UpdateCallback& callback = nullptr);
+};
+}  // namespace paddle
--- a/paddle/gserver/tests/rnn_data_provider.py
+++ b/paddle/gserver/tests/rnn_data_provider.py
@@ -14,12 +14,15 @@
 from paddle.trainer.PyDataProvider2 import *
+# Note that each config should have an independent provider
+# in current design of PyDataProvider2.
+#######################################################
 data = [
    [[[1, 3, 2], [4, 5, 2]], 0],
    [[[0, 2], [2, 5], [0, 1, 2]], 1],
 ]
+# Used for sequence_nest_rnn.conf
 @provider(input_types=[integer_value_sub_sequence(10),
                       integer_value(3)],
          should_shuffle=False)
@@ -27,7 +30,7 @@ def process_subseq(settings, file_name):
    for d in data:
        yield d
+# Used for sequence_rnn.conf
 @provider(input_types=[integer_value_sequence(10),
                       integer_value(3)],
          should_shuffle=False)
@@ -38,11 +41,32 @@ def process_seq(settings, file_name):
            seq += subseq
        yield seq, d[1]
+# Used for sequence_nest_rnn_multi_input.conf
+@provider(input_types=[integer_value_sub_sequence(10),
+                       integer_value(3)],
+          should_shuffle=False)
+def process_subseq2(settings, file_name):
+    """Yield every nested sample of the module-level `data` unchanged."""
+    for sample in data:
+        yield sample
+# Used for sequence_rnn_multi_input.conf
+@provider(input_types=[integer_value_sequence(10),
+                       integer_value(3)],
+          should_shuffle=False)
+def process_seq2(settings, file_name):
+    """Yield (flat word-id sequence, label) for every sample of `data`.
+
+    The sub-sequences of each sample are concatenated, in order, into one
+    plain sequence.
+    """
+    for nested_words, label in data:
+        flat_words = [wid for subseq in nested_words for wid in subseq]
+        yield flat_words, label
+###########################################################
 data2 = [
    [[[1, 2], [4, 5, 2]], [[5, 4, 1], [3, 1]] ,0],
    [[[0, 2], [2, 5], [0, 1, 2]],[[1, 5], [4], [2, 3, 6, 1]], 1],
 ]
+# Used for sequence_nest_rnn_multi_unequalength_inputs.conf
 @provider(input_types=[integer_value_sub_sequence(10),
                       integer_value_sub_sequence(10),
                       integer_value(2)],
@@ -52,6 +76,7 @@ def process_unequalength_subseq(settings, file_name):
        yield d
+# Used for sequence_rnn_multi_unequalength_inputs.conf
 @provider(input_types=[integer_value_sequence(10),
                       integer_value_sequence(10),
                       integer_value(2)],

--- a/paddle/gserver/tests/sequenceGen.py
+++ b/paddle/gserver/tests/sequenceGen.py
@@ -21,7 +21,7 @@ from paddle.trainer.PyDataProvider2 import *
 def hook(settings, dict_file, **kwargs):
    settings.word_dict = dict_file
    settings.input_types = [integer_value_sequence(len(settings.word_dict)),
-                            integer_value_sequence(3)]
+                            integer_value(3)]
    settings.logger.info('dict len : %d' % (len(settings.word_dict)))
@@ -34,14 +34,14 @@ def process(settings, file_name):
            words = comment.split()
            word_slot = [settings.word_dict[w] for w in words if
                         w in settings.word_dict]
-            yield word_slot, [label]
+            yield word_slot, label
 ## for hierarchical sequence network
 def hook2(settings, dict_file, **kwargs):
    settings.word_dict = dict_file
    settings.input_types = [integer_value_sub_sequence(len(settings.word_dict)),
-                            integer_value_sub_sequence(3)]
+                            integer_value_sequence(3)]
    settings.logger.info('dict len : %d' % (len(settings.word_dict)))
@@ -57,7 +57,7 @@ def process2(settings, file_name):
                words = comment.split()
                word_slot = [settings.word_dict[w] for w in words if
                             w in settings.word_dict]
-                label_list.append([label])
+                label_list.append(label)
                word_slot_list.append(word_slot)
            else:
                yield word_slot_list, label_list

--- a/paddle/gserver/tests/sequence_nest_rnn.conf
+++ b/paddle/gserver/tests/sequence_nest_rnn.conf
@@ -56,9 +56,8 @@ def outer_step(x):
    last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
    # "return last" should also work. But currently RecurrentGradientMachine
-    # does not handle it correctly. Current implementation requires that
+    # does not handle it, and will report error: In hierachical RNN, all out 
-    # all the out links are from sequences. However, it does not report error
+    # links should be from sequences now.
-    # when the out links are not sequences.
    return inner_rnn_output
 out = recurrent_group(

--- a/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf
+++ b/paddle/gserver/tests/sequence_nest_rnn_multi_input.conf
@@ -19,7 +19,7 @@ from paddle.trainer_config_helpers import *
 define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
                        test_list=None,
                        module='rnn_data_provider',
-                        obj='process_subseq')
+                        obj='process_subseq2')
 settings(batch_size=2, learning_rate=0.01)
@@ -57,9 +57,8 @@ def outer_step(wid, x):
    last = last_seq(input=inner_rnn_output, name="outer_rnn_state")
    # "return last" should also work. But currently RecurrentGradientMachine
-    # does not handle it correctly. Current implementation requires that
+    # does not handle it, and will report error: In hierachical RNN, all out 
-    # all the out links are from sequences. However, it does not report error
+    # links should be from sequences now.
-    # when the out links are not sequences.
    return inner_rnn_output
 out = recurrent_group(

--- a/paddle/gserver/tests/sequence_rnn_multi_input.conf
+++ b/paddle/gserver/tests/sequence_rnn_multi_input.conf
@@ -19,7 +19,7 @@ from paddle.trainer_config_helpers import *
 define_py_data_sources2(train_list='gserver/tests/Sequence/dummy.list',
                        test_list=None,
                        module='rnn_data_provider',
-                        obj='process_seq')
+                        obj='process_seq2')
 settings(batch_size=2, learning_rate=0.01)

--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -327,6 +327,24 @@ TEST(Layer, blockExpandLayer) {
  }
 }
+// Gradient check of the maxout layer: 32x32 feature maps, 4 input channels
+// (32 * 32 * 4 = 4096 input values per sample) reduced with groups = 2.
+TEST(Layer, maxoutLayer) {
+  TestConfig config;
+  config.layerConfig.set_type("maxout");
+  config.biasSize = 0;
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 4096, 0});
+  MaxOutConfig* maxoutConf =
+      config.layerConfig.add_inputs()->mutable_maxout_conf();
+  maxoutConf->set_img_size_x(32);
+  maxoutConf->set_img_size_y(32);
+  maxoutConf->set_channels(4);
+  maxoutConf->set_groups(2);
+  // Run the check on CPU first, then on GPU.
+  const bool deviceModes[] = {false, true};
+  for (bool useGpu : deviceModes) {
+    testLayerGrad(config, "maxout", 10, false, useGpu);
+  }
+}
 void testFcLayer(string format, size_t nnz) {
  TestConfig config;
  config.biasSize = 4096;

--- a/paddle/gserver/tests/test_PyDataProvider2.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider2.cpp
@@ -117,7 +117,7 @@ TEST(PyDataProvider2, index_no_seq) {
 }
 TEST(PyDataProvider2, init_hook) {
-  paddle::PyObjectPtr pickle(PyImport_ImportModule("pickle"));
+  paddle::PyObjectPtr pickle = paddle::py::import("pickle");
  paddle::PyObjectPtr globals(
      PyModule_GetDict(PyImport_AddModule("__main__")));
  PyDict_SetItemString(globals.get(), "pickle", pickle.get());

--- a/paddle/gserver/tests/test_PyDataProvider2.py
+++ b/paddle/gserver/tests/test_PyDataProvider2.py
@@ -86,7 +86,7 @@ def test_can_over_batch_size(setting, filename):
        yield [random.randint(0, 100 - 1) for _ in xrange(seq_len)]
-@provider(input_types=[index_slot(10), index_slot(10)])
+@provider(input_types={'input1':index_slot(10), 'input2': index_slot(10)})
 def test_input_order(setting, filename):
    for _ in xrange(1000):
        yield {

--- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
+++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <gtest/gtest.h>
 #include <paddle/utils/Util.h>
 #include <paddle/utils/Version.h>
@@ -24,7 +23,7 @@ limitations under the License. */
 P_DECLARE_int32(seed);
 using namespace paddle;  // NOLINT
-using namespace std;  // NOLINT
+using namespace std;     // NOLINT
 class TrainerForTest : public paddle::Trainer {
 public:
  void startTrain() {
@@ -44,11 +43,10 @@ public:
   */
  size_t getTotalParameterSize() const {
    auto p = const_cast<TrainerForTest*>(this);
-    auto & params = p->getGradientMachine()->getParameters();
+    auto& params = p->getGradientMachine()->getParameters();
-    return std::accumulate(params.begin(), params.end(), 0UL,
+    return std::accumulate(
-                           [](size_t a, const ParameterPtr& p){
+        params.begin(), params.end(), 0UL,
-      return a+p->getSize();
+        [](size_t a, const ParameterPtr& p) { return a + p->getSize(); });
-    });
  }
 };

--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -283,13 +283,13 @@ void GpuMatrix::copyFrom(const IVector& src) {
  copyFrom(matrix);
 }
-void GpuMatrix::copyByRowIndex(Matrix& b, IVector& rowIndex) {
+void GpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) {
  size_t height = getHeight();
  size_t width = getWidth();
  CHECK_EQ(b.getWidth(), width);
  real* dst = getData();
  real* src = b.getData();
-  int* index = rowIndex.getData();
+  const int* index = rowIndex.getData();
  hl_sequence2batch_copy(dst, src, index, width, height, true);
 }
@@ -584,6 +584,42 @@ void GpuMatrix::colMax(Matrix& max) {
  max.maxCols(*this);
 }
+void GpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) {
+  // The top-k column max has no GPU implementation: fail loudly if called.
+  LOG(FATAL) << "Is not supported";
+}
+/**
+ * GPU maxout forward: this matrix is the output, a is the input and id
+ * receives the index data produced by hl_maxout_forward.
+ * The feature length handed to the kernel is getWidth() / channels.
+ */
+void GpuMatrix::maxoutForward(Matrix& a, IVector& id, size_t channels,
+                              size_t groups) {
+  // Both operands must live on the GPU and agree on the batch size.
+  CHECK(dynamic_cast<GpuMatrix*>(&a));
+  CHECK(dynamic_cast<GpuIVector*>(&id));
+  CHECK_EQ(a.getHeight(), getHeight());
+  const size_t outWidth = getWidth();
+  const size_t numSamples = getHeight();
+  const size_t featLen = outWidth / channels;
+  hl_maxout_forward(a.getData(), getData(), id.getData(), numSamples,
+                    outWidth, featLen, groups);
+}
+/**
+ * GPU maxout backward: this matrix is the input gradient, a is the output
+ * gradient and id is the index data passed on to hl_maxout_backward.
+ * The feature length handed to the kernel is a.getWidth() / channels.
+ */
+void GpuMatrix::maxoutBackward(Matrix& a, IVector& id, size_t channels,
+                               size_t groups) {
+  // Both operands must live on the GPU and agree on the batch size.
+  CHECK(dynamic_cast<GpuMatrix*>(&a));
+  CHECK(dynamic_cast<GpuIVector*>(&id));
+  CHECK_EQ(a.getHeight(), getHeight());
+  const size_t outWidth = a.getWidth();
+  const size_t numSamples = getHeight();
+  const size_t featLen = outWidth / channels;
+  hl_maxout_backward(getData(), a.getData(), id.getData(), numSamples,
+                     outWidth, featLen, groups);
+}
 /*calulate the error of classification */
 void GpuMatrix::classificationError(MatrixPtr output, IVectorPtr label) {
  GpuMatrixPtr output_ptr = std::dynamic_pointer_cast<GpuMatrix>(output);
@@ -1329,11 +1365,11 @@ void CpuMatrix::copyFrom(const IVector& src) {
  }
 }
-void CpuMatrix::copyByRowIndex(Matrix& b, IVector& rowIndex) {
+void CpuMatrix::copyByRowIndex(Matrix& b, const IVector& rowIndex) {
  size_t height = getHeight();
  size_t width = getWidth();
  CHECK_EQ(b.getWidth(), width);
-  int* index = rowIndex.getData();
+  const int* index = rowIndex.getData();
  for (size_t i = 0; i < height; i++) {
    CHECK_LT(static_cast<size_t>(index[i]), b.getHeight());
    real* src = b.getData() + index[i] * width;
@@ -2799,6 +2835,95 @@ void CpuMatrix::colMax(Matrix& max) {
  max.maxCols(*this);
 }
+/**
+ * Select, for every column of this matrix, the beam largest elements, where
+ * beam = maxVal.getHeight().
+ *
+ * For column i the j-th selected value is written to maxVal(j, i) and its row
+ * index to maxIds[i + j * getWidth()], in descending order of value.
+ * maxIds must hold getWidth() * beam entries and maxVal must be as wide as
+ * this matrix. beam must not exceed the number of rows.
+ */
+void CpuMatrix::colMax(IVector& maxIds, Matrix& maxVal) {
+  CHECK(isContiguous());
+  CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix type are not equal";
+  size_t numSamples = getWidth();
+  size_t beam = maxVal.getHeight();
+  size_t dim = getHeight();
+  CHECK_EQ(maxIds.getSize(), numSamples * beam);
+  CHECK_EQ(maxVal.getWidth(), numSamples);
+  // std::partial_sort requires its middle iterator to lie inside the range;
+  // asking for more elements than a column has would be undefined behaviour.
+  CHECK_LE(beam, dim) << "beam exceeds the number of rows";
+  real* a = getData();
+  int* s = maxIds.getData();
+  real* t = maxVal.getData();
+  // One (value, row) buffer reused for every column to avoid reallocations.
+  std::vector<std::pair<real, size_t>> vec;
+  vec.reserve(dim);
+  for (size_t i = 0; i < numSamples; i++) {
+    vec.clear();
+    for (size_t j = 0; j < dim; j++) {
+      vec.emplace_back(a[i + j * numSamples], j);
+    }
+    // Bring the beam largest values of this column to the front.
+    std::partial_sort(
+        vec.begin(), vec.begin() + beam, vec.end(),
+        [](const std::pair<real, size_t>& l, const std::pair<real, size_t>& r) {
+          return l.first > r.first;
+        });
+    for (size_t j = 0; j < beam; j++) {
+      t[i + j * numSamples] = vec[j].first;
+      s[i + j * numSamples] = vec[j].second;
+    }
+  }
+}
+/**
+ * CPU maxout forward: this matrix is the output, a is the input and id
+ * receives, per output element, the index chosen by colMax.
+ *
+ * For every sample the input is regrouped into a (groups x getWidth())
+ * scratch matrix whose column-wise maximum becomes the output row.
+ */
+void CpuMatrix::maxoutForward(Matrix& a, IVector& id, size_t channels,
+                              size_t groups) {
+  CHECK(dynamic_cast<CpuMatrix*>(&a));
+  CHECK(dynamic_cast<CpuIVector*>(&id));
+  CHECK_EQ(a.getHeight(), getHeight());
+  const size_t outWidth = getWidth();
+  const size_t numSamples = getHeight();
+  const size_t featLen = outWidth / channels;
+  const real* inData = a.getData();
+  int* idData = id.getData();
+  // Scratch buffers shared by all samples.
+  MatrixPtr candidates, best;
+  Matrix::resizeOrCreate(candidates, groups, outWidth, false, false);
+  Matrix::resizeOrCreate(best, 1, outWidth, false, false);
+  for (size_t sample = 0; sample < numSamples; ++sample) {
+    const size_t outOffset = sample * outWidth;
+    // Each input row is groups times as wide as an output row.
+    const real* sampleIn = inData + outOffset * groups;
+    for (size_t c = 0; c < channels; ++c) {
+      const size_t colBegin = c * featLen;
+      const size_t colEnd = colBegin + featLen;
+      for (size_t g = 0; g < groups; ++g) {
+        // Row g of the scratch matrix holds the g-th group member of
+        // output channel c.
+        MatrixPtr slot = candidates->subMatrix(g, g + 1, colBegin, colEnd);
+        slot->copyFrom(sampleIn + (c * groups + g) * featLen, featLen);
+      }
+    }
+    // View onto the slice of id that belongs to this sample.
+    IVectorPtr sampleIds =
+        IVector::create(idData + outOffset, outWidth, false);
+    candidates->colMax(*sampleIds, *best);
+    this->subRowMatrix(sample, sample + 1)->copyFrom(*best);
+  }
+}
+/**
+ * CPU maxout backward: this matrix is the input gradient, a is the output
+ * gradient and id holds, per output element, the group index that was
+ * selected in the forward pass.
+ *
+ * Each output gradient is added to the input-gradient element it came from;
+ * all other input gradients are left untouched.
+ */
+void CpuMatrix::maxoutBackward(Matrix& a, IVector& id, size_t channels,
+                               size_t groups) {
+  CHECK(dynamic_cast<CpuMatrix*>(&a));
+  CHECK(dynamic_cast<CpuIVector*>(&id));
+  CHECK_EQ(a.getHeight(), getHeight());
+  size_t size = a.getWidth();
+  size_t batchSize = getHeight();
+  size_t featLen = size / channels;
+  size_t newFeatLen = groups * featLen;
+  real* inputG = getData();
+  const real* outG = a.getData();
+  const int* idForCpu = id.getData();
+  for (size_t batch_idx = 0; batch_idx < batchSize; ++batch_idx) {
+    size_t newIndex = batch_idx * size;
+    const int* idData = idForCpu + newIndex;
+    // An input-gradient row is groups times as wide as an output row.
+    real* sampleInG = inputG + newIndex * groups;
+    const real* sampleOutG = outG + newIndex;
+    for (size_t i = 0; i < size; ++i) {
+      // Keep the offset in size_t: narrowing it to int could overflow for
+      // large feature maps.
+      size_t gradIdx =
+          idData[i] * featLen + (i / featLen) * newFeatLen + i % featLen;
+      sampleInG[gradIdx] += sampleOutG[i];
+    }
+  }
+}
 void CpuMatrix::rowNormalizeL1(Matrix& out) {
  CHECK(!out.useGpu());

--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -253,7 +253,7 @@ public:
    LOG(FATAL) << "copy data from int vector only available on CpuMatrix.";
  }
-  virtual void copyByRowIndex(Matrix& b, IVector& rowIndex) {
+  virtual void copyByRowIndex(Matrix& b, const IVector& rowIndex) {
    LOG(FATAL) << "Not implemented";
  }
@@ -493,16 +493,40 @@ public:
    LOG(FATAL) << "Not implemeted";
  }
+  /**
+   * Store the max of each column of this matrix into max.
+   */
  virtual void colMax(Matrix& max) { LOG(FATAL) << "not implemented"; }
+  /**
+   * @brief Get the top k elements of each column of this matrix.
+   *
+   * The row ids and values of these elements are stored in
+   * maxIds and maxVal respectively, where k is the height of maxVal and
+   * maxIds holds k entries per column.
+   * Callers should not rely on the order of the top k elements.
+   *
+   * This base implementation is not usable and aborts with a fatal log.
+   */
+  virtual void colMax(IVector& maxIds, Matrix& maxVal) {
+    LOG(FATAL) << "not implemented";
+  }
+  /**
+   * @brief Maxout forward pass; this matrix receives the output.
+   *
+   * a is the input and id receives the indices of the selected elements.
+   * The width of this matrix divided by channels is used as the feature
+   * length by the CPU and GPU implementations.
+   *
+   * This base implementation is not usable and aborts with a fatal log.
+   */
+  virtual void maxoutForward(Matrix& a, IVector& id, size_t channels,
+                             size_t groups) {
+    LOG(FATAL) << "not implemented";
+  }
+  /**
+   * @brief Maxout backward pass; this matrix receives the input gradient.
+   *
+   * a is the output gradient and id holds the indices recorded by
+   * maxoutForward. The width of a divided by channels is used as the feature
+   * length by the CPU and GPU implementations.
+   *
+   * This base implementation is not usable and aborts with a fatal log.
+   */
+  virtual void maxoutBackward(Matrix& a, IVector& id, size_t channels,
+                              size_t groups) {
+    LOG(FATAL) << "not implemented";
+  }
  virtual void rowMaxId(IVector& maxIds) { LOG(FATAL) << "Not implemented"; }
  /**
   * @brief Get the top k elements of each row of this matrix.
   *
   * The column ids and values of these elements are stored in
-   * maxIds and max respectively. Note that the top k
+   * maxIds and max respectively. where k is the size of maxIds.
-   * elements are not sorted.
+   * And note that the top k elements are not sorted.
   */
  virtual void rowMax(IVector& maxIds, Matrix& max) {
    LOG(FATAL) << "Not implemented";
@@ -995,7 +1019,7 @@ public:
  void copyFrom(const IVector& src);
-  void copyByRowIndex(Matrix& b, IVector& rowIndex);
+  void copyByRowIndex(Matrix& b, const IVector& rowIndex);
  MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
@@ -1101,6 +1125,9 @@ public:
  void rowMax(Matrix& max);
  void rowMax(IVector& maxIds, Matrix& max);
  void colMax(Matrix& max);
+  void colMax(IVector& maxIds, Matrix& max);
+  void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups);
+  void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups);
  void oneHotCrossEntropy(Matrix& output, IVector& label);
  void oneHotCrossEntropyBp(Matrix& outputV, IVector& label);
@@ -1271,7 +1298,7 @@ public:
  void copyFrom(CpuSparseMatrix& src);
-  void copyByRowIndex(Matrix& b, IVector& rowIndex);
+  void copyByRowIndex(Matrix& b, const IVector& rowIndex);
  MatrixPtr clone(size_t height, size_t width, bool useGpu = false);
@@ -1425,6 +1452,9 @@ public:
  void rowMax(Matrix& max);
  void rowMax(IVector& maxIds, Matrix& maxVal);
  void colMax(Matrix& max);
+  void colMax(IVector& maxIds, Matrix& maxVal);
+  void maxoutForward(Matrix& a, IVector& id, size_t channels, size_t groups);
+  void maxoutBackward(Matrix& a, IVector& id, size_t channels, size_t groups);
  void rowNormalizeL1(Matrix& out);
  void oneHotCrossEntropy(Matrix& output, IVector& label);

--- a/paddle/math/SparseRowMatrix.cpp
+++ b/paddle/math/SparseRowMatrix.cpp
@@ -227,12 +227,18 @@ void CacheRowCpuMatrix::mul(CpuSparseMatrix* a, CpuMatrix* b, real scaleAB,
 void SparsePrefetchRowCpuMatrix::addRows(const unsigned int* ids, size_t len) {
   std::vector<unsigned int>& localIndices = indexDictHandle_->localIndices;
+  // Reject ids that do not address a row of this matrix before any of them
+  // is recorded; the message keeps the fields apart so it stays readable.
+  for (size_t i = 0; i < len; i ++) {
+    CHECK_LT(*(ids + i), this->getHeight())
+      << "id: " << *(ids + i) << ", Height: " << this->getHeight()
+      << ", sparse id value exceeds the max input dimension, "
+      << "it could be caused by invalid input data samples";
+  }
   localIndices.insert(localIndices.end(), ids, ids + len);
 }
 void SparsePrefetchRowCpuMatrix::addRows(MatrixPtr input) {
  CpuSparseMatrix* mat = dynamic_cast<CpuSparseMatrix*>(input.get());
-  CHECK(mat) << "only support non value sparse matrix";
+  CHECK(mat) << "only support sparse matrix";
  addRows(reinterpret_cast<const unsigned int*>(mat->getCols()),
          mat->getElementCnt());
 }
@@ -243,7 +249,13 @@ void SparsePrefetchRowCpuMatrix::addRows(IVectorPtr ids) {
  int* index = ids->getData();
  for (size_t i = 0; i < numSamples; ++i) {
    if (index[i] == -1) continue;
-    localIndices.push_back((unsigned int)index[i]);
+    unsigned int id = (unsigned int)index[i];
+    CHECK_LT(id, this->getHeight())
+      << "id:" << id << "Height:" << this->getHeight()
+      << "sparse id value exceeds the max input dimension, "
+      << "it could be caused invalid input data samples";
+    localIndices.push_back(id);
  }
 }

--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -2065,6 +2065,78 @@ TEST(Matrix, PoolFwdBwd) {
  }
 }
+// Runs maxoutForward and maxoutBackward on a CpuMatrix and a GpuMatrix that
+// hold the same data, then checks that both produce matching outputs, ids
+// and input gradients.
+// NOTE(review): outChannels is computed with integer division, so this
+// presumably expects channels to be a multiple of groups - confirm against
+// the maxoutForward implementation.
+void testMaxOutFwdBwd(int numSamples, int imgSizeH, int imgSizeW,
+                      int channels, int groups) {
+  int inWidth = imgSizeH * imgSizeW * channels;
+  int outChannels = channels / groups;
+  int outWidth = imgSizeH * imgSizeW * outChannels;
+  // forward
+  MatrixPtr input = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpu = GpuMatrix::create(numSamples, inWidth, false, true);
+  MatrixPtr target = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpu = GpuMatrix::create(numSamples, outWidth, false, true);
+  MatrixPtr targetCheck = CpuMatrix::create(numSamples, outWidth, false, false);
+  // The id vectors hold one entry per output element.
+  IVectorPtr id = CpuIVector::create(numSamples * outWidth, false);
+  IVectorPtr idGpu = GpuIVector::create(numSamples * outWidth, true);
+  IVectorPtr idCheck = CpuIVector::create(numSamples * outWidth, false);
+  // Randomize the CPU input once and copy it, so both runs see the same data.
+  input->randomizeUniform();
+  inputGpu->copyFrom(*input);
+  target->maxoutForward(*input, *id, outChannels, groups);
+  targetGpu->maxoutForward(*inputGpu, *idGpu, outChannels, groups);
+  // check: copy the GPU results into CPU buffers and compare with the CPU run
+  targetCheck->copyFrom(*targetGpu);
+  MatrixCheckErr(*target, *targetCheck);
+  idCheck->copyFrom(*idGpu);
+  VectorCheckEqual(*id, *idCheck);
+  // backward
+  MatrixPtr inputGrad = CpuMatrix::create(numSamples, inWidth, false, false);
+  MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
+  MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
+  MatrixPtr targetGpuGrad = GpuMatrix::create(numSamples, outWidth, false,
+                                              true);
+  // Despite its name this buffer has the input-gradient width: it receives
+  // the GPU input gradient for comparison below.
+  MatrixPtr targetCheckGrad = CpuMatrix::create(numSamples, inWidth, false,
+                                                false);
+  // Both gradients are randomized on the CPU and copied to their GPU twins
+  // before the backward calls, so the two runs start from identical values.
+  inputGrad->randomizeUniform();
+  targetGrad->randomizeUniform();
+  inputGpuGrad->copyFrom(*inputGrad);
+  targetGpuGrad->copyFrom(*targetGrad);
+  inputGrad->maxoutBackward(*targetGrad, *id, outChannels, groups);
+  inputGpuGrad->maxoutBackward(*targetGpuGrad, *idGpu, outChannels, groups);
+  // check: compare the CPU input gradient with the copied-back GPU one
+  targetCheckGrad->copyFrom(*inputGpuGrad);
+  MatrixCheckErr(*inputGrad, *targetCheckGrad);
+}
+// Sweeps a small grid of sample counts, channel counts, image sizes and
+// group counts, logging each combination and passing it to testMaxOutFwdBwd.
+TEST(Matrix, MaxOutFwdBwd) {
+  for (const int numSamples : {5, 10}) {
+    for (const int channels : {8, 16}) {
+      for (const int imgSizeH : {14, 28}) {
+        for (const int imgSizeW : {16, 30}) {
+          for (const int groups : {2, 4}) {
+            VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
+                    << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
+                    << " groups=" << groups;
+            testMaxOutFwdBwd(numSamples, imgSizeH, imgSizeW, channels, groups);
+          }
+        }
+      }
+    }
+  }
+}
 int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  initMain(argc, argv);

--- a/paddle/parameter/Parameter.h
+++ b/paddle/parameter/Parameter.h
@@ -146,6 +146,12 @@ public:
    }
  }
+  // Allocates the buffer for `type` on first use: when bufs_[type] is still
+  // empty, a parallel vector of config_.size() elements is created and
+  // zeroMem() is called on it. Later calls for the same type do nothing.
+  void enableBufType(ParameterType type) {
+    if (!bufs_[type]) {
+      bufs_[type] = Vector::createParallelVector(config_.size(), useGpu_);
+      bufs_[type]->zeroMem();
+    }
+  }
  void enableIntType(ParameterType type, size_t intStoreSize = 0) {
    if (!intBufs_[type]) {
      SetDevice device(deviceId_);

--- a/paddle/pserver/PserverForPython.h
+++ b/paddle/pserver/PserverForPython.h
-/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#pragma once
-#include "paddle/pserver/ParameterClient.h"
-#include "paddle/pserver/ParameterServer.h"
-#include "paddle/parameter/Parameter.h"
-#include <Python.h>
-namespace paddle {
-struct PyObjectDeleter {
-  void operator()(PyObject* obj) {
-    if (obj) {
-      Py_DECREF(obj);
-    }
-  }
-};
-class ParameterClientPy : public ParameterClient {
-protected:
-  typedef std::unique_ptr<PyObject, PyObjectDeleter> PyObjectPtr;
-  std::vector<ParameterPtr> parameter_;
-  int initArgc_;
-  char** initArgv_;
-public:
-  ParameterClientPy(std::vector<std::string> configs, int argc,
-                    std::vector<std::string> argv, bool useGpu) {
-    initArgc_ = argc;
-    initArgv_ = new char* [argc];
-    for (int i = 0; i < argc; i++) {
-      initArgv_[i] = new char[argv[i].size()];
-      strcpy(initArgv_[i],      // NOLINT
-             argv[i].c_str());  // NOLINT TODO(yuyang18): use snprintf instead.
-    }
-    ParameterConfig pyConfig;
-    ParameterPtr param;
-    for (auto& config : configs) {
-      pyConfig.ParseFromString(config);
-      param.reset(new Parameter(pyConfig, useGpu));
-      parameter_.push_back(param);
-    }
-    Py_Initialize();
-    CHECK(Py_IsInitialized());
-  }
-  ~ParameterClientPy() {
-    delete initArgv_;
-    Py_Finalize();
-  }
-  Parameter getParameter(int idx) { return *(parameter_[idx].get()); }
-  void initClientPy() {
-    initMain(initArgc_, initArgv_);
-    CHECK(init(parameter_)) << "Init Client Failed.";
-  }
-  void setConfigPy(std::string config) {
-    OptimizationConfig optConfig;
-    optConfig.ParseFromString(config);
-    setConfig(optConfig);
-  }
-  bool inStatusPy(int status) { return inStatus(PServerStatus(status)); }
-  void setStatusPy(int status) { setStatus(PServerStatus(status)); }
-  void waitForStatusPy(int status) { waitForStatus(PServerStatus(status)); }
-  void sendParameterPy(int updateMode, int parameterType, int numSamples,
-                       real cost, bool sendBackParameter) {
-    sendParameter(ParameterUpdateMode(updateMode), ParameterType(parameterType),
-                  int64_t(numSamples), real(cost), sendBackParameter);
-  }
-  template <class ProtoIn, class ProtoOut>
-  std::string asyncCallPy(const char* serviceName, const char* funcName,
-                          const std::string in) {
-    ProtoIn protoIn;
-    ProtoOut protoOut;
-    std::mutex waitLock;
-    std::string data;
-    protoIn.ParseFromString(in);
-    waitLock.lock();
-    auto callback = [&](ProtoOut* pOut, bool isSuccessful) {
-      if (isSuccessful) {
-        pOut->SerializeToString(&data);
-      } else {
-        LOG(INFO) << "Async Talk Failed.";
-      }
-      waitLock.unlock();
-    };
-    ubClient_.asyncCall<ProtoIn, ProtoOut>(serviceName, funcName, protoIn,
-                                           &protoOut, callback);
-    waitLock.lock();
-    protoOut.SerializeToString(&data);
-    return data;
-  }
-};
-}  // namespace paddle
--- a/paddle/py_paddle/dataprovider_converter.py
+++ b/paddle/py_paddle/dataprovider_converter.py
@@ -63,7 +63,8 @@ class SparseBinaryScanner(IScanner):
    def scan(self, dat):
        self.extend_cols(dat)
-        self.__rows__.append(len(dat))
+        # Append the running total (previous last entry plus this sample's
+        # length), so __rows__ holds cumulative offsets instead of per-sample
+        # counts. Assumes __rows__ already has an initial entry - TODO confirm
+        # in the scanner's constructor.
+        self.__rows__.append(len(dat) + self.__rows__[-1])
+        # One more sample has been scanned.
+        self.__height__ += 1
    def extend_cols(self, dat):
        self.__cols__.extend(dat)

--- a/paddle/scripts/travis/before_install.sh
+++ b/paddle/scripts/travis/before_install.sh
--- a/paddle/scripts/travis/before_install.osx.sh
+++ b/paddle/scripts/travis/before_install.osx.sh
+#!/bin/bash
+# Travis macOS setup: installs the build dependencies through Homebrew and
+# pip, then builds and installs googletest 1.8.0 from its release tarball.
+brew update
+brew tap homebrew/science
+brew install python
+# Pin the Python protobuf package to 2.6.0 and install the matching
+# protobuf260 formula without its Python bindings.
+sudo pip install --upgrade protobuf==2.6.0
+brew install homebrew/versions/protobuf260 --without-python
+# NOTE(review): python is listed again here although it is installed above.
+brew install cmake python glog gflags openblas wget md5sha1sum
+# Download, unpack, configure and install googletest from source.
+wget https://github.com/google/googletest/archive/release-1.8.0.tar.gz -O gtest.tar.gz
+tar xf gtest.tar.gz
+cd googletest-release-1.8.0/
+cmake .
+make install
--- a/paddle/scripts/travis/build_and_test.sh
+++ b/paddle/scripts/travis/build_and_test.sh
 #!/bin/bash
 source ./common.sh
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_TESTING=ON -DON_TRAVIS=ON
+# Extra CMake arguments; only populated on macOS, where PYTHON_LIBRARY is
+# pointed at a fixed libpython2.7.dylib path.
+CMAKE_EXTRA=""
-make -j `nproc`
+if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
-env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="-j `nproc`"
+  CMAKE_EXTRA="-DPYTHON_LIBRARY=/usr/local/Cellar/python/2.7.12_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/config/libpython2.7.dylib"
+fi
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_TESTING=ON -DON_TRAVIS=ON ${CMAKE_EXTRA}
+# Parallel job count for make and the tests: defaults to 1 and is raised to
+# the CPU count reported by nproc (Linux) or sysctl hw.ncpu (macOS).
+NPROC=1
+if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
+  NPROC=`nproc`
+elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
+  NPROC=`sysctl -n hw.ncpu`
+fi
+make -j $NPROC
+env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="-j $NPROC"
 sudo make install
 sudo paddle version
--- a/paddle/trainer/ThreadParameterUpdater.cpp
+++ b/paddle/trainer/ThreadParameterUpdater.cpp
--- a/paddle/trainer/tests/test_config.conf
+++ b/paddle/trainer/tests/test_config.conf
--- a/paddle/utils/.gitignore
+++ b/paddle/utils/.gitignore
--- a/paddle/utils/CMakeLists.txt
+++ b/paddle/utils/CMakeLists.txt
--- a/paddle/utils/Logging.h
+++ b/paddle/utils/Logging.h
--- a/paddle/utils/PythonUtil.cpp
+++ b/paddle/utils/PythonUtil.cpp
--- a/paddle/utils/PythonUtil.h
+++ b/paddle/utils/PythonUtil.h
--- a/paddle/utils/Queue.h
+++ b/paddle/utils/Queue.h
--- a/paddle/utils/enable_virtualenv.py
+++ b/paddle/utils/enable_virtualenv.py
--- a/proto/ModelConfig.proto.m4
+++ b/proto/ModelConfig.proto.m4
--- a/python/paddle/trainer/PyDataProvider2.py
+++ b/python/paddle/trainer/PyDataProvider2.py
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
--- a/python/paddle/trainer_config_helpers/data_sources.py
+++ b/python/paddle/trainer_config_helpers/data_sources.py
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
--- a/python/paddle/trainer_config_helpers/optimizers.py
+++ b/python/paddle/trainer_config_helpers/optimizers.py
--- a/python/paddle/trainer_config_helpers/tests/configs/check.md5
+++ b/python/paddle/trainer_config_helpers/tests/configs/check.md5
--- a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
--- a/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/shared_fc.py
--- a/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
--- a/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py