Commit 766a61c3 authored by Q qijun

fix conflict with baidu/develop

......@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8)
project(paddle CXX C)
set(PADDLE_MAJOR_VERSION 0)
set(PADDLE_MINOR_VERSION 8)
set(PADDLE_PATCH_VERSION 0b2)
set(PADDLE_PATCH_VERSION 0b3)
set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
......@@ -135,6 +135,7 @@ endif()
if(WITH_GLOG)
add_definitions(-DPADDLE_USE_GLOG)
include_directories(${LIBGLOG_INCLUDE_DIR})
endif()
if(WITH_GFLAGS)
......
......@@ -21,12 +21,6 @@ function(safe_set_flag is_c src_list flag_name)
endif()
if(${safe_name})
set(${src_list} "${${src_list}} ${flag_name}" PARENT_SCOPE)
if(is_c)
set(CUDA_NVCC_FLAGS
--compiler-options;${flag_name}
${CUDA_NVCC_FLAGS}
PARENT_SCOPE)
endif()
endif()
endfunction()
......@@ -40,6 +34,20 @@ macro(safe_set_cxxflag src_list flag_name)
safe_set_flag(OFF ${src_list} ${flag_name})
endmacro()
# helper macro to set nvcc flag
macro(safe_set_nvflag flag_name)
string(REPLACE "-" "_" safe_name ${flag_name})
string(REPLACE "=" "_" safe_name ${safe_name})
CHECK_C_COMPILER_FLAG(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name})
set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name})
if(${safe_name})
set(CUDA_NVCC_FLAGS
--compiler-options;${flag_name}
${CUDA_NVCC_FLAGS})
endif()
endmacro()
CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS)
if(NOT UINT64_MAX_EXISTS)
set(CMAKE_REQUIRED_DEFINITIONS -D__STDC_LIMIT_MACROS)
......@@ -63,20 +71,43 @@ set(COMMON_FLAGS
-Wnon-virtual-dtor
-Wdelete-non-virtual-dtor
-Wno-unused-parameter
-Wno-unused-function
-Wno-error=literal-suffix
-Wno-error=unused-local-typedefs)
set(GPU_COMMON_FLAGS
-fPIC
-fno-omit-frame-pointer
-Wnon-virtual-dtor
-Wdelete-non-virtual-dtor
-Wno-unused-parameter
-Wno-unused-function
-Wno-error=literal-suffix
-Wno-error=unused-local-typedefs
-Wno-error=unused-function # Warnings in Numpy Header.
)
if (APPLE)
# On Mac OS X build fat binaries with x86_64 architectures by default.
set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
else()
set(GPU_COMMON_FLAGS
-Wall
-Wextra
-Werror
${GPU_COMMON_FLAGS})
endif()
foreach(flag ${COMMON_FLAGS})
safe_set_cflag(CMAKE_C_FLAGS ${flag})
safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag})
endforeach()
# On Mac OS X build fat binaries with x86_64 architectures by default.
if (APPLE)
set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
endif ()
foreach(flag ${GPU_COMMON_FLAGS})
safe_set_nvflag(${flag})
endforeach()
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here.
......
......@@ -27,6 +27,7 @@ function(generate_python_api target_name)
COMMAND swig -python -c++ -outcurrentdir -I../ api/Paddle.swig
&& mv ${PROJ_ROOT}/paddle/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
DEPENDS ${PROJ_ROOT}/paddle/api/Paddle.swig
${PROJ_ROOT}/paddle/api/PaddleAPI.h
WORKING_DIRECTORY ${PROJ_ROOT}/paddle
COMMENT "Generate Python API from swig")
add_custom_target(${target_name} ALL DEPENDS
......
This folder contains the scripts used in the PaddlePaddle introduction.
- use `bash train.sh` to train a simple linear regression model
- use `python evaluate_model.py` to read the model parameters. You can see that `w` and `b` are very close to [2, 0.3].
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer.PyDataProvider2 import *
import random
# define data types of input: 2 real numbers
@provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
def process(settings, input_file):
for i in xrange(2000):
x = random.random()
yield [x], [2*x+0.3]
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Print model parameters in last model
Usage:
python evaluate_model.py
"""
import numpy as np
import os
def load(file_name):
with open(file_name, 'rb') as f:
f.read(16) # skip header for float type.
return np.fromfile(f, dtype=np.float32)
def main():
print 'w=%.6f, b=%.6f from pass 29' % (load('output/pass-00029/w'),
load('output/pass-00029/b'))
if __name__ == '__main__':
main()
#!/bin/bash
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
paddle train \
--config=trainer_config.py \
--save_dir=./output \
--num_passes=30 \
2>&1 |tee 'train.log'
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
# 1. read data. Suppose you saved above python code as dataprovider.py
data_file = 'empty.list'
with open(data_file, 'w') as f: f.writelines(' ')
define_py_data_sources2(train_list=data_file, test_list=None,
module='dataprovider', obj='process',args={})
# 2. learning algorithm
settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
# 3. Network configuration
x = data_layer(name='x', size=1)
y = data_layer(name='y', size=1)
y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
cost = regression_cost(input=y_predict, label=y)
outputs(cost)
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import itertools
import random
from paddle.trainer.config_parser import parse_config
from py_paddle import swig_paddle as api
from py_paddle import DataProviderConverter
from paddle.trainer.PyDataProvider2 \
import integer_value, integer_value_sequence, sparse_binary_vector
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument("--train_data",
type=str, required=False, help="train data file")
parser.add_argument("--test_data", type=str, help="test data file")
parser.add_argument("--config",
type=str, required=True, help="config file name")
parser.add_argument("--dict_file", required=True, help="dictionary file")
parser.add_argument("--seq",
default=1, type=int,
help="whether use sequence training")
parser.add_argument("--use_gpu", default=0, type=int,
help="whether use GPU for training")
parser.add_argument("--trainer_count", default=1, type=int,
help="Number of threads for training")
parser.add_argument("--num_passes", default=5, type=int,
help="Number of training passes")
return parser.parse_args()
UNK_IDX = 0
def load_data(file_name, word_dict):
with open(file_name, 'r') as f:
for line in f:
label, comment = line.strip().split('\t')
words = comment.split()
word_slot = [word_dict.get(w, UNK_IDX) for w in words]
yield word_slot, int(label)
def load_dict(dict_file):
word_dict = dict()
with open(dict_file, 'r') as f:
for i, line in enumerate(f):
w = line.strip().split()[0]
word_dict[w] = i
return word_dict
def main():
options = parse_arguments()
api.initPaddle("--use_gpu=%s" % options.use_gpu,
"--trainer_count=%s" % options.trainer_count)
word_dict = load_dict(options.dict_file)
train_dataset = list(load_data(options.train_data, word_dict))
if options.test_data:
test_dataset = list(load_data(options.test_data, word_dict))
else:
test_dataset = None
trainer_config = parse_config(options.config,
"dict_file=%s" % options.dict_file)
# No need to have data provider for trainer
trainer_config.ClearField('data_config')
trainer_config.ClearField('test_data_config')
# create a GradientMachine from the model configuration
model = api.GradientMachine.createFromConfigProto(
trainer_config.model_config)
# create a trainer for the gradient machine
trainer = api.Trainer.create(trainer_config, model)
# create a data converter which converts data to PaddlePaddle
# internal format
input_types = [
integer_value_sequence(len(word_dict)) if options.seq
else sparse_binary_vector(len(word_dict)),
integer_value(2)]
converter = DataProviderConverter(input_types)
batch_size = trainer_config.opt_config.batch_size
trainer.startTrain()
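# each pass below shuffles the training set and feeds it to the trainer in mini-batches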
for train_pass in xrange(options.num_passes):
trainer.startTrainPass()
random.shuffle(train_dataset)
for pos in xrange(0, len(train_dataset), batch_size):
batch = itertools.islice(train_dataset, pos, pos + batch_size)
size = min(batch_size, len(train_dataset) - pos)
trainer.trainOneDataBatch(size, converter(batch))
trainer.finishTrainPass()
if test_dataset:
trainer.startTestPeriod()
for pos in xrange(0, len(test_dataset), batch_size):
batch = itertools.islice(test_dataset, pos, pos + batch_size)
size = min(batch_size, len(test_dataset) - pos)
trainer.testOneDataBatch(size, converter(batch))
trainer.finishTestPeriod()
trainer.finishTrain()
if __name__ == '__main__':
main()
#!/bin/bash
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
# Note: if using trainer_config.emb.py, trainer_config.cnn.py
# or trainer_config.lstm.py, you need to change --seq=0 to --seq=1 below,
# because they are sequence models.
python api_train.py \
--config=trainer_config.lr.py \
--trainer_count=2 \
--num_passes=15 \
--use_gpu=0 \
--seq=0 \
--train_data=data/train.txt \
--test_data=data/test.txt \
--dict_file=data/dict.txt \
2>&1 | tee 'train.log'
......@@ -16,6 +16,7 @@ from paddle.trainer.PyDataProvider2 import *
UNK_IDX = 0
def initializer(settings, dictionary, **kwargs):
settings.word_dict = dictionary
settings.input_types = [
......
......@@ -24,7 +24,7 @@ paddle train \
--config=$cfg \
--save_dir=./output \
--trainer_count=4 \
--log_period=20 \
--log_period=100 \
--num_passes=15 \
--use_gpu=false \
--show_parameter_stats_period=100 \
......
......@@ -16,7 +16,7 @@
from paddle.trainer_config_helpers import *
dict_file = "./data/dict.txt"
dict_file = get_config_arg('dict_file', str, "./data/dict.txt")
word_dict = dict()
with open(dict_file, 'r') as f:
for i, line in enumerate(f):
......@@ -63,7 +63,6 @@ if not is_predict:
label = data_layer(name="label", size=2)
# Define cross-entropy classification loss and error.
classification_cost(input=output, label=label)
cls = classification_cost(input=output, label=label)
outputs(cls)
else:
......
......@@ -42,20 +42,13 @@ settings(
gradient_clipping_threshold=25
)
bias_attr = ParamAttr(initial_std=0.,l2_rate=0.)
data = data_layer(name="word", size=len(word_dict))
emb = embedding_layer(input=data, size=128)
fc = fc_layer(input=emb, size=512,
act=LinearActivation(),
bias_attr=bias_attr,
layer_attr=ExtraAttr(drop_rate=0.1))
lstm = lstmemory(input=fc, act=TanhActivation(),
bias_attr=bias_attr,
layer_attr=ExtraAttr(drop_rate=0.25))
lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling())
output = fc_layer(input=lstm_last, size=2,
bias_attr=bias_attr,
lstm = simple_lstm(input=emb, size=128,
lstm_cell_attr=ExtraAttr(drop_rate=0.25))
lstm_max = pooling_layer(input=lstm, pooling_type=MaxPooling())
output = fc_layer(input=lstm_max, size=2,
act=SoftmaxActivation())
if is_predict:
maxid = maxid_layer(output)
......
......@@ -46,8 +46,8 @@ class SentimentPrediction():
conf = parse_config(train_conf, "is_predict=1")
self.network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config)
self.network.loadParameters(self.model_dir)
slots = [integer_value_sequence(self.dict_dim)]
self.converter = DataProviderConverter(slots)
input_types = [integer_value_sequence(self.dict_dim)]
self.converter = DataProviderConverter(input_types)
def load_dict(self):
"""
......
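For context, a hedged sketch of how a converter built from these `input_types` is typically used at prediction time (the attribute names and the `forwardTest` call are assumptions for illustration, not taken from this diff):

```python
# hypothetical usage: convert one tokenized sentence and run a forward pass
word_slot = [self.word_dict.get(w, 0) for w in "the movie is great".split()]
input_args = self.converter([[word_slot]])     # swig Arguments in Paddle's internal format
output = self.network.forwardTest(input_args)  # assumed API; returns per-label outputs
probs = output[0]["value"]
```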
......@@ -153,12 +153,12 @@ As a simple example, consider the following:
- **Only CPU**
```bash
cmake .. -DWITH_GPU=OFF -DWITH_DOC=OFF
cmake .. -DWITH_GPU=OFF
```
- **GPU**
```bash
cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF
cmake .. -DWITH_GPU=ON
```
- **GPU with doc and swig**
......@@ -171,7 +171,7 @@ Finally, you can build PaddlePaddle:
```bash
# you can add build option here, such as:
cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF -DCMAKE_INSTALL_PREFIX=<path to install>
cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX=<path to install>
# please use sudo make install, if you want to install PaddlePaddle into the system
make -j `nproc` && make install
# set PaddlePaddle installation path in ~/.bashrc
......@@ -246,7 +246,7 @@ easy_install pip
```bash
sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local
sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib/libcudnn*
```
2. Then you need to set the DYLD\_LIBRARY\_PATH and PATH environment variables in ~/.bashrc.
......@@ -273,12 +273,12 @@ As a simple example, consider the following:
- **Only CPU**
```bash
cmake .. -DWITH_GPU=OFF -DWITH_DOC=OFF
cmake .. -DWITH_GPU=OFF
```
- **GPU**
```bash
cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF
cmake .. -DWITH_GPU=ON
```
- **GPU with doc and swig**
......@@ -291,9 +291,9 @@ Finally, you can build PaddlePaddle:
```bash
# you can add build option here, such as:
cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF -DCMAKE_INSTALL_PREFIX=<installation path>
cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX=<installation path>
# please use sudo make install, if you want to install PaddlePaddle into the system
make -j `nproc` && make install
make -j `sysctl -n hw.ncpu` && make install
# set PaddlePaddle installation path in ~/.bashrc
export PATH=<installation path>/bin:$PATH
```
......
......@@ -4,7 +4,7 @@ We sincerely appreciate your contributions. You can use fork and pull request
workflow to merge your code.
## Code Requirements
- Your code mush be fully documented by
- Your code must be fully documented by
[doxygen](http://www.stack.nl/~dimitri/doxygen/) style.
- Make sure the compiler option WITH\_STYLE\_CHECK is on and that compilation
passes the code style check.
......@@ -20,16 +20,30 @@ It's just that simple.
## Clone
Paddle is currently using [git-flow branching model](http://nvie.com/posts/a-successful-git-branching-model/).
The **develop** branch is the main branch, and users' own branches are feature branches.
Once you've created a fork, you can use your favorite git client to clone your
repo or just head straight to the command line:
```shell
# Clone your fork to your local machine
git clone https://github.com/USERNAME/Paddle.git
git clone --branch develop https://github.com/USERNAME/Paddle.git
```
If your repository doesn't contain the **develop** branch, just create it yourself.
```shell
git clone https://github.com/USERNAME/Paddle.git Paddle
cd Paddle
git checkout -b develop # create develop branch.
git remote add upstream https://github.com/baidu/Paddle.git # add upstream to baidu/Paddle
git pull upstream develop # update to upstream
```
Then you can start to develop by creating a local development branch:
```shell
git checkout -b MY_COOL_STUFF_BRANCH origin/master
git checkout -b MY_COOL_STUFF_BRANCH
```
## Commit
......@@ -41,7 +55,7 @@ Commit your changes by following command lines:
git status
# add modified files
git add xx
git commit -m "commit info"
env EDITOR=vim git commit # You can write your comments by vim/nano/emacs.
```
The first line of the commit information is the title. The second and later lines
are the details, if any.
......@@ -63,7 +77,7 @@ git remote -v
Update your fork with the latest upstream changes:
```shell
git pull --rebase upstream HEAD
git pull --rebase upstream develop
```
If there are no unique commits locally, git will simply perform a fast-forward.
......@@ -76,7 +90,7 @@ Now, your local master branch is up-to-date with everything modified upstream.
```shell
# push to your repository in Github
git push origin HEAD
git push -u origin MY_COOL_STUFF_BRANCH # create remote branch MY_COOL_STUFF_BRANCH to origin.
```
## Pull Request
......@@ -93,13 +107,24 @@ of conflict, you need to do the update manually. You need to do the following on
your local repository:
```shell
git checkout MY_COOL_STUFF_BRANCH
git pull --rebase upstream HEAD
git pull upstream develop
# You may need to resolve the conflict according to the git prompt.
# Make and test your code.
git push -f origin HEAD
git push origin MY_COOL_STUFF_BRANCH
```
Now your Pull Request is updated with the latest version.
## Revise your pull request
When you revise your pull request according to a reviewer's comments, please use 'git commit' instead of 'git commit --amend' to commit your changes, so that the reviewers can see the difference between the new pull request and the old one.
The possible commands are:
```shell
git checkout MY_COOL_STUFF_BRANCH
git pull upstream develop # update local to newest code base.
# Some conflicts may occur; resolve them if they do.
# And develop your cool stuff
env EDITOR=vim git commit # add your revision notes
git push origin MY_COOL_STUFF_BRANCH
```
......@@ -3,6 +3,7 @@ PaddlePaddle Documentation
User Guide
----------
* [Introduction](introduction/index.md)
* [Quick Start](demo/quick_start/index_en.md)
* [Build and Installation](build/index.rst)
* [Contribute Code](build/contribute_to_paddle.md)
......
# Introduction
PaddlePaddle is a deep learning platform open-sourced by Baidu. With PaddlePaddle, you can easily train a classic neural network within a couple of lines of configuration, or you can build sophisticated models that provide state-of-the-art performance on difficult learning tasks like sentiment analysis, machine translation, image captioning and so on.
## 1. A Classic Problem
Now, to give you a hint of what using PaddlePaddle looks like, let's start with a fundamental learning problem - <a href="https://en.wikipedia.org/wiki/Simple_linear_regression">**simple linear regression**</a>: you have observed a set of two-dimensional data points of `X` and `Y`, where `X` is an explanatory variable and `Y` is the corresponding dependent variable, and you want to recover the underlying correlation between `X` and `Y`. Linear regression can be used in many practical scenarios. For example, `X` can be a variable about house size, and `Y` a variable about house price. You can build a model that captures the relationship between them by observing real estate markets.
## 2. Prepare the Data
Suppose the true relationship can be characterized as `Y = 2X + 0.3`; let's see how to recover this pattern from observed data alone. Here is a piece of Python code that feeds synthetic data to PaddlePaddle. The code is pretty self-explanatory; the only extra thing you need to add for PaddlePaddle is a definition of the input data types.
```python
# dataprovider.py
from paddle.trainer.PyDataProvider2 import *
import random
# define data types of input: 2 real numbers
@provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
def process(settings, input_file):
for i in xrange(2000):
x = random.random()
yield [x], [2*x+0.3]
```
## 3. Train a Neural Network in PaddlePaddle
To recover this relationship between `X` and `Y`, we use a neural network with one layer of linear activation units and a square-error cost layer. Don't worry if you are not familiar with these terminologies; it just means that we start from a random line `Y' = wX + b`, then gradually adapt `w` and `b` to minimize the difference between `Y'` and `Y`. Here is what it looks like in PaddlePaddle:
```python
# trainer_config.py
from paddle.trainer_config_helpers import *
# 1. read data. Suppose you saved above python code as dataprovider.py
data_file = 'empty.list'
with open(data_file, 'w') as f: f.writelines(' ')
define_py_data_sources2(train_list=data_file, test_list=None,
module='dataprovider', obj='process',args={})
# 2. learning algorithm
settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
# 3. Network configuration
x = data_layer(name='x', size=1)
y = data_layer(name='y', size=1)
y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
cost = regression_cost(input=y_predict, label=y)
outputs(cost)
```
Some of the most fundamental usages of PaddlePaddle are demonstrated:
- The first part shows how to feed data into PaddlePaddle. In general, PaddlePaddle reads raw data from a list of files and then does some user-defined processing to get the real input (see the short sketch after this list). In this case, we only need to create a placeholder file since we are generating synthetic data on the fly.
- The second part describes the learning algorithm. It defines in what way adjustments are made to model parameters. PaddlePaddle provides a rich set of optimizers, but a simple momentum-based optimizer will suffice here; it processes 12 data points each time.
- Finally, the network configuration. It usually is as simple as "stacking" layers. Three kinds of layers are used in this configuration:
- **Data Layer**: a network always starts with one or more data layers. They provide input data to the rest of the network. In this problem, two data layers are used, one for `X` and one for `Y`.
- **FC Layer**: FC layer is short for Fully Connected Layer, which connects all the input units to the current layer and does the actual computation specified by the activation function. Computation layers like this are the fundamental building blocks of a deeper model.
- **Cost Layer**: in the training phase, cost layers are usually the last layers of the network. They measure the performance of the current model and provide guidance for adjusting parameters.
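As a hedged illustration of the more common file-based case (the file format and provider name here are made up for the sketch, not part of this demo):

```python
# dataprovider_from_file.py - hypothetical provider reading "x<TAB>y" pairs from each listed file
from paddle.trainer.PyDataProvider2 import provider, dense_vector

@provider(input_types=[dense_vector(1), dense_vector(1)], use_seq=False)
def process(settings, file_name):
    with open(file_name) as f:
        for line in f:
            x, y = map(float, line.split('\t'))
            yield [x], [y]
```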
Now that everything is ready, you can train the network with a simple command line call:
```
paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
```
This means that PaddlePaddle will train this network on the synthetic dataset for 30 passes and save all the models under the path `./output`. You will see from the messages printed out during the training phase that the model cost is decreasing as time goes by, which indicates we are getting a closer guess.
## 4. Evaluate the Model
Usually, a different dataset that was left out during the training phase should be used to evaluate the model. However, we are lucky enough to know the real answer: `w=2, b=0.3`, so a better option is to check the model parameters directly.
In PaddlePaddle, training just produces a collection of model parameters, which are `w` and `b` in this case. Each parameter is saved in an individual file in the popular `numpy` array format. Here is the code that reads the parameters from the last pass.
```python
import numpy as np
import os
def load(file_name):
with open(file_name, 'rb') as f:
f.read(16) # skip header for float type.
return np.fromfile(f, dtype=np.float32)
print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
# w=1.999743, b=0.300137
```
<center> ![](./parameters.png) </center>
Although it starts from a random guess, you can see that the value of `w` quickly moves towards 2 and `b` towards 0.3. In the end, the predicted line is almost identical to the real answer.
There, you have recovered the underlying pattern between `X` and `Y` only from observed data.
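If you want to watch the whole trajectory rather than only the last pass, a minimal sketch along the same lines (it assumes the 30 `pass-000xx` directories produced by the training command above):

```python
import numpy as np

def load(file_name):
    with open(file_name, 'rb') as f:
        f.read(16)  # skip header for float type.
        return np.fromfile(f, dtype=np.float32)

# print how w and b evolve over the 30 passes saved under ./output
for i in range(30):
    d = 'output/pass-%05d' % i
    print 'pass %02d: w=%.6f, b=%.6f' % (i, load(d + '/w'), load(d + '/b'))
```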
## 5. Where to Go from Here
- <a href="../build/index.html"> Build and Installation </a>
- <a href="../demo/quick_start/index_en.html">Quick Start</a>
- <a href="../demo/index.html">Example and Demo</a>
../../doc_cn/introduction/parameters.png
\ No newline at end of file
......@@ -183,7 +183,7 @@ It looks like there are a lot of arguments. However, most of them are for develo
</tr>
<tr>
<td class="left" rowspan = "5">GPU</td><td class="left">gpu_id</td>
<td class="left" rowspan = "6">GPU</td><td class="left">gpu_id</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
......@@ -207,6 +207,11 @@ It looks like there are a lot of arguments. However, most of them are for develo
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">cudnn_conv_workspace_limit_in_mb</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left" rowspan = "4">RNN</td>
<td class="left">beam_size</td>
......
......@@ -163,6 +163,10 @@
- Choose path to dynamic load NVIDIA CUDA library, for instance, /usr/local/cuda/lib64. [Default]: LD_LIBRARY_PATH
- type: string (default: "", null)
* `--cudnn_conv_workspace_limit_in_mb`
- Specify the cuDNN maximum workspace limit, in MB (4096 MB = 4 GB by default).
- type: int32 (default: 4096)
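As a hedged illustration, gflags-style options like this one can also be forwarded through the Python API's initialization call (whether you need to change it at all depends on your setup):

```python
from py_paddle import swig_paddle as api

# forward command-line style flags to PaddlePaddle at initialization time
api.initPaddle("--use_gpu=1",
               "--cudnn_conv_workspace_limit_in_mb=1024")
```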
## NLP: RNN/LSTM/GRU
* `--rnn_use_batch`
- Whether to use batch method for calculation in simple RecurrentLayer.
......
TBD
This section is still being written. Stay tuned.
\ No newline at end of file
TBD
###
This section is still being written. Stay tuned.
\ No newline at end of file
graph pp_topology {
rankdir=BT;
subgraph cluster_node0 {
style=filled;
color=lightgrey;
node [style=filled, color=white, shape=box];
label = "机器0"
pserver0 [label="Parameter \n Server 0"]
trainer0 [label="Trainer 0"]
}
subgraph cluster_node1 {
style=filled;
color=lightgrey;
node [style=filled, color=white, shape=box];
label = "机器1"
pserver1 [label="Parameter \n Server 1"]
trainer1 [label="Trainer 1"]
}
subgraph cluster_node2 {
style=filled;
color=lightgrey;
node [style=filled, color=white, shape=box];
label = "机器2"
pserver2 [label="Parameter \n Server 2"]
trainer2 [label="Trainer 2"]
}
subgraph cluster_node3 {
style=filled;
color=lightgrey;
node [style=filled, color=white, shape=box];
label = "机器3"
pserver3 [label="Parameter \n Server 3"]
trainer3 [label="Trainer 3"]
}
data [label="数据", shape=hexagon]
trainer0 -- pserver0
trainer0 -- pserver1
trainer0 -- pserver2
trainer0 -- pserver3
trainer1 -- pserver0
trainer1 -- pserver1
trainer1 -- pserver2
trainer1 -- pserver3
trainer2 -- pserver0
trainer2 -- pserver1
trainer2 -- pserver2
trainer2 -- pserver3
trainer3 -- pserver0
trainer3 -- pserver1
trainer3 -- pserver2
trainer3 -- pserver3
data -- trainer0
data -- trainer1
data -- trainer2
data -- trainer3
}
from paddle.trainer_config_helpers import *
define_py_data_sources2(train_list='train.list',
test_list='test.list',
module='provider',
obj='process')
settings(
batch_size=128,
learning_rate=1e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(0.5)
)
img = data_layer(name='pixel', size=28 * 28)
hidden1 = simple_img_conv_pool(input=img, filter_size=3, num_filters=32, pool_size=3,
num_channel=1)
hidden2 = fc_layer(input=hidden1, size=200, act=TanhActivation(),
layer_attr=ExtraAttr(drop_rate=0.5))
predict = fc_layer(input=hidden2, size=10, act=SoftmaxActivation())
outputs(classification_cost(input=predict, label=data_layer(name='label', size=10)))
########################################
Basic Usage Concepts of PaddlePaddle
########################################
PaddlePaddle is a neural network learning framework. Its single-machine process is :code:`paddle train`; all devices on a single machine are scheduled inside this process. The auxiliary multi-machine process :code:`paddle pserver` coordinates communication among several single-machine processes, so the computing resources of a whole cluster can be used. PaddlePaddle also provides, in the form of a :code:`swig api`, methods for predicting with trained models and for customizing the training flow.
Below we introduce some concepts of the main process :code:`paddle train`. These concepts should help you use PaddlePaddle. They assume that the reader already knows the `basic principles and concepts of neural networks / machine learning <nn.html>`_ . If you also want to learn about concepts inside the PaddlePaddle implementation, please refer to `basic concepts in PaddlePaddle programming <program_concepts.html>`_ .
.. contents::
The PaddlePaddle process model
==============================
A :code:`python` interpreter is embedded in the PaddlePaddle process. This interpreter parses the user-defined neural network configuration and the user data, and feeds the data into PaddlePaddle.
.. graphviz::
    digraph pp_process {
        rankdir=LR;
        config_file [label="user network configuration"];
        subgraph cluster_pp {
            style=filled;
            color=lightgrey;
            node [style=filled, color=white, shape=box];
            label = "PaddlePaddle C++";
            py [label="Python interpreter"];
        }
        data_provider [label="user data parsing"];
        config_file -> py;
        py -> data_provider [dir="back"];
    }
So the main interface language of the single-machine training process :code:`paddle train` is Python. The two files a user mainly has to configure are the :code:`DataProvider` and the trainer configuration file :code:`TrainerConfig`.
DataProvider
============
The DataProvider is the data source of :code:`paddle train`. It converts the user's raw data into data types that PaddlePaddle can recognize. Whenever PaddlePaddle needs new training data, it calls the DataProvider to return data. Once one pass over all the data has been read, the DataProvider returns empty data to notify PaddlePaddle, and PaddlePaddle resets the DataProvider before the next pass starts.
Note that in PaddlePaddle the DataProvider is called by the training logic, rather than new data driving the training. All :code:`shuffle` operations and any randomized noise injection should be done in the DataProvider stage.
To make it easy to use your own data format, PaddlePaddle provides `PyDataProvider`_ . Inside this provider, the C++ part of PaddlePaddle takes over shuffling, batching, GPU/CPU communication, double buffering, asynchronous reading, and so on. Please refer to the `PyDataProvider`_ documentation to learn more about how to use a DataProvider.
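A minimal, hedged sketch of such a provider for a text-classification style input (the field layout and dictionary size are made up for illustration; the decorator and input types are the ones used by the demos in this commit):

.. code-block:: python

    from paddle.trainer.PyDataProvider2 import provider, integer_value_sequence, integer_value

    # each line is "label<TAB>word-id word-id ...": one word-id sequence plus a {0,1} label
    @provider(input_types=[integer_value_sequence(10000), integer_value(2)])
    def process(settings, file_name):
        with open(file_name) as f:
            for line in f:
                label, ids = line.rstrip('\n').split('\t')
                yield [int(i) for i in ids.split()], int(label)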
The trainer configuration file
==============================
The trainer configuration file is where the neural network structure, the learning/optimization algorithm, and the way data is fed in are configured in PaddlePaddle. It is a Python file that is passed to the main paddle program through the command-line argument :code:`--config`. For example:
.. code-block:: bash
    paddle train --config=trainer_config.py
A typical, simple trainer configuration file might look like
.. literalinclude:: trainer_config.py
    :linenos:
Below we introduce the concepts behind each part of the configuration file in detail.
trainer_config_helpers
----------------------
The most basic protocol by which a PaddlePaddle configuration file talks to the PaddlePaddle C++ side is :code:`protobuf`. To spare users from writing hard-to-write protobuf strings by hand, we provide a set of helpers that generate this protobuf package, so these helper functions are imported at the beginning of the file.
Note that the :code:`paddle.trainer_config_helpers` package is a standard Python package, which means you can use your favorite :code:`IDE` or editor to write Paddle configuration files. The package is well documented and was written with IDE code completion and type annotations in mind.
data_sources
------------
data_sources configures the data sources of the neural network. The function used here is :code:`define_py_data_sources2`, which declares a `PyDataProvider`_ as the data source. The suffix :code:`2` is a historical artifact: the PyDataProvider that Paddle used before performed poorly, so a completely new `PyDataProvider`_ was written.
train_list and test_list in data_sources specify the training file list and the test file list. If a string is passed, it refers to a list file whose lines are the paths of the individual training or test files. If a Python list is passed, a list file is generated automatically and used as train.list or test.list.
:code:`module` and :code:`obj` specify the module name and the function name of the DataProvider.
For more detailed usage, please refer to `PyDataProvider`_ .
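A hedged sketch of those options (file paths, module name, and the dictionary argument are made up; :code:`args` is forwarded to the provider as keyword arguments):

.. code-block:: python

    # pass Python lists of file names directly; Paddle generates the list files for you
    define_py_data_sources2(train_list=['data/train-part-000', 'data/train-part-001'],
                            test_list=['data/test-part-000'],
                            module='dataprovider',
                            obj='process',
                            args={'dictionary': 'data/dict.txt'})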
settings
--------
`settings`_ holds the options related to the training algorithm of the neural network, such as the learning rate, batch_size, optimization algorithm, and regularization method. Please refer to the `settings`_ documentation for details.
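For instance, a call mirroring the MNIST configuration included above:

.. code-block:: python

    # Adam with L2 regularization, mini-batches of 128 samples
    settings(batch_size=128,
             learning_rate=1e-3,
             learning_method=AdamOptimizer(),
             regularization=L2Regularization(0.5))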
Network configuration
---------------------
The rest of the configuration above is the network configuration. The first line defines a :code:`data_layer` named "pixel". Every layer returns a :code:`LayerOutput` object; here the output object of the first layer is :code:`img`. This object is then passed to another layer function,
:code:`simple_img_conv_pool`. :code:`simple_img_conv_pool` is a composite layer
that performs image convolution and pooling,
is followed by a fully connected layer ( :code:`fc_layer` ), and then by another fully connected layer with Softmax.
Finally, the configuration outputs :code:`classification_cost`. The function that marks the network outputs is
:code:`outputs`. The network output is the optimization objective of the neural network; training the network essentially means
minimizing this output.
When the network is used for prediction, its outputs are also the ones marked by :code:`outputs`.
Layer, Projection, Operator
===========================
PaddlePaddle networks are basically configured in terms of Layers. A Layer is one layer of the neural network,
and one layer of a neural network is usually a bundle of many lower-level operations. Even the simplest
:code:`fc_layer`, for example, includes a matrix multiplication, the summation of multiple inputs, and an activation.
.. code-block:: python
    data = data_layer(name='data', size=200)
    out = fc_layer(input=data, size=200, act=TanhActivation())
For more flexible needs, this Layer-based configuration may not be flexible enough, so PaddlePaddle also offers
configuration based on Projections and Operators. Projections and Operators are used together with
:code:`mixed_layer`. A :code:`mixed_layer` sums its elements
and applies an :code:`activation`; how the layer is actually computed is defined by the Projections
and Operators inside it. A Projection is an operation with learnable parameters, while an Operator has no learnable
parameters and takes only the outputs of other Layers as input.
For example, a :code:`mixed_layer` with the same functionality as :code:`fc_layer`:
.. code-block:: python
    data = data_layer(name='data', size=200)
    with mixed_layer(size=200) as out:
        out += full_matrix_projection(input=data)
With mixed layers PaddlePaddle can configure very complex networks; you can even configure a complete LSTM directly.
Please refer to the `mixed_layer`_ documentation for details.
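As a hedged sketch of that flexibility (the two inputs, their sizes, and the bias flag are assumptions for illustration):

.. code-block:: python

    # one mixed_layer that sums two learnable projections of different inputs
    a = data_layer(name='a', size=100)
    b = data_layer(name='b', size=200)
    with mixed_layer(size=256, act=TanhActivation(), bias_attr=True) as out:
        out += full_matrix_projection(input=a)
        out += full_matrix_projection(input=b)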
How to use all GPUs or all CPU cores of a single machine
=========================================================
The single-machine process :code:`paddle train` can make full use of all GPU
resources or CPU cores of one computer.
To use multiple GPUs on a machine, the following command is enough:
.. code-block:: bash
    paddle train --use_gpu=true --trainer_count=4 # use 4 GPU cards: 0, 1, 2, 3
To use multiple CPU cores on a machine, the following command is enough:
.. code-block:: bash
    paddle train --trainer_count=4 # use 4 CPU cores.
To select specific GPUs, for example cards 0 and 2, use the :code:`CUDA_VISIBLE_DEVICES` environment variable to expose only part of the cards; see the link `masking-gpu`_ . The command to use is
.. code-block:: bash
    env CUDA_VISIBLE_DEVICES=0,2 paddle train --use_gpu=true --trainer_count=2
How to train a neural network with the resources of multiple machines
======================================================================
The classic way to use PaddlePaddle across machines is to synchronize the :code:`paddle train` processes through a :code:`Parameter Server`. To train a neural network on multiple machines, the data first has to be split across the machines. The open-source PaddlePaddle implementation does not ship a toolkit for splitting data files, but splitting data is not a complicated job and is not the focus of a neural network implementation.
During multi-machine training, the classic topology is the following:
.. graphviz:: pserver_topology.dot
Each grey box in the figure is one machine. On each machine, first start a :code:`paddle pserver` process and decide the overall port number. Possible arguments are:
.. code-block:: bash
    paddle pserver --port=5000 --num_gradient_servers=4 --nics='eth0'
This says that the starting port of :code:`paddle pserver` is :code:`5000` and that there are four training processes (:code:`gradient_servers`; Paddle also calls a :code:`paddle train` process a :code:`GradientServer`, because it is the process that supplies gradients). For the training processes, after :code:`paddle pserver` has started, run the following command on each node:
.. code-block:: bash
    paddle train --port=5000 --pservers=192.168.100.101,192.168.100.102,192.168.100.103,192.168.100.104 --config=...
The above is enough for simple multi-machine cooperation. In more advanced setups, pserver/train usually have two more parameters to set:
* --ports_num\: how many ports one pserver process binds for dense updates. The default is 1.
* --ports_num_for_sparse\: how many ports one pserver process binds for sparse updates. The default is 0.
The port counts have to be specified manually because Paddle's network communication uses :code:`int32` as the message length, which can easily overflow for large models. A :code:`paddle pserver` process can therefore start several sub-threads to receive data from the trainers, so the length handled by a single sub-thread does not overflow. But this value should not be set too large: increasing it costs performance, especially memory, and if the number of sparse-update ports is too large, some parameter server may easily end up with no parameters assigned to it at all.
For a detailed explanation, please refer to `Cluster Training with Paddle`_ .
.. _PyDataProvider: ../ui/data_provider/pydataprovider2.html
.. _settings: ../../doc/ui/api/trainer_config_helpers/optimizers.html#settings
.. _mixed_layer: ../../doc/ui/api/trainer_config_helpers/layers.html#mixed-layer
.. _masking-gpu: http://www.acceleware.com/blog/cudavisibledevices-masking-gpus
.. _Cluster Training with Paddle: ../cluster/index.html
......@@ -166,4 +166,14 @@ PaddlePaddle uses the name :code:`name` as the ID of a parameter; parameters with the same name
Here :code:`hidden_a` and :code:`hidden_b` use the same parameter and bias, and the two inputs of the softmax layer also use the same parameter :code:`softmax_param`.
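A hedged sketch of the sharing pattern described above (the input layers and sizes are made up; the elided original example is not reproduced in this hunk):

.. code-block:: python

    # two fc_layers share one weight matrix and one bias by using the same parameter names
    hidden_a = fc_layer(input=in_a, size=200,
                        param_attr=ParamAttr(name='hidden.w'),
                        bias_attr=ParamAttr(name='hidden.b'))
    hidden_b = fc_layer(input=in_b, size=200,
                        param_attr=ParamAttr(name='hidden.w'),
                        bias_attr=ParamAttr(name='hidden.b'))
    # both inputs of the softmax layer reuse the parameter named 'softmax_param'
    prob = fc_layer(input=[hidden_a, hidden_b], size=10, act=SoftmaxActivation(),
                    param_attr=ParamAttr(name='softmax_param'))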
7. *-cp27mu-linux_x86_64.whl is not a supported wheel on this platform.
-----------------------------------------------------------------------
The main reason for this problem is that the :code:`wheel` package used when the wheel was built on the system is up to date,
while the :code:`pip` package installed on the system is rather old. The fix is to update the :code:`pip` package and rebuild PaddlePaddle.
To update the :code:`pip` package:
.. code-block:: bash
pip install --upgrade pip
......@@ -3,8 +3,9 @@ PaddlePaddle Documentation
User Guide
----------
* `Introduction <introduction/index.html>`_
* `Quick Start <demo/quick_start/index.html>`_
* `Basic Usage Concepts <concepts/use_concepts.html>`_
* `Build and Install <build_and_install/index.html>`_
* `User Interface <ui/index.html>`_
* `Examples <demo/index.html>`_
......
# Introduction
PaddlePaddle is an open-source deep learning platform that originated at Baidu. It is easy to use: you can build a classic neural network model with a dozen or so lines of configuration. It is also efficient and powerful: PaddlePaddle can train very large models in complex cluster environments, so you can benefit from the state of the art in deep learning. Inside Baidu, a large number of product lines already use deep learning technology based on PaddlePaddle.
This short introduction shows you how to solve a classic learning problem with PaddlePaddle.
## 1. A classic task
Let's start with a basic problem: <a href="https://www.baidu.com/s?wd=单变量线性回归">single-variable linear regression</a>. Suppose we have observed a set of points `(x, y)` in the plane and we know that there is some linear relationship between `x` and `y`; our goal is to recover this linear relationship from the observed data. Although it is a simple, basic model, linear regression has many practical applications. For example, imagine a simplified asset-pricing scenario where `x` is the size of a house and `y` is its price: by observing the housing market we can capture the relationship between the two and use it as a reference when pricing new houses.
## 2. Prepare the data
Suppose the true relationship between the variables `X` and `Y` is `Y = 2X + 0.3`; here we show how to recover this linear relationship from observed data. The following Python code randomly generates 2000 observation points, which will be used as PaddlePaddle's input. Producing input data for PaddlePaddle is almost the same as writing an ordinary Python script; the only extra thing you need is a definition of the input data types.
```python
# -*- coding:utf-8 -*-
# dataprovider.py
from paddle.trainer.PyDataProvider2 import *
import random
# define the input data types: 2 real numbers
@provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
def process(settings, input_file):
    for i in xrange(2000):
        x = random.random()
        yield [x], [2*x+0.3]
```
## 3. Train the model
To recover `Y = 2X + 0.3`, we start from a random line `Y' = wX + b` and then use the observed data to adjust `w` and `b` so that the gap between `Y'` and `Y` keeps shrinking, until they eventually coincide. This process is the training of the model, and `w` and `b` are the model parameters, i.e. our training target.
In PaddlePaddle, the network configuration of this model is as follows.
```python
# -*- coding:utf-8 -*-
# trainer_config.py
from paddle.trainer_config_helpers import *
# 1. define the data source; it calls the process function above to obtain observations
data_file = 'empty.list'
with open(data_file, 'w') as f: f.writelines(' ')
define_py_data_sources2(train_list=data_file, test_list=None,
        module='dataprovider', obj='process',args={})
# 2. learning algorithm: controls how the model parameters w and b are changed
settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
# 3. network configuration
x = data_layer(name='x', size=1)
y = data_layer(name='y', size=1)
# linear unit: y_predict = wx + b
y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
# loss computation, measuring the gap between y_predict and the true y
cost = regression_cost(input=y_predict, label=y)
outputs(cost)
```
This short configuration demonstrates the basic usage of PaddlePaddle:
- The first part defines the data input. Normally, PaddlePaddle reads a list of data file paths from a list file and hands them to a user-defined function (such as the `process` function above) for reading and preprocessing to obtain the real input. Here the input data are generated randomly rather than read from files, so an empty list file (`empty.list`) is enough.
- The second part chooses the learning algorithm, which defines how the model parameters change. PaddlePaddle provides many excellent learning algorithms, but a simple momentum-based algorithm is enough here; it reads 12 data points per update.
- The last part is the network configuration. Since PaddlePaddle already implements a rich set of network units (Layers), most of the time all you need to do is declare the right units and connect them. Three kinds of units are used here:
- **Data layer**: the data layer `data_layer` is the entry point of the network; it reads data and passes it on to the downstream units. There are two data layers here, one for `X` and one for `Y`.
- **Fully connected layer**: the fully connected layer `fc_layer` is the basic computation unit; here it models the linear relationship between the variables. Computation units are the core of a neural network; PaddlePaddle supports a large number of computation units and networks of arbitrary depth, which makes it possible to capture complex relationships in the data.
- **Regression cost layer**: the regression cost layer `regression_cost` is one of many loss-function layers; during training it acts as the exit of the network, measuring the model's performance and guiding the update of the model parameters.
Once the network structure is defined and saved as `trainer_config.py`, run the training command:
```
paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
```
PaddlePaddle will iterate over the observed data for 30 passes and store the model of each pass under `./output`. The output log shows that the loss keeps decreasing as the passes go on, which means the model keeps improving until it approaches the true solution `Y = 2X + 0.3`.
## 4. Check the model
After training we want to check how good the model is. A common practice is to run the model on another dataset and evaluate its predictions. In this example, however, we already know the true answer, so we can simply check whether the model parameters match our expectation.
PaddlePaddle saves each model parameter as a separate file in numpy array format, so the parameters can be read as follows.
```python
import numpy as np
import os
def load(file_name):
    with open(file_name, 'rb') as f:
        f.read(16) # skip header for float type.
        return np.fromfile(f, dtype=np.float32)
print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
# w=1.999743, b=0.300137
```
<center> ![](./parameters.png) </center>
As the figure shows, although `w` and `b` are both initialized with random values, they rapidly approach the true values within the first few passes and keep improving afterwards, so the final model almost coincides with the true one.
With that, we have solved the single-variable linear regression problem: feed the data into PaddlePaddle, train the model, and finally verify the result.
## 5. Recommended follow-up reading
- <a href="../build_and_install/index.html">Build and Install</a>: how to install and compile PaddlePaddle.
- <a href="../demo/quick_start/index.html">Quick Start</a>: a product-review classification task that shows step by step how to improve a model until it reaches production quality.
- <a href="../demo/index.html">Examples</a>: practical examples covering images, text, recommendation, and more.
......@@ -14,27 +14,10 @@ limitations under the License. */
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
#include "paddle/parameter/Argument.h"
struct ArgumentsPrivate {
std::vector<paddle::Argument> outputs;
inline paddle::Argument& getArg(size_t idx) throw(RangeError) {
if (idx < outputs.size()) {
return outputs[idx];
} else {
RangeError e;
throw e;
}
}
template <typename T>
std::shared_ptr<T>& cast(void* rawPtr) const {
return *(std::shared_ptr<T>*)(rawPtr);
}
};
size_t Arguments::getSlotNum() const { return m->outputs.size(); }
Arguments* Arguments::createArguments(size_t slotNum) {
......
......@@ -40,6 +40,8 @@ configure_file(
generate_python_api(python_swig_sources)
file(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py)
# TODO(yuyang18) : make wheel name calculated by cmake
add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/dist/.timestamp
COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel
......@@ -55,6 +57,7 @@ add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/dist/.timestamp
paddle_trainer
paddle_api
paddle_cuda
${PY_PADDLE_PYTHON_FILES}
)
install(DIRECTORY ${PROJ_ROOT}/paddle/dist/
......
......@@ -14,17 +14,9 @@ limitations under the License. */
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
#include "paddle/trainer/Trainer.h"
struct TrainerConfigPrivate {
std::shared_ptr<paddle::TrainerConfig> conf;
TrainerConfigPrivate() : conf(std::make_shared<paddle::TrainerConfig>()) {}
};
struct ModelConfigPrivate {
std::shared_ptr<paddle::TrainerConfig> conf;
};
struct ParameterConfigPrivate {
paddle::ParameterPtr parameter;
paddle::ParameterConfig config;
......@@ -39,19 +31,6 @@ struct ParameterConfigPrivate {
}
};
struct OptimizationConfigPrivate {
std::shared_ptr<paddle::TrainerConfig> trainer_config;
paddle::OptimizationConfig config;
paddle::OptimizationConfig& getConfig() {
if (trainer_config != nullptr) {
return *trainer_config->mutable_opt_config();
} else {
return config;
}
}
};
TrainerConfig::TrainerConfig() : m(new TrainerConfigPrivate()) {}
TrainerConfig::~TrainerConfig() { delete m; }
......@@ -59,10 +38,19 @@ TrainerConfig::~TrainerConfig() { delete m; }
TrainerConfig* TrainerConfig::createFromTrainerConfigFile(
const std::string& confPath) {
LOG(INFO) << "load trainer config from " << confPath;
paddle::TrainerConfigHelper helper(confPath);
//! TODO(yuyang18): Make TrainerConfigPrivate to TrainerConfigHelper
auto conf = std::make_shared<paddle::TrainerConfigHelper>(confPath);
auto retv = new TrainerConfig();
*retv->m->conf = helper.getConfig();
retv->m->conf = conf;
return retv;
}
TrainerConfig* TrainerConfig::createFromProtoString(
const std::string& str) {
auto retv = new TrainerConfig();
paddle::TrainerConfig trainerConfigProto;
auto conf = std::make_shared<paddle::TrainerConfigHelper>(trainerConfigProto);
CHECK(conf->getMutableConfig().ParseFromString(str));
retv->m->conf = conf;
return retv;
}
......@@ -76,10 +64,6 @@ ModelConfig* TrainerConfig::getModelConfig() const {
return retv;
}
void* ModelConfig::getPaddleModelConfig() const {
return m->conf->mutable_model_config();
}
ParameterConfig::ParameterConfig() : m(new ParameterConfigPrivate()) {}
ParameterConfig::~ParameterConfig() {
......@@ -132,8 +116,6 @@ OptimizationConfig* TrainerConfig::getOptimizationConfig() const {
return opt_config;
}
void* OptimizationConfig::getRawPtr() { return &m->getConfig(); }
OptimizationConfig* OptimizationConfig::createFromProtoString(
const std::string& str) {
auto conf = new OptimizationConfig();
......
......@@ -14,30 +14,22 @@ limitations under the License. */
#include "PaddleAPI.h"
#include "paddle/gserver/gradientmachines/GradientMachine.h"
#include "PaddleAPIPrivate.h"
#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
#include "Internal.h"
std::vector<int> GradientMachine::defaultParamTypes = {
PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM};
struct GradientMachinePrivate {
std::shared_ptr<paddle::GradientMachine> machine;
template <typename T>
inline T& cast(void* ptr) {
return *(T*)(ptr);
}
};
GradientMachine::GradientMachine() : m(new GradientMachinePrivate()) {}
GradientMachine::~GradientMachine() { delete m; }
GradientMachine* GradientMachine::createFromPaddleModelPtr(
void* confPtr, GradientMatchineCreateMode mode,
const void* confPtr, GradientMatchineCreateMode mode,
const std::vector<int>& types) {
auto& conf = *(paddle::ModelConfig*)(confPtr);
auto& conf = *(const paddle::ModelConfig*)(confPtr);
std::vector<ParameterType> realTypes;
staticCastVector(&realTypes, types);
auto machineRawPtr = paddle::GradientMachine::create(conf, mode, realTypes);
......@@ -66,7 +58,7 @@ GradientMachine* GradientMachine::createByConfigProtoStr(
GradientMachine* GradientMachine::createByModelConfig(
ModelConfig* conf, GradientMatchineCreateMode mode,
const std::vector<int>& types) {
auto confPtr = (paddle::ModelConfig*)conf->getPaddleModelConfig();
auto confPtr = &conf->m->conf->getModelConfig();
return GradientMachine::createFromPaddleModelPtr(confPtr, mode, types);
}
......
......@@ -446,7 +446,6 @@ struct OptimizationConfigPrivate;
class OptimizationConfig {
DISABLE_COPY_AND_ASSIGN(OptimizationConfig);
OptimizationConfig();
void* getRawPtr();
public:
static OptimizationConfig* createFromProtoString(const std::string& str);
......@@ -462,6 +461,7 @@ private:
friend class TrainerConfig;
friend class ParameterOptimizer;
friend class Trainer;
};
struct ParameterPrivate;
......@@ -515,8 +515,6 @@ public:
virtual ~ModelConfig();
private:
void* getPaddleModelConfig() const;
ModelConfigPrivate* m;
friend class TrainerConfig;
friend struct TrainerConfigPrivate;
......@@ -539,6 +537,7 @@ public:
static TrainerConfig* createFromTrainerConfigFile(
const std::string& configPath);
static TrainerConfig* createFromProtoString(const std::string& str);
ModelConfig* getModelConfig() const;
......@@ -546,6 +545,7 @@ public:
private:
TrainerConfigPrivate* m;
friend class Trainer;
};
/**
......@@ -700,11 +700,12 @@ private:
GradientMachinePrivate* m;
static GradientMachine* createFromPaddleModelPtr(
void* confPtr, GradientMatchineCreateMode mode,
const void* confPtr, GradientMatchineCreateMode mode,
const std::vector<int>& types);
// Not to use c++ 11 init-list, so we use static var as function default arg.
static std::vector<int> defaultParamTypes;
friend class Trainer;
};
struct TrainerPrivate;
......@@ -712,6 +713,7 @@ class Trainer {
private:
TrainerPrivate* m;
Trainer();
Trainer(TrainerConfig* optConfig, GradientMachine* gm);
DISABLE_COPY_AND_ASSIGN(Trainer);
public:
......@@ -720,38 +722,42 @@ public:
/// Create A Trainer By TrainerConfig. using paddle command line.
static Trainer* createByCommandLine() throw(IOError);
/// Start Train.
static Trainer* create(TrainerConfig* optConfig, GradientMachine* gm)
throw(IOError);
/// Start training
void startTrain();
/// Finish training
void finishTrain();
/// Start Pass.
/// Start a pass.
void startTrainPass();
void finishTrainPass();
void setBatchSize(size_t batchSize);
/// Finish a pass
void finishTrainPass();
/**
* Train one batch,
*
* @param batchSize -1 will use the command line or the batch size set before,
* otherwise use this batchSize for train.
*
* @return true if all batch finished.
*/
bool trainOneBatch(size_t batchSize = -1UL);
bool trainOneBatch(size_t batchSize);
bool prepareBatchData(size_t batchSize = -1UL);
void trainOneDataBatch(size_t batchSize, const Arguments& args);
void finishTrainOneBatch();
void startTestPeriod();
void testOneDataBatch(size_t batchSize, const Arguments& args);
void finishTestPeriod();
void forwardOneBatch() throw(UnsupportError);
void forwardOneBatch(size_t batchSize);
Arguments* getNetworkOutput();
Arguments* getForwardOutput();
Matrix* getLayerOutput(const std::string& layerName);
};
/// The N-Best results generated from one input sequence.
/// the N-Best results generated from one input sequence.
class ISequenceResults {
public:
virtual ~ISequenceResults();
......
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/gserver/gradientmachines/GradientMachine.h"
#include "paddle/trainer/TrainerConfigHelper.h"
#pragma once
struct GradientMachinePrivate {
std::shared_ptr<paddle::GradientMachine> machine;
template <typename T>
inline T& cast(void* ptr) {
return *(T*)(ptr);
}
};
struct OptimizationConfigPrivate {
std::shared_ptr<paddle::TrainerConfigHelper> trainer_config;
paddle::OptimizationConfig config;
const paddle::OptimizationConfig& getConfig() {
if (trainer_config != nullptr) {
return trainer_config->getOptConfig();
} else {
return config;
}
}
};
struct TrainerConfigPrivate {
std::shared_ptr<paddle::TrainerConfigHelper> conf;
TrainerConfigPrivate() {}
};
struct ModelConfigPrivate {
std::shared_ptr<paddle::TrainerConfigHelper> conf;
};
struct ArgumentsPrivate {
std::vector<paddle::Argument> outputs;
inline paddle::Argument& getArg(size_t idx) throw(RangeError) {
if (idx < outputs.size()) {
return outputs[idx];
} else {
RangeError e;
throw e;
}
}
template <typename T>
std::shared_ptr<T>& cast(void* rawPtr) const {
return *(std::shared_ptr<T>*)(rawPtr);
}
};
......@@ -14,6 +14,7 @@ limitations under the License. */
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
#include "paddle/parameter/ParameterOptimizer.h"
#include "Internal.h"
#include <algorithm>
......@@ -60,10 +61,9 @@ ParameterOptimizer::~ParameterOptimizer() {
ParameterOptimizer* ParameterOptimizer::create(OptimizationConfig* config) {
CHECK(config != nullptr);
auto opt_config_ptr = (paddle::OptimizationConfig*)config->getRawPtr();
auto retOptimizer = new ParameterOptimizer();
retOptimizer->m->optimizer.reset(
paddle::ParameterOptimizer::create(*opt_config_ptr, false));
paddle::ParameterOptimizer::create(config->m->getConfig(), false));
return retOptimizer;
}
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
#include <stdlib.h>
#include <memory>
......@@ -30,31 +31,17 @@ P_DECLARE_string(config);
P_DECLARE_string(init_model_path);
P_DECLARE_int32(start_pass);
struct TrainPassContext {
int64_t batchId;
int32_t batchSize;
real avgTestCost;
int64_t numAvgTests;
int passInnerId;
paddle::DataBatch data;
std::vector<paddle::Argument> forwardOutput;
};
struct TrainerPrivate : public paddle::Trainer {
void startTrain();
void finishTrain();
void startTrainPass();
void finishTrainPass();
bool _trainOneBatch();
bool _prepareBatchData();
void _forwardOneBatch() throw(UnsupportError);
bool _trainOneBatch(size_t batchSize);
bool forwardOneBatch(size_t batchSize);
void forwardOneDataBatch(const std::vector<paddle::Argument>& inArgs);
void setBatchSize(size_t batchSize);
std::vector<paddle::Argument>& getForwardOutput();
void startTestPeriod();
void finishTestPeriod();
void testOneDataBatch(const paddle::DataBatch& dataBatch);
TrainerPrivate() : paddle::Trainer() {}
TrainPassContext trainPassContext;
};
Trainer::Trainer() : m(new TrainerPrivate()) {
......@@ -75,61 +62,76 @@ Trainer* Trainer::createByCommandLine() throw(IOError) {
}
}
void Trainer::startTrain() { m->startTrain(); }
Trainer::Trainer(TrainerConfig* config, GradientMachine* gm)
: m(new TrainerPrivate()) {
m->init(config->m->conf, /* testing= */false, gm ? gm->m->machine : nullptr);
}
void TrainerPrivate::startTrain() {
srand(this->config_->getConfig().start_pass() + 1);
this->dataProvider_->reset();
this->trainerInternal_.getGradientMachine()->start(*config_, dataProvider_);
Trainer* Trainer::create(TrainerConfig* config, GradientMachine* gm)
throw(IOError)
{
auto retv = new Trainer(config, gm);
if (retv->m->getConfig().IsInitialized()) {
return retv;
} else {
retv->m->getConfig().CheckInitialized();
throw IOError();
}
}
void Trainer::finishTrain() { m->finishTrain(); }
void Trainer::startTrain() { m->startTrain(); }
void TrainerPrivate::finishTrain() {
this->trainerInternal_.getGradientMachine()->finish();
}
void Trainer::finishTrain() { m->finishTrain(); }
void Trainer::startTrainPass() { m->startTrainPass(); }
void TrainerPrivate::startTrainPass() {
this->stats_.reset();
this->trainPassContext.batchId = 0;
this->trainPassContext.batchSize = this->config_->getOptConfig().batch_size();
this->trainPassContext.avgTestCost = 0;
this->trainPassContext.numAvgTests = 0;
this->trainPassContext.passInnerId = 0;
this->trainerInternal_.getParameterUpdater()->startPass();
this->evaluator_->start();
}
void Trainer::finishTrainPass() { m->finishTrainPass(); }
void TrainerPrivate::finishTrainPass() {
this->trainerInternal_.getGradientMachine()->onPassEnd();
this->trainerInternal_.getParameterUpdater()->finishPass();
evaluator_->finish();
void Trainer::trainOneDataBatch(size_t batchSize, const Arguments& inArgs) {
paddle::DataBatch dataBatch;
dataBatch.getStreams() = inArgs.m->outputs;
dataBatch.setSize(batchSize);
m->trainOneDataBatch(dataBatch);
}
void Trainer::setBatchSize(size_t batchSize) {
this->m->trainPassContext.batchSize = batchSize;
bool Trainer::trainOneBatch(size_t batchSize) {
return m->_trainOneBatch(batchSize);
}
bool Trainer::trainOneBatch(size_t batchSize) {
if (batchSize == -1UL) {
this->setBatchSize(batchSize);
bool TrainerPrivate::_trainOneBatch(size_t batchSize) {
paddle::DataBatch dataBatch;
CHECK(dataProvider_) << "data_provider is not specified";
int num = dataProvider_->getNextBatch(batchSize, &dataBatch);
if (num == 0) {
return false;
}
return m->_trainOneBatch();
trainOneDataBatch(dataBatch);
return false;
}
bool TrainerPrivate::_trainOneBatch() {
if (this->_prepareBatchData()) {
return true;
void TrainerPrivate::startTestPeriod() {
if (!tester_) {
createTester();
}
this->trainerInternal_.trainOneBatch(this->trainPassContext.batchId,
this->trainPassContext.data);
return false;
tester_->startTestPeriod();
}
void Trainer::startTestPeriod() { m->startTestPeriod(); }
void TrainerPrivate::testOneDataBatch(const paddle::DataBatch& dataBatch) {
tester_->testOneDataBatch(dataBatch, &forwardOutput_);
}
void Trainer::testOneDataBatch(size_t batchSize, const Arguments& args) {
paddle::DataBatch dataBatch;
dataBatch.getStreams() = args.m->outputs;
dataBatch.setSize(batchSize);
m->testOneDataBatch(dataBatch);
}
void TrainerPrivate::finishTestPeriod() { tester_->finishTestPeriod(); }
void Trainer::finishTestPeriod() { m->finishTestPeriod(); }
Matrix* Trainer::getLayerOutput(const std::string& layerName) {
auto nn = std::dynamic_pointer_cast<paddle::NeuralNetwork>(
this->m->getGradientMachine());
......@@ -138,46 +140,37 @@ Matrix* Trainer::getLayerOutput(const std::string& layerName) {
return Matrix::createByPaddleMatrixPtr(&m);
}
bool Trainer::prepareBatchData(size_t batchSize) {
if (batchSize != -1UL) {
this->setBatchSize(batchSize);
void Trainer::forwardOneBatch(size_t batchSize) { m->forwardOneBatch(batchSize); }
bool TrainerPrivate::forwardOneBatch(size_t batchSize) {
CHECK(dataProvider_) << "data_provider is not specified";
paddle::DataBatch dataBatch;
int num = dataProvider_->getNextBatch(batchSize, &dataBatch);
if (num == 0) {
return false;
}
return this->m->_prepareBatchData();
}
bool TrainerPrivate::_prepareBatchData() {
int num = dataProvider_->getNextBatch(this->trainPassContext.batchSize,
&this->trainPassContext.data);
return num == 0;
forwardOneDataBatch(dataBatch.getStreams());
return true;
}
void Trainer::finishTrainOneBatch() { ++m->trainPassContext.batchId; }
void TrainerPrivate::forwardOneDataBatch(
const std::vector<paddle::Argument>& inArgs) {
void Trainer::forwardOneBatch() throw(UnsupportError) { m->_forwardOneBatch(); }
void TrainerPrivate::_forwardOneBatch() throw(UnsupportError) {
auto& dataBatch = this->trainPassContext.data;
int64_t actualBatchSize = dataBatch.getSize();
if (actualBatchSize == 0) {
return;
}
const std::vector<paddle::Argument>& inArgs = dataBatch.getStreams();
std::vector<paddle::Argument>& outArgs = this->trainPassContext.forwardOutput;
outArgs.clear();
paddle::PassType passType =
this->trainerInternal_.getParameterUpdater()->startBatch(actualBatchSize);
std::vector<paddle::Argument>& outArgs = forwardOutput_;
if (config_->getOptConfig().use_sparse_remote_updater()) {
this->trainerInternal_.getGradientMachine()->prefetch(inArgs);
this->trainerInternal_.getParameterUpdater()->getParametersRemote();
trainerInternal_.getGradientMachine()->prefetch(inArgs);
trainerInternal_.getParameterUpdater()->getParametersRemote();
}
this->trainerInternal_.getGradientMachine()->forward(
inArgs, &outArgs, passType);
trainerInternal_.getGradientMachine()->forward(
inArgs, &outArgs, paddle::PASS_TEST);
}
Arguments* Trainer::getForwardOutput() {
return Arguments::createByPaddleArgumentVector(&m->getForwardOutput());
}
Arguments* Trainer::getNetworkOutput() {
return Arguments::createByPaddleArgumentVector(
&m->trainPassContext.forwardOutput);
std::vector<paddle::Argument>& TrainerPrivate::getForwardOutput() {
return forwardOutput_;
}
......@@ -30,7 +30,7 @@ source .test_env/bin/activate
pip --timeout 600 install ../../dist/*.whl
test_list="testArguments.py testGradientMachine.py testMatrix.py testVector.py testTrain.py"
test_list="testArguments.py testGradientMachine.py testMatrix.py testVector.py testTrain.py testTrainer.py"
export PYTHONPATH=$PWD/../../../python/
......
......@@ -12,9 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from py_paddle import swig_paddle, DataProviderWrapperConverter
from py_paddle import swig_paddle
import paddle.trainer.config_parser
from paddle.trainer.PyDataProviderWrapper import DenseSlot, IndexSlot
import numpy
import util
......
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer.config_parser import parse_config
from paddle.trainer.config_parser import logger
from py_paddle import swig_paddle
import util
def main():
trainer_config = parse_config(
"./testTrainConfig.py", "")
model = swig_paddle.GradientMachine.createFromConfigProto(
trainer_config.model_config)
trainer = swig_paddle.Trainer.create(trainer_config, model)
trainer.startTrain()
for train_pass in xrange(2):
trainer.startTrainPass()
num = 0
cost = 0
while True: # Train one batch
batch_size = 1000
data, atEnd = util.loadMNISTTrainData(batch_size)
if atEnd:
break
trainer.trainOneDataBatch(batch_size, data)
outs = trainer.getForwardOutput()
cost += sum(outs[0]['value'])
num += batch_size
trainer.finishTrainPass()
logger.info('train cost=%f' % (cost / num))
trainer.startTestPeriod()
num = 0
cost = 0
while True: # Test one batch
batch_size = 1000
data, atEnd = util.loadMNISTTrainData(batch_size)
if atEnd:
break
trainer.testOneDataBatch(batch_size, data)
outs = trainer.getForwardOutput()
cost += sum(outs[0]['value'])
num += batch_size
trainer.finishTestPeriod()
logger.info('test cost=%f' % (cost / num))
trainer.finishTrain()
if __name__ == '__main__':
swig_paddle.initPaddle("--use_gpu=0", "--trainer_count=1")
main()
......@@ -48,5 +48,24 @@ inline __device__ double paddleAtomicAdd(double* address, double val) {
}
} // namespace paddle
/**
* @brief sum reduction
*
 * @param[in,out] smem     input data; it should reside in __shared__ memory.
* @param[in] tid thread index.
 * @param[in]     threads  the total number of threads taking part in the
 *                         reduction, e.g., blockDim.x.
*
 * @return smem[0]: the sum of all elements in smem.
*/
__device__ __forceinline__
void simpleReduce(real* smem, int tid, int threads) {
for (unsigned int s = threads / 2; s > 0; s >>= 1) {
if (tid < s) {
smem[tid] += smem[tid + s];
}
__syncthreads();
}
}
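// Sketch added for illustration (not part of the original header): a host-side
// equivalent of simpleReduce, assuming `threads` is a power of two. After the
// outer loop finishes, data[0] holds the sum of all original entries.
inline void simpleReduceHostSketch(real* data, int threads) {
  for (int s = threads / 2; s > 0; s >>= 1) {
    for (int tid = 0; tid < s; ++tid) {  // every "thread" below s participates
      data[tid] += data[tid + s];
    }
    // the __syncthreads() barrier of the device version would sit here
  }
}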
#endif /* HL_DEVICE_FUNCTIONS_CUH_ */
......@@ -229,4 +229,40 @@ extern void hl_cossim_derivative(real* grad,
int input2_height,
real scale);
/**
 * @brief   Matrix addition: A_d[i][j] += scale * B_d[j / dim], where
 *          dim = dimN / channel (one shared bias value per channel).
 *
 * @param[in,out] A_d   input/output matrix (M x N).
 * @param[in]     B_d   bias vector (1 x channel).
* @param[in] channel width of B.
* @param[in] dimM height of A.
* @param[in] dimN width of A.
* @param[in] scale scalar used for addition.
*
*/
extern void hl_matrix_add_shared_bias(real* A_d,
real* B_d,
const int channel,
const int dimM,
const int dimN,
real scale);
/**
 * @brief   Collect a per-channel (shared) bias: B_d[c] += scale * sum of
 *          A_d[i][j] over all rows i and over the dim = dimN / channel
 *          columns j that belong to channel c.
 *
 * @param[in,out] B_d   output bias vector (1 x channel).
 * @param[in]     A_d   input matrix (M x N).
* @param[in] channel width of B.
* @param[in] dimM height of A.
* @param[in] dimN width of A.
* @param[in] scale scalar used for addition.
*
*/
extern void hl_matrix_collect_shared_bias(real* B_d,
real* A_d,
const int channel,
const int dimM,
const int dimN,
real scale);
#endif /* HL_MATRIX_H_ */
......@@ -101,4 +101,17 @@ inline void hl_cossim_derivative(real* grad,
int input2_height,
real scale) {}
inline void hl_matrix_add_shared_bias(real* A_d,
real* B_d,
const int channel,
const int dimM,
const int dimN,
real scale) {}
inline void hl_matrix_collect_shared_bias(real* B_d,
real* A_d,
const int channel,
const int dimM,
const int dimN,
real scale) {}
#endif // HL_MATRIX_STUB_H_
......@@ -20,6 +20,11 @@ limitations under the License. */
#include "hl_thread.ph"
#include "hl_dso_loader.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/CommandLineParser.h"
P_DEFINE_int32(cudnn_conv_workspace_limit_in_mb, 4096,
"Specify cuDNN max workspace limit, in units MB, "
"4096MB=4GB by default.");
namespace dynload {
......@@ -242,7 +247,7 @@ void hl_conv_workspace(hl_tensor_descriptor input,
CHECK_NOTNULL(conv);
// Specify workspace limit directly
size_t memoryLimitBytes = 8 * 1024 * 1024;
size_t memoryLimitBytes = (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
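  // Illustration (not in the original source): with the default flag value of
  // 4096 MB, memoryLimitBytes = (1LL << 20) * 4096 = 4294967296 bytes (4 GiB).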
// cudnn convolution forward configuration
cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
......
......@@ -20,6 +20,7 @@ limitations under the License. */
#include "hl_sequence.h"
#include "paddle/utils/Logging.h"
#include "hl_device_functions.cuh"
#include "hl_gpu_matrix_kernel.cuh"
DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1*a + p2*b);
......@@ -673,3 +674,89 @@ void hl_cossim_derivative(real* grad,
input1_height, input2_height, scale);
CHECK_SYNC("hl_cossim_derivate failed");
}
__global__ void KeMatrixAddSharedBias(real* A,
real* B,
const int channel,
const int M,
const int N,
real scale) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int dim = N / channel;
if (index < M * N) {
int i = index % N;
i = i / dim;
A[index] += scale * B[i];
}
}
void hl_matrix_add_shared_bias(real* A_d,
real* B_d,
const int channel,
const int dimM,
const int dimN,
real scale) {
const int blocks = 512;
const int grids = DIVUP(dimM * dimN, blocks);
KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>
(A_d, B_d, channel, dimM, dimN, scale);
CHECK_SYNC("hl_matrix_add_shared_bias failed");
}
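// Reference sketch added for illustration only (a plain CPU loop, not the CUDA
// path above): what hl_matrix_add_shared_bias computes. Each block of
// dim = dimN / channel consecutive columns shares one bias value B[c].
inline void addSharedBiasReference(real* A, const real* B, const int channel,
                                   const int dimM, const int dimN, real scale) {
  const int dim = dimN / channel;
  for (int i = 0; i < dimM; ++i) {
    for (int c = 0; c < channel; ++c) {
      for (int j = 0; j < dim; ++j) {
        A[i * dimN + c * dim + j] += scale * B[c];
      }
    }
  }
}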
template <int blockSize>
__global__ void KeMatrixCollectSharedBias(real *B,
real *A,
const int channel,
const int M,
const int N,
const int dim,
const int limit,
real scale) {
if (dim < limit) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < channel) {
real sum = 0.0;
for (int i = 0; i < M; ++i) {
for (int j = 0; j < dim; ++j) {
sum += A[i * N + index * dim + j];
}
}
B[index] += scale * sum;
}
} else {
const int tid = threadIdx.x;
const int bid = blockIdx.x;
__shared__ real smem[blockSize];
real sum = 0.0;
for (int j = 0; j < ((dim * M + blockSize - 1) / blockSize); ++j) {
int n = j * blockSize + tid;
int m = n / dim;
int w = n % dim;
smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0;
__syncthreads();
simpleReduce(smem, tid, blockSize);
sum += smem[0];
}
if (tid == 0) {
B[bid] += scale * sum;
}
}
}
void hl_matrix_collect_shared_bias(real* B_d,
real* A_d,
const int channel,
const int dimM,
const int dimN,
real scale) {
const int dim = dimN / channel;
const int blocks = 256;
const int limit = 64;
int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel;
KeMatrixCollectSharedBias<blocks>
<<< grids, blocks, 0, STREAM_DEFAULT>>>
(B_d, A_d, channel, dimM, dimN, dim, limit, scale);
CHECK_SYNC("hl_matrix_collect_shared_bias failed");
}
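// Reference sketch added for illustration only (mirrors CpuMatrix::collectSharedBias
// further below in this change): each bias entry B[c] accumulates the sum of A over
// all dimM samples and over the dim = dimN / channel columns of channel c.
inline void collectSharedBiasReference(real* B, const real* A, const int channel,
                                       const int dimM, const int dimN, real scale) {
  const int dim = dimN / channel;
  for (int c = 0; c < channel; ++c) {
    real sum = 0;
    for (int i = 0; i < dimM; ++i) {
      for (int j = 0; j < dim; ++j) {
        sum += A[i * dimN + c * dim + j];
      }
    }
    B[c] += scale * sum;
  }
}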
......@@ -908,24 +908,6 @@ int findIndex(int* indice, int num, int index) {
return (end - 1);
}
/**
* @brief sum reduction
*
* @param[in,out] smem input data, better to use __shared__ memory.
* @param[in] tid local thread index.
* @param[in] blockDimX the size of blockDim.x.
*
 * note: return smem[0]: the sum of all elements of smem.
*/
__device__ __forceinline__
void reduce(real* smem, int tid, int blockDimX) {
for (unsigned int s = blockDimX / 2; s > 0; s >>= 1) {
if (tid < s) {
smem[tid] += smem[tid + s];
}
__syncthreads();
}
}
/**
* @brief sum columns of csr sparse matrix (csr_val), then add to a_val.
......
......@@ -46,63 +46,100 @@ static inline std::string join(const std::string& part1, const std::string& part
return ret;
}
static inline void GetDsoHandleWithSearchPath(
static inline void GetDsoHandleFromDefaultPath(
std::string& dso_path, void** dso_handle, int dynload_flags) {
LOG(INFO) << "Try to find cuda library: " << dso_path
<< "from default system path.";
// default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
*dso_handle = dlopen(dso_path.c_str(), dynload_flags);
    // DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to support
    // System Integrity Protection (SIP). If dso_handle is still null,
    // search the default package path on Mac OS.
#if defined(__APPLE__) or defined(__OSX__)
if (nullptr == *dso_handle) {
dso_path = join("/usr/local/cuda/lib/", dso_path);
*dso_handle = dlopen(dso_path.c_str(), dynload_flags);
if (nullptr == *dso_handle) {
if (dso_path == "libcudnn.dylib") {
LOG(FATAL) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n"
<< "For instance, sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C "
<< "/usr/local \n sudo chmod a+r /usr/local/cuda/include/cudnn.h "
<< "/usr/local/cuda/lib/libcudnn*";
}
}
}
#endif
}
static inline void GetDsoHandleFromSearchPath(
const std::string& search_root,
const std::string& dso_path,
const std::string& dso_name,
void** dso_handle) {
int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
*dso_handle = nullptr;
std::string dlPath = dso_path;
std::string dlPath = dso_name;
if (search_root.empty()) {
// default search xxx.so from LD_LIBRARY_PATH
*dso_handle = dlopen(dlPath.c_str(), dynload_flags);
GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
} else {
// search xxx.so from custom path
dlPath = join(search_root, dso_path);
dlPath = join(search_root, dso_name);
*dso_handle = dlopen(dlPath.c_str(), dynload_flags);
// then, search xxx.so from LD_LIBRARY_PATH
if (nullptr == *dso_handle) {
*dso_handle = dlopen(dso_path.c_str(), dynload_flags);
// if not found, search from default path
        if (nullptr == *dso_handle) {
LOG(WARNING) << "Failed to find cuda library: " << dlPath;
dlPath = dso_name;
GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
}
}
CHECK(nullptr != *dso_handle)
<< "For Gpu version of PaddlePaddle, it couldn't find CUDA library: "
<< dlPath.c_str() << ". Please make sure you already specify its path. "
<< "Note: for training data on Cpu using Gpu version of PaddlePaddle, "
<< "you must specify libcudart via export LD_LIBRARY_PATH for Linux or "
<< "export DYLD_LIBRARY_PATH for MAC OS.";
<< "Failed to find cuda library: " << dlPath << std::endl
<< "Please specify its path correctly using one of the following ideas: \n"
<< "Idea 1. set cuda and cudnn lib path at runtime. "
<< "http://www.paddlepaddle.org/doc/ui/cmd_argument/argument_outline.html \n"
<< "For instance, issue command: paddle train --use_gpu=1 "
<< "--cuda_dir=/usr/local/cudnn/lib --cudnn_dir=/usr/local/cudnn/lib ...\n"
<< "Idea 2. set environment variable LD_LIBRARY_PATH on Linux or "
<< "DYLD_LIBRARY_PATH on Mac OS. \n"
<< "For instance, issue command: export LD_LIBRARY_PATH=... \n"
<< "Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is impossible "
<< "unless System Integrity Protection (SIP) is disabled. However, @Idea 1"
<< "always work well.";
}
void GetCublasDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
#else
GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
#endif
}
void GetCudnnDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleWithSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle);
GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle);
#else
GetDsoHandleWithSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle);
GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle);
#endif
}
void GetCudartDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleWithSearchPath("", "libcudart.dylib", dso_handle);
GetDsoHandleFromSearchPath("", "libcudart.dylib", dso_handle);
#else
GetDsoHandleWithSearchPath("", "libcudart.so", dso_handle);
GetDsoHandleFromSearchPath("", "libcudart.so", dso_handle);
#endif
}
void GetCurandDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
#else
GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
#endif
}
......@@ -97,7 +97,8 @@ void ConcatenateLayer::backward(const UpdateCallback& callback) {
*/
class ConcatenateLayer2 : public Layer {
public:
explicit ConcatenateLayer2(const LayerConfig& config) : Layer(config) {}
explicit ConcatenateLayer2(const LayerConfig& config) :
Layer(config) {}
~ConcatenateLayer2() {}
......@@ -110,6 +111,8 @@ protected:
std::vector<std::unique_ptr<Projection>> projections_;
std::vector<Argument> projOutput_;
std::vector<std::pair<size_t, size_t>> projCol_;
bool sharedBias_;
std::unique_ptr<Weight> biases_;
};
REGISTER_LAYER(concat2, ConcatenateLayer2);
......@@ -119,7 +122,6 @@ bool ConcatenateLayer2::init(const LayerMap& layerMap,
/* Initialize the basic parent class */
if (!Layer::init(layerMap, parameterMap)) return false;
CHECK(!biasParameter_);
CHECK_EQ(inputLayers_.size(), parameters_.size());
projections_.reserve(inputLayers_.size());
projCol_.reserve(inputLayers_.size());
......@@ -137,6 +139,13 @@ bool ConcatenateLayer2::init(const LayerMap& layerMap,
}
CHECK_EQ(getSize(), endCol);
/* initialize biases_ */
if (biasParameter_.get() != NULL) {
sharedBias_ = config_.shared_biases();
size_t psize = config_.bias_size();
biases_ = std::unique_ptr<Weight>(new Weight(1, psize, biasParameter_));
}
return true;
}
......@@ -154,8 +163,17 @@ void ConcatenateLayer2::forward(PassType passType) {
projOutput_[i].grad = output_.grad->subColMatrix(startCol, endCol);
}
for (size_t i = 0; i != inputLayers_.size(); ++i) {
projections_[i]->forward(&getInput(i), &projOutput_[i], passType);
{
AsyncGpuBlock block;
for (size_t i = 0; i != inputLayers_.size(); ++i) {
projections_[i]->forward(&getInput(i), &projOutput_[i], passType);
}
}
/* add the bias-vector */
if (biases_) {
REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
output_.value->addBias(*(biases_->getW()), 1, sharedBias_);
}
/* activation */ {
......@@ -170,6 +188,13 @@ void ConcatenateLayer2::backward(const UpdateCallback& callback) {
backwardActivation();
}
AsyncGpuBlock block;
if (biases_ && biases_->getWGrad()) {
REGISTER_TIMER_INFO("Concat2BpBiasTimer", getName().c_str());
biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_);
biases_->getParameterPtr()->incUpdate(callback);
}
for (size_t i = 0; i != inputLayers_.size(); ++i) {
if (projections_[i]) {
projections_[i]->backward(callback);
......
......@@ -35,25 +35,12 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
filterSizeY_.push_back(conf.filter_size_y());
filterPixels_.push_back(filterSize_.back() * filterSizeY_.back());
channels_.push_back(conf.channels());
imgSize_.push_back(conf.img_size());
imgPixels_.push_back(imgSize_.back() * imgSize_.back());
imgSizeH_.push_back(conf.img_size());
imgSizeW_.push_back(conf.img_size());
groups_.push_back(conf.groups());
filterChannels_.push_back(conf.filter_channels());
outputX_.push_back(conf.output_x());
outputs_.push_back(outputX_.back() * outputX_.back());
}
/* initialize the weightList */
CHECK(inputLayers_.size() == parameters_.size());
for (size_t i = 0; i < inputLayers_.size(); i++) {
size_t height, width;
height = filterPixels_[i] * filterChannels_[i];
width = numFilters_;
// create a new weight
CHECK_EQ(parameters_[i]->getSize(), width * height);
Weight* w = new Weight(height, width, parameters_[i]);
weights_.emplace_back(w);
outputH_.push_back(conf.output_x());
outputW_.push_back(conf.output_x());
}
/* initialize the biases_ */
......@@ -74,4 +61,34 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
return true;
}
size_t ConvBaseLayer::calOutputSize() {
auto clearAndReserve = [this](IntV* vec) {
vec->clear();
vec->reserve(this->inputLayers_.size());
};
clearAndReserve(&imgSizeH_);
clearAndReserve(&imgSizeW_);
clearAndReserve(&outputH_);
clearAndReserve(&outputW_);
size_t layerSize = 0;
for (size_t i = 0; i < inputLayers_.size(); i++) {
imgSizeH_.push_back(inputLayers_[i]->getOutput().getFrameHeight());
imgSizeW_.push_back(inputLayers_[i]->getOutput().getFrameWidth());
if (imgSizeH_[i] == 0)
imgSizeH_[i] = config_.inputs(i).conv_conf().img_size();
if (imgSizeW_[i] == 0)
imgSizeW_[i] = config_.inputs(i).conv_conf().img_size();
outputH_.push_back(
outputSize(imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i]));
outputW_.push_back(
outputSize(imgSizeW_[i], filterSize_[i], padding_[i], stride_[i]));
CHECK_EQ(outputH_[i], outputH_[0]);
CHECK_EQ(outputW_[i], outputW_[0]);
}
getOutput().setFrameHeight(outputH_[0]);
getOutput().setFrameWidth(outputW_[0]);
layerSize = outputH_[0] * outputW_[0] * size_t(numFilters_);
return layerSize;
}
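// Worked example (illustration only, assuming the caffe-style output size
// formula (imageSize - filterSize + 2 * padding) / stride + 1): for a 16 x 16
// input with filter_size = 2, padding = 0, stride = 2 and filter_size_y = 3,
// padding_y = 1, stride_y = 2, both outputW_ and outputH_ come out as 8, so
// with numFilters_ = 16 the layer size is 8 * 8 * 16 = 1024.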
} // namespace paddle
......@@ -43,19 +43,18 @@ protected:
IntV filterSizeY_;
/// The spatial dimensions of the convolution input.
IntV channels_;
/// The spatial dimensions of input feature map.
IntV imgSize_;
/// The total pixel size of input feature map.
/// imgPixels_ = imgSizeX_ * imgSizeY_.
IntV imgPixels_;
/// The spatial dimensions of input feature map height.
IntV imgSizeH_;
/// The spatial dimensions of input feature map width.
IntV imgSizeW_;
/// filterPixels_ = filterSizeX_ * filterSizeY_.
IntV filterPixels_;
/// filterChannels_ = channels_/groups_.
IntV filterChannels_;
/// The spatial dimensions of output feature map.
IntV outputX_;
/// The spatial dimensions of output feature map.
IntV outputs_;
/// The spatial dimensions of output feature map height.
IntV outputH_;
/// The spatial dimensions of output feature map width.
IntV outputW_;
/// Group size, refer to grouped convolution in
/// Alex Krizhevsky's paper: when group=2, the first half of the
/// filters are only connected to the first half of the input channels,
......@@ -80,6 +79,13 @@ public:
virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
/**
* imgSizeH_ and imgSizeW_ will be set according to the previous input layers
* in this function. Then it will calculate outputH_ and outputW_ and set them
* into output argument.
*/
virtual size_t calOutputSize();
Weight& getWeight(int idx) { return *weights_[idx]; }
/**
......
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/utils/Stat.h"
#include "ConvProjection.h"
namespace paddle {
REGISTER_PROJECTION(conv, ConvProjection);
ThreadLocalD<std::vector<MemoryHandle*>> ConvProjection::convMem_;
ConvProjection::ConvProjection(const ProjectionConfig& config,
ParameterPtr parameter, bool useGpu)
: Projection(config, parameter, useGpu) {
CHECK(useGpu); // only support GPU
getConvParams();
initCudnn();
size_t height = filterH_ * filterW_ * channels_ / groups_;
size_t width = numFilters_;
weight_.reset(new Weight(height, width, parameter));
weightOffset_ = height * width / groups_;
}
void ConvProjection::getConvParams() {
const ConvConfig &conf = config_.conv_conf();
paddingH_ = conf.padding_y();
paddingW_ = conf.padding();
strideH_ = conf.stride_y();
strideW_ = conf.stride();
filterH_ = conf.filter_size_y();
filterW_ = conf.filter_size();
configImgH_ = conf.img_size();
configImgW_ = conf.img_size();
channels_ = conf.channels();
numFilters_ = config_.num_filters();
groups_ = conf.groups();
CHECK_EQ(channels_ % groups_, 0);
CHECK_EQ(numFilters_ % groups_, 0);
}
void ConvProjection::initCudnn() {
hl_create_filter_descriptor(&filterDesc_, channels_, numFilters_,
filterH_, filterW_);
hl_create_tensor_descriptor(&inputDesc_);
hl_create_tensor_descriptor(&outputDesc_);
hl_create_convolution_descriptor(&convDesc_, inputDesc_, filterDesc_,
paddingH_, paddingW_, strideH_, strideW_);
// initialize all to default algorithms
fwdAlgo_ = 0;
bwdFilterAlgo_ = 0;
bwdDataAlgo_ = 0;
fwdLimitBytes_ = 0;
bwdDataLimitBytes_ = 0;
bwdFilterLimitBytes_ = 0;
workSpaceInBytes_ = 0;
batchNum_ = 0;
isSelectAlgo_ = false;
}
void ConvProjection::reshapeTensorDesc(int batchSize) {
hl_tensor_reshape(inputDesc_, batchSize, channels_, imageH_, imageW_,
channels_ * imageH_ * imageW_, imageH_ * imageW_,
imageW_, 1);
hl_reset_convolution_descriptor(convDesc_, inputDesc_, filterDesc_,
paddingH_, paddingW_, strideH_, strideW_);
  // The stride between two consecutive samples in ConvProjection may not be 1.
  // For example, when a ConcatenateLayer2 contains two ConvProjections, the
  // stride equals the output size of the ConcatenateLayer2, so the nStride
  // calculation differs from CudnnConvLayer. Strictly speaking,
  // "nStride = out_->value->getStride()" alone would be sufficient.
size_t nStride = numFilters_ * outputH_ * outputW_;
if (out_->value->isContiguous()) {
CHECK_EQ(nStride, out_->value->getWidth());
} else {
nStride = out_->value->getStride();
}
hl_tensor_reshape(outputDesc_, batchSize, numFilters_, outputH_, outputW_,
nStride, outputH_ * outputW_, outputW_, 1);
}
void ConvProjection::reshape(int batchSize) {
size_t width = calOutputSize();
CHECK_EQ(width, out_->value->getWidth());
isSelectAlgo_ = (batchSize == batchNum_);
batchNum_ = batchSize;
if (!isSelectAlgo_) {
reshapeTensorDesc(batchSize);
hl_conv_workspace(inputDesc_, outputDesc_, filterDesc_,
convDesc_, &fwdAlgo_, &fwdLimitBytes_,
&bwdDataAlgo_, &bwdDataLimitBytes_,
&bwdFilterAlgo_, &bwdFilterLimitBytes_);
size_t maxWorkSpace = 0;
maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_);
workSpaceInBytes_ = maxWorkSpace;
VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_
<< " / " << bwdDataAlgo_
<< " / " << bwdFilterAlgo_;
}
isSelectAlgo_ = true;
}
void ConvProjection::forward() {
int batchSize = in_->value->getHeight();
reshape(batchSize);
void* workSpace = NULL;
if (workSpaceInBytes_ > 0) {
workSpace = getSpaceBytes(workSpaceInBytes_);
}
for (int g = 0; g < groups_; ++g) {
REGISTER_TIMER_INFO("CudnnConvFwTimer", getName().c_str());
real *inputData = in_->value->getData() + g * inputOffset_;
real *wgtData = weight_->getW()->getData() + g * weightOffset_;
real *outData = out_->value->getData() + g * outputOffset_;
hl_convolution_forward(inputDesc_, inputData, outputDesc_,
outData, filterDesc_, wgtData,
convDesc_, workSpace,
fwdLimitBytes_, fwdAlgo_);
}
}
void ConvProjection::backward(const UpdateCallback& callback) {
REGISTER_TIMER_INFO("CudnnConvBpTimer", getName().c_str());
void* workSpace = NULL;
if (workSpaceInBytes_ > 0) {
workSpace = getSpaceBytes(workSpaceInBytes_);
}
for (int g = 0; g < groups_; ++g) {
real *outGrad = out_->grad->getData() + g * outputOffset_;
if (weight_->getWGrad()) {
real *inputData = in_->value->getData() + g * inputOffset_;
real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_;
hl_convolution_backward_filter(
inputDesc_, inputData, outputDesc_, outGrad, filterDesc_,
weightGrad, convDesc_, workSpace, bwdFilterLimitBytes_,
bwdFilterAlgo_);
}
MatrixPtr preGrad = in_->grad;
if (NULL != preGrad) {
real *inputGrad = preGrad->getData() + g * inputOffset_;
real *wgtData = weight_->getW()->getData() + g* weightOffset_;
hl_convolution_backward_data(
inputDesc_, inputGrad, outputDesc_, outGrad, filterDesc_,
wgtData, convDesc_, workSpace, bwdDataLimitBytes_,
bwdDataAlgo_);
}
}
weight_->getParameterPtr()->incUpdate(callback);
}
void* ConvProjection::getSpaceBytes(size_t size) {
std::vector<MemoryHandle*>& convMem = *convMem_;
if (convMem.empty()) {
int numDevices = hl_get_device_count();
convMem.resize(numDevices);
}
int devId = hl_get_device();
MemoryHandle** localMem = &(convMem[devId]);
if (NULL == *localMem || size > (*localMem)->getAllocSize()) {
*localMem = new GpuMemoryHandle(size);
}
return (*localMem)->getBuf();
}
ConvProjection::~ConvProjection() {
hl_destroy_tensor_descriptor(inputDesc_);
hl_destroy_tensor_descriptor(outputDesc_);
hl_destroy_filter_descriptor(filterDesc_);
hl_destroy_convolution_descriptor(convDesc_);
}
} // namespace paddle
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "Projection.h"
namespace paddle {
/**
 * @brief Convolution projection that performs the same calculation as CudnnConvLayer.
*/
class ConvProjection : public Projection {
public:
/**
* Constructor.
*/
ConvProjection(const ProjectionConfig& config, ParameterPtr parameter,
bool useGpu);
~ConvProjection();
virtual void forward();
virtual void backward(const UpdateCallback& callback);
protected:
void getConvParams();
void initCudnn();
void reshapeTensorDesc(int batchSize);
void reshape(int batchSize);
int outputSize(int imageSize, int filterSize, int padding, int stride) {
return (imageSize - filterSize + 2 * padding) / stride + 1;
}
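  // Worked example (illustration only), using the same numbers as the
  // Projection.conv unit test below: imageSize = 16, filterSize = 2,
  // padding = 0, stride = 2 gives (16 - 2 + 2 * 0) / 2 + 1 = 8, and
  // filterSizeY = 3, paddingY = 1, strideY = 2 likewise gives
  // (16 - 3 + 2 * 1) / 2 + 1 = 8.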
size_t calOutputSize() {
imageH_ = in_->getFrameHeight();
imageW_ = in_->getFrameWidth();
if (imageH_ == 0) imageH_ = configImgH_;
if (imageW_ == 0) imageW_ = configImgW_;
outputH_ = outputSize(imageH_, filterH_, paddingH_, strideH_);
outputW_ = outputSize(imageW_, filterW_, paddingW_, strideW_);
const_cast<Argument*>(out_)->setFrameHeight(outputH_);
const_cast<Argument*>(out_)->setFrameWidth(outputW_);
inputOffset_ = (channels_ / groups_) * imageH_ * imageW_;
outputOffset_ = (numFilters_ / groups_) * outputH_ * outputW_;
return outputH_ * outputW_ * numFilters_;
}
static void* getSpaceBytes(size_t size);
  /// imageH_ and imageW_ are calculated from the input layer.
int imageH_, imageW_;
  /// configImgH_ and configImgW_ are obtained from the config.
int configImgH_, configImgW_;
int outputH_, outputW_;
int channels_, numFilters_;
int paddingH_, paddingW_;
int strideH_, strideW_;
int filterH_, filterW_;
/// One group offset of input data.
int inputOffset_;
/// One group offset of output data.
int outputOffset_;
/// One group offset of weight.
int weightOffset_;
int groups_;
/// Cudnn tensor descriptor for input.
hl_tensor_descriptor inputDesc_;
/// Cudnn tensor descriptor for output.
hl_tensor_descriptor outputDesc_;
/// Cudnn tensor descriptor for filter.
hl_filter_descriptor filterDesc_;
/// Cudnn tensor descriptor for a convolution operation.
hl_convolution_descriptor convDesc_;
  /// Record the algorithm for forward convolution, obtained via the cuDNN API
  /// that searches for the best-suited algorithm.
int fwdAlgo_;
/// Record the algorithm for computing convolution gradient with respect to
/// filter coefficients.
int bwdFilterAlgo_;
/// Record the algorithm for computing convolution gradient with respect to
/// the output.
int bwdDataAlgo_;
/// Amount of GPU memory needed as workspace to be able to execute a
/// forward convolution with the specified algo.
size_t fwdLimitBytes_;
/// Amount of GPU memory needed as workspace to be able to execute a
/// backwardFilter with the specified algo.
size_t bwdDataLimitBytes_;
/// Amount of GPU memory needed as workspace to be able to execute a
/// backwardData with the specified algo.
size_t bwdFilterLimitBytes_;
/// Size of total work space.
size_t workSpaceInBytes_;
/// Whether to call cuDNN api to choose conv algorithm.
bool isSelectAlgo_;
/// batchNum is used to record batch size. If the batch size is changed,
/// the selection algorithm will be called.
int batchNum_;
bool bias_;
std::unique_ptr<Weight> weight_;
static ThreadLocalD<std::vector<MemoryHandle*>> convMem_;
};
} // namespace paddle
......@@ -22,215 +22,64 @@ REGISTER_LAYER(cudnn_conv, CudnnConvLayer);
bool CudnnConvLayer::init(const LayerMap &layerMap,
const ParameterMap &parameterMap) {
ConvBaseLayer::init(layerMap, parameterMap);
if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;
CHECK(useGpu_) << "CudnnConvLayer only support gpu";
maxGroups_ = 0;
for (size_t i = 0; i < inputLayers_.size(); i++) {
CHECK_EQ(channels_[i] % groups_[i], 0);
CHECK_EQ(numFilters_ % groups_[i], 0);
hl_filter_descriptor filter;
hl_create_filter_descriptor(&filter, channels_[i] / groups_[i],
numFilters_ / groups_[i], filterSizeY_[i],
filterSize_[i]);
filterDesc_.push_back(filter);
hl_tensor_descriptor input;
hl_create_tensor_descriptor(&input);
inputDesc_.push_back(input);
hl_tensor_descriptor output;
int outputX =
outputSize(imgSize_[i], filterSize_[i], padding_[i], stride_[i]);
CHECK_EQ(outputX, outputX_[i]);
hl_create_tensor_descriptor(&output);
outputDesc_.push_back(output);
CHECK_EQ(inputLayers_.size(), parameters_.size());
projections_.reserve(inputLayers_.size());
projConf_.reserve(inputLayers_.size());
hl_convolution_descriptor conv;
hl_create_convolution_descriptor(&conv, input, filter, paddingY_[i],
padding_[i], strideY_[i], stride_[i]);
convDesc_.push_back(conv);
weightOffset_.push_back((numFilters_ / groups_[i]) *
(channels_[i] / groups_[i]) * filterPixels_[i]);
inputOffset_.push_back((channels_[i] / groups_[i]) * imgSize_[i] *
imgSize_[i]);
outputOffset_.push_back((numFilters_ / groups_[i]) * outputX_[i] *
outputX_[i]);
// initialize all to default algorithms
fwdAlgo_.push_back(0);
bwdFilterAlgo_.push_back(0);
bwdDataAlgo_.push_back(0);
fwdLimitBytes_.push_back(0);
bwdFilterLimitBytes_.push_back(0);
bwdDataLimitBytes_.push_back(0);
// cudnn streams per group equal to 1
if (groups_[i] > maxGroups_) {
maxGroups_ = groups_[i];
}
}
workSpaceInBytes_ = 0;
workSpaceData_ = NULL;
for (int i = 0; i < maxGroups_; ++i) {
workSpace_.push_back(NULL);
numFilters_ = config_.num_filters();
CHECK(config_.shared_biases());
for (size_t i = 0; i < inputLayers_.size(); i++) {
ProjectionConfig* conf = new ProjectionConfig();
conf->set_type("conv");
conf->set_num_filters(numFilters_);
conf->set_allocated_conv_conf(
config_.mutable_inputs(i)->mutable_conv_conf());
conf->set_input_size(getPrev(i)->getSize());
conf->set_output_size(getSize());
projConf_.emplace_back(conf);
projections_.emplace_back(Projection::create(*projConf_[i],
parameters_[i], useGpu_));
}
if (biases_.get() && sharedBiases_) {
hl_create_tensor_descriptor(&biasDesc_);
hl_create_tensor_descriptor(&outputDesc_);
hl_tensor_reshape(biasDesc_, 1, numFilters_ / groups_[0], 1, 1);
biasOffset_ = numFilters_ / groups_[0];
}
batchNum_ = 0;
isSelectAlgo_ = false;
return true;
}
void CudnnConvLayer::allocConvWorkSpace(size_t maxWorkSpace) {
size_t totalWorkSpace = maxWorkSpace * maxGroups_;
if (totalWorkSpace > workSpaceInBytes_) {
if (workSpaceInBytes_ != 0) {
hl_free_mem_device(workSpaceData_);
}
// total amount of storage needed over all groups
workSpaceData_ = hl_malloc_device(totalWorkSpace);
// update work space address for each group
for (int i = 0; i < maxGroups_; ++i) {
workSpace_[i] = reinterpret_cast<char *>(workSpaceData_)
+ i * maxWorkSpace;
}
workSpaceInBytes_ = totalWorkSpace;
}
}
void CudnnConvLayer::reshape(int batchSize) {
CHECK_NE(inputLayers_.size(), 0UL);
imageH_ = inputLayers_[0]->getOutput().getFrameHeight();
imageW_ = inputLayers_[0]->getOutput().getFrameWidth();
if (imageH_ == 0) imageH_ = imgSize_[0];
if (imageW_ == 0) imageW_ = imgSize_[0];
for (size_t i = 1; i < inputLayers_.size(); i++) {
int imageH = inputLayers_[i]->getOutput().getFrameHeight();
int imageW = inputLayers_[i]->getOutput().getFrameWidth();
if (imageH) {
CHECK_EQ(imageH_, imageH) << "Inputs must have same height.";
}
if (imageW) {
CHECK_EQ(imageW_, imageW) << "Inputs must have same width.";
}
}
outputH_ = outputSize(imageH_, filterSizeY_[0], paddingY_[0], strideY_[0]);
outputW_ = outputSize(imageW_, filterSize_[0], padding_[0], stride_[0]);
// check outputH & outputW
getOutput().setFrameHeight(outputH_);
getOutput().setFrameWidth(outputW_);
// if the batchSize remains the same, set isSelectAlgo_ true.
// Otherwise, set isSelectAlgo_ false and select algo again.
isSelectAlgo_ = (batchSize == batchNum_);
batchNum_ = batchSize;
size_t maxWorkSpace = 0;
for (size_t i = 0; i < inputLayers_.size(); i++) {
CHECK_EQ(inputLayers_[i]->getOutput().value->getWidth(),
(size_t)(channels_[i] * imageH_ * imageW_));
hl_tensor_reshape(inputDesc_[i], batchSize, channels_[i] / groups_[i],
imageH_, imageW_, channels_[i] * imageH_ * imageW_,
imageH_ * imageW_, imageW_, 1);
hl_tensor_reshape(outputDesc_[i], batchSize, numFilters_ / groups_[i],
outputH_, outputW_, numFilters_ * outputH_ * outputW_,
outputH_ * outputW_, outputW_, 1);
hl_reset_convolution_descriptor(convDesc_[i], inputDesc_[i],
filterDesc_[i], paddingY_[i],
padding_[i], strideY_[i], stride_[i]);
inputOffset_[i] = (channels_[i] / groups_[i]) * imageH_ * imageW_;
outputOffset_[i] = (numFilters_ / groups_[i]) * outputH_ * outputW_;
if (!isSelectAlgo_) {
hl_conv_workspace(inputDesc_[i], outputDesc_[i], filterDesc_[i],
convDesc_[i], &fwdAlgo_[i], &fwdLimitBytes_[i],
&bwdDataAlgo_[i], &bwdDataLimitBytes_[i],
&bwdFilterAlgo_[i], &bwdFilterLimitBytes_[i]);
maxWorkSpace = std::max(fwdLimitBytes_[i], bwdDataLimitBytes_[i]);
maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_[i]);
VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_[i]
<< " / " << bwdDataAlgo_[i]
<< " / " << bwdFilterAlgo_[i];
}
}
if (!isSelectAlgo_) {
allocConvWorkSpace(maxWorkSpace);
}
isSelectAlgo_ = true;
}
void CudnnConvLayer::forward(PassType passType) {
Layer::forward(passType);
int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
reshape(batchSize);
resetOutput(batchSize, outputH_ * outputW_ * numFilters_);
int batchSize = getInput(0).getBatchSize();
resetOutput(batchSize, calOutputSize());
for (size_t i = 0; i != inputLayers_.size(); ++i) {
REGISTER_TIMER_INFO("CudnnConvFwTimer", getName().c_str());
for (int g = 0; g < groups_[i]; ++g) {
real *inputData = getInputValue(i)->getData() + inputOffset_[i] * g;
real *wgtData = weights_[i]->getW()->getData() + weightOffset_[i] * g;
real *outData = getOutputValue()->getData() + outputOffset_[i] * g;
hl_convolution_forward(inputDesc_[i], inputData, outputDesc_[i],
outData, filterDesc_[i], wgtData,
convDesc_[i], workSpace_[g],
fwdLimitBytes_[i], fwdAlgo_[i]);
}
projections_[i]->forward(&getInput(i), &getOutput(), passType);
}
if (biases_) {
REGISTER_TIMER_INFO("CudnnConvBiasTimer", getName().c_str());
addBiases();
}
forwardActivation();
}
void CudnnConvLayer::addBiases() {
if (sharedBiases_) {
int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
hl_tensor_reshape(outputDesc_, batchSize, numFilters_ / groups_[0],
outputH_[0], outputW_[0], numFilters_ * outputH_[0] * outputW_[0],
outputH_[0] * outputW_[0], outputW_[0], 1);
outputOffset_ = getOutputValue()->getWidth() / groups_[0];
for (int g = 0; g < groups_[0]; ++g) {
real *biasData = biases_->getW()->getData() + biasOffset_ * g;
real *outData = getOutputValue()->getData() + outputOffset_[0] * g;
real *outData = getOutputValue()->getData() + outputOffset_ * g;
hl_convolution_forward_add_bias(biasDesc_, biasData,
outputDesc_[0], outData);
outputDesc_, outData);
}
} else {
LOG(FATAL) << "Not supported";
}
}
void CudnnConvLayer::bpropBiases() {
if (sharedBiases_) {
for (int g = 0; g < groups_[0]; ++g) {
real *biasGrad = biases_->getWGrad()->getData() + biasOffset_ * g;
real *outGrad = getOutputGrad()->getData() + outputOffset_[0] * g;
hl_convolution_backward_bias(biasDesc_, biasGrad,
outputDesc_[0], outGrad);
}
} else {
LOG(FATAL) << "Not supported";
}
forwardActivation();
}
void CudnnConvLayer::backward(const UpdateCallback &callback) {
......@@ -238,52 +87,23 @@ void CudnnConvLayer::backward(const UpdateCallback &callback) {
if (biases_ && biases_->getWGrad()) {
REGISTER_TIMER_INFO("CudnnConvBpBiasTimer", getName().c_str());
bpropBiases();
for (int g = 0; g < groups_[0]; ++g) {
real *biasGrad = biases_->getWGrad()->getData() + biasOffset_ * g;
real *outGrad = getOutputGrad()->getData() + outputOffset_ * g;
hl_convolution_backward_bias(biasDesc_, biasGrad, outputDesc_, outGrad);
}
biases_->getParameterPtr()->incUpdate(callback);
}
for (size_t i = 0; i != inputLayers_.size(); ++i) {
REGISTER_TIMER_INFO("CudnnConvBpTimer", getName().c_str());
for (int g = 0; g < groups_[i]; ++g) {
real *outGrad = getOutputGrad()->getData() + outputOffset_[i] * g;
if (weights_[i]->getWGrad()) {
real *inputData = getInputValue(i)->getData() + inputOffset_[i] * g;
real *weightGrad =
weights_[i]->getWGrad()->getData() + weightOffset_[i] * g;
hl_convolution_backward_filter(
inputDesc_[i], inputData, outputDesc_[i], outGrad, filterDesc_[i],
weightGrad, convDesc_[i], workSpace_[g], bwdFilterLimitBytes_[i],
bwdFilterAlgo_[i]);
}
MatrixPtr preGrad = getInputGrad(i);
if (NULL != preGrad) {
real *inputGrad = preGrad->getData() + inputOffset_[i] * g;
real *wgtData = weights_[i]->getW()->getData() + weightOffset_[i] * g;
hl_convolution_backward_data(
inputDesc_[i], inputGrad, outputDesc_[i], outGrad, filterDesc_[i],
wgtData, convDesc_[i], workSpace_[g], bwdDataLimitBytes_[i],
bwdDataAlgo_[i]);
}
}
weights_[i]->getParameterPtr()->incUpdate(callback);
projections_[i]->backward(callback);
}
}
CudnnConvLayer::~CudnnConvLayer() {
if (biasDesc_) {
if (biases_) {
hl_destroy_tensor_descriptor(biasDesc_);
}
for (size_t i = 0; i < inputDesc_.size(); i++) {
hl_destroy_tensor_descriptor(inputDesc_[i]);
hl_destroy_tensor_descriptor(outputDesc_[i]);
hl_destroy_filter_descriptor(filterDesc_[i]);
hl_destroy_convolution_descriptor(convDesc_[i]);
}
if (workSpaceInBytes_ != 0) {
hl_free_mem_device(workSpaceData_);
workSpaceInBytes_ = 0;
hl_destroy_tensor_descriptor(outputDesc_);
}
}
......
......@@ -17,12 +17,13 @@ limitations under the License. */
#include "ConvBaseLayer.h"
#include "paddle/math/Matrix.h"
#include "Projection.h"
#include <vector>
namespace paddle {
/**
* @brief A subclass of ConvBaseLayer by cuDNN implementation. It only
 * @brief A 2-dimensional conv layer implemented by cuDNN. It only
 * supports the GPU mode. We automatically select CudnnConvLayer for GPU
 * mode and ExpandConvLayer for CPU mode when the layer type is set to "conv".
 * Users can also explicitly specify the type as "exconv" or "cudnn_conv" for
......@@ -31,81 +32,21 @@ namespace paddle {
* The config file api is img_conv_layer.
*/
class CudnnConvLayer : public ConvBaseLayer {
private:
/// resize Cudnn workspace size
void allocConvWorkSpace(size_t maxWorkSpace);
protected:
int imageH_, imageW_, outputH_, outputW_;
/// Cudnn tensor descriptor for bias.
std::vector<std::unique_ptr<ProjectionConfig>> projConf_;
std::vector<std::unique_ptr<Projection>> projections_;
hl_tensor_descriptor biasDesc_;
/// Cudnn tensor descriptor for input.
std::vector<hl_tensor_descriptor> inputDesc_;
/// Cudnn tensor descriptor for output.
std::vector<hl_tensor_descriptor> outputDesc_;
/// Cudnn tensor descriptor for filter.
std::vector<hl_filter_descriptor> filterDesc_;
/// Cudnn tensor descriptor for a convolution operation.
std::vector<hl_convolution_descriptor> convDesc_;
/// One sample offset of input data.
IntV inputOffset_;
/// One sample offset of output data.
IntV outputOffset_;
/// One group offset of weight.
IntV weightOffset_;
/// One group offset of bias.
hl_tensor_descriptor outputDesc_;
int biasOffset_;
/// Save the algorithm for forward convolution, which is obtained by cudnn
/// api to search the best suited algorithm.
std::vector<int> fwdAlgo_;
/// Save the algorithm for computing convolution gradient with respect to
/// filter coefficients.
std::vector<int> bwdFilterAlgo_;
/// Save the algorithm for computing convolution gradient with respect to
/// the output.
std::vector<int> bwdDataAlgo_;
/// Amount of GPU memory needed as workspace to be able to execute a
/// forward convolution with the specified algo.
std::vector<size_t> fwdLimitBytes_;
/// Amount of GPU memory needed as workspace to be able to execute a
/// backwardFilter with the specified algo.
std::vector<size_t> bwdFilterLimitBytes_;
/// Amount of GPU memory needed as workspace to be able to execute a
/// backwardData with the specified algo.
std::vector<size_t> bwdDataLimitBytes_;
/// Device work space address for each group.
std::vector<void*> workSpace_;
/// Max number of groups.
int maxGroups_;
/// Total work space address in device for all groups.
void* workSpaceData_;
/// Size of total work space.
size_t workSpaceInBytes_;
  /// Whether the conv algorithm has already been selected.
bool isSelectAlgo_;
/// batchNum is used to record batch size. If the batch size is changed,
/// the selection algorithm will be called.
int batchNum_;
int outputOffset_;
public:
explicit CudnnConvLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
~CudnnConvLayer();
/**
   * Initialization. Initialize member variables and create tensor descriptors.
*/
bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
/**
   * Reshape is done in each forward pass. It reshapes the tensor descriptors
   * inputDesc_, outputDesc_ and convDesc_, and searches for the fastest
   * algorithm within a given memory limit.
*/
void reshape(int batchSize);
void forward(PassType passType);
void backward(const UpdateCallback& callback);
void addBiases();
......
......@@ -37,32 +37,29 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
caffeMode_ = conf.caffe_mode();
}
/* initialize the weightList */
CHECK(inputLayers_.size() == parameters_.size());
for (size_t i = 0; i < inputLayers_.size(); i++) {
size_t height, width;
height = filterPixels_[i] * filterChannels_[i];
width = numFilters_;
// create a new weight
CHECK_EQ(parameters_[i]->getSize(), width * height);
Weight* w = new Weight(height, width, parameters_[i]);
weights_.emplace_back(w);
}
return true;
}
size_t ExpandConvLayer::getSize() {
size_t ExpandConvLayer::getOutputSize() {
CHECK_NE(inputLayers_.size(), 0UL);
imgSizeH_.clear();
imgSizeW_.clear();
outputH_.clear();
outputW_.clear();
size_t layerSize = ConvBaseLayer::calOutputSize();
subN_.clear();
size_t layerSize = 0;
for (size_t i = 0; i < inputLayers_.size(); i++) {
imgSizeH_.push_back(inputLayers_[i]->getOutput().getFrameHeight());
imgSizeW_.push_back(inputLayers_[i]->getOutput().getFrameWidth());
if (imgSizeH_[i] == 0) imgSizeH_[i] = imgSize_[i];
if (imgSizeW_[i] == 0) imgSizeW_[i] = imgSize_[i];
outputH_.push_back(
outputSize(imgSizeH_[i], filterSize_[i], padding_[i], stride_[i]));
outputW_.push_back(
outputSize(imgSizeW_[i], filterSize_[i], padding_[i], stride_[i]));
subN_.push_back(outputH_[i] * outputW_[i]);
CHECK(layerSize == 0 || subN_[i] * size_t(numFilters_) == layerSize);
layerSize = subN_[i] * numFilters_;
}
getOutput().setFrameHeight(outputH_[0]);
getOutput().setFrameWidth(outputW_[0]);
return layerSize;
}
......@@ -119,7 +116,7 @@ void ExpandConvLayer::expandFwdOnce(MatrixPtr image, int inIdx, int startIdx) {
}
void ExpandConvLayer::addSharedBias() {
size_t mapW = getSize() / numFilters_;
size_t mapW = getOutputValue()->getWidth() / numFilters_;
size_t mapH = getOutputValue()->getElementCnt() / mapW;
MatrixPtr out =
Matrix::create(getOutputValue()->getData(), mapH, mapW, false, useGpu_);
......@@ -158,7 +155,7 @@ void ExpandConvLayer::forward(PassType passType) {
* transOutValue correspond sample to one row */
int batchSize = inputLayers_[0]->getOutputValue()->getWidth();
batchSize = inputLayers_[0]->getOutputValue()->getHeight();
resetOutput(batchSize, getSize());
resetOutput(batchSize, getOutputSize());
MatrixPtr image = nullptr;
for (size_t i = 0; i != inputLayers_.size(); ++i) {
......@@ -183,7 +180,7 @@ void ExpandConvLayer::forward(PassType passType) {
}
void ExpandConvLayer::bpropSharedBias(MatrixPtr biases, MatrixPtr v) {
size_t mapW = getSize() / numFilters_;
size_t mapW = v->getWidth() / numFilters_;
size_t mapH = v->getElementCnt() / mapW;
MatrixPtr vTmp = Matrix::create(v->getData(), mapH, mapW, false, useGpu_);
......
......@@ -37,14 +37,6 @@ protected:
IntV subN_;
/// subK_ = channels_ * filterPixels_ * groups_.
IntV subK_;
/// The spatial dimensions of height of input feature map.
IntV imgSizeH_;
/// The spatial dimensions of width of input feature map.
IntV imgSizeW_;
/// The spatial dimensions of height of output feature map.
IntV outputH_;
/// The spatial dimensions of width of output feature map.
IntV outputW_;
/// Expand one sample at a time. shape:
/// (numChannels * filterPixels_, outputSizeH * outputSizeW)
MatrixPtr expandInput_;
......@@ -58,7 +50,7 @@ public:
bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
size_t getSize();
size_t getOutputSize();
/**
* Create or resize expandInput_.
......
......@@ -41,9 +41,13 @@ bool MixedLayer::init(const LayerMap& layerMap,
}
operators_.emplace_back(Operator::create(operator_conf, useGpu_));
}
/* initialize biases_ */
if (biasParameter_.get() != NULL) {
biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
sharedBias_ = config_.shared_biases();
size_t psize = config_.bias_size();
biases_ = std::unique_ptr<Weight>(
new Weight(1, psize, biasParameter_));
}
return true;
......@@ -119,12 +123,6 @@ void MixedLayer::forward(PassType passType) {
MatrixPtr outV = getOutputValue();
/* add the bias-vector */
if (biases_.get() != NULL) {
REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
outV->addBias(*(biases_->getW()), 1);
}
for (size_t i = 0; i != inputLayers_.size(); ++i) {
if (projections_[i]) {
projections_[i]->forward(&getInput(i), &output_, passType);
......@@ -140,6 +138,12 @@ void MixedLayer::forward(PassType passType) {
op->forward(ins, &output_, passType);
}
/* add the bias-vector */
if (biases_.get() != NULL) {
REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
outV->addBias(*(biases_->getW()), 1, sharedBias_);
}
/* activation */ {
REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
forwardActivation();
......@@ -154,7 +158,7 @@ void MixedLayer::backward(const UpdateCallback& callback) {
if (biases_ && biases_->getWGrad()) {
REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str());
biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_);
/* Increasing the number of gradient */
biases_->getParameterPtr()->incUpdate(callback);
......
......@@ -58,5 +58,6 @@ protected:
/// the matrix size of projection state
std::vector<int> projectionStateMatrixSize_;
std::unique_ptr<Weight> biases_;
bool sharedBias_;
};
} // namespace paddle
......@@ -669,12 +669,14 @@ void testLayerGrad(TestConfig testConf, string testLayerName, size_t batchSize,
void testProjectionGrad(ProjectionConfig conf, InputType inputType,
size_t parameterSize, size_t batchSize, bool useGpu,
bool testState) {
bool testState, int biasSize, bool sharedBias) {
TestConfig config;
conf.set_name(conf.type());
config.layerConfig.set_type("mixed");
config.layerConfig.set_size(conf.output_size());
config.biasSize = config.layerConfig.size();
config.biasSize = biasSize == 0 ? config.layerConfig.size() : biasSize;
config.layerConfig.set_bias_size(config.biasSize);
config.layerConfig.set_shared_biases(sharedBias);
config.inputDefs.push_back(
{inputType, "layer_0", conf.input_size(), parameterSize});
*config.layerConfig.add_inputs()->mutable_proj_conf() = conf;
......
......@@ -217,7 +217,8 @@ void testLayerGrad(TestConfig testConf, string testLayerName, size_t batchSize,
void testProjectionGrad(ProjectionConfig conf, InputType inputType,
size_t parameterSize, size_t batchSize, bool useGpu,
bool testState = false);
bool testState = false, int biasSize = 0,
bool sharedBias = false);
void testOperatorGrad(TestConfig& config, OperatorConfig& operatorConf,
size_t batchSize, bool useGpu, bool testState = false);
......
#edit-mode: -*- python -*-
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
settings(batch_size=10)
data = data_layer(name ="input", size=8*16*16)
conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
num_channels=8,
num_filters=16, stride=1,
bias_attr=False,
act=ReluActivation())
conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
num_channels=8,
num_filters=16, stride=1,
bias_attr=False,
act=ReluActivation())
concat = concat_layer(input=[conv1, conv2])
conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
num_channels=8,
num_filters=16, stride=1,
bias_attr=True,
act=LinearActivation())
outputs(concat, conv)
#edit-mode: -*- python -*-
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
settings(batch_size=10)
data = data_layer(name ="input", size=8*16*16)
proj1 = conv_projection(input=data, filter_size=1, filter_size_y=1,
num_channels=8, num_filters=16, stride=1)
proj2 = conv_projection(input=data, filter_size=1, filter_size_y=1,
num_channels=8, num_filters=16, stride=1)
concat = concat_layer(input=[proj1, proj2], bias_attr=False, act=ReluActivation())
proj = conv_projection(input=data, filter_size=1, filter_size_y=1,
num_channels=8, num_filters=16, stride=1)
with mixed_layer(bias_attr=True, act=LinearActivation()) as conv:
conv += proj
outputs(concat, conv)
......@@ -134,6 +134,45 @@ TEST(Projection, identity) {
}
}
#ifndef PADDLE_ONLY_CPU
TEST(Projection, conv) {
const int NUM_FILTERS = 16;
const int FILTER_SIZE = 2;
const int FILTER_SIZE_Y = 3;
const int CHANNELS = 3;
const int IMAGE_SIZE = 16;
ProjectionConfig conf;
conf.set_type("conv");
conf.set_num_filters(NUM_FILTERS);
ConvConfig* conv = conf.mutable_conv_conf();
conv->set_filter_size(FILTER_SIZE);
conv->set_filter_size_y(FILTER_SIZE_Y);
conv->set_channels(CHANNELS);
conv->set_padding(0);
conv->set_padding_y(1);
conv->set_stride(2);
conv->set_stride_y(2);
conv->set_groups(1);
conv->set_filter_channels(conv->channels() / conv->groups());
conv->set_img_size(IMAGE_SIZE);
int outputSize = (2 * conv->padding() + conv->img_size() -
conv->filter_size()) / conv->stride() + 1;
int outputSizeY = (2 * conv->padding_y() + conv->img_size() -
conv->filter_size_y()) / conv->stride_y() + 1;
conv->set_output_x(outputSize);
conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS);
conf.set_output_size(outputSize * outputSizeY * NUM_FILTERS);
testProjectionGrad(conf, INPUT_DATA,
/* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE * FILTER_SIZE_Y,
/* batchSize */ 100, true, false, NUM_FILTERS, true);
}
#endif
TEST(Layer, concat) {
TestConfig config;
config.biasSize = 0;
......
......@@ -236,6 +236,15 @@ TEST(Compare, img_pool) {
compareNetwork(config_file_a, config_file_b);
FLAGS_use_gpu = useGpu;
}
TEST(Compare, img_conv) {
std::string config_file_a = "./gserver/tests/img_conv_a.conf";
std::string config_file_b = "./gserver/tests/img_conv_b.conf";
bool useGpu = FLAGS_use_gpu;
FLAGS_use_gpu = true;
compareNetwork(config_file_a, config_file_b);
FLAGS_use_gpu = useGpu;
}
#endif
......
......@@ -340,6 +340,15 @@ void GpuMatrix::addBias(Matrix& b, real scale) {
BaseMatrix::addBias(b, scale);
}
void GpuMatrix::addSharedBias(Matrix& b, real scale) {
CHECK(b.getHeight() == 1) << "the Bias should be a vector";
CHECK_LE(b.getWidth(), getWidth());
CHECK_EQ(getWidth() % b.getWidth(), 0UL);
hl_matrix_add_shared_bias(getData(), b.getData(), b.getWidth(),
getHeight(), getWidth(), scale);
}
void GpuMatrix::collectBias(Matrix& a, real scale) {
CHECK_EQ(getHeight(), (size_t)1);
CHECK_EQ(width_, a.getWidth());
......@@ -353,7 +362,17 @@ void GpuMatrix::collectBias(Matrix& a, real scale) {
}
}
void GpuMatrix::sequenceAvgForward(Matrix& a, const IVector& startsPos,
void GpuMatrix::collectSharedBias(Matrix& a, real scale) {
CHECK_EQ(getHeight(), (size_t)1);
CHECK_EQ(a.getWidth() % getWidth(), 0UL);
hl_matrix_collect_shared_bias(getData(), a.getData(), getWidth(),
a.getHeight(), a.getWidth(), scale);
}
void GpuMatrix::sequenceAvgForward(Matrix& a,
const IVector& startsPos,
int mode) {
size_t height = getHeight();
size_t width = getWidth();
......@@ -1972,6 +1991,24 @@ void CpuMatrix::addBias(Matrix& b, real scale) {
}
}
void CpuMatrix::addSharedBias(Matrix& b, real scale) {
CHECK_EQ(b.getHeight(), (size_t)1);
real* aData = getData();
real* bData = b.getData();
size_t numSamples = getHeight();
size_t channel = b.getWidth();
CHECK_EQ(getWidth() % channel, 0UL);
size_t dim = getWidth() / channel;
for (size_t i = 0; i < numSamples; i++) {
for (size_t c = 0; c < channel; c++) {
for (size_t j = 0; j < dim; j++) {
aData[i * getStride() + c * dim + j] += scale * bData[c];
}
}
}
}
void CpuMatrix::collectBias(Matrix& a, real scale) {
CHECK_EQ(getHeight(), (size_t)1);
CHECK_EQ(width_, a.getWidth());
......@@ -1989,7 +2026,25 @@ void CpuMatrix::collectBias(Matrix& a, real scale) {
}
}
void CpuMatrix::sequenceAvgForward(Matrix& a, const IVector& startsPos,
void CpuMatrix::collectSharedBias(Matrix& a, real scale) {
CHECK_EQ(getHeight(), (size_t)1);
real* B = getData();
real* A = a.getData();
size_t numSamples = a.getHeight();
size_t channel = getWidth();
CHECK_EQ(a.getWidth() % channel, 0UL);
size_t dim = a.getWidth() / channel;
for (size_t i = 0; i < numSamples; i++) {
for (size_t c = 0; c < channel; c++) {
for (size_t j = 0; j < dim; j++) {
B[c] += scale * A[i * channel * dim + c * dim + j];
}
}
}
}
void CpuMatrix::sequenceAvgForward(Matrix& a,
const IVector& startsPos,
int mode) {
size_t height = getHeight();
size_t width = getWidth();
......
......@@ -343,11 +343,35 @@ public:
LOG(FATAL) << "Not implemented";
}
virtual void addSharedBias(Matrix& b, real scale) {
LOG(FATAL) << "Not implemented";
}
virtual void addBias(Matrix& b, real scale, bool sharedBias) {
if (!sharedBias) {
addBias(b, scale);
} else {
addSharedBias(b, scale);
}
}
/// add each sample from a to this.
virtual void collectBias(Matrix& a, real scale) {
LOG(FATAL) << "Not implemented";
}
virtual void collectSharedBias(Matrix& a, real scale) {
LOG(FATAL) << "Not implemented";
}
virtual void collectBias(Matrix& a, real scale, bool sharedBias) {
if (!sharedBias) {
collectBias(a, scale);
} else {
collectSharedBias(a, scale);
}
}
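  // Usage sketch (illustration; mirrors the layer code earlier in this change):
  //   output_.value->addBias(*(biases_->getW()), 1, sharedBias_);          // forward
  //   biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_);  // backward
  // With sharedBias == true the bias has one entry per channel and is broadcast
  // over the columns of each channel; otherwise it has one entry per column.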
virtual void sequenceAvgForward(Matrix& a, const IVector& startsPos,
int mode) {
LOG(FATAL) << "Not implemented";
......@@ -1021,6 +1045,7 @@ public:
/// add b to each sample of this.
void addBias(Matrix& b, real scale);
void addSharedBias(Matrix& b, real scale);
/**
* @code
......@@ -1028,6 +1053,7 @@ public:
* @endcode
*/
void collectBias(Matrix& a, real scale);
void collectSharedBias(Matrix& a, real scale);
void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode);
......@@ -1341,9 +1367,11 @@ public:
public:
/// add b to each sample of this.
void addBias(Matrix& b, real scale);
void addSharedBias(Matrix& b, real scale);
/// add each sample of a to this.
void collectBias(Matrix& a, real scale);
void collectSharedBias(Matrix& a, real scale);
void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode);
......
......@@ -21,6 +21,8 @@ limitations under the License. */
#include "paddle/math/SparseMatrix.h"
#include <gtest/gtest.h>
#include "paddle/gserver/tests/TestUtil.h"
#include "paddle/utils/Stat.h"
using namespace paddle; // NOLINT
using namespace std; // NOLINT
......@@ -2071,6 +2073,60 @@ TEST(Matrix, MaxOutFwdBwd) {
}
}
void testAddSharedBias(int numSamples, int dim, int channel) {
MatrixPtr cpuData = std::make_shared<CpuMatrix>(numSamples, dim);
MatrixPtr gpuData = std::make_shared<GpuMatrix>(numSamples, dim);
MatrixPtr cpuBias = std::make_shared<CpuMatrix>(1, channel);
MatrixPtr gpuBias = std::make_shared<GpuMatrix>(1, channel);
cpuData->randomizeUniform();
gpuData->copyFrom(*cpuData);
cpuBias->randomizeUniform();
gpuBias->copyFrom(*cpuBias);
cpuData->addSharedBias(*cpuBias, 1.0);
gpuData->addSharedBias(*gpuBias, 1.0);
MatrixPtr check = std::make_shared<CpuMatrix>(numSamples, dim);
check->copyFrom(*gpuData);
MatrixCheckErr(*cpuData, *check);
}
void testCollectSharedBias(int numSamples, int dim, int channel) {
MatrixPtr cpuData = std::make_shared<CpuMatrix>(numSamples, dim);
MatrixPtr gpuData = std::make_shared<GpuMatrix>(numSamples, dim);
MatrixPtr cpuBias = std::make_shared<CpuMatrix>(1, channel);
MatrixPtr gpuBias = std::make_shared<GpuMatrix>(1, channel);
cpuData->randomizeUniform();
gpuData->copyFrom(*cpuData);
cpuBias->randomizeUniform();
gpuBias->copyFrom(*cpuBias);
cpuBias->collectSharedBias(*cpuData, 1.0);
gpuBias->collectSharedBias(*gpuData, 1.0);
MatrixPtr check = std::make_shared<CpuMatrix>(1, channel);
check->copyFrom(*gpuBias);
MatrixCheckErr(*cpuBias, *check);
}
TEST(Matrix, sharedBias) {
for (auto numSamples : {1, 100, 520}) {
for (auto dim : {100 * 16, 100 * 32}) {
for (auto channel : {8, 16}) {
VLOG(3) << " numSamples=" << numSamples << " dim=" << dim
<< " channel=" << channel;
testAddSharedBias(numSamples, dim, channel);
testCollectSharedBias(numSamples, dim, channel);
}
}
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
......
......@@ -264,6 +264,15 @@ void ParameterServer2::setParameter(const SendParameterRequest& request,
std::vector<int64_t> blockIds;
blockIds.reserve(request.blocks_size());
int bufferIndex = 0;
if (!request.blocks().size()) {
LOG(WARNING)
<< "--ports_num or --ports_num_for_sparse might be too large, "
<< "or total dense parameter size or sparse parameters size "
<< "might be too small, this psever doesn't store any parameter.";
return;
}
for (const auto& block : request.blocks()) {
/// block size for parameter(e.g. 128 for sparse row, 1K for dense)
uint64_t blockSize = getParameterConfig(block).parameter_block_size();
......
......@@ -63,7 +63,7 @@ class SparseBinaryScanner(IScanner):
def scan(self, dat):
self.extend_cols(dat)
self.__rows__.append(len(dat) + self.__rows__[-1])
self.__rows__.append(len(self.__cols__))
self.__height__ += 1
def extend_cols(self, dat):
......
......@@ -79,6 +79,20 @@ class __ParameterCallbackWrapper__(swig_paddle.UpdateCallback):
else:
return __ParameterCallbackWrapper__(callback).__disown__()
def __arguments_to_numpy__(i, arg):
assert isinstance(arg, swig_paddle.Arguments)
value = arg.getSlotValue(i)
if value is not None:
assert isinstance(value, swig_paddle.Matrix)
value = value.copyToNumpyMat()
ids = arg.getSlotIds(i)
if ids is not None:
assert isinstance(ids, swig_paddle.IVector)
ids = ids.copyToNumpyArray()
return {
"value": value,
"id": ids
}
def __monkeypatch_gradient_machine__():
"""
......@@ -88,20 +102,6 @@ def __monkeypatch_gradient_machine__():
swig_paddle.GradientMachine.loadFromConfigFile = \
staticmethod(loadGradientMachine)
def __arguments_to_numpy__(i, arg):
assert isinstance(arg, swig_paddle.Arguments)
value = arg.getSlotValue(i)
if value is not None:
assert isinstance(value, swig_paddle.Matrix)
value = value.copyToNumpyMat()
ids = arg.getSlotIds(i)
if ids is not None:
assert isinstance(ids, swig_paddle.IVector)
ids = ids.copyToNumpyArray()
return {
"value": value,
"id": ids
}
def __matrix_to_numpy__(m):
if isinstance(m, swig_paddle.Matrix):
......@@ -126,7 +126,7 @@ def __monkeypatch_gradient_machine__():
:type paramTypes: list of int
:return: paddle.GradientMachine
"""
assert isinstance(protoObj, paddle.proto.ModelConfig_pb2.ModelConfig)
assert isinstance(protoObj, paddle.proto.ModelConfig)
return swig_paddle.GradientMachine.createByConfigProtoStr(
protoObj.SerializeToString(), createMode, paramTypes)
......@@ -460,13 +460,29 @@ def __monkey_patch_protobuf_objects__():
"""
assert isinstance(protoObj,
paddle.proto.TrainerConfig_pb2.OptimizationConfig)
paddle.proto.OptimizationConfig)
return swig_paddle.OptimizationConfig.createFromProtoString(
protoObj.SerializeToString())
swig_paddle.OptimizationConfig.createFromProto = staticmethod(
OptimizationConfig_createFromProto)
def TrainerConfig_createFromProto(protoObj):
"""
Create a new paddle.TrainerConfig from
proto.TrainerConfig
:param protoObj: proto.TrainerConfig
:return: paddle.TrainerConfig
"""
assert isinstance(protoObj,
paddle.proto.TrainerConfig)
return swig_paddle.TrainerConfig.createFromProtoString(
protoObj.SerializeToString())
swig_paddle.TrainerConfig.createFromProto = staticmethod(
TrainerConfig_createFromProto)
def __monkey_patch_parameter__():
def getBufs(self):
......@@ -483,9 +499,66 @@ def __monkey_patch_parameter__():
swig_paddle.Parameter.getBufs = getBufs
def __monkey_patch_trainer__():
swig_paddle.Trainer.__create__ = staticmethod(swig_paddle.Trainer.create)
def Trainer_create(config, model=None):
"""
Create a trainer for a model with TrainerConfig trainer_config.
trainer_config.model_config will be ignored when model is supplied.
Trainer.trainOneBatch() and Trainer.forwardOneBatch() can be used only
when trainer_config.data_config is set.
A typical usage for Trainer is:
.. code-block:: python
trainer = Trainer.create(trainer_config, model)
for p in xrange(num_passes):
while True:
data = get_next_batch(batch_size)
if not data:
break
trainer.trainOneDataBatch(batch_size, data)
trainer.finishTrainPass()
trainer.finishTrain()
The trainer will take care of logging, model saving, distributed
training, etc.
:param config: trainer configuration
:type config: paddle.proto.TrainerConfig
:param model: the model to be trained
:type model: swig_paddle.GradientMachine
:return: a trainer
:rtype: swig_paddle.Trainer
"""
assert isinstance(config, paddle.proto.TrainerConfig)
if model is not None:
assert isinstance(model, swig_paddle.GradientMachine)
return swig_paddle.Trainer.__create__(
swig_paddle.TrainerConfig.createFromProto(config), model)
swig_paddle.Trainer.create = staticmethod(Trainer_create)
swig_paddle.Trainer.__getForwardOutput__ = \
swig_paddle.Trainer.getForwardOutput
def getForwardOutput(self):
"""
Get the network outputs from the previous trainOneBatch(),
trainOneDataBatch(), testOneDataBatch(), or forwardOneBatch() call.
:return: list of dictionaries with keys ['id', 'value']; each value is a
numpy.ndarray.
"""
outArgs = self.__getForwardOutput__()
return [__arguments_to_numpy__(i, outArgs) for i in xrange(
outArgs.getSlotNum())]
swig_paddle.Trainer.getForwardOutput = getForwardOutput
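A hedged usage sketch of the patched call; `trainer` is assumed to have been built with Trainer.create(trainer_config) and to have just processed a batch:

    # Hypothetical usage, not part of this patch.
    outputs = trainer.getForwardOutput()
    for i, out in enumerate(outputs):
        if out["value"] is not None:
            print("slot %d value shape: %s" % (i, out["value"].shape))  # numpy matrix
        if out["id"] is not None:
            print("slot %d ids: %s" % (i, out["id"][:10]))              # numpy ids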
def monkeypatches():
patches = [__monkeypatch_init_paddle__, __monkeypatch_gradient_machine__,
__monkey_patch_protobuf_objects__,
__monkey_patch_parameter__]
__monkey_patch_parameter__, __monkey_patch_trainer__]
for patch in patches:
patch()
......@@ -7,6 +7,7 @@ set(TRAINER_SOURCES
Tester.cpp
Trainer.cpp
TrainerInternal.cpp
TrainerBenchmark.cpp
ThreadParameterUpdater.cpp
TrainerInternalConfig.cpp
TrainerConfigHelper.cpp)
......
......@@ -89,6 +89,9 @@ void ParameterUtil::saveParameters(int passId, int passInnerId) {
}
std::string basePath = config_->getSaveDir();
if (basePath.find('/') == std::string::npos) {
basePath = "./" + basePath;
}
mkDirRecursively(basePath.c_str());
std::string saveDir = path::join(basePath, buf);
......
......@@ -71,24 +71,36 @@ Tester::Tester(const std::shared_ptr<TrainerConfigHelper> &config,
parameterUpdater_));
}
void Tester::startTestPeriod() {
testEvaluator_->start();
testContext_.cost = 0;
testContext_.numSamples = 0;
parameterUpdater_->apply();
if (intconfig_->prevBatchState) {
gradientMachine_->getState(*intconfig_->trainState);
gradientMachine_->setState(*intconfig_->testState);
}
}
void Tester::testOneDataBatch(
const DataBatch& dataBatch, std::vector<Argument>* outArgs) {
testContext_.cost += forwardOneBatch(
dataBatch, testEvaluator_.get(), outArgs);
testContext_.numSamples += dataBatch.getSize();
}
void Tester::testOnePeriod() {
DataBatch dataBatch;
int64_t batchSize = config_->getOptConfig().batch_size();
testEvaluator_->start();
real cost = 0;
int64_t numSamples = 0;
bool testAllData =
intconfig_->testPeriod == 0 || intconfig_->testAllDataInOnePeriod;
int batches =
testAllData ? std::numeric_limits<int>::max() : intconfig_->testPeriod;
parameterUpdater_->apply();
if (intconfig_->prevBatchState) {
gradientMachine_->getState(*intconfig_->trainState);
gradientMachine_->setState(*intconfig_->testState);
}
std::vector<Argument> outArgs;
startTestPeriod();
for (int i = 0; i < batches; ++i) {
int num = testDataProvider_->getNextBatch(batchSize, &dataBatch);
if (num == 0) {
......@@ -102,13 +114,17 @@ void Tester::testOnePeriod() {
num = testDataProvider_->getNextBatch(batchSize, &dataBatch);
}
}
cost += testOneBatch(dataBatch, testEvaluator_.get());
numSamples += num;
testOneDataBatch(dataBatch, &outArgs);
}
}
void Tester::finishTestPeriod() {
testEvaluator_->finish();
CHECK_GT(numSamples, 0) << "There is no samples in your test batch. Possibly "
"wrong implementation of DataProvidor.reset()";
LOG(INFO) << " Test samples=" << numSamples << " cost=" << cost / numSamples
CHECK_GT(testContext_.numSamples, 0)
<< "There is no samples in your test batch. Possibly "
"wrong implementation of DataProvidor.reset()";
LOG(INFO) << " Test samples=" << testContext_.numSamples
<< " cost=" << testContext_.cost / testContext_.numSamples
<< " Eval: " << *testEvaluator_;
parameterUpdater_->restore();
if (intconfig_->prevBatchState) {
......@@ -128,9 +144,11 @@ int64_t Tester::testOneBatchById(int64_t batchId) {
return 0;
}
std::vector<Argument> outArgs;
stats_ += std::pair<int64_t, real>{
actualBatchSize,
testOneBatch(dataBatch, testEvaluator_.get())};
forwardOneBatch(dataBatch, testEvaluator_.get(), &outArgs)};
if (((batchId + 1) % intconfig_->logPeriod) == 0) {
LOG(INFO) << " Batch=" << batchId + 1 << " " << stats_.getStats(false);
......@@ -139,7 +157,10 @@ int64_t Tester::testOneBatchById(int64_t batchId) {
return actualBatchSize;
}
real Tester::testOneBatch(const DataBatch &dataBatch, Evaluator *evaluator) {
real Tester::forwardOneBatch(const DataBatch &dataBatch,
Evaluator *evaluator,
std::vector<Argument>* pOutArgs) {
auto& outArgs = *pOutArgs;
const std::vector<Argument>& inArgs = dataBatch.getStreams();
if (intconfig_->loadsaveParametersInPserver) {
REGISTER_TIMER("prefetch");
......@@ -148,12 +169,11 @@ real Tester::testOneBatch(const DataBatch &dataBatch, Evaluator *evaluator) {
true /*after apply*/);
}
std::vector<Argument> outArgs;
gradientMachine_->forward(inArgs, &outArgs, PASS_TEST);
// write features if set this flag and outArgs is not empty
std::string featFile = intconfig_->featFile;
if (!featFile.empty() && !outArgs.empty()) {
if (!featFile.empty() && !outArgs.empty()) {
size_t numOutputs = outArgs.size();
std::vector<MatrixPtr> featMatrices;
featMatrices.resize(numOutputs);
......
......@@ -68,6 +68,10 @@ public:
* is training at same time.
*/
void testOnePeriod();
void startTestPeriod();
void finishTestPeriod();
void testOneDataBatch(const DataBatch& dataBatch,
std::vector<Argument>* outArgs);
/**
* Test for given data batch.
......@@ -75,7 +79,9 @@ public:
* @param evaluator Evaluator
* @return cost
*/
real testOneBatch(const DataBatch &dataBatch, Evaluator *evaluator);
real forwardOneBatch(const DataBatch& dataBatch,
Evaluator* evaluator,
std::vector<Argument>* outArgs);
/**
......@@ -99,6 +105,10 @@ protected:
std::ofstream os_;
std::vector<MatrixPtr> cpuMat_;
std::vector<IVectorPtr> cpuVec_;
struct {
int64_t numSamples;
real cost;
} testContext_;
private:
/**
......
......@@ -40,7 +40,7 @@ limitations under the License. */
#include "TrainerConfigHelper.h"
P_DEFINE_string(config, "", "Trainer config file");
P_DEFINE_int32(test_period, 1000,
P_DEFINE_int32(test_period, 0,
"Run test every so many train batches."
" 0 for testing after each pass."
" If not 0, test log_period batches."
......@@ -196,7 +196,8 @@ void Trainer::init(const std::shared_ptr<TrainerConfigHelper> &config,
if (!dataProvider_ && config_->hasDataConfig()) {
dataProvider_.reset(DataProvider::create(*config_, *config_, gpuData));
}
if (dataProvider_) {
if (!testDataProvider_) {
// No evaluator_ if there is testDataProvider but no dataProvider.
evaluator_.reset(trainerInternal_.getGradientMachine()->makeEvaluator());
currentEvaluator_.reset(
trainerInternal_.getGradientMachine()->makeEvaluator());
......@@ -215,10 +216,7 @@ void Trainer::init(const std::shared_ptr<TrainerConfigHelper> &config,
DataProvider::create(config_->getTestDataConfig(), *config_, gpuData));
}
if (testDataProvider_) {
tester_.reset(new Tester(config_, createTesterConfig(),
trainerInternal_.getGradientMachine(),
trainerInternal_.getParameterUpdater(),
testDataProvider_));
createTester();
}
if (!testing &&
......@@ -258,34 +256,25 @@ void Trainer::init(const std::shared_ptr<TrainerConfigHelper> &config,
}
}
// set current evaluator and evaluator
trainerInternal_.setCurrentEvaluator(currentEvaluator_.get());
trainerInternal_.setEvaluator(evaluator_.get());
}
void Trainer::train(size_t numPasses) {
srand(config_->getConfig().start_pass() + 1);
dataProvider_->reset();
if (this->testDataProvider_) {
this->testDataProvider_->reset();
}
trainerInternal_.getGradientMachine()->start(*config_, dataProvider_);
startTrain();
for (size_t i = 0; i < numPasses; ++i) {
if (IGradientMachineMode::trainWholeDataInOneBatch(mode_)) {
trainOnePassBatch(config_->getConfig().start_pass() + i);
} else {
trainOnePass(config_->getConfig().start_pass() + i);
trainOnePass();
}
if (i < numPasses - 1) {
dataProvider_->reset();
}
}
trainerInternal_.getGradientMachine()->finish();
finishTrain();
}
......@@ -387,13 +376,30 @@ real Trainer::checkGradient() {
return maxDiff;
}
void Trainer::trainOnePass(int passId) {
this->stats_->reset();
int64_t batchId = 0;
int32_t batchSize = config_->getOptConfig().batch_size();
real avgTestCost = 0;
int64_t numAvgTests = 0;
int passInnerId = 1;
void Trainer::startTrain() {
trainPassContext_.passId = config_->getConfig().start_pass();
srand(config_->getConfig().start_pass() + 1);
if (dataProvider_) {
dataProvider_->reset();
}
if (this->testDataProvider_) {
this->testDataProvider_->reset();
}
trainerInternal_.getGradientMachine()->start(*config_, dataProvider_);
}
void Trainer::finishTrain() {
trainerInternal_.getGradientMachine()->finish();
}
void Trainer::startTrainPass() {
stats_->reset();
trainPassContext_.batchId = 0;
trainPassContext_.avgTestCost = 0;
trainPassContext_.numAvgTests = 0;
trainPassContext_.passInnerId = 1;
trainerInternal_.getParameterUpdater()->startPass();
evaluator_->start();
......@@ -401,81 +407,83 @@ void Trainer::trainOnePass(int passId) {
trainerInternal_.getGradientMachine()->resetState();
trainerInternal_.getGradientMachine()->getState(testState_);
}
while (true) {
DataBatch dataBatch;
int num = 0;
{
REGISTER_TIMER("getTrainBatch");
num = dataProvider_->getNextBatch(batchSize, &dataBatch);
}
if (num == 0) break;
}
if (averageEvaluator_) {
int64_t mod = batchId % FLAGS_average_test_period;
if (mod >= FLAGS_average_test_period - FLAGS_log_period) {
if (mod == FLAGS_average_test_period - FLAGS_log_period) {
averageEvaluator_->start();
}
trainerInternal_.getParameterUpdater()->apply();
if (FLAGS_prev_batch_state) {
trainerInternal_.getGradientMachine()->getState(trainState_);
}
avgTestCost +=
tester_->testOneBatch(dataBatch, averageEvaluator_.get());
if (FLAGS_prev_batch_state) {
trainerInternal_.getGradientMachine()->setState(trainState_);
}
numAvgTests += num;
trainerInternal_.getParameterUpdater()->restore();
void Trainer::trainOneDataBatch(DataBatch& dataBatch) {
int num = dataBatch.getSize();
if (averageEvaluator_) {
int64_t mod = trainPassContext_.batchId % FLAGS_average_test_period;
if (mod >= FLAGS_average_test_period - FLAGS_log_period) {
if (mod == FLAGS_average_test_period - FLAGS_log_period) {
averageEvaluator_->start();
}
trainerInternal_.getParameterUpdater()->apply();
if (FLAGS_prev_batch_state) {
trainerInternal_.getGradientMachine()->getState(trainState_);
}
trainPassContext_.avgTestCost +=
tester_->forwardOneBatch(
dataBatch, averageEvaluator_.get(), &forwardOutput_);
if (FLAGS_prev_batch_state) {
trainerInternal_.getGradientMachine()->setState(trainState_);
}
trainPassContext_.numAvgTests += num;
trainerInternal_.getParameterUpdater()->restore();
}
{
REGISTER_TIMER("TrainBatch");
trainerInternal_.trainOneBatch(batchId, dataBatch);
}
}
{
REGISTER_TIMER("TrainBatch");
trainerInternal_.trainOneBatch(
trainPassContext_.batchId, dataBatch, &forwardOutput_);
}
if (averageEvaluator_ &&
batchId % FLAGS_average_test_period == FLAGS_average_test_period - 1) {
averageEvaluator_->finish();
LOG(INFO) << " Averaged parameter:"
<< " cost=" << avgTestCost / numAvgTests
<< " Eval: " << *averageEvaluator_;
numAvgTests = 0;
avgTestCost = 0;
}
if (averageEvaluator_ &&
trainPassContext_.batchId % FLAGS_average_test_period
== FLAGS_average_test_period - 1) {
averageEvaluator_->finish();
LOG(INFO) << " Averaged parameter:"
<< " cost=" << trainPassContext_.avgTestCost
/ trainPassContext_.numAvgTests
<< " Eval: " << *averageEvaluator_;
trainPassContext_.numAvgTests = 0;
trainPassContext_.avgTestCost = 0;
}
++batchId;
++trainPassContext_.batchId;
if (batchId % FLAGS_log_period == 0) {
FOR_TIMING(globalStat.setThreadInfo(true));
FOR_TIMING(globalStat.printAllStatus());
FOR_TIMING(globalStat.reset());
}
if (trainPassContext_.batchId % FLAGS_log_period == 0) {
FOR_TIMING(globalStat.setThreadInfo(true));
FOR_TIMING(globalStat.printAllStatus());
FOR_TIMING(globalStat.reset());
}
if (testDataProvider_ && FLAGS_test_period > 0 &&
batchId % FLAGS_test_period == 0) {
tester_->testOnePeriod();
}
if (testDataProvider_ && FLAGS_test_period > 0 &&
trainPassContext_.batchId % FLAGS_test_period == 0) {
tester_->testOnePeriod();
}
if (FLAGS_saving_period_by_batches > 0 &&
batchId > FLAGS_saving_period_by_batches * passInnerId &&
0 == FLAGS_trainer_id) {
trainerInternal_.getParameterUpdater()->catchUpWith();
if (testDataProvider_) {
tester_->testOnePeriod();
}
paramUtil_->saveParametersOnePass(passId, passInnerId);
++passInnerId;
if (FLAGS_saving_period_by_batches > 0 &&
trainPassContext_.batchId
> FLAGS_saving_period_by_batches * trainPassContext_.passInnerId &&
0 == FLAGS_trainer_id) {
trainerInternal_.getParameterUpdater()->catchUpWith();
if (testDataProvider_) {
tester_->testOnePeriod();
}
paramUtil_->saveParametersOnePass(
trainPassContext_.passId, trainPassContext_.passInnerId);
++trainPassContext_.passInnerId;
}
}
if (batchId == 0) {
void Trainer::finishTrainPass() {
if (trainPassContext_.batchId == 0) {
// This means no more data from DataProvider
return;
}
trainerInternal_.finishTrainPass(passId, batchId);
trainerInternal_.finishTrainPass(
trainPassContext_.passId, trainPassContext_.batchId);
FOR_TIMING(globalStat.setThreadInfo(true));
FOR_TIMING(globalStat.printAllStatus());
......@@ -485,9 +493,30 @@ void Trainer::trainOnePass(int passId) {
tester_->testOnePeriod();
}
if (passId % FLAGS_saving_period == 0 && FLAGS_trainer_id == 0) {
paramUtil_->saveParametersOnePass(passId);
if (trainPassContext_.passId % FLAGS_saving_period == 0
&& FLAGS_trainer_id == 0) {
paramUtil_->saveParametersOnePass(trainPassContext_.passId);
}
++trainPassContext_.passId;
}
void Trainer::trainOnePass() {
startTrainPass();
size_t batchSize = config_->getOptConfig().batch_size();
while (true) {
DataBatch dataBatch;
int num = 0;
{
REGISTER_TIMER("getTrainBatch");
num = dataProvider_->getNextBatch(batchSize, &dataBatch);
}
if (num == 0) break;
CHECK_EQ(num, dataBatch.getSize());
trainOneDataBatch(dataBatch);
}
finishTrainPass();
}
void Trainer::trainOnePassBatch(int passId) {
......@@ -582,6 +611,13 @@ void Trainer::clearGradient() {
int Trainer::getBatchSize() { return config_->getOptConfig().batch_size(); }
void Trainer::createTester() {
tester_.reset(new paddle::Tester(config_, createTesterConfig(),
trainerInternal_.getGradientMachine(),
trainerInternal_.getParameterUpdater(),
testDataProvider_));
}
void Trainer::test() {
tester_->test();
}
......
......@@ -94,6 +94,12 @@ public:
*/
real checkGradient();
void startTrain();
void finishTrain();
void startTrainPass();
void finishTrainPass();
void trainOneDataBatch(DataBatch& dataBatch);
void time();
/**
* given a dataBatch and the current parameter value
......@@ -144,11 +150,11 @@ public:
protected:
/**
* Train one pass of data. passId starts from 0
* Train one pass of data.
*
* SGD Method.
*/
void trainOnePass(int passId);
void trainOnePass();
/**
* Train one pass in one batch.
......@@ -161,6 +167,8 @@ protected:
*/
void clearGradient();
void createTester();
private:
std::unique_ptr<TesterConfig> createTesterConfig();
......@@ -173,6 +181,17 @@ protected:
MachineState trainState_;
MachineState testState_;
struct TrainPassContext {
int64_t batchId;
real avgTestCost;
int64_t numAvgTests;
int passId;
int passInnerId;
};
std::vector<paddle::Argument> forwardOutput_;
TrainPassContext trainPassContext_;
std::unique_ptr<Evaluator> evaluator_;
std::unique_ptr<Evaluator> currentEvaluator_;
std::unique_ptr<Evaluator> averageEvaluator_;
......
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#undef PADDLE_DISABLE_TIMER
#include "Trainer.h"
#include "paddle/utils/Stat.h"
#include "paddle/utils/Util.h"
P_DECLARE_int32(test_period);
P_DEFINE_bool(feed_data, false, "Wether to read data from DataProvider.");
namespace paddle {
void Trainer::time() {
startTrain();
trainerInternal_.getParameterUpdater()->startPass();
evaluator_->start();
DataBatch dataBatch;
int32_t batchSize = config_->getOptConfig().batch_size();
int32_t num = dataProvider_->getNextBatch(batchSize, &dataBatch);
CHECK_EQ(num, batchSize) << "The sample number is less than batch size "
<< num << " != " << batchSize;
CHECK(dataBatch.getSize()) << "No data from data provider";
std::vector<paddle::Argument> outputs;
// burning time
LOG(INFO) << "Burning time...";
for (int n = 0; n < 10; ++n) {
trainerInternal_.trainOneBatch(n, dataBatch, &outputs);
}
LOG(INFO) << "Burning time end.";
for (int n = 0; n < FLAGS_test_period; n++) {
if (FLAGS_feed_data) {
REGISTER_TIMER("GetData");
num = dataProvider_->getNextBatch(batchSize, &dataBatch);
}
if (num != batchSize) {
break;
}
{
REGISTER_TIMER("FwdBwd");
trainerInternal_.trainOneBatch(n, dataBatch, &outputs);
}
}
globalStat.setThreadInfo(true);
globalStat.printSegTimerStatus();
globalStat.reset();
finishTrain();
}
} // namespace paddle
......@@ -55,6 +55,8 @@ void TrainerInternal::init(const std::shared_ptr<TrainerConfigHelper> &config,
gradientMachine_ = gradientMachine;
if (!gradientMachine) {
CHECK(config_->getConfig().has_model_config())
<< "Missing model_config in trainer_config";
gradientMachine_.reset(GradientMachine::create(
config_->getConfig().model_config(), intconfig_->mode,
parameterUpdater_->getParameterTypes()));
......@@ -62,7 +64,8 @@ void TrainerInternal::init(const std::shared_ptr<TrainerConfigHelper> &config,
}
void TrainerInternal::trainOneBatch(int64_t batchId,
const DataBatch& dataBatch) {
const DataBatch& dataBatch,
std::vector<Argument>* outArgs) {
// true means updating parameter whenever gradient is ready during backward()
bool doPipelineUpdate =
(intconfig_->mode != GradientMachine::kSgdSparseCpuTraining) &&
......@@ -84,7 +87,6 @@ void TrainerInternal::trainOneBatch(int64_t batchId,
}
const std::vector<Argument>& inArgs = dataBatch.getStreams();
std::vector<Argument> outArgs;
PassType passType = parameterUpdater_->startBatch(actualBatchSize);
......@@ -114,7 +116,7 @@ void TrainerInternal::trainOneBatch(int64_t batchId,
timer.start();
#endif
REGISTER_TIMER("forwardBackward");
forwardBackwardBatch(inArgs, outArgs, passType, updateCallback,
forwardBackwardBatch(inArgs, *outArgs, passType, updateCallback,
doPipelineUpdate);
#ifndef PADDLE_DISABLE_TIMER
timer.stop();
......@@ -132,7 +134,7 @@ void TrainerInternal::trainOneBatch(int64_t batchId,
real cost = 0;
{
REGISTER_TIMER("sumCost");
cost = Argument::sumCosts(outArgs);
cost = Argument::sumCosts(*outArgs);
}
if (batchId % intconfig_->log_period == 0) {
......
......@@ -81,7 +81,9 @@ public:
* @param batchId current batch id
* @param dataBatch data for the batch
*/
void trainOneBatch(int64_t batchId, const DataBatch& dataBatch);
void trainOneBatch(int64_t batchId,
const DataBatch& dataBatch,
std::vector<Argument>* outArgs);
/**
* showParameterStats
......
......@@ -103,6 +103,8 @@ int main(int argc, char** argv) {
trainer.checkGradient();
} else if (FLAGS_job == "test") {
trainer.test();
} else if (FLAGS_job == "time") {
trainer.time();
} else {
LOG(FATAL) << "Unknown job type: " << FLAGS_job;
}
......
......@@ -267,7 +267,7 @@ sinclude(`ModelConfigLayer.proto.m4')
// (which is how convnets are usually trained). Setting this to
// false will untie the biases, yielding a separate bias for
// every location at which the filter is applied.
optional bool shared_biases = 8;
optional bool shared_biases = 8 [default = false];
// Valid values are ones that divide the area of the output
// grid in this convolutional layer. For example if this layer
......@@ -391,6 +391,9 @@ sinclude(`ModelConfigLayer.proto.m4')
// use to compute moving mean and variance.
optional real moving_average_fraction = 47 [default = 0.9];
// bias size
optional uint32 bias_size = 48 [default = 0];
}
message EvaluatorConfig {
......
......@@ -130,7 +130,7 @@ message OptimizationConfig {
};
message TrainerConfig {
required ModelConfig model_config = 1;
optional ModelConfig model_config = 1;
optional DataConfig data_config = 2;
required OptimizationConfig opt_config = 3;
optional DataConfig test_data_config = 4;
......
......@@ -12,3 +12,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.proto.TrainerConfig_pb2 import OptimizationConfig, TrainerConfig
from paddle.proto.ModelConfig_pb2 import ModelConfig
......@@ -632,6 +632,44 @@ class ContextProjection(Projection):
_total_pad = 0
@config_class
class ConvProjection(Projection):
type = 'conv'
def __init__(
self,
input_layer_name,
num_filters=None,
conv_conf=None,
**xargs):
super(ConvProjection, self).__init__(input_layer_name, **xargs)
if num_filters is not None:
self.proj_conf.num_filters = num_filters
parse_conv(conv_conf,
input_layer_name,
self.proj_conf.conv_conf)
# TODO: support rectangle input
self.proj_conf.output_size = (self.proj_conf.conv_conf.output_x ** 2) * num_filters
def calc_output_size(self, input_layer_config):
return self.proj_conf.output_size
def calc_parameter_size(self, input_size, output_size):
co = self.proj_conf.num_filters
ci = self.proj_conf.conv_conf.channels
fh = self.proj_conf.conv_conf.filter_size
fw = self.proj_conf.conv_conf.filter_size_y
return co * ci * fh * fw
def calc_bias_size(self):
return self.proj_conf.num_filters
def calc_parameter_dims(self, input_size, output_size):
return None
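To make the size bookkeeping above concrete, an illustrative calculation with made-up numbers (in a real config, output_x is derived by parse_conv from the image size, filter, padding and stride):

    # Illustrative numbers only.
    num_filters, channels, filter_size, output_x = 64, 32, 3, 28

    output_size = (output_x ** 2) * num_filters                        # calc_output_size
    param_size = num_filters * channels * filter_size * filter_size    # calc_parameter_size
    bias_size = num_filters                                            # calc_bias_size
    print(output_size, param_size, bias_size)                          # 50176 18432 64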
# Define a operator for mixed layer
@config_class
class Operator(Cfg):
......@@ -2528,8 +2566,15 @@ class MixedLayer(LayerBase):
record_operator_conf = self.config.operator_confs.add()
record_operator_conf.CopyFrom(operator_conf)
psize = self.config.size
if isinstance(self.inputs[0], ConvProjection):
self.config.shared_biases = True
psize = 0
for input in self.inputs:
psize += input.calc_bias_size()
self.create_bias_parameter(bias, self.config.size)
self.config.bias_size = psize
self.create_bias_parameter(bias, psize)
if error_clipping_threshold is not None:
self.config.error_clipping_threshold = error_clipping_threshold
......@@ -2547,8 +2592,10 @@ class ConcatenateLayer(LayerBase):
self,
name,
inputs,
bias=False,
**xargs):
config_assert(inputs, 'inputs cannot be empty')
config_assert(not bias, 'ConcatenateLayer cannot support bias.')
super(ConcatenateLayer, self).__init__(
name, 'concat', 0, inputs=inputs, **xargs)
size = 0
......@@ -2567,10 +2614,19 @@ class ConcatenateLayer2(LayerBase):
self,
name,
inputs,
bias=False,
**xargs):
config_assert(inputs, 'inputs cannot be empty')
super(ConcatenateLayer2, self).__init__(
name, 'concat2', 0, inputs=inputs, **xargs)
if isinstance(self.inputs[0], ConvProjection):
for input_index in xrange(len(self.inputs) - 1):
input = self.inputs[input_index + 1]
config_assert(isinstance(input, ConvProjection),
"The first input of ConcatenateLayer2 is ConvProjection, "
"the other inputs should also be ConvProjection.")
size = 0
for input_index in xrange(len(self.inputs)):
input_layer = self.get_input_layer(input_index)
......@@ -2596,6 +2652,16 @@ class ConcatenateLayer2(LayerBase):
input.proj_conf.output_size)
self.create_input_parameter(input_index, psize, dims)
psize = self.config.size
if isinstance(self.inputs[0], ConvProjection):
self.config.shared_biases = True
psize = 0
for input in self.inputs:
psize += input.calc_bias_size()
self.config.bias_size = psize
self.create_bias_parameter(bias, psize)
@config_layer('recurrent')
class RecurrentLayer(LayerBase):
def __init__(
......
......@@ -34,7 +34,7 @@ __all__ = ["full_matrix_projection", "AggregateLevel", "ExpandLevel",
"table_projection", "mixed_layer", "data_layer",
"embedding_layer", "fc_layer", "grumemory",
"pooling_layer", "lstmemory", "last_seq", "first_seq",
"cos_sim", "hsigmoid",
"cos_sim", "hsigmoid", "conv_projection",
"regression_cost", 'classification_cost', "LayerOutput",
'img_conv_layer', 'img_pool_layer', 'batch_norm_layer',
'img_cmrnorm_layer', 'addto_layer',
......@@ -54,7 +54,7 @@ __all__ = ["full_matrix_projection", "AggregateLevel", "ExpandLevel",
'cross_entropy_with_selfnorm', 'cross_entropy',
'multi_binary_label_cross_entropy',
'rank_cost', 'lambda_cost', 'huber_cost',
# 'block_expand_layer', # TODO(yuyang18): this layer is not correct
'block_expand_layer',
'maxout_layer', 'out_prod_layer', 'print_layer'
]
......@@ -1984,7 +1984,7 @@ def addto_layer(input, act=None, name=None, bias_attr=None,
@wrap_act_default(act=IdentityActivation())
@wrap_name_default("concat")
@layer_support()
def concat_layer(input, act=None, name=None, layer_attr=None):
def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
"""
Concat all input vector into one huge vector.
Inputs can be list of LayerOutput or list of projection.
......@@ -2043,10 +2043,14 @@ def concat_layer(input, act=None, name=None, layer_attr=None):
layer_type = (LayerType.CONCAT_LAYER if is_concat_layer
else LayerType.CONCAT_PROJ_LAYER)
if layer_type == LayerType.CONCAT_LAYER:
assert not bias_attr
Layer(
name=name, type=layer_type,
inputs=[x.name for x in input] if is_concat_layer else input,
active_type=act.name,
bias=ParamAttr.to_bias(bias_attr),
**ExtraLayerAttribute.to_kwargs(layer_attr)
)
......@@ -2950,6 +2954,103 @@ def conv_operator(img, filter, filter_size, num_filters,
op.origin = [img, filter]
return op
@wrap_param_attr_default()
def conv_projection(input, filter_size, num_filters,
num_channels=None, stride=1, padding=0,
filter_size_y=None, stride_y=None, padding_y=None,
groups=1, param_attr=None):
"""
ConvProjection with a layer as input.
It performs convolution on the input with the filter weights.
Different from img_conv_layer and conv_op, conv_projection is a Projection,
which can be used in mixed_layer and concat_layer. It uses cuDNN to implement
the convolution and only supports GPU mode.
The example usage is:
.. code-block:: python
proj = conv_projection(input=input1,
filter_size=3,
num_filters=64,
num_channels=64)
:param input: input layer
:type input: LayerOutput
:param filter_size: The x dimension of a filter kernel.
:type filter_size: int
:param filter_size_y: The y dimension of a filter kernel. Since
PaddlePaddle now supports rectangular filters,
the filter's shape can be (filter_size, filter_size_y).
:type filter_size_y: int
:param num_filters: channel of output data.
:type num_filters: int
:param num_channels: channel of input data.
:type num_channels: int
:param stride: The x dimension of the stride.
:type stride: int
:param stride_y: The y dimension of the stride.
:type stride_y: int
:param padding: The x dimension of padding.
:type padding: int
:param padding_y: The y dimension of padding.
:type padding_y: int
:param groups: The group number.
:type groups: int
:param param_attr: Convolution param attribute. None means default attribute
:type param_attr: ParameterAttribute
:return: A DotMulProjection Object.
:rtype: DotMulProjection
"""
if num_channels is None:
assert input.num_filters is not None
num_channels = input.num_filters
if filter_size_y is None:
if isinstance(filter_size, collections.Sequence):
assert len(filter_size) == 2
filter_size, filter_size_y = filter_size
else:
filter_size_y = filter_size
if stride_y is None:
if isinstance(stride, collections.Sequence):
assert len(stride) == 2
stride, stride_y = stride
else:
stride_y = stride
if padding_y is None:
if isinstance(padding, collections.Sequence):
assert len(padding) == 2
padding, padding_y = padding
else:
padding_y = padding
if param_attr.attr.get('initial_smart'):
# special initial for conv layers.
init_w = (2.0 / (filter_size ** 2 * num_channels)) ** 0.5
param_attr.attr["initial_mean"] = 0.0
param_attr.attr["initial_std"] = init_w
param_attr.attr["initial_strategy"] = 0
param_attr.attr["initial_smart"] = False
proj = ConvProjection(input_layer_name=input.name,
num_filters=num_filters,
conv_conf=Conv(filter_size=filter_size,
padding=padding,
stride=stride,
channels=num_channels,
filter_size_y=filter_size_y,
padding_y=padding_y,
stride_y=stride_y,
groups=groups),
**param_attr.attr)
proj.origin = input
return proj
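A hedged sketch of composing conv_projection inside mixed_layer or concat_layer; names and sizes are illustrative, and `img` is assumed to be a LayerOutput with num_filters set (e.g. from img_conv_layer):

    # Hypothetical usage, not part of this patch.
    proj3x3 = conv_projection(input=img, filter_size=3, num_filters=64,
                              num_channels=64, padding=1)
    proj5x5 = conv_projection(input=img, filter_size=5, num_filters=64,
                              num_channels=64, padding=2)
    # concat_layer with ConvProjection inputs shares the biases per filter
    # (see the ConcatenateLayer2 change above).
    feat = concat_layer(input=[proj3x3, proj5x5], bias_attr=True,
                        act=ReluActivation())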
@wrap_name_default()
@layer_support()
......@@ -3284,18 +3385,18 @@ convex_comb_layer = linear_comb_layer
@wrap_name_default()
@layer_support()
def block_expand_layer(input,
channel=0,
block_x=0,
block_y=0,
stride_x=0,
stride_y=0,
padding_x=0,
padding_y=0,
num_channels=None,
name=None,
layer_attr=None):
"""
Expand feature map to minibatch matrix.
- matrix width is: block_y * block_x * channel
- matrix width is: block_y * block_x * num_channels
- matrix height is: outputH * outputW
.. math::
......@@ -3307,7 +3408,7 @@ def block_expand_layer(input,
The expand method is the same as in ExpandConvLayer, but it saves the transposed
value. After expanding, output.sequenceStartPositions will store the timeline.
The number of time steps are outputH * outputW and the dimension of each
time step is block_y * block_x * channel. This layer can be used after
time step is block_y * block_x * num_channels. This layer can be used after
convolution neural network, and before recurrent neural network.
The simple usage is:
......@@ -3315,7 +3416,7 @@ def block_expand_layer(input,
.. code-block:: python
block_expand = block_expand_layer(input,
channel=128,
num_channels=128,
stride_x=1,
stride_y=1,
block_x=1,
......@@ -3323,8 +3424,8 @@ def block_expand_layer(input,
:param input: The input layer.
:type input: LayerOutput
:param channel: The channel number of input layer.
:type channel: int
:param num_channels: The channel number of input layer.
:type num_channels: int|None
:param block_x: The width of sub block.
:type block_x: int
:param block_y: The width of sub block.
......@@ -3344,16 +3445,18 @@ def block_expand_layer(input,
:return: LayerOutput object.
:rtype: LayerOutput
"""
if num_channels is None:
assert input.num_filters is not None
num_channels = input.num_filters
Layer(name=name,
input=Input(input.name,
block_expand=BlockExpand(channels=channel,
block_x=block_x,
block_y=block_y,
stride_x=stride_x,
stride_y=stride_y,
padding_x=padding_x,
padding_y=padding_y)
),
inputs=Input(input.name,
block_expand=BlockExpand(channels=num_channels,
block_x=block_x,
block_y=block_y,
stride_x=stride_x,
stride_y=stride_y,
padding_x=padding_x,
padding_y=padding_y)),
type=LayerType.BLOCK_EXPAND,
**ExtraLayerAttribute.to_kwargs(layer_attr)
)
......
......@@ -29,7 +29,7 @@ __all__ = ['sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool",
"img_conv_bn_pool", 'dropout_layer', 'lstmemory_group',
'lstmemory_unit', 'small_vgg', 'img_conv_group', 'vgg_16_network',
'gru_unit', 'gru_group', 'simple_gru', 'simple_attention',
'text_conv_pool',
'simple_gru2', 'bidirectional_gru', 'text_conv_pool',
'bidirectional_lstm', 'inputs', 'outputs']
......@@ -811,22 +811,37 @@ def simple_gru(input,
gru_layer_attr=None
):
"""
simple_gru is also a recurrent-layer-group version of the Gated Recurrent Unit,
like gru_group. The difference lies only in implementation details.
You may see gru_step_layer and grumemory in layers.py, and gru_unit, gru_group
and simple_gru in network.py. The reason there are so many interfaces is
that we have two ways to implement recurrent neural networks. One way is to
use one complete layer to implement the rnn (including simple rnn, gru and lstm)
over multiple time steps, such as recurrent_layer, lstmemory and grumemory. But
the multiplication operation :math:`W x_t` is not computed in these layers.
See the details in their interfaces in layers.py.
The other way is to use a recurrent group, which can assemble a
series of layers to compute the rnn step by step. This way is flexible for
attention mechanisms or other complex connections.
- gru_step_layer: computes the rnn for one step only. It needs a memory as input
and can be used in a recurrent group.
- gru_unit: a wrapper of gru_step_layer with memory.
- gru_group: a GRU cell implemented by a combination of multiple layers in a
recurrent group.
But :math:`W x_t` is not computed in the group.
- grumemory: a GRU cell implemented by one layer, which does the same calculation
as gru_group and is faster than gru_group.
- simple_gru: a complete GRU implementation including :math:`W x_t` and
gru_group. :math:`W` contains :math:`W_r`, :math:`W_z` and :math:`W`; see the
formula in grumemory.
In terms of speed, grumemory is faster than gru_group, and gru_group is
faster than simple_gru.
simple_gru does exactly the same calculation as the grumemory layer does.
Please see grumemory in layers.py for more detail about the maths.
The example usage is:
.. code-block:: python
gru = gur_group(input=[layer1],
size=256,
act=TanhActivation(),
gate_act=SigmoidActivation())
gru = simple_gru(input=[layer1], size=256)
:param input: input layer name.
:type input: LayerOutput
......@@ -863,6 +878,132 @@ def simple_gru(input,
gru_layer_attr=gru_layer_attr)
@wrap_name_default('simple_gru2')
def simple_gru2(input,
size,
name=None,
reverse=False,
mixed_param_attr=None,
mixed_bias_attr=None,
gru_param_attr=None,
gru_bias_attr=None,
act=None,
gate_act=None,
mixed_layer_attr=None,
gru_cell_attr=None
):
"""
simple_gru2 is the same as simple_gru, but it uses grumemory instead.
Please see grumemory in layers.py for more detail about the maths.
simple_gru2 is faster than simple_gru.
The example usage is:
.. code-block:: python
gru = simple_gru2(input=[layer1], size=256)
:param input: input layer name.
:type input: LayerOutput
:param name: name of the gru group.
:type name: basestring
:param size: hidden size of the gru.
:type size: int
:param reverse: whether to process the input data in a reverse order
:type reverse: bool
:param act: type of the activation
:type act: BaseActivation
:param gate_act: type of the gate activation
:type gate_act: BaseActivation
:param gru_bias_attr: bias. False means no bias, None means default bias.
:type gru_bias_attr: ParameterAttribute|False
:param gru_cell_attr: Extra parameter attribute of the gru layer.
:type gru_cell_attr: ParameterAttribute|False
:return: the gru group.
:rtype: LayerOutput
"""
with mixed_layer(name='%s_transform' % name,
size=size * 3,
bias_attr=mixed_bias_attr,
layer_attr=mixed_layer_attr) as m:
m += full_matrix_projection(input=input, param_attr=mixed_param_attr)
return grumemory(name=name,
size=size,
input=m,
reverse=reverse,
bias_attr=gru_bias_attr,
param_attr=gru_param_attr,
act=act,
gate_act=gate_act,
layer_attr=gru_cell_attr)
@wrap_name_default("bidirectional_gru")
def bidirectional_gru(input, size, name=None, return_seq=False,
fwd_mixed_param_attr=None, fwd_mixed_bias_attr=None,
fwd_gru_param_attr=None, fwd_gru_bias_attr=None,
fwd_act=None, fwd_gate_act=None,
fwd_mixed_layer_attr=None, fwd_gru_cell_attr=None,
bwd_mixed_param_attr=None, bwd_mixed_bias_attr=None,
bwd_gru_param_attr=None, bwd_gru_bias_attr=None,
bwd_act=None, bwd_gate_act=None,
bwd_mixed_layer_attr=None, bwd_gru_cell_attr=None,
last_seq_attr=None, first_seq_attr=None,
concat_attr=None, concat_act=None):
"""
A bidirectional_gru is a recurrent unit that iterates over the input
sequence both in forward and backward orders, and then concatenates the two
outputs to form the final output. However, concatenating the two outputs
is not the only way to form the final output; you can also, for example,
just add them together.
The example usage is:
.. code-block:: python
bi_gru = bidirectional_gru(input=[input1], size=512)
:param name: bidirectional gru layer name.
:type name: basestring
:param input: input layer.
:type input: LayerOutput
:param size: gru layer size.
:type size: int
:param return_seq: If set False, outputs of the last time step are
concatenated and returned.
If set True, the entire output sequences that are
processed in forward and backward directions are
concatenated and returned.
:type return_seq: bool
:return: LayerOutput object.
:rtype: LayerOutput
"""
args = locals()
fw = simple_gru2(name='%s_fw' % name, input=input, size=size,
**dict((k[len('fwd_'):], v) for k, v in args.iteritems()
if k.startswith('fwd_')))
bw = simple_gru2(name="%s_bw" % name, input=input, size=size,
reverse=True,
**dict((k[len('bwd_'):], v) for k, v in args.iteritems()
if k.startswith('bwd_')))
if return_seq:
return concat_layer(name=name, input=[fw, bw], layer_attr=concat_attr,
act=concat_act)
else:
fw_seq = last_seq(name="%s_fw_last" % name, input=fw,
layer_attr=last_seq_attr)
bw_seq = first_seq(name="%s_bw_last" % name, input=bw,
layer_attr=first_seq_attr)
return concat_layer(name=name, input=[fw_seq, bw_seq],
layer_attr=concat_attr, act=concat_act)
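As noted in the docstring, concatenation is only one way to merge the two directions; a hedged sketch of the addition alternative, wiring the two GRUs by hand (names and sizes are illustrative):

    # Hypothetical alternative to the concatenation done above.
    fw = simple_gru2(name='rnn_fw', input=[emb], size=256)
    bw = simple_gru2(name='rnn_bw', input=[emb], size=256, reverse=True)
    merged = addto_layer(input=[fw, bw], act=TanhActivation())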
@wrap_name_default("bidirectional_lstm")
def bidirectional_lstm(input, size, name=None, return_seq=False,
fwd_mat_param_attr=None, fwd_bias_param_attr=None,
......@@ -893,7 +1034,7 @@ def bidirectional_lstm(input, size, name=None, return_seq=False,
.. code-block:: python
lstm_step = bidirectional_lstm(input=[input1], size=512)
bi_lstm = bidirectional_lstm(input=[input1], size=512)
:param name: bidirectional lstm layer name.
:type name: basestring
......@@ -907,7 +1048,7 @@ def bidirectional_lstm(input, size, name=None, return_seq=False,
processed in forward and backward directions are
concatenated and returned.
:type return_seq: bool
:return: lstm layer name.
:return: LayerOutput object according to the return_seq.
:rtype: LayerOutput
"""
args = locals()
......
86c0815275a9d5eb902e23c6a592f58a img_layers.protostr
a5d9259ff1fd7ca23d0ef090052cb1f2 last_first_seq.protostr
9c038249ec8ff719753a746cdb04c026 layer_activations.protostr
5913f87b39cee3b2701fa158270aca26 projections.protostr
34e04043cbb12931c47fa44ec50eeffc projections.protostr
7334ba0a4544f0623231330fc51d390d shared_fc.protostr
8b8b6bb128a7dfcc937be86145f53e2f shared_lstm.protostr
bb8e233b05b8e07f9ed386b7aee4f2c6 shared_lstm.protostr
6b39e34beea8dfb782bee9bd3dea9eb5 simple_rnn_layers.protostr
f98e79e1630d5eb827c300e64836d269 test_bi_grumemory.protostr
0fc1409600f1a3301da994ab9d28b0bf test_cost_layers.protostr
6cd5f28a3416344f20120698470e0a4c test_cost_layers_with_weight.protostr
144bc6d3a509de74115fa623741797ed test_expand_layer.protostr
......@@ -12,10 +13,10 @@ a5d9259ff1fd7ca23d0ef090052cb1f2 last_first_seq.protostr
8bb44e1e5072d0c261572307e7672bda test_grumemory_layer.protostr
1f3510672dce7a9ed25317fc58579ac7 test_hsigmoid.protostr
d350bd91a0dc13e854b1364c3d9339c6 test_lstmemory_layer.protostr
6fa59551808ee7012bbd24f757e782d2 test_maxout.protostr
5433ed33d4e7414eaf658f2a55946186 test_maxout.protostr
251a948ba41c1071afcd3d9cf9c233f7 test_ntm_layers.protostr
e6ff04e70aea27c7b06d808cc49c9497 test_print_layer.protostr
2a75dd33b640c49a8821c2da6e574577 test_rnn_group.protostr
fded24727338fb8ce44d9951ed8aea08 test_rnn_group.protostr
67d6fde3afb54f389d0ce4ff14726fe1 test_sequence_pooling.protostr
f586a548ef4350ba1ed47a81859a64cb unused_layers.protostr
8122477f4f65244580cec09edc590041 util_layers.protostr
f937a5a6e7e8864b4d8cf56b0f7c7f44 util_layers.protostr
......@@ -9,7 +9,7 @@ test_sequence_pooling test_lstmemory_layer test_grumemory_layer
last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
img_layers util_layers simple_rnn_layers unused_layers test_cost_layers
test_rnn_group shared_fc shared_lstm test_cost_layers_with_weight
test_maxout)
test_maxout test_bi_grumemory)
for conf in ${configs[*]}
......
from paddle.trainer_config_helpers import *
settings(
batch_size=1000,
learning_rate=1e-4
)
din = data_layer(name='data', size=120)
outputs(bidirectional_gru(input=din, size=40, return_seq=True))
......@@ -25,6 +25,25 @@ pool = img_pool_layer(input=maxout,
stride=2,
pool_type=MaxPooling())
fc = fc_layer(input=pool, size=384, bias_attr=False)
conv2 = img_conv_layer(input=pool,
filter_size = 3,
num_channels=32,
num_filters=128,
padding=1,
act=LinearActivation(),
bias_attr=True)
maxout2 = maxout_layer(input=conv,
num_channels=128,
groups=4)
block = block_expand_layer(input=maxout,
num_channels=32,
stride_x=1,
stride_y=1,
block_x=1,
block_y=6)
fc = fc_layer(input=block, size=384, bias_attr=False)
outputs(fc)