diff --git a/.travis.yml b/.travis.yml
index bf0e0b7bbddd4c1f69e287e0f5ad471a54a75600..7812ac02837895a32fcad36158814268e93a4da8 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -35,6 +35,8 @@ addons:
       - libgoogle-glog-dev
       - libgflags-dev
       - libgtest-dev
+      - curl
+      - lcov
       - graphviz
 before_install:
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4613155f7700b25b2a8d7c250832722085b332fa..282e3e199ef440092550deec906019bc44bc73bd 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -3,13 +3,13 @@ cmake_minimum_required(VERSION 2.8)
 project(paddle CXX C)
 set(PADDLE_MAJOR_VERSION 0)
 set(PADDLE_MINOR_VERSION 8)
-set(PADDLE_PATCH_VERSION 0b2)
+set(PADDLE_PATCH_VERSION 0b3)
 set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})
 
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
 set(PROJ_ROOT ${CMAKE_SOURCE_DIR})
 include(package)
-include(swig)
+find_package(SWIG 2.0)
 find_package(CUDA QUIET)
 find_package(Protobuf REQUIRED)
 find_package(PythonLibs 2.7 REQUIRED)
@@ -40,6 +40,9 @@ option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND})
 option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
 option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ${SWIG_FOUND})
 option(ON_TRAVIS "Running test on travis-ci or not." OFF)
+option(ON_COVERALLS "Generating code coverage data on coveralls or not." OFF)
+option(COVERALLS_UPLOAD "Uploading the generated coveralls json." ON)
+
 if(NOT CMAKE_BUILD_TYPE)
     set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
         "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
@@ -49,11 +52,16 @@ endif()
 include(enableCXX11)
 include(cpplint)
 include(ccache)
+if(WITH_RDMA)
+    include(rdma)
+endif()
 include(util)
 include(flags)
 include(cudnn)
 include(FindPythonModule)
 include(check_packages)
+include(swig)
+include(coveralls)
 
 # add PaddlePaddle version
 if(DEFINED ENV{PADDLE_VERSION})
@@ -129,12 +137,15 @@ else(WITH_PYTHON)
     add_definitions(-DPADDLE_NO_PYTHON)
 endif(WITH_PYTHON)
 
-if(NOT WITH_RDMA)
-    add_definitions(-DPADDLE_DISABLE_RDMA)
-endif()
+if(WITH_RDMA)
+    include_directories("${RDMA_INC_DIR}")
+else(WITH_RDMA)
+    add_definitions(-DPADDLE_DISABLE_RDMA)
+endif(WITH_RDMA)
 
 if(WITH_GLOG)
     add_definitions(-DPADDLE_USE_GLOG)
+    include_directories(${LIBGLOG_INCLUDE_DIR})
 endif()
 
 if(WITH_GFLAGS)
diff --git a/ISSUE_TEMPLATE.md b/ISSUE_TEMPLATE.md
new file mode 100644
index 0000000000000000000000000000000000000000..b70d66dc259afbad0243895fbc2a57ad5c071488
--- /dev/null
+++ b/ISSUE_TEMPLATE.md
@@ -0,0 +1,14 @@
+Thank you for contributing to PaddlePaddle. Submitting an issue is a great help for us.
+Both Chinese and English issues are welcome.
+
+It's hard to solve a problem when important details are missing.
+Please go over the following criteria before handing your request in.
+
+- [ ] Was a similar issue submitted or resolved before? You can search the existing issues on GitHub.
+- [ ] Did you look for your issue on widespread search engines?
+- [ ] Is my description of the issue clear enough to reproduce this problem?
+    * If some errors occurred, we need details about `how do you run your code?`, `what system do you use?`, `Are you using GPU or not?`, etc.
+    * If you use a recording such as [asciinema](https://asciinema.org/) to show what you are doing to make it happen, that's awesome! We could help you solve the problem more quickly.
+- [ ] Does my description of the issue use GitHub markdown correctly?
+    * Please use the proper markdown syntax for styling all forms of writing, e.g., source code, error information, etc.
+    * Check out [this page](https://guides.github.com/features/mastering-markdown/) to find out much more about markdown.
diff --git a/README.md b/README.md
index 1cc0444c0617af3da0ec1d9beaf2ae73e31bd7b2..66767d7ff8e4acf8ef246f7e0129a66e64486727 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,10 @@
 # PaddlePaddle
 
-| **`Linux`** | **`License`** | **`Chat Room`** |
-|----------------|---------------|-----------------|
-|[![Build Status](https://travis-ci.org/baidu/Paddle.svg?branch=master)](https://travis-ci.org/baidu/Paddle)|[![License](https://img.shields.io/badge/license-Apache%202.0-green.svg)](LICENSE)|[![Join the chat at https://gitter.im/PaddlePaddle/Deep_Learning](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/PaddlePaddle/Deep_Learning?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)|
+
+[![Build Status](https://travis-ci.org/baidu/Paddle.svg?branch=master)](https://travis-ci.org/baidu/Paddle)
+[![Coverage Status](https://coveralls.io/repos/github/baidu/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/baidu/Paddle?branch=develop)
+[![Join the chat at https://gitter.im/PaddlePaddle/Deep_Learning](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/PaddlePaddle/Deep_Learning?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
+[![License](https://img.shields.io/badge/license-Apache%202.0-green.svg)](LICENSE)
 
 Welcome to the PaddlePaddle GitHub.
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 57c32a54cd727e3acb181eeb19f811fab4dc82fd..685334c6585060c0344e552c6f3fda2c7324de03 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -1,4 +1,4 @@
-# Find the CBlas libraries
+# Find the CBlas and lapack libraries
 #
 # It will search MKL, atlas, OpenBlas, reference-cblas in order.
 #
@@ -19,6 +19,8 @@ set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains MKL")
 
 find_path(MKL_INCLUDE_DIR mkl.h PATHS
   ${MKL_ROOT}/include)
+find_path(MKL_INCLUDE_DIR mkl_lapacke.h PATHS
+  ${MKL_ROOT}/include)
 find_library(MKL_CORE_LIB NAMES mkl_core PATHS
   ${MKL_ROOT}/lib
   ${MKL_ROOT}/lib/intel64)
@@ -37,6 +39,7 @@ if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
     ${MKL_SEQUENTIAL_LIB}
     ${MKL_CORE_LIB})
   add_definitions(-DPADDLE_USE_MKL)
+  message(STATUS "Found MKL (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
   return() # return file.
 endif()
@@ -55,15 +58,19 @@ set(ATLAS_LIB_SEARCH_PATHS
 )
 find_path(ATLAS_INC_DIR NAMES cblas.h
   PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
+find_path(ATLAS_CLAPACK_INC_DIR NAMES clapack.h
+  PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
 find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3
   PATHS ${ATLAS_LIB_SEARCH_PATHS})
-find_library(ATLAS_LIB NAMES atlas libatlas.so.3
+find_library(ATLAS_LIB NAMES lapack_atlas liblapack_atlas.so.3
   PATHS ${ATLAS_LIB_SEARCH_PATHS})
 
 if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB)
   set(CBLAS_PROVIDER ATLAS)
-  set(CBLAS_INC_DIR ${ATLAS_INC_DIR})
+  set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
   set(CBLAS_LIBS ${ATLAS_LIB} ${ATLAS_CBLAS_LIB})
+  add_definitions(-DPADDLE_USE_ATLAS)
+  message(STATUS "Found Atlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
   return()
 endif()
 
@@ -83,6 +90,8 @@ set(OPENBLAS_LIB_SEARCH_PATHS
 
 find_path(OPENBLAS_INC_DIR NAMES cblas.h
   PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
+find_path(OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h
+  PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
 find_library(OPENBLAS_LIB NAMES openblas
   PATHS ${OPENBLAS_LIB_SEARCH_PATHS})
 
@@ -90,6 +99,7 @@ if(OPENBLAS_INC_DIR AND OPENBLAS_LIB)
   set(CBLAS_PROVIDER OPENBLAS)
   set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR})
   set(CBLAS_LIBS ${OPENBLAS_LIB})
+  message(STATUS "Found OpenBlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
   return()
 endif()
diff --git a/cmake/coveralls.cmake b/cmake/coveralls.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..9be7643819efdde3f42e4d39b2849ecc17e0d9fb
--- /dev/null
+++ b/cmake/coveralls.cmake
@@ -0,0 +1,103 @@
+# CMake script for code coverage.
+# If _COVERALLS_UPLOAD is ON, it will upload the json files to coveralls.io automatically.
+
+# Param _COVERAGE_SRCS A list of coverage source files.
+# Param _COVERALLS_UPLOAD Upload the result to coveralls.
+# Param _CMAKE_SCRIPT_PATH CMake script path.
+function(code_coverage _COVERAGE_SRCS _COVERALLS_UPLOAD _CMAKE_SCRIPT_PATH)
+    # clean previous gcov data.
+    file(REMOVE_RECURSE ${PROJECT_BINARY_DIR}/*.gcda)
+
+    # find curl, which is used to upload the JSON later.
+    if (_COVERALLS_UPLOAD)
+        find_program(CURL_EXECUTABLE curl)
+        if (NOT CURL_EXECUTABLE)
+            message(FATAL_ERROR "Coveralls: curl not found!")
+        endif()
+    endif()
+
+    # When passing a CMake list to an external process, the list
+    # will be converted from the format "1;2;3" to "1 2 3".
+    set(COVERAGE_SRCS "")
+    foreach (SINGLE_SRC ${_COVERAGE_SRCS})
+        set(COVERAGE_SRCS "${COVERAGE_SRCS}*${SINGLE_SRC}")
+    endforeach()
+
+    # query number of logical cores
+    cmake_host_system_information(RESULT core_size QUERY NUMBER_OF_LOGICAL_CORES)
+    # coveralls json file.
+    set(COVERALLS_FILE ${PROJECT_BINARY_DIR}/coveralls.json)
+    add_custom_target(coveralls_generate
+        # Run regress tests.
+        COMMAND ${CMAKE_CTEST_COMMAND}
+        -j ${core_size}
+        --output-on-failure
+        # Generate Gcov and translate it into coveralls JSON.
+        COMMAND ${CMAKE_COMMAND}
+        -DCOVERAGE_SRCS="${COVERAGE_SRCS}"
+        -DCOVERALLS_OUTPUT_FILE="${COVERALLS_FILE}"
+        -DCOV_PATH="${PROJECT_BINARY_DIR}"
+        -DPROJECT_ROOT="${PROJECT_SOURCE_DIR}"
+        -P "${_CMAKE_SCRIPT_PATH}/coverallsGcovJsons.cmake"
+        WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
+        COMMENT "Coveralls: generating coveralls output..."
+    )
+
+    if (_COVERALLS_UPLOAD)
+        message("COVERALLS UPLOAD: ON")
+        # Upload the JSON to coveralls.
+        add_custom_target(coveralls_upload
+            COMMAND ${CURL_EXECUTABLE}
+            -S -F json_file=@${COVERALLS_FILE}
+            https://coveralls.io/api/v1/jobs
+            DEPENDS coveralls_generate
+            WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
+            COMMENT "Coveralls: uploading coveralls output...")
+
+        add_custom_target(coveralls DEPENDS coveralls_upload)
+    else()
+        message("COVERALLS UPLOAD: OFF")
+        add_custom_target(coveralls DEPENDS coveralls_generate)
+    endif()
+endfunction()
+
+if(ON_COVERALLS)
+    set(CMAKE_BUILD_TYPE "Debug")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
+
+    set(EXCLUDE_DIRS
+        "demo/"
+        "build/"
+        "tests/"
+        ".test_env/"
+    )
+
+    if(WITH_GPU)
+        file(GLOB_RECURSE PADDLE_SOURCES RELATIVE "${PROJECT_SOURCE_DIR}" "*.cpp" "*.cc" "*.c" "*.cu")
+    else()
+        file(GLOB_RECURSE PADDLE_SOURCES RELATIVE "${PROJECT_SOURCE_DIR}" "*.cpp" "*.cc" "*.c")
+    endif()
+
+    # exclude trivial files in PADDLE_SOURCES
+    foreach(EXCLUDE_DIR ${EXCLUDE_DIRS})
+        foreach(TMP_PATH ${PADDLE_SOURCES})
+            string(FIND ${TMP_PATH} ${EXCLUDE_DIR} EXCLUDE_DIR_FOUND)
+            if(NOT ${EXCLUDE_DIR_FOUND} EQUAL -1)
+                list(REMOVE_ITEM PADDLE_SOURCES ${TMP_PATH})
+            endif()
+        endforeach(TMP_PATH)
+    endforeach()
+
+    # convert to absolute path
+    set(PADDLE_SRCS "")
+    foreach(PADDLE_SRC ${PADDLE_SOURCES})
+        set(PADDLE_SRCS "${PADDLE_SRCS};${PROJECT_SOURCE_DIR}/${PADDLE_SRC}")
+    endforeach()
+
+    code_coverage(
+        "${PADDLE_SRCS}"
+        ${COVERALLS_UPLOAD}
+        "${PROJECT_SOURCE_DIR}/cmake"
+    )
+endif()
diff --git a/cmake/coverallsGcovJsons.cmake b/cmake/coverallsGcovJsons.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..ae3530c3a0eeb79ddbcbf4f2e99be75aa7968a2f
--- /dev/null
+++ b/cmake/coverallsGcovJsons.cmake
@@ -0,0 +1,403 @@
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+# Copyright (C) 2014 Joakim Söderberg
+#
+# This is intended to be run by a custom target in a CMake project like this.
+# 0. Compile program with coverage support.
+# 1. Clear coverage data. (Recursively delete *.gcda in build dir)
+# 2. Run the unit tests.
+# 3. Run this script specifying which source files the coverage should be performed on.
+#
+# This script will then use gcov to generate .gcov files in the directory specified
+# via the COV_PATH var. This should probably be the same as your cmake build dir.
+#
+# It then parses the .gcov files to convert them into the Coveralls JSON format:
+# https://coveralls.io/docs/api
+#
+
+CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
+
+# Since it's not possible to pass a CMake list properly in the
+# "1;2;3" format to an external process, we have replaced the
+# ";" with "*", so reverse that here so we get it back into the
+# CMake list format.
+string(REGEX REPLACE "\\*" ";" COVERAGE_SRCS ${COVERAGE_SRCS})
+
+find_program(GCOV_EXECUTABLE gcov)
+if (NOT GCOV_EXECUTABLE)
+    message(FATAL_ERROR "gcov not found! Aborting...")
+endif()
+
+find_package(Git)
+
+# TODO: Add these git things to the coveralls json.
+if (GIT_FOUND)
+    # Branch.
+    execute_process(
+        COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref HEAD
+        WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+        OUTPUT_VARIABLE GIT_BRANCH
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+    )
+
+    macro (git_log_format FORMAT_CHARS VAR_NAME)
+        execute_process(
+            COMMAND ${GIT_EXECUTABLE} log -1 --pretty=format:%${FORMAT_CHARS}
+            WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
+            OUTPUT_VARIABLE ${VAR_NAME}
+            OUTPUT_STRIP_TRAILING_WHITESPACE
+        )
+    endmacro()
+
+    git_log_format(an GIT_AUTHOR_NAME)
+    git_log_format(ae GIT_AUTHOR_EMAIL)
+    git_log_format(cn GIT_COMMITTER_NAME)
+    git_log_format(ce GIT_COMMITTER_EMAIL)
+    git_log_format(B GIT_COMMIT_MESSAGE)
+
+    message("Git exe: ${GIT_EXECUTABLE}")
+    message("Git branch: ${GIT_BRANCH}")
+    message("Git author: ${GIT_AUTHOR_NAME}")
+    message("Git e-mail: ${GIT_AUTHOR_EMAIL}")
+    message("Git committer name: ${GIT_COMMITTER_NAME}")
+    message("Git committer e-mail: ${GIT_COMMITTER_EMAIL}")
+    message("Git commit message: ${GIT_COMMIT_MESSAGE}")
+
+endif()
+
+############################# Macros #########################################
+
+#
+# This macro converts from the full path format gcov outputs:
+#
+#   /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
+#
+# to the original source file path the .gcov is for:
+#
+#   /path/to/project/root/subdir/the_file.c
+#
+macro(get_source_path_from_gcov_filename _SRC_FILENAME _GCOV_FILENAME)
+
+    # /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
+    # ->
+    # #path#to#project#root#subdir#the_file.c.gcov
+    get_filename_component(_GCOV_FILENAME_WEXT ${_GCOV_FILENAME} NAME)
+
+    # #path#to#project#root#subdir#the_file.c.gcov -> /path/to/project/root/subdir/the_file.c
+    string(REGEX REPLACE "\\.gcov$" "" SRC_FILENAME_TMP ${_GCOV_FILENAME_WEXT})
+    string(REGEX REPLACE "\#" "/" SRC_FILENAME_TMP ${SRC_FILENAME_TMP})
+    set(${_SRC_FILENAME} "${SRC_FILENAME_TMP}")
+endmacro()
+
+##############################################################################
+
+# Get the coverage data.
+file(GLOB_RECURSE GCDA_FILES "${COV_PATH}" "*.gcda")
+message("GCDA files:")
+
+# Get a list of all the object directories needed by gcov
+# (The directories the .gcda files and .o files are found in)
+# and run gcov on those.
+foreach(GCDA ${GCDA_FILES})
+    message("Process: ${GCDA}")
+    message("------------------------------------------------------------------------------")
+    get_filename_component(GCDA_DIR ${GCDA} PATH)
+
+    #
+    # The -p below refers to "Preserve path components".
+    # This means that the generated gcov filename of a source file will
+    # keep the original file's entire filepath, but / is replaced with #.
+    # Example:
+    #
+    # /path/to/project/root/build/CMakeFiles/the_file.dir/subdir/the_file.c.gcda
+    # ------------------------------------------------------------------------------
+    # File '/path/to/project/root/subdir/the_file.c'
+    # Lines executed:68.34% of 199
+    # /path/to/project/root/subdir/the_file.c:creating '#path#to#project#root#subdir#the_file.c.gcov'
+    #
+    # If -p is not specified then the file is named only "the_file.c.gcov"
+    #
+    execute_process(
+        COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA}
+        WORKING_DIRECTORY ${GCDA_DIR}
+    )
+endforeach()
+
+# TODO: Make these absolute paths
+file(GLOB_RECURSE ALL_GCOV_FILES "${COV_PATH}" "*.gcov")
+
+# Get only the filenames to use for filtering.
+#set(COVERAGE_SRCS_NAMES "")
+#foreach (COVSRC ${COVERAGE_SRCS})
+#    get_filename_component(COVSRC_NAME ${COVSRC} NAME)
+#    message("${COVSRC} -> ${COVSRC_NAME}")
+#    list(APPEND COVERAGE_SRCS_NAMES "${COVSRC_NAME}")
+#endforeach()
+
+#
+# Filter out all but the gcov files we want.
+#
+# We do this by comparing the list of COVERAGE_SRCS filepaths that the
+# user wants the coverage data for with the paths of the generated .gcov files,
+# so that we only keep the relevant gcov files.
+#
+# Example:
+#   COVERAGE_SRCS =
+#       /path/to/project/root/subdir/the_file.c
+#
+#   ALL_GCOV_FILES =
+#       /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
+#       /path/to/project/root/build/#path#to#project#root#subdir#other_file.c.gcov
+#
+#   Result should be:
+#   GCOV_FILES =
+#       /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
+#
+set(GCOV_FILES "")
+#message("Look in coverage sources: ${COVERAGE_SRCS}")
+message("\nFilter out unwanted GCOV files:")
+message("===============================")
+
+set(COVERAGE_SRCS_REMAINING ${COVERAGE_SRCS})
+
+foreach (GCOV_FILE ${ALL_GCOV_FILES})
+
+    #
+    # /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
+    # ->
+    # /path/to/project/root/subdir/the_file.c
+    get_source_path_from_gcov_filename(GCOV_SRC_PATH ${GCOV_FILE})
+
+    # Is this in the list of source files?
+    # TODO: We want to match against relative path filenames from the source file root...
+    list(FIND COVERAGE_SRCS ${GCOV_SRC_PATH} WAS_FOUND)
+
+    if (NOT WAS_FOUND EQUAL -1)
+        message("YES: ${GCOV_FILE}")
+        list(APPEND GCOV_FILES ${GCOV_FILE})
+
+        # We remove it from the list, so we don't bother searching for it again.
+        # Files left in COVERAGE_SRCS_REMAINING after this loop will still get
+        # coverage JSON generated for them later, with no lines covered.
+        list(REMOVE_ITEM COVERAGE_SRCS_REMAINING ${GCOV_SRC_PATH})
+    else()
+        message("NO: ${GCOV_FILE}")
+    endif()
+endforeach()
+
+# TODO: Enable setting these
+set(JSON_SERVICE_NAME "travis-ci")
+set(JSON_SERVICE_JOB_ID $ENV{TRAVIS_JOB_ID})
+
+set(JSON_TEMPLATE
+"{
+  \"service_name\": \"\@JSON_SERVICE_NAME\@\",
+  \"service_job_id\": \"\@JSON_SERVICE_JOB_ID\@\",
+  \"source_files\": \@JSON_GCOV_FILES\@
+}"
+)
+
+set(SRC_FILE_TEMPLATE
+"{
+      \"name\": \"\@GCOV_SRC_REL_PATH\@\",
+      \"source_digest\": \"\@GCOV_CONTENTS_MD5\@\",
+      \"coverage\": \@GCOV_FILE_COVERAGE\@
+  }"
+)
+
+message("\nGenerate JSON for files:")
+message("=========================")
+
+set(JSON_GCOV_FILES "[")
+
+# Read the GCOV files line by line and get the coverage data.
+foreach (GCOV_FILE ${GCOV_FILES})
+
+    get_source_path_from_gcov_filename(GCOV_SRC_PATH ${GCOV_FILE})
+    file(RELATIVE_PATH GCOV_SRC_REL_PATH "${PROJECT_ROOT}" "${GCOV_SRC_PATH}")
+
+    # The new coveralls API doesn't need the entire source (Yay!)
+    # However, still keeping that part for now. Will clean it up in the future.
+    file(MD5 "${GCOV_SRC_PATH}" GCOV_CONTENTS_MD5)
+    message("MD5: ${GCOV_SRC_PATH} = ${GCOV_CONTENTS_MD5}")
+
+    # Loads the gcov file as a list of lines.
+    # (We first open the file and replace all occurrences of [] with _
+    # because CMake will fail to parse a line containing unmatched brackets...
+    # also the \ to escaped \n in macros screws up things.)
+    # https://public.kitware.com/Bug/view.php?id=15369
+    file(READ ${GCOV_FILE} GCOV_CONTENTS)
+    string(REPLACE "[" "_" GCOV_CONTENTS "${GCOV_CONTENTS}")
+    string(REPLACE "]" "_" GCOV_CONTENTS "${GCOV_CONTENTS}")
+    string(REPLACE "\\" "_" GCOV_CONTENTS "${GCOV_CONTENTS}")
+    file(WRITE ${GCOV_FILE}_tmp "${GCOV_CONTENTS}")
+
+    file(STRINGS ${GCOV_FILE}_tmp GCOV_LINES)
+    list(LENGTH GCOV_LINES LINE_COUNT)
+
+    # Instead of trying to parse the source from the
+    # gcov file, simply read the file contents from the source file.
+    # (Parsing it from the gcov is hard because C-code uses ; in many places
+    # which also happens to be the same as the CMake list delimiter).
+    file(READ ${GCOV_SRC_PATH} GCOV_FILE_SOURCE)
+
+    string(REPLACE "\\" "\\\\" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
+    string(REGEX REPLACE "\"" "\\\\\"" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
+    string(REPLACE "\t" "\\\\t" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
+    string(REPLACE "\r" "\\\\r" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
+    string(REPLACE "\n" "\\\\n" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
+    # According to http://json.org/ these should be escaped as well.
+    # Don't know how to do that in CMake however...
+    #string(REPLACE "\b" "\\\\b" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
+    #string(REPLACE "\f" "\\\\f" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
+    #string(REGEX REPLACE "\u([a-fA-F0-9]{4})" "\\\\u\\1" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
+
+    # We want a json array of coverage data as a single string
+    # start building them from the contents of the .gcov
+    set(GCOV_FILE_COVERAGE "[")
+
+    set(GCOV_LINE_COUNT 1) # Line number for the .gcov.
+    set(DO_SKIP 0)
+    foreach (GCOV_LINE ${GCOV_LINES})
+        #message("${GCOV_LINE}")
+        # Example of what we're parsing:
+        # Hitcount  |Line | Source
+        # "        8:   26:        if (!allowed || (strlen(allowed) == 0))"
+        string(REGEX REPLACE
+            "^([^:]*):([^:]*):(.*)$"
+            "\\1;\\2;\\3"
+            RES
+            "${GCOV_LINE}")
+
+        # Check if we should exclude lines using the Lcov syntax.
+        string(REGEX MATCH "LCOV_EXCL_START" START_SKIP "${GCOV_LINE}")
+        string(REGEX MATCH "LCOV_EXCL_END" END_SKIP "${GCOV_LINE}")
+        string(REGEX MATCH "LCOV_EXCL_LINE" LINE_SKIP "${GCOV_LINE}")
+
+        set(RESET_SKIP 0)
+        if (LINE_SKIP AND NOT DO_SKIP)
+            set(DO_SKIP 1)
+            set(RESET_SKIP 1)
+        endif()
+
+        if (START_SKIP)
+            set(DO_SKIP 1)
+            message("${GCOV_LINE_COUNT}: Start skip")
+        endif()
+
+        if (END_SKIP)
+            set(DO_SKIP 0)
+        endif()
+
+        list(LENGTH RES RES_COUNT)
+
+        if (RES_COUNT GREATER 2)
+            list(GET RES 0 HITCOUNT)
+            list(GET RES 1 LINE)
+            list(GET RES 2 SOURCE)
+
+            string(STRIP ${HITCOUNT} HITCOUNT)
+            string(STRIP ${LINE} LINE)
+
+            # Lines with 0 line numbers are metadata and can be ignored.
+            if (NOT ${LINE} EQUAL 0)
+
+                if (DO_SKIP)
+                    set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}null, ")
+                else()
+                    # Translate the hitcount into valid JSON values.
+                    if (${HITCOUNT} STREQUAL "#####")
+                        set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}0, ")
+                    elseif (${HITCOUNT} STREQUAL "-")
+                        set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}null, ")
+                    else()
+                        set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}${HITCOUNT}, ")
+                    endif()
+                endif()
+            endif()
+        else()
+            message(WARNING "Failed to properly parse line (RES_COUNT = ${RES_COUNT}) ${GCOV_FILE}:${GCOV_LINE_COUNT}\n-->${GCOV_LINE}")
+        endif()
+
+        if (RESET_SKIP)
+            set(DO_SKIP 0)
+        endif()
+        math(EXPR GCOV_LINE_COUNT "${GCOV_LINE_COUNT}+1")
+    endforeach()
+
+    message("${GCOV_LINE_COUNT} of ${LINE_COUNT} lines read!")
+
+    # Advanced way of removing the trailing comma in the JSON array.
+    # "[1, 2, 3, " -> "[1, 2, 3"
+    string(REGEX REPLACE ",[ ]*$" "" GCOV_FILE_COVERAGE ${GCOV_FILE_COVERAGE})
+
+    # Append the trailing ] to complete the JSON array.
+    set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]")
+
+    # Generate the final JSON for this file.
+    message("Generate JSON for file: ${GCOV_SRC_REL_PATH}...")
+    string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON)
+
+    set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ")
+endforeach()
+
+# Loop through all files we couldn't find any coverage for
+# and generate JSON for those as well, with 0% coverage.
+foreach(NOT_COVERED_SRC ${COVERAGE_SRCS_REMAINING})
+
+    # Loads the source file as a list of lines.
+    file(STRINGS ${NOT_COVERED_SRC} SRC_LINES)
+
+    set(GCOV_FILE_COVERAGE "[")
+    set(GCOV_FILE_SOURCE "")
+
+    foreach (SOURCE ${SRC_LINES})
+        set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}0, ")
+
+        string(REPLACE "\\" "\\\\" SOURCE "${SOURCE}")
+        string(REGEX REPLACE "\"" "\\\\\"" SOURCE "${SOURCE}")
+        string(REPLACE "\t" "\\\\t" SOURCE "${SOURCE}")
+        string(REPLACE "\r" "\\\\r" SOURCE "${SOURCE}")
+        set(GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}${SOURCE}\\n")
+    endforeach()
+
+    # Remove trailing comma, and complete JSON array with ]
+    string(REGEX REPLACE ",[ ]*$" "" GCOV_FILE_COVERAGE ${GCOV_FILE_COVERAGE})
+    set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]")
+
+    # Generate the final JSON for this file.
+    message("Generate JSON for non-gcov file: ${NOT_COVERED_SRC}...")
+    string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON)
+    set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ")
+endforeach()
+
+# Get rid of trailing comma.
+string(REGEX REPLACE ",[ ]*$" "" JSON_GCOV_FILES ${JSON_GCOV_FILES})
+set(JSON_GCOV_FILES "${JSON_GCOV_FILES}]")
+
+# Generate the final complete JSON!
+message("Generate final JSON...") +string(CONFIGURE ${JSON_TEMPLATE} JSON) + +file(WRITE "${COVERALLS_OUTPUT_FILE}" "${JSON}") +message("###########################################################################") +message("Generated coveralls JSON containing coverage data:") +message("${COVERALLS_OUTPUT_FILE}") +message("###########################################################################") diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 8c5cb4cc49d934143a6825aceab60c4a7ed0930a..337db53b37fa378abef9f43c745047df0ae92212 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -21,12 +21,6 @@ function(safe_set_flag is_c src_list flag_name) endif() if(${safe_name}) set(${src_list} "${${src_list}} ${flag_name}" PARENT_SCOPE) - if(is_c) - set(CUDA_NVCC_FLAGS - --compiler-options;${flag_name} - ${CUDA_NVCC_FLAGS} - PARENT_SCOPE) - endif() endif() endfunction() @@ -40,6 +34,20 @@ macro(safe_set_cxxflag src_list flag_name) safe_set_flag(OFF ${src_list} ${flag_name}) endmacro() +# helper macro to set nvcc flag +macro(safe_set_nvflag flag_name) + string(REPLACE "-" "_" safe_name ${flag_name}) + string(REPLACE "=" "_" safe_name ${safe_name}) + CHECK_C_COMPILER_FLAG(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name}) + set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name}) + if(${safe_name}) + set(CUDA_NVCC_FLAGS + --compiler-options;${flag_name} + ${CUDA_NVCC_FLAGS}) + endif() +endmacro() + + CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS) if(NOT UINT64_MAX_EXISTS) set(CMAKE_REQUIRED_DEFINITIONS -D__STDC_LIMIT_MACROS) @@ -63,20 +71,43 @@ set(COMMON_FLAGS -Wnon-virtual-dtor -Wdelete-non-virtual-dtor -Wno-unused-parameter + -Wno-unused-function + -Wno-error=literal-suffix + -Wno-error=unused-local-typedefs) + +set(GPU_COMMON_FLAGS + -fPIC + -fno-omit-frame-pointer + -Wnon-virtual-dtor + -Wdelete-non-virtual-dtor + -Wno-unused-parameter + -Wno-unused-function -Wno-error=literal-suffix -Wno-error=unused-local-typedefs -Wno-error=unused-function # Warnings in Numpy Header. ) +if (APPLE) + # On Mac OS X build fat binaries with x86_64 architectures by default. + set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) +else() + set(GPU_COMMON_FLAGS + -Wall + -Wextra + -Werror + ${GPU_COMMON_FLAGS}) +endif() + + foreach(flag ${COMMON_FLAGS}) safe_set_cflag(CMAKE_C_FLAGS ${flag}) safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag}) endforeach() -# On Mac OS X build fat binaries with x86_64 architectures by default. -if (APPLE) - set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE) -endif () +foreach(flag ${GPU_COMMON_FLAGS}) + safe_set_nvflag(${flag}) +endforeach() + # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. # So, don't set these flags here. diff --git a/cmake/rdma.cmake b/cmake/rdma.cmake new file mode 100644 index 0000000000000000000000000000000000000000..e9a4da79aa92a92aa7e5d21bb795ab9aaf60ab8b --- /dev/null +++ b/cmake/rdma.cmake @@ -0,0 +1,76 @@ +# user should download rdma first from subversion repository + +# execute following instruction to download svn mannally +# svn co https://svn.baidu.com/sys/ip/trunk/rdma/sockrdmav1 rdma/ +# svn co https://svn.baidu.com/sys/ip/trunk/rdma/thirdparty rdma/ +# we use static output in svn repositories to avoid implict bugs from not standard runtime env. 
+
+set(RDMA_ROOT $ENV{RDMA_ROOT} CACHE PATH "Folder contains RDMA sock library and thirdparty library")
+
+function(generate_rdma_links)
+    # redirect to the current dir to isolate the pollution from the system runtime environment;
+    # it gives unified control over different gcc environments.
+    # e.g., by default gcc48 does not search /usr/lib64, which could contain low-version
+    # runtime libraries that crash the process while loading them. This redirect trick fixes that.
+    execute_process(
+        COMMAND mkdir -p librdma
+        COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so.1
+        COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so
+        COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so.1
+        COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so
+        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
+    )
+endfunction(generate_rdma_links)
+
+
+# check and set headers
+find_path(RDMA_INC_SXISOCK sxi_sock.h PATHS ${RDMA_ROOT}/sockrdmav1/output/include)
+find_path(RDMA_INC_XIO libxio.h PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
+find_path(RDMA_INC_EVENT event2 PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
+find_path(RDMA_INC_NUMA numa.h PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
+
+# check and set libs
+find_library(RDMA_LIB_SXISOCK NAMES sxisock PATHS ${RDMA_ROOT}/sockrdmav1/output)
+find_library(RDMA_LIB_XIO NAMES xio PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
+find_library(RDMA_LIB_EVENT NAMES event PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
+find_library(RDMA_LIB_EVENT_CORE NAMES event_core PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
+find_library(RDMA_LIB_EVENT_EXTRA NAMES event_extra PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
+find_library(RDMA_LIB_EVENT_PTHREADS NAMES event_pthreads PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
+find_library(RDMA_LIB_NUMA NAMES numa PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
+
+if(
+    RDMA_INC_SXISOCK AND
+    RDMA_INC_XIO AND
+    RDMA_INC_EVENT AND
+    RDMA_INC_NUMA AND
+    RDMA_LIB_SXISOCK AND
+    RDMA_LIB_XIO AND
+    RDMA_LIB_EVENT AND
+    RDMA_LIB_EVENT_CORE AND
+    RDMA_LIB_EVENT_EXTRA AND
+    RDMA_LIB_EVENT_PTHREADS AND
+    RDMA_LIB_NUMA
+    )
+
+    set(RDMA_INC_DIR
+        ${RDMA_INC_SXISOCK}
+        ${RDMA_INC_XIO}
+        ${RDMA_INC_EVENT}
+        ${RDMA_INC_NUMA})
+    set(RDMA_LIBS
+        ${RDMA_LIB_SXISOCK}
+        ${RDMA_LIB_XIO}
+        ${RDMA_LIB_EVENT}
+        ${RDMA_LIB_EVENT_CORE}
+        ${RDMA_LIB_EVENT_EXTRA}
+        ${RDMA_LIB_EVENT_PTHREADS}
+        ${RDMA_LIB_NUMA}
+        )
+    set(RDMA_LD_FLAGS "-L./librdma -libverbs -lrdmacm -Xlinker -rpath ./librdma")
+    return()
+endif()
+
+# if this module is not called, RDMA_INC_DIR and RDMA_LIBS will be null,
+# so the top module can always refer to these variables
+
+message(FATAL_ERROR "RDMA libraries are not found, try to set RDMA_ROOT or check all related libraries.")
diff --git a/cmake/swig.cmake b/cmake/swig.cmake
index f5c1bcc79b3dc0e6c4f4489ee9f72a084afe8847..97e87aa947791e2c5a88e7e554dec43bcd661664 100644
--- a/cmake/swig.cmake
+++ b/cmake/swig.cmake
@@ -1,25 +1,3 @@
-find_program(
-    SWIG_BINARY_PATH
-    swig)
-
-if(${SWIG_BINARY_PATH} STREQUAL "SWIG_BINARY_PATH-NOTFOUND")
-    set(SWIG_FOUND OFF)
-else()
-    set(SWIG_FOUND ON)
-endif()
-
-set(MIN_SWIG_VERSION 2)
-if(SWIG_FOUND)
-    execute_process(COMMAND sh -c "${SWIG_BINARY_PATH} -version | grep Version | cut -f3 -d' '"
-        OUTPUT_VARIABLE _SWIG_VERSION
-        OUTPUT_STRIP_TRAILING_WHITESPACE)
-    if(${_SWIG_VERSION} VERSION_LESS ${MIN_SWIG_VERSION})
-        message("swig version ${MIN_SWIG_VERSION} or greater is needed for generating python api. "
-                "Only version ${_SWIG_VERSION} is found. Set SWIG_FOUND to FALSE")
Set SWIG_FOUND to FALSE") - set(SWIG_FOUND FALSE) - endif(${_SWIG_VERSION} VERSION_LESS ${MIN_SWIG_VERSION}) -endif(SWIG_FOUND) - function(generate_python_api target_name) add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py ${PROJ_ROOT}/paddle/Paddle_wrap.cxx @@ -27,6 +5,7 @@ function(generate_python_api target_name) COMMAND swig -python -c++ -outcurrentdir -I../ api/Paddle.swig && mv ${PROJ_ROOT}/paddle/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py DEPENDS ${PROJ_ROOT}/paddle/api/Paddle.swig + ${PROJ_ROOT}/paddle/api/PaddleAPI.h WORKING_DIRECTORY ${PROJ_ROOT}/paddle COMMENT "Generate Python API from swig") add_custom_target(${target_name} ALL DEPENDS diff --git a/cmake/util.cmake b/cmake/util.cmake index 0fa36f070cc11be543efe9573b93173ec771b9be..3f78cd08c390550790b7145c412de32351873e4e 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -67,6 +67,10 @@ endmacro() # # It will handle WITH_PYTHON/WITH_GLOG etc. function(link_paddle_exe TARGET_NAME) + if(WITH_RDMA) + generate_rdma_links() + endif() + if(WITH_METRIC) if(WITH_GPU) set(METRIC_LIBS paddle_metric_learning paddle_dserver_lib metric metric_cpu) @@ -109,6 +113,12 @@ function(link_paddle_exe TARGET_NAME) ${ZLIB_LIBRARIES} ${INTERAL_LIBS} ${CMAKE_DL_LIBS}) + + if(WITH_RDMA) + target_link_libraries(${TARGET_NAME} + ${RDMA_LD_FLAGS} + ${RDMA_LIBS}) + endif() if(WITH_PYTHON) target_link_libraries(${TARGET_NAME} diff --git a/demo/introduction/README.md b/demo/introduction/README.md new file mode 100644 index 0000000000000000000000000000000000000000..bebf1d090d98691199ede55736dfe5b964a8fd42 --- /dev/null +++ b/demo/introduction/README.md @@ -0,0 +1,4 @@ +This folder contains scripts used in PaddlePaddle introduction. +- use `bash train.sh` to train a simple linear regression model +- use `python evaluate_model.py` to read model parameters. You can see that `w` and `b` are very close to [2, 0.3]. + diff --git a/demo/introduction/dataprovider.py b/demo/introduction/dataprovider.py new file mode 100644 index 0000000000000000000000000000000000000000..be8c0bc89156cf843d9b08276b52f92a4d8c9706 --- /dev/null +++ b/demo/introduction/dataprovider.py @@ -0,0 +1,24 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer.PyDataProvider2 import * +import random + +# define data types of input: 2 real numbers +@provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False) +def process(settings, input_file): + for i in xrange(2000): + x = random.random() + yield [x], [2*x+0.3] + diff --git a/demo/introduction/evaluate_model.py b/demo/introduction/evaluate_model.py new file mode 100755 index 0000000000000000000000000000000000000000..8cfb843c42105757b0f63c4a00d034b47a37a0bb --- /dev/null +++ b/demo/introduction/evaluate_model.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python +# -*- coding: UTF-8 -*- + +# Copyright (c) 2016 Baidu, Inc. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Print the model parameters from the last saved pass.
+
+Usage:
+    python evaluate_model.py
+"""
+import numpy as np
+import os
+
+def load(file_name):
+    with open(file_name, 'rb') as f:
+        f.read(16) # skip header for float type.
+        return np.fromfile(f, dtype=np.float32)
+
+def main():
+    print 'w=%.6f, b=%.6f from pass 29' % (load('output/pass-00029/w'),
+                                           load('output/pass-00029/b'))
+
+if __name__ == '__main__':
+    main()
diff --git a/demo/introduction/train.sh b/demo/introduction/train.sh
new file mode 100755
index 0000000000000000000000000000000000000000..06db8edd105ada071597ed1aa5e42f7de547174d
--- /dev/null
+++ b/demo/introduction/train.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+paddle train \
+    --config=trainer_config.py \
+    --save_dir=./output \
+    --num_passes=30 \
+    2>&1 | tee 'train.log'
diff --git a/demo/introduction/trainer_config.py b/demo/introduction/trainer_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..3e3df5583282a4335ddea7b1cb30a84052d0adca
--- /dev/null
+++ b/demo/introduction/trainer_config.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.trainer_config_helpers import *
+
+# 1. read data. Suppose you saved the above python code as dataprovider.py
+data_file = 'empty.list'
+with open(data_file, 'w') as f: f.writelines(' ')
+define_py_data_sources2(train_list=data_file, test_list=None,
+                        module='dataprovider', obj='process', args={})
+
+# 2. learning algorithm
+settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
+
+# 3. Network configuration
+x = data_layer(name='x', size=1)
+y = data_layer(name='y', size=1)
+y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
+cost = regression_cost(input=y_predict, label=y)
+outputs(cost)
+
diff --git a/demo/quick_start/api_train.py b/demo/quick_start/api_train.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ae19b8d26534a9521a6da7630796edce36780e7
--- /dev/null
+++ b/demo/quick_start/api_train.py
@@ -0,0 +1,114 @@
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import itertools
+import random
+
+from paddle.trainer.config_parser import parse_config
+from py_paddle import swig_paddle as api
+from py_paddle import DataProviderConverter
+from paddle.trainer.PyDataProvider2 \
+    import integer_value, integer_value_sequence, sparse_binary_vector
+
+def parse_arguments():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--train_data",
+                        type=str, required=False, help="train data file")
+    parser.add_argument("--test_data", type=str, help="test data file")
+    parser.add_argument("--config",
+                        type=str, required=True, help="config file name")
+    parser.add_argument("--dict_file", required=True, help="dictionary file")
+    parser.add_argument("--seq",
+                        default=1, type=int,
+                        help="whether to use sequence training")
+    parser.add_argument("--use_gpu", default=0, type=int,
+                        help="whether to use GPU for training")
+    parser.add_argument("--trainer_count", default=1, type=int,
+                        help="Number of threads for training")
+    parser.add_argument("--num_passes", default=5, type=int,
+                        help="Number of training passes")
+    return parser.parse_args()
+
+UNK_IDX = 0
+
+def load_data(file_name, word_dict):
+    with open(file_name, 'r') as f:
+        for line in f:
+            label, comment = line.strip().split('\t')
+            words = comment.split()
+            word_slot = [word_dict.get(w, UNK_IDX) for w in words]
+            yield word_slot, int(label)
+
+def load_dict(dict_file):
+    word_dict = dict()
+    with open(dict_file, 'r') as f:
+        for i, line in enumerate(f):
+            w = line.strip().split()[0]
+            word_dict[w] = i
+    return word_dict
+
+def main():
+    options = parse_arguments()
+    api.initPaddle("--use_gpu=%s" % options.use_gpu,
+                   "--trainer_count=%s" % options.trainer_count)
+
+    word_dict = load_dict(options.dict_file)
+    train_dataset = list(load_data(options.train_data, word_dict))
+    if options.test_data:
+        test_dataset = list(load_data(options.test_data, word_dict))
+    else:
+        test_dataset = None
+
+    trainer_config = parse_config(options.config,
+                                  "dict_file=%s" % options.dict_file)
+    # No need to have a data provider for the trainer
+    trainer_config.ClearField('data_config')
+    trainer_config.ClearField('test_data_config')
+
+    # create a GradientMachine from the model configuration
+    model = api.GradientMachine.createFromConfigProto(
+        trainer_config.model_config)
+    # create a trainer for the gradient machine
+    trainer = api.Trainer.create(trainer_config, model)
+
+    # create a data converter which converts data to PaddlePaddle
+    # internal format
+    input_types = [
+        integer_value_sequence(len(word_dict)) if options.seq
+        else sparse_binary_vector(len(word_dict)),
+        integer_value(2)]
+    converter = DataProviderConverter(input_types)
+
+    batch_size = trainer_config.opt_config.batch_size
+    trainer.startTrain()
+    for train_pass in xrange(options.num_passes):
+        trainer.startTrainPass()
+        random.shuffle(train_dataset)
+        for pos in xrange(0, len(train_dataset), batch_size):
+            batch = itertools.islice(train_dataset, pos, pos + batch_size)
+            size = min(batch_size, len(train_dataset) - pos)
+            trainer.trainOneDataBatch(size, converter(batch))
+        trainer.finishTrainPass()
+        if test_dataset:
+            trainer.startTestPeriod()
+            for pos in xrange(0, len(test_dataset), batch_size):
+                batch = itertools.islice(test_dataset, pos, pos + batch_size)
+                size = min(batch_size, len(test_dataset) - pos)
+                trainer.testOneDataBatch(size, converter(batch))
+            trainer.finishTestPeriod()
+    trainer.finishTrain()
+
+if __name__ == '__main__':
+    main()
diff --git a/demo/quick_start/api_train.sh b/demo/quick_start/api_train.sh
new file mode 100755
index 0000000000000000000000000000000000000000..40e9d0a09aaa6b672d6b3997c67c07a5e8a8c3d8
--- /dev/null
+++ b/demo/quick_start/api_train.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+# Note: if using trainer_config.emb.py, trainer_config.cnn.py
+# or trainer_config.lstm.py, you need to change --seq=0 below to --seq=1,
+# because they are sequence models.
+python api_train.py \
+    --config=trainer_config.lr.py \
+    --trainer_count=2 \
+    --num_passes=15 \
+    --use_gpu=0 \
+    --seq=0 \
+    --train_data=data/train.txt \
+    --test_data=data/test.txt \
+    --dict_file=data/dict.txt \
+    2>&1 | tee 'train.log'
diff --git a/demo/quick_start/dataprovider_emb.py b/demo/quick_start/dataprovider_emb.py
index ca940a89e54770eaf93b7c704a8d1274de2dc693..f5632d5f3f8bd8bb83b12198e7450b239eb1f7f6 100755
--- a/demo/quick_start/dataprovider_emb.py
+++ b/demo/quick_start/dataprovider_emb.py
@@ -16,6 +16,7 @@ from paddle.trainer.PyDataProvider2 import *
 
 UNK_IDX = 0
 
+
 def initializer(settings, dictionary, **kwargs):
     settings.word_dict = dictionary
     settings.input_types = [
diff --git a/demo/quick_start/train.sh b/demo/quick_start/train.sh
index ea4e32249a3d012df16b87d3b5ec0290cb64eb49..49806292a4ec5bd4194ccb6f6a638b6b2b4f37ed 100755
--- a/demo/quick_start/train.sh
+++ b/demo/quick_start/train.sh
@@ -24,7 +24,7 @@ paddle train \
     --config=$cfg \
     --save_dir=./output \
     --trainer_count=4 \
-    --log_period=20 \
+    --log_period=100 \
     --num_passes=15 \
     --use_gpu=false \
     --show_parameter_stats_period=100 \
diff --git a/demo/quick_start/trainer_config.lr.py b/demo/quick_start/trainer_config.lr.py
index 119e3849a4b7e01713bc983d83c000772a60b76d..c6059947f30b32975d72155150de095ade01aa9d 100644
--- a/demo/quick_start/trainer_config.lr.py
+++ b/demo/quick_start/trainer_config.lr.py
@@ -16,7 +16,7 @@
 
 from paddle.trainer_config_helpers import *
 
-dict_file = "./data/dict.txt"
+dict_file = get_config_arg('dict_file', str, "./data/dict.txt")
 word_dict = dict()
 with open(dict_file, 'r') as f:
     for i, line in enumerate(f):
@@ -63,7 +63,6 @@ if not is_predict:
     label = data_layer(name="label", size=2)
 
     # Define cross-entropy classification loss and error.
-    classification_cost(input=output, label=label)
     cls = classification_cost(input=output, label=label)
     outputs(cls)
 else:
diff --git a/demo/quick_start/trainer_config.lstm.py b/demo/quick_start/trainer_config.lstm.py
index ec8a2cb00abd19ef80c327ac564e91661ecc3928..b412a9cbd914dc7abd70b93bbe250759552ee071 100644
--- a/demo/quick_start/trainer_config.lstm.py
+++ b/demo/quick_start/trainer_config.lstm.py
@@ -42,20 +42,13 @@ settings(
     gradient_clipping_threshold=25
 )
 
-bias_attr = ParamAttr(initial_std=0.,l2_rate=0.)
 
 data = data_layer(name="word", size=len(word_dict))
 emb = embedding_layer(input=data, size=128)
-fc = fc_layer(input=emb, size=512,
-              act=LinearActivation(),
-              bias_attr=bias_attr,
-              layer_attr=ExtraAttr(drop_rate=0.1))
-lstm = lstmemory(input=fc, act=TanhActivation(),
-                 bias_attr=bias_attr,
-                 layer_attr=ExtraAttr(drop_rate=0.25))
-lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling())
-output = fc_layer(input=lstm_last, size=2,
-                  bias_attr=bias_attr,
+lstm = simple_lstm(input=emb, size=128,
+                   lstm_cell_attr=ExtraAttr(drop_rate=0.25))
+lstm_max = pooling_layer(input=lstm, pooling_type=MaxPooling())
+output = fc_layer(input=lstm_max, size=2,
                   act=SoftmaxActivation())
 if is_predict:
     maxid = maxid_layer(output)
diff --git a/demo/semantic_role_labeling/.gitignore b/demo/semantic_role_labeling/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..cd90ca7bbe9be46f54cb656a8067c794a55d8cfc
--- /dev/null
+++ b/demo/semantic_role_labeling/.gitignore
@@ -0,0 +1,10 @@
+*.pyc
+train.log
+data/feature
+data/conll05st-release/
+data/src.dict
+data/test.wsj.props
+data/test.wsj.seq_pair
+data/test.wsj.words
+data/tgt.dict
+output
diff --git a/demo/sentiment/predict.py b/demo/sentiment/predict.py
index c61628d34db4a2bcecd8b367879045f7cb57d491..7d0baeabbba68b2a160463364d05cd865bf0314f 100755
--- a/demo/sentiment/predict.py
+++ b/demo/sentiment/predict.py
@@ -46,8 +46,8 @@ class SentimentPrediction():
         conf = parse_config(train_conf, "is_predict=1")
         self.network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config)
         self.network.loadParameters(self.model_dir)
-        slots = [integer_value_sequence(self.dict_dim)]
-        self.converter = DataProviderConverter(slots)
+        input_types = [integer_value_sequence(self.dict_dim)]
+        self.converter = DataProviderConverter(input_types)
 
     def load_dict(self):
         """
diff --git a/doc/build/build_from_source.md b/doc/build/build_from_source.md
index f7db0a9b92e67e1ecf5e44f1edb17cb8cacd8d2d..c37234d3ef14dfcfeaa1f34b0565e40e0672edc0 100644
--- a/doc/build/build_from_source.md
+++ b/doc/build/build_from_source.md
@@ -153,12 +153,12 @@ As a simple example, consider the following:
 
 - **Only CPU**
 
   ```bash
-  cmake .. -DWITH_GPU=OFF -DWITH_DOC=OFF
+  cmake .. -DWITH_GPU=OFF
   ```
 
 - **GPU**
 
   ```bash
-  cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF
+  cmake .. -DWITH_GPU=ON
   ```
 
 - **GPU with doc and swig**
@@ -171,7 +171,7 @@ Finally, you can build PaddlePaddle:
 
 ```bash
 # you can add build option here, such as:
-cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF -DCMAKE_INSTALL_PREFIX=
+cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX=
 # please use sudo make install, if you want to install PaddlePaddle into the system
 make -j `nproc` && make install
 # set PaddlePaddle installation path in ~/.bashrc
 export PATH=/bin:$PATH
@@ -219,10 +219,9 @@ easy_install pip
 # Install google test on Mac OS X
 # Download gtest 1.7.0
 wget https://github.com/google/googletest/archive/release-1.7.0.tar.gz
-tar -xvf googletest-release-1.7.0.tar.gz && cd googletest-release-1.7.0
+tar -xzf googletest-release-1.7.0.tar.gz && cd googletest-release-1.7.0
 # Build gtest
-mkdir build && cmake ..
-make
+mkdir build && cd build && cmake .. && make
 # Install gtest library
 sudo cp -r ../include/gtest /usr/local/include/
 sudo cp lib*.a /usr/local/lib
@@ -246,7 +245,7 @@ easy_install pip
 
 ```bash
 sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local
-sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
+sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib/libcudnn*
 ```
 
 2. Then you need to set DYLD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc.
@@ -273,12 +272,12 @@ As a simple example, consider the following:
 
 - **Only CPU**
 
   ```bash
-  cmake .. -DWITH_GPU=OFF -DWITH_DOC=OFF
+  cmake .. -DWITH_GPU=OFF
   ```
 
 - **GPU**
 
   ```bash
-  cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF
+  cmake .. -DWITH_GPU=ON
   ```
 
 - **GPU with doc and swig**
@@ -291,9 +290,9 @@ Finally, you can build PaddlePaddle:
 
 ```bash
 # you can add build option here, such as:
-cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF -DCMAKE_INSTALL_PREFIX=
+cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX=
 # please use sudo make install, if you want to install PaddlePaddle into the system
-make -j `nproc` && make install
+make -j `sysctl -n hw.ncpu` && make install
 # set PaddlePaddle installation path in ~/.bashrc
 export PATH=/bin:$PATH
 ```
diff --git a/doc/build/contribute_to_paddle.md b/doc/build/contribute_to_paddle.md
index bbdbb4d4227d0b5b8ada00baec7182cedcada861..a9ab69c5f42b8d341dca87479a642e28ca58fbf4 100644
--- a/doc/build/contribute_to_paddle.md
+++ b/doc/build/contribute_to_paddle.md
@@ -4,7 +4,7 @@ We sincerely appreciate your contributions. You can use fork and pull request
 workflow to merge your code.
 
 ## Code Requirements
-- Your code mush be fully documented by
+- Your code must be fully documented by
   [doxygen](http://www.stack.nl/~dimitri/doxygen/) style.
 - Make sure the compiler option WITH\_STYLE\_CHECK is on and the compiler
   passes the code style check.
@@ -20,16 +20,30 @@ It's just that simple.
 
 ## Clone
 
+Paddle is currently using the [git-flow branching model](http://nvie.com/posts/a-successful-git-branching-model/).
+The **develop** branch is the main branch, and other users' branches are feature branches.
+
 Once you've created a fork, you can use your favorite git client to clone your
 repo or just head straight to the command line:
 
 ```shell
 # Clone your fork to your local machine
-git clone https://github.com/USERNAME/Paddle.git
+git clone --branch develop https://github.com/USERNAME/Paddle.git
+```
+If your repository doesn't contain a **develop** branch, just create it yourself.
+
+```shell
+git clone https://github.com/USERNAME/Paddle.git Paddle
+cd Paddle
+git checkout -b develop  # create develop branch.
+git remote add upstream https://github.com/baidu/Paddle.git  # add upstream to baidu/Paddle
+git pull upstream develop  # update to upstream
 ```
+
 Then you can start to develop by making a local development branch
+
 ```shell
-git checkout -b MY_COOL_STUFF_BRANCH origin/master
+git checkout -b MY_COOL_STUFF_BRANCH
 ```
 
 ## Commit
 
@@ -41,7 +55,7 @@ Commit your changes by following command lines:
 git status
 # add modified files
 git add xx
-git commit -m "commit info"
+env EDITOR=vim git commit  # You can write your comments by vim/nano/emacs.
 ```
 The first line of commit information is the title. The second and later lines
 are the details if any.
 
@@ -63,7 +77,7 @@ git remote -v
 Update your fork with the latest upstream changes:
 
 ```shell
-git pull --rebase upstream HEAD
+git pull --rebase upstream develop
 ```
 
 If there are no unique commits locally, git will simply perform a fast-forward.
@@ -76,7 +90,7 @@ Now, your local master branch is up-to-date with everything modified upstream.
 
 ```shell
 # push to your repository in Github
-git push origin HEAD
+git push -u origin MY_COOL_STUFF_BRANCH  # create remote branch MY_COOL_STUFF_BRANCH on origin.
 ```
 
 ## Pull Request
 
@@ -93,13 +107,24 @@ of conflict, you need to do the update manually.
 You need to do the following on your local repository:
 ```shell
 git checkout MY_COOL_STUFF_BRANCH
-git pull --rebase upstream HEAD
+git pull upstream develop
 # You may need to resolve the conflict according to the git prompt.
 # Make and test your code.
-git push -f origin HEAD
+git push origin MY_COOL_STUFF_BRANCH
 ```
 Now your Pull Request is updated with the latest version.
 
 ## Revise your pull request
 
 When you revise your pull request according to reviewer's comments, please use 'git commit' instead of
 'git commit --amend' to commit your changes so that the reviewers can see the difference between the new
 pull request and the old pull request.
+
+The possible commands are
+
+```shell
+git checkout MY_COOL_STUFF_BRANCH
+git pull upstream develop  # update local to the newest code base.
+# Some conflicts may occur here.
+# And develop your cool stuff
+env EDITOR=vim git commit  # add your revision log
+git push origin MY_COOL_STUFF_BRANCH
+```
diff --git a/doc/index.md b/doc/index.md
index df03a33fac98c46635eef05d88639235ac72cf8f..a4dffb0405a6b23c88473307a1d199e3caaadf55 100644
--- a/doc/index.md
+++ b/doc/index.md
@@ -3,6 +3,7 @@ PaddlePaddle Documentation
 
 User Guide
 ----------
+* [Introduction](introduction/index.md)
 * [Quick Start](demo/quick_start/index_en.md)
 * [Build and Installation](build/index.rst)
 * [Contribute Code](build/contribute_to_paddle.md)
diff --git a/doc/introduction/index.md b/doc/introduction/index.md
new file mode 100644
index 0000000000000000000000000000000000000000..004ca07844da0fdbea359508c9fae1012aaad421
--- /dev/null
+++ b/doc/introduction/index.md
@@ -0,0 +1,101 @@
+# Introduction
+
+PaddlePaddle is a deep learning platform open-sourced by Baidu. With PaddlePaddle, you can easily train a classic neural network within a couple of lines of configuration, or you can build sophisticated models that provide state-of-the-art performance on difficult learning tasks like sentiment analysis, machine translation, image captioning and so on.
+
+## 1. A Classic Problem
+
+Now, to give you a hint of what using PaddlePaddle looks like, let's start with a fundamental learning problem - **simple linear regression**: you have observed a set of two-dimensional data points of `X` and `Y`, where `X` is an explanatory variable and `Y` is the corresponding dependent variable, and you want to recover the underlying correlation between `X` and `Y`. Linear regression can be used in many practical scenarios. For example, `X` can be a variable about house size, and `Y` a variable about house price. You can build a model that captures the relationship between them by observing real estate markets.
+
+## 2. Prepare the Data
+
+Suppose the true relationship can be characterized as `Y = 2X + 0.3`; let's see how to recover this pattern only from observed data. Here is a piece of python code that feeds synthetic data to PaddlePaddle. The code is pretty self-explanatory; the only extra thing you need to add for PaddlePaddle is a definition of the input data types.
+
+```python
+# dataprovider.py
+from paddle.trainer.PyDataProvider2 import *
+import random
+
+# define data types of input: 2 real numbers
+@provider(input_types=[dense_vector(1), dense_vector(1)], use_seq=False)
+def process(settings, input_file):
+    for i in xrange(2000):
+        x = random.random()
+        yield [x], [2*x+0.3]
+```
+
+## 3. Train a NeuralNetwork in PaddlePaddle
+
+To recover this relationship between `X` and `Y`, we use a neural network with one layer of linear activation units and a square error cost layer. Don't worry if you are not familiar with this terminology; it's just saying that we are starting from a random line `Y' = wX + b`, then we gradually adapt `w` and `b` to minimize the difference between `Y'` and `Y`. Here is what it looks like in PaddlePaddle:
Don't worry if you are not familiar with these terms; it's just saying that we are starting from a random line `Y' = wX + b`, then we gradually adapt `w` and `b` to minimize the difference between `Y'` and `Y`. Here is what it looks like in PaddlePaddle:
+
+```python
+# trainer_config.py
+from paddle.trainer_config_helpers import *
+
+# 1. read data. Suppose you saved the above python code as dataprovider.py
+data_file = 'empty.list'
+with open(data_file, 'w') as f: f.writelines(' ')
+define_py_data_sources2(train_list=data_file, test_list=None,
+        module='dataprovider', obj='process', args={})
+
+# 2. learning algorithm
+settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
+
+# 3. Network configuration
+x = data_layer(name='x', size=1)
+y = data_layer(name='y', size=1)
+y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
+cost = regression_cost(input=y_predict, label=y)
+outputs(cost)
+```
+
+Some of the most fundamental usages of PaddlePaddle are demonstrated:
+
+- The first part shows how to feed data into PaddlePaddle. In general cases, PaddlePaddle reads raw data from a list of files, and then does some user-defined processing to get the real input. In this case, we only need to create a placeholder file since we are generating synthetic data on the fly.
+
+- The second part describes the learning algorithm. It defines in what ways adjustments are made to model parameters. PaddlePaddle provides a rich set of optimizers, but a simple momentum-based optimizer will suffice here, and it processes 12 data points each time.
+
+- Finally, the network configuration. It is usually as simple as "stacking" layers. Three kinds of layers are used in this configuration:
+  - **Data Layer**: a network always starts with one or more data layers. They provide input data to the rest of the network. In this problem, two data layers are used respectively for `X` and `Y`.
+  - **FC Layer**: FC layer is short for Fully Connected Layer, which connects all the input units to the current layer and does the actual computation specified by the activation function. Computation layers like this are the fundamental building blocks of a deeper model.
+  - **Cost Layer**: in the training phase, cost layers are usually the last layers of the network. They measure the performance of the current model, and provide guidance to adjust parameters.
+
+Now that everything is ready, you can train the network with a simple command line call:
+ ```
+ paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
+ ```
+
+This means that PaddlePaddle will train this network on the synthetic dataset for 30 passes, and save all the models under the path `./output`. You will see from the messages printed out during the training phase that the model cost is decreasing as time goes by, which indicates we are getting a closer guess.
+
+
+## 4. Evaluate the Model
+
+Usually, a different dataset that was left out during the training phase should be used to evaluate the models. However, we are lucky enough to know the real answer: `w=2, b=0.3`, so a better option is to check the model parameters directly.
+
+In PaddlePaddle, training is just about getting a collection of model parameters, which are `w` and `b` in this case. Each parameter is saved in an individual file in the popular `numpy` array format. Here is the code that reads parameters from the last pass.
+ +```python +import numpy as np +import os + +def load(file_name): + with open(file_name, 'rb') as f: + f.read(16) # skip header for float type. + return np.fromfile(f, dtype=np.float32) + +print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b')) +# w=1.999743, b=0.300137 +``` + +
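The same trick extends to every saved pass if you want to watch the estimates converge rather than only inspect the final result. The following is a minimal sketch, not part of the original tutorial; it assumes all 30 passes were kept under `./output` and reuses the `load` helper defined above.

```python
# Print the trajectory of w and b across the saved passes; the figure
# below plots the same idea. Assumes load() from the snippet above.
for i in xrange(30):
    pass_dir = 'output/pass-%05d' % i
    print 'pass %02d: w=%.6f, b=%.6f' % (
        i, load(pass_dir + '/w')[0], load(pass_dir + '/b')[0])
```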
![](./parameters.png)
+
+Although training starts from a random guess, you can see that the value of `w` quickly moves towards 2 and `b` towards 0.3. In the end, the predicted line is almost identical to the real answer.
+
+With that, you have recovered the underlying pattern between `X` and `Y` only from observed data.
+
+
+## 5. Where to Go from Here
+
+- Build and Installation
+- Quick Start
+- Example and Demo
+
diff --git a/doc/introduction/parameters.png b/doc/introduction/parameters.png
new file mode 120000
index 0000000000000000000000000000000000000000..f47e74c94fffabbd32f055febbadb1b18aa0c429
--- /dev/null
+++ b/doc/introduction/parameters.png
@@ -0,0 +1 @@
+../../doc_cn/introduction/parameters.png
\ No newline at end of file
diff --git a/doc/ui/api/trainer_config_helpers/activations.rst b/doc/ui/api/trainer_config_helpers/activations.rst
index c4e14ed779efb6f6601d2c5fa41764f318c82848..070ed03ab6cc938f735667701bd46eec33ea77b4 100644
--- a/doc/ui/api/trainer_config_helpers/activations.rst
+++ b/doc/ui/api/trainer_config_helpers/activations.rst
@@ -32,6 +32,13 @@ LinearActivation
.. automodule:: paddle.trainer_config_helpers.activations
    :members: LinearActivation
    :noindex:
+
+LogActivation
+==================
+
+.. automodule:: paddle.trainer_config_helpers.activations
+    :members: LogActivation
+    :noindex:

SquareActivation
================
diff --git a/doc/ui/cmd_argument/argument_outline.md b/doc/ui/cmd_argument/argument_outline.md
index 98dadc270dcac8cb5c05f3065c98bac78671d7fa..d6cc2c6ed7cc1b9209d56b4348497427efe40ac3 100644
--- a/doc/ui/cmd_argument/argument_outline.md
+++ b/doc/ui/cmd_argument/argument_outline.md
@@ -183,7 +183,7 @@ It looks like there are a lot of arguments. However, most of them are for develo
-GPUgpu_id
+GPUgpu_id
√√√√
@@ -207,6 +207,11 @@ It looks like there are a lot of arguments. However, most of them are for develo
√√√√
+
+cudnn_conv_workspace_limit_in_mb
+√√√√
+
+
RNN
beam_size
diff --git a/doc/ui/cmd_argument/detail_introduction.md b/doc/ui/cmd_argument/detail_introduction.md
index 0d0362d022a72b597e78e760893c91df449e5745..07608e5edf740bd3e1242913f1d2d7589ad313aa 100644
--- a/doc/ui/cmd_argument/detail_introduction.md
+++ b/doc/ui/cmd_argument/detail_introduction.md
@@ -163,6 +163,10 @@
  - Choose path to dynamic load NVIDIA CUDA library, for instance, /usr/local/cuda/lib64. [Default]: LD_LIBRARY_PATH
  - type: string (default: "", null)
+* `--cudnn_conv_workspace_limit_in_mb`
+  - Specify the cuDNN max workspace limit in MB; the default is 4096 MB (4 GB).
+  - type: int32 (default: 4096, i.e. 4 GB)
+
## NLP: RNN/LSTM/GRU
* `--rnn_use_batch`
  - Whether to use batch method for calculation in simple RecurrentLayer.
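As a usage sketch of the `--cudnn_conv_workspace_limit_in_mb` flag documented above (the config file name is a placeholder, and 1024 is just an example value):

```bash
# Cap the cuDNN convolution workspace at 1 GB instead of the 4 GB default.
paddle train --config=trainer_config.py \
             --use_gpu=true \
             --cudnn_conv_workspace_limit_in_mb=1024
```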
diff --git a/doc_cn/concepts/nn.rst b/doc_cn/concepts/nn.rst new file mode 100644 index 0000000000000000000000000000000000000000..f4d2cf490d14761f4b9f6a308180c5e8015cbecb --- /dev/null +++ b/doc_cn/concepts/nn.rst @@ -0,0 +1,3 @@ +TBD + +目前正在书写中。敬请期待。 \ No newline at end of file diff --git a/doc_cn/concepts/program_concepts.rst b/doc_cn/concepts/program_concepts.rst new file mode 100644 index 0000000000000000000000000000000000000000..af5bbdac260afce0a032461ab913d05bc2f55929 --- /dev/null +++ b/doc_cn/concepts/program_concepts.rst @@ -0,0 +1,4 @@ +TBD +### + +目前正在书写中。敬请期待。 \ No newline at end of file diff --git a/doc_cn/concepts/pserver_topology.dot b/doc_cn/concepts/pserver_topology.dot new file mode 100644 index 0000000000000000000000000000000000000000..9ff658b8495030f322d4f553f3bf72ddf8d3a578 --- /dev/null +++ b/doc_cn/concepts/pserver_topology.dot @@ -0,0 +1,68 @@ +graph pp_topology { + rankdir=BT; + subgraph cluster_node0 { + style=filled; + color=lightgrey; + node [style=filled, color=white, shape=box]; + label = "机器0" + + pserver0 [label="Parameter \n Server 0"] + trainer0 [label="Trainer 0"] + } + subgraph cluster_node1 { + style=filled; + color=lightgrey; + node [style=filled, color=white, shape=box]; + label = "机器1" + + pserver1 [label="Parameter \n Server 1"] + trainer1 [label="Trainer 1"] + } + + subgraph cluster_node2 { + style=filled; + color=lightgrey; + node [style=filled, color=white, shape=box]; + label = "机器2" + + pserver2 [label="Parameter \n Server 2"] + trainer2 [label="Trainer 2"] + } + + subgraph cluster_node3 { + style=filled; + color=lightgrey; + node [style=filled, color=white, shape=box]; + label = "机器3" + + pserver3 [label="Parameter \n Server 3"] + trainer3 [label="Trainer 3"] + } + + data [label="数据", shape=hexagon] + + trainer0 -- pserver0 + trainer0 -- pserver1 + trainer0 -- pserver2 + trainer0 -- pserver3 + + trainer1 -- pserver0 + trainer1 -- pserver1 + trainer1 -- pserver2 + trainer1 -- pserver3 + + trainer2 -- pserver0 + trainer2 -- pserver1 + trainer2 -- pserver2 + trainer2 -- pserver3 + + trainer3 -- pserver0 + trainer3 -- pserver1 + trainer3 -- pserver2 + trainer3 -- pserver3 + + data -- trainer0 + data -- trainer1 + data -- trainer2 + data -- trainer3 +} diff --git a/doc_cn/concepts/trainer_config.py b/doc_cn/concepts/trainer_config.py new file mode 100644 index 0000000000000000000000000000000000000000..8d8c79fb39e0c0ddf13aee5d41297506d3404362 --- /dev/null +++ b/doc_cn/concepts/trainer_config.py @@ -0,0 +1,23 @@ +from paddle.trainer_config_helpers import * + +define_py_data_sources2(train_list='train.list', + test_list='test.list', + module='provider', + obj='process') +settings( + batch_size=128, + learning_rate=1e-3, + learning_method=AdamOptimizer(), + regularization=L2Regularization(0.5) +) + +img = data_layer(name='pixel', size=28 * 28) + +hidden1 = simple_img_conv_pool(input=img, filter_size=3, num_filters=32, pool_size=3, + num_channel=1) + +hidden2 = fc_layer(input=hidden1, size=200, act=TanhActivation(), + layer_attr=ExtraAttr(drop_rate=0.5)) +predict = fc_layer(input=hidden2, size=10, act=SoftmaxActivation()) + +outputs(classification_cost(input=predict, label=data_layer(name='label', size=10))) diff --git a/doc_cn/concepts/use_concepts.rst b/doc_cn/concepts/use_concepts.rst new file mode 100644 index 0000000000000000000000000000000000000000..67e98edabc0c2a4ecdf8d7993f8dd66b9365a05d --- /dev/null +++ b/doc_cn/concepts/use_concepts.rst @@ -0,0 +1,191 @@ +######################### +PaddlePaddle 基本使用概念 +######################### + 
+PaddlePaddle是一个神经网络学习框架。其单机进程为 :code:`paddle train`。 单机的所有设备使用,均在单机进程内调度完成。 而多机辅助进程 :code:`paddle pserver` 负责联合多个单机进程进行通信,进而充分利用集群的计算资源。 PaddlePaddle同时以 :code:`swig api` 的形式,提供训练结果模型预测的方法和自定义训练流程。 + +下面我们会分别介绍主要进程 :code:`paddle train` 中的一些概念。这些概念会对如何使用PaddlePaddle有一定的帮助。 了解这些概念的前提是,读者已经了解 `基本的神经网络/机器学习原理和概念 `_ 。同时,如果想要了解PaddlePaddle实现中的一些概念,请参考 `PaddlePaddle 编程中的基本概念 `_ 。 + +.. contents:: + +PaddlePaddle 的进程模型 +======================= + +PaddlePaddle进程内嵌了一个 :code:`python` 解释器。 这个 :code:`python` 解释器负责解析用户定义的神经网络配置,和解析用户数据,并将用户数据传入给 PaddlePaddle。 + +.. graphviz:: + + digraph pp_process { + rankdir=LR; + config_file [label="用户神经网络配置"]; + subgraph cluster_pp { + style=filled; + color=lightgrey; + node [style=filled, color=white, shape=box]; + label = "PaddlePaddle C++"; + py [label="Python解释器"]; + } + data_provider [label="用户数据解析"]; + config_file -> py; + py -> data_provider [dir="back"]; + } + +所以,PaddlePaddle单机训练进程,:code:`paddle train` , 对于用户的主要接口语言为 python。 主要需要用户配置的两个文件为 :code:`DataProvider` 和训练文件 :code:`TrainerConfig` 。 + + +DataProvider +============ + +DataProvider是 :code:`paddle train` 的数据提供器。 它负责将用户的原始数据转换成 PaddlePaddle 可以识别的数据类型。每当 PaddlePaddle 需要新的数据训练时,都会调用 DataProvider 返回数据。 当所有数据读取完一轮后,DataProvider 便返回空数据通知 PaddlePaddle。PaddlePaddle负责在下一轮训练开始前,将DataProvider重置。 + +需要注意的是,DataProvider在PaddlePaddle中是被训练逻辑调用的关系, 而不是新的数据驱动训练。并且所有的 :code:`shuffle` , 和一些随机化的噪声添加,都应该在 DataProvider 阶段完成。 + +为了方便用户使用自己的数据格式, PaddlePaddle 提供了 `PyDataProvider`_ 来处理数据。 并且在这个Provider中,PaddlePaddle的 C++ 部分接管了如何shuffle,处理 batch,GPU/CPU通信,双缓冲,异步读取等问题。 用户可以参考 `PyDataProvider`_ 的相关文档,继续深入了解 DataProvider 的使用。 + + +训练文件 +======== + +训练文件是PaddlePaddle中配置神经网络结构、学习优化算法、数据传入方式的地方。 训练文件是一个python文件,使用命令行参数 :code:`--config` 传给 paddle 的主程序。 例如\: + +.. code-block:: bash + + paddle train --config=trainer_config.py + +一个典型简单的训练文件可能为 + +.. 
literalinclude:: trainer_config.py
+   :linenos:
+
+下面我们详细地介绍一下训练文件中各个模块的概念。
+
+
+trainer_config_helpers
+----------------------
+
+PaddlePaddle的配置文件与PaddlePaddle C++端通信的最基础协议是 :code:`protobuf` 。而为了避免用户直接写比较难写的 protobuf string,我们书写了一个helpers来生成这个protobuf包。所以在文件的开始,import这些helpers函数。
+
+需要注意的是,这个 :code:`paddle.trainer_config_helpers` 包是标准的python包,这意味着用户可以选择自己喜欢的 :code:`ide` 或者编辑器来编写Paddle的配置文件,这个python包注释文档比较完善,并且考虑了IDE的代码提示与类型注释。
+
+data_sources
+------------
+
+data_sources是配置神经网络的数据源。这里使用的函数是 :code:`define_py_data_sources2` ,这个函数定义了使用 `PyDataProvider`_ 作为数据源。 而后缀 :code:`2` 是Paddle历史遗留问题,因为Paddle之前使用的 PyDataProvider 性能较差,所以完全重构了一个新的 `PyDataProvider`_ 。
+
+data_sources里面的 train_list 和 test_list 指定的是训练文件列表和测试文件列表。 如果传入一个字符串的话,是指一个训练列表文件。这个训练列表文件中包含的是每一个训练或者测试文件的路径。如果传入一个list的话,则会默认生成一个 list 文件,再传入给 train.list 或者 test.list 。
+
+而 :code:`module` 和 :code:`obj` 指定了 DataProvider 的模块名和函数名。
+
+更具体的使用,请参考 `PyDataProvider`_ 。
+
+settings
+--------
+
+`settings`_ 是神经网络训练算法相关的设置项。包括学习率,batch_size,优化算法,正则方法等等。具体的使用方法请参考 `settings`_ 文档。
+
+网络配置
+--------
+
+上述网络配置中余下的部分均是神经网络配置。第一行是定义一个名字叫 "pixel" 的 :code:`data_layer` 。每一个layer返回的都是一个 :code:`LayerOutput` 对象。 这里第一层的输出对象是 :code:`img` 。然后这个对象传输给了另一个 layer 函数,
+:code:`simple_img_conv_pool` 。:code:`simple_img_conv_pool` 是一个组合层,
+包括了图像的卷积 (convolution) 和池化(pooling),
+并继续接了一个全连接层( :code:`fc_layer` ),然后再接了一个Softmax的全连接层。
+
+最终,网络配置输出了 :code:`classification_cost` 。标记网络输出的函数为
+:code:`outputs` 。网络的输出是神经网络的优化目标,神经网络训练的时候,实际上就是
+要最小化这个输出。
+
+在神经网络进行预测的时候,实际上网络的输出也是通过 :code:`outputs` 标记。
+
+
+Layer、Projection、Operator
+===========================
+
+PaddlePaddle的网络基本上是基于Layer来配置的。所谓的Layer即是神经网络的某一层,
+而神经网络的某一层,一般是封装了许多复杂操作的操作集合。比如最简单的
+:code:`fc_layer` ,也包括矩阵乘法,多输入的求和,和activation。
+
+.. code-block:: python
+
+    data = data_layer(name='data', size=200)
+    out = fc_layer(input=data, size=200, act=TanhActivation())
+
+而对于更灵活的配置需求,可能这样基于Layer的配置是不灵活的。于是 PaddlePaddle 提供
+了基于 Projection 或者 Operator 的配置。Projection和Operator需要与
+:code:`mixed_layer` 配合使用。 :code:`mixed_layer` 是将layer中的元素累加求和,
+并且做一个 :code:`activation` , 而这个layer具体如何计算,是交由内部的Projection
+和 Operator 定义。Projection是指含有可学习参数的操作,而Operator不含有可学习的
+参数,输入全是其他Layer的输出。
+
+
+例如,和 :code:`fc_layer` 同样功能的 :code:`mixed_layer` 。
+
+.. code-block:: python
+
+    data = data_layer(name='data', size=200)
+    with mixed_layer(size=200) as out:
+        out += full_matrix_projection(input=data)
+
+PaddlePaddle可以使用 mixed layer 配置出非常复杂的网络,甚至可以直接配置一个完整的LSTM。
+用户可以参考 `mixed_layer`_ 的相关文档进行配置。
+
+如何利用单机的所有GPU或所有CPU核心
+==================================
+
+PaddlePaddle的单机进程 :code:`paddle train` 可以充分利用一台计算机上所有的GPU资
+源或者CPU。
+
+如果要使用机器上多块GPU,使用如下命令即可\:
+
+.. code-block:: bash
+
+    paddle train --use_gpu=true --trainer_count=4 # use 4 gpu card, 0, 1, 2, 3
+
+如果要使用机器上多块CPU,使用如下命令即可\:
+
+.. code-block:: bash
+
+    paddle train --trainer_count=4 # use 4 cpu cores.
+
+对于其他设置GPU的选择情况,例如选择第0、2号GPU显卡,则可以使用 :code:`CUDA_VISIBLE_DEVICES` 环境变量来选择部分的显卡。 具体可以参考链接 `masking-gpu`_ 。 可以使用的命令为
+
+.. code-block:: bash
+
+    env CUDA_VISIBLE_DEVICES=0,2 paddle train --use_gpu=true --trainer_count=2
+
+如何利用多台机器的计算资源训练神经网络
+======================================
+
+PaddlePaddle多机使用的经典方法是通过 :code:`Parameter Server` 来对多机的 :code:`paddle train` 进行同步。 而多机训练神经网络,首先要将数据切分到不同的机器上。 切分数据文件的方式在PaddlePaddle的开源实现中并没有提供工具包。 但是切分数据并不是一件非常复杂的事情,也不是神经网络实现的重点。
+
+多机训练过程中,经典的拓扑结构如下\:
+
+.. graphviz:: pserver_topology.dot
+
+图中每个灰色方块是一台机器,在每个机器中,先去启动一个 :code:`paddle pserver` 进程,并确定整体的端口号。可能的参数是\:
+
+.. code-block:: bash
+
+    paddle pserver --port=5000 --num_gradient_servers=4 --nics='eth0'
+
+这里说明系统的 :code:`paddle pserver` 的起始端口是 :code:`5000` ,并且有四个训练进程(:code:`gradient_servers`,Paddle同时将 :code:`paddle train` 进程称作 :code:`GradientServer` ,因为其为负责提供Gradient的进程)。 而对于训练进程,则需要在 :code:`paddle pserver` 启动之后,再在各个节点上运行如下命令\:
+
+.. code-block:: bash
+
+    paddle train --port=5000 --pservers=192.168.100.101,192.168.100.102,192.168.100.103,192.168.100.104 --config=...
+
+对于简单的多机协同,使用上述方式即可。同时,在高级情况下,pserver/train 通常还有两个参数需要设置,它们是
+
+* --ports_num\: 一个 pserver 进程共绑定多少个端口用来做稠密更新。默认是1
+* --ports_num_for_sparse\: 一个 pserver 进程共绑定多少端口用来做稀疏更新,默认是0
+
+使用手工指定端口数量,是因为Paddle的网络通信中,使用了 :code:`int32` 作为消息长度,比较容易在大模型下溢出。所以,在 :code:`paddle pserver` 进程中可以启动多个子线程去接收 trainer 的数据,这样单个子线程的长度就不会溢出了。但是这个值不可以调得过大,因为增加这个值,还是对性能,尤其是内存占用有一定的开销的,另外稀疏更新的端口如果太大的话,很容易某一个参数服务器没有分配到任何参数。
+
+详细的说明可以参考 `集群训练Paddle`_ 。
+
+
+.. _PyDataProvider: ../ui/data_provider/pydataprovider2.html
+.. _settings: ../../doc/ui/api/trainer_config_helpers/optimizers.html#settings
+.. _mixed_layer: ../../doc/ui/api/trainer_config_helpers/layers.html#mixed-layer
+.. _masking-gpu: http://www.acceleware.com/blog/cudavisibledevices-masking-gpus
+.. _集群训练Paddle: ../cluster/index.html
diff --git a/doc_cn/faq/index.rst b/doc_cn/faq/index.rst
index 283607957ce63099a61d220478728654e993fe9a..db28b4436fe5e76882861a4cf06f358a63d8ebd4 100644
--- a/doc_cn/faq/index.rst
+++ b/doc_cn/faq/index.rst
@@ -166,4 +166,14 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字

这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。

+7. *-cp27mu-linux_x86_64.whl is not a supported wheel on this platform.
+-----------------------------------------------------------------------
+
+出现这个问题的主要原因是,系统编译wheel包的时候,使用的 :code:`wheel` 包是最新的,
+而系统中的 :code:`pip` 包比较老。具体的解决方法是,更新 :code:`pip` 包并重新编译PaddlePaddle。
+更新 :code:`pip` 包的方法是\:
+
+.. code-block:: bash
+
+    pip install --upgrade pip
diff --git a/doc_cn/howto/how_to_write_docs/index.rst b/doc_cn/howto/how_to_write_docs/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..869ef747f9f88c7dbb5efdf6e03111a3f76c4014
--- /dev/null
+++ b/doc_cn/howto/how_to_write_docs/index.rst
@@ -0,0 +1,63 @@
+###############################
+如何贡献/修改PaddlePaddle的文档
+###############################
+
+PaddlePaddle的文档使用 `cmake`_ 驱动 `sphinx`_ 生成。共有两个文档,:code:`doc` 和 :code:`doc_cn` 。这两者会在 `cmake`_ 中进行编译,生成后的文档会存储在服务器的 :code:`doc` 和 :code:`doc_cn` 两个目录下。
+
+下面分几个部分介绍一下PaddlePaddle文档的贡献方法。
+
+如何书写PaddlePaddle的文档
+==========================
+
+TBD
+
+如何构建PaddlePaddle的文档
+==========================
+
+构建PaddlePaddle文档,需要使用构建Paddle的全部环境。准备这个环境相对来说比较复杂,所以本文档提供两种方式构建PaddlePaddle的文档,即
+
+* 使用Docker构建PaddlePaddle的文档
+* 直接构建PaddlePaddle的文档
+
+并且,我们推荐使用Docker来构建PaddlePaddle的文档。
+
+
+使用Docker构建PaddlePaddle的文档
+--------------------------------
+
+使用Docker构建PaddlePaddle的文档,首先要求在系统里安装好Docker工具包。安装Docker请参考 `Docker的官网 `_ 。
+
+安装好Docker之后可以使用源码目录下的脚本构建文档,即
+
+.. code-block:: bash
+
+    cd TO_YOUR_PADDLE_CLONE_PATH
+    cd paddle/scripts/tools/build_docs
+    bash build_docs.sh
+
+执行完这个脚本后,该目录下会生成两个目录,分别是\:
+
+* doc 目录,英文文档地址
+* doc_cn 目录,中文文档地址
+
+打开浏览器访问对应目录下的index.html即可访问本地文档。
+
+.. code-block:: bash
+
+    open doc_cn/index.html
+
+
+直接构建PaddlePaddle的文档
+--------------------------
+
+TBD
+
+
+如何更新www.paddlepaddle.org文档
+================================
+
+TBD
+
+
+.. _cmake: https://cmake.org/
+.. _sphinx: http://www.sphinx-doc.org/en/1.4.8/
\ No newline at end of file
diff --git a/doc_cn/index.rst b/doc_cn/index.rst
index d2d50fbdb47f27ad5ad8d22215a9f0993145430f..f1398206fddffca77f583c195e00034e55932639 100644
--- a/doc_cn/index.rst
+++ b/doc_cn/index.rst
@@ -3,8 +3,9 @@ PaddlePaddle文档
使用指南
--------
-
+* `介绍 `_
* `快速入门 `_
+* `基本使用概念 `_
* `编译与安装 `_
* `用户接口 `_
* `使用示例 `_
@@ -14,6 +15,7 @@ PaddlePaddle文档
开发指南
--------
* `新写Layer <../doc/dev/new_layer/index.html>`_
+* `如何贡献文档 `_

算法教程
--------
diff --git a/doc_cn/introduction/index.md b/doc_cn/introduction/index.md
new file mode 100644
index 0000000000000000000000000000000000000000..164cb7d4943dfbfcc00a2df7329ae2a877b2d703
--- /dev/null
+++ b/doc_cn/introduction/index.md
@@ -0,0 +1,105 @@
+# 简介
+
+PaddlePaddle 是起源于百度的开源深度学习平台。它是简单易用的:你可以通过简单的十数行配置搭建经典的神经网络模型;它也是高效强大的:PaddlePaddle可以支撑复杂集群环境下超大模型的训练,令你受益于深度学习的前沿成果。在百度内部,已经有大量产品线使用了基于PaddlePaddle的深度学习技术。
+
+这份简短的介绍将向你展示如何利用PaddlePaddle解决一个经典的学习问题。
+
+## 1. 一个经典的任务
+
+让我们从一个基础问题开始:单变量的线性回归。问题假定观测到了一批二维空间上的点 `(x, y)`,并且已知 `x` 和 `y` 之间存在着某种线性关系,我们的目标是通过观测数据还原这个线性关系。作为一个简单基础的模型,线性回归却有着广泛的应用场景。比如可以想象一个资产定价的简化场景,其中 `x` 对应于房屋的大小,`y` 对应于房屋价格。我们可以通过观察市场上房屋的情况获得二者之间的关系,从而为新房屋的定价提供参考。
+
+
+## 2. 准备数据
+
+假设变量 `X` 和 `Y` 的真实关系为: `Y = 2X + 0.3`,这里展示如何使用观测数据还原这一线性关系。如下Python代码将随机产生2000个观测点,它们将被用作PaddlePaddle的输入。产生PaddlePaddle的输入数据和写一段普通的Python脚本几乎一样,你唯一需要增加的就是定义输入数据的类型。
+
+```python
+# -*- coding:utf-8 -*-
+# dataprovider.py
+from paddle.trainer.PyDataProvider2 import *
+import random
+
+# 定义输入数据的类型: 2个浮点数
+@provider(input_types=[dense_vector(1), dense_vector(1)], use_seq=False)
+def process(settings, input_file):
+    for i in xrange(2000):
+        x = random.random()
+        yield [x], [2*x+0.3]
+```
+
+## 3. 训练模型
+
+为了还原 `Y = 2X + 0.3`,我们先从一条随机的直线 `Y' = wX + b` 开始,然后利用观测数据调整 `w` 和 `b`,使得 `Y'` 和 `Y` 的差距不断减小,最终趋于相同。这个过程就是模型的训练过程,而 `w` 和 `b` 就是模型的参数,即我们的训练目标。
+
+在PaddlePaddle里,该模型的网络配置如下。
+
+```python
+# -*- coding:utf-8 -*-
+# trainer_config.py
+from paddle.trainer_config_helpers import *
+
+# 1. 定义数据来源,调用上面的process函数获得观测数据
+data_file = 'empty.list'
+with open(data_file, 'w') as f: f.writelines(' ')
+define_py_data_sources2(train_list=data_file, test_list=None,
+        module='dataprovider', obj='process', args={})
+
+# 2. 学习算法。控制如何改变模型参数 w 和 b
+settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
+
+# 3. 
神经网络配置
+x = data_layer(name='x', size=1)
+y = data_layer(name='y', size=1)
+# 线性计算单元: y_predict = wx + b
+y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
+# 损失计算,度量 y_predict 和真实 y 之间的差距
+cost = regression_cost(input=y_predict, label=y)
+outputs(cost)
+```
+这段简短的配置展示了PaddlePaddle的基本用法:
+
+- 首先,第一部分定义了数据输入。一般情况下,PaddlePaddle先从一个文件列表里获得数据文件地址,然后交给用户自定义的函数(例如上面的`process`函数)进行读入和预处理从而得到真实输入。本文中由于输入数据是随机生成的,不需要读输入文件,所以放一个空列表(`empty.list`)即可。
+
+- 第二部分主要是选择学习算法,它定义了模型参数如何改变。PaddlePaddle提供了很多优秀的学习算法,但这里使用一个简单的基于momentum的算法就足够了,它每次读取12个数据进行计算和模型更新。
+
+- 最后一部分是神经网络的配置。由于PaddlePaddle已经实现了丰富的网络单元(Layer),所以很多时候你需要做的只是声明正确的网络单元并把它们拼接起来。这里使用了三种网络单元:
+  - **数据层**:数据层 `data_layer` 是神经网络的入口,它读入数据并将它们传输到下游的其它单元。这里数据层有两个,分别对应于变量 `X` 和 `Y`。
+  - **全连接层**:全连接层 `fc_layer` 是基础的计算单元,这里利用它建模变量之间的线性关系。计算单元是神经网络的核心,PaddlePaddle支持大量的计算单元和任意深度的网络连接,从而可以挖掘复杂的数据关系。
+  - **回归损失层**:回归损失层 `regression_cost` 是众多损失函数层的一种,它们在训练过程中作为网络的出口,用来计算模型的表现,并指导模型参数的改变。
+
+这样定义了网络结构并保存为`trainer_config.py`之后,运行训练命令即可:
+ ```
+ paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
+ ```
+
+PaddlePaddle将在观测数据集上迭代训练30轮,并将每轮的模型结果存放在 `./output` 路径下。从输出日志可以看到,随着轮数增加,损失函数的输出在不断地减小,这意味着模型在不断地改进,直到逼近真实解:`Y = 2X + 0.3`
+
+## 4. 模型检验
+
+训练完成后,我们希望能够检验模型的好坏。一种常用的做法是用模型对另外一组数据进行预测,然后评价预测的效果。但在这个例子中,由于已经知道了真实答案,我们可以直接观察模型的参数是否符合预期来进行检验。
+
+PaddlePaddle将每个模型参数作为一个numpy数组单独存为一个文件,所以可以利用如下方法读取模型的参数。
+
+```python
+import numpy as np
+import os
+
+def load(file_name):
+    with open(file_name, 'rb') as f:
+        f.read(16) # skip header for float type.
+        return np.fromfile(f, dtype=np.float32)
+
+print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
+# w=1.999743, b=0.300137
+```
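下面补充一段示意代码(非本文原有内容):沿用上面定义的 `load` 函数,用还原出的 `w`、`b` 对若干随机点做预测,并与真实关系对比。

```python
# -*- coding:utf-8 -*-
# 示意:用还原出的 w、b 做预测,并与真实关系 Y = 2X + 0.3 对比。
# 假设沿用上面定义的 load 函数,最后一轮模型保存在 output/pass-00029。
import random

w = load('output/pass-00029/w')[0]
b = load('output/pass-00029/b')[0]
for _ in xrange(3):
    x = random.random()
    print 'x=%.3f 真实y=%.3f 预测y=%.3f' % (x, 2 * x + 0.3, w * x + b)
```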
![](./parameters.png)
+ +从图中可以看到,虽然 `w` 和 `b` 都使用随机值初始化,但在起初的几轮训练中它们都在快速逼近真实值,并且后续仍在不断改进,使得最终得到的模型几乎与真实模型重合。 + +这样,我们就完成了对单变量线性回归问题的解决:将数据输入PaddlePaddle,训练模型,最后验证结果。 + +## 5. 推荐后续阅读 + +- 安装/编译:PaddlePaddle的安装与编译文档。 +- 快速入门 :使用商品评论分类任务,系统性的介绍如何一步步改进,最终得到产品级的深度模型。 +- 示例:各种实用案例,涵盖图像、文本、推荐等多个领域。 diff --git a/doc_cn/introduction/parameters.png b/doc_cn/introduction/parameters.png new file mode 100644 index 0000000000000000000000000000000000000000..2ec67480951e21f0400bce1c34b3108dcd65c18c Binary files /dev/null and b/doc_cn/introduction/parameters.png differ diff --git a/paddle/api/Arguments.cpp b/paddle/api/Arguments.cpp index 8f73e7626042c9b138625ec9db599fdc2e42cc9b..6f51d551200696ebafade2a46243b78086975265 100644 --- a/paddle/api/Arguments.cpp +++ b/paddle/api/Arguments.cpp @@ -14,27 +14,10 @@ limitations under the License. */ #include "PaddleAPI.h" +#include "PaddleAPIPrivate.h" #include "paddle/parameter/Argument.h" -struct ArgumentsPrivate { - std::vector outputs; - - inline paddle::Argument& getArg(size_t idx) throw(RangeError) { - if (idx < outputs.size()) { - return outputs[idx]; - } else { - RangeError e; - throw e; - } - } - - template - std::shared_ptr& cast(void* rawPtr) const { - return *(std::shared_ptr*)(rawPtr); - } -}; - size_t Arguments::getSlotNum() const { return m->outputs.size(); } Arguments* Arguments::createArguments(size_t slotNum) { diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt index fe0da763514a65911b30f42159c6fce7057d18a6..9b2d122a09adabd766014a9d21a167eec5b2de32 100644 --- a/paddle/api/CMakeLists.txt +++ b/paddle/api/CMakeLists.txt @@ -40,6 +40,8 @@ configure_file( generate_python_api(python_swig_sources) +file(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py) + # TODO(yuyang18) : make wheel name calculated by cmake add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/dist/.timestamp COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel @@ -55,6 +57,7 @@ add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/dist/.timestamp paddle_trainer paddle_api paddle_cuda + ${PY_PADDLE_PYTHON_FILES} ) install(DIRECTORY ${PROJ_ROOT}/paddle/dist/ diff --git a/paddle/api/ConfigParser.cpp b/paddle/api/ConfigParser.cpp index c5ee784a0bda09cd7a2b857d98431bec67afcae4..25d94f5a6a1255f3e2faff9816cfd003b20c0418 100644 --- a/paddle/api/ConfigParser.cpp +++ b/paddle/api/ConfigParser.cpp @@ -14,17 +14,9 @@ limitations under the License. */ #include "PaddleAPI.h" +#include "PaddleAPIPrivate.h" #include "paddle/trainer/Trainer.h" -struct TrainerConfigPrivate { - std::shared_ptr conf; - TrainerConfigPrivate() : conf(std::make_shared()) {} -}; - -struct ModelConfigPrivate { - std::shared_ptr conf; -}; - struct ParameterConfigPrivate { paddle::ParameterPtr parameter; paddle::ParameterConfig config; @@ -39,19 +31,6 @@ struct ParameterConfigPrivate { } }; -struct OptimizationConfigPrivate { - std::shared_ptr trainer_config; - paddle::OptimizationConfig config; - - paddle::OptimizationConfig& getConfig() { - if (trainer_config != nullptr) { - return *trainer_config->mutable_opt_config(); - } else { - return config; - } - } -}; - TrainerConfig::TrainerConfig() : m(new TrainerConfigPrivate()) {} TrainerConfig::~TrainerConfig() { delete m; } @@ -59,10 +38,19 @@ TrainerConfig::~TrainerConfig() { delete m; } TrainerConfig* TrainerConfig::createFromTrainerConfigFile( const std::string& confPath) { LOG(INFO) << "load trainer config from " << confPath; - paddle::TrainerConfigHelper helper(confPath); - //! 
TODO(yuyang18): Make TrainerConfigPrivate to TrainerConfigHelper + auto conf = std::make_shared(confPath); auto retv = new TrainerConfig(); - *retv->m->conf = helper.getConfig(); + retv->m->conf = conf; + return retv; +} + +TrainerConfig* TrainerConfig::createFromProtoString( + const std::string& str) { + auto retv = new TrainerConfig(); + paddle::TrainerConfig trainerConfigProto; + auto conf = std::make_shared(trainerConfigProto); + CHECK(conf->getMutableConfig().ParseFromString(str)); + retv->m->conf = conf; return retv; } @@ -76,10 +64,6 @@ ModelConfig* TrainerConfig::getModelConfig() const { return retv; } -void* ModelConfig::getPaddleModelConfig() const { - return m->conf->mutable_model_config(); -} - ParameterConfig::ParameterConfig() : m(new ParameterConfigPrivate()) {} ParameterConfig::~ParameterConfig() { @@ -132,8 +116,6 @@ OptimizationConfig* TrainerConfig::getOptimizationConfig() const { return opt_config; } -void* OptimizationConfig::getRawPtr() { return &m->getConfig(); } - OptimizationConfig* OptimizationConfig::createFromProtoString( const std::string& str) { auto conf = new OptimizationConfig(); diff --git a/paddle/api/GradientMachine.cpp b/paddle/api/GradientMachine.cpp index 6f1d63575a80f3011cc678df897d54d602edfb3b..bef499c67858b8e2d5432155a8defca56af6019c 100644 --- a/paddle/api/GradientMachine.cpp +++ b/paddle/api/GradientMachine.cpp @@ -14,30 +14,22 @@ limitations under the License. */ #include "PaddleAPI.h" -#include "paddle/gserver/gradientmachines/GradientMachine.h" +#include "PaddleAPIPrivate.h" + #include "paddle/gserver/gradientmachines/NeuralNetwork.h" #include "Internal.h" std::vector GradientMachine::defaultParamTypes = { PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM}; -struct GradientMachinePrivate { - std::shared_ptr machine; - - template - inline T& cast(void* ptr) { - return *(T*)(ptr); - } -}; - GradientMachine::GradientMachine() : m(new GradientMachinePrivate()) {} GradientMachine::~GradientMachine() { delete m; } GradientMachine* GradientMachine::createFromPaddleModelPtr( - void* confPtr, GradientMatchineCreateMode mode, + const void* confPtr, GradientMatchineCreateMode mode, const std::vector& types) { - auto& conf = *(paddle::ModelConfig*)(confPtr); + auto& conf = *(const paddle::ModelConfig*)(confPtr); std::vector realTypes; staticCastVector(&realTypes, types); auto machineRawPtr = paddle::GradientMachine::create(conf, mode, realTypes); @@ -66,7 +58,7 @@ GradientMachine* GradientMachine::createByConfigProtoStr( GradientMachine* GradientMachine::createByModelConfig( ModelConfig* conf, GradientMatchineCreateMode mode, const std::vector& types) { - auto confPtr = (paddle::ModelConfig*)conf->getPaddleModelConfig(); + auto confPtr = &conf->m->conf->getModelConfig(); return GradientMachine::createFromPaddleModelPtr(confPtr, mode, types); } diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h index b3140617af188b6a80067d9dbd312bd9e9155adf..cf790f2f8ef1dbdce37b279227e95328490c518d 100644 --- a/paddle/api/PaddleAPI.h +++ b/paddle/api/PaddleAPI.h @@ -446,7 +446,6 @@ struct OptimizationConfigPrivate; class OptimizationConfig { DISABLE_COPY_AND_ASSIGN(OptimizationConfig); OptimizationConfig(); - void* getRawPtr(); public: static OptimizationConfig* createFromProtoString(const std::string& str); @@ -462,6 +461,7 @@ private: friend class TrainerConfig; friend class ParameterOptimizer; + friend class Trainer; }; struct ParameterPrivate; @@ -515,8 +515,6 @@ public: virtual ~ModelConfig(); private: - void* getPaddleModelConfig() const; - 
ModelConfigPrivate* m; friend class TrainerConfig; friend struct TrainerConfigPrivate; @@ -539,6 +537,7 @@ public: static TrainerConfig* createFromTrainerConfigFile( const std::string& configPath); + static TrainerConfig* createFromProtoString(const std::string& str); ModelConfig* getModelConfig() const; @@ -546,6 +545,7 @@ public: private: TrainerConfigPrivate* m; + friend class Trainer; }; /** @@ -700,11 +700,12 @@ private: GradientMachinePrivate* m; static GradientMachine* createFromPaddleModelPtr( - void* confPtr, GradientMatchineCreateMode mode, + const void* confPtr, GradientMatchineCreateMode mode, const std::vector& types); // Not to use c++ 11 init-list, so we use static var as function default arg. static std::vector defaultParamTypes; + friend class Trainer; }; struct TrainerPrivate; @@ -712,6 +713,7 @@ class Trainer { private: TrainerPrivate* m; Trainer(); + Trainer(TrainerConfig* optConfig, GradientMachine* gm); DISABLE_COPY_AND_ASSIGN(Trainer); public: @@ -720,38 +722,42 @@ public: /// Create A Trainer By TrainerConfig. using paddle command line. static Trainer* createByCommandLine() throw(IOError); - /// Start Train. + static Trainer* create(TrainerConfig* optConfig, GradientMachine* gm) + throw(IOError); + + /// Start training void startTrain(); + + /// Finish training void finishTrain(); - /// Start Pass. + /// Start a pass. void startTrainPass(); - void finishTrainPass(); - void setBatchSize(size_t batchSize); + /// Finish a pass + void finishTrainPass(); /** * Train one batch, * - * @param batchSize -1 wiil use command line or batch size set before, - * otherwise use this batchSize for train. - * * @return true if all batch finished. */ - bool trainOneBatch(size_t batchSize = -1UL); + bool trainOneBatch(size_t batchSize); - bool prepareBatchData(size_t batchSize = -1UL); + void trainOneDataBatch(size_t batchSize, const Arguments& args); - void finishTrainOneBatch(); + void startTestPeriod(); + void testOneDataBatch(size_t batchSize, const Arguments& args); + void finishTestPeriod(); - void forwardOneBatch() throw(UnsupportError); + void forwardOneBatch(size_t batchSize); - Arguments* getNetworkOutput(); + Arguments* getForwardOutput(); Matrix* getLayerOutput(const std::string& layerName); }; -/// The N-Best results generated from one input sequence. +/// the N-Best results generated from one input sequence. class ISequenceResults { public: virtual ~ISequenceResults(); diff --git a/paddle/api/PaddleAPIPrivate.h b/paddle/api/PaddleAPIPrivate.h new file mode 100644 index 0000000000000000000000000000000000000000..93cdca8c4beaaad70a40e5899ccd908594425f4f --- /dev/null +++ b/paddle/api/PaddleAPIPrivate.h @@ -0,0 +1,68 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "paddle/gserver/gradientmachines/GradientMachine.h"
+#include "paddle/trainer/TrainerConfigHelper.h"
+
+#pragma once
+
+struct GradientMachinePrivate {
+  std::shared_ptr<paddle::GradientMachine> machine;
+
+  template <typename T>
+  inline T& cast(void* ptr) {
+    return *(T*)(ptr);
+  }
+};
+
+struct OptimizationConfigPrivate {
+  std::shared_ptr<paddle::TrainerConfigHelper> trainer_config;
+  paddle::OptimizationConfig config;
+
+  const paddle::OptimizationConfig& getConfig() {
+    if (trainer_config != nullptr) {
+      return trainer_config->getOptConfig();
+    } else {
+      return config;
+    }
+  }
+};
+
+struct TrainerConfigPrivate {
+  std::shared_ptr<paddle::TrainerConfigHelper> conf;
+  TrainerConfigPrivate() {}
+};
+
+struct ModelConfigPrivate {
+  std::shared_ptr<paddle::TrainerConfigHelper> conf;
+};
+
+struct ArgumentsPrivate {
+  std::vector<paddle::Argument> outputs;
+
+  inline paddle::Argument& getArg(size_t idx) throw(RangeError) {
+    if (idx < outputs.size()) {
+      return outputs[idx];
+    } else {
+      RangeError e;
+      throw e;
+    }
+  }
+
+  template <typename T>
+  std::shared_ptr<T>& cast(void* rawPtr) const {
+    return *(std::shared_ptr<T>*)(rawPtr);
+  }
+};
+
diff --git a/paddle/api/ParameterOptimizer.cpp b/paddle/api/ParameterOptimizer.cpp
index e087defc6043c18123909549ed63f630708d48eb..b13761ab0900d57008c17094c5199ef31a040f54 100644
--- a/paddle/api/ParameterOptimizer.cpp
+++ b/paddle/api/ParameterOptimizer.cpp
@@ -14,6 +14,7 @@ limitations under the License. */

#include "PaddleAPI.h"
+#include "PaddleAPIPrivate.h"
#include "paddle/parameter/ParameterOptimizer.h"
#include "Internal.h"
#include
@@ -60,10 +61,9 @@ ParameterOptimizer::~ParameterOptimizer() {

ParameterOptimizer* ParameterOptimizer::create(OptimizationConfig* config) {
  CHECK(config != nullptr);
-  auto opt_config_ptr = (paddle::OptimizationConfig*)config->getRawPtr();
  auto retOptimizer = new ParameterOptimizer();
  retOptimizer->m->optimizer.reset(
-      paddle::ParameterOptimizer::create(*opt_config_ptr, false));
+      paddle::ParameterOptimizer::create(config->m->getConfig(), false));
  return retOptimizer;
}
diff --git a/paddle/api/Trainer.cpp b/paddle/api/Trainer.cpp
index 95b578c8db9fdc12707c4dd7aac5a403862b47d8..b61f36f740d47fe785b30361f26059bf0b64829d 100644
--- a/paddle/api/Trainer.cpp
+++ b/paddle/api/Trainer.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License.
*/ #include "PaddleAPI.h" +#include "PaddleAPIPrivate.h" #include #include @@ -30,31 +31,17 @@ P_DECLARE_string(config); P_DECLARE_string(init_model_path); P_DECLARE_int32(start_pass); -struct TrainPassContext { - int64_t batchId; - int32_t batchSize; - real avgTestCost; - int64_t numAvgTests; - int passInnerId; - paddle::DataBatch data; - std::vector forwardOutput; -}; - struct TrainerPrivate : public paddle::Trainer { - void startTrain(); - void finishTrain(); - - void startTrainPass(); - void finishTrainPass(); - - bool _trainOneBatch(); - - bool _prepareBatchData(); - void _forwardOneBatch() throw(UnsupportError); - + bool _trainOneBatch(size_t batchSize); + bool forwardOneBatch(size_t batchSize); + void forwardOneDataBatch(const std::vector& inArgs); + void setBatchSize(size_t batchSize); + std::vector& getForwardOutput(); + + void startTestPeriod(); + void finishTestPeriod(); + void testOneDataBatch(const paddle::DataBatch& dataBatch); TrainerPrivate() : paddle::Trainer() {} - - TrainPassContext trainPassContext; }; Trainer::Trainer() : m(new TrainerPrivate()) { @@ -75,61 +62,76 @@ Trainer* Trainer::createByCommandLine() throw(IOError) { } } -void Trainer::startTrain() { m->startTrain(); } +Trainer::Trainer(TrainerConfig* config, GradientMachine* gm) + : m(new TrainerPrivate()) { + m->init(config->m->conf, /* testing= */false, gm ? gm->m->machine : nullptr); +} -void TrainerPrivate::startTrain() { - srand(this->config_->getConfig().start_pass() + 1); - this->dataProvider_->reset(); - this->trainerInternal_.getGradientMachine()->start(*config_, dataProvider_); +Trainer* Trainer::create(TrainerConfig* config, GradientMachine* gm) + throw(IOError) +{ + auto retv = new Trainer(config, gm); + if (retv->m->getConfig().IsInitialized()) { + return retv; + } else { + retv->m->getConfig().CheckInitialized(); + throw IOError(); + } } -void Trainer::finishTrain() { m->finishTrain(); } +void Trainer::startTrain() { m->startTrain(); } -void TrainerPrivate::finishTrain() { - this->trainerInternal_.getGradientMachine()->finish(); -} +void Trainer::finishTrain() { m->finishTrain(); } void Trainer::startTrainPass() { m->startTrainPass(); } -void TrainerPrivate::startTrainPass() { - this->stats_.reset(); - this->trainPassContext.batchId = 0; - this->trainPassContext.batchSize = this->config_->getOptConfig().batch_size(); - this->trainPassContext.avgTestCost = 0; - this->trainPassContext.numAvgTests = 0; - this->trainPassContext.passInnerId = 0; - this->trainerInternal_.getParameterUpdater()->startPass(); - this->evaluator_->start(); -} - void Trainer::finishTrainPass() { m->finishTrainPass(); } -void TrainerPrivate::finishTrainPass() { - this->trainerInternal_.getGradientMachine()->onPassEnd(); - this->trainerInternal_.getParameterUpdater()->finishPass(); - evaluator_->finish(); +void Trainer::trainOneDataBatch(size_t batchSize, const Arguments& inArgs) { + paddle::DataBatch dataBatch; + dataBatch.getStreams() = inArgs.m->outputs; + dataBatch.setSize(batchSize); + m->trainOneDataBatch(dataBatch); } -void Trainer::setBatchSize(size_t batchSize) { - this->m->trainPassContext.batchSize = batchSize; +bool Trainer::trainOneBatch(size_t batchSize) { + return m->_trainOneBatch(batchSize); } -bool Trainer::trainOneBatch(size_t batchSize) { - if (batchSize == -1UL) { - this->setBatchSize(batchSize); +bool TrainerPrivate::_trainOneBatch(size_t batchSize) { + paddle::DataBatch dataBatch; + CHECK(dataProvider_) << "data_provider is not specified"; + int num = dataProvider_->getNextBatch(batchSize, &dataBatch); + 
if (num == 0) { + return false; } - return m->_trainOneBatch(); + trainOneDataBatch(dataBatch); + return false; } -bool TrainerPrivate::_trainOneBatch() { - if (this->_prepareBatchData()) { - return true; +void TrainerPrivate::startTestPeriod() { + if (!tester_) { + createTester(); } - this->trainerInternal_.trainOneBatch(this->trainPassContext.batchId, - this->trainPassContext.data); - return false; + tester_->startTestPeriod(); +} + +void Trainer::startTestPeriod() { m->startTestPeriod(); } + +void TrainerPrivate::testOneDataBatch(const paddle::DataBatch& dataBatch) { + tester_->testOneDataBatch(dataBatch, &forwardOutput_); +} + +void Trainer::testOneDataBatch(size_t batchSize, const Arguments& args) { + paddle::DataBatch dataBatch; + dataBatch.getStreams() = args.m->outputs; + dataBatch.setSize(batchSize); + m->testOneDataBatch(dataBatch); } +void TrainerPrivate::finishTestPeriod() { tester_->finishTestPeriod(); } +void Trainer::finishTestPeriod() { m->finishTestPeriod(); } + Matrix* Trainer::getLayerOutput(const std::string& layerName) { auto nn = std::dynamic_pointer_cast( this->m->getGradientMachine()); @@ -138,46 +140,37 @@ Matrix* Trainer::getLayerOutput(const std::string& layerName) { return Matrix::createByPaddleMatrixPtr(&m); } -bool Trainer::prepareBatchData(size_t batchSize) { - if (batchSize != -1UL) { - this->setBatchSize(batchSize); +void Trainer::forwardOneBatch(size_t batchSize) { m->forwardOneBatch(batchSize); } + +bool TrainerPrivate::forwardOneBatch(size_t batchSize) { + CHECK(dataProvider_) << "data_provider is not specified"; + paddle::DataBatch dataBatch; + int num = dataProvider_->getNextBatch(batchSize, &dataBatch); + if (num == 0) { + return false; } - return this->m->_prepareBatchData(); -} -bool TrainerPrivate::_prepareBatchData() { - int num = dataProvider_->getNextBatch(this->trainPassContext.batchSize, - &this->trainPassContext.data); - return num == 0; + forwardOneDataBatch(dataBatch.getStreams()); + return true; } -void Trainer::finishTrainOneBatch() { ++m->trainPassContext.batchId; } +void TrainerPrivate::forwardOneDataBatch( + const std::vector& inArgs) { -void Trainer::forwardOneBatch() throw(UnsupportError) { m->_forwardOneBatch(); } - -void TrainerPrivate::_forwardOneBatch() throw(UnsupportError) { - auto& dataBatch = this->trainPassContext.data; - - int64_t actualBatchSize = dataBatch.getSize(); - if (actualBatchSize == 0) { - return; - } - - const std::vector& inArgs = dataBatch.getStreams(); - std::vector& outArgs = this->trainPassContext.forwardOutput; - outArgs.clear(); - paddle::PassType passType = - this->trainerInternal_.getParameterUpdater()->startBatch(actualBatchSize); + std::vector& outArgs = forwardOutput_; if (config_->getOptConfig().use_sparse_remote_updater()) { - this->trainerInternal_.getGradientMachine()->prefetch(inArgs); - this->trainerInternal_.getParameterUpdater()->getParametersRemote(); + trainerInternal_.getGradientMachine()->prefetch(inArgs); + trainerInternal_.getParameterUpdater()->getParametersRemote(); } - this->trainerInternal_.getGradientMachine()->forward( - inArgs, &outArgs, passType); + trainerInternal_.getGradientMachine()->forward( + inArgs, &outArgs, paddle::PASS_TEST); +} + +Arguments* Trainer::getForwardOutput() { + return Arguments::createByPaddleArgumentVector(&m->getForwardOutput()); } -Arguments* Trainer::getNetworkOutput() { - return Arguments::createByPaddleArgumentVector( - &m->trainPassContext.forwardOutput); +std::vector& TrainerPrivate::getForwardOutput() { + return forwardOutput_; } diff --git 
a/paddle/api/test/run_tests.sh b/paddle/api/test/run_tests.sh index 1fc6fd5a8c185301612655d9971082203fe647dc..a4814f98f89c2e24195074369bc897b8b4bd2d9b 100755 --- a/paddle/api/test/run_tests.sh +++ b/paddle/api/test/run_tests.sh @@ -30,7 +30,7 @@ source .test_env/bin/activate pip --timeout 600 install ../../dist/*.whl -test_list="testArguments.py testGradientMachine.py testMatrix.py testVector.py testTrain.py" +test_list="testArguments.py testGradientMachine.py testMatrix.py testVector.py testTrain.py testTrainer.py" export PYTHONPATH=$PWD/../../../python/ diff --git a/paddle/api/test/testTrain.py b/paddle/api/test/testTrain.py index 7f79c2701e9ed2e8c618be076d684c7793a8ad42..7759118a3d9d108f0c05d985ac74a5122799ccb4 100644 --- a/paddle/api/test/testTrain.py +++ b/paddle/api/test/testTrain.py @@ -12,9 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from py_paddle import swig_paddle, DataProviderWrapperConverter +from py_paddle import swig_paddle import paddle.trainer.config_parser -from paddle.trainer.PyDataProviderWrapper import DenseSlot, IndexSlot import numpy import util diff --git a/paddle/api/test/testTrainer.py b/paddle/api/test/testTrainer.py new file mode 100644 index 0000000000000000000000000000000000000000..da69a60f84f4d7c6fad54fc116a31b54ef162f60 --- /dev/null +++ b/paddle/api/test/testTrainer.py @@ -0,0 +1,63 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from paddle.trainer.config_parser import parse_config +from paddle.trainer.config_parser import logger +from py_paddle import swig_paddle +import util + +def main(): + trainer_config = parse_config( + "./testTrainConfig.py", "") + model = swig_paddle.GradientMachine.createFromConfigProto( + trainer_config.model_config) + trainer = swig_paddle.Trainer.create(trainer_config, model) + trainer.startTrain() + for train_pass in xrange(2): + trainer.startTrainPass() + num = 0 + cost = 0 + while True: # Train one batch + batch_size = 1000 + data, atEnd = util.loadMNISTTrainData(batch_size) + if atEnd: + break + trainer.trainOneDataBatch(batch_size, data) + outs = trainer.getForwardOutput() + cost += sum(outs[0]['value']) + num += batch_size + trainer.finishTrainPass() + logger.info('train cost=%f' % (cost / num)) + + trainer.startTestPeriod() + num = 0 + cost = 0 + while True: # Test one batch + batch_size = 1000 + data, atEnd = util.loadMNISTTrainData(batch_size) + if atEnd: + break + trainer.testOneDataBatch(batch_size, data) + outs = trainer.getForwardOutput() + cost += sum(outs[0]['value']) + num += batch_size + trainer.finishTestPeriod() + logger.info('test cost=%f' % (cost / num)) + + trainer.finishTrain() + + +if __name__ == '__main__': + swig_paddle.initPaddle("--use_gpu=0", "--trainer_count=1") + main() diff --git a/paddle/cuda/include/hl_cuda_cublas.h b/paddle/cuda/include/hl_cuda_cublas.h index 0ffbed18b5f9e57f22d1bbe1a98a0d899f2fa88d..d757317eb4a97559feef22d4fd8edf7c10ca6745 100644 --- a/paddle/cuda/include/hl_cuda_cublas.h +++ b/paddle/cuda/include/hl_cuda_cublas.h @@ -21,8 +21,8 @@ limitations under the License. */ /** * @brief Matrix transpose: C_d = T(A_d) * - * @param[in] A_d input matrix (M x N). - * @param[out] C_d output matrix (N x M). + * @param[in] A_d input matrix (dimM x dimN). + * @param[out] C_d output matrix (dimN x dimM). * @param[in] dimM matrix height. * @param[in] dimN matrix width. * @param[in] lda the first dimension of A_d. @@ -39,8 +39,8 @@ extern void hl_matrix_transpose(real *A_d, /* * @brief Matrix transpose, while lda = dimN, ldc = dimM. * - * @param[in] A_d input matrix (M x N). - * @param[out] C_d output matrix (N x M). + * @param[in] A_d input matrix (dimM x dimN). + * @param[out] C_d output matrix (dimN x dimM). * @param[in] dimM matrix height. * @param[in] dimN matrix width. * @@ -50,6 +50,22 @@ extern void hl_matrix_transpose(real *A_d, int dimM, int dimN); +/* + * @brief Matrix inverse + * + * @param[in] A_d input matrix (dimN x dimN). + * @param[out] C_d output matrix (dimN x dimN). + * @param[in] dimN matrix height = matrix width + * @param[in] lda the first dimension of A_d + * @param[in] ldc the first dimension of C_d + * + */ +extern void hl_matrix_inverse(real *A_d, + real *C_d, + int dimN, + int lda, + int ldc); + /** * @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d * diff --git a/paddle/cuda/include/hl_device_functions.cuh b/paddle/cuda/include/hl_device_functions.cuh index 88d950d6c17132d1d9969d0f3766395377e2de96..159c26f443cb17116da2d2d5282f883d875a85be 100755 --- a/paddle/cuda/include/hl_device_functions.cuh +++ b/paddle/cuda/include/hl_device_functions.cuh @@ -48,5 +48,24 @@ inline __device__ double paddleAtomicAdd(double* address, double val) { } } // namespace paddle +/** + * @brief sum reduction + * + * @param[in,out] smem input data, better to use __shared__ memory. + * @param[in] tid thread index. + * @param[in] threads the total thread number used to reduce, + * such as, blockDim.x. 
+ * + * @return smem[0]: the sum of each elements in smem. + */ +__device__ __forceinline__ +void simpleReduce(real* smem, int tid, int threads) { + for (unsigned int s = threads / 2; s > 0; s >>= 1) { + if (tid < s) { + smem[tid] += smem[tid + s]; + } + __syncthreads(); + } +} #endif /* HL_DEVICE_FUNCTIONS_CUH_ */ diff --git a/paddle/cuda/include/hl_matrix.h b/paddle/cuda/include/hl_matrix.h index 17419790471a7d1e86f2cf0017290004ec0c4dfc..71e8f8e3a60c9ff340f36c5057a22cecc112fd48 100644 --- a/paddle/cuda/include/hl_matrix.h +++ b/paddle/cuda/include/hl_matrix.h @@ -229,4 +229,40 @@ extern void hl_cossim_derivative(real* grad, int input2_height, real scale); +/** + * @brief Matrix addition: A_d[i][j] += scale * B_d[j/channel]. + * + * @param[in] A_d input matrix (M x N). + * @param[in] B_d input matrix (1 x channel). + * @param[in] channel width of B. + * @param[in] dimM height of A. + * @param[in] dimN width of A. + * @param[in] scale scalar used for addition. + * + */ +extern void hl_matrix_add_shared_bias(real* A_d, + real* B_d, + const int channel, + const int dimM, + const int dimN, + real scale); + +/** + * @brief Matrix addition: A_d[i][j] += scale * B_d[j/channel]. + * + * @param[in] B_d input matrix (1 x channel). + * @param[in] A_d input matrix (M x N). + * @param[in] channel width of B. + * @param[in] dimM height of A. + * @param[in] dimN width of A. + * @param[in] scale scalar used for addition. + * + */ +extern void hl_matrix_collect_shared_bias(real* B_d, + real* A_d, + const int channel, + const int dimM, + const int dimN, + real scale); + #endif /* HL_MATRIX_H_ */ diff --git a/paddle/cuda/include/stub/hl_cuda_cublas_stub.h b/paddle/cuda/include/stub/hl_cuda_cublas_stub.h index 4a5e2a25a71b38b2c38688820cbffdb10251bcac..903dcbe8355d6f593d96bc1f9e686d54035a9366 100644 --- a/paddle/cuda/include/stub/hl_cuda_cublas_stub.h +++ b/paddle/cuda/include/stub/hl_cuda_cublas_stub.h @@ -30,6 +30,12 @@ inline void hl_matrix_transpose(real *A_d, int dimM, int dimN) {} +inline void hl_matrix_inverse(real *A_d, + real *C_d, + int dimN, + int lda, + int ldc) {} + inline void hl_matrix_mul(real *A_d, hl_trans_op_t transa, real *B_d, hl_trans_op_t transb, real *C_d, diff --git a/paddle/cuda/include/stub/hl_matrix_stub.h b/paddle/cuda/include/stub/hl_matrix_stub.h index f1f1020c84d46cb14a85fa7569fa6cf36a1c8dab..e37b1275432caae29b14e95658e3db291632a672 100644 --- a/paddle/cuda/include/stub/hl_matrix_stub.h +++ b/paddle/cuda/include/stub/hl_matrix_stub.h @@ -101,4 +101,17 @@ inline void hl_cossim_derivative(real* grad, int input2_height, real scale) {} +inline void hl_matrix_add_shared_bias(real* A_d, + real* B_d, + const int channel, + const int dimM, + const int dimN, + real scale) {} + +inline void hl_matrix_collect_shared_bias(real* B_d, + real* A_d, + const int channel, + const int dimM, + const int dimN, + real scale) {} #endif // HL_MATRIX_STUB_H_ diff --git a/paddle/cuda/src/hl_cuda_cublas.cc b/paddle/cuda/src/hl_cuda_cublas.cc index b3c9001ba397361376ee191081a71863b2e5a578..724ea490e8ea9a8b2a1be39f3e0037df6e49882f 100644 --- a/paddle/cuda/src/hl_cuda_cublas.cc +++ b/paddle/cuda/src/hl_cuda_cublas.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include #include +#include "hl_cuda.h" #include "hl_cuda_cublas.h" #include "hl_thread.ph" #include "hl_dso_loader.h" @@ -75,6 +76,8 @@ DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmBatched) DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmBatched) DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmBatched) DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetrfBatched) +DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetriBatched) CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP) #undef DYNAMIC_LOAD_CUBLAS_WRAP @@ -88,10 +91,14 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP) #define CUBLAS_GEAM dynload::cublasSgeam #define CUBLAS_GEMV dynload::cublasSgemv #define CUBLAS_GEMM dynload::cublasSgemm +#define CUBLAS_GETRF dynload::cublasSgetrfBatched +#define CUBLAS_GETRI dynload::cublasSgetriBatched #else #define CUBLAS_GEAM dynload::cublasDgeam #define CUBLAS_GEMV dynload::cublasDgemv #define CUBLAS_GEMM dynload::cublasDgemm +#define CUBLAS_GETRF dynload::cublasDgetrfBatched +#define CUBLAS_GETRI dynload::cublasDgetriBatched #endif const char* hl_cublas_get_error_string(cublasStatus_t status) { @@ -162,6 +169,54 @@ void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN) { hl_matrix_transpose(A_d, C_d, dimM, dimN, dimN, dimM); } +void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) { + /* Solve Ax = I */ + CHECK_NOTNULL(A_d); + CHECK_NOTNULL(C_d); + + /* Step 1: Compute the LU decomposition of matrix A */ + real **inout_h = &A_d; + real **inout_d = (real **)hl_malloc_device(sizeof(real *)); + hl_memcpy(inout_d, inout_h, sizeof(real *)); + + int *pivot_d = (int *)hl_malloc_device(dimN*sizeof(int)); + int *info_d = (int *)t_resource.gpu_mem; + + /* Note: cublasSgetrfBatched is used to calculate a number of + small-sized matrices. There may be a better way to reconstruct + the API for better performance. + */ + CHECK_CUBLAS(CUBLAS_GETRF(t_resource.handle, + dimN, inout_d, lda, pivot_d, + info_d, 1)); + + int info_h; + hl_memcpy(&info_h, info_d, sizeof(int)); + if (info_h != 0) { + LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n"; + } + + /* Step 2: Compute the inverse of the matrix given its LU decomposition */ + real **out_h = &C_d; + real **out_d = (real **)hl_malloc_device(sizeof(real *)); + hl_memcpy(out_d, out_h, sizeof(real *)); + + CHECK_CUBLAS(CUBLAS_GETRI(t_resource.handle, + dimN, (const real **)inout_d, lda, pivot_d, + out_d, ldc, info_d, 1)); + + hl_memcpy(&info_h, info_d, sizeof(int)); + if (info_h != 0) { + LOG(FATAL) << "Inversion of matrix failed: matrix may be singular.\n"; + } + + hl_free_mem_device(inout_d); + hl_free_mem_device(pivot_d); + hl_free_mem_device(out_d); + + CHECK_SYNC("hl_matrix_inverse failed"); +} + void hl_matrix_mul(real *A_d, hl_trans_op_t transa, real *B_d, hl_trans_op_t transb, real *C_d, diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc index b215c0f6e33a18630f41668f97e5e06ad6b29800..92b28e4345c3d4d306e6ee2a7f9f50189454f951 100644 --- a/paddle/cuda/src/hl_cuda_cudnn.cc +++ b/paddle/cuda/src/hl_cuda_cudnn.cc @@ -20,6 +20,11 @@ limitations under the License. 
*/ #include "hl_thread.ph" #include "hl_dso_loader.h" #include "paddle/utils/Logging.h" +#include "paddle/utils/CommandLineParser.h" + +P_DEFINE_int32(cudnn_conv_workspace_limit_in_mb, 4096, + "Specify cuDNN max workspace limit, in units MB, " + "4096MB=4GB by default."); namespace dynload { @@ -36,65 +41,28 @@ void* cudnn_dso_handle = nullptr; #ifdef PADDLE_USE_DSO -#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - cudnnStatus_t operator()(Args... args) { \ - typedef cudnnStatus_t (*cudnnFunc)(Args...); \ - std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, \ - &cudnn_dso_handle); \ - void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ - return reinterpret_cast(p_##__name)(args...); \ - } \ +#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using cudnn_func = decltype(__name(args...))(*)(Args...); \ + std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, \ + &cudnn_dso_handle); \ + void* p_##__name = dlsym(cudnn_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ } __name; /* struct DynLoad__##__name */ -struct DynLoad__cudnnGetVersion { - template - size_t operator()(Args... args) { - typedef size_t (*cudnnFunc)(Args...); - std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, - &cudnn_dso_handle); - void* p_name = dlsym(cudnn_dso_handle, "cudnnGetVersion"); - return reinterpret_cast(p_name)(args...); - } -} cudnnGetVersion; /* struct DynLoad__##__name */ - -struct DynLoad__cudnnGetErrorString { - template - const char* operator()(Args... args) { - typedef const char* (*cudnnFunc)(Args...); - std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, - &cudnn_dso_handle); - void* p_name = dlsym(cudnn_dso_handle, "cudnnGetErrorString"); - return reinterpret_cast(p_name)(args...); - } -} cudnnGetErrorString; /* struct DynLoad__##__name */ - - #else -#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - cudnnStatus_t operator()(Args... args) { \ - return __name(args...); \ - } \ +#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + return __name(args...); \ + } \ } __name; /* struct DynLoad__##__name */ -struct DynLoad__cudnnGetVersion { - template - size_t operator()(Args... args) { - return cudnnGetVersion(args...); - } -} cudnnGetVersion; /* struct DynLoad__##__name */ - -struct DynLoad__cudnnGetErrorString { - template - const char* operator()(Args... 
args) { - return cudnnGetErrorString(args...); - } -} cudnnGetErrorString; /* struct DynLoad__##__name */ - #endif /** @@ -128,7 +96,9 @@ struct DynLoad__cudnnGetErrorString { __macro(cudnnPoolingForward) \ __macro(cudnnPoolingBackward) \ __macro(cudnnSoftmaxBackward) \ - __macro(cudnnSoftmaxForward) + __macro(cudnnSoftmaxForward) \ + __macro(cudnnGetVersion) \ + __macro(cudnnGetErrorString) CUDNN_DNN_ROUTINE_EACH(DYNAMIC_LOAD_CUDNN_WRAP) #define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \ @@ -242,7 +212,7 @@ void hl_conv_workspace(hl_tensor_descriptor input, CHECK_NOTNULL(conv); // Specify workspace limit directly - size_t memoryLimitBytes = 8 * 1024 * 1024; + size_t memoryLimitBytes = (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb; // cudnn convolution forward configuration cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input); diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc index e9fe9f1c117a0573643c81f061bb36399523b38d..3ea2c91bd5a41e0cd6ece0605a25e645676faa40 100644 --- a/paddle/cuda/src/hl_cuda_device.cc +++ b/paddle/cuda/src/hl_cuda_device.cc @@ -85,44 +85,24 @@ void* cudart_dso_handle = nullptr; #define DYNAMIC_LOAD_CUDART_WRAP(__name) \ struct DynLoad__##__name { \ template <typename... Args> \ - cudaError_t operator()(Args... args) { \ - typedef cudaError_t (*cudartFunc)(Args...); \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ + using cudart_func = decltype(__name(args...))(*)(Args...); \ std::call_once(cudart_dso_flag, GetCudartDsoHandle, \ &cudart_dso_handle); \ void* p_##__name = dlsym(cudart_dso_handle, #__name); \ - return reinterpret_cast<cudartFunc>(p_##__name)(args...); \ + return reinterpret_cast<cudart_func>(p_##__name)(args...); \ } \ } __name; /* struct DynLoad__##__name */ #else #define DYNAMIC_LOAD_CUDART_WRAP(__name) \ struct DynLoad__##__name { \ template <typename... Args> \ - cudaError_t operator()(Args... args) { \ + auto operator()(Args... args) -> decltype(__name(args...)) { \ return __name(args...); \ } \ } __name; /* struct DynLoad__##__name */ #endif -#ifdef PADDLE_USE_DSO - struct DynLoad__cudaGetErrorString { - template <typename... Args> - const char* operator()(Args... args) { - typedef const char* (*cudaFunc)(Args...); - std::call_once(cudart_dso_flag, GetCudartDsoHandle, - &cudart_dso_handle); - void* p_func = dlsym(cudart_dso_handle, "cudaGetErrorString"); - return reinterpret_cast<cudaFunc>(p_func)(args...); - } - } cudaGetErrorString; /* struct DynLoad__cudaGetErrorString */ -#else -struct DynLoad__cudaGetErrorString { - template <typename... Args> - const char* operator()(Args... args) { - return cudaGetErrorString(args...); - } -} cudaGetErrorString; /* struct DynLoad__cudaGetErrorString */ -#endif - /* include all needed cuda functions in HPPL */ #define CUDA_ROUTINE_EACH(__macro) \ __macro(cudaMalloc) \ @@ -152,7 +132,8 @@ struct DynLoad__cudaGetErrorString { __macro(cudaSetDeviceFlags) \ __macro(cudaGetLastError) \ __macro(cudaFuncSetCacheConfig) \ - __macro(cudaRuntimeGetVersion) + __macro(cudaRuntimeGetVersion) \ + __macro(cudaGetErrorString) CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP) diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu index 067e68c41e11986fd740ea1a524763f8b1bd4c0c..3df9f63f9e4b79d61a818b2af49a4d9dfd84a9ab 100644 --- a/paddle/cuda/src/hl_cuda_matrix.cu +++ b/paddle/cuda/src/hl_cuda_matrix.cu @@ -20,6 +20,7 @@ limitations under the License.
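
Editor's note — the rewritten wrapper macros here and in hl_cuda_device.cc above share one idea: a trailing return type of decltype(__name(args...)) lets a single macro wrap functions of any return type, which is why the hand-written DynLoad__cudnnGetVersion, DynLoad__cudnnGetErrorString, and DynLoad__cudaGetErrorString structs could be deleted and those symbols folded into the routine lists. A self-contained sketch of the pattern, using illustrative names (libexample.so, example_add) that are not part of the patch:

#include <dlfcn.h>
#include <mutex>

// Stand-in for a function declared by the wrapped library's header; the
// decltype in the macro needs such a declaration to be visible.
extern "C" int example_add(int a, int b);

static std::once_flag g_dso_flag;
static void* g_dso_handle = nullptr;
static void OpenDso() {
  // "libexample.so" is a hypothetical library name for this sketch.
  g_dso_handle = dlopen("libexample.so", RTLD_LAZY | RTLD_LOCAL);
}

#define DYNAMIC_LOAD_WRAP(__name)                                  \
  struct DynLoad__##__name {                                       \
    template <typename... Args>                                    \
    auto operator()(Args... args) -> decltype(__name(args...)) {   \
      using func_type = decltype(__name(args...)) (*)(Args...);    \
      std::call_once(g_dso_flag, OpenDso);                         \
      void* p = dlsym(g_dso_handle, #__name);                      \
      return reinterpret_cast<func_type>(p)(args...);              \
    }                                                              \
  } __name;

namespace dynload {
DYNAMIC_LOAD_WRAP(example_add)  // expands to a callable `example_add` object
}  // namespace dynload

The decltype compiles because the real function is already declared by the library header; the object declared by the macro then shadows that declaration, so a call such as dynload::example_add(1, 2) resolves the symbol lazily on first use.
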
*/ #include "hl_sequence.h" #include "paddle/utils/Logging.h" #include "hl_device_functions.cuh" +#include "hl_gpu_matrix_kernel.cuh" DEFINE_MATRIX_UNARY_OP(Zero, a = 0); DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1*a + p2*b); @@ -673,3 +674,89 @@ void hl_cossim_derivative(real* grad, input1_height, input2_height, scale); CHECK_SYNC("hl_cossim_derivate failed"); } + +__global__ void KeMatrixAddSharedBias(real* A, + real* B, + const int channel, + const int M, + const int N, + real scale) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int dim = N / channel; + if (index < M * N) { + int i = index % N; + i = i / dim; + A[index] += scale * B[i]; + } +} + +void hl_matrix_add_shared_bias(real* A_d, + real* B_d, + const int channel, + const int dimM, + const int dimN, + real scale) { + const int blocks = 512; + const int grids = DIVUP(dimM * dimN, blocks); + KeMatrixAddSharedBias<<>> + (A_d, B_d, channel, dimM, dimN, scale); + CHECK_SYNC("hl_matrix_add_shared_bias failed"); +} + + +template +__global__ void KeMatrixCollectSharedBias(real *B, + real *A, + const int channel, + const int M, + const int N, + const int dim, + const int limit, + real scale) { + if (dim < limit) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + if (index < channel) { + real sum = 0.0; + for (int i = 0; i < M; ++i) { + for (int j = 0; j < dim; ++j) { + sum += A[i * N + index * dim + j]; + } + } + B[index] += scale * sum; + } + } else { + const int tid = threadIdx.x; + const int bid = blockIdx.x; + __shared__ real smem[blockSize]; + real sum = 0.0; + for (int j = 0; j < ((dim * M + blockSize - 1) / blockSize); ++j) { + int n = j * blockSize + tid; + int m = n / dim; + int w = n % dim; + smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0; + __syncthreads(); + simpleReduce(smem, tid, blockSize); + sum += smem[0]; + } + if (tid == 0) { + B[bid] += scale * sum; + } + } +} + +void hl_matrix_collect_shared_bias(real* B_d, + real* A_d, + const int channel, + const int dimM, + const int dimN, + real scale) { + const int dim = dimN / channel; + const int blocks = 256; + const int limit = 64; + int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel; + + KeMatrixCollectSharedBias + <<< grids, blocks, 0, STREAM_DEFAULT>>> + (B_d, A_d, channel, dimM, dimN, dim, limit, scale); + CHECK_SYNC("hl_matrix_collect_shared_bias failed"); +} diff --git a/paddle/cuda/src/hl_cuda_sparse.cuh b/paddle/cuda/src/hl_cuda_sparse.cuh index c3b98f4ebc38db055e3ac90691021665cbd97ced..9cf2d5a843343075c33d19bf34d9ed315299de83 100644 --- a/paddle/cuda/src/hl_cuda_sparse.cuh +++ b/paddle/cuda/src/hl_cuda_sparse.cuh @@ -908,24 +908,6 @@ int findIndex(int* indice, int num, int index) { return (end - 1); } -/** - * @brief sum reduction - * - * @param[in,out] smem input data, better to use __shared__ memory. - * @param[in] tid local thread index. - * @param[in] blockDimX the size of blockDim.x. - * - * note: return smem[0]: the sum of each elements of smem. - */ -__device__ __forceinline__ -void reduce(real* smem, int tid, int blockDimX) { - for (unsigned int s = blockDimX / 2; s > 0; s >>= 1) { - if (tid < s) { - smem[tid] += smem[tid + s]; - } - __syncthreads(); - } -} /** * @brief sum columns of csr sparse matrix (csr_val), then add to a_val. 
diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc index eee9984e07326668a49fd2627e361804a6aacd7b..c0b5d6e357fc70ed17180ab38458164918b13878 100644 --- a/paddle/cuda/src/hl_dso_loader.cc +++ b/paddle/cuda/src/hl_dso_loader.cc @@ -46,63 +46,100 @@ static inline std::string join(const std::string& part1, const std::string& part return ret; } -static inline void GetDsoHandleWithSearchPath( +static inline void GetDsoHandleFromDefaultPath( + std::string& dso_path, void** dso_handle, int dynload_flags) { + LOG(INFO) << "Try to find cuda library: " << dso_path + << " from default system path."; + // default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH + *dso_handle = dlopen(dso_path.c_str(), dynload_flags); + + // DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 by + // System Integrity Protection (SIP); if dso_handle is + // still null, search the default package path on Mac OS. + #if defined(__APPLE__) || defined(__OSX__) + if (nullptr == *dso_handle) { + dso_path = join("/usr/local/cuda/lib/", dso_path); + *dso_handle = dlopen(dso_path.c_str(), dynload_flags); + if (nullptr == *dso_handle) { + if (dso_path == "libcudnn.dylib") { + LOG(FATAL) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n" + << "For instance, sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C " + << "/usr/local \n sudo chmod a+r /usr/local/cuda/include/cudnn.h " + << "/usr/local/cuda/lib/libcudnn*"; + } + } + } + #endif +} + +static inline void GetDsoHandleFromSearchPath( const std::string& search_root, - const std::string& dso_path, + const std::string& dso_name, void** dso_handle) { int dynload_flags = RTLD_LAZY | RTLD_LOCAL; *dso_handle = nullptr; - std::string dlPath = dso_path; if (search_root.empty()) { - // default search xxx.so from LD_LIBRARY_PATH - *dso_handle = dlopen(dlPath.c_str(), dynload_flags); + GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); } else { // search xxx.so from custom path - dlPath = join(search_root, dso_path); + dlPath = join(search_root, dso_name); *dso_handle = dlopen(dlPath.c_str(), dynload_flags); - // then, search xxx.so from LD_LIBRARY_PATH - if (nullptr == *dso_handle) { - *dso_handle = dlopen(dso_path.c_str(), dynload_flags); + // if not found, search from default path + if (nullptr == *dso_handle) { + LOG(WARNING) << "Failed to find cuda library: " << dlPath; + dlPath = dso_name; + GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); } } CHECK(nullptr != *dso_handle) - << "For Gpu version of PaddlePaddle, it couldn't find CUDA library: " - << dlPath.c_str() << ". Please make sure you already specify its path. " - << "Note: for training data on Cpu using Gpu version of PaddlePaddle, " - << "you must specify libcudart via export LD_LIBRARY_PATH for Linux or " - << "export DYLD_LIBRARY_PATH for MAC OS."; + << "Failed to find cuda library: " << dlPath << std::endl + << "Please specify its path correctly using one of the following ideas: \n" + + << "Idea 1. Set the CUDA and cuDNN library paths at runtime. " + << "http://www.paddlepaddle.org/doc/ui/cmd_argument/argument_outline.html \n" + << "For instance, issue command: paddle train --use_gpu=1 " + << "--cuda_dir=/usr/local/cuda/lib --cudnn_dir=/usr/local/cudnn/lib ...\n" + + << "Idea 2. Set the environment variable LD_LIBRARY_PATH on Linux or " + << "DYLD_LIBRARY_PATH on Mac OS. \n" + << "For instance, issue command: export LD_LIBRARY_PATH=... 
\n" + + << "Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is impossible " + << "unless System Integrity Protection (SIP) is disabled. However, @Idea 1" + << "always work well."; } void GetCublasDsoHandle(void** dso_handle) { #if defined(__APPLE__) || defined(__OSX__) - GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle); + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle); #else - GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle); + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle); #endif } void GetCudnnDsoHandle(void** dso_handle) { #if defined(__APPLE__) || defined(__OSX__) - GetDsoHandleWithSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle); + GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle); #else - GetDsoHandleWithSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle); + GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle); #endif } void GetCudartDsoHandle(void** dso_handle) { #if defined(__APPLE__) || defined(__OSX__) - GetDsoHandleWithSearchPath("", "libcudart.dylib", dso_handle); + GetDsoHandleFromSearchPath("", "libcudart.dylib", dso_handle); #else - GetDsoHandleWithSearchPath("", "libcudart.so", dso_handle); + GetDsoHandleFromSearchPath("", "libcudart.so", dso_handle); #endif } void GetCurandDsoHandle(void** dso_handle) { #if defined(__APPLE__) || defined(__OSX__) - GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle); + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle); #else - GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle); + GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle); #endif } diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp index 9918d20d9082ae6c07684ce05eba68c4989dd5d5..27eed75d4d76c351e381a3b71dc44a3254fb1a4d 100644 --- a/paddle/gserver/activations/ActivationFunction.cpp +++ b/paddle/gserver/activations/ActivationFunction.cpp @@ -295,6 +295,7 @@ void forward(Argument& act) { void backward(Argument& act) { act.grad->squareDerivative(*act.in); } END_DEFINE_ACTIVATION(square) + /** * @brief Exponential Activation. * \f[ @@ -307,8 +308,36 @@ void forward(Argument& act) { act.value->exp(*act.value); } void backward(Argument& act) { act.grad->expDerivative(*act.value); } END_DEFINE_ACTIVATION(exponential) +/** + * @brief Logarithm Activation. 
+ * \f[ + * f(z) = log(z) + * \f] + */ +BEGIN_DEFINE_ACTIVATION(log) +void forward(Argument& act) { + SetDevice device(act.deviceId); + Matrix::resizeOrCreate(act.in, act.value->getHeight(), act.value->getWidth(), + /* trans */ false, useGpu(act.deviceId)); + + act.in->copyFrom(*act.value); + act.value->log(*act.value); +} + +// Since f'(z) = 1/z, the backward pass divides the output gradient elementwise by the saved input. +void backward(Argument& act) { act.grad->dotDiv(*act.grad, *act.in); } +END_DEFINE_ACTIVATION(log) + ActivationFunction* ActivationFunction::create(const std::string& type) { return gActivationRegistrar.createByType(type); } +std::vector<std::string> ActivationFunction::getAllRegisteredTypes() { + std::vector<std::string> types; + gActivationRegistrar.forEachType([&](const std::string& type) { + types.push_back(type); + }); + return types; +} + + } // namespace paddle diff --git a/paddle/gserver/activations/ActivationFunction.h b/paddle/gserver/activations/ActivationFunction.h index 29860b4a736c37dee70c56731820a4197ea4cdbe..c483372256c035e39bfdbcaa4193a1a2e7fd80b8 100644 --- a/paddle/gserver/activations/ActivationFunction.h +++ b/paddle/gserver/activations/ActivationFunction.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include <string> +#include <vector> namespace paddle { @@ -32,6 +33,7 @@ struct Argument; class ActivationFunction { public: static ActivationFunction* create(const std::string& type); + static std::vector<std::string> getAllRegisteredTypes(); ActivationFunction() {} diff --git a/paddle/gserver/dataproviders/DataProvider.cpp b/paddle/gserver/dataproviders/DataProvider.cpp index 8cefbb30ada46d1ff1b0a4952dde0aeafb5419b1..2cfb5a3a18c8a63d69bf0598eeee2807376340bc 100644 --- a/paddle/gserver/dataproviders/DataProvider.cpp +++ b/paddle/gserver/dataproviders/DataProvider.cpp @@ -131,9 +131,10 @@ void DoubleBuffer::asyncLoadBatch() { taskReadySem_.wait(); if (stopping_) break; - while (batchSize_ == 0) { + while (batchSize_ == 0 && !stopping_) { usleep(5); } + if (stopping_) break; do { DataBatch newBatch; diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp index ca8b07af49ca071940960336be6cc652fcd62a44..90391a7c307d8dff7e289d445cafd27dc5008547 100644 --- a/paddle/gserver/dataproviders/PyDataProvider2.cpp +++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp @@ -433,26 +433,34 @@ private: inline void resetImpl(bool startNewThread) { DBG << "Resetting " << startNewThread; + exit_.store(true); if (loadThread_) { // is loading. - exit_.store(true); loadThread_->join(); loadThread_.reset(); } { PyGuard g; callingContexts_.clear(); + this->pullCV_.notify_one(); + } + + std::lock_guard<std::mutex> guard(mutexForReset_); + { + PyGuard g; dataPool_.clear(); } poolActualSize_ = 0; - exit_ = false; + if (startNewThread && cache_->reset()) { DBG << "Start new thread."; loadThread_.reset(new std::thread([this] { + exit_ = false; loadThread(); })); callingContextCreated_.wait(); } DBG << "Reset done"; + exit_ = false; } private: @@ -465,6 +473,8 @@ private: std::condition_variable pullCV_; std::mutex mtx_; + std::mutex mutexForReset_; + ThreadBarrier callingContextCreated_; std::unique_ptr cache_; @@ -529,6 +539,7 @@ public: * Loading a batch of data. */ int64_t getNextBatchInternal(int64_t size_, DataBatch *batch) { + std::lock_guard<std::mutex> guard(mutexForReset_); REGISTER_TIMER("PyDP2.getNextBatchInternal") CHECK_GE(size_, 0); size_t size = (size_t) size_; @@ -554,6 +565,10 @@ public: } else { // loading from cache. poolPtr = this->cache_->load(); } + if (exit_) { + // PyDataProvider2 is being destroyed; stop and return an empty batch. 
+ return 0; + } CHECK(poolPtr != nullptr); std::deque& pool = *poolPtr; diff --git a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp index 952df60a7d78666c84d5fd9176c3113fdbdacdc9..22698f586701774d884e6eeca943f6bf75fe7a96 100644 --- a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp +++ b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.cpp @@ -28,6 +28,12 @@ void ParallelNeuralNetwork::init( const std::vector& parameterTypes, bool useGpu) { NeuralNetwork::init(config, callback, parameterTypes, useGpu); + if (config.type() == "recurrent_nn") { + LOG(FATAL) + << "You cannot set `--parallel_nn=true` on the command line; " + << "the parallel_nn training mode does not support the recurrent_nn model."; + } + useGpu_ = useGpu; numDevices_ = 0; if (useGpu_) { diff --git a/paddle/gserver/layers/ConcatenateLayer.cpp b/paddle/gserver/layers/ConcatenateLayer.cpp index 52a7cb6f777c3a380d51c6c48e994075ff1ef5eb..bb6709b8df330b5f06a9df9887f54644c82d2878 100644 --- a/paddle/gserver/layers/ConcatenateLayer.cpp +++ b/paddle/gserver/layers/ConcatenateLayer.cpp @@ -97,7 +97,8 @@ void ConcatenateLayer::backward(const UpdateCallback& callback) { */ class ConcatenateLayer2 : public Layer { public: - explicit ConcatenateLayer2(const LayerConfig& config) : Layer(config) {} + explicit ConcatenateLayer2(const LayerConfig& config) : + Layer(config) {} ~ConcatenateLayer2() {} @@ -110,6 +111,8 @@ protected: std::vector> projections_; std::vector projOutput_; std::vector> projCol_; + bool sharedBias_; + std::unique_ptr<Weight> biases_; }; REGISTER_LAYER(concat2, ConcatenateLayer2); @@ -119,7 +122,6 @@ bool ConcatenateLayer2::init(const LayerMap& layerMap, /* Initialize the basic parent class */ if (!Layer::init(layerMap, parameterMap)) return false; - CHECK(!biasParameter_); CHECK_EQ(inputLayers_.size(), parameters_.size()); projections_.reserve(inputLayers_.size()); projCol_.reserve(inputLayers_.size()); @@ -137,6 +139,13 @@ bool ConcatenateLayer2::init(const LayerMap& layerMap, } CHECK_EQ(getSize(), endCol); + /* initialize biases_ */ + if (biasParameter_.get() != NULL) { + sharedBias_ = config_.shared_biases(); + size_t psize = config_.bias_size(); + biases_ = std::unique_ptr<Weight>(new Weight(1, psize, biasParameter_)); + } + return true; } @@ -154,8 +163,17 @@ void ConcatenateLayer2::forward(PassType passType) { projOutput_[i].grad = output_.grad->subColMatrix(startCol, endCol); } - for (size_t i = 0; i != inputLayers_.size(); ++i) { - projections_[i]->forward(&getInput(i), &projOutput_[i], passType); + { + AsyncGpuBlock block; + for (size_t i = 0; i != inputLayers_.size(); ++i) { + projections_[i]->forward(&getInput(i), &projOutput_[i], passType); + } + } + + /* add the bias-vector */ + if (biases_) { + REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str()); + output_.value->addBias(*(biases_->getW()), 1, sharedBias_); } /* activation */ { @@ -170,6 +188,13 @@ void ConcatenateLayer2::backward(const UpdateCallback& callback) { backwardActivation(); } + AsyncGpuBlock block; + if (biases_ && biases_->getWGrad()) { + REGISTER_TIMER_INFO("Concat2BpBiasTimer", getName().c_str()); + biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_); + biases_->getParameterPtr()->incUpdate(callback); + } + for (size_t i = 0; i != inputLayers_.size(); ++i) { if (projections_[i]) { projections_[i]->backward(callback); diff --git a/paddle/gserver/layers/ConvBaseLayer.cpp b/paddle/gserver/layers/ConvBaseLayer.cpp index 
9ed9572139dc8c2097857ed902a9f25a0af7ac7e..040510b7ad2116c1c624141185124556fc8fd7de 100644 --- a/paddle/gserver/layers/ConvBaseLayer.cpp +++ b/paddle/gserver/layers/ConvBaseLayer.cpp @@ -35,25 +35,12 @@ bool ConvBaseLayer::init(const LayerMap& layerMap, filterSizeY_.push_back(conf.filter_size_y()); filterPixels_.push_back(filterSize_.back() * filterSizeY_.back()); channels_.push_back(conf.channels()); - imgSize_.push_back(conf.img_size()); - imgPixels_.push_back(imgSize_.back() * imgSize_.back()); + imgSizeH_.push_back(conf.img_size()); + imgSizeW_.push_back(conf.img_size()); groups_.push_back(conf.groups()); filterChannels_.push_back(conf.filter_channels()); - outputX_.push_back(conf.output_x()); - outputs_.push_back(outputX_.back() * outputX_.back()); - } - - /* initialize the weightList */ - CHECK(inputLayers_.size() == parameters_.size()); - for (size_t i = 0; i < inputLayers_.size(); i++) { - size_t height, width; - height = filterPixels_[i] * filterChannels_[i]; - width = numFilters_; - - // create a new weight - CHECK_EQ(parameters_[i]->getSize(), width * height); - Weight* w = new Weight(height, width, parameters_[i]); - weights_.emplace_back(w); + outputH_.push_back(conf.output_x()); + outputW_.push_back(conf.output_x()); } /* initialize the biases_ */ @@ -74,4 +61,34 @@ bool ConvBaseLayer::init(const LayerMap& layerMap, return true; } +size_t ConvBaseLayer::calOutputSize() { + auto clearAndReserve = [this](IntV* vec) { + vec->clear(); + vec->reserve(this->inputLayers_.size()); + }; + clearAndReserve(&imgSizeH_); + clearAndReserve(&imgSizeW_); + clearAndReserve(&outputH_); + clearAndReserve(&outputW_); + size_t layerSize = 0; + for (size_t i = 0; i < inputLayers_.size(); i++) { + imgSizeH_.push_back(inputLayers_[i]->getOutput().getFrameHeight()); + imgSizeW_.push_back(inputLayers_[i]->getOutput().getFrameWidth()); + if (imgSizeH_[i] == 0) + imgSizeH_[i] = config_.inputs(i).conv_conf().img_size(); + if (imgSizeW_[i] == 0) + imgSizeW_[i] = config_.inputs(i).conv_conf().img_size(); + outputH_.push_back( + outputSize(imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i])); + outputW_.push_back( + outputSize(imgSizeW_[i], filterSize_[i], padding_[i], stride_[i])); + CHECK_EQ(outputH_[i], outputH_[0]); + CHECK_EQ(outputW_[i], outputW_[0]); + } + getOutput().setFrameHeight(outputH_[0]); + getOutput().setFrameWidth(outputW_[0]); + layerSize = outputH_[0] * outputW_[0] * size_t(numFilters_); + return layerSize; +} + } // namespace paddle diff --git a/paddle/gserver/layers/ConvBaseLayer.h b/paddle/gserver/layers/ConvBaseLayer.h index eaeaebf43be252a3a90d7fd45f41de09c3ef5c81..316514acf1a0d15e60f918220241271db2b11133 100644 --- a/paddle/gserver/layers/ConvBaseLayer.h +++ b/paddle/gserver/layers/ConvBaseLayer.h @@ -43,19 +43,18 @@ protected: IntV filterSizeY_; /// The spatial dimensions of the convolution input. IntV channels_; - /// The spatial dimensions of input feature map. - IntV imgSize_; - /// The total pixel size of input feature map. - /// imgPixels_ = imgSizeX_ * imgSizeY_. - IntV imgPixels_; + /// The spatial dimensions of input feature map height. + IntV imgSizeH_; + /// The spatial dimensions of input feature map width. + IntV imgSizeW_; /// filterPixels_ = filterSizeX_ * filterSizeY_. IntV filterPixels_; /// filterChannels_ = channels_/groups_. IntV filterChannels_; - /// The spatial dimensions of output feature map. - IntV outputX_; - /// The spatial dimensions of output feature map. - IntV outputs_; + /// The spatial dimensions of output feature map height. 
+ IntV outputH_; + /// The spatial dimensions of output feature map width. + IntV outputW_; /// Group size, refer to grouped convolution in /// Alex Krizhevsky's paper: when group=2, the first half of the /// filters are only connected to the first half of the input channels, @@ -80,6 +79,13 @@ public: virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + /** + * imgSizeH_ and imgSizeW_ will be set according to the previous input layers + * in this function. Then it will calculate outputH_ and outputW_ and set them + * into output argument. + */ + virtual size_t calOutputSize(); + Weight& getWeight(int idx) { return *weights_[idx]; } /** diff --git a/paddle/gserver/layers/ConvProjection.cpp b/paddle/gserver/layers/ConvProjection.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d1ce53fe26351926196a04418900a1555e0282c2 --- /dev/null +++ b/paddle/gserver/layers/ConvProjection.cpp @@ -0,0 +1,210 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + + +#include "paddle/utils/Stat.h" +#include "ConvProjection.h" + +namespace paddle { + +REGISTER_PROJECTION(conv, ConvProjection); + +ThreadLocalD> ConvProjection::convMem_; + +ConvProjection::ConvProjection(const ProjectionConfig& config, + ParameterPtr parameter, bool useGpu) + : Projection(config, parameter, useGpu) { + + CHECK(useGpu); // only support GPU + getConvParams(); + initCudnn(); + + size_t height = filterH_ * filterW_ * channels_ / groups_; + size_t width = numFilters_; + weight_.reset(new Weight(height, width, parameter)); + weightOffset_ = height * width / groups_; +} + +void ConvProjection::getConvParams() { + const ConvConfig &conf = config_.conv_conf(); + paddingH_ = conf.padding_y(); + paddingW_ = conf.padding(); + + strideH_ = conf.stride_y(); + strideW_ = conf.stride(); + + filterH_ = conf.filter_size_y(); + filterW_ = conf.filter_size(); + + configImgH_ = conf.img_size(); + configImgW_ = conf.img_size(); + + channels_ = conf.channels(); + numFilters_ = config_.num_filters(); + + groups_ = conf.groups(); + CHECK_EQ(channels_ % groups_, 0); + CHECK_EQ(numFilters_ % groups_, 0); +} + +void ConvProjection::initCudnn() { + hl_create_filter_descriptor(&filterDesc_, channels_, numFilters_, + filterH_, filterW_); + hl_create_tensor_descriptor(&inputDesc_); + hl_create_tensor_descriptor(&outputDesc_); + hl_create_convolution_descriptor(&convDesc_, inputDesc_, filterDesc_, + paddingH_, paddingW_, strideH_, strideW_); + + // initialize all to default algorithms + fwdAlgo_ = 0; + bwdFilterAlgo_ = 0; + bwdDataAlgo_ = 0; + fwdLimitBytes_ = 0; + bwdDataLimitBytes_ = 0; + bwdFilterLimitBytes_ = 0; + workSpaceInBytes_ = 0; + + batchNum_ = 0; + isSelectAlgo_ = false; +} + +void ConvProjection::reshapeTensorDesc(int batchSize) { + hl_tensor_reshape(inputDesc_, batchSize, channels_, imageH_, imageW_, + channels_ * imageH_ * imageW_, imageH_ * imageW_, + imageW_, 1); + hl_reset_convolution_descriptor(convDesc_, inputDesc_, filterDesc_, + paddingH_, paddingW_, 
strideH_, strideW_); + + // The stride between two consecutive images in ConvProjection may not be 1, + // for example, in the case of layer ConcatenateLayer2 with two + // ConvProjection, the stride is the output_size of layer ConcatenateLayer2. + // So the calculation of nStride is different from CudnnConvLayer. + // In fact, only "nStride = out_->value->getStride()" is ok. + size_t nStride = numFilters_ * outputH_ * outputW_; + if (out_->value->isContiguous()) { + CHECK_EQ(nStride, out_->value->getWidth()); + } else { + nStride = out_->value->getStride(); + } + + hl_tensor_reshape(outputDesc_, batchSize, numFilters_, outputH_, outputW_, + nStride, outputH_ * outputW_, outputW_, 1); +} + +void ConvProjection::reshape(int batchSize) { + size_t width = calOutputSize(); + CHECK_EQ(width, out_->value->getWidth()); + + isSelectAlgo_ = (batchSize == batchNum_); + batchNum_ = batchSize; + + if (!isSelectAlgo_) { + reshapeTensorDesc(batchSize); + hl_conv_workspace(inputDesc_, outputDesc_, filterDesc_, + convDesc_, &fwdAlgo_, &fwdLimitBytes_, + &bwdDataAlgo_, &bwdDataLimitBytes_, + &bwdFilterAlgo_, &bwdFilterLimitBytes_); + + size_t maxWorkSpace = 0; + maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); + maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_); + workSpaceInBytes_ = maxWorkSpace; + + + VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_ + << " / " << bwdDataAlgo_ + << " / " << bwdFilterAlgo_; + } + + isSelectAlgo_ = true; +} + +void ConvProjection::forward() { + int batchSize = in_->value->getHeight(); + reshape(batchSize); + + void* workSpace = NULL; + if (workSpaceInBytes_ > 0) { + workSpace = getSpaceBytes(workSpaceInBytes_); + } + + for (int g = 0; g < groups_; ++g) { + REGISTER_TIMER_INFO("CudnnConvFwTimer", getName().c_str()); + + real *inputData = in_->value->getData() + g * inputOffset_; + real *wgtData = weight_->getW()->getData() + g * weightOffset_; + real *outData = out_->value->getData() + g * outputOffset_; + hl_convolution_forward(inputDesc_, inputData, outputDesc_, + outData, filterDesc_, wgtData, + convDesc_, workSpace, + fwdLimitBytes_, fwdAlgo_); + } +} + +void ConvProjection::backward(const UpdateCallback& callback) { + REGISTER_TIMER_INFO("CudnnConvBpTimer", getName().c_str()); + + void* workSpace = NULL; + if (workSpaceInBytes_ > 0) { + workSpace = getSpaceBytes(workSpaceInBytes_); + } + + for (int g = 0; g < groups_; ++g) { + real *outGrad = out_->grad->getData() + g * outputOffset_; + if (weight_->getWGrad()) { + real *inputData = in_->value->getData() + g * inputOffset_; + real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_; + hl_convolution_backward_filter( + inputDesc_, inputData, outputDesc_, outGrad, filterDesc_, + weightGrad, convDesc_, workSpace, bwdFilterLimitBytes_, + bwdFilterAlgo_); + } + + MatrixPtr preGrad = in_->grad; + if (NULL != preGrad) { + real *inputGrad = preGrad->getData() + g * inputOffset_; + real *wgtData = weight_->getW()->getData() + g* weightOffset_; + hl_convolution_backward_data( + inputDesc_, inputGrad, outputDesc_, outGrad, filterDesc_, + wgtData, convDesc_, workSpace, bwdDataLimitBytes_, + bwdDataAlgo_); + } + } + + weight_->getParameterPtr()->incUpdate(callback); +} + +void* ConvProjection::getSpaceBytes(size_t size) { + std::vector& convMem = *convMem_; + if (convMem.empty()) { + int numDevices = hl_get_device_count(); + convMem.resize(numDevices); + } + + int devId = hl_get_device(); + MemoryHandle** localMem = &(convMem[devId]); + if (NULL == *localMem || size > 
(*localMem)->getAllocSize()) { + *localMem = new GpuMemoryHandle(size); + } + return (*localMem)->getBuf(); +} + +ConvProjection::~ConvProjection() { + hl_destroy_tensor_descriptor(inputDesc_); + hl_destroy_tensor_descriptor(outputDesc_); + hl_destroy_filter_descriptor(filterDesc_); + hl_destroy_convolution_descriptor(convDesc_); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/ConvProjection.h b/paddle/gserver/layers/ConvProjection.h new file mode 100644 index 0000000000000000000000000000000000000000..41a100ac3c50fe0180440b20a0b8dfa359e2848a --- /dev/null +++ b/paddle/gserver/layers/ConvProjection.h @@ -0,0 +1,125 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + + +#pragma once + +#include "Projection.h" + +namespace paddle { + +/** + * @brief A convolution projection that performs the same computation as + * CudnnConvLayer. + */ +class ConvProjection : public Projection { +public: + /** + * Constructor. + */ + ConvProjection(const ProjectionConfig& config, ParameterPtr parameter, + bool useGpu); + + ~ConvProjection(); + + virtual void forward(); + virtual void backward(const UpdateCallback& callback); + +protected: + void getConvParams(); + void initCudnn(); + + void reshapeTensorDesc(int batchSize); + void reshape(int batchSize); + + int outputSize(int imageSize, int filterSize, int padding, int stride) { + return (imageSize - filterSize + 2 * padding) / stride + 1; + } + + size_t calOutputSize() { + imageH_ = in_->getFrameHeight(); + imageW_ = in_->getFrameWidth(); + if (imageH_ == 0) imageH_ = configImgH_; + if (imageW_ == 0) imageW_ = configImgW_; + outputH_ = outputSize(imageH_, filterH_, paddingH_, strideH_); + outputW_ = outputSize(imageW_, filterW_, paddingW_, strideW_); + + const_cast<Argument*>(out_)->setFrameHeight(outputH_); + const_cast<Argument*>(out_)->setFrameWidth(outputW_); + + inputOffset_ = (channels_ / groups_) * imageH_ * imageW_; + outputOffset_ = (numFilters_ / groups_) * outputH_ * outputW_; + return outputH_ * outputW_ * numFilters_; + } + + static void* getSpaceBytes(size_t size); + + /// imageH_ and imageW_ are calculated from the input layer. + int imageH_, imageW_; + /// configImgH_ and configImgW_ are obtained from the config. + int configImgH_, configImgW_; + int outputH_, outputW_; + int channels_, numFilters_; + int paddingH_, paddingW_; + int strideH_, strideW_; + int filterH_, filterW_; + /// One group offset of input data. + int inputOffset_; + /// One group offset of output data. + int outputOffset_; + /// One group offset of weight. + int weightOffset_; + int groups_; + + /// Cudnn tensor descriptor for input. + hl_tensor_descriptor inputDesc_; + /// Cudnn tensor descriptor for output. + hl_tensor_descriptor outputDesc_; + /// Cudnn filter descriptor. + hl_filter_descriptor filterDesc_; + /// Cudnn descriptor for a convolution operation. + hl_convolution_descriptor convDesc_; + + /// Record the algorithm for forward convolution, which is obtained by a + /// cuDNN API search for the best-suited algorithm. 
+ int fwdAlgo_; + /// Record the algorithm for computing convolution gradient with respect to + /// filter coefficients. + int bwdFilterAlgo_; + /// Record the algorithm for computing convolution gradient with respect to + /// the input data. + int bwdDataAlgo_; + /// Amount of GPU memory needed as workspace to be able to execute a + /// forward convolution with the specified algo. + size_t fwdLimitBytes_; + /// Amount of GPU memory needed as workspace to be able to execute a + /// backwardData with the specified algo. + size_t bwdDataLimitBytes_; + /// Amount of GPU memory needed as workspace to be able to execute a + /// backwardFilter with the specified algo. + size_t bwdFilterLimitBytes_; + /// Size of the total work space. + size_t workSpaceInBytes_; + + /// Whether to call the cuDNN api to choose the conv algorithm. + bool isSelectAlgo_; + /// batchNum_ records the batch size. If the batch size changes, + /// the algorithm selection will be run again. + int batchNum_; + bool bias_; + + std::unique_ptr<Weight> weight_; + static ThreadLocalD<std::vector<MemoryHandle*>> convMem_; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/CudnnConvLayer.cpp b/paddle/gserver/layers/CudnnConvLayer.cpp index 0f932f960f6bacb5fc80273e5dfedf86bfb9d152..23ba2341185d1b86b90dee58939f8ca07fda9364 100644 --- a/paddle/gserver/layers/CudnnConvLayer.cpp +++ b/paddle/gserver/layers/CudnnConvLayer.cpp @@ -22,215 +22,64 @@ REGISTER_LAYER(cudnn_conv, CudnnConvLayer); bool CudnnConvLayer::init(const LayerMap &layerMap, const ParameterMap &parameterMap) { - ConvBaseLayer::init(layerMap, parameterMap); + if (!ConvBaseLayer::init(layerMap, parameterMap)) return false; CHECK(useGpu_) << "CudnnConvLayer only support gpu"; - maxGroups_ = 0; - for (size_t i = 0; i < inputLayers_.size(); i++) { - CHECK_EQ(channels_[i] % groups_[i], 0); - CHECK_EQ(numFilters_ % groups_[i], 0); - - hl_filter_descriptor filter; - hl_create_filter_descriptor(&filter, channels_[i] / groups_[i], - numFilters_ / groups_[i], filterSizeY_[i], - filterSize_[i]); - filterDesc_.push_back(filter); - - hl_tensor_descriptor input; - hl_create_tensor_descriptor(&input); - inputDesc_.push_back(input); - - hl_tensor_descriptor output; - int outputX = - outputSize(imgSize_[i], filterSize_[i], padding_[i], stride_[i]); - CHECK_EQ(outputX, outputX_[i]); - hl_create_tensor_descriptor(&output); - outputDesc_.push_back(output); + CHECK_EQ(inputLayers_.size(), parameters_.size()); + projections_.reserve(inputLayers_.size()); + projConf_.reserve(inputLayers_.size()); - hl_convolution_descriptor conv; - hl_create_convolution_descriptor(&conv, input, filter, paddingY_[i], - padding_[i], strideY_[i], stride_[i]); - convDesc_.push_back(conv); - - weightOffset_.push_back((numFilters_ / groups_[i]) * - (channels_[i] / groups_[i]) * filterPixels_[i]); - inputOffset_.push_back((channels_[i] / groups_[i]) * imgSize_[i] * - imgSize_[i]); - outputOffset_.push_back((numFilters_ / groups_[i]) * outputX_[i] * - outputX_[i]); - - // initialize all to default algorithms - fwdAlgo_.push_back(0); - bwdFilterAlgo_.push_back(0); - bwdDataAlgo_.push_back(0); - fwdLimitBytes_.push_back(0); - bwdFilterLimitBytes_.push_back(0); - bwdDataLimitBytes_.push_back(0); - - // cudnn streams per group equal to 1 - if (groups_[i] > maxGroups_) { - maxGroups_ = groups_[i]; - } - } - - workSpaceInBytes_ = 0; - workSpaceData_ = NULL; - for (int i = 0; i < maxGroups_; ++i) { - workSpace_.push_back(NULL); + numFilters_ = config_.num_filters(); + CHECK(config_.shared_biases()); for (size_t i = 0; i < inputLayers_.size(); i++) { + 
ProjectionConfig* conf = new ProjectionConfig(); + conf->set_type("conv"); + conf->set_num_filters(numFilters_); + ConvConfig* convConf = conf->mutable_conv_conf(); + *convConf = *(config_.mutable_inputs(i)->mutable_conv_conf()); + conf->set_input_size(getPrev(i)->getSize()); + conf->set_output_size(getSize()); + projConf_.emplace_back(conf); + projections_.emplace_back(Projection::create(*projConf_[i], + parameters_[i], useGpu_)); } if (biases_.get() && sharedBiases_) { hl_create_tensor_descriptor(&biasDesc_); + hl_create_tensor_descriptor(&outputDesc_); hl_tensor_reshape(biasDesc_, 1, numFilters_ / groups_[0], 1, 1); biasOffset_ = numFilters_ / groups_[0]; } - batchNum_ = 0; - isSelectAlgo_ = false; return true; } -void CudnnConvLayer::allocConvWorkSpace(size_t maxWorkSpace) { - size_t totalWorkSpace = maxWorkSpace * maxGroups_; - - if (totalWorkSpace > workSpaceInBytes_) { - if (workSpaceInBytes_ != 0) { - hl_free_mem_device(workSpaceData_); - } - // total amount of storage needed over all groups - workSpaceData_ = hl_malloc_device(totalWorkSpace); - - // update work space address for each group - for (int i = 0; i < maxGroups_; ++i) { - workSpace_[i] = reinterpret_cast(workSpaceData_) - + i * maxWorkSpace; - } - workSpaceInBytes_ = totalWorkSpace; - } -} - -void CudnnConvLayer::reshape(int batchSize) { - CHECK_NE(inputLayers_.size(), 0UL); - imageH_ = inputLayers_[0]->getOutput().getFrameHeight(); - imageW_ = inputLayers_[0]->getOutput().getFrameWidth(); - if (imageH_ == 0) imageH_ = imgSize_[0]; - if (imageW_ == 0) imageW_ = imgSize_[0]; - - for (size_t i = 1; i < inputLayers_.size(); i++) { - int imageH = inputLayers_[i]->getOutput().getFrameHeight(); - int imageW = inputLayers_[i]->getOutput().getFrameWidth(); - if (imageH) { - CHECK_EQ(imageH_, imageH) << "Inputs must have same height."; - } - if (imageW) { - CHECK_EQ(imageW_, imageW) << "Inputs must have same width."; - } - } - - outputH_ = outputSize(imageH_, filterSizeY_[0], paddingY_[0], strideY_[0]); - outputW_ = outputSize(imageW_, filterSize_[0], padding_[0], stride_[0]); - // check outputH & outputW - getOutput().setFrameHeight(outputH_); - getOutput().setFrameWidth(outputW_); - - // if the batchSize remains the same, set isSelectAlgo_ true. - // Otherwise, set isSelectAlgo_ false and select algo again. 
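
Editor's note — outputSize(), used in the removed reshape above and now centralized in ConvBaseLayer::calOutputSize and ConvProjection, computes (imageSize - filterSize + 2 * padding) / stride + 1 with truncating integer division. Checking it against the conv-projection unit test added later in this patch (image 16, filters 2 x 3, paddings 0 and 1, strides 2):

output_x = (16 - 2 + 2 * 0) / 2 + 1 = 14 / 2 + 1 = 8
output_y = (16 - 3 + 2 * 1) / 2 + 1 = 15 / 2 + 1 = 8   (15 / 2 truncates to 7)
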
- isSelectAlgo_ = (batchSize == batchNum_); - batchNum_ = batchSize; - - size_t maxWorkSpace = 0; - for (size_t i = 0; i < inputLayers_.size(); i++) { - CHECK_EQ(inputLayers_[i]->getOutput().value->getWidth(), - (size_t)(channels_[i] * imageH_ * imageW_)); - - hl_tensor_reshape(inputDesc_[i], batchSize, channels_[i] / groups_[i], - imageH_, imageW_, channels_[i] * imageH_ * imageW_, - imageH_ * imageW_, imageW_, 1); - - hl_tensor_reshape(outputDesc_[i], batchSize, numFilters_ / groups_[i], - outputH_, outputW_, numFilters_ * outputH_ * outputW_, - outputH_ * outputW_, outputW_, 1); - - hl_reset_convolution_descriptor(convDesc_[i], inputDesc_[i], - filterDesc_[i], paddingY_[i], - padding_[i], strideY_[i], stride_[i]); - - inputOffset_[i] = (channels_[i] / groups_[i]) * imageH_ * imageW_; - outputOffset_[i] = (numFilters_ / groups_[i]) * outputH_ * outputW_; - - if (!isSelectAlgo_) { - hl_conv_workspace(inputDesc_[i], outputDesc_[i], filterDesc_[i], - convDesc_[i], &fwdAlgo_[i], &fwdLimitBytes_[i], - &bwdDataAlgo_[i], &bwdDataLimitBytes_[i], - &bwdFilterAlgo_[i], &bwdFilterLimitBytes_[i]); - - maxWorkSpace = std::max(fwdLimitBytes_[i], bwdDataLimitBytes_[i]); - maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_[i]); - - VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_[i] - << " / " << bwdDataAlgo_[i] - << " / " << bwdFilterAlgo_[i]; - } - } - - if (!isSelectAlgo_) { - allocConvWorkSpace(maxWorkSpace); - } - - isSelectAlgo_ = true; -} - void CudnnConvLayer::forward(PassType passType) { Layer::forward(passType); - int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); - reshape(batchSize); - resetOutput(batchSize, outputH_ * outputW_ * numFilters_); + + int batchSize = getInput(0).getBatchSize(); + resetOutput(batchSize, calOutputSize()); for (size_t i = 0; i != inputLayers_.size(); ++i) { - REGISTER_TIMER_INFO("CudnnConvFwTimer", getName().c_str()); - for (int g = 0; g < groups_[i]; ++g) { - real *inputData = getInputValue(i)->getData() + inputOffset_[i] * g; - real *wgtData = weights_[i]->getW()->getData() + weightOffset_[i] * g; - real *outData = getOutputValue()->getData() + outputOffset_[i] * g; - hl_convolution_forward(inputDesc_[i], inputData, outputDesc_[i], - outData, filterDesc_[i], wgtData, - convDesc_[i], workSpace_[g], - fwdLimitBytes_[i], fwdAlgo_[i]); - } + projections_[i]->forward(&getInput(i), &getOutput(), passType); } if (biases_) { REGISTER_TIMER_INFO("CudnnConvBiasTimer", getName().c_str()); - addBiases(); - } - - forwardActivation(); -} - -void CudnnConvLayer::addBiases() { - if (sharedBiases_) { + int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); + hl_tensor_reshape(outputDesc_, batchSize, numFilters_ / groups_[0], + outputH_[0], outputW_[0], numFilters_ * outputH_[0] * outputW_[0], + outputH_[0] * outputW_[0], outputW_[0], 1); + outputOffset_ = getOutputValue()->getWidth() / groups_[0]; for (int g = 0; g < groups_[0]; ++g) { real *biasData = biases_->getW()->getData() + biasOffset_ * g; - real *outData = getOutputValue()->getData() + outputOffset_[0] * g; + real *outData = getOutputValue()->getData() + outputOffset_ * g; hl_convolution_forward_add_bias(biasDesc_, biasData, - outputDesc_[0], outData); + outputDesc_, outData); } - } else { - LOG(FATAL) << "Not supported"; } -} -void CudnnConvLayer::bpropBiases() { - if (sharedBiases_) { - for (int g = 0; g < groups_[0]; ++g) { - real *biasGrad = biases_->getWGrad()->getData() + biasOffset_ * g; - real *outGrad = getOutputGrad()->getData() + outputOffset_[0] * g; - 
hl_convolution_backward_bias(biasDesc_, biasGrad, - outputDesc_[0], outGrad); - } - } else { - LOG(FATAL) << "Not supported"; - } + forwardActivation(); } void CudnnConvLayer::backward(const UpdateCallback &callback) { @@ -238,52 +87,23 @@ void CudnnConvLayer::backward(const UpdateCallback &callback) { if (biases_ && biases_->getWGrad()) { REGISTER_TIMER_INFO("CudnnConvBpBiasTimer", getName().c_str()); - bpropBiases(); + for (int g = 0; g < groups_[0]; ++g) { + real *biasGrad = biases_->getWGrad()->getData() + biasOffset_ * g; + real *outGrad = getOutputGrad()->getData() + outputOffset_ * g; + hl_convolution_backward_bias(biasDesc_, biasGrad, outputDesc_, outGrad); + } biases_->getParameterPtr()->incUpdate(callback); } for (size_t i = 0; i != inputLayers_.size(); ++i) { - REGISTER_TIMER_INFO("CudnnConvBpTimer", getName().c_str()); - for (int g = 0; g < groups_[i]; ++g) { - real *outGrad = getOutputGrad()->getData() + outputOffset_[i] * g; - if (weights_[i]->getWGrad()) { - real *inputData = getInputValue(i)->getData() + inputOffset_[i] * g; - real *weightGrad = - weights_[i]->getWGrad()->getData() + weightOffset_[i] * g; - hl_convolution_backward_filter( - inputDesc_[i], inputData, outputDesc_[i], outGrad, filterDesc_[i], - weightGrad, convDesc_[i], workSpace_[g], bwdFilterLimitBytes_[i], - bwdFilterAlgo_[i]); - } - - MatrixPtr preGrad = getInputGrad(i); - if (NULL != preGrad) { - real *inputGrad = preGrad->getData() + inputOffset_[i] * g; - real *wgtData = weights_[i]->getW()->getData() + weightOffset_[i] * g; - hl_convolution_backward_data( - inputDesc_[i], inputGrad, outputDesc_[i], outGrad, filterDesc_[i], - wgtData, convDesc_[i], workSpace_[g], bwdDataLimitBytes_[i], - bwdDataAlgo_[i]); - } - } - weights_[i]->getParameterPtr()->incUpdate(callback); + projections_[i]->backward(callback); } } CudnnConvLayer::~CudnnConvLayer() { - if (biasDesc_) { + if (biases_) { hl_destroy_tensor_descriptor(biasDesc_); - } - - for (size_t i = 0; i < inputDesc_.size(); i++) { - hl_destroy_tensor_descriptor(inputDesc_[i]); - hl_destroy_tensor_descriptor(outputDesc_[i]); - hl_destroy_filter_descriptor(filterDesc_[i]); - hl_destroy_convolution_descriptor(convDesc_[i]); - } - if (workSpaceInBytes_ != 0) { - hl_free_mem_device(workSpaceData_); - workSpaceInBytes_ = 0; + hl_destroy_tensor_descriptor(outputDesc_); } } diff --git a/paddle/gserver/layers/CudnnConvLayer.h b/paddle/gserver/layers/CudnnConvLayer.h index a6dadba10daa49d03e4a52a9c028a87400ca23ea..6390d96315cc4422c65e52f0d219b903c66f2cbd 100644 --- a/paddle/gserver/layers/CudnnConvLayer.h +++ b/paddle/gserver/layers/CudnnConvLayer.h @@ -17,12 +17,13 @@ limitations under the License. */ #include "ConvBaseLayer.h" #include "paddle/math/Matrix.h" +#include "Projection.h" #include namespace paddle { /** - * @brief A subclass of ConvBaseLayer by cuDNN implementation. It only + * @brief A 2-dimension conv layer implemented by cuDNN. It only * supports GPU mode. We automatic select CudnnConvLayer for GPU * mode and ExpandConvLayer for CPU mode if you set type of "conv". * User also can specfiy type of "exconv" or "cudnn_conv" for @@ -31,81 +32,21 @@ namespace paddle { * The config file api is img_conv_layer. */ class CudnnConvLayer : public ConvBaseLayer { -private: - /// resize Cudnn workspace size - void allocConvWorkSpace(size_t maxWorkSpace); - protected: - int imageH_, imageW_, outputH_, outputW_; - /// Cudnn tensor descriptor for bias. 
+ std::vector> projConf_; + std::vector> projections_; + hl_tensor_descriptor biasDesc_; - /// Cudnn tensor descriptor for input. - std::vector inputDesc_; - /// Cudnn tensor descriptor for output. - std::vector outputDesc_; - /// Cudnn tensor descriptor for filter. - std::vector filterDesc_; - /// Cudnn tensor descriptor for a convolution operation. - std::vector convDesc_; - /// One sample offset of input data. - IntV inputOffset_; - /// One sample offset of output data. - IntV outputOffset_; - /// One group offset of weight. - IntV weightOffset_; - /// One group offset of bias. + hl_tensor_descriptor outputDesc_; int biasOffset_; - - /// Save the algorithm for forward convolution, which is obtained by cudnn - /// api to search the best suited algorithm. - std::vector fwdAlgo_; - /// Save the algorithm for computing convolution gradient with respect to - /// filter coefficients. - std::vector bwdFilterAlgo_; - /// Save the algorithm for computing convolution gradient with respect to - /// the output. - std::vector bwdDataAlgo_; - /// Amount of GPU memory needed as workspace to be able to execute a - /// forward convolution with the specified algo. - std::vector fwdLimitBytes_; - /// Amount of GPU memory needed as workspace to be able to execute a - /// backwardFilter with the specified algo. - std::vector bwdFilterLimitBytes_; - /// Amount of GPU memory needed as workspace to be able to execute a - /// backwardData with the specified algo. - std::vector bwdDataLimitBytes_; - - /// Device work space address for each group. - std::vector workSpace_; - /// Max number of groups. - int maxGroups_; - /// Total work space address in device for all groups. - void* workSpaceData_; - /// Size of total work space. - size_t workSpaceInBytes_; - - /// Is or not select conv algorihtm. - bool isSelectAlgo_; - - /// batchNum is used to record batch size. If the batch size is changed, - /// the selection algorithm will be called. - int batchNum_; + int outputOffset_; public: explicit CudnnConvLayer(const LayerConfig& config) : ConvBaseLayer(config) {} ~CudnnConvLayer(); - /** - * Intialization. Initialize member variables and create tenor descriptor. - */ bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); - /** - * Reshape is done each forward. Reshape tensor decriptor - * inputDesc_, outputDesc_, convDesc_. And search the faster algo - * or the fastest algo within a given memeory limit. 
- */ - void reshape(int batchSize); void forward(PassType passType); void backward(const UpdateCallback& callback); void addBiases(); diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/gserver/layers/ExpandConvLayer.cpp index df79c3e3037cfce063c1e392bd4c30d1a800b402..80a6a62b5c0de768f9cc534adf68405a883ec10f 100644 --- a/paddle/gserver/layers/ExpandConvLayer.cpp +++ b/paddle/gserver/layers/ExpandConvLayer.cpp @@ -37,32 +37,29 @@ bool ExpandConvLayer::init(const LayerMap &layerMap, caffeMode_ = conf.caffe_mode(); } + /* initialize the weightList */ + CHECK(inputLayers_.size() == parameters_.size()); + for (size_t i = 0; i < inputLayers_.size(); i++) { + size_t height, width; + height = filterPixels_[i] * filterChannels_[i]; + width = numFilters_; + + // create a new weight + CHECK_EQ(parameters_[i]->getSize(), width * height); + Weight* w = new Weight(height, width, parameters_[i]); + weights_.emplace_back(w); + } + return true; } -size_t ExpandConvLayer::getSize() { +size_t ExpandConvLayer::getOutputSize() { CHECK_NE(inputLayers_.size(), 0UL); - imgSizeH_.clear(); - imgSizeW_.clear(); - outputH_.clear(); - outputW_.clear(); + size_t layerSize = ConvBaseLayer::calOutputSize(); subN_.clear(); - size_t layerSize = 0; for (size_t i = 0; i < inputLayers_.size(); i++) { - imgSizeH_.push_back(inputLayers_[i]->getOutput().getFrameHeight()); - imgSizeW_.push_back(inputLayers_[i]->getOutput().getFrameWidth()); - if (imgSizeH_[i] == 0) imgSizeH_[i] = imgSize_[i]; - if (imgSizeW_[i] == 0) imgSizeW_[i] = imgSize_[i]; - outputH_.push_back( - outputSize(imgSizeH_[i], filterSize_[i], padding_[i], stride_[i])); - outputW_.push_back( - outputSize(imgSizeW_[i], filterSize_[i], padding_[i], stride_[i])); subN_.push_back(outputH_[i] * outputW_[i]); - CHECK(layerSize == 0 || subN_[i] * size_t(numFilters_) == layerSize); - layerSize = subN_[i] * numFilters_; } - getOutput().setFrameHeight(outputH_[0]); - getOutput().setFrameWidth(outputW_[0]); return layerSize; } @@ -119,7 +116,7 @@ void ExpandConvLayer::expandFwdOnce(MatrixPtr image, int inIdx, int startIdx) { } void ExpandConvLayer::addSharedBias() { - size_t mapW = getSize() / numFilters_; + size_t mapW = getOutputValue()->getWidth() / numFilters_; size_t mapH = getOutputValue()->getElementCnt() / mapW; MatrixPtr out = Matrix::create(getOutputValue()->getData(), mapH, mapW, false, useGpu_); @@ -158,7 +155,7 @@ void ExpandConvLayer::forward(PassType passType) { * transOutValue correspond sample to one row */ int batchSize = inputLayers_[0]->getOutputValue()->getWidth(); batchSize = inputLayers_[0]->getOutputValue()->getHeight(); - resetOutput(batchSize, getSize()); + resetOutput(batchSize, getOutputSize()); MatrixPtr image = nullptr; for (size_t i = 0; i != inputLayers_.size(); ++i) { @@ -183,7 +180,7 @@ void ExpandConvLayer::forward(PassType passType) { } void ExpandConvLayer::bpropSharedBias(MatrixPtr biases, MatrixPtr v) { - size_t mapW = getSize() / numFilters_; + size_t mapW = v->getWidth() / numFilters_; size_t mapH = v->getElementCnt() / mapW; MatrixPtr vTmp = Matrix::create(v->getData(), mapH, mapW, false, useGpu_); diff --git a/paddle/gserver/layers/ExpandConvLayer.h b/paddle/gserver/layers/ExpandConvLayer.h index fc3d69b1b7d14c64c95ab66dbe7725857ec38261..030a3ba397ff41208bda84d0d6b876359d587c57 100644 --- a/paddle/gserver/layers/ExpandConvLayer.h +++ b/paddle/gserver/layers/ExpandConvLayer.h @@ -37,14 +37,6 @@ protected: IntV subN_; /// subK_ = channels_ * filterPixels_ * groups_. 
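
Editor's note — addSharedBias and bpropSharedBias above reinterpret the output so that one bias per filter can be broadcast: mapW = width / numFilters is the size of a single filter's output map, and mapH = elementCnt / mapW equals batchSize * numFilters. For example, with batchSize = 2, numFilters = 16 and an 8 x 8 output map, the output value is 2 x 1024; mapW = 1024 / 16 = 64 and mapH = 2048 / 64 = 32, i.e. one row per (sample, filter) pair, so a length-16 shared bias can be applied row-wise.
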
IntV subK_; - /// The spatial dimensions of height of input feature map. - IntV imgSizeH_; - /// The spatial dimensions of width of input feature map. - IntV imgSizeW_; - /// The spatial dimensions of height of output feature map. - IntV outputH_; - /// The spatial dimensions of width of output feature map. - IntV outputW_; /// Expand one sample at a time. shape: /// (numChannels * filterPixels_, outputSizeH * outputSizeW) MatrixPtr expandInput_; @@ -58,7 +50,7 @@ public: bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); - size_t getSize(); + size_t getOutputSize(); /** * Create or resize expandInput_. diff --git a/paddle/gserver/layers/MixedLayer.cpp b/paddle/gserver/layers/MixedLayer.cpp index 054ddd3a228edd78c5a451f445e02afda2985b9a..26b1360290ffba316816db898855d8c0b9bdaaa7 100644 --- a/paddle/gserver/layers/MixedLayer.cpp +++ b/paddle/gserver/layers/MixedLayer.cpp @@ -41,9 +41,13 @@ bool MixedLayer::init(const LayerMap& layerMap, } operators_.emplace_back(Operator::create(operator_conf, useGpu_)); } + /* initialize biases_ */ if (biasParameter_.get() != NULL) { - biases_ = std::unique_ptr(new Weight(1, getSize(), biasParameter_)); + sharedBias_ = config_.shared_biases(); + size_t psize = config_.bias_size(); + biases_ = std::unique_ptr( + new Weight(1, psize, biasParameter_)); } return true; @@ -119,12 +123,6 @@ void MixedLayer::forward(PassType passType) { MatrixPtr outV = getOutputValue(); - /* add the bias-vector */ - if (biases_.get() != NULL) { - REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str()); - outV->addBias(*(biases_->getW()), 1); - } - for (size_t i = 0; i != inputLayers_.size(); ++i) { if (projections_[i]) { projections_[i]->forward(&getInput(i), &output_, passType); @@ -140,6 +138,12 @@ void MixedLayer::forward(PassType passType) { op->forward(ins, &output_, passType); } + /* add the bias-vector */ + if (biases_.get() != NULL) { + REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str()); + outV->addBias(*(biases_->getW()), 1, sharedBias_); + } + /* activation */ { REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str()); forwardActivation(); @@ -154,7 +158,7 @@ void MixedLayer::backward(const UpdateCallback& callback) { if (biases_ && biases_->getWGrad()) { REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str()); - biases_->getWGrad()->collectBias(*getOutputGrad(), 1); + biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_); /* Increasing the number of gradient */ biases_->getParameterPtr()->incUpdate(callback); diff --git a/paddle/gserver/layers/MixedLayer.h b/paddle/gserver/layers/MixedLayer.h index 9bac1355bd21ff2b949e593249ee2cd9063c3c75..5842e51e1d79d959d580e9cb92bead2d1961c9e6 100644 --- a/paddle/gserver/layers/MixedLayer.h +++ b/paddle/gserver/layers/MixedLayer.h @@ -58,5 +58,6 @@ protected: /// the matrix size of projection state std::vector projectionStateMatrixSize_; std::unique_ptr biases_; + bool sharedBias_; }; } // namespace paddle diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index ff2abf76973174ac2a437830b234f4c9937c08ed..26ee2b3aae64abfce69b543f13ab0f4254757fd8 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -20,6 +20,13 @@ add_unittest_without_exec(test_LayerGrad add_test(NAME test_LayerGrad COMMAND test_LayerGrad) +add_unittest_without_exec(test_ActivationGrad + test_ActivationGrad.cpp + LayerGradUtil.cpp + TestUtil.cpp) +add_test(NAME test_ActivationGrad + COMMAND test_ActivationGrad) + ################## test_Evaluator 
####################### add_unittest(test_Evaluator test_Evaluator.cpp diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp index 552a6c5b41c7f896c52b2132578b136200967573..bc7bee0e4bbc8c365505619f6fa21d2a88433fcd 100644 --- a/paddle/gserver/tests/LayerGradUtil.cpp +++ b/paddle/gserver/tests/LayerGradUtil.cpp @@ -669,12 +669,14 @@ void testLayerGrad(TestConfig testConf, string testLayerName, size_t batchSize, void testProjectionGrad(ProjectionConfig conf, InputType inputType, size_t parameterSize, size_t batchSize, bool useGpu, - bool testState) { + bool testState, int biasSize, bool sharedBias) { TestConfig config; conf.set_name(conf.type()); config.layerConfig.set_type("mixed"); config.layerConfig.set_size(conf.output_size()); - config.biasSize = config.layerConfig.size(); + config.biasSize = biasSize == 0 ? config.layerConfig.size() : biasSize; + config.layerConfig.set_bias_size(config.biasSize); + config.layerConfig.set_shared_biases(sharedBias); config.inputDefs.push_back( {inputType, "layer_0", conf.input_size(), parameterSize}); *config.layerConfig.add_inputs()->mutable_proj_conf() = conf; diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h index 1e608dc0620abd4fca5d7aa6a235daff13c41fb7..3b9ec803959b372a960ed705da5abf7d301a2c64 100644 --- a/paddle/gserver/tests/LayerGradUtil.h +++ b/paddle/gserver/tests/LayerGradUtil.h @@ -217,7 +217,8 @@ void testLayerGrad(TestConfig testConf, string testLayerName, size_t batchSize, void testProjectionGrad(ProjectionConfig conf, InputType inputType, size_t parameterSize, size_t batchSize, bool useGpu, - bool testState = false); + bool testState = false, int biasSize = 0, + bool sharedBias = false); void testOperatorGrad(TestConfig& config, OperatorConfig& operatorConf, size_t batchSize, bool useGpu, bool testState = false); diff --git a/paddle/gserver/tests/img_conv_a.conf b/paddle/gserver/tests/img_conv_a.conf new file mode 100644 index 0000000000000000000000000000000000000000..940589ed9ac242d6a73a74c9be39fcaafe66b7be --- /dev/null +++ b/paddle/gserver/tests/img_conv_a.conf @@ -0,0 +1,39 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
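
Editor's note — the two configs that follow build the same network two ways (img_conv_layer versus conv_projection inside concat/mixed layers) so that test_NetworkCompare can check their outputs match. A quick way to sanity-check the sizes such configs declare (a sketch; conv_output_size is a hypothetical helper using the same formula as ConvProjection::outputSize, and padding is assumed to default to 0 in these configs):

def conv_output_size(image_size, filter_size, padding, stride):
    # (image_size - filter_size + 2 * padding) // stride + 1, truncating
    return (image_size - filter_size + 2 * padding) // stride + 1

assert conv_output_size(16, 1, 0, 1) == 16      # filter_size=1, stride=1
# each conv emits 16 filters * 16 * 16 = 4096 values; the concat emits 8192
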
+ +from paddle.trainer_config_helpers import * + +settings(batch_size=10) +data = data_layer(name ="input", size=8*16*16) +conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1, + num_channels=8, + num_filters=16, stride=1, + bias_attr=False, + act=ReluActivation()) +conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1, + num_channels=8, + num_filters=16, stride=1, + bias_attr=False, + act=ReluActivation()) + +concat = concat_layer(input=[conv1, conv2]) + +conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1, + num_channels=8, + num_filters=16, stride=1, + bias_attr=True, + act=LinearActivation()) + +outputs(concat, conv) diff --git a/paddle/gserver/tests/img_conv_b.conf b/paddle/gserver/tests/img_conv_b.conf new file mode 100644 index 0000000000000000000000000000000000000000..8ca9c94541504d208b94f45bf71c8da440d18411 --- /dev/null +++ b/paddle/gserver/tests/img_conv_b.conf @@ -0,0 +1,32 @@ +#edit-mode: -*- python -*- +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.trainer_config_helpers import * + +settings(batch_size=10) +data = data_layer(name ="input", size=8*16*16) +proj1 = conv_projection(input=data, filter_size=1, filter_size_y=1, + num_channels=8, num_filters=16, stride=1) +proj2 = conv_projection(input=data, filter_size=1, filter_size_y=1, + num_channels=8, num_filters=16, stride=1) +concat = concat_layer(input=[proj1, proj2], bias_attr=False, act=ReluActivation()) + +proj = conv_projection(input=data, filter_size=1, filter_size_y=1, + num_channels=8, num_filters=16, stride=1) + +with mixed_layer(bias_attr=True, act=LinearActivation()) as conv: + conv += proj + +outputs(concat, conv) diff --git a/paddle/gserver/tests/test_ActivationGrad.cpp b/paddle/gserver/tests/test_ActivationGrad.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2c5d17090dfc7772c84477cb721b084b7a03c835 --- /dev/null +++ b/paddle/gserver/tests/test_ActivationGrad.cpp @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include "paddle/gserver/layers/DataLayer.h" +#include "ModelConfig.pb.h" +#include "paddle/trainer/Trainer.h" + +#include "TestUtil.h" +#include "LayerGradUtil.h" + +using namespace paddle; // NOLINT +using namespace std; // NOLINT + +P_DECLARE_bool(use_gpu); +P_DECLARE_bool(thread_local_rand_use_global_seed); + +void testActivation(const string& act) { + LOG(INFO) << "test activation: " << act; + size_t size = 10; + TestConfig config; + config.biasSize = 0; + config.layerConfig.set_type("addto"); + config.layerConfig.set_size(size); + config.layerConfig.set_active_type(act); + config.inputDefs.push_back({INPUT_DATA, "layer_0", size, 0}); + config.layerConfig.add_inputs(); + for (auto useGpu : {false, true}) { + testLayerGrad(config, + act + "_activation", + 100, + /* trans= */false, + useGpu, + /* useWeight */true); + } +} + +TEST(Activation, activation) { + auto types = ActivationFunction::getAllRegisteredTypes(); + std::set excluded{"sequence_softmax"}; + for (auto type : types) { + if (excluded.count(type)) continue; + testActivation(type); + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + initMain(argc, argv); + FLAGS_thread_local_rand_use_global_seed = true; + srand(1); + return RUN_ALL_TESTS(); +} diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 54a9aea024423b5ff97cfaa1c863f8f04355dccb..8ee89992ddf1e9b8d67acb59b53a0c64af5981e7 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -152,6 +152,45 @@ TEST(Projection, identity) { } } + +#ifndef PADDLE_ONLY_CPU +TEST(Projection, conv) { + const int NUM_FILTERS = 16; + const int FILTER_SIZE = 2; + const int FILTER_SIZE_Y = 3; + const int CHANNELS = 3; + const int IMAGE_SIZE = 16; + + ProjectionConfig conf; + conf.set_type("conv"); + conf.set_num_filters(NUM_FILTERS); + + ConvConfig* conv = conf.mutable_conv_conf(); + conv->set_filter_size(FILTER_SIZE); + conv->set_filter_size_y(FILTER_SIZE_Y); + conv->set_channels(CHANNELS); + conv->set_padding(0); + conv->set_padding_y(1); + conv->set_stride(2); + conv->set_stride_y(2); + conv->set_groups(1); + conv->set_filter_channels(conv->channels() / conv->groups()); + conv->set_img_size(IMAGE_SIZE); + int outputSize = (2 * conv->padding() + conv->img_size() - + conv->filter_size()) / conv->stride() + 1; + int outputSizeY = (2 * conv->padding_y() + conv->img_size() - + conv->filter_size_y()) / conv->stride_y() + 1; + conv->set_output_x(outputSize); + conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS); + conf.set_output_size(outputSize * outputSizeY * NUM_FILTERS); + + testProjectionGrad(conf, INPUT_DATA, + /* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE * FILTER_SIZE_Y, + /* batchSize */ 100, true, false, NUM_FILTERS, true); +} +#endif + + TEST(Layer, concat) { TestConfig config; config.biasSize = 0; diff --git a/paddle/gserver/tests/test_NetworkCompare.cpp b/paddle/gserver/tests/test_NetworkCompare.cpp index b3ef53067301b4f7f50ba799a035a80fa1c39e65..8d3eac5aca8d1567690f905b2e4b4f6fab7efdde 100644 --- a/paddle/gserver/tests/test_NetworkCompare.cpp +++ b/paddle/gserver/tests/test_NetworkCompare.cpp @@ -236,6 +236,15 @@ TEST(Compare, img_pool) { compareNetwork(config_file_a, config_file_b); FLAGS_use_gpu = useGpu; } + +TEST(Compare, img_conv) { + std::string config_file_a = "./gserver/tests/img_conv_a.conf"; + std::string config_file_b = "./gserver/tests/img_conv_b.conf"; + bool useGpu = FLAGS_use_gpu; + FLAGS_use_gpu 
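The conv-projection gradient test above derives the output grid from the standard convolution arithmetic. A small Python sketch of the same computation (the helper name is mine, not part of the patch), checked against the constants used in TEST(Projection, conv):

```python
def conv_output_size(img_size, filter_size, stride, padding):
    # mirrors the C++: (2 * padding + img_size - filter_size) / stride + 1
    return (2 * padding + img_size - filter_size) // stride + 1

# IMAGE_SIZE=16, FILTER_SIZE=2, FILTER_SIZE_Y=3, stride 2, padding 0 / 1
assert conv_output_size(16, 2, 2, 0) == 8   # outputSize (x)
assert conv_output_size(16, 3, 2, 1) == 8   # outputSizeY
```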
= true; + compareNetwork(config_file_a, config_file_b); + FLAGS_use_gpu = useGpu; +} #endif diff --git a/paddle/gserver/tests/test_PyDataProvider2.cpp b/paddle/gserver/tests/test_PyDataProvider2.cpp index 6bf1e329251219fcbf68b95f2d80a3235cb7037f..b9867a728d9b4cc8d318578ab3e45021f87daa4c 100644 --- a/paddle/gserver/tests/test_PyDataProvider2.cpp +++ b/paddle/gserver/tests/test_PyDataProvider2.cpp @@ -353,6 +353,23 @@ TEST(PyDataProvider2, test_check) { } } +TEST(PyDataProvider2, multiThread) { + paddle::DataConfig config; + config.set_type("py2"); + config.set_files(FLAGS_train_list.c_str()); + config.set_load_data_module("test_PyDataProvider2"); + config.set_load_data_object("test_dense_no_seq"); + config.set_async_load_data(true); + + std::unique_ptr provider( + paddle::DataProvider::create(config, false)); + provider->reset(); + paddle::DataBatch batch; + provider->getNextBatch(100, &batch); + provider->reset(); + provider.reset(); +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); paddle::initMain(argc, argv); diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp index da493379e3a37ecb8f4d8f9f333629b3e71d90a5..f8132066477db3b9762348e9baf7a5112d302fd6 100644 --- a/paddle/math/MathFunctions.cpp +++ b/paddle/math/MathFunctions.cpp @@ -39,6 +39,46 @@ void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, beta, C, ldc); } +template<> +int getrf(const CBLAS_ORDER order, const int M, const int N, + float *A, const int lda, int *ipiv) { +#ifdef PADDLE_USE_ATLAS + return clapack_sgetrf(order, M, N, A, lda, ipiv); +#else + return LAPACKE_sgetrf(order, M, N, A, lda, ipiv); +#endif +} + +template<> +int getrf(const CBLAS_ORDER order, const int M, const int N, + double *A, const int lda, int *ipiv) { +#ifdef PADDLE_USE_ATLAS + return clapack_dgetrf(order, M, N, A, lda, ipiv); +#else + return LAPACKE_dgetrf(order, M, N, A, lda, ipiv); +#endif +} + +template<> +int getri(const CBLAS_ORDER order, const int N, float *A, + const int lda, const int *ipiv) { +#ifdef PADDLE_USE_ATLAS + return clapack_sgetri(order, N, A, lda, ipiv); +#else + return LAPACKE_sgetri(order, N, A, lda, ipiv); +#endif +} + +template<> +int getri(const CBLAS_ORDER order, const int N, double *A, + const int lda, const int *ipiv) { +#ifdef PADDLE_USE_ATLAS + return clapack_dgetri(order, N, A, lda, ipiv); +#else + return LAPACKE_dgetri(order, N, A, lda, ipiv); +#endif +} + template<> void axpy(const int n, const float alpha, const float* x, float* y) { cblas_saxpy(n, alpha, x, 1, y, 1); diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index 43075977dc9cef1573cf6dd75d9ef577b07d337e..b322bd2bd719484b86b62bca5783d78bd8ca9a4c 100644 --- a/paddle/math/MathFunctions.h +++ b/paddle/math/MathFunctions.h @@ -17,10 +17,18 @@ limitations under the License. 
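The getrf/getri specializations above select between ATLAS's clapack_* interface and the LAPACKE interface at compile time; together they implement matrix inversion via LU factorization. As a rough illustration of the same two-step call sequence (this uses SciPy's bindings to the identical LAPACK routines, not code from the patch):

```python
import numpy as np
from scipy.linalg import lapack

a = np.random.rand(4, 4)
lu, piv, info = lapack.dgetrf(a)      # LU factorization, as in getrf<double>
assert info == 0
inv_a, info = lapack.dgetri(lu, piv)  # inverse from the LU factors, as in getri<double>
assert info == 0
assert np.allclose(a.dot(inv_a), np.eye(4))
```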
*/
#ifdef PADDLE_USE_MKL
#include <mkl.h>
+#include <mkl_lapacke.h>
#else
extern "C" {
#include <cblas.h>
}
+#ifdef PADDLE_USE_ATLAS
+extern "C" {
+#include <clapack.h>
+}
+#else
+#include <lapacke.h>
+#endif
#endif
#include
@@ -34,6 +42,14 @@ void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
const T* B, const int ldb, const T beta, T* C, const int ldc);
+template<class T>
+int getrf(const CBLAS_ORDER Order, const int M, const int N,
+ T *A, const int lda, int *ipiv);
+
+template<class T>
+int getri(const CBLAS_ORDER Order, const int N, T *A,
+ const int lda, const int *ipiv);
+
template
void axpy(const int n, const T alpha, const T* x, T* y);
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 33bc8d280fe8c3638a0aacc8052fd8464dc173b2..f1af9536ba5d6aac9907e70603a6f21ae2775c44 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -336,11 +336,44 @@ void GpuMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
hl_matrix_transpose(data, dataTrans, height_, width_, lda, ldc);
}
+
+MatrixPtr GpuMatrix::getInverse() {
+ MatrixPtr matInv;
+ inverse(matInv, true);
+ return matInv;
+}
+
+void GpuMatrix::inverse(MatrixPtr matInv, bool memAlloc) {
+ CHECK_EQ(height_, width_);
+
+ if (memAlloc) {
+ matInv = std::make_shared<GpuMatrix>(height_, width_);
+ } else {
+ CHECK(matInv != NULL);
+ }
+
+ real* data = getData();
+ real* dataInv = matInv->getData();
+ int lda = getStride();
+ int ldc = matInv->getStride();
+
+ hl_matrix_inverse(data, dataInv, height_, lda, ldc);
+}
+
void GpuMatrix::addBias(Matrix& b, real scale) {
CHECK(b.getHeight() == 1) << "the Bias should be a vector";
BaseMatrix::addBias(b, scale);
}
+void GpuMatrix::addSharedBias(Matrix& b, real scale) {
+ CHECK(b.getHeight() == 1) << "the Bias should be a vector";
+ CHECK_LE(b.getWidth(), getWidth());
+ CHECK_EQ(getWidth() % b.getWidth(), 0UL);
+ hl_matrix_add_shared_bias(getData(), b.getData(), b.getWidth(),
+ getHeight(), getWidth(), scale);
+}
+
+
void GpuMatrix::collectBias(Matrix& a, real scale) {
CHECK_EQ(getHeight(), (size_t)1);
CHECK_EQ(width_, a.getWidth());
@@ -355,6 +388,14 @@ void GpuMatrix::collectBias(Matrix& a, real scale) {
}
}
+void GpuMatrix::collectSharedBias(Matrix& a, real scale) {
+ CHECK_EQ(getHeight(), (size_t)1);
+ CHECK_EQ(a.getWidth() % getWidth(), 0UL);
+ hl_matrix_collect_shared_bias(getData(), a.getData(), getWidth(),
+ a.getHeight(), a.getWidth(), scale);
+}
+
+
void GpuMatrix::sequenceAvgForward(Matrix& a,
const IVector& startsPos,
int mode) {
@@ -1483,6 +1524,47 @@ void CpuMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
}
}
+
+MatrixPtr CpuMatrix::getInverse() {
+ MatrixPtr matInv;
+ inverse(matInv, true);
+ return matInv;
+}
+
+void CpuMatrix::inverse(MatrixPtr matInv, bool memAlloc) {
+ CHECK_EQ(height_, width_);
+
+ if (memAlloc) {
+ matInv = std::make_shared<CpuMatrix>(height_, width_);
+ } else {
+ CHECK(matInv != NULL);
+ }
+
+ CHECK_EQ(height_, matInv->getHeight());
+ CHECK_EQ(width_, matInv->getWidth());
+ matInv->copyFrom(*this);
+
+ real* data = getData();
+ real* dataInv = matInv->getData();
+ int ldc = matInv->getStride();
+
+ if (height_ == 1) {
+ CHECK_NE(*data, 0);
+ *dataInv = 1.0 / (*data);
+ return;
+ }
+
+ /* Compute the LU decomposition of the matrix */
+ std::vector<int> ipiv(height_);
+ CBLAS_ORDER order = (matInv->isTransposed() ?
CblasColMajor : CblasRowMajor);
+ int info = getrf(order, height_, height_, dataInv, ldc, ipiv.data());
+ CHECK_EQ(info, 0);
+
+ /* Compute the inverse of the matrix given its LU decomposition */
+ info = getri(order, height_, dataInv, ldc, ipiv.data());
+ CHECK_EQ(info, 0);
+}
+
void CpuMatrix::convExpand(Matrix& feature, int feaImgHeight, int feaImgWidth,
int channels, int blockH, int blockW, int strideH,
int strideW, int paddingH, int paddingW,
@@ -2046,6 +2128,24 @@ void CpuMatrix::addBias(Matrix& b, real scale) {
}
}
+void CpuMatrix::addSharedBias(Matrix& b, real scale) {
+ CHECK_EQ(b.getHeight(), (size_t)1);
+ real* aData = getData();
+ real* bData = b.getData();
+ size_t numSamples = getHeight();
+ size_t channel = b.getWidth();
+ CHECK_EQ(getWidth() % channel, 0UL);
+ size_t dim = getWidth() / channel;
+
+ for (size_t i = 0; i < numSamples; i++) {
+ for (size_t c = 0; c < channel; c++) {
+ for (size_t j = 0; j < dim; j++) {
+ aData[i * getStride() + c * dim + j] += scale * bData[c];
+ }
+ }
+ }
+}
+
void CpuMatrix::collectBias(Matrix& a, real scale) {
CHECK_EQ(getHeight(), (size_t)1);
CHECK_EQ(width_, a.getWidth());
@@ -2063,6 +2163,23 @@ void CpuMatrix::collectBias(Matrix& a, real scale) {
}
}
+void CpuMatrix::collectSharedBias(Matrix& a, real scale) {
+ CHECK_EQ(getHeight(), (size_t)1);
+ real* B = getData();
+ real* A = a.getData();
+ size_t numSamples = a.getHeight();
+ size_t channel = getWidth();
+ CHECK_EQ(a.getWidth() % channel, 0UL);
+ size_t dim = a.getWidth() / channel;
+ for (size_t i = 0; i < numSamples; i++) {
+ for (size_t c = 0; c < channel; c++) {
+ for (size_t j = 0; j < dim; j++) {
+ B[c] += scale * A[i * channel * dim + c * dim + j];
+ }
+ }
+ }
+}
+
void CpuMatrix::sequenceAvgForward(Matrix& a,
const IVector& startsPos,
int mode) {
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index 9b16ceacbfe98a34ba4260f7e4036b40e795a5b0..25748a15696e1a763bc84cc5282fe5594d910349 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -328,6 +328,20 @@ public:
LOG(FATAL) << "Not implemented";
}
+ virtual MatrixPtr getInverse() {
+ LOG(FATAL) << "Not implemented";
+ }
+
+ /**
+ * @brief inverse.
+ *
+ * If matInv's memory is allocated by the caller, set memAlloc to false;
+ * otherwise set it to true and the memory is allocated internally.
+ */
+ virtual void inverse(MatrixPtr matInv, bool memAlloc) {
+ LOG(FATAL) << "Not implemented";
+ }
+
public:
/// Only set all variables to 0 or NULL but not free them.
virtual void clear() {
@@ -343,11 +357,35 @@ public:
LOG(FATAL) << "Not implemented";
}
+ virtual void addSharedBias(Matrix& b, real scale) {
+ LOG(FATAL) << "Not implemented";
+ }
+
+ virtual void addBias(Matrix& b, real scale, bool sharedBias) {
+ if (!sharedBias) {
+ addBias(b, scale);
+ } else {
+ addSharedBias(b, scale);
+ }
+ }
+
/// add each sample from a to this.
virtual void collectBias(Matrix& a, real scale) {
LOG(FATAL) << "Not implemented";
}
+ virtual void collectSharedBias(Matrix& a, real scale) {
+ LOG(FATAL) << "Not implemented";
+ }
+
+ virtual void collectBias(Matrix& a, real scale, bool sharedBias) {
+ if (!sharedBias) {
+ collectBias(a, scale);
+ } else {
+ collectSharedBias(a, scale);
+ }
+ }
+
virtual void sequenceAvgForward(Matrix& a, const IVector& startsPos,
int mode) {
LOG(FATAL) << "Not implemented";
@@ -1035,8 +1073,12 @@ public:
MatrixPtr getTranspose();
void transpose(MatrixPtr matTrans, bool memAlloc);
+ MatrixPtr getInverse();
+ void inverse(MatrixPtr matInv, bool memAlloc);
+
/// add b to each sample of this.
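In both CPU loops above, the bias matrix holds one value per channel, and each sample row is laid out as channel-major blocks of dim = width / channels elements: addSharedBias broadcasts the per-channel value over its block, and collectSharedBias is the matching reduction for the gradient. A numpy sketch of the same arithmetic (shapes are illustrative only, not from the patch):

```python
import numpy as np

num_samples, channels, dim = 4, 3, 5            # dim = getWidth() / channels
data = np.random.rand(num_samples, channels * dim)
bias = np.random.rand(channels)

shaped = data.reshape(num_samples, channels, dim)

# addSharedBias: one bias value per channel, broadcast over dim positions
out = (shaped + bias[None, :, None]).reshape(num_samples, channels * dim)

# collectSharedBias: sum the per-channel contribution over samples and positions
bias_grad = shaped.sum(axis=(0, 2))
assert bias_grad.shape == (channels,)
```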
void addBias(Matrix& b, real scale); + void addSharedBias(Matrix& b, real scale); /** * @code @@ -1044,6 +1086,7 @@ public: * @endcode */ void collectBias(Matrix& a, real scale); + void collectSharedBias(Matrix& a, real scale); void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); @@ -1286,6 +1329,9 @@ public: MatrixPtr getTranspose(); void transpose(MatrixPtr matTrans, bool memAlloc); + MatrixPtr getInverse(); + void inverse(MatrixPtr matInv, bool memAlloc); + void copyFrom(const Matrix& src); void copyFrom(const Matrix& src, hl_stream_t stream); @@ -1371,9 +1417,11 @@ public: public: /// add b to each sample of this. void addBias(Matrix& b, real scale); + void addSharedBias(Matrix& b, real scale); /// add each sample of a to this. void collectBias(Matrix& a, real scale); + void collectSharedBias(Matrix& a, real scale); void sequenceAvgForward(Matrix& a, const IVector& startsPos, int mode); diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index 2cc38b82306e2b7ae50b89725d2095215714fa61..ef22e2aa8dd1734bce3dfff0241807ab5694d3fb 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -21,6 +21,8 @@ limitations under the License. */ #include "paddle/math/SparseMatrix.h" #include #include "paddle/gserver/tests/TestUtil.h" +#include "paddle/utils/Stat.h" + using namespace paddle; // NOLINT using namespace std; // NOLINT @@ -705,9 +707,32 @@ void testMatrixTranspose(int height, int width) { MatrixCheckEqual(*cpuT, *outputCheck); } +void testMatrixInverse(int height) { + MatrixPtr cpu = std::make_shared(height, height); + MatrixPtr gpu = std::make_shared(height, height); + MatrixPtr cpuI = std::make_shared(height, height); + MatrixPtr gpuI = std::make_shared(height, height); + + cpu->randomizeUniform(); + gpu->copyFrom(*cpu); + cpu->inverse(cpuI, false); + gpu->inverse(gpuI, false); + + MatrixPtr outputCheck = std::make_shared(height, height); + outputCheck->copyFrom(*gpuI); + MatrixCheckErr(*cpuI, *outputCheck); + + outputCheck->mul(cpu, cpuI); + cpu->zeroMem(); + for (int i = 0; i < height; i++) { + cpu->getRowBuf(i)[i] = 1.0; + } + MatrixCheckErr(*cpu, *outputCheck); +} + TEST(Matrix, unary) { - for (auto height : {1, 11, 73, 128, 200, 330}) { - for (auto width : {1, 32, 100, 512, 1000, 3210}) { + for (auto height : {1, 3, 11, 73, 128, 200, 330}) { + for (auto width : {1, 3, 32, 100, 512, 1000, 3210}) { VLOG(3) << " height=" << height << " width=" << width; // applyUnary @@ -739,6 +764,8 @@ TEST(Matrix, unary) { // transpose testMatrixTranspose(height, width); } + // inverse + testMatrixInverse(height); } } @@ -2137,6 +2164,60 @@ TEST(Matrix, MaxOutFwdBwd) { } } +void testAddSharedBias(int numSamples, int dim, int channel) { + MatrixPtr cpuData = std::make_shared(numSamples, dim); + MatrixPtr gpuData = std::make_shared(numSamples, dim); + + MatrixPtr cpuBias = std::make_shared(1, channel); + MatrixPtr gpuBias = std::make_shared(1, channel); + + cpuData->randomizeUniform(); + gpuData->copyFrom(*cpuData); + cpuBias->randomizeUniform(); + gpuBias->copyFrom(*cpuBias); + + cpuData->addSharedBias(*cpuBias, 1.0); + gpuData->addSharedBias(*gpuBias, 1.0); + + MatrixPtr check = std::make_shared(numSamples, dim); + check->copyFrom(*gpuData); + MatrixCheckErr(*cpuData, *check); +} + +void testCollectSharedBias(int numSamples, int dim, int channel) { + MatrixPtr cpuData = std::make_shared(numSamples, dim); + MatrixPtr gpuData = std::make_shared(numSamples, dim); + + MatrixPtr cpuBias = 
std::make_shared<CpuMatrix>(1, channel);
+ MatrixPtr gpuBias = std::make_shared<GpuMatrix>(1, channel);
+
+ cpuData->randomizeUniform();
+ gpuData->copyFrom(*cpuData);
+ cpuBias->randomizeUniform();
+ gpuBias->copyFrom(*cpuBias);
+
+ cpuBias->collectSharedBias(*cpuData, 1.0);
+ gpuBias->collectSharedBias(*gpuData, 1.0);
+
+ MatrixPtr check = std::make_shared<CpuMatrix>(1, channel);
+ check->copyFrom(*gpuBias);
+ MatrixCheckErr(*cpuBias, *check);
+}
+
+
+TEST(Matrix, sharedBias) {
+ for (auto numSamples : {1, 100, 520}) {
+ for (auto dim : {100 * 16, 100 * 32}) {
+ for (auto channel : {8, 16}) {
+ VLOG(3) << " numSamples=" << numSamples << " dim=" << dim
+ << " channel=" << channel;
+ testAddSharedBias(numSamples, dim, channel);
+ testCollectSharedBias(numSamples, dim, channel);
+ }
+ }
+ }
+}
+
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp
index 8f72c1988d1676503f8ab1174d34a8ee6fe78516..c8f37d0bf4f84cdd9588c269dda3fb6bd72c1dc5 100644
--- a/paddle/pserver/ParameterServer2.cpp
+++ b/paddle/pserver/ParameterServer2.cpp
@@ -264,6 +264,15 @@ void ParameterServer2::setParameter(const SendParameterRequest& request,
std::vector blockIds;
blockIds.reserve(request.blocks_size());
int bufferIndex = 0;
+
+ if (!request.blocks().size()) {
+ LOG(WARNING)
+ << "--ports_num or --ports_num_for_sparse might be too large, "
+ << "or the total dense parameter size or sparse parameter size "
+ << "might be too small; this pserver doesn't store any parameter.";
+ return;
+ }
+
for (const auto& block : request.blocks()) {
/// block size for parameter(e.g. 128 for sparse row, 1K for dense)
uint64_t blockSize = getParameterConfig(block).parameter_block_size();
diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py
index 6d8f5da3e298fae1b9ab9309c5906cf4eeb11eb1..dd2e146d112c055a68c8279417ce07d06fa10a7e 100644
--- a/paddle/py_paddle/dataprovider_converter.py
+++ b/paddle/py_paddle/dataprovider_converter.py
@@ -63,7 +63,7 @@ class SparseBinaryScanner(IScanner):
def scan(self, dat):
self.extend_cols(dat)
- self.__rows__.append(len(dat) + self.__rows__[-1])
+ self.__rows__.append(len(self.__cols__))
self.__height__ += 1
def extend_cols(self, dat):
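The one-line scan() fix above makes the row offsets track the actual length of the column buffer rather than len(dat); the two can disagree if extend_cols() does not append exactly len(dat) entries. This is precisely the indptr convention of a CSR matrix. A small scipy illustration of the corrected bookkeeping:

```python
import numpy as np
from scipy.sparse import csr_matrix

cols, rows = [], [0]
for sample in ([1, 4], [0, 2, 3], []):   # three sparse-binary rows
    cols.extend(sample)
    rows.append(len(cols))               # next row starts where cols ends

m = csr_matrix((np.ones(len(cols)), cols, rows), shape=(3, 5))
assert m.toarray()[1, 2] == 1.0
```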
diff --git a/paddle/py_paddle/util.py b/paddle/py_paddle/util.py
index e6cf2710ef523fa494ddfb0917dbf35ecb49d685..53f67a861e7d972648cfd22f451c6e56fa5aa149 100644
--- a/paddle/py_paddle/util.py
+++ b/paddle/py_paddle/util.py
@@ -79,6 +79,20 @@ class __ParameterCallbackWrapper__(swig_paddle.UpdateCallback):
else:
return __ParameterCallbackWrapper__(callback).__disown__()
+def __arguments_to_numpy__(i, arg):
+ assert isinstance(arg, swig_paddle.Arguments)
+ value = arg.getSlotValue(i)
+ if value is not None:
+ assert isinstance(value, swig_paddle.Matrix)
+ value = value.copyToNumpyMat()
+ ids = arg.getSlotIds(i)
+ if ids is not None:
+ assert isinstance(ids, swig_paddle.IVector)
+ ids = ids.copyToNumpyArray()
+ return {
+ "value": value,
+ "id": ids
+ }
def __monkeypatch_gradient_machine__():
"""
@@ -88,20 +102,6 @@ def __monkeypatch_gradient_machine__():
swig_paddle.GradientMachine.loadFromConfigFile = \
staticmethod(loadGradientMachine)
- def __arguments_to_numpy__(i, arg):
- assert isinstance(arg, swig_paddle.Arguments)
- value = arg.getSlotValue(i)
- if value is not None:
- assert isinstance(value, swig_paddle.Matrix)
- value = value.copyToNumpyMat()
- ids = arg.getSlotIds(i)
- if ids is not None:
- assert isinstance(ids, swig_paddle.IVector)
- ids = ids.copyToNumpyArray()
- return {
- "value": value,
- "id": ids
- }
def __matrix_to_numpy__(m):
if isinstance(m, swig_paddle.Matrix):
@@ -126,7 +126,7 @@ def __monkeypatch_gradient_machine__():
:type paramTypes: list of int
:return: paddle.GradientMachine
"""
- assert isinstance(protoObj, paddle.proto.ModelConfig_pb2.ModelConfig)
+ assert isinstance(protoObj, paddle.proto.ModelConfig)
return swig_paddle.GradientMachine.createByConfigProtoStr(
protoObj.SerializeToString(), createMode, paramTypes)
@@ -460,13 +460,29 @@ def __monkey_patch_protobuf_objects__():
"""
assert isinstance(protoObj,
- paddle.proto.TrainerConfig_pb2.OptimizationConfig)
+ paddle.proto.OptimizationConfig)
return swig_paddle.OptimizationConfig.createFromProtoString(
protoObj.SerializeToString())
swig_paddle.OptimizationConfig.createFromProto = staticmethod(
OptimizationConfig_createFromProto)
+ def TrainerConfig_createFromProto(protoObj):
+ """
+ Create a new paddle.TrainerConfig from
+ proto.TrainerConfig
+
+ :param protoObj: proto.TrainerConfig
+ :return: paddle.TrainerConfig
+ """
+ assert isinstance(protoObj,
+ paddle.proto.TrainerConfig)
+ return swig_paddle.TrainerConfig.createFromProtoString(
+ protoObj.SerializeToString())
+
+ swig_paddle.TrainerConfig.createFromProto = staticmethod(
+ TrainerConfig_createFromProto)
+
def __monkey_patch_parameter__():
def getBufs(self):
@@ -483,9 +499,66 @@ def __monkey_patch_parameter__():
swig_paddle.Parameter.getBufs = getBufs
+def __monkey_patch_trainer__():
+ swig_paddle.Trainer.__create__ = staticmethod(swig_paddle.Trainer.create)
+
+ def Trainer_create(config, model=None):
+ """
+ Create a trainer for a model with TrainerConfig trainer_config.
+ trainer_config.model_config will be ignored when model is supplied.
+ Trainer.trainOneBatch() and Trainer.forwardOneBatch() can be used only
+ when trainer_config.data_config is set.
+
+ A typical usage for Trainer is:
+ .. code-block:: python
+ trainer = Trainer.create(trainer_config, model)
+ for p in xrange(num_passes):
+ while True:
+ data = get_next_batch(batch_size)
+ if not data:
+ break
+ trainer.trainOneDataBatch(batch_size, data)
+ trainer.finishTrainPass()
+ trainer.finishTrain()
+
+ The trainer will take care of logging, model saving, distributed
+ training, etc.
+
+ :param config: trainer configuration
+ :type config: paddle.proto.TrainerConfig
+ :param model: the model to be trained
+ :type model: swig_paddle.GradientMachine
+ :return: a trainer
+ :rtype: swig_paddle.Trainer
+
+ """
+ assert isinstance(config, paddle.proto.TrainerConfig)
+ if model is not None:
+ assert isinstance(model, swig_paddle.GradientMachine)
+ return swig_paddle.Trainer.__create__(
+ swig_paddle.TrainerConfig.createFromProto(config), model)
+ swig_paddle.Trainer.create = staticmethod(Trainer_create)
+
+ swig_paddle.Trainer.__getForwardOutput__ = \
+ swig_paddle.Trainer.getForwardOutput
+
+ def getForwardOutput(self):
+ """
+ Get the network outputs from the previous trainOneBatch(),
+ trainOneDataBatch(), testOneDataBatch(), or forwardOneBatch() call.
+
+ :return: list of dictionary with keys ['id', 'value'], each value is a
+ numpy.ndarray.
+ """ + outArgs = self.__getForwardOutput__() + return [__arguments_to_numpy__(i, outArgs) for i in xrange( + outArgs.getSlotNum())] + + swig_paddle.Trainer.getForwardOutput = getForwardOutput + def monkeypatches(): patches = [__monkeypatch_init_paddle__, __monkeypatch_gradient_machine__, __monkey_patch_protobuf_objects__, - __monkey_patch_parameter__] + __monkey_patch_parameter__, __monkey_patch_trainer__] for patch in patches: patch() diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in index 4cf5f41f195df7655c9e77eba23baf90e21cee13..213cf2f1cc7e491dc6455f1af434446806aa4ccc 100644 --- a/paddle/scripts/submit_local.sh.in +++ b/paddle/scripts/submit_local.sh.in @@ -68,7 +68,7 @@ EOF if [ $? -eq 1 ]; then # Older version installed, or not installed at all echo "First time run paddle, need to install some python dependencies." BASEDIR=$(dirname "$0") - pip install ${BASEDIR}/../opt/paddle/share/wheels/*.whl + pip install ${BASEDIR}/../opt/paddle/share/wheels/*-@PADDLE_VERSION@-*.whl if [ $? -ne 0 ]; then echo "pip install wheels failed. " echo "Please use 'sudo paddle' at the first time you use PaddlePaddle" diff --git a/paddle/scripts/tools/build_docs/.gitignore b/paddle/scripts/tools/build_docs/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..6ec14c8f5bc3774a81dbe87c44f458594b38f12c --- /dev/null +++ b/paddle/scripts/tools/build_docs/.gitignore @@ -0,0 +1,2 @@ +doc +doc_cn diff --git a/paddle/scripts/tools/build_docs/Dockerfile b/paddle/scripts/tools/build_docs/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..5db0b29c4739943f9e677dc7973b392a345b7da1 --- /dev/null +++ b/paddle/scripts/tools/build_docs/Dockerfile @@ -0,0 +1,6 @@ +FROM paddledev/paddle:cpu-devel-latest +COPY build.sh / +RUN pip install sphinx &&\ + apt install -y doxygen graphviz &&\ + pip install breathe recommonmark numpy protobuf==2.6.1 +CMD /build.sh diff --git a/paddle/scripts/tools/build_docs/build.sh b/paddle/scripts/tools/build_docs/build.sh new file mode 100755 index 0000000000000000000000000000000000000000..a23b6e61d45926e77015365627bfb7dca303ac65 --- /dev/null +++ b/paddle/scripts/tools/build_docs/build.sh @@ -0,0 +1,13 @@ +#!/bin/bash +set -ex + +mkdir -p /build +cd /build +cmake /paddle -DWITH_DOC=ON +make paddle_docs paddle_docs_cn -j `nproc` +mkdir -p /output/doc +mkdir -p /output/doc_cn +cp -r doc/html/* /output/doc/ +cp -r doc_cn/html/* /output/doc_cn/ +cd / +rm -rf /paddle/build diff --git a/paddle/scripts/tools/build_docs/build_docs.sh b/paddle/scripts/tools/build_docs/build_docs.sh new file mode 100755 index 0000000000000000000000000000000000000000..9f8b80435c8fb17907d7da52c864a448f0d8d136 --- /dev/null +++ b/paddle/scripts/tools/build_docs/build_docs.sh @@ -0,0 +1,4 @@ +#!/bin/bash +set -e +docker build . -t paddle_build_doc +docker run --rm -v $PWD/../../../../:/paddle -v $PWD:/output paddle_build_doc diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh index a73c32344c8abe4d314fbac2c2ec02aafeeac9d1..54e3320c8c1584d0f41e8507c846b17f7c85d09c 100755 --- a/paddle/scripts/travis/build_and_test.sh +++ b/paddle/scripts/travis/build_and_test.sh @@ -6,17 +6,19 @@ if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then fi -cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_TESTING=ON -DON_TRAVIS=ON ${CMAKE_EXTRA} +cmake .. 
-DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_TESTING=ON -DON_TRAVIS=ON -DON_COVERALLS=ON ${CMAKE_EXTRA} NPROC=1 if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then NRPOC=`nproc` + make -j $NPROC + make coveralls elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then NPROC=`sysctl -n hw.ncpu` + make -j $NPROC + env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="-j $NPROC" fi -make -j $NPROC -env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="-j $NPROC" sudo make install sudo paddle version diff --git a/paddle/trainer/CMakeLists.txt b/paddle/trainer/CMakeLists.txt index 08b411d2ccbae7745b5bd72f92c1190cb75ced87..06c019f0a97757b658d1bc3405246d8f47632aad 100644 --- a/paddle/trainer/CMakeLists.txt +++ b/paddle/trainer/CMakeLists.txt @@ -7,6 +7,7 @@ set(TRAINER_SOURCES Tester.cpp Trainer.cpp TrainerInternal.cpp + TrainerBenchmark.cpp ThreadParameterUpdater.cpp TrainerInternalConfig.cpp TrainerConfigHelper.cpp) diff --git a/paddle/trainer/ParamUtil.cpp b/paddle/trainer/ParamUtil.cpp index dae8b44b6db8eec2d8d3284bdc6883355b5128ea..bb309a54975a1dfc386bfb440c90a6dd408205c3 100644 --- a/paddle/trainer/ParamUtil.cpp +++ b/paddle/trainer/ParamUtil.cpp @@ -89,6 +89,9 @@ void ParameterUtil::saveParameters(int passId, int passInnerId) { } std::string basePath = config_->getSaveDir(); + if (basePath.find('/') == std::string::npos) { + basePath = "./" + basePath; + } mkDirRecursively(basePath.c_str()); std::string saveDir = path::join(basePath, buf); diff --git a/paddle/trainer/Tester.cpp b/paddle/trainer/Tester.cpp index ccf06e1d84edc4f57e982102479f99295c1955e3..d3b88019faa04b7cebf44dd63678aa9d4ffb5252 100644 --- a/paddle/trainer/Tester.cpp +++ b/paddle/trainer/Tester.cpp @@ -71,24 +71,36 @@ Tester::Tester(const std::shared_ptr &config, parameterUpdater_)); } +void Tester::startTestPeriod() { + testEvaluator_->start(); + testContext_.cost = 0; + testContext_.numSamples = 0; + + parameterUpdater_->apply(); + if (intconfig_->prevBatchState) { + gradientMachine_->getState(*intconfig_->trainState); + gradientMachine_->setState(*intconfig_->testState); + } +} + +void Tester::testOneDataBatch( + const DataBatch& dataBatch, std::vector* outArgs) { + testContext_.cost += forwardOneBatch( + dataBatch, testEvaluator_.get(), outArgs); + testContext_.numSamples += dataBatch.getSize(); +} + void Tester::testOnePeriod() { DataBatch dataBatch; int64_t batchSize = config_->getOptConfig().batch_size(); - testEvaluator_->start(); - real cost = 0; - int64_t numSamples = 0; bool testAllData = intconfig_->testPeriod == 0 || intconfig_->testAllDataInOnePeriod; - int batches = testAllData ? std::numeric_limits::max() : intconfig_->testPeriod; - parameterUpdater_->apply(); - if (intconfig_->prevBatchState) { - gradientMachine_->getState(*intconfig_->trainState); - gradientMachine_->setState(*intconfig_->testState); - } + std::vector outArgs; + startTestPeriod(); for (int i = 0; i < batches; ++i) { int num = testDataProvider_->getNextBatch(batchSize, &dataBatch); if (num == 0) { @@ -102,13 +114,18 @@ void Tester::testOnePeriod() { num = testDataProvider_->getNextBatch(batchSize, &dataBatch); } } - cost += testOneBatch(dataBatch, testEvaluator_.get()); - numSamples += num; + testOneDataBatch(dataBatch, &outArgs); } + finishTestPeriod(); +} + +void Tester::finishTestPeriod() { testEvaluator_->finish(); - CHECK_GT(numSamples, 0) << "There is no samples in your test batch. 
Possibly "
- "wrong implementation of DataProvidor.reset()";
- LOG(INFO) << " Test samples=" << numSamples << " cost=" << cost / numSamples
+ CHECK_GT(testContext_.numSamples, 0)
+ << "There are no samples in your test batch. Possibly "
+ "wrong implementation of DataProvider.reset()";
+ LOG(INFO) << " Test samples=" << testContext_.numSamples
+ << " cost=" << testContext_.cost / testContext_.numSamples
<< " Eval: " << *testEvaluator_;
parameterUpdater_->restore();
if (intconfig_->prevBatchState) {
@@ -128,9 +145,11 @@ int64_t Tester::testOneBatchById(int64_t batchId) {
return 0;
}
+ std::vector<Argument> outArgs;
+
stats_ += std::pair{
actualBatchSize,
- testOneBatch(dataBatch, testEvaluator_.get())};
+ forwardOneBatch(dataBatch, testEvaluator_.get(), &outArgs)};
if (((batchId + 1) % intconfig_->logPeriod) == 0) {
LOG(INFO) << " Batch=" << batchId + 1 << " " << stats_.getStats(false);
@@ -139,7 +158,10 @@ int64_t Tester::testOneBatchById(int64_t batchId) {
return actualBatchSize;
}
-real Tester::testOneBatch(const DataBatch &dataBatch, Evaluator *evaluator) {
+real Tester::forwardOneBatch(const DataBatch &dataBatch,
+ Evaluator *evaluator,
+ std::vector<Argument>* pOutArgs) {
+ auto& outArgs = *pOutArgs;
const std::vector& inArgs = dataBatch.getStreams();
if (intconfig_->loadsaveParametersInPserver) {
REGISTER_TIMER("prefetch");
@@ -148,12 +170,11 @@ real Tester::testOneBatch(const DataBatch &dataBatch, Evaluator *evaluator) {
true /*after apply*/);
}
- std::vector<Argument> outArgs;
gradientMachine_->forward(inArgs, &outArgs, PASS_TEST);
// write features if set this flag and outArgs is not empty
std::string featFile = intconfig_->featFile;
- if (!featFile.empty() && !outArgs.empty()) {
+ if (!featFile.empty() && !outArgs.empty()) {
size_t numOutputs = outArgs.size();
std::vector featMatrices;
featMatrices.resize(numOutputs);
diff --git a/paddle/trainer/Tester.h b/paddle/trainer/Tester.h
index 9663b8def9145bc740f28150d34b8ff88fdfd66a..671ffc5220ebaf2e009225191f6a77e6fea80d33 100644
--- a/paddle/trainer/Tester.h
+++ b/paddle/trainer/Tester.h
@@ -68,6 +68,10 @@ public:
* is training at same time.
*/
void testOnePeriod();
+ void startTestPeriod();
+ void finishTestPeriod();
+ void testOneDataBatch(const DataBatch& dataBatch,
+ std::vector<Argument>* outArgs);
/**
* Test for given data batch.
@@ -75,7 +79,9 @@ public:
* @param evaluator Evaluator
* @return cost
*/
- real testOneBatch(const DataBatch &dataBatch, Evaluator *evaluator);
+ real forwardOneBatch(const DataBatch& dataBatch,
+ Evaluator* evaluator,
+ std::vector<Argument>* outArgs);
/**
@@ -99,6 +105,10 @@ protected:
std::ofstream os_;
std::vector cpuMat_;
std::vector cpuVec_;
+ struct {
+ int64_t numSamples;
+ real cost;
+ } testContext_;
private:
/**
diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp
index 275150e12d12b57550ce45355cb3c533b57b4b86..7fc48dd1fbec6588b71db031d89dd88c5c5cf92c 100644
--- a/paddle/trainer/Trainer.cpp
+++ b/paddle/trainer/Trainer.cpp
@@ -40,7 +40,7 @@ limitations under the License. */
#include "TrainerConfigHelper.h"
P_DEFINE_string(config, "", "Trainer config file");
-P_DEFINE_int32(test_period, 1000,
+P_DEFINE_int32(test_period, 0,
"Run test every so many train batches."
" 0 for testing after each pass."
" If not 0, test log_period batches."
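The refactoring above splits the old monolithic testOnePeriod() into startTestPeriod(), testOneDataBatch(), and finishTestPeriod(), so a caller can drive testing batch by batch while cost and sample counts accumulate in testContext_. A rough Python sketch of the new control flow, using stand-in callables rather than the real C++ API:

```python
def run_test_period(start, test_one_batch, finish, next_batch, max_batches):
    """Drive one test period the way the refactored Tester does."""
    start()                    # apply parameters, reset cost/sample counters
    for _ in range(max_batches):
        batch = next_batch()
        if not batch:          # data provider exhausted
            break
        test_one_batch(batch)  # accumulate cost and numSamples
    finish()                   # log cost / numSamples, restore parameters
```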
@@ -196,7 +196,8 @@ void Trainer::init(const std::shared_ptr &config, if (!dataProvider_ && config_->hasDataConfig()) { dataProvider_.reset(DataProvider::create(*config_, *config_, gpuData)); } - if (dataProvider_) { + if (!testDataProvider_) { + // No evaluator_ if there is testDataProvider but no dataProvider. evaluator_.reset(trainerInternal_.getGradientMachine()->makeEvaluator()); currentEvaluator_.reset( trainerInternal_.getGradientMachine()->makeEvaluator()); @@ -215,10 +216,7 @@ void Trainer::init(const std::shared_ptr &config, DataProvider::create(config_->getTestDataConfig(), *config_, gpuData)); } if (testDataProvider_) { - tester_.reset(new Tester(config_, createTesterConfig(), - trainerInternal_.getGradientMachine(), - trainerInternal_.getParameterUpdater(), - testDataProvider_)); + createTester(); } if (!testing && @@ -258,34 +256,25 @@ void Trainer::init(const std::shared_ptr &config, } } - // set current evaluator and evalutor trainerInternal_.setCurrentEvaluator(currentEvaluator_.get()); trainerInternal_.setEvaluator(evaluator_.get()); } void Trainer::train(size_t numPasses) { - srand(config_->getConfig().start_pass() + 1); - dataProvider_->reset(); - - if (this->testDataProvider_) { - this->testDataProvider_->reset(); - } - - trainerInternal_.getGradientMachine()->start(*config_, dataProvider_); - + startTrain(); for (size_t i = 0; i < numPasses; ++i) { if (IGradientMachineMode::trainWholeDataInOneBatch(mode_)) { trainOnePassBatch(config_->getConfig().start_pass() + i); } else { - trainOnePass(config_->getConfig().start_pass() + i); + trainOnePass(); } if (i < numPasses - 1) { dataProvider_->reset(); } } - trainerInternal_.getGradientMachine()->finish(); + finishTrain(); } @@ -387,13 +376,30 @@ real Trainer::checkGradient() { return maxDiff; } -void Trainer::trainOnePass(int passId) { - this->stats_->reset(); - int64_t batchId = 0; - int32_t batchSize = config_->getOptConfig().batch_size(); - real avgTestCost = 0; - int64_t numAvgTests = 0; - int passInnerId = 1; +void Trainer::startTrain() { + trainPassContext_.passId = config_->getConfig().start_pass(); + srand(config_->getConfig().start_pass() + 1); + if (dataProvider_) { + dataProvider_->reset(); + } + + if (this->testDataProvider_) { + this->testDataProvider_->reset(); + } + + trainerInternal_.getGradientMachine()->start(*config_, dataProvider_); +} + +void Trainer::finishTrain() { + trainerInternal_.getGradientMachine()->finish(); +} + +void Trainer::startTrainPass() { + stats_->reset(); + trainPassContext_.batchId = 0; + trainPassContext_.avgTestCost = 0; + trainPassContext_.numAvgTests = 0; + trainPassContext_.passInnerId = 1; trainerInternal_.getParameterUpdater()->startPass(); evaluator_->start(); @@ -401,81 +407,83 @@ void Trainer::trainOnePass(int passId) { trainerInternal_.getGradientMachine()->resetState(); trainerInternal_.getGradientMachine()->getState(testState_); } - while (true) { - DataBatch dataBatch; - - int num = 0; - { - REGISTER_TIMER("getTrainBatch"); - num = dataProvider_->getNextBatch(batchSize, &dataBatch); - } - if (num == 0) break; +} - if (averageEvaluator_) { - int64_t mod = batchId % FLAGS_average_test_period; - if (mod >= FLAGS_average_test_period - FLAGS_log_period) { - if (mod == FLAGS_average_test_period - FLAGS_log_period) { - averageEvaluator_->start(); - } - trainerInternal_.getParameterUpdater()->apply(); - if (FLAGS_prev_batch_state) { - trainerInternal_.getGradientMachine()->getState(trainState_); - } - avgTestCost += - tester_->testOneBatch(dataBatch, averageEvaluator_.get()); - 
if (FLAGS_prev_batch_state) { - trainerInternal_.getGradientMachine()->setState(trainState_); - } - numAvgTests += num; - trainerInternal_.getParameterUpdater()->restore(); +void Trainer::trainOneDataBatch(DataBatch& dataBatch) { + int num = dataBatch.getSize(); + if (averageEvaluator_) { + int64_t mod = trainPassContext_.batchId % FLAGS_average_test_period; + if (mod >= FLAGS_average_test_period - FLAGS_log_period) { + if (mod == FLAGS_average_test_period - FLAGS_log_period) { + averageEvaluator_->start(); } + trainerInternal_.getParameterUpdater()->apply(); + if (FLAGS_prev_batch_state) { + trainerInternal_.getGradientMachine()->getState(trainState_); + } + trainPassContext_.avgTestCost += + tester_->forwardOneBatch( + dataBatch, averageEvaluator_.get(), &forwardOutput_); + if (FLAGS_prev_batch_state) { + trainerInternal_.getGradientMachine()->setState(trainState_); + } + trainPassContext_.numAvgTests += num; + trainerInternal_.getParameterUpdater()->restore(); } - { - REGISTER_TIMER("TrainBatch"); - trainerInternal_.trainOneBatch(batchId, dataBatch); - } + } + { + REGISTER_TIMER("TrainBatch"); + trainerInternal_.trainOneBatch( + trainPassContext_.batchId, dataBatch, &forwardOutput_); + } - if (averageEvaluator_ && - batchId % FLAGS_average_test_period == FLAGS_average_test_period - 1) { - averageEvaluator_->finish(); - LOG(INFO) << " Averaged parameter:" - << " cost=" << avgTestCost / numAvgTests - << " Eval: " << *averageEvaluator_; - numAvgTests = 0; - avgTestCost = 0; - } + if (averageEvaluator_ && + trainPassContext_.batchId % FLAGS_average_test_period + == FLAGS_average_test_period - 1) { + averageEvaluator_->finish(); + LOG(INFO) << " Averaged parameter:" + << " cost=" << trainPassContext_.avgTestCost + / trainPassContext_.numAvgTests + << " Eval: " << *averageEvaluator_; + trainPassContext_.numAvgTests = 0; + trainPassContext_.avgTestCost = 0; + } - ++batchId; + ++trainPassContext_.batchId; - if (batchId % FLAGS_log_period == 0) { - FOR_TIMING(globalStat.setThreadInfo(true)); - FOR_TIMING(globalStat.printAllStatus()); - FOR_TIMING(globalStat.reset()); - } + if (trainPassContext_.batchId % FLAGS_log_period == 0) { + FOR_TIMING(globalStat.setThreadInfo(true)); + FOR_TIMING(globalStat.printAllStatus()); + FOR_TIMING(globalStat.reset()); + } - if (testDataProvider_ && FLAGS_test_period > 0 && - batchId % FLAGS_test_period == 0) { - tester_->testOnePeriod(); - } + if (testDataProvider_ && FLAGS_test_period > 0 && + trainPassContext_.batchId % FLAGS_test_period == 0) { + tester_->testOnePeriod(); + } - if (FLAGS_saving_period_by_batches > 0 && - batchId > FLAGS_saving_period_by_batches * passInnerId && - 0 == FLAGS_trainer_id) { - trainerInternal_.getParameterUpdater()->catchUpWith(); - if (testDataProvider_) { - tester_->testOnePeriod(); - } - paramUtil_->saveParametersOnePass(passId, passInnerId); - ++passInnerId; + if (FLAGS_saving_period_by_batches > 0 && + trainPassContext_.batchId + > FLAGS_saving_period_by_batches * trainPassContext_.passInnerId && + 0 == FLAGS_trainer_id) { + trainerInternal_.getParameterUpdater()->catchUpWith(); + if (testDataProvider_) { + tester_->testOnePeriod(); } + paramUtil_->saveParametersOnePass( + trainPassContext_.passId, trainPassContext_.passInnerId); + ++trainPassContext_.passInnerId; } +} - if (batchId == 0) { +void Trainer::finishTrainPass() { + if (trainPassContext_.batchId == 0) { // This means no more data from DataProvider return; } - trainerInternal_.finishTrainPass(passId, batchId); + trainerInternal_.finishTrainPass( + 
trainPassContext_.passId, trainPassContext_.batchId); FOR_TIMING(globalStat.setThreadInfo(true)); FOR_TIMING(globalStat.printAllStatus()); @@ -485,9 +493,30 @@ void Trainer::trainOnePass(int passId) { tester_->testOnePeriod(); } - if (passId % FLAGS_saving_period == 0 && FLAGS_trainer_id == 0) { - paramUtil_->saveParametersOnePass(passId); + if (trainPassContext_.passId % FLAGS_saving_period == 0 + && FLAGS_trainer_id == 0) { + paramUtil_->saveParametersOnePass(trainPassContext_.passId); } + ++trainPassContext_.passId; +} + +void Trainer::trainOnePass() { + startTrainPass(); + size_t batchSize = config_->getOptConfig().batch_size(); + while (true) { + DataBatch dataBatch; + + int num = 0; + { + REGISTER_TIMER("getTrainBatch"); + num = dataProvider_->getNextBatch(batchSize, &dataBatch); + } + if (num == 0) break; + CHECK_EQ(num, dataBatch.getSize()); + trainOneDataBatch(dataBatch); + } + + finishTrainPass(); } void Trainer::trainOnePassBatch(int passId) { @@ -582,6 +611,13 @@ void Trainer::clearGradient() { int Trainer::getBatchSize() { return config_->getOptConfig().batch_size(); } +void Trainer::createTester() { + tester_.reset(new paddle::Tester(config_, createTesterConfig(), + trainerInternal_.getGradientMachine(), + trainerInternal_.getParameterUpdater(), + testDataProvider_)); +} + void Trainer::test() { tester_->test(); } diff --git a/paddle/trainer/Trainer.h b/paddle/trainer/Trainer.h index 9bfd6d107a20438d2b1fc8d3143a39c7961c8115..7762722456c442cff956c3a551c66acb2bdebc62 100644 --- a/paddle/trainer/Trainer.h +++ b/paddle/trainer/Trainer.h @@ -94,6 +94,12 @@ public: */ real checkGradient(); + void startTrain(); + void finishTrain(); + void startTrainPass(); + void finishTrainPass(); + void trainOneDataBatch(DataBatch& dataBatch); + void time(); /** * given a dataBatch and the current parameter value @@ -144,11 +150,11 @@ public: protected: /** - * Train one pass of data. passId starts from 0 + * Train one pass of data. * * SGD Method. */ - void trainOnePass(int passId); + void trainOnePass(); /** * Train one pass in one batch. @@ -161,6 +167,8 @@ protected: */ void clearGradient(); + void createTester(); + private: std::unique_ptr createTesterConfig(); @@ -173,6 +181,17 @@ protected: MachineState trainState_; MachineState testState_; + struct TrainPassContext { + int64_t batchId; + real avgTestCost; + int64_t numAvgTests; + int passId; + int passInnerId; + }; + std::vector forwardOutput_; + + TrainPassContext trainPassContext_; + std::unique_ptr evaluator_; std::unique_ptr currentEvaluator_; std::unique_ptr averageEvaluator_; diff --git a/paddle/trainer/TrainerBenchmark.cpp b/paddle/trainer/TrainerBenchmark.cpp new file mode 100644 index 0000000000000000000000000000000000000000..54862e95b4a738b88dc256efbac9102fca383a4f --- /dev/null +++ b/paddle/trainer/TrainerBenchmark.cpp @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#undef PADDLE_DISABLE_TIMER
+
+#include "Trainer.h"
+#include "paddle/utils/Stat.h"
+#include "paddle/utils/Util.h"
+
+P_DECLARE_int32(test_period);
+
+P_DEFINE_bool(feed_data, false, "Whether to read data from DataProvider.");
+
+namespace paddle {
+
+void Trainer::time() {
+ startTrain();
+
+ trainerInternal_.getParameterUpdater()->startPass();
+ evaluator_->start();
+
+ DataBatch dataBatch;
+ int32_t batchSize = config_->getOptConfig().batch_size();
+ int32_t num = dataProvider_->getNextBatch(batchSize, &dataBatch);
+ CHECK_EQ(num, batchSize) << "The sample number is less than batch size "
+ << num << " != " << batchSize;
+
+ CHECK(dataBatch.getSize()) << "No data from data provider";
+
+ std::vector<Argument> outputs;
+ // burn-in iterations before timing
+ LOG(INFO) << "Burning time...";
+ for (int n = 0; n < 10; ++n) {
+ trainerInternal_.trainOneBatch(n, dataBatch, &outputs);
+ }
+ LOG(INFO) << "Burning time end.";
+
+ for (int n = 0; n < FLAGS_test_period; n++) {
+ if (FLAGS_feed_data) {
+ REGISTER_TIMER("GetData");
+ num = dataProvider_->getNextBatch(batchSize, &dataBatch);
+ }
+
+ if (num != batchSize) {
+ break;
+ }
+
+ {
+ REGISTER_TIMER("FwdBwd");
+ trainerInternal_.trainOneBatch(n, dataBatch, &outputs);
+ }
+ }
+ globalStat.setThreadInfo(true);
+ globalStat.printSegTimerStatus();
+ globalStat.reset();
+
+ finishTrain();
+}
+
+} // namespace paddle
diff --git a/paddle/trainer/TrainerInternal.cpp b/paddle/trainer/TrainerInternal.cpp
index 6029a4b2c1d0a0c04058bbd979523f26b72b5a5e..e23e42927c381d6efa9a3eef47f7e99f0a65b013 100644
--- a/paddle/trainer/TrainerInternal.cpp
+++ b/paddle/trainer/TrainerInternal.cpp
@@ -55,6 +55,8 @@ void TrainerInternal::init(const std::shared_ptr &config,
gradientMachine_ = gradientMachine;
if (!gradientMachine) {
+ CHECK(config_->getConfig().has_model_config())
+ << "Missing model_config in trainer_config";
gradientMachine_.reset(GradientMachine::create(
config_->getConfig().model_config(), intconfig_->mode,
parameterUpdater_->getParameterTypes()));
@@ -62,7 +64,8 @@ void TrainerInternal::init(const std::shared_ptr &config,
}
void TrainerInternal::trainOneBatch(int64_t batchId,
- const DataBatch& dataBatch) {
+ const DataBatch& dataBatch,
+ std::vector<Argument>* outArgs) {
// true means updating parameter whenever gradient is ready during backward()
bool doPipelineUpdate =
(intconfig_->mode != GradientMachine::kSgdSparseCpuTraining) &&
@@ -84,7 +87,6 @@ void TrainerInternal::trainOneBatch(int64_t batchId,
}
const std::vector& inArgs = dataBatch.getStreams();
- std::vector<Argument> outArgs;
PassType passType = parameterUpdater_->startBatch(actualBatchSize);
@@ -114,7 +116,7 @@ void TrainerInternal::trainOneBatch(int64_t batchId,
timer.start();
#endif
REGISTER_TIMER("forwardBackward");
- forwardBackwardBatch(inArgs, outArgs, passType, updateCallback,
+ forwardBackwardBatch(inArgs, *outArgs, passType, updateCallback,
doPipelineUpdate);
#ifndef PADDLE_DISABLE_TIMER
timer.stop();
@@ -132,7 +134,7 @@ void TrainerInternal::trainOneBatch(int64_t batchId,
real cost = 0;
{
REGISTER_TIMER("sumCost");
- cost = Argument::sumCosts(outArgs);
+ cost = Argument::sumCosts(*outArgs);
}
if (batchId % intconfig_->log_period == 0) {
diff --git a/paddle/trainer/TrainerInternal.h b/paddle/trainer/TrainerInternal.h
index 17011c4d2e46fee34e8abf08279327fa747d9c0a..3a53aa1d17b31ad3e7c1aa53f622c6399baa834e 100644
--- a/paddle/trainer/TrainerInternal.h
+++ b/paddle/trainer/TrainerInternal.h
@@ -81,7 +81,9 @@ public:
* @param batchId current batch id
* @param dataBatch data for the batch
*/
- void trainOneBatch(int64_t
batchId, const DataBatch& dataBatch);
+ void trainOneBatch(int64_t batchId,
+ const DataBatch& dataBatch,
+ std::vector<Argument>* outArgs);
/**
* showParameterStats
diff --git a/paddle/trainer/TrainerMain.cpp b/paddle/trainer/TrainerMain.cpp
index 94266639f94ade6b490eb26243dd964ddedf40b9..a486cc383ace62111dbdbdd98e83710831a64095 100644
--- a/paddle/trainer/TrainerMain.cpp
+++ b/paddle/trainer/TrainerMain.cpp
@@ -103,6 +103,8 @@ int main(int argc, char** argv) {
trainer.checkGradient();
} else if (FLAGS_job == "test") {
trainer.test();
+ } else if (FLAGS_job == "time") {
+ trainer.time();
} else {
LOG(FATAL) << "Unknown job type: " << FLAGS_job;
}
diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp
index 45251213d2d7930947f39d4730245ca8f7dfddc8..2cdff9d1aca927122fcdb0c2a7ab22a0e38b41c1 100644
--- a/paddle/utils/Util.cpp
+++ b/paddle/utils/Util.cpp
@@ -106,7 +106,7 @@ pid_t getTID() {
#endif
pid_t tid = syscall(__NR_gettid);
#endif
- CHECK_NE(tid, -1);
+ CHECK_NE((int)tid, -1);
return tid;
}
diff --git a/proto/ModelConfig.proto.m4 b/proto/ModelConfig.proto.m4
index a1eb11eccaeda6d1eb3e58300ac81a5a2f7d427a..41d081942778e2bd1c11452c41825cc4eb3e1848 100644
--- a/proto/ModelConfig.proto.m4
+++ b/proto/ModelConfig.proto.m4
@@ -262,7 +262,7 @@ sinclude(`ModelConfigLayer.proto.m4')
// (which is how convnets are usually trained). Setting this to
// false will untie the biases, yielding a separate bias for
// every location at which the filter is applied.
- optional bool shared_biases = 8;
+ optional bool shared_biases = 8 [default = false];
// Valid values are ones that divide the area of the output
// grid in this convolutional layer. For example if this layer
@@ -386,6 +386,18 @@ sinclude(`ModelConfigLayer.proto.m4')
// use to compute moving mean and variance.
optional real moving_average_fraction = 47 [default = 0.9];
+
+ // bias size
+ optional uint32 bias_size = 48 [default = 0];
+
+ // This parameter can be used as a user-defined parameter when necessary,
+ // without changing the proto file; e.g., when a new layer with a
+ // user-defined parameter is implemented, it can be used to pass that
+ // parameter without modifying the proto schema. The string type is used
+ // for flexibility: different types can be converted to string and
+ // reinterpreted in the user's own layer implementation.
+ optional string user_arg = 49;
+
}
message EvaluatorConfig {
diff --git a/proto/TrainerConfig.proto.m4 b/proto/TrainerConfig.proto.m4
index a42ff88d54b5e445e7cfadc7467c1bc7d8c7ef26..3b0e24f90bed8cdf0e102c12d2a4a041c17a8447 100644
--- a/proto/TrainerConfig.proto.m4
+++ b/proto/TrainerConfig.proto.m4
@@ -130,7 +130,7 @@ message OptimizationConfig {
};
message TrainerConfig {
- required ModelConfig model_config = 1;
+ optional ModelConfig model_config = 1;
optional DataConfig data_config = 2;
required OptimizationConfig opt_config = 3;
optional DataConfig test_data_config = 4;
diff --git a/python/paddle/proto/__init__.py b/python/paddle/proto/__init__.py
index 7f9e87eee6037666b86420fba194624859d356b3..cd6a59ecbb0952e89f34b11678a60ad300585979 100644
--- a/python/paddle/proto/__init__.py
+++ b/python/paddle/proto/__init__.py
@@ -12,3 +12,5 @@
# See the License for the specific language governing permissions and
# limitations under the License.
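The new string-typed user_arg field is deliberately schema-free: a custom layer can pack any parameter into it and reinterpret it on the other side. One plausible convention (my example, not prescribed by the patch) is JSON:

```python
import json

# encode a custom hyper-parameter without touching the proto schema
user_arg = json.dumps({"my_alpha": 0.1})

# ...later, inside the user-defined layer implementation, decode it again
params = json.loads(user_arg)
assert params["my_alpha"] == 0.1
```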
+from paddle.proto.TrainerConfig_pb2 import OptimizationConfig, TrainerConfig +from paddle.proto.ModelConfig_pb2 import ModelConfig diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 574c02eefc35feaf2e42b3d2ba272954b2443a4e..c926d0ec079fffeeb75dc5afc80540b8c04c4112 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -633,6 +633,44 @@ class ContextProjection(Projection): _total_pad = 0 +@config_class +class ConvProjection(Projection): + type = 'conv' + + def __init__( + self, + input_layer_name, + num_filters=None, + conv_conf=None, + **xargs): + super(ConvProjection, self).__init__(input_layer_name, **xargs) + + if num_filters is not None: + self.proj_conf.num_filters = num_filters + + parse_conv(conv_conf, + input_layer_name, + self.proj_conf.conv_conf) + # TODO: support rectangle input + self.proj_conf.output_size = (self.proj_conf.conv_conf.output_x ** 2) * num_filters + + def calc_output_size(self, input_layer_config): + return self.proj_conf.output_size + + def calc_parameter_size(self, input_size, output_size): + co = self.proj_conf.num_filters + ci = self.proj_conf.conv_conf.channels + fh = self.proj_conf.conv_conf.filter_size + fw = self.proj_conf.conv_conf.filter_size_y + return co * ci * fh * fw + + def calc_bias_size(self): + return self.proj_conf.num_filters + + def calc_parameter_dims(self, input_size, output_size): + return None + + # Define a operator for mixed layer @config_class class Operator(Cfg): @@ -2560,8 +2598,16 @@ class MixedLayer(LayerBase): record_operator_conf = self.config.operator_confs.add() record_operator_conf.CopyFrom(operator_conf) + psize = self.config.size + if isinstance(self.inputs[0], ConvProjection): + self.config.shared_biases = True + psize = 0 + for input in self.inputs: + psize += input.calc_bias_size() - self.create_bias_parameter(bias, self.config.size) + if bias: + self.config.bias_size = psize + self.create_bias_parameter(bias, psize) if error_clipping_threshold is not None: self.config.error_clipping_threshold = error_clipping_threshold @@ -2579,8 +2625,10 @@ class ConcatenateLayer(LayerBase): self, name, inputs, + bias=False, **xargs): config_assert(inputs, 'inputs cannot be empty') + config_assert(not bias, 'ConcatenateLayer cannot support bias.') super(ConcatenateLayer, self).__init__( name, 'concat', 0, inputs=inputs, **xargs) size = 0 @@ -2599,10 +2647,19 @@ class ConcatenateLayer2(LayerBase): self, name, inputs, + bias=False, **xargs): config_assert(inputs, 'inputs cannot be empty') super(ConcatenateLayer2, self).__init__( name, 'concat2', 0, inputs=inputs, **xargs) + + if isinstance(self.inputs[0], ConvProjection): + for input_index in xrange(len(self.inputs) - 1): + input = self.inputs[input_index + 1] + config_assert(isinstance(input, ConvProjection), + "The first input of ConcatenateLayer2 is ConvProjection, " + "the other inputs should also be ConvProjection.") + size = 0 for input_index in xrange(len(self.inputs)): input_layer = self.get_input_layer(input_index) @@ -2628,6 +2685,17 @@ class ConcatenateLayer2(LayerBase): input.proj_conf.output_size) self.create_input_parameter(input_index, psize, dims) + psize = self.config.size + if isinstance(self.inputs[0], ConvProjection): + self.config.shared_biases = True + psize = 0 + for input in self.inputs: + psize += input.calc_bias_size() + + if bias: + self.config.bias_size = psize + self.create_bias_parameter(bias, psize) + @config_layer('recurrent') class RecurrentLayer(LayerBase): def 
__init__( diff --git a/python/paddle/trainer_config_helpers/activations.py b/python/paddle/trainer_config_helpers/activations.py index 292014519374eabbe55c61daa73692814a52aac2..ad5cdc0a0eb13f7a58e7d89ebfb79d33a63b75d5 100644 --- a/python/paddle/trainer_config_helpers/activations.py +++ b/python/paddle/trainer_config_helpers/activations.py @@ -199,3 +199,12 @@ class ExpActivation(BaseActivation): f(z) = e^z. """ def __init__(self): BaseActivation.__init__(self, 'exponential', False) + +class LogActivation(BaseActivation): + """ + Logarithm Activation. + + .. math:: + f(z) = log(z) + """ + def __init__(self): BaseActivation.__init__(self, 'log', False) diff --git a/python/paddle/trainer_config_helpers/default_decorators.py b/python/paddle/trainer_config_helpers/default_decorators.py index b20aebc685fe5a36b69c4e9f09b610631b233ecf..be00f48b457c137e3b0913da84ad2e6215f9e9ca 100644 --- a/python/paddle/trainer_config_helpers/default_decorators.py +++ b/python/paddle/trainer_config_helpers/default_decorators.py @@ -13,6 +13,7 @@ # limitations under the License. import functools +import inspect from .attrs import ParamAttr from .activations import TanhActivation from paddle.trainer.config_parser import * @@ -37,8 +38,12 @@ def wrap_param_default(param_names=None, default_factory=None, @functools.wraps(func) def __wrapper__(*args, **kwargs): if len(args) != 0: - logger.warning("please use keyword arguments in paddle config.") - + argspec = inspect.getargspec(func) + num_positional = len(argspec.args) + if argspec.defaults: + num_positional -= len(argspec.defaults) + if not argspec.varargs and len(args) > num_positional: + logger.fatal("Must use keyword arguments for non-positional args") for name in param_names: if not_set_callback(kwargs, name): # Not set kwargs[name] = default_factory(func) diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 6457c60a353922cae0d7f08648b25782cfcb6e41..bf722fb1ba4534b24e5ff0c91a34be8fd1e47c15 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -34,7 +34,7 @@ __all__ = ["full_matrix_projection", "AggregateLevel", "ExpandLevel", "table_projection", "mixed_layer", "data_layer", "embedding_layer", "fc_layer", "grumemory", "pooling_layer", "lstmemory", "last_seq", "first_seq", - "cos_sim", "hsigmoid", + "cos_sim", "hsigmoid", "conv_projection", "regression_cost", 'classification_cost', "LayerOutput", 'img_conv_layer', 'img_pool_layer', 'batch_norm_layer', 'img_cmrnorm_layer', 'addto_layer', @@ -54,7 +54,7 @@ __all__ = ["full_matrix_projection", "AggregateLevel", "ExpandLevel", 'cross_entropy_with_selfnorm', 'cross_entropy', 'multi_binary_label_cross_entropy', 'rank_cost', 'lambda_cost', 'huber_cost', - # 'block_expand_layer', # TODO(yuyang18): this layer is not correct + 'block_expand_layer', 'maxout_layer', 'out_prod_layer', 'print_layer' ] @@ -565,7 +565,7 @@ class MixedLayerType(LayerOutput): self.inputs = [] self.finalized = False - def __add__(self, other): + def __iadd__(self, other): """ + += operator :param other: Other projection. @@ -2033,7 +2033,7 @@ def addto_layer(input, act=None, name=None, bias_attr=None, @wrap_act_default(act=IdentityActivation()) @wrap_name_default("concat") @layer_support() -def concat_layer(input, act=None, name=None, layer_attr=None): +def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None): """ Concat all input vector into one huge vector. 
Inputs can be list of LayerOutput or list of projection. @@ -2092,10 +2092,14 @@ def concat_layer(input, act=None, name=None, layer_attr=None): layer_type = (LayerType.CONCAT_LAYER if is_concat_layer else LayerType.CONCAT_PROJ_LAYER) + if layer_type == LayerType.CONCAT_LAYER: + assert not bias_attr + Layer( name=name, type=layer_type, inputs=[x.name for x in input] if is_concat_layer else input, active_type=act.name, + bias=ParamAttr.to_bias(bias_attr), **ExtraLayerAttribute.to_kwargs(layer_attr) ) @@ -2999,6 +3003,103 @@ def conv_operator(img, filter, filter_size, num_filters, op.origin = [img, filter] return op +@wrap_param_attr_default() +def conv_projection(input, filter_size, num_filters, + num_channels=None, stride=1, padding=0, + filter_size_y=None, stride_y=None, padding_y=None, + groups=1, param_attr=None): + """ + ConvProjection with a layer as input. + It performs convolution over the input with the filter weights. + + Different from img_conv_layer and conv_op, conv_projection is a Projection, + which can be used in mixed_layer and concat_layer. It uses cudnn to implement + the convolution and only supports GPU mode. + + The example usage is: + + .. code-block:: python + + proj = conv_projection(input=input1, + filter_size=3, + num_filters=64, + num_channels=64) + + :param input: input layer + :type input: LayerOutput + :param filter_size: The x dimension of a filter kernel. + :type filter_size: int + :param filter_size_y: The y dimension of a filter kernel. Since + PaddlePaddle now supports rectangular filters, + the filter's shape can be (filter_size, filter_size_y). + :type filter_size_y: int + :param num_filters: channel number of the output data. + :type num_filters: int + :param num_channels: channel number of the input data. + :type num_channels: int + :param stride: The x dimension of the stride. + :type stride: int + :param stride_y: The y dimension of the stride. + :type stride_y: int + :param padding: The x dimension of padding. + :type padding: int + :param padding_y: The y dimension of padding. + :type padding_y: int + :param groups: The group number. + :type groups: int + :param param_attr: Convolution param attribute. None means default attribute. + :type param_attr: ParameterAttribute + :return: A ConvProjection object. + :rtype: ConvProjection + """ + if num_channels is None: + assert input.num_filters is not None + num_channels = input.num_filters + + if filter_size_y is None: + if isinstance(filter_size, collections.Sequence): + assert len(filter_size) == 2 + filter_size, filter_size_y = filter_size + else: + filter_size_y = filter_size + + if stride_y is None: + if isinstance(stride, collections.Sequence): + assert len(stride) == 2 + stride, stride_y = stride + else: + stride_y = stride + + if padding_y is None: + if isinstance(padding, collections.Sequence): + assert len(padding) == 2 + padding, padding_y = padding + else: + padding_y = padding + + if param_attr.attr.get('initial_smart'): + # special initialization for conv layers.
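+        # initial_std is set to sqrt(2 / (filter_size**2 * num_channels)),
+        # i.e. sqrt(2 / fan_in) for a square kernel (the He/MSRA rule),
+        # while the mean is zeroed and "smart" init is switched off below.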
+ init_w = (2.0 / (filter_size ** 2 * num_channels)) ** 0.5 + param_attr.attr["initial_mean"] = 0.0 + param_attr.attr["initial_std"] = init_w + param_attr.attr["initial_strategy"] = 0 + param_attr.attr["initial_smart"] = False + + proj = ConvProjection(input_layer_name=input.name, + num_filters=num_filters, + conv_conf=Conv(filter_size=filter_size, + padding=padding, + stride=stride, + channels=num_channels, + filter_size_y=filter_size_y, + padding_y=padding_y, + stride_y=stride_y, + groups=groups), + **param_attr.attr) + + proj.origin = input + return proj + @wrap_name_default() @layer_support() @@ -3333,18 +3434,18 @@ convex_comb_layer = linear_comb_layer @wrap_name_default() @layer_support() def block_expand_layer(input, - channel=0, block_x=0, block_y=0, stride_x=0, stride_y=0, padding_x=0, padding_y=0, + num_channels=None, name=None, layer_attr=None): """ Expand feature map to minibatch matrix. - - matrix width is: block_y * block_x * channel + - matrix width is: block_y * block_x * num_channels - matirx height is: outputH * outputW .. math:: @@ -3356,7 +3457,7 @@ def block_expand_layer(input, The expand method is the same with ExpandConvLayer, but saved the transposed value. After expanding, output.sequenceStartPositions will store timeline. The number of time steps are outputH * outputW and the dimension of each - time step is block_y * block_x * channel. This layer can be used after + time step is block_y * block_x * num_channels. This layer can be used after convolution neural network, and before recurrent neural network. The simple usage is: @@ -3364,7 +3465,7 @@ def block_expand_layer(input, .. code-block:: python block_expand = block_expand_layer(input, - channel=128, + num_channels=128, stride_x=1, stride_y=1, block_x=1, @@ -3372,8 +3473,8 @@ def block_expand_layer(input, :param input: The input layer. :type input: LayerOutput - :param channel: The channel number of input layer. - :type channel: int + :param num_channels: The channel number of input layer. + :type num_channels: int|None :param block_x: The width of sub block. :type block_x: int :param block_y: The width of sub block. @@ -3393,16 +3494,18 @@ def block_expand_layer(input, :return: LayerOutput object. :rtype: LayerOutput """ + if num_channels is None: + assert input.num_filters is not None + num_channels = input.num_filters Layer(name=name, - input=Input(input.name, - block_expand=BlockExpand(channels=channel, - block_x=block_x, - block_y=block_y, - stride_x=stride_x, - stride_y=stride_y, - padding_x=padding_x, - padding_y=padding_y) - ), + inputs=Input(input.name, + block_expand=BlockExpand(channels=num_channels, + block_x=block_x, + block_y=block_y, + stride_x=stride_x, + stride_y=stride_y, + padding_x=padding_x, + padding_y=padding_y)), type=LayerType.BLOCK_EXPAND, **ExtraLayerAttribute.to_kwargs(layer_attr) ) diff --git a/python/paddle/trainer_config_helpers/math.py b/python/paddle/trainer_config_helpers/math.py new file mode 100644 index 0000000000000000000000000000000000000000..e35849b77ac531b4a4676019e01285af67925bd9 --- /dev/null +++ b/python/paddle/trainer_config_helpers/math.py @@ -0,0 +1,64 @@ +# Copyright (c) 2016 Baidu, Inc. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .layers import LayerOutput, mixed_layer, identity_projection, \ + slope_intercept_layer +from .attrs import is_compatible_with +from .default_decorators import * +import activations as act + +__all__ = [] + +def register_unary_math_op(op_name, act): + def op(input, name=None): + return mixed_layer(input=[identity_projection(input=input)], + name=name, + act=act) + op = wrap_name_default(op_name)(op) + op.__doc__ = type(act).__doc__ + globals()[op_name] = op + __all__.append(op_name) + +register_unary_math_op('exp', act.ExpActivation()) +register_unary_math_op('log', act.LogActivation()) +register_unary_math_op('abs', act.AbsActivation()) +register_unary_math_op('sigmoid', act.SigmoidActivation()) +register_unary_math_op('tanh', act.TanhActivation()) +register_unary_math_op('square', act.SquareActivation()) + +def add(layeroutput, other): + if is_compatible_with(other, float): + return slope_intercept_layer(input=layeroutput, intercept=other) + assert isinstance(other, LayerOutput) + return mixed_layer(input=[identity_projection(input=layeroutput), + identity_projection(input=other)]) + +LayerOutput.__radd__ = add +LayerOutput.__add__ = add + +def sub(layeroutput, other): + if is_compatible_with(other, float): + return slope_intercept_layer(input=layeroutput, intercept=other) + assert isinstance(other, LayerOutput) + neg = slope_intercept_layer(input=other, slope=-1.0) + return mixed_layer(input=[identity_projection(input=layeroutput), + identity_projection(input=neg)]) + +LayerOutput.__sub__ = sub + +def rsub(layeroutput, other): + neg = slope_intercept_layer(input=layeroutput, slope=-1.0) + return add(neg, other) + +LayerOutput.__rsub__ = rsub
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py index 65512b327cdc6f408bd36e67a09263ad81a4b85e..bce88f93626ecec04fa5970fabda69d26728f738 100644 --- a/python/paddle/trainer_config_helpers/networks.py +++ b/python/paddle/trainer_config_helpers/networks.py @@ -29,7 +29,7 @@ __all__ = ['sequence_conv_pool', 'simple_lstm', "simple_img_conv_pool", "img_conv_bn_pool", 'dropout_layer', 'lstmemory_group', 'lstmemory_unit', 'small_vgg', 'img_conv_group', 'vgg_16_network', 'gru_unit', 'gru_group', 'simple_gru', 'simple_attention', - 'text_conv_pool', + 'simple_gru2', 'bidirectional_gru', 'text_conv_pool', 'bidirectional_lstm', 'inputs', 'outputs'] @@ -811,22 +811,37 @@ def simple_gru(input, gru_layer_attr=None ): """ - simple_gru is also a recurrent layer group version Gated Recurrent Unit as - gru_group. The difference only lies in implemention details. + You may see gru_step_layer and grumemory in layers.py, and gru_unit, + gru_group and simple_gru in networks.py. The reason there are so many + interfaces is that we have two ways to implement recurrent neural networks. + One way is to use one complete layer to implement the rnn (simple rnn, gru + or lstm) over multiple time steps, such as recurrent_layer, lstmemory and + grumemory. However, the multiplication operation :math:`W x_t` is not + computed in these layers. + See details in their interfaces in layers.py. + The other implementation is to use a recurrent group, which can assemble a + series of layers to compute the rnn step by step. This approach is flexible + for attention mechanisms or other complex connections. + + - gru_step_layer: computes a single rnn step. It needs a memory as input + and can be used in a recurrent group. + - gru_unit: a wrapper of gru_step_layer with memory. + - gru_group: a GRU cell implemented by a combination of multiple layers in + a recurrent group. + However, :math:`W x_t` is not computed in the group. + - grumemory: a GRU cell implemented by one layer, which performs the same + calculation as gru_group and is faster than gru_group. + - simple_gru: a complete GRU implementation including :math:`W x_t` and + gru_group. :math:`W` contains :math:`W_r`, :math:`W_z` and :math:`W`; see + the formula in grumemory. + + In terms of computational speed, grumemory is faster than gru_group, + and gru_group is faster than simple_gru. - simple_gru does exactly the same calculation as the grumemory layer does. - Please see grumemory in layers.py for more detail about the maths. - The example usage is: .. code-block:: python - gru = gur_group(input=[layer1], - size=256, - act=TanhActivation(), - gate_act=SigmoidActivation()) + gru = simple_gru(input=[layer1], size=256) :param input: input layer name. :type input: LayerOutput @@ -863,6 +878,132 @@ def simple_gru(input, gru_layer_attr=gru_layer_attr) +@wrap_name_default('simple_gru2') +def simple_gru2(input, + size, + name=None, + reverse=False, + mixed_param_attr=None, + mixed_bias_attr=None, + gru_param_attr=None, + gru_bias_attr=None, + act=None, + gate_act=None, + mixed_layer_attr=None, + gru_cell_attr=None + ): + """ + simple_gru2 is the same as simple_gru, but uses grumemory instead. + Please see grumemory in layers.py for more detail about the math. + simple_gru2 is faster than simple_gru. + + The example usage is: + + .. code-block:: python + + gru = simple_gru2(input=[layer1], size=256) + + :param input: input layer name. + :type input: LayerOutput + :param name: name of the gru group. + :type name: basestring + :param size: hidden size of the gru. + :type size: int + :param reverse: whether to process the input data in reverse order + :type reverse: bool + :param act: type of the activation + :type act: BaseActivation + :param gate_act: type of the gate activation + :type gate_act: BaseActivation + :param gru_bias_attr: bias. False means no bias, None means default bias. + :type gru_bias_attr: ParameterAttribute|False + :param gru_cell_attr: Extra attribute of the gru layer. + :type gru_cell_attr: ExtraLayerAttribute|None + :return: the gru group. + :rtype: LayerOutput + """ + with mixed_layer(name='%s_transform' % name, + size=size * 3, + bias_attr=mixed_bias_attr, + layer_attr=mixed_layer_attr) as m: + m += full_matrix_projection(input=input, param_attr=mixed_param_attr) + + return grumemory(name=name, + size=size, + input=m, + reverse=reverse, + bias_attr=gru_bias_attr, + param_attr=gru_param_attr, + act=act, + gate_act=gate_act, + layer_attr=gru_cell_attr) + + +@wrap_name_default("bidirectional_gru") +def bidirectional_gru(input, size, name=None, return_seq=False, + fwd_mixed_param_attr=None, fwd_mixed_bias_attr=None, + fwd_gru_param_attr=None, fwd_gru_bias_attr=None, + fwd_act=None, fwd_gate_act=None, + fwd_mixed_layer_attr=None, fwd_gru_cell_attr=None, + + bwd_mixed_param_attr=None, bwd_mixed_bias_attr=None, + bwd_gru_param_attr=None, bwd_gru_bias_attr=None, + bwd_act=None, bwd_gate_act=None, + bwd_mixed_layer_attr=None, bwd_gru_cell_attr=None, + + last_seq_attr=None, first_seq_attr=None, + concat_attr=None, concat_act=None): + """ + A bidirectional_gru is a recurrent unit that iterates over the input + sequence in both forward and backward orders, and then concatenates the + two outputs to form the final output. However, concatenation is not the + only way to form the final output; you can also, for example, just add + them together. + + The example usage is: + + .. code-block:: python + + bi_gru = bidirectional_gru(input=[input1], size=512) + + :param name: bidirectional gru layer name. + :type name: basestring + :param input: input layer. + :type input: LayerOutput + :param size: gru layer size. + :type size: int + :param return_seq: If set False, outputs of the last time step are + concatenated and returned. + If set True, the entire output sequences that are + processed in forward and backward directions are + concatenated and returned. + :type return_seq: bool + :return: LayerOutput object. + :rtype: LayerOutput + """ + args = locals() + + fw = simple_gru2(name='%s_fw' % name, input=input, size=size, + **dict((k[len('fwd_'):], v) for k, v in args.iteritems() + if k.startswith('fwd_'))) + + bw = simple_gru2(name="%s_bw" % name, input=input, size=size, + reverse=True, + **dict((k[len('bwd_'):], v) for k, v in args.iteritems() + if k.startswith('bwd_'))) + + if return_seq: + return concat_layer(name=name, input=[fw, bw], layer_attr=concat_attr, + act=concat_act) + else: + fw_seq = last_seq(name="%s_fw_last" % name, input=fw, + layer_attr=last_seq_attr) + bw_seq = first_seq(name="%s_bw_last" % name, input=bw, + layer_attr=first_seq_attr) + return concat_layer(name=name, input=[fw_seq, bw_seq], + layer_attr=concat_attr, act=concat_act) + + @wrap_name_default("bidirectional_lstm") def bidirectional_lstm(input, size, name=None, return_seq=False, fwd_mat_param_attr=None, fwd_bias_param_attr=None, @@ -893,7 +1034,7 @@ def bidirectional_lstm(input, size, name=None, return_seq=False, .. code-block:: python - lstm_step = bidirectional_lstm(input=[input1], size=512) + bi_lstm = bidirectional_lstm(input=[input1], size=512) :param name: bidirectional lstm layer name. :type name: basestring @@ -907,7 +1048,7 @@ def bidirectional_lstm(input, size, name=None, return_seq=False, processed in forward and backward directions are concatenated and returned. :type return_seq: bool - :return: lstm layer name. + :return: LayerOutput object according to return_seq.
:rtype: LayerOutput """ args = locals() diff --git a/python/paddle/trainer_config_helpers/tests/configs/check.md5 b/python/paddle/trainer_config_helpers/tests/configs/check.md5 deleted file mode 100644 index 88ce5c129e552e12b89040855178db8864f7d559..0000000000000000000000000000000000000000 --- a/python/paddle/trainer_config_helpers/tests/configs/check.md5 +++ /dev/null @@ -1,21 +0,0 @@ -86c0815275a9d5eb902e23c6a592f58a img_layers.protostr -a5d9259ff1fd7ca23d0ef090052cb1f2 last_first_seq.protostr -9c038249ec8ff719753a746cdb04c026 layer_activations.protostr -5913f87b39cee3b2701fa158270aca26 projections.protostr -7334ba0a4544f0623231330fc51d390d shared_fc.protostr -8b8b6bb128a7dfcc937be86145f53e2f shared_lstm.protostr -6b39e34beea8dfb782bee9bd3dea9eb5 simple_rnn_layers.protostr -0fc1409600f1a3301da994ab9d28b0bf test_cost_layers.protostr -6cd5f28a3416344f20120698470e0a4c test_cost_layers_with_weight.protostr -144bc6d3a509de74115fa623741797ed test_expand_layer.protostr -2378518bdb71e8c6e888b1842923df58 test_fc.protostr -8bb44e1e5072d0c261572307e7672bda test_grumemory_layer.protostr -1f3510672dce7a9ed25317fc58579ac7 test_hsigmoid.protostr -d350bd91a0dc13e854b1364c3d9339c6 test_lstmemory_layer.protostr -6fa59551808ee7012bbd24f757e782d2 test_maxout.protostr -251a948ba41c1071afcd3d9cf9c233f7 test_ntm_layers.protostr -e6ff04e70aea27c7b06d808cc49c9497 test_print_layer.protostr -2a75dd33b640c49a8821c2da6e574577 test_rnn_group.protostr -67d6fde3afb54f389d0ce4ff14726fe1 test_sequence_pooling.protostr -f586a548ef4350ba1ed47a81859a64cb unused_layers.protostr -8122477f4f65244580cec09edc590041 util_layers.protostr diff --git a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh index 15c66a9754604c2fb3a58762608b9edf4bbb9b0b..0955fab297261578282d8e362f2e22b1267f1ee5 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh @@ -4,15 +4,18 @@ set -e cd `dirname $0` export PYTHONPATH=$PWD/../../../../ +protostr=$PWD/protostr + configs=(test_fc layer_activations projections test_print_layer test_sequence_pooling test_lstmemory_layer test_grumemory_layer last_first_seq test_expand_layer test_ntm_layers test_hsigmoid img_layers util_layers simple_rnn_layers unused_layers test_cost_layers test_rnn_group shared_fc shared_lstm test_cost_layers_with_weight -test_bilinear_interp test_maxout) +test_bilinear_interp test_maxout test_bi_grumemory math_ops) + for conf in ${configs[*]} do echo "Generating " $conf - python -m paddle.utils.dump_config $conf.py > $conf.protostr + python -m paddle.utils.dump_config $conf.py > $protostr/$conf.protostr.unitest done diff --git a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py new file mode 100644 index 0000000000000000000000000000000000000000..fe515b7029336d093df5428ab8ac1c65a2d4e98a --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py @@ -0,0 +1,24 @@ +from paddle.trainer_config_helpers import * +from paddle.trainer_config_helpers import math + +settings( + batch_size=1000, + learning_rate=1e-5 +) + +x = data_layer(name='data', size=100) +x = math.exp(x) +x = math.log(x) +x = math.abs(x) +x = math.sigmoid(x) +x = math.square(x) +x = math.square(x) +y = 1 + x +y = y + 1 +y = x + y +y = y - x +y = y - 2 +y = 2 - y + +outputs(y) + diff --git 
a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr new file mode 100644 index 0000000000000000000000000000000000000000..1f262af21126c17eb133b92c84a1ae3bb280a1d6 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_layers.protostr @@ -0,0 +1,176 @@ +type: "nn" +layers { + name: "image" + type: "data" + size: 65536 + active_type: "" +} +layers { + name: "__conv_0__" + type: "exconv" + size: 3297856 + active_type: "" + inputs { + input_layer_name: "image" + input_parameter_name: "___conv_0__.w0" + conv_conf { + filter_size: 32 + channels: 1 + stride: 1 + padding: 1 + groups: 1 + filter_channels: 1 + output_x: 227 + img_size: 256 + caffe_mode: true + filter_size_y: 32 + padding_y: 1 + stride_y: 1 + } + } + bias_parameter_name: "___conv_0__.wbias" + num_filters: 64 + shared_biases: true +} +layers { + name: "__batch_norm_0__" + type: "batch_norm" + size: 3297856 + active_type: "relu" + inputs { + input_layer_name: "__conv_0__" + input_parameter_name: "___batch_norm_0__.w0" + image_conf { + channels: 64 + img_size: 227 + } + } + inputs { + input_layer_name: "__conv_0__" + input_parameter_name: "___batch_norm_0__.w1" + } + inputs { + input_layer_name: "__conv_0__" + input_parameter_name: "___batch_norm_0__.w2" + } + bias_parameter_name: "___batch_norm_0__.wbias" + moving_average_fraction: 0.9 +} +layers { + name: "__crmnorm_0__" + type: "norm" + size: 3297856 + active_type: "" + inputs { + input_layer_name: "__batch_norm_0__" + norm_conf { + norm_type: "cmrnorm-projection" + channels: 64 + size: 32 + scale: 0.0004 + pow: 0.75 + output_x: 227 + img_size: 227 + blocked: false + } + } +} +layers { + name: "__pool_0__" + type: "pool" + size: 2458624 + active_type: "" + inputs { + input_layer_name: "__conv_0__" + pool_conf { + pool_type: "max-projection" + channels: 64 + size_x: 32 + stride: 1 + output_x: 196 + img_size: 227 + padding: 0 + size_y: 32 + stride_y: 1 + output_y: 196 + img_size_y: 227 + padding_y: 0 + } + } +} +parameters { + name: "___conv_0__.w0" + size: 65536 + initial_mean: 0.0 + initial_std: 0.0441941738242 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___conv_0__.wbias" + size: 64 + initial_mean: 0.0 + initial_std: 0.0 + dims: 64 + dims: 1 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___batch_norm_0__.w0" + size: 64 + initial_mean: 1.0 + initial_std: 0.0 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___batch_norm_0__.w1" + size: 64 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 64 + initial_strategy: 0 + initial_smart: false + is_static: true + is_shared: true +} +parameters { + name: "___batch_norm_0__.w2" + size: 64 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 64 + initial_strategy: 0 + initial_smart: false + is_static: true + is_shared: true +} +parameters { + name: "___batch_norm_0__.wbias" + size: 64 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 64 + initial_strategy: 0 + initial_smart: false +} +input_layer_names: "image" +output_layer_names: "__pool_0__" +output_layer_names: "__crmnorm_0__" +sub_models { + name: "root" + layer_names: "image" + layer_names: "__conv_0__" + layer_names: "__batch_norm_0__" + layer_names: "__crmnorm_0__" + layer_names: "__pool_0__" + input_layer_names: "image" + output_layer_names: "__pool_0__" + output_layer_names: "__crmnorm_0__" + is_recurrent_layer_group: false +} + diff --git 
a/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr new file mode 100644 index 0000000000000000000000000000000000000000..7b2911f8e367ebf9d0797e815a7532c714ef698e --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/last_first_seq.protostr @@ -0,0 +1,69 @@ +type: "nn" +layers { + name: "data" + type: "data" + size: 30 + active_type: "" +} +layers { + name: "__first_seq_0__" + type: "seqlastins" + size: 30 + active_type: "linear" + inputs { + input_layer_name: "data" + } + select_first: true + trans_type: "seq" +} +layers { + name: "__first_seq_1__" + type: "seqlastins" + size: 30 + active_type: "linear" + inputs { + input_layer_name: "data" + } + select_first: true + trans_type: "non-seq" +} +layers { + name: "__last_seq_0__" + type: "seqlastins" + size: 30 + active_type: "linear" + inputs { + input_layer_name: "data" + } + trans_type: "seq" +} +layers { + name: "__last_seq_1__" + type: "seqlastins" + size: 30 + active_type: "linear" + inputs { + input_layer_name: "data" + } + trans_type: "non-seq" +} +input_layer_names: "data" +output_layer_names: "__first_seq_0__" +output_layer_names: "__first_seq_1__" +output_layer_names: "__last_seq_0__" +output_layer_names: "__last_seq_1__" +sub_models { + name: "root" + layer_names: "data" + layer_names: "__first_seq_0__" + layer_names: "__first_seq_1__" + layer_names: "__last_seq_0__" + layer_names: "__last_seq_1__" + input_layer_names: "data" + output_layer_names: "__first_seq_0__" + output_layer_names: "__first_seq_1__" + output_layer_names: "__last_seq_0__" + output_layer_names: "__last_seq_1__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/layer_activations.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/layer_activations.protostr new file mode 100644 index 0000000000000000000000000000000000000000..ecf39e4d32167d4e838c43929cc4e7a87ff421a8 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/layer_activations.protostr @@ -0,0 +1,423 @@ +type: "nn" +layers { + name: "input" + type: "data" + size: 100 + active_type: "" +} +layers { + name: "layer_0" + type: "fc" + size: 100 + active_type: "tanh" + inputs { + input_layer_name: "input" + input_parameter_name: "_layer_0.w0" + } + bias_parameter_name: "_layer_0.wbias" +} +layers { + name: "layer_1" + type: "fc" + size: 100 + active_type: "sigmoid" + inputs { + input_layer_name: "input" + input_parameter_name: "_layer_1.w0" + } + bias_parameter_name: "_layer_1.wbias" +} +layers { + name: "layer_2" + type: "fc" + size: 100 + active_type: "softmax" + inputs { + input_layer_name: "input" + input_parameter_name: "_layer_2.w0" + } + bias_parameter_name: "_layer_2.wbias" +} +layers { + name: "layer_3" + type: "fc" + size: 100 + active_type: "" + inputs { + input_layer_name: "input" + input_parameter_name: "_layer_3.w0" + } + bias_parameter_name: "_layer_3.wbias" +} +layers { + name: "layer_4" + type: "fc" + size: 100 + active_type: "" + inputs { + input_layer_name: "input" + input_parameter_name: "_layer_4.w0" + } + bias_parameter_name: "_layer_4.wbias" +} +layers { + name: "layer_5" + type: "fc" + size: 100 + active_type: "exponential" + inputs { + input_layer_name: "input" + input_parameter_name: "_layer_5.w0" + } + bias_parameter_name: "_layer_5.wbias" +} +layers { + name: "layer_6" + type: "fc" + size: 100 + active_type: "relu" + inputs { + 
input_layer_name: "input" + input_parameter_name: "_layer_6.w0" + } + bias_parameter_name: "_layer_6.wbias" +} +layers { + name: "layer_7" + type: "fc" + size: 100 + active_type: "brelu" + inputs { + input_layer_name: "input" + input_parameter_name: "_layer_7.w0" + } + bias_parameter_name: "_layer_7.wbias" +} +layers { + name: "layer_8" + type: "fc" + size: 100 + active_type: "softrelu" + inputs { + input_layer_name: "input" + input_parameter_name: "_layer_8.w0" + } + bias_parameter_name: "_layer_8.wbias" +} +layers { + name: "layer_9" + type: "fc" + size: 100 + active_type: "stanh" + inputs { + input_layer_name: "input" + input_parameter_name: "_layer_9.w0" + } + bias_parameter_name: "_layer_9.wbias" +} +layers { + name: "layer_10" + type: "fc" + size: 100 + active_type: "abs" + inputs { + input_layer_name: "input" + input_parameter_name: "_layer_10.w0" + } + bias_parameter_name: "_layer_10.wbias" +} +layers { + name: "layer_11" + type: "fc" + size: 100 + active_type: "square" + inputs { + input_layer_name: "input" + input_parameter_name: "_layer_11.w0" + } + bias_parameter_name: "_layer_11.wbias" +} +parameters { + name: "_layer_0.w0" + size: 10000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 100 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "_layer_0.wbias" + size: 100 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 100 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "_layer_1.w0" + size: 10000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 100 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "_layer_1.wbias" + size: 100 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 100 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "_layer_2.w0" + size: 10000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 100 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "_layer_2.wbias" + size: 100 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 100 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "_layer_3.w0" + size: 10000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 100 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "_layer_3.wbias" + size: 100 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 100 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "_layer_4.w0" + size: 10000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 100 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "_layer_4.wbias" + size: 100 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 100 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "_layer_5.w0" + size: 10000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 100 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "_layer_5.wbias" + size: 100 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 100 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "_layer_6.w0" + size: 10000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 100 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "_layer_6.wbias" + size: 100 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 100 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "_layer_7.w0" + size: 10000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 100 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: 
"_layer_7.wbias" + size: 100 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 100 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "_layer_8.w0" + size: 10000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 100 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "_layer_8.wbias" + size: 100 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 100 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "_layer_9.w0" + size: 10000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 100 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "_layer_9.wbias" + size: 100 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 100 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "_layer_10.w0" + size: 10000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 100 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "_layer_10.wbias" + size: 100 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 100 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "_layer_11.w0" + size: 10000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 100 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "_layer_11.wbias" + size: 100 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 100 + initial_strategy: 0 + initial_smart: false +} +input_layer_names: "input" +output_layer_names: "layer_0" +output_layer_names: "layer_1" +output_layer_names: "layer_2" +output_layer_names: "layer_3" +output_layer_names: "layer_4" +output_layer_names: "layer_5" +output_layer_names: "layer_6" +output_layer_names: "layer_7" +output_layer_names: "layer_8" +output_layer_names: "layer_9" +output_layer_names: "layer_10" +output_layer_names: "layer_11" +sub_models { + name: "root" + layer_names: "input" + layer_names: "layer_0" + layer_names: "layer_1" + layer_names: "layer_2" + layer_names: "layer_3" + layer_names: "layer_4" + layer_names: "layer_5" + layer_names: "layer_6" + layer_names: "layer_7" + layer_names: "layer_8" + layer_names: "layer_9" + layer_names: "layer_10" + layer_names: "layer_11" + input_layer_names: "input" + output_layer_names: "layer_0" + output_layer_names: "layer_1" + output_layer_names: "layer_2" + output_layer_names: "layer_3" + output_layer_names: "layer_4" + output_layer_names: "layer_5" + output_layer_names: "layer_6" + output_layer_names: "layer_7" + output_layer_names: "layer_8" + output_layer_names: "layer_9" + output_layer_names: "layer_10" + output_layer_names: "layer_11" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr new file mode 100644 index 0000000000000000000000000000000000000000..1767445c44bf5c0ea7c1149ad9fef2dd92508c54 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr @@ -0,0 +1,235 @@ +type: "nn" +layers { + name: "data" + type: "data" + size: 100 + active_type: "" +} +layers { + name: "__exp_0__" + type: "mixed" + size: 100 + active_type: "exponential" + inputs { + input_layer_name: "data" + proj_conf { + type: "identity" + name: "___exp_0__.w0" + input_size: 100 + output_size: 100 + } + } +} +layers { + name: "__log_0__" + type: "mixed" + size: 100 + active_type: "log" + inputs { + input_layer_name: "__exp_0__" + proj_conf { + type: "identity" + name: "___log_0__.w0" + input_size: 100 + 
output_size: 100 + } + } +} +layers { + name: "__abs_0__" + type: "mixed" + size: 100 + active_type: "abs" + inputs { + input_layer_name: "__log_0__" + proj_conf { + type: "identity" + name: "___abs_0__.w0" + input_size: 100 + output_size: 100 + } + } +} +layers { + name: "__sigmoid_0__" + type: "mixed" + size: 100 + active_type: "sigmoid" + inputs { + input_layer_name: "__abs_0__" + proj_conf { + type: "identity" + name: "___sigmoid_0__.w0" + input_size: 100 + output_size: 100 + } + } +} +layers { + name: "__square_0__" + type: "mixed" + size: 100 + active_type: "square" + inputs { + input_layer_name: "__sigmoid_0__" + proj_conf { + type: "identity" + name: "___square_0__.w0" + input_size: 100 + output_size: 100 + } + } +} +layers { + name: "__square_1__" + type: "mixed" + size: 100 + active_type: "square" + inputs { + input_layer_name: "__square_0__" + proj_conf { + type: "identity" + name: "___square_1__.w0" + input_size: 100 + output_size: 100 + } + } +} +layers { + name: "__slope_intercept_layer_0__" + type: "slope_intercept" + size: 100 + active_type: "" + inputs { + input_layer_name: "__square_1__" + } + slope: 1.0 + intercept: 1 +} +layers { + name: "__slope_intercept_layer_1__" + type: "slope_intercept" + size: 100 + active_type: "" + inputs { + input_layer_name: "__slope_intercept_layer_0__" + } + slope: 1.0 + intercept: 1 +} +layers { + name: "__mixed_0__" + type: "mixed" + size: 100 + active_type: "" + inputs { + input_layer_name: "__square_1__" + proj_conf { + type: "identity" + name: "___mixed_0__.w0" + input_size: 100 + output_size: 100 + } + } + inputs { + input_layer_name: "__slope_intercept_layer_1__" + proj_conf { + type: "identity" + name: "___mixed_0__.w1" + input_size: 100 + output_size: 100 + } + } +} +layers { + name: "__slope_intercept_layer_2__" + type: "slope_intercept" + size: 100 + active_type: "" + inputs { + input_layer_name: "__square_1__" + } + slope: -1.0 + intercept: 0.0 +} +layers { + name: "__mixed_1__" + type: "mixed" + size: 100 + active_type: "" + inputs { + input_layer_name: "__mixed_0__" + proj_conf { + type: "identity" + name: "___mixed_1__.w0" + input_size: 100 + output_size: 100 + } + } + inputs { + input_layer_name: "__slope_intercept_layer_2__" + proj_conf { + type: "identity" + name: "___mixed_1__.w1" + input_size: 100 + output_size: 100 + } + } +} +layers { + name: "__slope_intercept_layer_3__" + type: "slope_intercept" + size: 100 + active_type: "" + inputs { + input_layer_name: "__mixed_1__" + } + slope: 1.0 + intercept: 2 +} +layers { + name: "__slope_intercept_layer_4__" + type: "slope_intercept" + size: 100 + active_type: "" + inputs { + input_layer_name: "__slope_intercept_layer_3__" + } + slope: -1.0 + intercept: 0.0 +} +layers { + name: "__slope_intercept_layer_5__" + type: "slope_intercept" + size: 100 + active_type: "" + inputs { + input_layer_name: "__slope_intercept_layer_4__" + } + slope: 1.0 + intercept: 2 +} +input_layer_names: "data" +output_layer_names: "__slope_intercept_layer_5__" +sub_models { + name: "root" + layer_names: "data" + layer_names: "__exp_0__" + layer_names: "__log_0__" + layer_names: "__abs_0__" + layer_names: "__sigmoid_0__" + layer_names: "__square_0__" + layer_names: "__square_1__" + layer_names: "__slope_intercept_layer_0__" + layer_names: "__slope_intercept_layer_1__" + layer_names: "__mixed_0__" + layer_names: "__slope_intercept_layer_2__" + layer_names: "__mixed_1__" + layer_names: "__slope_intercept_layer_3__" + layer_names: "__slope_intercept_layer_4__" + layer_names: "__slope_intercept_layer_5__" + 
input_layer_names: "data" + output_layer_names: "__slope_intercept_layer_5__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr new file mode 100644 index 0000000000000000000000000000000000000000..e47e531a2223ddaa9dd1dfaf1fcee8a11008cbbd --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr @@ -0,0 +1,315 @@ +type: "nn" +layers { + name: "test" + type: "data" + size: 100 + active_type: "" +} +layers { + name: "__embedding_0__" + type: "mixed" + size: 256 + active_type: "" + inputs { + input_layer_name: "test" + input_parameter_name: "___embedding_0__.w0" + proj_conf { + type: "table" + name: "___embedding_0__.w0" + input_size: 100 + output_size: 256 + } + } +} +layers { + name: "__mixed_0__" + type: "mixed" + size: 100 + active_type: "" + inputs { + input_layer_name: "__embedding_0__" + input_parameter_name: "___mixed_0__.w0" + proj_conf { + type: "fc" + name: "___mixed_0__.w0" + input_size: 256 + output_size: 100 + } + } +} +layers { + name: "__mixed_1__" + type: "mixed" + size: 100 + active_type: "" + inputs { + input_layer_name: "__mixed_0__" + input_parameter_name: "___mixed_1__.w0" + proj_conf { + type: "table" + name: "___mixed_1__.w0" + input_size: 100 + output_size: 100 + } + } +} +layers { + name: "__mixed_2__" + type: "mixed" + size: 100 + active_type: "" + inputs { + input_layer_name: "__mixed_1__" + proj_conf { + type: "identity" + name: "___mixed_2__.w0" + input_size: 100 + output_size: 100 + } + } +} +layers { + name: "__mixed_3__" + type: "mixed" + size: 100 + active_type: "" + inputs { + input_layer_name: "__mixed_2__" + input_parameter_name: "___mixed_3__.w0" + proj_conf { + type: "dot_mul" + name: "___mixed_3__.w0" + input_size: 100 + output_size: 100 + } + } +} +layers { + name: "__mixed_4__" + type: "mixed" + size: 300 + active_type: "" + inputs { + input_layer_name: "__mixed_3__" + input_parameter_name: "___mixed_4__.w0" + proj_conf { + type: "context" + name: "___mixed_4__.w0" + input_size: 100 + output_size: 300 + context_start: -1 + context_length: 3 + trainable_padding: true + } + } +} +layers { + name: "__mixed_5__" + type: "mixed" + size: 100 + active_type: "" + inputs { + input_layer_name: "__mixed_2__" + } + inputs { + input_layer_name: "__mixed_3__" + } + operator_confs { + type: "dot_mul" + input_indices: 0 + input_indices: 1 + input_sizes: 100 + input_sizes: 100 + output_size: 100 + dotmul_scale: 1 + } +} +layers { + name: "img" + type: "data" + size: 1024 + active_type: "" +} +layers { + name: "filter" + type: "data" + size: 576 + active_type: "" +} +layers { + name: "__mixed_6__" + type: "mixed" + size: 57600 + active_type: "" + inputs { + input_layer_name: "img" + } + inputs { + input_layer_name: "filter" + } + operator_confs { + type: "conv" + input_indices: 0 + input_indices: 1 + input_sizes: 1024 + input_sizes: 576 + output_size: 57600 + conv_conf { + filter_size: 3 + channels: 1 + stride: 1 + padding: 0 + groups: 1 + filter_channels: 1 + output_x: 30 + img_size: 32 + caffe_mode: true + filter_size_y: 3 + padding_y: 0 + stride_y: 1 + } + num_filters: 64 + } +} +layers { + name: "__mixed_7__" + type: "mixed" + size: 100 + active_type: "" + inputs { + input_layer_name: "__mixed_4__" + input_parameter_name: "___mixed_7__.w0" + proj_conf { + type: "fc" + name: "___mixed_7__.w0" + input_size: 300 + output_size: 100 + } + } + inputs { + 
input_layer_name: "__mixed_5__" + input_parameter_name: "___mixed_7__.w1" + proj_conf { + type: "trans_fc" + name: "___mixed_7__.w1" + input_size: 100 + output_size: 100 + } + } + inputs { + input_layer_name: "__mixed_6__" + input_parameter_name: "___mixed_7__.w2" + proj_conf { + type: "fc" + name: "___mixed_7__.w2" + input_size: 57600 + output_size: 100 + } + } + drop_rate: 0.5 +} +parameters { + name: "___embedding_0__.w0" + size: 25600 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 256 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___mixed_0__.w0" + size: 25600 + initial_mean: 0.0 + initial_std: 0.0625 + dims: 256 + dims: 100 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___mixed_1__.w0" + size: 10000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 100 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___mixed_3__.w0" + size: 100 + initial_mean: 0.0 + initial_std: 1.0 + dims: 1 + dims: 100 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___mixed_4__.w0" + size: 200 + initial_mean: 0.0 + initial_std: 0.0 + dims: 2 + dims: 100 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___mixed_7__.w0" + size: 30000 + initial_mean: 0.0 + initial_std: 0.057735026919 + dims: 300 + dims: 100 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___mixed_7__.w1" + size: 10000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 100 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___mixed_7__.w2" + size: 5760000 + initial_mean: 0.0 + initial_std: 0.00416666666667 + dims: 57600 + dims: 100 + initial_strategy: 0 + initial_smart: true +} +input_layer_names: "test" +input_layer_names: "img" +input_layer_names: "filter" +output_layer_names: "__mixed_7__" +sub_models { + name: "root" + layer_names: "test" + layer_names: "__embedding_0__" + layer_names: "__mixed_0__" + layer_names: "__mixed_1__" + layer_names: "__mixed_2__" + layer_names: "__mixed_3__" + layer_names: "__mixed_4__" + layer_names: "__mixed_5__" + layer_names: "img" + layer_names: "filter" + layer_names: "__mixed_6__" + layer_names: "__mixed_7__" + input_layer_names: "test" + input_layer_names: "img" + input_layer_names: "filter" + output_layer_names: "__mixed_7__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_fc.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_fc.protostr new file mode 100644 index 0000000000000000000000000000000000000000..3e8633b0798318bfc50988dbd329256629d5176c --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_fc.protostr @@ -0,0 +1,125 @@ +type: "nn" +layers { + name: "feature_a" + type: "data" + size: 200 + active_type: "" +} +layers { + name: "feature_b" + type: "data" + size: 200 + active_type: "" +} +layers { + name: "__fc_layer_0__" + type: "fc" + size: 200 + active_type: "tanh" + inputs { + input_layer_name: "feature_a" + input_parameter_name: "fc_param" + } + bias_parameter_name: "bias_param" +} +layers { + name: "__fc_layer_1__" + type: "fc" + size: 200 + active_type: "tanh" + inputs { + input_layer_name: "feature_b" + input_parameter_name: "fc_param" + } + bias_parameter_name: "bias_param" +} +layers { + name: "__fc_layer_2__" + type: "fc" + size: 10 + active_type: "softmax" + inputs { + input_layer_name: "__fc_layer_0__" + input_parameter_name: "softmax_param" + } + inputs { + input_layer_name: 
"__fc_layer_1__" + input_parameter_name: "softmax_param" + } +} +layers { + name: "label" + type: "data" + size: 10 + active_type: "" +} +layers { + name: "__cost_0__" + type: "multi-class-cross-entropy" + size: 1 + active_type: "" + inputs { + input_layer_name: "__fc_layer_2__" + } + inputs { + input_layer_name: "label" + } + coeff: 1.0 +} +parameters { + name: "fc_param" + size: 40000 + initial_mean: 0.0 + initial_std: 1.0 + dims: 200 + dims: 200 + initial_strategy: 1 + initial_smart: false +} +parameters { + name: "bias_param" + size: 200 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 200 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "softmax_param" + size: 2000 + initial_mean: 0.0 + initial_std: 1.0 + dims: 200 + dims: 10 + initial_strategy: 1 + initial_smart: false +} +input_layer_names: "feature_a" +input_layer_names: "feature_b" +input_layer_names: "label" +output_layer_names: "__cost_0__" +evaluators { + name: "classification_error_evaluator" + type: "classification_error" + input_layers: "__fc_layer_2__" + input_layers: "label" +} +sub_models { + name: "root" + layer_names: "feature_a" + layer_names: "feature_b" + layer_names: "__fc_layer_0__" + layer_names: "__fc_layer_1__" + layer_names: "__fc_layer_2__" + layer_names: "label" + layer_names: "__cost_0__" + input_layer_names: "feature_a" + input_layer_names: "feature_b" + input_layer_names: "label" + output_layer_names: "__cost_0__" + evaluator_names: "classification_error_evaluator" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr new file mode 100644 index 0000000000000000000000000000000000000000..0a83499b724806666a241489467207f3c7151a3a --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr @@ -0,0 +1,393 @@ +type: "recurrent_nn" +layers { + name: "data_a" + type: "data" + size: 100 + active_type: "" +} +layers { + name: "data_b" + type: "data" + size: 100 + active_type: "" +} +layers { + name: "__mixed_0__" + type: "mixed" + size: 400 + active_type: "" + inputs { + input_layer_name: "data_a" + input_parameter_name: "mixed_param" + proj_conf { + type: "fc" + name: "___mixed_0__.w0" + input_size: 100 + output_size: 400 + } + } +} +layers { + name: "__mixed_1__" + type: "mixed" + size: 400 + active_type: "" + inputs { + input_layer_name: "data_b" + input_parameter_name: "mixed_param" + proj_conf { + type: "fc" + name: "___mixed_1__.w0" + input_size: 100 + output_size: 400 + } + } +} +layers { + name: "__lstm_group_0___recurrent_group" + type: "recurrent_layer_group" + active_type: "" +} +layers { + name: "__mixed_0__@__lstm_group_0___recurrent_group" + type: "scatter_agent" + size: 400 + active_type: "" +} +layers { + name: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group" + type: "agent" + size: 100 + active_type: "" +} +layers { + name: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group" + type: "agent" + size: 100 + active_type: "" +} +layers { + name: "__lstm_group_0___input_recurrent@__lstm_group_0___recurrent_group" + type: "mixed" + size: 400 + active_type: "" + inputs { + input_layer_name: "__mixed_0__@__lstm_group_0___recurrent_group" + proj_conf { + type: "identity" + name: "___lstm_group_0___input_recurrent.w0" + input_size: 400 + output_size: 400 + } + } + inputs { + input_layer_name: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group" + 
input_parameter_name: "lstm_param" + proj_conf { + type: "fc" + name: "___lstm_group_0___input_recurrent.w1" + input_size: 100 + output_size: 400 + } + } +} +layers { + name: "__lstm_group_0__@__lstm_group_0___recurrent_group" + type: "lstm_step" + size: 100 + active_type: "tanh" + inputs { + input_layer_name: "__lstm_group_0___input_recurrent@__lstm_group_0___recurrent_group" + } + inputs { + input_layer_name: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group" + } + bias_parameter_name: "lstm_bias" + active_gate_type: "sigmoid" + active_state_type: "sigmoid" +} +layers { + name: "__lstm_group_0___state@__lstm_group_0___recurrent_group" + type: "get_output" + size: 100 + active_type: "" + inputs { + input_layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group" + input_layer_argument: "state" + } +} +layers { + name: "__lstm_group_0__" + type: "gather_agent" + size: 100 + active_type: "" +} +layers { + name: "__lstm_group_1___recurrent_group" + type: "recurrent_layer_group" + active_type: "" +} +layers { + name: "__mixed_1__@__lstm_group_1___recurrent_group" + type: "scatter_agent" + size: 400 + active_type: "" +} +layers { + name: "__lstm_group_1__+delay1@__lstm_group_1___recurrent_group" + type: "agent" + size: 100 + active_type: "" +} +layers { + name: "__lstm_group_1___state+delay1@__lstm_group_1___recurrent_group" + type: "agent" + size: 100 + active_type: "" +} +layers { + name: "__lstm_group_1___input_recurrent@__lstm_group_1___recurrent_group" + type: "mixed" + size: 400 + active_type: "" + inputs { + input_layer_name: "__mixed_1__@__lstm_group_1___recurrent_group" + proj_conf { + type: "identity" + name: "___lstm_group_1___input_recurrent.w0" + input_size: 400 + output_size: 400 + } + } + inputs { + input_layer_name: "__lstm_group_1__+delay1@__lstm_group_1___recurrent_group" + input_parameter_name: "lstm_param" + proj_conf { + type: "fc" + name: "___lstm_group_1___input_recurrent.w1" + input_size: 100 + output_size: 400 + } + } +} +layers { + name: "__lstm_group_1__@__lstm_group_1___recurrent_group" + type: "lstm_step" + size: 100 + active_type: "tanh" + inputs { + input_layer_name: "__lstm_group_1___input_recurrent@__lstm_group_1___recurrent_group" + } + inputs { + input_layer_name: "__lstm_group_1___state+delay1@__lstm_group_1___recurrent_group" + } + bias_parameter_name: "lstm_bias" + active_gate_type: "sigmoid" + active_state_type: "sigmoid" +} +layers { + name: "__lstm_group_1___state@__lstm_group_1___recurrent_group" + type: "get_output" + size: 100 + active_type: "" + inputs { + input_layer_name: "__lstm_group_1__@__lstm_group_1___recurrent_group" + input_layer_argument: "state" + } +} +layers { + name: "__lstm_group_1__" + type: "gather_agent" + size: 100 + active_type: "" +} +layers { + name: "__last_seq_0__" + type: "seqlastins" + size: 100 + active_type: "linear" + inputs { + input_layer_name: "__lstm_group_0__" + } + trans_type: "non-seq" +} +layers { + name: "__last_seq_1__" + type: "seqlastins" + size: 100 + active_type: "linear" + inputs { + input_layer_name: "__lstm_group_1__" + } + trans_type: "non-seq" +} +layers { + name: "__fc_layer_0__" + type: "fc" + size: 10 + active_type: "softmax" + inputs { + input_layer_name: "__last_seq_0__" + input_parameter_name: "softmax_param" + } + inputs { + input_layer_name: "__last_seq_1__" + input_parameter_name: "softmax_param" + } +} +layers { + name: "label" + type: "data" + size: 10 + active_type: "" +} +layers { + name: "__cost_0__" + type: "multi-class-cross-entropy" + size: 1 + active_type: "" + 
inputs { + input_layer_name: "__fc_layer_0__" + } + inputs { + input_layer_name: "label" + } + coeff: 1.0 +} +parameters { + name: "mixed_param" + size: 40000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 400 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "lstm_param" + size: 40000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 400 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "lstm_bias" + size: 300 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 300 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "softmax_param" + size: 1000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 10 + initial_strategy: 0 + initial_smart: true +} +input_layer_names: "data_a" +input_layer_names: "data_b" +input_layer_names: "label" +output_layer_names: "__cost_0__" +evaluators { + name: "classification_error_evaluator" + type: "classification_error" + input_layers: "__fc_layer_0__" + input_layers: "label" +} +sub_models { + name: "root" + layer_names: "data_a" + layer_names: "data_b" + layer_names: "__mixed_0__" + layer_names: "__mixed_1__" + layer_names: "__lstm_group_0___recurrent_group" + layer_names: "__lstm_group_0__" + layer_names: "__lstm_group_1___recurrent_group" + layer_names: "__lstm_group_1__" + layer_names: "__last_seq_0__" + layer_names: "__last_seq_1__" + layer_names: "__fc_layer_0__" + layer_names: "label" + layer_names: "__cost_0__" + input_layer_names: "data_a" + input_layer_names: "data_b" + input_layer_names: "label" + output_layer_names: "__cost_0__" + evaluator_names: "classification_error_evaluator" + is_recurrent_layer_group: false +} +sub_models { + name: "__lstm_group_0___recurrent_group" + layer_names: "__mixed_0__@__lstm_group_0___recurrent_group" + layer_names: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group" + layer_names: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group" + layer_names: "__lstm_group_0___input_recurrent@__lstm_group_0___recurrent_group" + layer_names: "__lstm_group_0__@__lstm_group_0___recurrent_group" + layer_names: "__lstm_group_0___state@__lstm_group_0___recurrent_group" + is_recurrent_layer_group: true + reversed: false + memories { + layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group" + link_name: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group" + is_sequence: false + } + memories { + layer_name: "__lstm_group_0___state@__lstm_group_0___recurrent_group" + link_name: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group" + is_sequence: false + } + in_links { + layer_name: "__mixed_0__" + link_name: "__mixed_0__@__lstm_group_0___recurrent_group" + has_subseq: false + } + out_links { + layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group" + link_name: "__lstm_group_0__" + has_subseq: false + } + target_inlinkid: -1 +} +sub_models { + name: "__lstm_group_1___recurrent_group" + layer_names: "__mixed_1__@__lstm_group_1___recurrent_group" + layer_names: "__lstm_group_1__+delay1@__lstm_group_1___recurrent_group" + layer_names: "__lstm_group_1___state+delay1@__lstm_group_1___recurrent_group" + layer_names: "__lstm_group_1___input_recurrent@__lstm_group_1___recurrent_group" + layer_names: "__lstm_group_1__@__lstm_group_1___recurrent_group" + layer_names: "__lstm_group_1___state@__lstm_group_1___recurrent_group" + is_recurrent_layer_group: true + reversed: false + memories { + layer_name: "__lstm_group_1__@__lstm_group_1___recurrent_group" + link_name: 
"__lstm_group_1__+delay1@__lstm_group_1___recurrent_group" + is_sequence: false + } + memories { + layer_name: "__lstm_group_1___state@__lstm_group_1___recurrent_group" + link_name: "__lstm_group_1___state+delay1@__lstm_group_1___recurrent_group" + is_sequence: false + } + in_links { + layer_name: "__mixed_1__" + link_name: "__mixed_1__@__lstm_group_1___recurrent_group" + has_subseq: false + } + out_links { + layer_name: "__lstm_group_1__@__lstm_group_1___recurrent_group" + link_name: "__lstm_group_1__" + has_subseq: false + } + target_inlinkid: -1 +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr new file mode 100644 index 0000000000000000000000000000000000000000..dacb40185f863025528c2d4eeb8b325425953a93 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/simple_rnn_layers.protostr @@ -0,0 +1,418 @@ +type: "nn" +layers { + name: "data" + type: "data" + size: 200 + active_type: "" +} +layers { + name: "__fc_layer_0__" + type: "fc" + size: 200 + active_type: "sigmoid" + inputs { + input_layer_name: "data" + input_parameter_name: "___fc_layer_0__.w0" + } + bias_parameter_name: "___fc_layer_0__.wbias" +} +layers { + name: "__recurrent_layer_0__" + type: "recurrent" + size: 200 + active_type: "sigmoid" + inputs { + input_layer_name: "__fc_layer_0__" + input_parameter_name: "___recurrent_layer_0__.w0" + } + bias_parameter_name: "___recurrent_layer_0__.wbias" + reversed: false +} +layers { + name: "__recurrent_layer_1__" + type: "recurrent" + size: 200 + active_type: "sigmoid" + inputs { + input_layer_name: "__fc_layer_0__" + input_parameter_name: "___recurrent_layer_1__.w0" + } + bias_parameter_name: "___recurrent_layer_1__.wbias" + reversed: true +} +layers { + name: "__fc_layer_1__" + type: "fc" + size: 800 + active_type: "" + inputs { + input_layer_name: "__fc_layer_0__" + input_parameter_name: "___fc_layer_1__.w0" + } +} +layers { + name: "__lstmemory_0__" + type: "lstmemory" + size: 200 + active_type: "sigmoid" + inputs { + input_layer_name: "__fc_layer_1__" + input_parameter_name: "___lstmemory_0__.w0" + } + bias_parameter_name: "___lstmemory_0__.wbias" + reversed: false + active_gate_type: "sigmoid" + active_state_type: "tanh" +} +layers { + name: "__fc_layer_2__" + type: "fc" + size: 800 + active_type: "" + inputs { + input_layer_name: "__fc_layer_0__" + input_parameter_name: "___fc_layer_2__.w0" + } +} +layers { + name: "__lstmemory_1__" + type: "lstmemory" + size: 200 + active_type: "sigmoid" + inputs { + input_layer_name: "__fc_layer_2__" + input_parameter_name: "___lstmemory_1__.w0" + } + bias_parameter_name: "___lstmemory_1__.wbias" + reversed: true + active_gate_type: "sigmoid" + active_state_type: "tanh" +} +layers { + name: "__fc_layer_3__" + type: "fc" + size: 600 + active_type: "" + inputs { + input_layer_name: "__fc_layer_0__" + input_parameter_name: "___fc_layer_3__.w0" + } +} +layers { + name: "__gru_0__" + type: "gated_recurrent" + size: 200 + active_type: "sigmoid" + inputs { + input_layer_name: "__fc_layer_3__" + input_parameter_name: "___gru_0__.w0" + } + bias_parameter_name: "___gru_0__.wbias" + reversed: false + active_gate_type: "sigmoid" +} +layers { + name: "__fc_layer_4__" + type: "fc" + size: 600 + active_type: "" + inputs { + input_layer_name: "__fc_layer_0__" + input_parameter_name: "___fc_layer_4__.w0" + } +} +layers { + name: "__gru_1__" + type: "gated_recurrent" + size: 200 + active_type: 
"sigmoid" + inputs { + input_layer_name: "__fc_layer_4__" + input_parameter_name: "___gru_1__.w0" + } + bias_parameter_name: "___gru_1__.wbias" + reversed: true + active_gate_type: "sigmoid" +} +layers { + name: "__last_seq_0__" + type: "seqlastins" + size: 200 + active_type: "linear" + inputs { + input_layer_name: "__recurrent_layer_0__" + } + trans_type: "non-seq" +} +layers { + name: "__first_seq_0__" + type: "seqlastins" + size: 200 + active_type: "linear" + inputs { + input_layer_name: "__recurrent_layer_1__" + } + select_first: true + trans_type: "non-seq" +} +layers { + name: "__last_seq_1__" + type: "seqlastins" + size: 200 + active_type: "linear" + inputs { + input_layer_name: "__lstmemory_0__" + } + trans_type: "non-seq" +} +layers { + name: "__first_seq_1__" + type: "seqlastins" + size: 200 + active_type: "linear" + inputs { + input_layer_name: "__lstmemory_1__" + } + select_first: true + trans_type: "non-seq" +} +layers { + name: "__last_seq_2__" + type: "seqlastins" + size: 200 + active_type: "linear" + inputs { + input_layer_name: "__gru_0__" + } + trans_type: "non-seq" +} +layers { + name: "__first_seq_2__" + type: "seqlastins" + size: 200 + active_type: "linear" + inputs { + input_layer_name: "__gru_1__" + } + select_first: true + trans_type: "non-seq" +} +parameters { + name: "___fc_layer_0__.w0" + size: 40000 + initial_mean: 0.0 + initial_std: 0.0707106781187 + dims: 200 + dims: 200 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___fc_layer_0__.wbias" + size: 200 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 200 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___recurrent_layer_0__.w0" + size: 40000 + initial_mean: 0.0 + initial_std: 0.0707106781187 + dims: 200 + dims: 200 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___recurrent_layer_0__.wbias" + size: 200 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 200 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___recurrent_layer_1__.w0" + size: 40000 + initial_mean: 0.0 + initial_std: 0.0707106781187 + dims: 200 + dims: 200 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___recurrent_layer_1__.wbias" + size: 200 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 200 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___fc_layer_1__.w0" + size: 160000 + initial_mean: 0.0 + initial_std: 0.0707106781187 + dims: 200 + dims: 800 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___lstmemory_0__.w0" + size: 160000 + initial_mean: 0.0 + initial_std: 0.0707106781187 + dims: 200 + dims: 200 + dims: 4 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___lstmemory_0__.wbias" + size: 1400 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 1400 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___fc_layer_2__.w0" + size: 160000 + initial_mean: 0.0 + initial_std: 0.0707106781187 + dims: 200 + dims: 800 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___lstmemory_1__.w0" + size: 160000 + initial_mean: 0.0 + initial_std: 0.0707106781187 + dims: 200 + dims: 200 + dims: 4 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___lstmemory_1__.wbias" + size: 1400 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 1400 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___fc_layer_3__.w0" + size: 120000 + initial_mean: 0.0 + initial_std: 0.0707106781187 + 
dims: 200 + dims: 600 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___gru_0__.w0" + size: 120000 + initial_mean: 0.0 + initial_std: 0.0707106781187 + dims: 200 + dims: 600 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___gru_0__.wbias" + size: 600 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 600 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___fc_layer_4__.w0" + size: 120000 + initial_mean: 0.0 + initial_std: 0.0707106781187 + dims: 200 + dims: 600 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___gru_1__.w0" + size: 120000 + initial_mean: 0.0 + initial_std: 0.0707106781187 + dims: 200 + dims: 600 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___gru_1__.wbias" + size: 600 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 600 + initial_strategy: 0 + initial_smart: false +} +input_layer_names: "data" +output_layer_names: "__last_seq_0__" +output_layer_names: "__first_seq_0__" +output_layer_names: "__last_seq_1__" +output_layer_names: "__first_seq_1__" +output_layer_names: "__last_seq_2__" +output_layer_names: "__first_seq_2__" +sub_models { + name: "root" + layer_names: "data" + layer_names: "__fc_layer_0__" + layer_names: "__recurrent_layer_0__" + layer_names: "__recurrent_layer_1__" + layer_names: "__fc_layer_1__" + layer_names: "__lstmemory_0__" + layer_names: "__fc_layer_2__" + layer_names: "__lstmemory_1__" + layer_names: "__fc_layer_3__" + layer_names: "__gru_0__" + layer_names: "__fc_layer_4__" + layer_names: "__gru_1__" + layer_names: "__last_seq_0__" + layer_names: "__first_seq_0__" + layer_names: "__last_seq_1__" + layer_names: "__first_seq_1__" + layer_names: "__last_seq_2__" + layer_names: "__first_seq_2__" + input_layer_names: "data" + output_layer_names: "__last_seq_0__" + output_layer_names: "__first_seq_0__" + output_layer_names: "__last_seq_1__" + output_layer_names: "__first_seq_1__" + output_layer_names: "__last_seq_2__" + output_layer_names: "__first_seq_2__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bi_grumemory.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bi_grumemory.protostr new file mode 100644 index 0000000000000000000000000000000000000000..b110e91498ce7d112987714bd769868179141c54 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_bi_grumemory.protostr @@ -0,0 +1,152 @@ +type: "nn" +layers { + name: "data" + type: "data" + size: 120 + active_type: "" +} +layers { + name: "__bidirectional_gru_0___fw_transform" + type: "mixed" + size: 120 + active_type: "" + inputs { + input_layer_name: "data" + input_parameter_name: "___bidirectional_gru_0___fw_transform.w0" + proj_conf { + type: "fc" + name: "___bidirectional_gru_0___fw_transform.w0" + input_size: 120 + output_size: 120 + } + } +} +layers { + name: "__bidirectional_gru_0___fw" + type: "gated_recurrent" + size: 40 + active_type: "tanh" + inputs { + input_layer_name: "__bidirectional_gru_0___fw_transform" + input_parameter_name: "___bidirectional_gru_0___fw.w0" + } + bias_parameter_name: "___bidirectional_gru_0___fw.wbias" + reversed: false + active_gate_type: "sigmoid" +} +layers { + name: "__bidirectional_gru_0___bw_transform" + type: "mixed" + size: 120 + active_type: "" + inputs { + input_layer_name: "data" + input_parameter_name: "___bidirectional_gru_0___bw_transform.w0" + proj_conf { + type: "fc" + name: 
"___bidirectional_gru_0___bw_transform.w0" + input_size: 120 + output_size: 120 + } + } +} +layers { + name: "__bidirectional_gru_0___bw" + type: "gated_recurrent" + size: 40 + active_type: "tanh" + inputs { + input_layer_name: "__bidirectional_gru_0___bw_transform" + input_parameter_name: "___bidirectional_gru_0___bw.w0" + } + bias_parameter_name: "___bidirectional_gru_0___bw.wbias" + reversed: true + active_gate_type: "sigmoid" +} +layers { + name: "__bidirectional_gru_0__" + type: "concat" + size: 80 + active_type: "" + inputs { + input_layer_name: "__bidirectional_gru_0___fw" + } + inputs { + input_layer_name: "__bidirectional_gru_0___bw" + } +} +parameters { + name: "___bidirectional_gru_0___fw_transform.w0" + size: 14400 + initial_mean: 0.0 + initial_std: 0.0912870929175 + dims: 120 + dims: 120 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___bidirectional_gru_0___fw.w0" + size: 4800 + initial_mean: 0.0 + initial_std: 0.158113883008 + dims: 40 + dims: 120 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___bidirectional_gru_0___fw.wbias" + size: 120 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 120 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___bidirectional_gru_0___bw_transform.w0" + size: 14400 + initial_mean: 0.0 + initial_std: 0.0912870929175 + dims: 120 + dims: 120 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___bidirectional_gru_0___bw.w0" + size: 4800 + initial_mean: 0.0 + initial_std: 0.158113883008 + dims: 40 + dims: 120 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___bidirectional_gru_0___bw.wbias" + size: 120 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 120 + initial_strategy: 0 + initial_smart: false +} +input_layer_names: "data" +output_layer_names: "__bidirectional_gru_0__" +sub_models { + name: "root" + layer_names: "data" + layer_names: "__bidirectional_gru_0___fw_transform" + layer_names: "__bidirectional_gru_0___fw" + layer_names: "__bidirectional_gru_0___bw_transform" + layer_names: "__bidirectional_gru_0___bw" + layer_names: "__bidirectional_gru_0__" + input_layer_names: "data" + output_layer_names: "__bidirectional_gru_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr new file mode 100644 index 0000000000000000000000000000000000000000..5261cf0c44943689a957bb99c21075bb7341cd49 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr @@ -0,0 +1,289 @@ +type: "nn" +layers { + name: "input" + type: "data" + size: 200 + active_type: "" +} +layers { + name: "labels" + type: "data" + size: 5000 + active_type: "" +} +layers { + name: "probs" + type: "data" + size: 10 + active_type: "" +} +layers { + name: "xe-label" + type: "data" + size: 10 + active_type: "" +} +layers { + name: "__ctc_layer_0__" + type: "ctc" + size: 5001 + active_type: "" + inputs { + input_layer_name: "input" + } + inputs { + input_layer_name: "labels" + } + norm_by_times: false +} +layers { + name: "__fc_layer_0__" + type: "fc" + size: 4 + active_type: "tanh" + inputs { + input_layer_name: "input" + input_parameter_name: "___fc_layer_0__.w0" + } + bias_parameter_name: "___fc_layer_0__.wbias" +} +layers { + name: "crf_label" + type: "data" + size: 4 + active_type: "" +} +layers { + name: "__crf_layer_0__" + type: "crf" + size: 4 + 
active_type: "" + inputs { + input_layer_name: "__fc_layer_0__" + input_parameter_name: "___crf_layer_0__.w0" + } + inputs { + input_layer_name: "crf_label" + } + coeff: 1.0 +} +layers { + name: "left" + type: "data" + size: 1 + active_type: "" +} +layers { + name: "right" + type: "data" + size: 1 + active_type: "" +} +layers { + name: "label" + type: "data" + size: 1 + active_type: "" +} +layers { + name: "__rank_cost_0__" + type: "rank-cost" + size: 1 + active_type: "" + inputs { + input_layer_name: "left" + } + inputs { + input_layer_name: "right" + } + inputs { + input_layer_name: "label" + } + coeff: 1.0 +} +layers { + name: "list_feature" + type: "data" + size: 100 + active_type: "" +} +layers { + name: "list_scores" + type: "data" + size: 1 + active_type: "" +} +layers { + name: "__lambda_cost_0__" + type: "lambda_cost" + size: 1 + active_type: "" + inputs { + input_layer_name: "list_feature" + } + inputs { + input_layer_name: "list_scores" + } + NDCG_num: 5 + max_sort_size: -1 +} +layers { + name: "__cross_entropy_0__" + type: "multi-class-cross-entropy" + size: 1 + active_type: "" + inputs { + input_layer_name: "probs" + } + inputs { + input_layer_name: "xe-label" + } + coeff: 1.0 +} +layers { + name: "__cross_entropy_with_selfnorm_0__" + type: "multi_class_cross_entropy_with_selfnorm" + active_type: "" + inputs { + input_layer_name: "probs" + } + inputs { + input_layer_name: "xe-label" + } + softmax_selfnorm_alpha: 0.1 + coeff: 1.0 +} +layers { + name: "huber_probs" + type: "data" + size: 1 + active_type: "" +} +layers { + name: "huber_label" + type: "data" + size: 1 + active_type: "" +} +layers { + name: "__huber_cost_0__" + type: "huber" + size: 1 + active_type: "" + inputs { + input_layer_name: "huber_probs" + } + inputs { + input_layer_name: "huber_label" + } + coeff: 1.0 +} +layers { + name: "__multi_binary_label_cross_entropy_0__" + type: "multi_binary_label_cross_entropy" + size: 1 + active_type: "" + inputs { + input_layer_name: "probs" + } + inputs { + input_layer_name: "xe-label" + } + coeff: 1.0 +} +parameters { + name: "___fc_layer_0__.w0" + size: 800 + initial_mean: 0.0 + initial_std: 0.0707106781187 + dims: 200 + dims: 4 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___fc_layer_0__.wbias" + size: 4 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 4 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___crf_layer_0__.w0" + size: 24 + initial_mean: 0.0 + initial_std: 0.5 + dims: 4 + dims: 6 + initial_strategy: 0 + initial_smart: true +} +input_layer_names: "input" +input_layer_names: "labels" +input_layer_names: "crf_label" +input_layer_names: "left" +input_layer_names: "right" +input_layer_names: "label" +input_layer_names: "list_feature" +input_layer_names: "list_scores" +input_layer_names: "probs" +input_layer_names: "xe-label" +input_layer_names: "huber_probs" +input_layer_names: "huber_label" +output_layer_names: "__ctc_layer_0__" +output_layer_names: "__crf_layer_0__" +output_layer_names: "__rank_cost_0__" +output_layer_names: "__lambda_cost_0__" +output_layer_names: "__cross_entropy_0__" +output_layer_names: "__cross_entropy_with_selfnorm_0__" +output_layer_names: "__huber_cost_0__" +output_layer_names: "__multi_binary_label_cross_entropy_0__" +sub_models { + name: "root" + layer_names: "input" + layer_names: "labels" + layer_names: "probs" + layer_names: "xe-label" + layer_names: "__ctc_layer_0__" + layer_names: "__fc_layer_0__" + layer_names: "crf_label" + layer_names: "__crf_layer_0__" + layer_names: "left" 
+ layer_names: "right" + layer_names: "label" + layer_names: "__rank_cost_0__" + layer_names: "list_feature" + layer_names: "list_scores" + layer_names: "__lambda_cost_0__" + layer_names: "__cross_entropy_0__" + layer_names: "__cross_entropy_with_selfnorm_0__" + layer_names: "huber_probs" + layer_names: "huber_label" + layer_names: "__huber_cost_0__" + layer_names: "__multi_binary_label_cross_entropy_0__" + input_layer_names: "input" + input_layer_names: "labels" + input_layer_names: "crf_label" + input_layer_names: "left" + input_layer_names: "right" + input_layer_names: "label" + input_layer_names: "list_feature" + input_layer_names: "list_scores" + input_layer_names: "probs" + input_layer_names: "xe-label" + input_layer_names: "huber_probs" + input_layer_names: "huber_label" + output_layer_names: "__ctc_layer_0__" + output_layer_names: "__crf_layer_0__" + output_layer_names: "__rank_cost_0__" + output_layer_names: "__lambda_cost_0__" + output_layer_names: "__cross_entropy_0__" + output_layer_names: "__cross_entropy_with_selfnorm_0__" + output_layer_names: "__huber_cost_0__" + output_layer_names: "__multi_binary_label_cross_entropy_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr new file mode 100644 index 0000000000000000000000000000000000000000..811b38ae4a51e8faedb59fea2b81a8be3cceeae6 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr @@ -0,0 +1,111 @@ +type: "nn" +layers { + name: "input" + type: "data" + size: 300 + active_type: "" +} +layers { + name: "label" + type: "data" + size: 1 + active_type: "" +} +layers { + name: "weight" + type: "data" + size: 1 + active_type: "" +} +layers { + name: "__fc_layer_0__" + type: "fc" + size: 10 + active_type: "softmax" + inputs { + input_layer_name: "input" + input_parameter_name: "___fc_layer_0__.w0" + } + bias_parameter_name: "___fc_layer_0__.wbias" +} +layers { + name: "__cost_0__" + type: "multi-class-cross-entropy" + size: 1 + active_type: "" + inputs { + input_layer_name: "__fc_layer_0__" + } + inputs { + input_layer_name: "label" + } + inputs { + input_layer_name: "weight" + } + coeff: 1.0 +} +layers { + name: "__regression_cost_0__" + type: "square_error" + size: 1 + active_type: "" + inputs { + input_layer_name: "__fc_layer_0__" + } + inputs { + input_layer_name: "label" + } + inputs { + input_layer_name: "weight" + } + coeff: 1.0 +} +parameters { + name: "___fc_layer_0__.w0" + size: 3000 + initial_mean: 0.0 + initial_std: 0.057735026919 + dims: 300 + dims: 10 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___fc_layer_0__.wbias" + size: 10 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 10 + initial_strategy: 0 + initial_smart: false +} +input_layer_names: "input" +input_layer_names: "label" +input_layer_names: "weight" +output_layer_names: "__cost_0__" +output_layer_names: "__regression_cost_0__" +evaluators { + name: "classification_error_evaluator" + type: "classification_error" + input_layers: "__fc_layer_0__" + input_layers: "label" + input_layers: "weight" +} +sub_models { + name: "root" + layer_names: "input" + layer_names: "label" + layer_names: "weight" + layer_names: "__fc_layer_0__" + layer_names: "__cost_0__" + layer_names: "__regression_cost_0__" + input_layer_names: "input" + input_layer_names: "label" + 
input_layer_names: "weight" + output_layer_names: "__cost_0__" + output_layer_names: "__regression_cost_0__" + evaluator_names: "classification_error_evaluator" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_expand_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_expand_layer.protostr new file mode 100644 index 0000000000000000000000000000000000000000..f4b36052264bc41b4c06826c3b3c1428c103add7 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_expand_layer.protostr @@ -0,0 +1,56 @@ +type: "nn" +layers { + name: "data" + type: "data" + size: 30 + active_type: "" +} +layers { + name: "data_seq" + type: "data" + size: 30 + active_type: "" +} +layers { + name: "__expand_layer_0__" + type: "expand" + size: 30 + active_type: "" + inputs { + input_layer_name: "data" + } + inputs { + input_layer_name: "data_seq" + } + trans_type: "seq" +} +layers { + name: "__expand_layer_1__" + type: "expand" + size: 30 + active_type: "" + inputs { + input_layer_name: "data" + } + inputs { + input_layer_name: "data_seq" + } + trans_type: "non-seq" +} +input_layer_names: "data" +input_layer_names: "data_seq" +output_layer_names: "__expand_layer_0__" +output_layer_names: "__expand_layer_1__" +sub_models { + name: "root" + layer_names: "data" + layer_names: "data_seq" + layer_names: "__expand_layer_0__" + layer_names: "__expand_layer_1__" + input_layer_names: "data" + input_layer_names: "data_seq" + output_layer_names: "__expand_layer_0__" + output_layer_names: "__expand_layer_1__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_fc.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_fc.protostr new file mode 100644 index 0000000000000000000000000000000000000000..8151898832ded3796fb8c56b201d5ebfca3ce6cb --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_fc.protostr @@ -0,0 +1,98 @@ +type: "nn" +layers { + name: "data" + type: "data" + size: 100 + active_type: "" +} +layers { + name: "__trans_layer_0__" + type: "trans" + size: 100 + active_type: "" + inputs { + input_layer_name: "data" + } +} +layers { + name: "__fc_layer_0__" + type: "fc" + size: 100 + active_type: "tanh" + inputs { + input_layer_name: "__trans_layer_0__" + input_parameter_name: "___fc_layer_0__.w0" + } +} +layers { + name: "mask" + type: "data" + size: 100 + active_type: "" +} +layers { + name: "__selective_fc_layer_0__" + type: "selective_fc" + size: 100 + active_type: "sigmoid" + inputs { + input_layer_name: "data" + input_parameter_name: "___selective_fc_layer_0__.w0" + } + inputs { + input_layer_name: "mask" + } + bias_parameter_name: "___selective_fc_layer_0__.wbias" + selective_fc_pass_generation: false + has_selected_colums: true + selective_fc_full_mul_ratio: 0.02 +} +parameters { + name: "___fc_layer_0__.w0" + size: 10000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 100 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___selective_fc_layer_0__.w0" + size: 10000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 100 + initial_strategy: 0 + initial_smart: true + is_sparse: false +} +parameters { + name: "___selective_fc_layer_0__.wbias" + size: 100 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 100 + initial_strategy: 0 + initial_smart: false +} +input_layer_names: "data" +input_layer_names: "mask" +output_layer_names: 
"__fc_layer_0__" +output_layer_names: "__selective_fc_layer_0__" +sub_models { + name: "root" + layer_names: "data" + layer_names: "__trans_layer_0__" + layer_names: "__fc_layer_0__" + layer_names: "mask" + layer_names: "__selective_fc_layer_0__" + input_layer_names: "data" + input_layer_names: "mask" + output_layer_names: "__fc_layer_0__" + output_layer_names: "__selective_fc_layer_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_grumemory_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_grumemory_layer.protostr new file mode 100644 index 0000000000000000000000000000000000000000..2c19b2fd120e7c01ee9aa088f674a74498540a3c --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_grumemory_layer.protostr @@ -0,0 +1,51 @@ +type: "nn" +layers { + name: "data" + type: "data" + size: 120 + active_type: "" +} +layers { + name: "__gru_0__" + type: "gated_recurrent" + size: 40 + active_type: "sigmoid" + inputs { + input_layer_name: "data" + input_parameter_name: "___gru_0__.w0" + } + bias_parameter_name: "___gru_0__.wbias" + reversed: true + active_gate_type: "tanh" +} +parameters { + name: "___gru_0__.w0" + size: 4800 + initial_mean: 0.0 + initial_std: 0.158113883008 + dims: 40 + dims: 120 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___gru_0__.wbias" + size: 120 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 120 + initial_strategy: 0 + initial_smart: false +} +input_layer_names: "data" +output_layer_names: "__gru_0__" +sub_models { + name: "root" + layer_names: "data" + layer_names: "__gru_0__" + input_layer_names: "data" + output_layer_names: "__gru_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_hsigmoid.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_hsigmoid.protostr new file mode 100644 index 0000000000000000000000000000000000000000..e81fcb13c4c6ee8e76036d71d47fdaac9cd3d716 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_hsigmoid.protostr @@ -0,0 +1,62 @@ +type: "nn" +layers { + name: "data" + type: "data" + size: 100 + active_type: "" +} +layers { + name: "label" + type: "data" + size: 10 + active_type: "" +} +layers { + name: "__hsigmoid_0__" + type: "hsigmoid" + size: 1 + active_type: "" + inputs { + input_layer_name: "data" + input_parameter_name: "___hsigmoid_0__.w0" + } + inputs { + input_layer_name: "label" + } + bias_parameter_name: "___hsigmoid_0__.wbias" + num_classes: 10 +} +parameters { + name: "___hsigmoid_0__.w0" + size: 900 + initial_mean: 0.0 + initial_std: 0.333333333333 + dims: 9 + dims: 100 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___hsigmoid_0__.wbias" + size: 9 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 9 + initial_strategy: 0 + initial_smart: false +} +input_layer_names: "data" +input_layer_names: "label" +output_layer_names: "__hsigmoid_0__" +sub_models { + name: "root" + layer_names: "data" + layer_names: "label" + layer_names: "__hsigmoid_0__" + input_layer_names: "data" + input_layer_names: "label" + output_layer_names: "__hsigmoid_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_lstmemory_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_lstmemory_layer.protostr new file mode 100644 index 
0000000000000000000000000000000000000000..76a4afab82c59196564128cb9cb8d72ba2a7b101 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_lstmemory_layer.protostr @@ -0,0 +1,53 @@ +type: "nn" +layers { + name: "data" + type: "data" + size: 128 + active_type: "" +} +layers { + name: "__lstmemory_0__" + type: "lstmemory" + size: 32 + active_type: "tanh" + inputs { + input_layer_name: "data" + input_parameter_name: "___lstmemory_0__.w0" + } + bias_parameter_name: "___lstmemory_0__.wbias" + reversed: true + active_gate_type: "tanh" + active_state_type: "tanh" +} +parameters { + name: "___lstmemory_0__.w0" + size: 4096 + initial_mean: 0.0 + initial_std: 0.176776695297 + dims: 32 + dims: 32 + dims: 4 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___lstmemory_0__.wbias" + size: 224 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 224 + initial_strategy: 0 + initial_smart: false +} +input_layer_names: "data" +output_layer_names: "__lstmemory_0__" +sub_models { + name: "root" + layer_names: "data" + layer_names: "__lstmemory_0__" + input_layer_names: "data" + output_layer_names: "__lstmemory_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr new file mode 100644 index 0000000000000000000000000000000000000000..1be2a7ceebfb74d677ac056dcc3a9f72fd31ccd6 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_maxout.protostr @@ -0,0 +1,209 @@ +type: "nn" +layers { + name: "data" + type: "data" + size: 2304 + active_type: "" +} +layers { + name: "__conv_0__" + type: "exconv" + size: 36864 + active_type: "" + inputs { + input_layer_name: "data" + input_parameter_name: "___conv_0__.w0" + conv_conf { + filter_size: 3 + channels: 1 + stride: 1 + padding: 1 + groups: 1 + filter_channels: 1 + output_x: 48 + img_size: 48 + caffe_mode: true + filter_size_y: 3 + padding_y: 1 + stride_y: 1 + } + } + bias_parameter_name: "___conv_0__.wbias" + num_filters: 16 + shared_biases: true +} +layers { + name: "__maxout_layer_0__" + type: "maxout" + size: 18432 + active_type: "" + inputs { + input_layer_name: "__conv_0__" + maxout_conf { + channels: 16 + groups: 2 + img_size_x: 0 + img_size_y: 0 + } + } +} +layers { + name: "__pool_0__" + type: "pool" + size: 4608 + active_type: "" + inputs { + input_layer_name: "__maxout_layer_0__" + pool_conf { + pool_type: "max-projection" + channels: 8 + size_x: 2 + stride: 2 + output_x: 24 + img_size: 48 + padding: 0 + size_y: 2 + stride_y: 2 + output_y: 24 + img_size_y: 48 + padding_y: 0 + } + } +} +layers { + name: "__conv_1__" + type: "exconv" + size: 18432 + active_type: "" + inputs { + input_layer_name: "__pool_0__" + input_parameter_name: "___conv_1__.w0" + conv_conf { + filter_size: 3 + channels: 32 + stride: 1 + padding: 1 + groups: 1 + filter_channels: 32 + output_x: 12 + img_size: 12 + caffe_mode: true + filter_size_y: 3 + padding_y: 1 + stride_y: 1 + } + } + bias_parameter_name: "___conv_1__.wbias" + num_filters: 128 + shared_biases: true +} +layers { + name: "__maxout_layer_1__" + type: "maxout" + size: 4608 + active_type: "" + inputs { + input_layer_name: "__conv_1__" + maxout_conf { + channels: 128 + groups: 4 + img_size_x: 0 + img_size_y: 0 + } + } +} +layers { + name: "__block_expand_layer_0__" + type: "blockexpand" + size: 192 + active_type: "" + inputs { + input_layer_name: "__maxout_layer_1__" + 
block_expand_conf { + channels: 32 + stride_x: 1 + stride_y: 1 + padding_x: 0 + padding_y: 0 + block_x: 1 + block_y: 6 + output_x: 0 + output_y: 0 + img_size_x: 0 + img_size_y: 0 + } + } +} +layers { + name: "__fc_layer_0__" + type: "fc" + size: 384 + active_type: "tanh" + inputs { + input_layer_name: "__block_expand_layer_0__" + input_parameter_name: "___fc_layer_0__.w0" + } +} +parameters { + name: "___conv_0__.w0" + size: 144 + initial_mean: 0.0 + initial_std: 0.471404520791 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___conv_0__.wbias" + size: 16 + initial_mean: 0.0 + initial_std: 0.0 + dims: 16 + dims: 1 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___conv_1__.w0" + size: 36864 + initial_mean: 0.0 + initial_std: 0.0833333333333 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___conv_1__.wbias" + size: 128 + initial_mean: 0.0 + initial_std: 0.0 + dims: 128 + dims: 1 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___fc_layer_0__.w0" + size: 73728 + initial_mean: 0.0 + initial_std: 0.0721687836487 + dims: 192 + dims: 384 + initial_strategy: 0 + initial_smart: true +} +input_layer_names: "data" +output_layer_names: "__fc_layer_0__" +sub_models { + name: "root" + layer_names: "data" + layer_names: "__conv_0__" + layer_names: "__maxout_layer_0__" + layer_names: "__pool_0__" + layer_names: "__conv_1__" + layer_names: "__maxout_layer_1__" + layer_names: "__block_expand_layer_0__" + layer_names: "__fc_layer_0__" + input_layer_names: "data" + output_layer_names: "__fc_layer_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr new file mode 100644 index 0000000000000000000000000000000000000000..b30bbb2a4e24d74ebe1d6c8eda8be5aa09217f6d --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr @@ -0,0 +1,225 @@ +type: "nn" +layers { + name: "w" + type: "data" + size: 1 + active_type: "" +} +layers { + name: "a" + type: "data" + size: 100 + active_type: "" +} +layers { + name: "b" + type: "data" + size: 100 + active_type: "" +} +layers { + name: "c" + type: "data" + size: 200 + active_type: "" +} +layers { + name: "d" + type: "data" + size: 31 + active_type: "" +} +layers { + name: "__interpolation_layer_0__" + type: "interpolation" + size: 100 + active_type: "" + inputs { + input_layer_name: "w" + } + inputs { + input_layer_name: "a" + } + inputs { + input_layer_name: "b" + } +} +layers { + name: "__power_layer_0__" + type: "power" + size: 100 + active_type: "" + inputs { + input_layer_name: "w" + } + inputs { + input_layer_name: "a" + } +} +layers { + name: "__scaling_layer_0__" + type: "scaling" + size: 100 + active_type: "" + inputs { + input_layer_name: "w" + } + inputs { + input_layer_name: "a" + } +} +layers { + name: "__cos_sim_0__" + type: "cos" + size: 1 + active_type: "" + inputs { + input_layer_name: "a" + } + inputs { + input_layer_name: "b" + } + cos_scale: 5 +} +layers { + name: "__cos_sim_1__" + type: "cos_vm" + size: 2 + active_type: "" + inputs { + input_layer_name: "a" + } + inputs { + input_layer_name: "c" + } + cos_scale: 5 +} +layers { + name: "__sum_to_one_norm_layer_0__" + type: "sum_to_one_norm" + size: 100 + active_type: "" + inputs { + input_layer_name: "a" + } +} +layers { + name: "__conv_shift_layer_0__" + type: "conv_shift" + size: 100 + active_type: 
"" + inputs { + input_layer_name: "a" + } + inputs { + input_layer_name: "d" + } +} +layers { + name: "__tensor_layer_0__" + type: "tensor" + size: 1000 + active_type: "" + inputs { + input_layer_name: "a" + input_parameter_name: "___tensor_layer_0__.w0" + } + inputs { + input_layer_name: "b" + } + bias_parameter_name: "___tensor_layer_0__.wbias" +} +layers { + name: "__slope_intercept_layer_0__" + type: "slope_intercept" + size: 100 + active_type: "" + inputs { + input_layer_name: "a" + } + slope: 0.7 + intercept: 0.9 +} +layers { + name: "__linear_comb_layer_0__" + type: "convex_comb" + size: 2 + active_type: "" + inputs { + input_layer_name: "b" + } + inputs { + input_layer_name: "c" + } +} +parameters { + name: "___tensor_layer_0__.w0" + size: 10000000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 100 + dims: 1000 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___tensor_layer_0__.wbias" + size: 1000 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 1000 + initial_strategy: 0 + initial_smart: false +} +input_layer_names: "w" +input_layer_names: "a" +input_layer_names: "b" +input_layer_names: "c" +input_layer_names: "d" +output_layer_names: "__interpolation_layer_0__" +output_layer_names: "__power_layer_0__" +output_layer_names: "__scaling_layer_0__" +output_layer_names: "__cos_sim_0__" +output_layer_names: "__cos_sim_1__" +output_layer_names: "__sum_to_one_norm_layer_0__" +output_layer_names: "__conv_shift_layer_0__" +output_layer_names: "__tensor_layer_0__" +output_layer_names: "__slope_intercept_layer_0__" +output_layer_names: "__linear_comb_layer_0__" +sub_models { + name: "root" + layer_names: "w" + layer_names: "a" + layer_names: "b" + layer_names: "c" + layer_names: "d" + layer_names: "__interpolation_layer_0__" + layer_names: "__power_layer_0__" + layer_names: "__scaling_layer_0__" + layer_names: "__cos_sim_0__" + layer_names: "__cos_sim_1__" + layer_names: "__sum_to_one_norm_layer_0__" + layer_names: "__conv_shift_layer_0__" + layer_names: "__tensor_layer_0__" + layer_names: "__slope_intercept_layer_0__" + layer_names: "__linear_comb_layer_0__" + input_layer_names: "w" + input_layer_names: "a" + input_layer_names: "b" + input_layer_names: "c" + input_layer_names: "d" + output_layer_names: "__interpolation_layer_0__" + output_layer_names: "__power_layer_0__" + output_layer_names: "__scaling_layer_0__" + output_layer_names: "__cos_sim_0__" + output_layer_names: "__cos_sim_1__" + output_layer_names: "__sum_to_one_norm_layer_0__" + output_layer_names: "__conv_shift_layer_0__" + output_layer_names: "__tensor_layer_0__" + output_layer_names: "__slope_intercept_layer_0__" + output_layer_names: "__linear_comb_layer_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_print_layer.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_print_layer.protostr new file mode 100644 index 0000000000000000000000000000000000000000..c402aff174ab7c7d7f63234960d4a24d84622dd4 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_print_layer.protostr @@ -0,0 +1,26 @@ +type: "nn" +layers { + name: "input" + type: "data" + size: 100 + active_type: "" +} +layers { + name: "__print_0__" + type: "print" + active_type: "" + inputs { + input_layer_name: "input" + } +} +input_layer_names: "input" +output_layer_names: "input" +sub_models { + name: "root" + layer_names: "input" + layer_names: "__print_0__" + input_layer_names: "input" + 
output_layer_names: "input" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr new file mode 100644 index 0000000000000000000000000000000000000000..41d2e2f2671f5c05425f9bd2e91d8adc33129761 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr @@ -0,0 +1,650 @@ +type: "recurrent_nn" +layers { + name: "seq_input" + type: "data" + size: 100 + active_type: "" +} +layers { + name: "sub_seq_input" + type: "data" + size: 100 + active_type: "" +} +layers { + name: "label" + type: "data" + size: 1 + active_type: "" +} +layers { + name: "__mixed_0__" + type: "mixed" + size: 400 + active_type: "" + inputs { + input_layer_name: "seq_input" + input_parameter_name: "___mixed_0__.w0" + proj_conf { + type: "fc" + name: "___mixed_0__.w0" + input_size: 100 + output_size: 400 + } + } +} +layers { + name: "__mixed_1__" + type: "mixed" + size: 300 + active_type: "" + inputs { + input_layer_name: "seq_input" + input_parameter_name: "___mixed_1__.w0" + proj_conf { + type: "fc" + name: "___mixed_1__.w0" + input_size: 100 + output_size: 300 + } + } +} +layers { + name: "__recurrent_group_0__" + type: "recurrent_layer_group" + active_type: "" +} +layers { + name: "seq_input@__recurrent_group_0__" + type: "scatter_agent" + size: 100 + active_type: "" +} +layers { + name: "rnn_forward+delay1@__recurrent_group_0__" + type: "agent" + size: 200 + active_type: "" +} +layers { + name: "rnn_forward@__recurrent_group_0__" + type: "fc" + size: 200 + active_type: "tanh" + inputs { + input_layer_name: "seq_input@__recurrent_group_0__" + input_parameter_name: "_rnn_forward@__recurrent_group_0__.w0" + } + inputs { + input_layer_name: "rnn_forward+delay1@__recurrent_group_0__" + input_parameter_name: "_rnn_forward@__recurrent_group_0__.w1" + } + bias_parameter_name: "_rnn_forward@__recurrent_group_0__.wbias" +} +layers { + name: "rnn_forward" + type: "gather_agent" + size: 200 + active_type: "" +} +layers { + name: "__last_seq_0__" + type: "seqlastins" + size: 200 + active_type: "linear" + inputs { + input_layer_name: "rnn_forward" + } + trans_type: "non-seq" +} +layers { + name: "__recurrent_group_1__" + type: "recurrent_layer_group" + active_type: "" +} +layers { + name: "seq_input@__recurrent_group_1__" + type: "scatter_agent" + size: 100 + active_type: "" +} +layers { + name: "rnn_back+delay1@__recurrent_group_1__" + type: "agent" + size: 200 + active_type: "" +} +layers { + name: "rnn_back@__recurrent_group_1__" + type: "fc" + size: 200 + active_type: "tanh" + inputs { + input_layer_name: "seq_input@__recurrent_group_1__" + input_parameter_name: "_rnn_back@__recurrent_group_1__.w0" + } + inputs { + input_layer_name: "rnn_back+delay1@__recurrent_group_1__" + input_parameter_name: "_rnn_back@__recurrent_group_1__.w1" + } + bias_parameter_name: "_rnn_back@__recurrent_group_1__.wbias" +} +layers { + name: "rnn_back" + type: "gather_agent" + size: 200 + active_type: "" +} +layers { + name: "__first_seq_0__" + type: "seqlastins" + size: 200 + active_type: "linear" + inputs { + input_layer_name: "rnn_back" + } + select_first: true + trans_type: "non-seq" +} +layers { + name: "__recurrent_group_2__" + type: "recurrent_layer_group" + active_type: "" +} +layers { + name: "sub_seq_input@__recurrent_group_2__" + type: "sequence_scatter_agent" + size: 100 + active_type: "" +} +layers { + name: 
"rnn_subseq_forward+delay1@__recurrent_group_2__" + type: "agent" + size: 200 + active_type: "" +} +layers { + name: "rnn_subseq_forward@__recurrent_group_2__" + type: "fc" + size: 200 + active_type: "tanh" + inputs { + input_layer_name: "sub_seq_input@__recurrent_group_2__" + input_parameter_name: "_rnn_subseq_forward@__recurrent_group_2__.w0" + } + inputs { + input_layer_name: "rnn_subseq_forward+delay1@__recurrent_group_2__" + input_parameter_name: "_rnn_subseq_forward@__recurrent_group_2__.w1" + } + bias_parameter_name: "_rnn_subseq_forward@__recurrent_group_2__.wbias" +} +layers { + name: "rnn_subseq_forward" + type: "sequence_gather_agent" + size: 200 + active_type: "" +} +layers { + name: "__last_seq_1__" + type: "seqlastins" + size: 200 + active_type: "linear" + inputs { + input_layer_name: "rnn_subseq_forward" + } + trans_type: "non-seq" +} +layers { + name: "__lstm_group_0___recurrent_group" + type: "recurrent_layer_group" + active_type: "" +} +layers { + name: "__mixed_0__@__lstm_group_0___recurrent_group" + type: "scatter_agent" + size: 400 + active_type: "" +} +layers { + name: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group" + type: "agent" + size: 100 + active_type: "" +} +layers { + name: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group" + type: "agent" + size: 100 + active_type: "" +} +layers { + name: "__lstm_group_0___input_recurrent@__lstm_group_0___recurrent_group" + type: "mixed" + size: 400 + active_type: "" + inputs { + input_layer_name: "__mixed_0__@__lstm_group_0___recurrent_group" + proj_conf { + type: "identity" + name: "___lstm_group_0___input_recurrent.w0" + input_size: 400 + output_size: 400 + } + } + inputs { + input_layer_name: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group" + input_parameter_name: "___lstm_group_0___input_recurrent@__lstm_group_0___recurrent_group.w1" + proj_conf { + type: "fc" + name: "___lstm_group_0___input_recurrent.w1" + input_size: 100 + output_size: 400 + } + } +} +layers { + name: "__lstm_group_0__@__lstm_group_0___recurrent_group" + type: "lstm_step" + size: 100 + active_type: "tanh" + inputs { + input_layer_name: "__lstm_group_0___input_recurrent@__lstm_group_0___recurrent_group" + } + inputs { + input_layer_name: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group" + } + bias_parameter_name: "___lstm_group_0__@__lstm_group_0___recurrent_group.wbias" + active_gate_type: "sigmoid" + active_state_type: "sigmoid" +} +layers { + name: "__lstm_group_0___state@__lstm_group_0___recurrent_group" + type: "get_output" + size: 100 + active_type: "" + inputs { + input_layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group" + input_layer_argument: "state" + } +} +layers { + name: "__lstm_group_0__" + type: "gather_agent" + size: 100 + active_type: "" +} +layers { + name: "__last_seq_2__" + type: "seqlastins" + size: 100 + active_type: "linear" + inputs { + input_layer_name: "__lstm_group_0__" + } + trans_type: "non-seq" +} +layers { + name: "__gru_group_0___recurrent_group" + type: "recurrent_layer_group" + active_type: "" +} +layers { + name: "__mixed_1__@__gru_group_0___recurrent_group" + type: "scatter_agent" + size: 300 + active_type: "" +} +layers { + name: "__gru_group_0__+delay1@__gru_group_0___recurrent_group" + type: "agent" + size: 100 + active_type: "" +} +layers { + name: "__gru_group_0__@__gru_group_0___recurrent_group" + type: "gru_step" + size: 100 + active_type: "tanh" + inputs { + input_layer_name: "__mixed_1__@__gru_group_0___recurrent_group" + input_parameter_name: 
"___gru_group_0__@__gru_group_0___recurrent_group.w0" + } + inputs { + input_layer_name: "__gru_group_0__+delay1@__gru_group_0___recurrent_group" + } + bias_parameter_name: "___gru_group_0__@__gru_group_0___recurrent_group.wbias" + active_gate_type: "sigmoid" +} +layers { + name: "__gru_group_0__" + type: "gather_agent" + size: 100 + active_type: "" +} +layers { + name: "__last_seq_3__" + type: "seqlastins" + size: 100 + active_type: "linear" + inputs { + input_layer_name: "__gru_group_0__" + } + trans_type: "non-seq" +} +parameters { + name: "___mixed_0__.w0" + size: 40000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 400 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___mixed_1__.w0" + size: 30000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 300 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "_rnn_forward@__recurrent_group_0__.w0" + size: 20000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 200 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "_rnn_forward@__recurrent_group_0__.w1" + size: 40000 + initial_mean: 0.0 + initial_std: 0.0707106781187 + dims: 200 + dims: 200 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "_rnn_forward@__recurrent_group_0__.wbias" + size: 200 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 200 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "_rnn_back@__recurrent_group_1__.w0" + size: 20000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 200 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "_rnn_back@__recurrent_group_1__.w1" + size: 40000 + initial_mean: 0.0 + initial_std: 0.0707106781187 + dims: 200 + dims: 200 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "_rnn_back@__recurrent_group_1__.wbias" + size: 200 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 200 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "_rnn_subseq_forward@__recurrent_group_2__.w0" + size: 20000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 200 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "_rnn_subseq_forward@__recurrent_group_2__.w1" + size: 40000 + initial_mean: 0.0 + initial_std: 0.0707106781187 + dims: 200 + dims: 200 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "_rnn_subseq_forward@__recurrent_group_2__.wbias" + size: 200 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 200 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___lstm_group_0___input_recurrent@__lstm_group_0___recurrent_group.w1" + size: 40000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 400 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___lstm_group_0__@__lstm_group_0___recurrent_group.wbias" + size: 300 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 300 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___gru_group_0__@__gru_group_0___recurrent_group.w0" + size: 30000 + initial_mean: 0.0 + initial_std: 0.01 + dims: 100 + dims: 300 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___gru_group_0__@__gru_group_0___recurrent_group.wbias" + size: 300 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 300 + initial_strategy: 0 + initial_smart: false +} +input_layer_names: "seq_input" +input_layer_names: "sub_seq_input" +output_layer_names: "__last_seq_0__" +output_layer_names: "__first_seq_0__" 
+output_layer_names: "__last_seq_1__" +output_layer_names: "__last_seq_2__" +output_layer_names: "__last_seq_3__" +sub_models { + name: "root" + layer_names: "seq_input" + layer_names: "sub_seq_input" + layer_names: "label" + layer_names: "__mixed_0__" + layer_names: "__mixed_1__" + layer_names: "__recurrent_group_0__" + layer_names: "rnn_forward" + layer_names: "__last_seq_0__" + layer_names: "__recurrent_group_1__" + layer_names: "rnn_back" + layer_names: "__first_seq_0__" + layer_names: "__recurrent_group_2__" + layer_names: "rnn_subseq_forward" + layer_names: "__last_seq_1__" + layer_names: "__lstm_group_0___recurrent_group" + layer_names: "__lstm_group_0__" + layer_names: "__last_seq_2__" + layer_names: "__gru_group_0___recurrent_group" + layer_names: "__gru_group_0__" + layer_names: "__last_seq_3__" + input_layer_names: "seq_input" + input_layer_names: "sub_seq_input" + output_layer_names: "__last_seq_0__" + output_layer_names: "__first_seq_0__" + output_layer_names: "__last_seq_1__" + output_layer_names: "__last_seq_2__" + output_layer_names: "__last_seq_3__" + is_recurrent_layer_group: false +} +sub_models { + name: "__recurrent_group_0__" + layer_names: "seq_input@__recurrent_group_0__" + layer_names: "rnn_forward+delay1@__recurrent_group_0__" + layer_names: "rnn_forward@__recurrent_group_0__" + is_recurrent_layer_group: true + reversed: false + memories { + layer_name: "rnn_forward@__recurrent_group_0__" + link_name: "rnn_forward+delay1@__recurrent_group_0__" + is_sequence: false + } + in_links { + layer_name: "seq_input" + link_name: "seq_input@__recurrent_group_0__" + has_subseq: false + } + out_links { + layer_name: "rnn_forward@__recurrent_group_0__" + link_name: "rnn_forward" + has_subseq: false + } + target_inlinkid: -1 +} +sub_models { + name: "__recurrent_group_1__" + layer_names: "seq_input@__recurrent_group_1__" + layer_names: "rnn_back+delay1@__recurrent_group_1__" + layer_names: "rnn_back@__recurrent_group_1__" + is_recurrent_layer_group: true + reversed: true + memories { + layer_name: "rnn_back@__recurrent_group_1__" + link_name: "rnn_back+delay1@__recurrent_group_1__" + is_sequence: false + } + in_links { + layer_name: "seq_input" + link_name: "seq_input@__recurrent_group_1__" + has_subseq: false + } + out_links { + layer_name: "rnn_back@__recurrent_group_1__" + link_name: "rnn_back" + has_subseq: false + } + target_inlinkid: -1 +} +sub_models { + name: "__recurrent_group_2__" + layer_names: "sub_seq_input@__recurrent_group_2__" + layer_names: "rnn_subseq_forward+delay1@__recurrent_group_2__" + layer_names: "rnn_subseq_forward@__recurrent_group_2__" + is_recurrent_layer_group: true + reversed: false + memories { + layer_name: "rnn_subseq_forward@__recurrent_group_2__" + link_name: "rnn_subseq_forward+delay1@__recurrent_group_2__" + is_sequence: false + } + in_links { + layer_name: "sub_seq_input" + link_name: "sub_seq_input@__recurrent_group_2__" + has_subseq: true + } + out_links { + layer_name: "rnn_subseq_forward@__recurrent_group_2__" + link_name: "rnn_subseq_forward" + has_subseq: true + } + target_inlinkid: -1 +} +sub_models { + name: "__lstm_group_0___recurrent_group" + layer_names: "__mixed_0__@__lstm_group_0___recurrent_group" + layer_names: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group" + layer_names: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group" + layer_names: "__lstm_group_0___input_recurrent@__lstm_group_0___recurrent_group" + layer_names: "__lstm_group_0__@__lstm_group_0___recurrent_group" + layer_names: 
"__lstm_group_0___state@__lstm_group_0___recurrent_group" + is_recurrent_layer_group: true + reversed: false + memories { + layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group" + link_name: "__lstm_group_0__+delay1@__lstm_group_0___recurrent_group" + is_sequence: false + } + memories { + layer_name: "__lstm_group_0___state@__lstm_group_0___recurrent_group" + link_name: "__lstm_group_0___state+delay1@__lstm_group_0___recurrent_group" + is_sequence: false + } + in_links { + layer_name: "__mixed_0__" + link_name: "__mixed_0__@__lstm_group_0___recurrent_group" + has_subseq: false + } + out_links { + layer_name: "__lstm_group_0__@__lstm_group_0___recurrent_group" + link_name: "__lstm_group_0__" + has_subseq: false + } + target_inlinkid: -1 +} +sub_models { + name: "__gru_group_0___recurrent_group" + layer_names: "__mixed_1__@__gru_group_0___recurrent_group" + layer_names: "__gru_group_0__+delay1@__gru_group_0___recurrent_group" + layer_names: "__gru_group_0__@__gru_group_0___recurrent_group" + is_recurrent_layer_group: true + reversed: false + memories { + layer_name: "__gru_group_0__@__gru_group_0___recurrent_group" + link_name: "__gru_group_0__+delay1@__gru_group_0___recurrent_group" + is_sequence: false + } + in_links { + layer_name: "__mixed_1__" + link_name: "__mixed_1__@__gru_group_0___recurrent_group" + has_subseq: false + } + out_links { + layer_name: "__gru_group_0__@__gru_group_0___recurrent_group" + link_name: "__gru_group_0__" + has_subseq: false + } + target_inlinkid: -1 +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr new file mode 100644 index 0000000000000000000000000000000000000000..1999c006d237eb449d59c8e8a2a83c1e4fab9d0e --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_sequence_pooling.protostr @@ -0,0 +1,111 @@ +type: "nn" +layers { + name: "dat_in" + type: "data" + size: 100 + active_type: "" +} +layers { + name: "__seq_pooling_0__" + type: "max" + size: 100 + active_type: "linear" + inputs { + input_layer_name: "dat_in" + } + trans_type: "seq" +} +layers { + name: "__seq_pooling_1__" + type: "max" + size: 100 + active_type: "linear" + inputs { + input_layer_name: "dat_in" + } + trans_type: "non-seq" +} +layers { + name: "__seq_pooling_2__" + type: "average" + size: 100 + active_type: "linear" + inputs { + input_layer_name: "dat_in" + } + average_strategy: "average" + trans_type: "seq" +} +layers { + name: "__seq_pooling_3__" + type: "average" + size: 100 + active_type: "linear" + inputs { + input_layer_name: "dat_in" + } + average_strategy: "average" + trans_type: "non-seq" +} +layers { + name: "__seq_pooling_4__" + type: "average" + size: 100 + active_type: "linear" + inputs { + input_layer_name: "dat_in" + } + average_strategy: "sum" + trans_type: "seq" +} +layers { + name: "__seq_pooling_5__" + type: "average" + size: 100 + active_type: "linear" + inputs { + input_layer_name: "dat_in" + } + average_strategy: "sum" + trans_type: "non-seq" +} +layers { + name: "__seq_pooling_6__" + type: "max" + size: 100 + active_type: "linear" + inputs { + input_layer_name: "dat_in" + } + output_max_index: true + trans_type: "non-seq" +} +input_layer_names: "dat_in" +output_layer_names: "__seq_pooling_0__" +output_layer_names: "__seq_pooling_1__" +output_layer_names: "__seq_pooling_2__" +output_layer_names: "__seq_pooling_3__" +output_layer_names: "__seq_pooling_4__" +output_layer_names: 
"__seq_pooling_5__" +output_layer_names: "__seq_pooling_6__" +sub_models { + name: "root" + layer_names: "dat_in" + layer_names: "__seq_pooling_0__" + layer_names: "__seq_pooling_1__" + layer_names: "__seq_pooling_2__" + layer_names: "__seq_pooling_3__" + layer_names: "__seq_pooling_4__" + layer_names: "__seq_pooling_5__" + layer_names: "__seq_pooling_6__" + input_layer_names: "dat_in" + output_layer_names: "__seq_pooling_0__" + output_layer_names: "__seq_pooling_1__" + output_layer_names: "__seq_pooling_2__" + output_layer_names: "__seq_pooling_3__" + output_layer_names: "__seq_pooling_4__" + output_layer_names: "__seq_pooling_5__" + output_layer_names: "__seq_pooling_6__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/unused_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/unused_layers.protostr new file mode 100644 index 0000000000000000000000000000000000000000..89ed28406e553ba93bec8c86879a85f0a5c1caa1 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/unused_layers.protostr @@ -0,0 +1,27 @@ +type: "nn" +layers { + name: "probs" + type: "data" + size: 100 + active_type: "" +} +layers { + name: "__sampling_id_layer_0__" + type: "sampling_id" + size: 100 + active_type: "" + inputs { + input_layer_name: "probs" + } +} +input_layer_names: "probs" +output_layer_names: "__sampling_id_layer_0__" +sub_models { + name: "root" + layer_names: "probs" + layer_names: "__sampling_id_layer_0__" + input_layer_names: "probs" + output_layer_names: "__sampling_id_layer_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/util_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/util_layers.protostr new file mode 100644 index 0000000000000000000000000000000000000000..d0ad388165007b8f96f059e5b003c52f756383e5 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/util_layers.protostr @@ -0,0 +1,81 @@ +type: "nn" +layers { + name: "a" + type: "data" + size: 10 + active_type: "" +} +layers { + name: "b" + type: "data" + size: 10 + active_type: "" +} +layers { + name: "__addto_0__" + type: "addto" + size: 10 + active_type: "" + inputs { + input_layer_name: "a" + } + inputs { + input_layer_name: "b" + } +} +layers { + name: "__concat_0__" + type: "concat" + size: 20 + active_type: "" + inputs { + input_layer_name: "a" + } + inputs { + input_layer_name: "b" + } +} +layers { + name: "__concat_1__" + type: "concat2" + size: 20 + active_type: "" + inputs { + input_layer_name: "a" + proj_conf { + type: "identity" + name: "___concat_1__.w0" + input_size: 10 + output_size: 10 + } + } + inputs { + input_layer_name: "b" + proj_conf { + type: "identity" + name: "___concat_1__.w1" + input_size: 10 + output_size: 10 + } + } +} +input_layer_names: "a" +input_layer_names: "b" +output_layer_names: "__addto_0__" +output_layer_names: "__concat_0__" +output_layer_names: "__concat_1__" +sub_models { + name: "root" + layer_names: "a" + layer_names: "b" + layer_names: "__addto_0__" + layer_names: "__concat_0__" + layer_names: "__concat_1__" + input_layer_names: "a" + input_layer_names: "b" + output_layer_names: "__addto_0__" + output_layer_names: "__concat_0__" + output_layer_names: "__concat_1__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh index 
78114ce32b019cde7a028acde4d281cf6b3dac8e..f05fc46cd55207149b0b8511881eb02b1150c000 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh @@ -1,5 +1,17 @@ #!/bin/bash cd `dirname $0` + set -e + +protostr=`dirname $0`/protostr + +files=`ls $protostr | grep -v "unitest"` + ./generate_protostr.sh -md5sum -c check.md5 + +for file in $files +do + base_protostr=$protostr/$file + new_protostr=$protostr/$file.unitest + diff $base_protostr $new_protostr +done diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_bi_grumemory.py b/python/paddle/trainer_config_helpers/tests/configs/test_bi_grumemory.py new file mode 100644 index 0000000000000000000000000000000000000000..ab9f7c4948b856d2cc7ff348fc49a9d4de3fbc3a --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_bi_grumemory.py @@ -0,0 +1,10 @@ +from paddle.trainer_config_helpers import * + +settings( + batch_size=1000, + learning_rate=1e-4 +) + +din = data_layer(name='data', size=120) + +outputs(bidirectional_gru(input=din, size=40, return_seq=True)) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py b/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py index 079e2cf4c432060ae19d1ad70faa6423b687f99a..7c1fb04766ae5485d246d8d7a994be3d8a6d9114 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_maxout.py @@ -25,6 +25,25 @@ pool = img_pool_layer(input=maxout, stride=2, pool_type=MaxPooling()) -fc = fc_layer(input=pool, size=384, bias_attr=False) +conv2 = img_conv_layer(input=pool, + filter_size=3, + num_channels=32, + num_filters=128, + padding=1, + act=LinearActivation(), + bias_attr=True) + +maxout2 = maxout_layer(input=conv2, + num_channels=128, + groups=4) + +block = block_expand_layer(input=maxout2, + num_channels=32, + stride_x=1, + stride_y=1, + block_x=1, + block_y=6) + +fc = fc_layer(input=block, size=384, bias_attr=False) outputs(fc)
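
Note on the test flow introduced above: run_tests.sh no longer checks fixture checksums (the removed `md5sum -c check.md5` step) but regenerates every config's protobuf dump as protostr/<name>.protostr.unitest and diffs it against the committed <name>.protostr golden file, so a failure shows the exact lines that changed rather than an opaque checksum mismatch. Below is a minimal sketch of regenerating a single fixture by hand, assuming parse_config from paddle.trainer.config_parser is available on PYTHONPATH and the current directory is tests/configs; generate_protostr.sh automates this loop for every config in the directory.

    # Sketch only: dump one test config to the text-format ModelConfig
    # that the .protostr fixtures store.
    from paddle.trainer.config_parser import parse_config

    # parse_config() executes the config script and returns the whole
    # TrainerConfig message; its model_config field holds the
    # type/layers/parameters graph that run_tests.sh compares.
    conf = parse_config('test_bi_grumemory.py', '')
    with open('protostr/test_bi_grumemory.protostr.unitest', 'w') as f:
        f.write(str(conf.model_config))

When a config change is intentional, copying the fresh .unitest dump over the committed .protostr fixture makes the diff empty again.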
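
Because the golden files are standard protobuf text format, a surprising diff can also be inspected programmatically instead of line by line. A sketch, assuming the generated ModelConfig_pb2 module for Paddle's ModelConfig.proto is importable as paddle.proto.ModelConfig_pb2 (the exact module path depends on how the protos were built):

    # Sketch only: parse a committed fixture back into a ModelConfig
    # message and query it rather than eyeballing a long textual diff.
    from google.protobuf import text_format
    from paddle.proto.ModelConfig_pb2 import ModelConfig

    model = ModelConfig()
    with open('protostr/test_fc.protostr') as f:
        text_format.Merge(f.read(), model)
    # List every layer name/type pair recorded in the golden file.
    for layer in model.layers:
        print('%s: %s' % (layer.name, layer.type))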