Commit bd38faca authored by liaogang

Fix conflict

......@@ -35,6 +35,8 @@ addons:
- libgoogle-glog-dev
- libgflags-dev
- libgtest-dev
- curl
- lcov
- graphviz
before_install:
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
......
......@@ -3,13 +3,13 @@ cmake_minimum_required(VERSION 2.8)
project(paddle CXX C)
set(PADDLE_MAJOR_VERSION 0)
set(PADDLE_MINOR_VERSION 8)
set(PADDLE_PATCH_VERSION 0b2)
set(PADDLE_PATCH_VERSION 0b3)
set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
set(PROJ_ROOT ${CMAKE_SOURCE_DIR})
include(package)
include(swig)
find_package(SWIG 2.0)
find_package(CUDA QUIET)
find_package(Protobuf REQUIRED)
find_package(PythonLibs 2.7 REQUIRED)
......@@ -40,6 +40,9 @@ option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND})
option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ${SWIG_FOUND})
option(ON_TRAVIS "Running test on travis-ci or not." OFF)
option(ON_COVERALLS "Generating code coverage data on coveralls or not." OFF)
option(COVERALLS_UPLOAD "Uploading the generated coveralls json." ON)
if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
"Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
......@@ -49,11 +52,16 @@ endif()
include(enableCXX11)
include(cpplint)
include(ccache)
if(WITH_RDMA)
include(rdma)
endif()
include(util)
include(flags)
include(cudnn)
include(FindPythonModule)
include(check_packages)
include(swig)
include(coveralls)
# add PaddlePaddle version
if(DEFINED ENV{PADDLE_VERSION})
......@@ -129,12 +137,15 @@ else(WITH_PYTHON)
add_definitions(-DPADDLE_NO_PYTHON)
endif(WITH_PYTHON)
if(NOT WITH_RDMA)
add_definitions(-DPADDLE_DISABLE_RDMA)
endif()
if(WITH_RDMA)
include_directories("${RDMA_INC_DIR}")
else(WITH_RDMA)
add_definitions(-DPADDLE_DISABLE_RDMA)
endif(WITH_RDMA)
if(WITH_GLOG)
add_definitions(-DPADDLE_USE_GLOG)
include_directories(${LIBGLOG_INCLUDE_DIR})
endif()
if(WITH_GFLAGS)
......
Thank you for contributing to PaddlePaddle. Submitting an issue is a great help for us.
Both Chinese and English issues are welcome.
It's hard to solve a problem when important details are missing.
Before submitting your issue, please check it against the following criteria.
- [ ] Was a similar issue submitted or resolved before? You can search the existing issues on GitHub.
- [ ] Did you search for your issue on widespread search engines?
- [ ] Is my description of the issue clear enough to reproduce the problem?
  * If an error occurred, we need details such as `how do you run your code?`, `what system do you use?`, `are you using GPU or not?`, etc.
  * If you attach a recording made with [asciinema](https://asciinema.org/) that shows how to reproduce the issue, that's awesome! It helps us solve the problem more quickly.
- [ ] Does my description of the issue use GitHub markdown correctly?
  * Please use the proper markdown syntax for styling all forms of writing, e.g., source code, error messages, etc.
  * Check out [this page](https://guides.github.com/features/mastering-markdown/) to learn more about markdown.
# PaddlePaddle
| **`Linux`** | **`License`** | **`Chat Room`** |
|----------------|---------------|-----------------|
|[![Build Status](https://travis-ci.org/baidu/Paddle.svg?branch=master)](https://travis-ci.org/baidu/Paddle)|[![License](https://img.shields.io/badge/license-Apache%202.0-green.svg)](LICENSE)|[![Join the chat at https://gitter.im/PaddlePaddle/Deep_Learning](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/PaddlePaddle/Deep_Learning?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)|
[![Build Status](https://travis-ci.org/baidu/Paddle.svg?branch=master)](https://travis-ci.org/baidu/Paddle)
[![Coverage Status](https://coveralls.io/repos/github/baidu/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/baidu/Paddle?branch=develop)
[![Join the chat at https://gitter.im/PaddlePaddle/Deep_Learning](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/PaddlePaddle/Deep_Learning?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge)
[![License](https://img.shields.io/badge/license-Apache%202.0-green.svg)](LICENSE)
Welcome to the PaddlePaddle GitHub.
......
# Find the CBlas libraries
# Find the CBlas and lapack libraries
#
# It will search MKL, atlas, OpenBlas, reference-cblas in order.
#
......@@ -19,6 +19,8 @@ set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains MKL")
find_path(MKL_INCLUDE_DIR mkl.h PATHS
${MKL_ROOT}/include)
find_path(MKL_INCLUDE_DIR mkl_lapacke.h PATHS
${MKL_ROOT}/include)
find_library(MKL_CORE_LIB NAMES mkl_core PATHS
${MKL_ROOT}/lib
${MKL_ROOT}/lib/intel64)
......@@ -37,6 +39,7 @@ if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
${MKL_SEQUENTIAL_LIB}
${MKL_CORE_LIB})
add_definitions(-DPADDLE_USE_MKL)
message(STATUS "Found MKL (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
return() # return file.
endif()
......@@ -55,15 +58,19 @@ set(ATLAS_LIB_SEARCH_PATHS
)
find_path(ATLAS_INC_DIR NAMES cblas.h
PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
find_path(ATLAS_CLAPACK_INC_DIR NAMES clapack.h
PATHS ${ATLAS_INCLUDE_SEARCH_PATHS})
find_library(ATLAS_CBLAS_LIB NAMES cblas libcblas.so.3
PATHS ${ATLAS_LIB_SEARCH_PATHS})
find_library(ATLAS_LIB NAMES atlas libatlas.so.3
find_library(ATLAS_LIB NAMES lapack_atlas liblapack_atlas.so.3
PATHS ${ATLAS_LIB_SEARCH_PATHS})
if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB)
set(CBLAS_PROVIDER ATLAS)
set(CBLAS_INC_DIR ${ATLAS_INC_DIR})
set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
set(CBLAS_LIBS ${ATLAS_LIB} ${ATLAS_CBLAS_LIB})
add_definitions(-DPADDLE_USE_ATLAS)
message(STATUS "Found Atlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
return()
endif()
......@@ -83,6 +90,8 @@ set(OPENBLAS_LIB_SEARCH_PATHS
find_path(OPENBLAS_INC_DIR NAMES cblas.h
PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
find_path(OPENBLAS_LAPACKE_INC_DIR NAMES lapacke.h
PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
find_library(OPENBLAS_LIB NAMES openblas
PATHS ${OPENBLAS_LIB_SEARCH_PATHS})
......@@ -90,6 +99,7 @@ if(OPENBLAS_INC_DIR AND OPENBLAS_LIB)
set(CBLAS_PROVIDER OPENBLAS)
set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR})
set(CBLAS_LIBS ${OPENBLAS_LIB})
message(STATUS "Found OpenBlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
return()
endif()
......
# CMake script for code coverage.
# If _COVERALLS_UPLOAD is ON, it will upload the json files to coveralls.io automatically.
# Param _COVERAGE_SRCS A list of coverage source files.
# Param _COVERALLS_UPLOAD Upload the result to coveralls.
# Param _CMAKE_SCRIPT_PATH CMake script path.
function(code_coverage _COVERAGE_SRCS _COVERALLS_UPLOAD _CMAKE_SCRIPT_PATH)
# clean previous gcov data.
file(REMOVE_RECURSE ${PROJECT_BINARY_DIR}/*.gcda)
# Find curl, which will be used to upload the JSON later.
if (_COVERALLS_UPLOAD)
find_program(CURL_EXECUTABLE curl)
if (NOT CURL_EXECUTABLE)
message(FATAL_ERROR "Coveralls: curl not found!")
endif()
endif()
# When passing a CMake list to an external process, the list
# is expanded from the format "1;2;3" to "1 2 3", so we join the
# entries with "*" here; the script splits them back into a list.
set(COVERAGE_SRCS "")
foreach (SINGLE_SRC ${_COVERAGE_SRCS})
set(COVERAGE_SRCS "${COVERAGE_SRCS}*${SINGLE_SRC}")
endforeach()
# query number of logical cores
cmake_host_system_information(RESULT core_size QUERY NUMBER_OF_LOGICAL_CORES)
# coveralls json file.
set(COVERALLS_FILE ${PROJECT_BINARY_DIR}/coveralls.json)
add_custom_target(coveralls_generate
# Run regress tests.
COMMAND ${CMAKE_CTEST_COMMAND}
-j ${core_size}
--output-on-failure
# Generate Gcov and translate it into coveralls JSON.
COMMAND ${CMAKE_COMMAND}
-DCOVERAGE_SRCS="${COVERAGE_SRCS}"
-DCOVERALLS_OUTPUT_FILE="${COVERALLS_FILE}"
-DCOV_PATH="${PROJECT_BINARY_DIR}"
-DPROJECT_ROOT="${PROJECT_SOURCE_DIR}"
-P "${_CMAKE_SCRIPT_PATH}/coverallsGcovJsons.cmake"
WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
COMMENT "Coveralls: generating coveralls output..."
)
if (_COVERALLS_UPLOAD)
message("COVERALLS UPLOAD: ON")
# Upload the JSON to coveralls.
add_custom_target(coveralls_upload
COMMAND ${CURL_EXECUTABLE}
-S -F json_file=@${COVERALLS_FILE}
https://coveralls.io/api/v1/jobs
DEPENDS coveralls_generate
WORKING_DIRECTORY ${PROJECT_BINARY_DIR}
COMMENT "Coveralls: uploading coveralls output...")
add_custom_target(coveralls DEPENDS coveralls_upload)
else()
message("COVERALLS UPLOAD: OFF")
add_custom_target(coveralls DEPENDS coveralls_generate)
endif()
endfunction()
if(ON_COVERALLS)
set(CMAKE_BUILD_TYPE "Debug")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
set(EXCLUDE_DIRS
"demo/"
"build/"
"tests/"
".test_env/"
)
if(WITH_GPU)
file(GLOB_RECURSE PADDLE_SOURCES RELATIVE "${PROJECT_SOURCE_DIR}" "*.cpp" "*.cc" "*.c" "*.cu")
else()
file(GLOB_RECURSE PADDLE_SOURCES RELATIVE "${PROJECT_SOURCE_DIR}" "*.cpp" "*.cc" "*.c")
endif()
# exclude trivial files in PADDLE_SOURCES
foreach(EXCLUDE_DIR ${EXCLUDE_DIRS})
foreach(TMP_PATH ${PADDLE_SOURCES})
string(FIND ${TMP_PATH} ${EXCLUDE_DIR} EXCLUDE_DIR_FOUND)
if(NOT ${EXCLUDE_DIR_FOUND} EQUAL -1)
list(REMOVE_ITEM PADDLE_SOURCES ${TMP_PATH})
endif()
endforeach(TMP_PATH)
endforeach()
# convert to absolute path
set(PADDLE_SRCS "")
foreach(PADDLE_SRC ${PADDLE_SOURCES})
set(PADDLE_SRCS "${PADDLE_SRCS};${PROJECT_SOURCE_DIR}/${PADDLE_SRC}")
endforeach()
code_coverage(
"${PADDLE_SRCS}"
${COVERALLS_UPLOAD}
"${PROJECT_SOURCE_DIR}/cmake"
)
endif()
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
# Copyright (C) 2014 Joakim Söderberg <joakim.soderberg@gmail.com>
#
# This is intended to be run by a custom target in a CMake project like this.
# 0. Compile program with coverage support.
# 1. Clear coverage data. (Recursively delete *.gcda in build dir)
# 2. Run the unit tests.
# 3. Run this script specifying which source files the coverage should be performed on.
#
# This script will then use gcov to generate .gcov files in the directory specified
# via the COV_PATH var. This should probably be the same as your cmake build dir.
#
# It then parses the .gcov files to convert them into the Coveralls JSON format:
# https://coveralls.io/docs/api
#
CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
# Since it's not possible to pass a CMake list properly in the
# "1;2;3" format to an external process, we have replaced the
# ";" with "*", so reverse that here so we get it back into the
# CMake list format.
string(REGEX REPLACE "\\*" ";" COVERAGE_SRCS ${COVERAGE_SRCS})
find_program(GCOV_EXECUTABLE gcov)
if (NOT GCOV_EXECUTABLE)
message(FATAL_ERROR "gcov not found! Aborting...")
endif()
find_package(Git)
# TODO: Add these git things to the coveralls json.
if (GIT_FOUND)
# Branch.
execute_process(
COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref HEAD
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE GIT_BRANCH
OUTPUT_STRIP_TRAILING_WHITESPACE
)
macro (git_log_format FORMAT_CHARS VAR_NAME)
execute_process(
COMMAND ${GIT_EXECUTABLE} log -1 --pretty=format:%${FORMAT_CHARS}
WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}
OUTPUT_VARIABLE ${VAR_NAME}
OUTPUT_STRIP_TRAILING_WHITESPACE
)
endmacro()
git_log_format(an GIT_AUTHOR_NAME)
git_log_format(ae GIT_AUTHOR_EMAIL)
git_log_format(cn GIT_COMMITTER_NAME)
git_log_format(ce GIT_COMMITTER_EMAIL)
git_log_format(B GIT_COMMIT_MESSAGE)
message("Git exe: ${GIT_EXECUTABLE}")
message("Git branch: ${GIT_BRANCH}")
message("Git author: ${GIT_AUTHOR_NAME}")
message("Git e-mail: ${GIT_AUTHOR_EMAIL}")
message("Git committer name: ${GIT_COMMITTER_NAME}")
message("Git committer e-mail: ${GIT_COMMITTER_EMAIL}")
message("Git commit message: ${GIT_COMMIT_MESSAGE}")
endif()
############################# Macros #########################################
#
# This macro converts from the full path format gcov outputs:
#
# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
#
# to the original source file path the .gcov is for:
#
# /path/to/project/root/subdir/the_file.c
#
macro(get_source_path_from_gcov_filename _SRC_FILENAME _GCOV_FILENAME)
# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
# ->
# #path#to#project#root#subdir#the_file.c.gcov
get_filename_component(_GCOV_FILENAME_WEXT ${_GCOV_FILENAME} NAME)
# #path#to#project#root#subdir#the_file.c.gcov -> /path/to/project/root/subdir/the_file.c
string(REGEX REPLACE "\\.gcov$" "" SRC_FILENAME_TMP ${_GCOV_FILENAME_WEXT})
string(REGEX REPLACE "\#" "/" SRC_FILENAME_TMP ${SRC_FILENAME_TMP})
set(${_SRC_FILENAME} "${SRC_FILENAME_TMP}")
endmacro()
##############################################################################
# Get the coverage data.
file(GLOB_RECURSE GCDA_FILES "${COV_PATH}" "*.gcda")
message("GCDA files:")
# Get a list of all the object directories needed by gcov
# (The directories the .gcda files and .o files are found in)
# and run gcov on those.
foreach(GCDA ${GCDA_FILES})
message("Process: ${GCDA}")
message("------------------------------------------------------------------------------")
get_filename_component(GCDA_DIR ${GCDA} PATH)
#
# The -p below refers to "Preserve path components",
# This means that the generated gcov filename of a source file will
# keep the original files entire filepath, but / is replaced with #.
# Example:
#
# /path/to/project/root/build/CMakeFiles/the_file.dir/subdir/the_file.c.gcda
# ------------------------------------------------------------------------------
# File '/path/to/project/root/subdir/the_file.c'
# Lines executed:68.34% of 199
# /path/to/project/root/subdir/the_file.c:creating '#path#to#project#root#subdir#the_file.c.gcov'
#
# If -p is not specified then the file is named only "the_file.c.gcov"
#
execute_process(
COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA}
WORKING_DIRECTORY ${GCDA_DIR}
)
endforeach()
# TODO: Make these be absolute path
file(GLOB_RECURSE ALL_GCOV_FILES "${COV_PATH}" "*.gcov")
# Get only the filenames to use for filtering.
#set(COVERAGE_SRCS_NAMES "")
#foreach (COVSRC ${COVERAGE_SRCS})
# get_filename_component(COVSRC_NAME ${COVSRC} NAME)
# message("${COVSRC} -> ${COVSRC_NAME}")
# list(APPEND COVERAGE_SRCS_NAMES "${COVSRC_NAME}")
#endforeach()
#
# Filter out all but the gcov files we want.
#
# We do this by comparing the list of COVERAGE_SRCS filepaths that the
# user wants the coverage data for with the paths of the generated .gcov files,
# so that we only keep the relevant gcov files.
#
# Example:
# COVERAGE_SRCS =
# /path/to/project/root/subdir/the_file.c
#
# ALL_GCOV_FILES =
# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
# /path/to/project/root/build/#path#to#project#root#subdir#other_file.c.gcov
#
# Result should be:
# GCOV_FILES =
# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
#
set(GCOV_FILES "")
#message("Look in coverage sources: ${COVERAGE_SRCS}")
message("\nFilter out unwanted GCOV files:")
message("===============================")
set(COVERAGE_SRCS_REMAINING ${COVERAGE_SRCS})
foreach (GCOV_FILE ${ALL_GCOV_FILES})
#
# /path/to/project/root/build/#path#to#project#root#subdir#the_file.c.gcov
# ->
# /path/to/project/root/subdir/the_file.c
get_source_path_from_gcov_filename(GCOV_SRC_PATH ${GCOV_FILE})
# Is this in the list of source files?
# TODO: We want to match against relative path filenames from the source file root...
list(FIND COVERAGE_SRCS ${GCOV_SRC_PATH} WAS_FOUND)
if (NOT WAS_FOUND EQUAL -1)
message("YES: ${GCOV_FILE}")
list(APPEND GCOV_FILES ${GCOV_FILE})
# We remove it from the list, so we don't bother searching for it again.
# Also files left in COVERAGE_SRCS_REMAINING after this loop ends should
# have coverage data generated from them (no lines are covered).
list(REMOVE_ITEM COVERAGE_SRCS_REMAINING ${GCOV_SRC_PATH})
else()
message("NO: ${GCOV_FILE}")
endif()
endforeach()
# TODO: Enable setting these
set(JSON_SERVICE_NAME "travis-ci")
set(JSON_SERVICE_JOB_ID $ENV{TRAVIS_JOB_ID})
set(JSON_TEMPLATE
"{
\"service_name\": \"\@JSON_SERVICE_NAME\@\",
\"service_job_id\": \"\@JSON_SERVICE_JOB_ID\@\",
\"source_files\": \@JSON_GCOV_FILES\@
}"
)
set(SRC_FILE_TEMPLATE
"{
\"name\": \"\@GCOV_SRC_REL_PATH\@\",
\"source_digest\": \"\@GCOV_CONTENTS_MD5\@\",
\"coverage\": \@GCOV_FILE_COVERAGE\@
}"
)
message("\nGenerate JSON for files:")
message("=========================")
set(JSON_GCOV_FILES "[")
# Read the GCOV files line by line and get the coverage data.
foreach (GCOV_FILE ${GCOV_FILES})
get_source_path_from_gcov_filename(GCOV_SRC_PATH ${GCOV_FILE})
file(RELATIVE_PATH GCOV_SRC_REL_PATH "${PROJECT_ROOT}" "${GCOV_SRC_PATH}")
# The new coveralls API doesn't need the entire source (Yay!)
# However, still keeping that part for now. Will cleanup in the future.
file(MD5 "${GCOV_SRC_PATH}" GCOV_CONTENTS_MD5)
message("MD5: ${GCOV_SRC_PATH} = ${GCOV_CONTENTS_MD5}")
# Loads the gcov file as a list of lines.
# (We first open the file and replace all occurrences of [] with _
# because CMake will fail to parse a line containing unmatched brackets...
# also, the backslashes that escape \n in macros screw things up.)
# https://public.kitware.com/Bug/view.php?id=15369
file(READ ${GCOV_FILE} GCOV_CONTENTS)
string(REPLACE "[" "_" GCOV_CONTENTS "${GCOV_CONTENTS}")
string(REPLACE "]" "_" GCOV_CONTENTS "${GCOV_CONTENTS}")
string(REPLACE "\\" "_" GCOV_CONTENTS "${GCOV_CONTENTS}")
file(WRITE ${GCOV_FILE}_tmp "${GCOV_CONTENTS}")
file(STRINGS ${GCOV_FILE}_tmp GCOV_LINES)
list(LENGTH GCOV_LINES LINE_COUNT)
# Instead of trying to parse the source from the
# gcov file, simply read the file contents from the source file.
# (Parsing it from the gcov is hard because C code uses ; in many places,
# which also happens to be the CMake list delimiter).
file(READ ${GCOV_SRC_PATH} GCOV_FILE_SOURCE)
string(REPLACE "\\" "\\\\" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
string(REGEX REPLACE "\"" "\\\\\"" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
string(REPLACE "\t" "\\\\t" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
string(REPLACE "\r" "\\\\r" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
string(REPLACE "\n" "\\\\n" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
# According to http://json.org/ these should be escaped as well.
# Don't know how to do that in CMake however...
#string(REPLACE "\b" "\\\\b" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
#string(REPLACE "\f" "\\\\f" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
#string(REGEX REPLACE "\u([a-fA-F0-9]{4})" "\\\\u\\1" GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}")
# We want a json array of coverage data as a single string
# start building them from the contents of the .gcov
set(GCOV_FILE_COVERAGE "[")
set(GCOV_LINE_COUNT 1) # Line number for the .gcov.
set(DO_SKIP 0)
foreach (GCOV_LINE ${GCOV_LINES})
#message("${GCOV_LINE}")
# Example of what we're parsing:
# Hitcount |Line | Source
# " 8: 26: if (!allowed || (strlen(allowed) == 0))"
string(REGEX REPLACE
"^([^:]*):([^:]*):(.*)$"
"\\1;\\2;\\3"
RES
"${GCOV_LINE}")
# Check if we should exclude lines using the Lcov syntax.
string(REGEX MATCH "LCOV_EXCL_START" START_SKIP "${GCOV_LINE}")
string(REGEX MATCH "LCOV_EXCL_END" END_SKIP "${GCOV_LINE}")
string(REGEX MATCH "LCOV_EXCL_LINE" LINE_SKIP "${GCOV_LINE}")
set(RESET_SKIP 0)
if (LINE_SKIP AND NOT DO_SKIP)
set(DO_SKIP 1)
set(RESET_SKIP 1)
endif()
if (START_SKIP)
set(DO_SKIP 1)
message("${GCOV_LINE_COUNT}: Start skip")
endif()
if (END_SKIP)
set(DO_SKIP 0)
endif()
list(LENGTH RES RES_COUNT)
if (RES_COUNT GREATER 2)
list(GET RES 0 HITCOUNT)
list(GET RES 1 LINE)
list(GET RES 2 SOURCE)
string(STRIP ${HITCOUNT} HITCOUNT)
string(STRIP ${LINE} LINE)
# Lines with 0 line numbers are metadata and can be ignored.
if (NOT ${LINE} EQUAL 0)
if (DO_SKIP)
set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}null, ")
else()
# Translate the hitcount into valid JSON values.
if (${HITCOUNT} STREQUAL "#####")
set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}0, ")
elseif (${HITCOUNT} STREQUAL "-")
set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}null, ")
else()
set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}${HITCOUNT}, ")
endif()
endif()
endif()
else()
message(WARNING "Failed to properly parse line (RES_COUNT = ${RES_COUNT}) ${GCOV_FILE}:${GCOV_LINE_COUNT}\n-->${GCOV_LINE}")
endif()
if (RESET_SKIP)
set(DO_SKIP 0)
endif()
math(EXPR GCOV_LINE_COUNT "${GCOV_LINE_COUNT}+1")
endforeach()
message("${GCOV_LINE_COUNT} of ${LINE_COUNT} lines read!")
# Advanced way of removing the trailing comma in the JSON array.
# "[1, 2, 3, " -> "[1, 2, 3"
string(REGEX REPLACE ",[ ]*$" "" GCOV_FILE_COVERAGE ${GCOV_FILE_COVERAGE})
# Append the trailing ] to complete the JSON array.
set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]")
# Generate the final JSON for this file.
message("Generate JSON for file: ${GCOV_SRC_REL_PATH}...")
string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON)
set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ")
endforeach()
# Loop through all files we couldn't find any coverage for
# as well, and generate JSON for those as well with 0% coverage.
foreach(NOT_COVERED_SRC ${COVERAGE_SRCS_REMAINING})
# Loads the source file as a list of lines.
file(STRINGS ${NOT_COVERED_SRC} SRC_LINES)
set(GCOV_FILE_COVERAGE "[")
set(GCOV_FILE_SOURCE "")
foreach (SOURCE ${SRC_LINES})
set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}0, ")
string(REPLACE "\\" "\\\\" SOURCE "${SOURCE}")
string(REGEX REPLACE "\"" "\\\\\"" SOURCE "${SOURCE}")
string(REPLACE "\t" "\\\\t" SOURCE "${SOURCE}")
string(REPLACE "\r" "\\\\r" SOURCE "${SOURCE}")
set(GCOV_FILE_SOURCE "${GCOV_FILE_SOURCE}${SOURCE}\\n")
endforeach()
# Remove trailing comma, and complete JSON array with ]
string(REGEX REPLACE ",[ ]*$" "" GCOV_FILE_COVERAGE ${GCOV_FILE_COVERAGE})
set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]")
# Generate the final JSON for this file.
message("Generate JSON for non-gcov file: ${NOT_COVERED_SRC}...")
string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON)
set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ")
endforeach()
# Get rid of trailing comma.
string(REGEX REPLACE ",[ ]*$" "" JSON_GCOV_FILES ${JSON_GCOV_FILES})
set(JSON_GCOV_FILES "${JSON_GCOV_FILES}]")
# Generate the final complete JSON!
message("Generate final JSON...")
string(CONFIGURE ${JSON_TEMPLATE} JSON)
file(WRITE "${COVERALLS_OUTPUT_FILE}" "${JSON}")
message("###########################################################################")
message("Generated coveralls JSON containing coverage data:")
message("${COVERALLS_OUTPUT_FILE}")
message("###########################################################################")
......@@ -21,12 +21,6 @@ function(safe_set_flag is_c src_list flag_name)
endif()
if(${safe_name})
set(${src_list} "${${src_list}} ${flag_name}" PARENT_SCOPE)
if(is_c)
set(CUDA_NVCC_FLAGS
--compiler-options;${flag_name}
${CUDA_NVCC_FLAGS}
PARENT_SCOPE)
endif()
endif()
endfunction()
......@@ -40,6 +34,20 @@ macro(safe_set_cxxflag src_list flag_name)
safe_set_flag(OFF ${src_list} ${flag_name})
endmacro()
# helper macro to set nvcc flag
macro(safe_set_nvflag flag_name)
string(REPLACE "-" "_" safe_name ${flag_name})
string(REPLACE "=" "_" safe_name ${safe_name})
CHECK_C_COMPILER_FLAG(${flag_name} C_COMPILER_SUPPORT_FLAG_${safe_name})
set(safe_name C_COMPILER_SUPPORT_FLAG_${safe_name})
if(${safe_name})
set(CUDA_NVCC_FLAGS
--compiler-options;${flag_name}
${CUDA_NVCC_FLAGS})
endif()
endmacro()
CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS)
if(NOT UINT64_MAX_EXISTS)
set(CMAKE_REQUIRED_DEFINITIONS -D__STDC_LIMIT_MACROS)
......@@ -63,20 +71,43 @@ set(COMMON_FLAGS
-Wnon-virtual-dtor
-Wdelete-non-virtual-dtor
-Wno-unused-parameter
-Wno-unused-function
-Wno-error=literal-suffix
-Wno-error=unused-local-typedefs)
set(GPU_COMMON_FLAGS
-fPIC
-fno-omit-frame-pointer
-Wnon-virtual-dtor
-Wdelete-non-virtual-dtor
-Wno-unused-parameter
-Wno-unused-function
-Wno-error=literal-suffix
-Wno-error=unused-local-typedefs
-Wno-error=unused-function # Warnings in Numpy Header.
)
if (APPLE)
# On Mac OS X build fat binaries with x86_64 architectures by default.
set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
else()
set(GPU_COMMON_FLAGS
-Wall
-Wextra
-Werror
${GPU_COMMON_FLAGS})
endif()
foreach(flag ${COMMON_FLAGS})
safe_set_cflag(CMAKE_C_FLAGS ${flag})
safe_set_cxxflag(CMAKE_CXX_FLAGS ${flag})
endforeach()
# On Mac OS X build fat binaries with x86_64 architectures by default.
if (APPLE)
set (CMAKE_OSX_ARCHITECTURES "x86_64" CACHE STRING "Build architectures for OSX" FORCE)
endif ()
foreach(flag ${GPU_COMMON_FLAGS})
safe_set_nvflag(${flag})
endforeach()
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here.
......
# The user should download rdma first from the subversion repository.
# Execute the following instructions to download it from svn manually:
# svn co https://svn.baidu.com/sys/ip/trunk/rdma/sockrdmav1 rdma/
# svn co https://svn.baidu.com/sys/ip/trunk/rdma/thirdparty rdma/
# We use the static outputs from the svn repositories to avoid implicit bugs caused by a non-standard runtime environment.
set(RDMA_ROOT $ENV{RDMA_ROOT} CACHE PATH "Folder contains RDMA sock library and thirdparty library")
function(generate_rdma_links)
#redirect to the current DIR to isolate the pollution from the system runtime environment;
#it benefits unified control across different gcc environments.
#e.g., by default gcc48 does not refer to /usr/lib64, which could contain low-version
#runtime libraries that crash the process while loading. This redirect trick
#fixes it.
execute_process(
COMMAND mkdir -p librdma
COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so.1
COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so
COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so.1
COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)
endfunction(generate_rdma_links)
#check and set headers
find_path(RDMA_INC_SXISOCK sxi_sock.h PATHS ${RDMA_ROOT}/sockrdmav1/output/include)
find_path(RDMA_INC_XIO libxio.h PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
find_path(RDMA_INC_EVENT event2 PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_path(RDMA_INC_NUMA numa.h PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
#check and set libs
find_library(RDMA_LIB_SXISOCK NAMES sxisock PATHS ${RDMA_ROOT}/sockrdmav1/output)
find_library(RDMA_LIB_XIO NAMES xio PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
find_library(RDMA_LIB_EVENT NAMES event PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_EVENT_CORE NAMES event_core PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_EVENT_EXTRA NAMES event_extra PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_EVENT_PTHREADS NAMES event_pthreads PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_NUMA NAMES numa PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
if(
RDMA_INC_SXISOCK AND
RDMA_INC_XIO AND
RDMA_INC_EVENT AND
RDMA_INC_NUMA AND
RDMA_LIB_SXISOCK AND
RDMA_LIB_XIO AND
RDMA_LIB_EVENT AND
RDMA_LIB_EVENT_CORE AND
RDMA_LIB_EVENT_EXTRA AND
RDMA_LIB_EVENT_PTHREADS AND
RDMA_LIB_NUMA
)
set(RDMA_INC_DIR
${RDMA_INC_SXISOCK}
${RDMA_INC_XIO}
${RDMA_INC_EVENT}
${RDMA_INC_NUMA})
set(RDMA_LIBS
${RDMA_LIB_SXISOCK}
${RDMA_LIB_XIO}
${RDMA_LIB_EVENT}
${RDMA_LIB_EVENT_CORE}
${RDMA_LIB_EVENT_EXTRA}
${RDMA_LIB_EVENT_PTHREADS}
${RDMA_LIB_NUMA}
)
set(RDMA_LD_FLAGS "-L./librdma -libverbs -lrdmacm -Xlinker -rpath ./librdma")
return()
endif()
#if this module is not called, RDMA_INC_DIR and RDMA_LIBS will be empty, so the top-level module can always refer to these variables
message(FATAL_ERROR "RDMA libraries are not found, try to set RDMA_ROOT or check all related libraries.")
find_program(
SWIG_BINARY_PATH
swig)
if(${SWIG_BINARY_PATH} STREQUAL "SWIG_BINARY_PATH-NOTFOUND")
set(SWIG_FOUND OFF)
else()
set(SWIG_FOUND ON)
endif()
set(MIN_SWIG_VERSION 2)
if(SWIG_FOUND)
execute_process(COMMAND sh -c "${SWIG_BINARY_PATH} -version | grep Version | cut -f3 -d' '"
OUTPUT_VARIABLE _SWIG_VERSION
OUTPUT_STRIP_TRAILING_WHITESPACE)
if(${_SWIG_VERSION} VERSION_LESS ${MIN_SWIG_VERSION})
message("swig version ${MIN_SWIG_VERSION} or greater is needed for generating python api. "
"Only version ${_SWIG_VERSION} is found. Set SWIG_FOUND to FALSE")
set(SWIG_FOUND FALSE)
endif(${_SWIG_VERSION} VERSION_LESS ${MIN_SWIG_VERSION})
endif(SWIG_FOUND)
function(generate_python_api target_name)
add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
${PROJ_ROOT}/paddle/Paddle_wrap.cxx
......@@ -27,6 +5,7 @@ function(generate_python_api target_name)
COMMAND swig -python -c++ -outcurrentdir -I../ api/Paddle.swig
&& mv ${PROJ_ROOT}/paddle/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
DEPENDS ${PROJ_ROOT}/paddle/api/Paddle.swig
${PROJ_ROOT}/paddle/api/PaddleAPI.h
WORKING_DIRECTORY ${PROJ_ROOT}/paddle
COMMENT "Generate Python API from swig")
add_custom_target(${target_name} ALL DEPENDS
......
......@@ -67,6 +67,10 @@ endmacro()
#
# It will handle WITH_PYTHON/WITH_GLOG etc.
function(link_paddle_exe TARGET_NAME)
if(WITH_RDMA)
generate_rdma_links()
endif()
if(WITH_METRIC)
if(WITH_GPU)
set(METRIC_LIBS paddle_metric_learning paddle_dserver_lib metric metric_cpu)
......@@ -109,6 +113,12 @@ function(link_paddle_exe TARGET_NAME)
${ZLIB_LIBRARIES}
${INTERAL_LIBS}
${CMAKE_DL_LIBS})
if(WITH_RDMA)
target_link_libraries(${TARGET_NAME}
${RDMA_LD_FLAGS}
${RDMA_LIBS})
endif()
if(WITH_PYTHON)
target_link_libraries(${TARGET_NAME}
......
This folder contains the scripts used in the PaddlePaddle introduction.
- use `bash train.sh` to train a simple linear regression model
- use `python evaluate_model.py` to read model parameters. You can see that `w` and `b` are very close to [2, 0.3].
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer.PyDataProvider2 import *
import random
# define data types of input: 2 real numbers
@provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
def process(settings, input_file):
for i in xrange(2000):
x = random.random()
yield [x], [2*x+0.3]
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Print model parameters in last model
Usage:
python evaluate_model.py
"""
import numpy as np
import os
def load(file_name):
with open(file_name, 'rb') as f:
f.read(16) # skip header for float type.
return np.fromfile(f, dtype=np.float32)
def main():
print 'w=%.6f, b=%.6f from pass 29' % (load('output/pass-00029/w'),
load('output/pass-00029/b'))
if __name__ == '__main__':
main()
#!/bin/bash
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
paddle train \
--config=trainer_config.py \
--save_dir=./output \
--num_passes=30 \
2>&1 |tee 'train.log'
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
# 1. read data. Suppose you saved the above python code as dataprovider.py
data_file = 'empty.list'
with open(data_file, 'w') as f: f.writelines(' ')
define_py_data_sources2(train_list=data_file, test_list=None,
module='dataprovider', obj='process',args={})
# 2. learning algorithm
settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
# 3. Network configuration
x = data_layer(name='x', size=1)
y = data_layer(name='y', size=1)
y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
cost = regression_cost(input=y_predict, label=y)
outputs(cost)
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import itertools
import random
from paddle.trainer.config_parser import parse_config
from py_paddle import swig_paddle as api
from py_paddle import DataProviderConverter
from paddle.trainer.PyDataProvider2 \
import integer_value, integer_value_sequence, sparse_binary_vector
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument("--train_data",
type=str, required=False, help="train data file")
parser.add_argument("--test_data", type=str, help="test data file")
parser.add_argument("--config",
type=str, required=True, help="config file name")
parser.add_argument("--dict_file", required=True, help="dictionary file")
parser.add_argument("--seq",
default=1, type=int,
help="whether use sequence training")
parser.add_argument("--use_gpu", default=0, type=int,
help="whether use GPU for training")
parser.add_argument("--trainer_count", default=1, type=int,
help="Number of threads for training")
parser.add_argument("--num_passes", default=5, type=int,
help="Number of training passes")
return parser.parse_args()
UNK_IDX = 0
def load_data(file_name, word_dict):
with open(file_name, 'r') as f:
for line in f:
label, comment = line.strip().split('\t')
words = comment.split()
word_slot = [word_dict.get(w, UNK_IDX) for w in words]
yield word_slot, int(label)
def load_dict(dict_file):
word_dict = dict()
with open(dict_file, 'r') as f:
for i, line in enumerate(f):
w = line.strip().split()[0]
word_dict[w] = i
return word_dict
def main():
options = parse_arguments()
api.initPaddle("--use_gpu=%s" % options.use_gpu,
"--trainer_count=%s" % options.trainer_count)
word_dict = load_dict(options.dict_file)
train_dataset = list(load_data(options.train_data, word_dict))
if options.test_data:
test_dataset = list(load_data(options.test_data, word_dict))
else:
test_dataset = None
trainer_config = parse_config(options.config,
"dict_file=%s" % options.dict_file)
# No need to have data provider for trainer
trainer_config.ClearField('data_config')
trainer_config.ClearField('test_data_config')
# create a GradientMachine from the model configuration
model = api.GradientMachine.createFromConfigProto(
trainer_config.model_config)
# create a trainer for the gradient machine
trainer = api.Trainer.create(trainer_config, model)
# create a data converter which converts data to PaddlePaddle
# internal format
input_types = [
integer_value_sequence(len(word_dict)) if options.seq
else sparse_binary_vector(len(word_dict)),
integer_value(2)]
converter = DataProviderConverter(input_types)
batch_size = trainer_config.opt_config.batch_size
trainer.startTrain()
for train_pass in xrange(options.num_passes):
trainer.startTrainPass()
random.shuffle(train_dataset)
for pos in xrange(0, len(train_dataset), batch_size):
batch = itertools.islice(train_dataset, pos, pos + batch_size)
size = min(batch_size, len(train_dataset) - pos)
trainer.trainOneDataBatch(size, converter(batch))
trainer.finishTrainPass()
if test_dataset:
trainer.startTestPeriod()
for pos in xrange(0, len(test_dataset), batch_size):
batch = itertools.islice(test_dataset, pos, pos + batch_size)
size = min(batch_size, len(test_dataset) - pos)
trainer.testOneDataBatch(size, converter(batch))
trainer.finishTestPeriod()
trainer.finishTrain()
if __name__ == '__main__':
main()
#!/bin/bash
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
set -e
# Note: if using trainer_config.emb.py, trainer_config.cnn.py
# or trainer_config.lstm.py, you need to change --seq to --seq=1
# because they are sequence models.
python api_train.py \
--config=trainer_config.lr.py \
--trainer_count=2 \
--num_passes=15 \
--use_gpu=0 \
--seq=0 \
--train_data=data/train.txt \
--test_data=data/test.txt \
--dict_file=data/dict.txt \
2>&1 | tee 'train.log'
......@@ -16,6 +16,7 @@ from paddle.trainer.PyDataProvider2 import *
UNK_IDX = 0
def initializer(settings, dictionary, **kwargs):
settings.word_dict = dictionary
settings.input_types = [
......
......@@ -24,7 +24,7 @@ paddle train \
--config=$cfg \
--save_dir=./output \
--trainer_count=4 \
--log_period=20 \
--log_period=100 \
--num_passes=15 \
--use_gpu=false \
--show_parameter_stats_period=100 \
......
......@@ -16,7 +16,7 @@
from paddle.trainer_config_helpers import *
dict_file = "./data/dict.txt"
dict_file = get_config_arg('dict_file', str, "./data/dict.txt")
word_dict = dict()
with open(dict_file, 'r') as f:
for i, line in enumerate(f):
......@@ -63,7 +63,6 @@ if not is_predict:
label = data_layer(name="label", size=2)
# Define cross-entropy classification loss and error.
classification_cost(input=output, label=label)
cls = classification_cost(input=output, label=label)
outputs(cls)
else:
......
......@@ -42,20 +42,13 @@ settings(
gradient_clipping_threshold=25
)
bias_attr = ParamAttr(initial_std=0.,l2_rate=0.)
data = data_layer(name="word", size=len(word_dict))
emb = embedding_layer(input=data, size=128)
fc = fc_layer(input=emb, size=512,
act=LinearActivation(),
bias_attr=bias_attr,
layer_attr=ExtraAttr(drop_rate=0.1))
lstm = lstmemory(input=fc, act=TanhActivation(),
bias_attr=bias_attr,
layer_attr=ExtraAttr(drop_rate=0.25))
lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling())
output = fc_layer(input=lstm_last, size=2,
bias_attr=bias_attr,
lstm = simple_lstm(input=emb, size=128,
lstm_cell_attr=ExtraAttr(drop_rate=0.25))
lstm_max = pooling_layer(input=lstm, pooling_type=MaxPooling())
output = fc_layer(input=lstm_max, size=2,
act=SoftmaxActivation())
if is_predict:
maxid = maxid_layer(output)
......
*.pyc
train.log
data/feature
data/conll05st-release/
data/src.dict
data/test.wsj.props
data/test.wsj.seq_pair
data/test.wsj.words
data/tgt.dict
output
......@@ -46,8 +46,8 @@ class SentimentPrediction():
conf = parse_config(train_conf, "is_predict=1")
self.network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config)
self.network.loadParameters(self.model_dir)
slots = [integer_value_sequence(self.dict_dim)]
self.converter = DataProviderConverter(slots)
input_types = [integer_value_sequence(self.dict_dim)]
self.converter = DataProviderConverter(input_types)
def load_dict(self):
"""
......
......@@ -153,12 +153,12 @@ As a simple example, consider the following:
- **Only CPU**
```bash
cmake .. -DWITH_GPU=OFF -DWITH_DOC=OFF
cmake .. -DWITH_GPU=OFF
```
- **GPU**
```bash
cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF
cmake .. -DWITH_GPU=ON
```
- **GPU with doc and swig**
......@@ -171,7 +171,7 @@ Finally, you can build PaddlePaddle:
```bash
# you can add build option here, such as:
cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF -DCMAKE_INSTALL_PREFIX=<path to install>
cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX=<path to install>
# please use sudo make install, if you want to install PaddlePaddle into the system
make -j `nproc` && make install
# set PaddlePaddle installation path in ~/.bashrc
......@@ -219,10 +219,9 @@ easy_install pip
# Install google test on Mac OS X
# Download gtest 1.7.0
wget https://github.com/google/googletest/archive/release-1.7.0.tar.gz
tar -xvf googletest-release-1.7.0.tar.gz && cd googletest-release-1.7.0
tar -xzf googletest-release-1.7.0.tar.gz && cd googletest-release-1.7.0
# Build gtest
mkdir build && cmake ..
make
mkdir build && cd build && cmake .. && make
# Install gtest library
sudo cp -r ../include/gtest /usr/local/include/
sudo cp lib*.a /usr/local/lib
......@@ -246,7 +245,7 @@ easy_install pip
```bash
sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local
sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib/libcudnn*
```
2. Then you need to set DYLD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc.
......@@ -273,12 +272,12 @@ As a simple example, consider the following:
- **Only CPU**
```bash
cmake .. -DWITH_GPU=OFF -DWITH_DOC=OFF
cmake .. -DWITH_GPU=OFF
```
- **GPU**
```bash
cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF
cmake .. -DWITH_GPU=ON
```
- **GPU with doc and swig**
......@@ -291,9 +290,9 @@ Finally, you can build PaddlePaddle:
```bash
# you can add build option here, such as:
cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF -DCMAKE_INSTALL_PREFIX=<installation path>
cmake .. -DWITH_GPU=ON -DCMAKE_INSTALL_PREFIX=<installation path>
# please use sudo make install, if you want to install PaddlePaddle into the system
make -j `nproc` && make install
make -j `sysctl -n hw.ncpu` && make install
# set PaddlePaddle installation path in ~/.bashrc
export PATH=<installation path>/bin:$PATH
```
......
......@@ -4,7 +4,7 @@ We sincerely appreciate your contributions. You can use fork and pull request
workflow to merge your code.
## Code Requirements
- Your code mush be fully documented by
- Your code must be fully documented by
[doxygen](http://www.stack.nl/~dimitri/doxygen/) style.
- Make sure the compiler option WITH\_STYLE\_CHECK is on and the compiler
passes the code style check.
......@@ -20,16 +20,30 @@ It's just that simple.
## Clone
Paddle currently uses the [git-flow branching model](http://nvie.com/posts/a-successful-git-branching-model/).
The **develop** branch is the main branch, and users' own branches are feature branches.
Once you've created a fork, you can use your favorite git client to clone your
repo or just head straight to the command line:
```shell
# Clone your fork to your local machine
git clone https://github.com/USERNAME/Paddle.git
git clone --branch develop https://github.com/USERNAME/Paddle.git
```
If your repository doesn't contain a **develop** branch, just create it yourself.
```shell
git clone https://github.com/USERNAME/Paddle.git Paddle
cd Paddle
git checkout -b develop # create develop branch.
git remote add upstream https://github.com/baidu/Paddle.git # add upstream to baidu/Paddle
git pull upstream develop # update to upstream
```
Then you can start developing by creating a local development branch
```shell
git checkout -b MY_COOL_STUFF_BRANCH origin/master
git checkout -b MY_COOL_STUFF_BRANCH
```
## Commit
......@@ -41,7 +55,7 @@ Commit your changes by following command lines:
git status
# add modified files
git add xx
git commit -m "commit info"
env EDITOR=vim git commit # You can write your comments by vim/nano/emacs.
```
The first line of the commit message is the title. The second and later lines
are the details, if any.
......@@ -63,7 +77,7 @@ git remote -v
Update your fork with the latest upstream changes:
```shell
git pull --rebase upstream HEAD
git pull --rebase upstream develop
```
If there are no unique commits locally, git will simply perform a fast-forward.
......@@ -76,7 +90,7 @@ Now, your local master branch is up-to-date with everything modified upstream.
```shell
# push to your repository in Github
git push origin HEAD
git push -u origin MY_COOL_STUFF_BRANCH # create remote branch MY_COOL_STUFF_BRANCH to origin.
```
## Pull Request
......@@ -93,13 +107,24 @@ of conflict, you need to do the update manually. You need to do the following on
your local repository:
```shell
git checkout MY_COOL_STUFF_BRANCH
git pull --rebase upstream HEAD
git pull upstream develop
# You may need to resolve the conflict according to the git prompt.
# Make and test your code.
git push -f origin HEAD
git push origin MY_COOL_STUFF_BRANCH
```
Now your Pull Request is updated with the latest version.
## Revise your pull request
When you revise your pull request according to the reviewers' comments, please use 'git commit' instead of 'git commit --amend' to commit your changes, so that the reviewers can see the difference between the new pull request and the old one.
The possible commands are
```shell
git checkout MY_COOL_STUFF_BRANCH
git pull upstream develop # update local to newest code base.
# Some conflicts may occur.
# And develop your cool stuff
env EDITOR=vim git commit # add your revise log
git push origin MY_COOL_STUFF_BRANCH
```
......@@ -3,6 +3,7 @@ PaddlePaddle Documentation
User Guide
----------
* [Introduction](introduction/index.md)
* [Quick Start](demo/quick_start/index_en.md)
* [Build and Installation](build/index.rst)
* [Contribute Code](build/contribute_to_paddle.md)
......
# Introduction
PaddlePaddle is a deep learning platform open-sourced by Baidu. With PaddlePaddle, you can easily train a classic neural network within a couple of lines of configuration, or you can build sophisticated models that provide state-of-the-art performance on difficult learning tasks like sentiment analysis, machine translation, image captioning and so on.
## 1. A Classic Problem
Now, to give you a hint of what using PaddlePaddle looks like, let's start with a fundamental learning problem - <a href="https://en.wikipedia.org/wiki/Simple_linear_regression">**simple linear regression**</a> : you have observed a set of two-dimensional data points of `X` and `Y`, where `X` is an explanatory variable and `Y` is the corresponding dependent variable, and you want to recover the underlying correlation between `X` and `Y`. Linear regression can be used in many practical scenarios. For example, `X` can be a variable about house size, and `Y` a variable about house price. You can build a model that captures the relationship between them by observing the real estate market.
## 2. Prepare the Data
Suppose the true relationship can be characterized as `Y = 2X + 0.3`; let's see how to recover this pattern from observed data alone. Here is a piece of python code that feeds synthetic data to PaddlePaddle. The code is pretty self-explanatory; the only extra thing you need to add for PaddlePaddle is a definition of the input data types.
```python
# dataprovider.py
from paddle.trainer.PyDataProvider2 import *
import random
# define data types of input: 2 real numbers
@provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
def process(settings, input_file):
for i in xrange(2000):
x = random.random()
yield [x], [2*x+0.3]
```
## 3. Train a NeuralNetwork in PaddlePaddle
To recover this relationship between `X` and `Y`, we use a neural network with one layer of linear activation units and a square error cost layer. Don't worry if you are not familiar with these terms; it just means that we start from a random line `Y' = wX + b` and then gradually adapt `w` and `b` to minimize the difference between `Y'` and `Y`. Here is what it looks like in PaddlePaddle:
```python
# trainer_config.py
from paddle.trainer_config_helpers import *
# 1. read data. Suppose you saved the above python code as dataprovider.py
data_file = 'empty.list'
with open(data_file, 'w') as f: f.writelines(' ')
define_py_data_sources2(train_list=data_file, test_list=None,
module='dataprovider', obj='process',args={})
# 2. learning algorithm
settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
# 3. Network configuration
x = data_layer(name='x', size=1)
y = data_layer(name='y', size=1)
y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
cost = regression_cost(input=y_predict, label=y)
outputs(cost)
```
Some of the most fundamental usages of PaddlePaddle are demonstrated:
- The first part shows how to feed data into PaddlePaddle. In general, PaddlePaddle reads raw data from a list of files and then does some user-defined processing to get the real input. In this case, we only need to create a placeholder file since we are generating synthetic data on the fly.
- The second part describes the learning algorithm. It defines how adjustments are made to the model parameters. PaddlePaddle provides a rich set of optimizers, but a simple momentum-based optimizer will suffice here; it processes 12 data points each time.
- Finally, the network configuration. It is usually as simple as "stacking" layers. Three kinds of layers are used in this configuration:
  - **Data Layer**: a network always starts with one or more data layers. They provide input data to the rest of the network. In this problem, two data layers are used, for `X` and `Y` respectively.
  - **FC Layer**: FC layer is short for Fully Connected Layer, which connects all the input units to the current layer and performs the actual computation specified by its activation function. Computation layers like this are the fundamental building blocks of a deeper model.
  - **Cost Layer**: in the training phase, cost layers are usually the last layers of the network. They measure the performance of the current model and provide guidance for adjusting the parameters.
Now that everything is ready, you can train the network with a simple command line call:
```
paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
```
This means that PaddlePaddle will train this network on the synthetic dataset for 30 passes and save all the models under the path `./output`. You will see from the messages printed during the training phase that the model cost decreases over time, which indicates that we are getting a closer guess.
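For reference, the quantity that training drives down is the squared error between the line's predictions and the observed values. Written out (with `N` standing for the number of observed pairs, a symbol introduced here only for illustration), the objective is:

```latex
\min_{w,\,b}\ \frac{1}{N}\sum_{i=1}^{N}\bigl((w X_i + b) - Y_i\bigr)^2
```

The `w` and `b` that minimize this objective are exactly the parameters we inspect in the next section.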
## 4. Evaluate the Model
Usually, a different dataset that was left out during the training phase should be used to evaluate the models. However, we are lucky enough to know the real answer: `w=2, b=0.3`, so a better option is to check the model parameters directly.
In PaddlePaddle, training just produces a collection of model parameters, which are `w` and `b` in this case. Each parameter is saved in an individual file in the popular `numpy` array format. Here is the code that reads the parameters from the last pass.
```python
import numpy as np
import os
def load(file_name):
with open(file_name, 'rb') as f:
f.read(16) # skip header for float type.
return np.fromfile(f, dtype=np.float32)
print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
# w=1.999743, b=0.300137
```
<center> ![](./parameters.png) </center>
Although it starts from a random guess, you can see that the value of `w` changes quickly towards 2 and `b` changes quickly towards 0.3. In the end, the predicted line is almost identical to the real answer.
There, you have recovered the underlying pattern between `X` and `Y` only from observed data.
## 5. Where to Go from Here
- <a href="../build/index.html"> Build and Installation </a>
- <a href="../demo/quick_start/index_en.html">Quick Start</a>
- <a href="../demo/index.html">Example and Demo</a>
../../doc_cn/introduction/parameters.png
\ No newline at end of file
......@@ -32,6 +32,13 @@ LinearActivation
.. automodule:: paddle.trainer_config_helpers.activations
:members: LinearActivation
:noindex:
LogActivation
==================
.. automodule:: paddle.trainer_config_helpers.activations
:members: LogActivation
:noindex:
SquareActivation
================
......
......@@ -183,7 +183,7 @@ It looks like there are a lot of arguments. However, most of them are for develo
</tr>
<tr>
<td class="left" rowspan = "5">GPU</td><td class="left">gpu_id</td>
<td class="left" rowspan = "6">GPU</td><td class="left">gpu_id</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
......@@ -207,6 +207,11 @@ It looks like there are a lot of arguments. However, most of them are for develo
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">cudnn_conv_workspace_limit_in_mb</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left" rowspan = "4">RNN</td>
<td class="left">beam_size</td>
......
......@@ -163,6 +163,10 @@
- Choose the path from which to dynamically load the NVIDIA CUDA library, for instance, /usr/local/cuda/lib64. [Default]: LD_LIBRARY_PATH
- type: string (default: "", null)
* `--cudnn_conv_workspace_limit_in_mb`
- Specify the cuDNN max workspace limit, in MB; 4096MB=4GB by default.
- type: int32 (default: 4096MB=4GB)
## NLP: RNN/LSTM/GRU
* `--rnn_use_batch`
- Whether to use batch method for calculation in simple RecurrentLayer.
......
TBD
This document is still being written. Stay tuned.
\ No newline at end of file
TBD
###
This document is still being written. Stay tuned.
\ No newline at end of file
graph pp_topology {
rankdir=BT;
subgraph cluster_node0 {
style=filled;
color=lightgrey;
node [style=filled, color=white, shape=box];
label = "Machine 0"
pserver0 [label="Parameter \n Server 0"]
trainer0 [label="Trainer 0"]
}
subgraph cluster_node1 {
style=filled;
color=lightgrey;
node [style=filled, color=white, shape=box];
label = "Machine 1"
pserver1 [label="Parameter \n Server 1"]
trainer1 [label="Trainer 1"]
}
subgraph cluster_node2 {
style=filled;
color=lightgrey;
node [style=filled, color=white, shape=box];
label = "Machine 2"
pserver2 [label="Parameter \n Server 2"]
trainer2 [label="Trainer 2"]
}
subgraph cluster_node3 {
style=filled;
color=lightgrey;
node [style=filled, color=white, shape=box];
label = "Machine 3"
pserver3 [label="Parameter \n Server 3"]
trainer3 [label="Trainer 3"]
}
data [label="Data", shape=hexagon]
trainer0 -- pserver0
trainer0 -- pserver1
trainer0 -- pserver2
trainer0 -- pserver3
trainer1 -- pserver0
trainer1 -- pserver1
trainer1 -- pserver2
trainer1 -- pserver3
trainer2 -- pserver0
trainer2 -- pserver1
trainer2 -- pserver2
trainer2 -- pserver3
trainer3 -- pserver0
trainer3 -- pserver1
trainer3 -- pserver2
trainer3 -- pserver3
data -- trainer0
data -- trainer1
data -- trainer2
data -- trainer3
}
from paddle.trainer_config_helpers import *
define_py_data_sources2(train_list='train.list',
test_list='test.list',
module='provider',
obj='process')
settings(
batch_size=128,
learning_rate=1e-3,
learning_method=AdamOptimizer(),
regularization=L2Regularization(0.5)
)
img = data_layer(name='pixel', size=28 * 28)
hidden1 = simple_img_conv_pool(input=img, filter_size=3, num_filters=32, pool_size=3,
num_channel=1)
hidden2 = fc_layer(input=hidden1, size=200, act=TanhActivation(),
layer_attr=ExtraAttr(drop_rate=0.5))
predict = fc_layer(input=hidden2, size=10, act=SoftmaxActivation())
outputs(classification_cost(input=predict, label=data_layer(name='label', size=10)))
#########################
PaddlePaddle 基本使用概念
#########################
PaddlePaddle是一个神经网络学习框架。其单机进程为 :code:`paddle train`。 单机的所有设备使用,均在单机进程内调度完成。 而多机辅助进程 :code:`paddle pserver` 负责联合多个单机进程进行通信,进而充分利用集群的计算资源。 PaddlePaddle同时以 :code:`swig api` 的形式,提供使用训练好的模型进行预测以及自定义训练流程的方法。
下面我们会分别介绍主要进程 :code:`paddle train` 中的一些概念。这些概念会对如何使用PaddlePaddle有一定的帮助。 了解这些概念的前提是,读者已经了解 `基本的神经网络/机器学习原理和概念 <nn.html>`_ 。同时,如果想要了解PaddlePaddle实现中的一些概念,请参考 `PaddlePaddle 编程中的基本概念 <program_concepts.html>`_ 。
.. contents::
PaddlePaddle 的进程模型
=======================
PaddlePaddle进程内嵌了一个 :code:`python` 解释器。 这个 :code:`python` 解释器负责解析用户定义的神经网络配置,和解析用户数据,并将用户数据传入给 PaddlePaddle。
.. graphviz::
digraph pp_process {
rankdir=LR;
config_file [label="用户神经网络配置"];
subgraph cluster_pp {
style=filled;
color=lightgrey;
node [style=filled, color=white, shape=box];
label = "PaddlePaddle C++";
py [label="Python解释器"];
}
data_provider [label="用户数据解析"];
config_file -> py;
py -> data_provider [dir="back"];
}
所以,PaddlePaddle单机训练进程,:code:`paddle train` , 对于用户的主要接口语言为 python。 主要需要用户配置的两个文件为 :code:`DataProvider` 和训练文件 :code:`TrainerConfig` 。
DataProvider
============
DataProvider是 :code:`paddle train` 的数据提供器。 它负责将用户的原始数据转换成 PaddlePaddle 可以识别的数据类型。每当 PaddlePaddle 需要新的数据训练时,都会调用 DataProvider 返回数据。 当所有数据读取完一轮后,DataProvider 便返回空数据通知 PaddlePaddle。PaddlePaddle负责在下一轮训练开始前,将DataProvider重置。
需要注意的是,在PaddlePaddle中是由训练逻辑主动调用 DataProvider 来获取数据,而不是由新到来的数据驱动训练。并且,所有的 :code:`shuffle` 以及随机化噪声的添加,都应该在 DataProvider 阶段完成。
为了方便用户使用自己的数据格式, PaddlePaddle 提供了 `PyDataProvider`_ 来处理数据。 并且在这个Provider中,PaddlePaddle的 C++ 部分接管了如何shuffle,处理 batch,GPU/CPU通信,双缓冲,异步读取等问题。 用户可以参考 `PyDataProvider`_ 的相关文档,继续深入了解 DataProvider 的使用。
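下面给出一个极简的 DataProvider 示意(仅为说明用法的草稿,文件名 :code:`provider.py` 、输入维度与数据文本格式均为假设)。它使用 `PyDataProvider`_ 提供的 :code:`@provider` 装饰器声明输入类型,并逐条 :code:`yield` 样本\:

.. code-block:: python

    # provider.py : 最小示意,输入维度与文本格式均为假设
    from paddle.trainer.PyDataProvider2 import *

    # 每条样本包含一个 784 维稠密向量和一个 10 类的整数标签
    @provider(input_types=[dense_vector(784), integer_value(10)])
    def process(settings, filename):
        # filename 是 train.list / test.list 中的一行
        # 这里假设每行文本格式为 "label v0 v1 ... v783"
        with open(filename, 'r') as f:
            for line in f:
                fields = line.split()
                yield [float(v) for v in fields[1:]], int(fields[0])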
训练文件
========
训练文件是PaddlePaddle中配置神经网络结构、学习优化算法、数据传入方式的地方。 训练文件是一个python文件,使用命令行参数 :code:`--config` 传给 paddle 的主程序。 例如\:
.. code-block:: bash
paddle train --config=trainer_config.py
一个典型简单的训练文件可能为
.. literalinclude:: trainer_config.py
:linenos:
下面我们详细的介绍一下训练文件中各个模块的概念。
trainer_config_helpers
----------------------
PaddlePaddle的配置文件与PaddlePaddle C++端通信的最基础协议是 :code:`protobuf` 。为了避免用户直接书写难写的 protobuf string,我们提供了一组 helper 函数来生成这个 protobuf 包。所以在文件的开始,需要 import 这些 helper 函数。
需要注意的是,这个 :code:`paddle.trainer_config_helpers` 包是标准的python包,这意味着用户可以选择自己喜欢的 :code:`ide` 或者编辑器来编写Paddle的配置文件,这个python包注释文档比较完善,并且考虑了IDE的代码提示与类型注释。
data_sources
------------
data_sources用于配置神经网络的数据源。这里使用的函数是 :code:`define_py_data_sources2` ,这个函数定义了使用 `PyDataProvider`_ 作为数据源。 而后缀 :code:`2` 是Paddle的历史遗留问题,因为Paddle之前使用的 PyDataProvider 性能较差,所以完全重构了一个新的 `PyDataProvider`_ 。
data_sources里面的 train_list 和 test_list 指定的是训练文件列表和测试文件列表。 如果传入一个字符串的话,是指一个训练列表文件。这个训练列表文件中包含的是每一个训练或者测试文件的路径。如果传入一个list的话,则会默认生成一个 list 文件,再传入给 train.list 或者 test.list 。
而 :code:`module` 和 :code:`obj` 指定了 DataProvider 的模块名和函数名。
更具体的使用,请参考 `PyDataProvider`_ 。
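作为示意,下面的代码片段展示了上面提到的 train_list 的两种传参方式(文件名均为假设,实际配置文件中只需调用其中一种)\:

.. code-block:: python

    from paddle.trainer_config_helpers import *

    # 写法一:传入列表文件的路径,train.list 的每一行是一个数据文件的路径
    define_py_data_sources2(train_list='train.list',
                            test_list='test.list',
                            module='provider',
                            obj='process')

    # 写法二:直接传入 Python list,Paddle 会据此自动生成列表文件
    define_py_data_sources2(train_list=['data/part-00000', 'data/part-00001'],
                            test_list=None,
                            module='provider',
                            obj='process')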
settings
--------
`settings`_ 是神经网络训练算法相关的设置项。包括学习率,batch_size,优化算法,正则方法等等。具体的使用方法请参考 `settings`_ 文档。
网络配置
--------
上述网络配置中余下的部分均是神经网络配置。第一行是定义一个名字叫 "pixel" 的 :code:`data_layer` 。每一个layer返回的都是一个 :code:`LayerOutput` 对象。 这里第一层的输出对象是 :code:`img` 。然后这个对象传输给了另一个 layer 函数,
:code:`simple_img_conv_pool` 。:code:`simple_img_conv_pool` 是一个组合层,
包括了图像的卷积 (convolution) 和池化(pooling),
并继续接了一个全连接层( :code:`fc_layer` ),然后再接了一个Softmax的全连接层。
最终,网络配置输出了 :code:`classification_cost` 。标记网络输出的函数为
:code:`outputs` 。网络的输出是神经网络的优化目标,神经网络训练的时候,实际上就是
要最小化这个输出。
在神经网络进行预测的时候,实际上网络的输出也是通过 :code:`outputs` 标记。
Layer、Projection、Operator
===========================
PaddlePaddle的网络基本上是基于Layer来配置的。所谓的Layer即是神经网络的某一层,
而神经网络的某一层,一般是封装了许多复杂操作的操作集合。比如最简单的
:code:`fc_layer` ,也包括矩阵乘法,多输入的求和,和activation。
.. code-block:: python
data = data_layer(name='data', size=200)
out = fc_layer(input=data, size=200, act=TanhActivation())
而对于更灵活配置需求,可能这样基于Layer的配置是不灵活的。于是 PaddlePaddle 提供
了基于 Projection 或者 Operator 的配置。使用Projection和Operator需要与
:code:`mixed_layer` 配合使用。 :code:`mixed_layer` 是将layer中的元素累加求和,
并且做一个 :code:`activation` , 而这个layer具体如何计算,是交由内部的Projection
和 Operator 定义。Projection是指含有可学习参数的操作,而Operator不含有可学习的
参数,输入全是其他Layer的输出。
例如,和 :code:`fc_layer` 同样功能的 :code:`mixed_layer` 。
.. code-block:: python
data = data_layer(name='data', size=200)
with mixed_layer(size=200) as out:
out += full_matrix_projection(input=data)
PaddlePaddle可以使用的mixed layer 配置出非常复杂的网络,甚至可以直接配置一个完整的LSTM。
用户可以参考 `mixed_layer`_ 的相关文档进行配置。
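下面是一个稍复杂一些的示意(层名与尺寸均为示例):一个 :code:`mixed_layer` 同时接收两路输入,各自经过一个带可学习参数的 :code:`full_matrix_projection` ,在层内求和后再做激活\:

.. code-block:: python

    # 两路输入各自做全矩阵投影,在 mixed_layer 内求和,再做 Tanh 激活
    data_a = data_layer(name='data_a', size=100)
    data_b = data_layer(name='data_b', size=200)
    with mixed_layer(size=256, act=TanhActivation()) as out:
        out += full_matrix_projection(input=data_a)
        out += full_matrix_projection(input=data_b)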
如何利用单机的所有GPU或所有CPU核心
==================================
PaddlePaddle的单机进程 :code:`paddle train` 可以充分利用一台计算机上所有的GPU资
源或者CPU。
如果要使用机器上多块GPU,使用如下命令即可\:
.. code-block:: bash
paddle train --use_gpu=true --trainer_count=4 # use 4 gpu card, 0, 1, 2, 3
如果要使用机器上多块CPU, 使用如下命令即可\:
.. code-block:: bash
paddle train --trainer_count=4 # use 4 cpu cores.
对于其他GPU的选择情况,例如只使用第0、2号GPU显卡,则可以通过 :code:`CUDA_VISIBLE_DEVICES` 环境变量来选择部分显卡。 具体可以参考链接 `masking-gpu`_ 。 可以使用的命令为
.. code-block:: bash
env CUDA_VISIBLE_DEVICES=0,2 paddle train --use_gpu=true --trainer_count=2
如何利用多台机器的计算资源训练神经网络
======================================
PaddlePaddle多机训练的经典方法是通过 :code:`Parameter Server` 来对多机的 :code:`paddle train` 进程进行同步。 而多机训练神经网络,首先要将数据切分到不同的机器上。 PaddlePaddle的开源实现中并没有提供切分数据文件的工具包,但是切分数据并不是一件非常复杂的事情,也不是神经网络实现的重点。
多机训练过程中,经典的拓扑结构如下\:
.. graphviz:: pserver_topology.dot
图中每个灰色方块是一台机器,在每个机器中,先去启动一个 :code:`paddle pserver` 进程,并确定整体的端口号。可能的参数是\:
.. code-block:: bash
paddle pserver --port=5000 --num_gradient_servers=4 --nics='eth0'
这里说明系统的 :code:`paddle pserver` 的起始端口是 :code:`5000` ,并且有四个训练进程( :code:`gradient_servers` ;Paddle也将 :code:`paddle train` 进程称作 :code:`GradientServer` ,因为它是负责提供梯度的进程)。 而对于训练进程,则需要在 :code:`paddle pserver` 启动之后,再在各个训练节点上运行如下命令\:
.. code-block:: bash
paddle train --port=5000 --pservers=192.168.100.101,192.168.100.102,192.168.100.103,192.168.100.104 --config=...
对于简单的多机协同训练,使用上述方式即可。在更高级的使用场景下,pserver/train 通常还有两个参数需要设置,它们是
* --ports_num\: 一个 pserver进程共绑定多少个端口用来做稠密更新。默认是1
* --ports_num_for_sparse\: 一个pserver进程共绑定多少端口用来做稀疏更新,默认是0
需要手工指定端口数量,是因为Paddle的网络通信中使用了 :code:`int32` 作为消息长度,在大模型下比较容易溢出。因此,可以在 :code:`paddle pserver` 进程中启动多个子线程分别接收 trainer 的数据,这样单个子线程处理的消息长度就不会溢出。但是这个值不宜调得过大:增加端口数量对性能、尤其是内存占用有一定开销;另外,稀疏更新的端口数量如果太大,也很容易出现某个参数服务器没有分配到任何参数的情况。
详细的说明可以参考 `集群训练Paddle`_ 。
.. _PyDataProvider: ../ui/data_provider/pydataprovider2.html
.. _settings: ../../doc/ui/api/trainer_config_helpers/optimizers.html#settings
.. _mixed_layer: ../../doc/ui/api/trainer_config_helpers/layers.html#mixed-layer
.. _masking-gpu: http://www.acceleware.com/blog/cudavisibledevices-masking-gpus
.. _集群训练Paddle: ../cluster/index.html
......@@ -166,4 +166,14 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字
这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。
7. *-cp27mu-linux_x86_64.whl is not a supported wheel on this platform.
-----------------------------------------------------------------------
出现这个问题的主要原因是,系统编译wheel包的时候,使用的 :code:`wheel` 包是最新的,
而系统中的 :code:`pip` 包比较老。具体的解决方法是,更新 :code:`pip` 包并重新编译PaddlePaddle。
更新 :code:`pip` 包的方法是\:
.. code-block:: bash
pip install --upgrade pip
###############################
如何贡献/修改PaddlePaddle的文档
###############################
PaddlePaddle的文档使用 `cmake`_ 驱动 `sphinx`_ 生成。共有两个文档,即 :code:`doc` 和 :code:`doc_cn` 。这两者会在 `cmake`_ 中进行编译,生成后的文档会存储在服务器的 :code:`doc` 和 :code:`doc_cn` 两个目录下。
下面分几个部分介绍一下PaddlePaddle文档的贡献方法。
如何书写PaddlePaddle的文档
==========================
TBD
如何构建PaddlePaddle的文档
==========================
构建PaddlePaddle文档,需要使用构建Paddle的全部环境。准备这个环境相对来说比较复杂,所以本文档提供两种方式构建PaddlePaddle的文档,即
* 使用Docker构建PaddlePaddle的文档
* 直接构建PaddlePaddle的文档。
并且,我们推荐使用Docker来构建PaddlePaddle的文档。
使用Docker构建PaddlePaddle的文档
--------------------------------
使用Docker构建PaddlePaddle的文档,首先要求在系统里安装好Docker工具包。安装Docker请参考 `Docker的官网 <https://docs.docker.com/>`_ 。
安装好Docker之后可以使用源码目录下的脚本构建文档,即
.. code-block:: bash
cd TO_YOUR_PADDLE_CLONE_PATH
cd paddle/scripts/tools/build_docs
bash build_docs.sh
执行完这个脚本后,该目录下会生成两个目录,分别是\:
* doc 目录,英文文档地址
* doc_cn 目录,中文文档地址
打开浏览器访问对应目录下的index.html即可访问本地文档。
.. code-block:: bash
open doc_cn/index.html
直接构建PaddlePaddle的文档
--------------------------
TBD
如何更新www.paddlepaddle.org文档
================================
TBD
.. _cmake: https://cmake.org/
.. _sphinx: http://www.sphinx-doc.org/en/1.4.8/
\ No newline at end of file
......@@ -3,8 +3,9 @@ PaddlePaddle文档
使用指南
--------
* `介绍 <introduction/index.html>`_
* `快速入门 <demo/quick_start/index.html>`_
* `基本使用概念 <concepts/use_concepts.html>`_
* `编译与安装 <build_and_install/index.html>`_
* `用户接口 <ui/index.html>`_
* `使用示例 <demo/index.html>`_
......@@ -14,6 +15,7 @@ PaddlePaddle文档
开发指南
--------
* `新写Layer <../doc/dev/new_layer/index.html>`_
* `如何贡献文档 <howto/how_to_write_docs/index.html>`_
算法教程
--------
......
# 简介
PaddlePaddle 是起源于百度的开源深度学习平台。它是简单易用的:你可以通过简单的十数行配置搭建经典的神经网络模型;它也是高效强大的:PaddlePaddle可以支撑复杂集群环境下超大模型的训练,令你受益于深度学习的前沿成果。在百度内部,已经有大量产品线使用了基于PaddlePaddle的深度学习技术。
这份简短的介绍将向你展示如何利用PaddlePaddle解决一个经典的学习问题。
## 1. 一个经典的任务
让我们从一个基础问题开始:<a href="https://www.baidu.com/s?wd=单变量线性回归">单变量的线性回归</a>。问题假定观测到了一批二维空间上的点 `(x, y)`,并且已知 `x` 和 `y` 之间存在着某种线性关系,我们的目标是通过观测数据还原这个线性关系。作为一个简单基础的模型,线性回归却有着广泛的应用场景。比如可以想象一个资产定价的简化场景,其中 `x` 对应于房屋的大小,`y` 对应于房屋价格。我们可以通过观察市场上房屋的情况获得二者之间的关系,从而为新房屋的定价提供参考。
## 2. 准备数据
假设变量 `X` 和 `Y` 的真实关系为: `Y = 2X + 0.3`,这里展示如何使用观测数据还原这一线性关系。如下Python代码将随机产生2000个观测点,它们将被用作PaddlePaddle的输入。产生PaddlePaddle的输入数据和写一段普通的Python脚本几乎一样,你唯一需要增加的就是定义输入数据的类型。
```python
# -*- coding:utf-8 -*-
# dataprovider.py
from paddle.trainer.PyDataProvider2 import *
import random
# 定义输入数据的类型: 2个浮点数
@provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False)
def process(settings, input_file):
for i in xrange(2000):
x = random.random()
yield [x], [2*x+0.3]
```
## 3. 训练模型
为了还原 `Y = 2X + 0.3`,我们先从一条随机的直线 `Y' = wX + b` 开始,然后利用观测数据调整 `w` 和 `b`,使得 `Y'` 和 `Y` 的差距不断减小,最终趋于相同。这个过程就是模型的训练过程,而 `w` 和 `b` 就是模型的参数,即我们的训练目标。
在PaddlePaddle里,该模型的网络配置如下。
```python
# -*- coding:utf-8 -*-
# trainer_config.py
from paddle.trainer_config_helpers import *
# 1. 定义数据来源,调用上面的process函数获得观测数据
data_file = 'empty.list'
with open(data_file, 'w') as f: f.writelines(' ')
define_py_data_sources2(train_list=data_file, test_list=None,
module='dataprovider', obj='process',args={})
# 2. 学习算法。控制如何改变模型参数 w 和 b
settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer())
# 3. 神经网络配置
x = data_layer(name='x', size=1)
y = data_layer(name='y', size=1)
# 线性计算单元: y_predict = wx + b
y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
# 损失计算,度量 y_predict 和真实 y 之间的差距
cost = regression_cost(input=y_predict, label=y)
outputs(cost)
```
这段简短的配置展示了PaddlePaddle的基本用法:
- 首先,第一部分定义了数据输入。一般情况下,PaddlePaddle先从一个文件列表里获得数据文件地址,然后交给用户自定义的函数(例如上面的`process`函数)进行读入和预处理,从而得到真实输入。本文中由于输入数据是随机生成的,不需要读输入文件,所以放一个空列表(`empty.list`)即可。
- 第二部分主要是选择学习算法,它定义了模型参数如何改变。PaddlePaddle提供了很多优秀的学习算法,但这里使用一个简单的基于momentum的算法就足够了,它每次读取12个数据进行计算和模型更新。
- 最后一部分是神经网络的配置。由于PaddlePaddle已经实现了丰富的网络单元(Layer),所以很多时候你需要做的只是声明正确的网络单元并把它们拼接起来。这里使用了三种网络单元:
- **数据层**:数据层 `data_layer` 是神经网络的入口,它读入数据并将它们传输到下游的其它单元。这里数据层有两个,分别对应于变量 `X` 和 `Y`。
- **全连接层**:全连接层 `fc_layer` 是基础的计算单元,这里利用它建模变量之间的线性关系。计算单元是神经网络的核心,PaddlePaddle支持大量的计算单元和任意深度的网络连接,从而可以挖掘复杂的数据关系。
- **回归损失层**:回归损失层 `regression_cost` 是众多损失函数层的一种,它们在训练过程中作为网络的出口,用来计算模型的表现,并指导模型参数的改变。
这样定义了网络结构并保存为`trainer_config.py`之后,运行训练命令即可:
```
paddle train --config=trainer_config.py --save_dir=./output --num_passes=30
```
PaddlePaddle将在观测数据集上迭代训练30轮,并将每轮的模型结果存放在 `./output` 路径下。从输出日志可以看到,随着轮数增加,损失函数的输出在不断减小,这意味着模型在不断改进,直到逼近真实解 `Y = 2X + 0.3`。
## 4. 模型检验
训练完成后,我们希望能够检验模型的好坏。一种常用的做法是用模型对另外一组数据进行预测,然后评价预测的效果。但在这个例子中,由于已经知道了真实答案,我们可以直接观察模型的参数是否符合预期来进行检验。
PaddlePaddle将每个模型参数作为一个numpy数组单独存为一个文件,所以可以利用如下方法读取模型的参数。
```python
import numpy as np
import os
def load(file_name):
with open(file_name, 'rb') as f:
f.read(16) # skip header for float type.
return np.fromfile(f, dtype=np.float32)
print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b'))
# w=1.999743, b=0.300137
```
<center> ![](./parameters.png) </center>
从图中可以看到,虽然 `w` 和 `b` 都使用随机值初始化,但在起初的几轮训练中它们都在快速逼近真实值,并且后续仍在不断改进,使得最终得到的模型几乎与真实模型重合。
这样,我们就完成了对单变量线性回归问题的解决:将数据输入PaddlePaddle,训练模型,最后验证结果。
## 5. 推荐后续阅读
- <a href="../build_and_install/index.html">安装/编译</a>:PaddlePaddle的安装与编译文档。
- <a href="../demo/quick_start/index.html">快速入门 </a>:使用商品评论分类任务,系统性的介绍如何一步步改进,最终得到产品级的深度模型。
- <a href="../demo/index.html">示例</a>:各种实用案例,涵盖图像、文本、推荐等多个领域。
......@@ -14,27 +14,10 @@ limitations under the License. */
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
#include "paddle/parameter/Argument.h"
struct ArgumentsPrivate {
std::vector<paddle::Argument> outputs;
inline paddle::Argument& getArg(size_t idx) throw(RangeError) {
if (idx < outputs.size()) {
return outputs[idx];
} else {
RangeError e;
throw e;
}
}
template <typename T>
std::shared_ptr<T>& cast(void* rawPtr) const {
return *(std::shared_ptr<T>*)(rawPtr);
}
};
size_t Arguments::getSlotNum() const { return m->outputs.size(); }
Arguments* Arguments::createArguments(size_t slotNum) {
......
......@@ -40,6 +40,8 @@ configure_file(
generate_python_api(python_swig_sources)
file(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py)
# TODO(yuyang18) : make wheel name calculated by cmake
add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/dist/.timestamp
COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel
......@@ -55,6 +57,7 @@ add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/dist/.timestamp
paddle_trainer
paddle_api
paddle_cuda
${PY_PADDLE_PYTHON_FILES}
)
install(DIRECTORY ${PROJ_ROOT}/paddle/dist/
......
......@@ -14,17 +14,9 @@ limitations under the License. */
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
#include "paddle/trainer/Trainer.h"
struct TrainerConfigPrivate {
std::shared_ptr<paddle::TrainerConfig> conf;
TrainerConfigPrivate() : conf(std::make_shared<paddle::TrainerConfig>()) {}
};
struct ModelConfigPrivate {
std::shared_ptr<paddle::TrainerConfig> conf;
};
struct ParameterConfigPrivate {
paddle::ParameterPtr parameter;
paddle::ParameterConfig config;
......@@ -39,19 +31,6 @@ struct ParameterConfigPrivate {
}
};
struct OptimizationConfigPrivate {
std::shared_ptr<paddle::TrainerConfig> trainer_config;
paddle::OptimizationConfig config;
paddle::OptimizationConfig& getConfig() {
if (trainer_config != nullptr) {
return *trainer_config->mutable_opt_config();
} else {
return config;
}
}
};
TrainerConfig::TrainerConfig() : m(new TrainerConfigPrivate()) {}
TrainerConfig::~TrainerConfig() { delete m; }
......@@ -59,10 +38,19 @@ TrainerConfig::~TrainerConfig() { delete m; }
TrainerConfig* TrainerConfig::createFromTrainerConfigFile(
const std::string& confPath) {
LOG(INFO) << "load trainer config from " << confPath;
paddle::TrainerConfigHelper helper(confPath);
//! TODO(yuyang18): Make TrainerConfigPrivate to TrainerConfigHelper
auto conf = std::make_shared<paddle::TrainerConfigHelper>(confPath);
auto retv = new TrainerConfig();
*retv->m->conf = helper.getConfig();
retv->m->conf = conf;
return retv;
}
TrainerConfig* TrainerConfig::createFromProtoString(
const std::string& str) {
auto retv = new TrainerConfig();
paddle::TrainerConfig trainerConfigProto;
auto conf = std::make_shared<paddle::TrainerConfigHelper>(trainerConfigProto);
CHECK(conf->getMutableConfig().ParseFromString(str));
retv->m->conf = conf;
return retv;
}
......@@ -76,10 +64,6 @@ ModelConfig* TrainerConfig::getModelConfig() const {
return retv;
}
void* ModelConfig::getPaddleModelConfig() const {
return m->conf->mutable_model_config();
}
ParameterConfig::ParameterConfig() : m(new ParameterConfigPrivate()) {}
ParameterConfig::~ParameterConfig() {
......@@ -132,8 +116,6 @@ OptimizationConfig* TrainerConfig::getOptimizationConfig() const {
return opt_config;
}
void* OptimizationConfig::getRawPtr() { return &m->getConfig(); }
OptimizationConfig* OptimizationConfig::createFromProtoString(
const std::string& str) {
auto conf = new OptimizationConfig();
......
......@@ -14,30 +14,22 @@ limitations under the License. */
#include "PaddleAPI.h"
#include "paddle/gserver/gradientmachines/GradientMachine.h"
#include "PaddleAPIPrivate.h"
#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
#include "Internal.h"
std::vector<int> GradientMachine::defaultParamTypes = {
PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM};
struct GradientMachinePrivate {
std::shared_ptr<paddle::GradientMachine> machine;
template <typename T>
inline T& cast(void* ptr) {
return *(T*)(ptr);
}
};
GradientMachine::GradientMachine() : m(new GradientMachinePrivate()) {}
GradientMachine::~GradientMachine() { delete m; }
GradientMachine* GradientMachine::createFromPaddleModelPtr(
void* confPtr, GradientMatchineCreateMode mode,
const void* confPtr, GradientMatchineCreateMode mode,
const std::vector<int>& types) {
auto& conf = *(paddle::ModelConfig*)(confPtr);
auto& conf = *(const paddle::ModelConfig*)(confPtr);
std::vector<ParameterType> realTypes;
staticCastVector(&realTypes, types);
auto machineRawPtr = paddle::GradientMachine::create(conf, mode, realTypes);
......@@ -66,7 +58,7 @@ GradientMachine* GradientMachine::createByConfigProtoStr(
GradientMachine* GradientMachine::createByModelConfig(
ModelConfig* conf, GradientMatchineCreateMode mode,
const std::vector<int>& types) {
auto confPtr = (paddle::ModelConfig*)conf->getPaddleModelConfig();
auto confPtr = &conf->m->conf->getModelConfig();
return GradientMachine::createFromPaddleModelPtr(confPtr, mode, types);
}
......
......@@ -446,7 +446,6 @@ struct OptimizationConfigPrivate;
class OptimizationConfig {
DISABLE_COPY_AND_ASSIGN(OptimizationConfig);
OptimizationConfig();
void* getRawPtr();
public:
static OptimizationConfig* createFromProtoString(const std::string& str);
......@@ -462,6 +461,7 @@ private:
friend class TrainerConfig;
friend class ParameterOptimizer;
friend class Trainer;
};
struct ParameterPrivate;
......@@ -515,8 +515,6 @@ public:
virtual ~ModelConfig();
private:
void* getPaddleModelConfig() const;
ModelConfigPrivate* m;
friend class TrainerConfig;
friend struct TrainerConfigPrivate;
......@@ -539,6 +537,7 @@ public:
static TrainerConfig* createFromTrainerConfigFile(
const std::string& configPath);
static TrainerConfig* createFromProtoString(const std::string& str);
ModelConfig* getModelConfig() const;
......@@ -546,6 +545,7 @@ public:
private:
TrainerConfigPrivate* m;
friend class Trainer;
};
/**
......@@ -700,11 +700,12 @@ private:
GradientMachinePrivate* m;
static GradientMachine* createFromPaddleModelPtr(
void* confPtr, GradientMatchineCreateMode mode,
const void* confPtr, GradientMatchineCreateMode mode,
const std::vector<int>& types);
// Not to use c++ 11 init-list, so we use static var as function default arg.
static std::vector<int> defaultParamTypes;
friend class Trainer;
};
struct TrainerPrivate;
......@@ -712,6 +713,7 @@ class Trainer {
private:
TrainerPrivate* m;
Trainer();
Trainer(TrainerConfig* optConfig, GradientMachine* gm);
DISABLE_COPY_AND_ASSIGN(Trainer);
public:
......@@ -720,38 +722,42 @@ public:
/// Create A Trainer By TrainerConfig. using paddle command line.
static Trainer* createByCommandLine() throw(IOError);
/// Start Train.
static Trainer* create(TrainerConfig* optConfig, GradientMachine* gm)
throw(IOError);
/// Start training
void startTrain();
/// Finish training
void finishTrain();
/// Start Pass.
/// Start a pass.
void startTrainPass();
void finishTrainPass();
void setBatchSize(size_t batchSize);
/// Finish a pass
void finishTrainPass();
/**
* Train one batch,
*
* @param batchSize -1 will use command line or batch size set before,
* otherwise use this batchSize for train.
*
* @return true if all batch finished.
*/
bool trainOneBatch(size_t batchSize = -1UL);
bool trainOneBatch(size_t batchSize);
bool prepareBatchData(size_t batchSize = -1UL);
void trainOneDataBatch(size_t batchSize, const Arguments& args);
void finishTrainOneBatch();
void startTestPeriod();
void testOneDataBatch(size_t batchSize, const Arguments& args);
void finishTestPeriod();
void forwardOneBatch() throw(UnsupportError);
void forwardOneBatch(size_t batchSize);
Arguments* getNetworkOutput();
Arguments* getForwardOutput();
Matrix* getLayerOutput(const std::string& layerName);
};
/// The N-Best results generated from one input sequence.
/// the N-Best results generated from one input sequence.
class ISequenceResults {
public:
virtual ~ISequenceResults();
......
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/gserver/gradientmachines/GradientMachine.h"
#include "paddle/trainer/TrainerConfigHelper.h"
#pragma once
struct GradientMachinePrivate {
std::shared_ptr<paddle::GradientMachine> machine;
template <typename T>
inline T& cast(void* ptr) {
return *(T*)(ptr);
}
};
struct OptimizationConfigPrivate {
std::shared_ptr<paddle::TrainerConfigHelper> trainer_config;
paddle::OptimizationConfig config;
const paddle::OptimizationConfig& getConfig() {
if (trainer_config != nullptr) {
return trainer_config->getOptConfig();
} else {
return config;
}
}
};
struct TrainerConfigPrivate {
std::shared_ptr<paddle::TrainerConfigHelper> conf;
TrainerConfigPrivate() {}
};
struct ModelConfigPrivate {
std::shared_ptr<paddle::TrainerConfigHelper> conf;
};
struct ArgumentsPrivate {
std::vector<paddle::Argument> outputs;
inline paddle::Argument& getArg(size_t idx) throw(RangeError) {
if (idx < outputs.size()) {
return outputs[idx];
} else {
RangeError e;
throw e;
}
}
template <typename T>
std::shared_ptr<T>& cast(void* rawPtr) const {
return *(std::shared_ptr<T>*)(rawPtr);
}
};
......@@ -14,6 +14,7 @@ limitations under the License. */
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
#include "paddle/parameter/ParameterOptimizer.h"
#include "Internal.h"
#include <algorithm>
......@@ -60,10 +61,9 @@ ParameterOptimizer::~ParameterOptimizer() {
ParameterOptimizer* ParameterOptimizer::create(OptimizationConfig* config) {
CHECK(config != nullptr);
auto opt_config_ptr = (paddle::OptimizationConfig*)config->getRawPtr();
auto retOptimizer = new ParameterOptimizer();
retOptimizer->m->optimizer.reset(
paddle::ParameterOptimizer::create(*opt_config_ptr, false));
paddle::ParameterOptimizer::create(config->m->getConfig(), false));
return retOptimizer;
}
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "PaddleAPI.h"
#include "PaddleAPIPrivate.h"
#include <stdlib.h>
#include <memory>
......@@ -30,31 +31,17 @@ P_DECLARE_string(config);
P_DECLARE_string(init_model_path);
P_DECLARE_int32(start_pass);
struct TrainPassContext {
int64_t batchId;
int32_t batchSize;
real avgTestCost;
int64_t numAvgTests;
int passInnerId;
paddle::DataBatch data;
std::vector<paddle::Argument> forwardOutput;
};
struct TrainerPrivate : public paddle::Trainer {
void startTrain();
void finishTrain();
void startTrainPass();
void finishTrainPass();
bool _trainOneBatch();
bool _prepareBatchData();
void _forwardOneBatch() throw(UnsupportError);
bool _trainOneBatch(size_t batchSize);
bool forwardOneBatch(size_t batchSize);
void forwardOneDataBatch(const std::vector<paddle::Argument>& inArgs);
void setBatchSize(size_t batchSize);
std::vector<paddle::Argument>& getForwardOutput();
void startTestPeriod();
void finishTestPeriod();
void testOneDataBatch(const paddle::DataBatch& dataBatch);
TrainerPrivate() : paddle::Trainer() {}
TrainPassContext trainPassContext;
};
Trainer::Trainer() : m(new TrainerPrivate()) {
......@@ -75,61 +62,76 @@ Trainer* Trainer::createByCommandLine() throw(IOError) {
}
}
void Trainer::startTrain() { m->startTrain(); }
Trainer::Trainer(TrainerConfig* config, GradientMachine* gm)
: m(new TrainerPrivate()) {
m->init(config->m->conf, /* testing= */false, gm ? gm->m->machine : nullptr);
}
void TrainerPrivate::startTrain() {
srand(this->config_->getConfig().start_pass() + 1);
this->dataProvider_->reset();
this->trainerInternal_.getGradientMachine()->start(*config_, dataProvider_);
Trainer* Trainer::create(TrainerConfig* config, GradientMachine* gm)
throw(IOError)
{
auto retv = new Trainer(config, gm);
if (retv->m->getConfig().IsInitialized()) {
return retv;
} else {
retv->m->getConfig().CheckInitialized();
throw IOError();
}
}
void Trainer::finishTrain() { m->finishTrain(); }
void Trainer::startTrain() { m->startTrain(); }
void TrainerPrivate::finishTrain() {
this->trainerInternal_.getGradientMachine()->finish();
}
void Trainer::finishTrain() { m->finishTrain(); }
void Trainer::startTrainPass() { m->startTrainPass(); }
void TrainerPrivate::startTrainPass() {
this->stats_.reset();
this->trainPassContext.batchId = 0;
this->trainPassContext.batchSize = this->config_->getOptConfig().batch_size();
this->trainPassContext.avgTestCost = 0;
this->trainPassContext.numAvgTests = 0;
this->trainPassContext.passInnerId = 0;
this->trainerInternal_.getParameterUpdater()->startPass();
this->evaluator_->start();
}
void Trainer::finishTrainPass() { m->finishTrainPass(); }
void TrainerPrivate::finishTrainPass() {
this->trainerInternal_.getGradientMachine()->onPassEnd();
this->trainerInternal_.getParameterUpdater()->finishPass();
evaluator_->finish();
void Trainer::trainOneDataBatch(size_t batchSize, const Arguments& inArgs) {
paddle::DataBatch dataBatch;
dataBatch.getStreams() = inArgs.m->outputs;
dataBatch.setSize(batchSize);
m->trainOneDataBatch(dataBatch);
}
void Trainer::setBatchSize(size_t batchSize) {
this->m->trainPassContext.batchSize = batchSize;
bool Trainer::trainOneBatch(size_t batchSize) {
return m->_trainOneBatch(batchSize);
}
bool Trainer::trainOneBatch(size_t batchSize) {
if (batchSize == -1UL) {
this->setBatchSize(batchSize);
bool TrainerPrivate::_trainOneBatch(size_t batchSize) {
paddle::DataBatch dataBatch;
CHECK(dataProvider_) << "data_provider is not specified";
int num = dataProvider_->getNextBatch(batchSize, &dataBatch);
if (num == 0) {
return false;
}
return m->_trainOneBatch();
trainOneDataBatch(dataBatch);
return false;
}
bool TrainerPrivate::_trainOneBatch() {
if (this->_prepareBatchData()) {
return true;
void TrainerPrivate::startTestPeriod() {
if (!tester_) {
createTester();
}
this->trainerInternal_.trainOneBatch(this->trainPassContext.batchId,
this->trainPassContext.data);
return false;
tester_->startTestPeriod();
}
void Trainer::startTestPeriod() { m->startTestPeriod(); }
void TrainerPrivate::testOneDataBatch(const paddle::DataBatch& dataBatch) {
tester_->testOneDataBatch(dataBatch, &forwardOutput_);
}
void Trainer::testOneDataBatch(size_t batchSize, const Arguments& args) {
paddle::DataBatch dataBatch;
dataBatch.getStreams() = args.m->outputs;
dataBatch.setSize(batchSize);
m->testOneDataBatch(dataBatch);
}
void TrainerPrivate::finishTestPeriod() { tester_->finishTestPeriod(); }
void Trainer::finishTestPeriod() { m->finishTestPeriod(); }
Matrix* Trainer::getLayerOutput(const std::string& layerName) {
auto nn = std::dynamic_pointer_cast<paddle::NeuralNetwork>(
this->m->getGradientMachine());
......@@ -138,46 +140,37 @@ Matrix* Trainer::getLayerOutput(const std::string& layerName) {
return Matrix::createByPaddleMatrixPtr(&m);
}
bool Trainer::prepareBatchData(size_t batchSize) {
if (batchSize != -1UL) {
this->setBatchSize(batchSize);
void Trainer::forwardOneBatch(size_t batchSize) { m->forwardOneBatch(batchSize); }
bool TrainerPrivate::forwardOneBatch(size_t batchSize) {
CHECK(dataProvider_) << "data_provider is not specified";
paddle::DataBatch dataBatch;
int num = dataProvider_->getNextBatch(batchSize, &dataBatch);
if (num == 0) {
return false;
}
return this->m->_prepareBatchData();
}
bool TrainerPrivate::_prepareBatchData() {
int num = dataProvider_->getNextBatch(this->trainPassContext.batchSize,
&this->trainPassContext.data);
return num == 0;
forwardOneDataBatch(dataBatch.getStreams());
return true;
}
void Trainer::finishTrainOneBatch() { ++m->trainPassContext.batchId; }
void TrainerPrivate::forwardOneDataBatch(
const std::vector<paddle::Argument>& inArgs) {
void Trainer::forwardOneBatch() throw(UnsupportError) { m->_forwardOneBatch(); }
void TrainerPrivate::_forwardOneBatch() throw(UnsupportError) {
auto& dataBatch = this->trainPassContext.data;
int64_t actualBatchSize = dataBatch.getSize();
if (actualBatchSize == 0) {
return;
}
const std::vector<paddle::Argument>& inArgs = dataBatch.getStreams();
std::vector<paddle::Argument>& outArgs = this->trainPassContext.forwardOutput;
outArgs.clear();
paddle::PassType passType =
this->trainerInternal_.getParameterUpdater()->startBatch(actualBatchSize);
std::vector<paddle::Argument>& outArgs = forwardOutput_;
if (config_->getOptConfig().use_sparse_remote_updater()) {
this->trainerInternal_.getGradientMachine()->prefetch(inArgs);
this->trainerInternal_.getParameterUpdater()->getParametersRemote();
trainerInternal_.getGradientMachine()->prefetch(inArgs);
trainerInternal_.getParameterUpdater()->getParametersRemote();
}
this->trainerInternal_.getGradientMachine()->forward(
inArgs, &outArgs, passType);
trainerInternal_.getGradientMachine()->forward(
inArgs, &outArgs, paddle::PASS_TEST);
}
Arguments* Trainer::getForwardOutput() {
return Arguments::createByPaddleArgumentVector(&m->getForwardOutput());
}
Arguments* Trainer::getNetworkOutput() {
return Arguments::createByPaddleArgumentVector(
&m->trainPassContext.forwardOutput);
std::vector<paddle::Argument>& TrainerPrivate::getForwardOutput() {
return forwardOutput_;
}
......@@ -30,7 +30,7 @@ source .test_env/bin/activate
pip --timeout 600 install ../../dist/*.whl
test_list="testArguments.py testGradientMachine.py testMatrix.py testVector.py testTrain.py"
test_list="testArguments.py testGradientMachine.py testMatrix.py testVector.py testTrain.py testTrainer.py"
export PYTHONPATH=$PWD/../../../python/
......
......@@ -12,9 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from py_paddle import swig_paddle, DataProviderWrapperConverter
from py_paddle import swig_paddle
import paddle.trainer.config_parser
from paddle.trainer.PyDataProviderWrapper import DenseSlot, IndexSlot
import numpy
import util
......
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer.config_parser import parse_config
from paddle.trainer.config_parser import logger
from py_paddle import swig_paddle
import util
def main():
trainer_config = parse_config(
"./testTrainConfig.py", "")
model = swig_paddle.GradientMachine.createFromConfigProto(
trainer_config.model_config)
trainer = swig_paddle.Trainer.create(trainer_config, model)
trainer.startTrain()
for train_pass in xrange(2):
trainer.startTrainPass()
num = 0
cost = 0
while True: # Train one batch
batch_size = 1000
data, atEnd = util.loadMNISTTrainData(batch_size)
if atEnd:
break
trainer.trainOneDataBatch(batch_size, data)
outs = trainer.getForwardOutput()
cost += sum(outs[0]['value'])
num += batch_size
trainer.finishTrainPass()
logger.info('train cost=%f' % (cost / num))
trainer.startTestPeriod()
num = 0
cost = 0
while True: # Test one batch
batch_size = 1000
data, atEnd = util.loadMNISTTrainData(batch_size)
if atEnd:
break
trainer.testOneDataBatch(batch_size, data)
outs = trainer.getForwardOutput()
cost += sum(outs[0]['value'])
num += batch_size
trainer.finishTestPeriod()
logger.info('test cost=%f' % (cost / num))
trainer.finishTrain()
if __name__ == '__main__':
swig_paddle.initPaddle("--use_gpu=0", "--trainer_count=1")
main()
......@@ -21,8 +21,8 @@ limitations under the License. */
/**
* @brief Matrix transpose: C_d = T(A_d)
*
* @param[in] A_d input matrix (M x N).
* @param[out] C_d output matrix (N x M).
* @param[in] A_d input matrix (dimM x dimN).
* @param[out] C_d output matrix (dimN x dimM).
* @param[in] dimM matrix height.
* @param[in] dimN matrix width.
* @param[in] lda the first dimension of A_d.
......@@ -39,8 +39,8 @@ extern void hl_matrix_transpose(real *A_d,
/*
* @brief Matrix transpose, while lda = dimN, ldc = dimM.
*
* @param[in] A_d input matrix (M x N).
* @param[out] C_d output matrix (N x M).
* @param[in] A_d input matrix (dimM x dimN).
* @param[out] C_d output matrix (dimN x dimM).
* @param[in] dimM matrix height.
* @param[in] dimN matrix width.
*
......@@ -50,6 +50,22 @@ extern void hl_matrix_transpose(real *A_d,
int dimM,
int dimN);
/*
* @brief Matrix inverse
*
* @param[in] A_d input matrix (dimN x dimN).
* @param[out] C_d output matrix (dimN x dimN).
* @param[in] dimN matrix height = matrix width
* @param[in] lda the first dimension of A_d
* @param[in] ldc the first dimension of C_d
*
*/
extern void hl_matrix_inverse(real *A_d,
real *C_d,
int dimN,
int lda,
int ldc);
/**
* @brief C_d = alpha*(op(A_d) * op(B_d)) + beta*C_d
*
......
......@@ -48,5 +48,24 @@ inline __device__ double paddleAtomicAdd(double* address, double val) {
}
} // namespace paddle
/**
* @brief sum reduction
*
* @param[in,out] smem input data, better to use __shared__ memory.
* @param[in] tid thread index.
* @param[in] threads the total thread number used to reduce,
* such as, blockDim.x.
*
* @return smem[0]: the sum of all elements in smem.
*/
__device__ __forceinline__
void simpleReduce(real* smem, int tid, int threads) {
for (unsigned int s = threads / 2; s > 0; s >>= 1) {
if (tid < s) {
smem[tid] += smem[tid + s];
}
__syncthreads();
}
}
#endif /* HL_DEVICE_FUNCTIONS_CUH_ */
......@@ -229,4 +229,40 @@ extern void hl_cossim_derivative(real* grad,
int input2_height,
real scale);
/**
* @brief Matrix addition: A_d[i][j] += scale * B_d[j/channel].
*
* @param[in,out] A_d input matrix (M x N), updated in place.
* @param[in] B_d input matrix (1 x channel).
* @param[in] channel width of B.
* @param[in] dimM height of A.
* @param[in] dimN width of A.
* @param[in] scale scalar used for addition.
*
*/
extern void hl_matrix_add_shared_bias(real* A_d,
real* B_d,
const int channel,
const int dimM,
const int dimN,
real scale);
/**
* @brief Bias collection: for each channel c, B_d[c] += scale * (sum of the A_d elements in channel c).
*
* @param[in,out] B_d bias vector (1 x channel), accumulated in place.
* @param[in] A_d input matrix (M x N).
* @param[in] channel width of B.
* @param[in] dimM height of A.
* @param[in] dimN width of A.
* @param[in] scale scalar used for addition.
*
*/
extern void hl_matrix_collect_shared_bias(real* B_d,
real* A_d,
const int channel,
const int dimM,
const int dimN,
real scale);
#endif /* HL_MATRIX_H_ */
......@@ -30,6 +30,12 @@ inline void hl_matrix_transpose(real *A_d,
int dimM,
int dimN) {}
inline void hl_matrix_inverse(real *A_d,
real *C_d,
int dimN,
int lda,
int ldc) {}
inline void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
real *B_d, hl_trans_op_t transb,
real *C_d,
......
......@@ -101,4 +101,17 @@ inline void hl_cossim_derivative(real* grad,
int input2_height,
real scale) {}
inline void hl_matrix_add_shared_bias(real* A_d,
real* B_d,
const int channel,
const int dimM,
const int dimN,
real scale) {}
inline void hl_matrix_collect_shared_bias(real* B_d,
real* A_d,
const int channel,
const int dimM,
const int dimN,
real scale) {}
#endif // HL_MATRIX_STUB_H_
......@@ -15,6 +15,7 @@ limitations under the License. */
#include <sys/time.h>
#include <mutex>
#include "hl_cuda.h"
#include "hl_cuda_cublas.h"
#include "hl_thread.ph"
#include "hl_dso_loader.h"
......@@ -75,6 +76,8 @@ DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmBatched)
DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmBatched)
DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmBatched)
DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmBatched)
DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetrfBatched)
DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgetriBatched)
CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
#undef DYNAMIC_LOAD_CUBLAS_WRAP
......@@ -88,10 +91,14 @@ CUBLAS_BLAS_ROUTINE_EACH(DYNAMIC_LOAD_CUBLAS_V2_WRAP)
#define CUBLAS_GEAM dynload::cublasSgeam
#define CUBLAS_GEMV dynload::cublasSgemv
#define CUBLAS_GEMM dynload::cublasSgemm
#define CUBLAS_GETRF dynload::cublasSgetrfBatched
#define CUBLAS_GETRI dynload::cublasSgetriBatched
#else
#define CUBLAS_GEAM dynload::cublasDgeam
#define CUBLAS_GEMV dynload::cublasDgemv
#define CUBLAS_GEMM dynload::cublasDgemm
#define CUBLAS_GETRF dynload::cublasDgetrfBatched
#define CUBLAS_GETRI dynload::cublasDgetriBatched
#endif
const char* hl_cublas_get_error_string(cublasStatus_t status) {
......@@ -162,6 +169,54 @@ void hl_matrix_transpose(real *A_d, real *C_d, int dimM, int dimN) {
hl_matrix_transpose(A_d, C_d, dimM, dimN, dimN, dimM);
}
void hl_matrix_inverse(real *A_d, real *C_d, int dimN, int lda, int ldc) {
/* Solve Ax = I */
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(C_d);
/* Step 1: Compute the LU decomposition of matrix A */
real **inout_h = &A_d;
real **inout_d = (real **)hl_malloc_device(sizeof(real *));
hl_memcpy(inout_d, inout_h, sizeof(real *));
int *pivot_d = (int *)hl_malloc_device(dimN*sizeof(int));
int *info_d = (int *)t_resource.gpu_mem;
/* Note: cublasSgetrfBatched is used to calculate a number of
small-sized matrices. There may be a better way to reconstruct
the API for better performance.
*/
CHECK_CUBLAS(CUBLAS_GETRF(t_resource.handle,
dimN, inout_d, lda, pivot_d,
info_d, 1));
int info_h;
hl_memcpy(&info_h, info_d, sizeof(int));
if (info_h != 0) {
LOG(FATAL) << "Factorization of matrix failed: matrix may be singular.\n";
}
/* Step 2: Compute the inverse of the matrix given its LU decomposition */
real **out_h = &C_d;
real **out_d = (real **)hl_malloc_device(sizeof(real *));
hl_memcpy(out_d, out_h, sizeof(real *));
CHECK_CUBLAS(CUBLAS_GETRI(t_resource.handle,
dimN, (const real **)inout_d, lda, pivot_d,
out_d, ldc, info_d, 1));
hl_memcpy(&info_h, info_d, sizeof(int));
if (info_h != 0) {
LOG(FATAL) << "Inversion of matrix failed: matrix may be singular.\n";
}
hl_free_mem_device(inout_d);
hl_free_mem_device(pivot_d);
hl_free_mem_device(out_d);
CHECK_SYNC("hl_matrix_inverse failed");
}
void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
real *B_d, hl_trans_op_t transb,
real *C_d,
......
......@@ -20,6 +20,11 @@ limitations under the License. */
#include "hl_thread.ph"
#include "hl_dso_loader.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/CommandLineParser.h"
P_DEFINE_int32(cudnn_conv_workspace_limit_in_mb, 4096,
"Specify cuDNN max workspace limit, in units MB, "
"4096MB=4GB by default.");
namespace dynload {
......@@ -36,65 +41,28 @@ void* cudnn_dso_handle = nullptr;
#ifdef PADDLE_USE_DSO
#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
cudnnStatus_t operator()(Args... args) { \
typedef cudnnStatus_t (*cudnnFunc)(Args...); \
std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, \
&cudnn_dso_handle); \
void* p_##__name = dlsym(cudnn_dso_handle, #__name); \
return reinterpret_cast<cudnnFunc>(p_##__name)(args...); \
} \
#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using cudnn_func = decltype(__name(args...))(*)(Args...); \
std::call_once(cudnn_dso_flag, GetCudnnDsoHandle, \
&cudnn_dso_handle); \
void* p_##__name = dlsym(cudnn_dso_handle, #__name); \
return reinterpret_cast<cudnn_func>(p_##__name)(args...); \
} \
} __name; /* struct DynLoad__##__name */
struct DynLoad__cudnnGetVersion {
template <typename... Args>
size_t operator()(Args... args) {
typedef size_t (*cudnnFunc)(Args...);
std::call_once(cudnn_dso_flag, GetCudnnDsoHandle,
&cudnn_dso_handle);
void* p_name = dlsym(cudnn_dso_handle, "cudnnGetVersion");
return reinterpret_cast<cudnnFunc>(p_name)(args...);
}
} cudnnGetVersion; /* struct DynLoad__##__name */
struct DynLoad__cudnnGetErrorString {
template <typename... Args>
const char* operator()(Args... args) {
typedef const char* (*cudnnFunc)(Args...);
std::call_once(cudnn_dso_flag, GetCudnnDsoHandle,
&cudnn_dso_handle);
void* p_name = dlsym(cudnn_dso_handle, "cudnnGetErrorString");
return reinterpret_cast<cudnnFunc>(p_name)(args...);
}
} cudnnGetErrorString; /* struct DynLoad__##__name */
#else
#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
cudnnStatus_t operator()(Args... args) { \
return __name(args...); \
} \
#define DYNAMIC_LOAD_CUDNN_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
return __name(args...); \
} \
} __name; /* struct DynLoad__##__name */
struct DynLoad__cudnnGetVersion {
template <typename... Args>
size_t operator()(Args... args) {
return cudnnGetVersion(args...);
}
} cudnnGetVersion; /* struct DynLoad__##__name */
struct DynLoad__cudnnGetErrorString {
template <typename... Args>
const char* operator()(Args... args) {
return cudnnGetErrorString(args...);
}
} cudnnGetErrorString; /* struct DynLoad__##__name */
#endif
/**
......@@ -128,7 +96,9 @@ struct DynLoad__cudnnGetErrorString {
__macro(cudnnPoolingForward) \
__macro(cudnnPoolingBackward) \
__macro(cudnnSoftmaxBackward) \
__macro(cudnnSoftmaxForward)
__macro(cudnnSoftmaxForward) \
__macro(cudnnGetVersion) \
__macro(cudnnGetErrorString)
CUDNN_DNN_ROUTINE_EACH(DYNAMIC_LOAD_CUDNN_WRAP)
#define CUDNN_DNN_ROUTINE_EACH_R2(__macro) \
......@@ -242,7 +212,7 @@ void hl_conv_workspace(hl_tensor_descriptor input,
CHECK_NOTNULL(conv);
// Specify workspace limit directly
size_t memoryLimitBytes = 8 * 1024 * 1024;
size_t memoryLimitBytes = (1LL << 20) * FLAGS_cudnn_conv_workspace_limit_in_mb;
// cudnn convolution forward configuration
cudnnTensorDescriptor_t fwd_src_desc = GET_TENSOR_DESCRIPTOR(input);
......
......@@ -85,44 +85,24 @@ void* cudart_dso_handle = nullptr;
#define DYNAMIC_LOAD_CUDART_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
cudaError_t operator()(Args... args) { \
typedef cudaError_t (*cudartFunc)(Args...); \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using cudart_func = decltype(__name(args...))(*)(Args...); \
std::call_once(cudart_dso_flag, GetCudartDsoHandle, \
&cudart_dso_handle); \
void* p_##__name = dlsym(cudart_dso_handle, #__name); \
return reinterpret_cast<cudartFunc>(p_##__name)(args...); \
return reinterpret_cast<cudart_func>(p_##__name)(args...); \
} \
} __name; /* struct DynLoad__##__name */
#else
#define DYNAMIC_LOAD_CUDART_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
cudaError_t operator()(Args... args) { \
auto operator()(Args... args) -> decltype(__name(args...)) { \
return __name(args...); \
} \
} __name; /* struct DynLoad__##__name */
#endif
#ifdef PADDLE_USE_DSO
struct DynLoad__cudaGetErrorString {
template <typename... Args>
const char* operator()(Args... args) {
typedef const char* (*cudaFunc)(Args...);
std::call_once(cudart_dso_flag, GetCudartDsoHandle,
&cudart_dso_handle);
void* p_func = dlsym(cudart_dso_handle, "cudaGetErrorString");
return reinterpret_cast<cudaFunc>(p_func)(args...);
}
} cudaGetErrorString; /* struct DynLoad__cudaGetErrorString */
#else
struct DynLoad__cudaGetErrorString {
template <typename... Args>
const char* operator()(Args... args) {
return cudaGetErrorString(args...);
}
} cudaGetErrorString; /* struct DynLoad__cudaGetErrorString */
#endif
/* include all needed cuda functions in HPPL */
#define CUDA_ROUTINE_EACH(__macro) \
__macro(cudaMalloc) \
......@@ -152,7 +132,8 @@ struct DynLoad__cudaGetErrorString {
__macro(cudaSetDeviceFlags) \
__macro(cudaGetLastError) \
__macro(cudaFuncSetCacheConfig) \
__macro(cudaRuntimeGetVersion)
__macro(cudaRuntimeGetVersion) \
__macro(cudaGetErrorString)
CUDA_ROUTINE_EACH(DYNAMIC_LOAD_CUDART_WRAP)
......
......@@ -20,6 +20,7 @@ limitations under the License. */
#include "hl_sequence.h"
#include "paddle/utils/Logging.h"
#include "hl_device_functions.cuh"
#include "hl_gpu_matrix_kernel.cuh"
DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1*a + p2*b);
......@@ -673,3 +674,89 @@ void hl_cossim_derivative(real* grad,
input1_height, input2_height, scale);
CHECK_SYNC("hl_cossim_derivate failed");
}
__global__ void KeMatrixAddSharedBias(real* A,
real* B,
const int channel,
const int M,
const int N,
real scale) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
int dim = N / channel;
if (index < M * N) {
int i = index % N;
i = i / dim;
A[index] += scale * B[i];
}
}
void hl_matrix_add_shared_bias(real* A_d,
real* B_d,
const int channel,
const int dimM,
const int dimN,
real scale) {
const int blocks = 512;
const int grids = DIVUP(dimM * dimN, blocks);
KeMatrixAddSharedBias<<<grids, blocks, 0, STREAM_DEFAULT>>>
(A_d, B_d, channel, dimM, dimN, scale);
CHECK_SYNC("hl_matrix_add_shared_bias failed");
}
template <int blockSize>
__global__ void KeMatrixCollectSharedBias(real *B,
real *A,
const int channel,
const int M,
const int N,
const int dim,
const int limit,
real scale) {
if (dim < limit) {
int index = blockIdx.x * blockDim.x + threadIdx.x;
if (index < channel) {
real sum = 0.0;
for (int i = 0; i < M; ++i) {
for (int j = 0; j < dim; ++j) {
sum += A[i * N + index * dim + j];
}
}
B[index] += scale * sum;
}
} else {
const int tid = threadIdx.x;
const int bid = blockIdx.x;
__shared__ real smem[blockSize];
real sum = 0.0;
for (int j = 0; j < ((dim * M + blockSize - 1) / blockSize); ++j) {
int n = j * blockSize + tid;
int m = n / dim;
int w = n % dim;
smem[tid] = (m < M && w < dim) ? A[m * N + bid * dim + w] : 0.0;
__syncthreads();
simpleReduce(smem, tid, blockSize);
sum += smem[0];
}
if (tid == 0) {
B[bid] += scale * sum;
}
}
}
void hl_matrix_collect_shared_bias(real* B_d,
real* A_d,
const int channel,
const int dimM,
const int dimN,
real scale) {
const int dim = dimN / channel;
const int blocks = 256;
const int limit = 64;
int grids = (dimM * dim) < limit ? DIVUP(channel, blocks) : channel;
KeMatrixCollectSharedBias<blocks>
<<< grids, blocks, 0, STREAM_DEFAULT>>>
(B_d, A_d, channel, dimM, dimN, dim, limit, scale);
CHECK_SYNC("hl_matrix_collect_shared_bias failed");
}
......@@ -908,24 +908,6 @@ int findIndex(int* indice, int num, int index) {
return (end - 1);
}
/**
* @brief sum reduction
*
* @param[in,out] smem input data, better to use __shared__ memory.
* @param[in] tid local thread index.
* @param[in] blockDimX the size of blockDim.x.
*
* note: return smem[0]: the sum of each elements of smem.
*/
__device__ __forceinline__
void reduce(real* smem, int tid, int blockDimX) {
for (unsigned int s = blockDimX / 2; s > 0; s >>= 1) {
if (tid < s) {
smem[tid] += smem[tid + s];
}
__syncthreads();
}
}
/**
* @brief sum columns of csr sparse matrix (csr_val), then add to a_val.
......
......@@ -46,63 +46,100 @@ static inline std::string join(const std::string& part1, const std::string& part
return ret;
}
static inline void GetDsoHandleWithSearchPath(
static inline void GetDsoHandleFromDefaultPath(
std::string& dso_path, void** dso_handle, int dynload_flags) {
LOG(INFO) << "Try to find cuda library: " << dso_path
<< " from default system path.";
// default search from LD_LIBRARY_PATH/DYLD_LIBRARY_PATH
*dso_handle = dlopen(dso_path.c_str(), dynload_flags);
// DYLD_LIBRARY_PATH is disabled after Mac OS 10.11 to
// bring System Integrity Projection (SIP), if dso_handle
// is null, search from default package path in Mac OS.
#if defined(__APPLE__) || defined(__OSX__)
if (nullptr == *dso_handle) {
dso_path = join("/usr/local/cuda/lib/", dso_path);
*dso_handle = dlopen(dso_path.c_str(), dynload_flags);
if (nullptr == *dso_handle) {
if (dso_path == "libcudnn.dylib") {
LOG(FATAL) << "Note: [Recommend] copy cudnn into /usr/local/cuda/ \n"
<< "For instance, sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C "
<< "/usr/local \n sudo chmod a+r /usr/local/cuda/include/cudnn.h "
<< "/usr/local/cuda/lib/libcudnn*";
}
}
}
#endif
}
static inline void GetDsoHandleFromSearchPath(
const std::string& search_root,
const std::string& dso_path,
const std::string& dso_name,
void** dso_handle) {
int dynload_flags = RTLD_LAZY | RTLD_LOCAL;
*dso_handle = nullptr;
std::string dlPath = dso_path;
std::string dlPath = dso_name;
if (search_root.empty()) {
// default search xxx.so from LD_LIBRARY_PATH
*dso_handle = dlopen(dlPath.c_str(), dynload_flags);
GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
} else {
// search xxx.so from custom path
dlPath = join(search_root, dso_path);
dlPath = join(search_root, dso_name);
*dso_handle = dlopen(dlPath.c_str(), dynload_flags);
// then, search xxx.so from LD_LIBRARY_PATH
if (nullptr == *dso_handle) {
*dso_handle = dlopen(dso_path.c_str(), dynload_flags);
// if not found, search from default path
if (nullptr == *dso_handle) {
LOG(WARNING) << "Failed to find cuda library: " << dlPath;
dlPath = dso_name;
GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
}
}
CHECK(nullptr != *dso_handle)
<< "For Gpu version of PaddlePaddle, it couldn't find CUDA library: "
<< dlPath.c_str() << ". Please make sure you already specify its path. "
<< "Note: for training data on Cpu using Gpu version of PaddlePaddle, "
<< "you must specify libcudart via export LD_LIBRARY_PATH for Linux or "
<< "export DYLD_LIBRARY_PATH for MAC OS.";
<< "Failed to find cuda library: " << dlPath << std::endl
<< "Please specify its path correctly using one of the following ideas: \n"
<< "Idea 1. set cuda and cudnn lib path at runtime. "
<< "http://www.paddlepaddle.org/doc/ui/cmd_argument/argument_outline.html \n"
<< "For instance, issue command: paddle train --use_gpu=1 "
<< "--cuda_dir=/usr/local/cudnn/lib --cudnn_dir=/usr/local/cudnn/lib ...\n"
<< "Idea 2. set environment variable LD_LIBRARY_PATH on Linux or "
<< "DYLD_LIBRARY_PATH on Mac OS. \n"
<< "For instance, issue command: export LD_LIBRARY_PATH=... \n"
<< "Note: After Mac OS 10.11, using the DYLD_LIBRARY_PATH is impossible "
<< "unless System Integrity Protection (SIP) is disabled. However, @Idea 1"
<< "always work well.";
}
void GetCublasDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
#else
GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
#endif
}
void GetCudnnDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleWithSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle);
GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle);
#else
GetDsoHandleWithSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle);
GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle);
#endif
}
void GetCudartDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleWithSearchPath("", "libcudart.dylib", dso_handle);
GetDsoHandleFromSearchPath("", "libcudart.dylib", dso_handle);
#else
GetDsoHandleWithSearchPath("", "libcudart.so", dso_handle);
GetDsoHandleFromSearchPath("", "libcudart.so", dso_handle);
#endif
}
void GetCurandDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
#else
GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
#endif
}
......@@ -295,6 +295,7 @@ void forward(Argument& act) {
void backward(Argument& act) { act.grad->squareDerivative(*act.in); }
END_DEFINE_ACTIVATION(square)
/**
* @brief Exponential Activation.
* \f[
......@@ -307,8 +308,36 @@ void forward(Argument& act) { act.value->exp(*act.value); }
void backward(Argument& act) { act.grad->expDerivative(*act.value); }
END_DEFINE_ACTIVATION(exponential)
/**
* @brief Logarithm Activation.
* \f[
* f(z) = log(z)
* \f]
*/
BEGIN_DEFINE_ACTIVATION(log)
void forward(Argument& act) {
SetDevice device(act.deviceId);
Matrix::resizeOrCreate(act.in, act.value->getHeight(), act.value->getWidth(),
/* trans */ false, useGpu(act.deviceId));
act.in->copyFrom(*act.value);
act.value->log(*act.value);
}
void backward(Argument& act) { act.grad->dotDiv(*act.grad, *act.in); }
END_DEFINE_ACTIVATION(log)
ActivationFunction* ActivationFunction::create(const std::string& type) {
return gActivationRegistrar.createByType(type);
}
std::vector<std::string> ActivationFunction::getAllRegisteredTypes() {
std::vector<std::string> types;
gActivationRegistrar.forEachType([&](const std::string& type) {
types.push_back(type);
});
return types;
}
} // namespace paddle
......@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include <string>
#include <vector>
namespace paddle {
......@@ -32,6 +33,7 @@ struct Argument;
class ActivationFunction {
public:
static ActivationFunction* create(const std::string& type);
static std::vector<std::string> getAllRegisteredTypes();
ActivationFunction() {}
......
......@@ -131,9 +131,10 @@ void DoubleBuffer::asyncLoadBatch() {
taskReadySem_.wait();
if (stopping_) break;
while (batchSize_ == 0) {
while (batchSize_ == 0 && !stopping_) {
usleep(5);
}
if (stopping_) break;
do {
DataBatch newBatch;
......
......@@ -433,26 +433,34 @@ private:
inline void resetImpl(bool startNewThread) {
DBG << "Reseting " << startNewThread;
exit_.store(true);
if (loadThread_) { // is loading.
exit_.store(true);
loadThread_->join();
loadThread_.reset();
}
{
PyGuard g;
callingContexts_.clear();
this->pullCV_.notify_one();
}
std::lock_guard<std::mutex> guard(mutexForReset_);
{
PyGuard g;
dataPool_.clear();
}
poolActualSize_ = 0;
exit_ = false;
if (startNewThread && cache_->reset()) {
DBG << "Start new thread.";
loadThread_.reset(new std::thread([this] {
exit_ = false;
loadThread();
}));
callingContextCreated_.wait();
}
DBG << "Reset done";
exit_ = false;
}
private:
......@@ -465,6 +473,8 @@ private:
std::condition_variable pullCV_;
std::mutex mtx_;
std::mutex mutexForReset_;
ThreadBarrier callingContextCreated_;
std::unique_ptr<IPyDataProviderCache> cache_;
......@@ -529,6 +539,7 @@ public:
* Loading a batch of data.
*/
int64_t getNextBatchInternal(int64_t size_, DataBatch *batch) {
std::lock_guard<std::mutex> guard(mutexForReset_);
REGISTER_TIMER("PyDP2.getNextBatchInternal")
CHECK_GE(size_, 0);
size_t size = (size_t) size_;
......@@ -554,6 +565,10 @@ public:
} else { // loading from cache.
poolPtr = this->cache_->load();
}
if (exit_) {
// PyDataProvider is destructing.
return 0;
}
CHECK(poolPtr != nullptr);
std::deque<PyObjectPtr>& pool = *poolPtr;
......
......@@ -28,6 +28,12 @@ void ParallelNeuralNetwork::init(
const std::vector<ParameterType>& parameterTypes, bool useGpu) {
NeuralNetwork::init(config, callback, parameterTypes, useGpu);
if (config.type() == "recurrent_nn") {
LOG(FATAL)
<< "You can not add `--parallel_nn=true` on the command line, "
<< "parallel_nn training mode does not support the recurrent_nn model.";
}
useGpu_ = useGpu;
numDevices_ = 0;
if (useGpu_) {
......
......@@ -97,7 +97,8 @@ void ConcatenateLayer::backward(const UpdateCallback& callback) {
*/
class ConcatenateLayer2 : public Layer {
public:
explicit ConcatenateLayer2(const LayerConfig& config) : Layer(config) {}
explicit ConcatenateLayer2(const LayerConfig& config) :
Layer(config) {}
~ConcatenateLayer2() {}
......@@ -110,6 +111,8 @@ protected:
std::vector<std::unique_ptr<Projection>> projections_;
std::vector<Argument> projOutput_;
std::vector<std::pair<size_t, size_t>> projCol_;
bool sharedBias_;
std::unique_ptr<Weight> biases_;
};
REGISTER_LAYER(concat2, ConcatenateLayer2);
......@@ -119,7 +122,6 @@ bool ConcatenateLayer2::init(const LayerMap& layerMap,
/* Initialize the basic parent class */
if (!Layer::init(layerMap, parameterMap)) return false;
CHECK(!biasParameter_);
CHECK_EQ(inputLayers_.size(), parameters_.size());
projections_.reserve(inputLayers_.size());
projCol_.reserve(inputLayers_.size());
......@@ -137,6 +139,13 @@ bool ConcatenateLayer2::init(const LayerMap& layerMap,
}
CHECK_EQ(getSize(), endCol);
/* initialize biases_ */
if (biasParameter_.get() != NULL) {
sharedBias_ = config_.shared_biases();
size_t psize = config_.bias_size();
biases_ = std::unique_ptr<Weight>(new Weight(1, psize, biasParameter_));
}
return true;
}
......@@ -154,8 +163,17 @@ void ConcatenateLayer2::forward(PassType passType) {
projOutput_[i].grad = output_.grad->subColMatrix(startCol, endCol);
}
for (size_t i = 0; i != inputLayers_.size(); ++i) {
projections_[i]->forward(&getInput(i), &projOutput_[i], passType);
{
AsyncGpuBlock block;
for (size_t i = 0; i != inputLayers_.size(); ++i) {
projections_[i]->forward(&getInput(i), &projOutput_[i], passType);
}
}
/* add the bias-vector */
if (biases_) {
REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
output_.value->addBias(*(biases_->getW()), 1, sharedBias_);
}
/* activation */ {
......@@ -170,6 +188,13 @@ void ConcatenateLayer2::backward(const UpdateCallback& callback) {
backwardActivation();
}
AsyncGpuBlock block;
if (biases_ && biases_->getWGrad()) {
REGISTER_TIMER_INFO("Concat2BpBiasTimer", getName().c_str());
biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_);
biases_->getParameterPtr()->incUpdate(callback);
}
for (size_t i = 0; i != inputLayers_.size(); ++i) {
if (projections_[i]) {
projections_[i]->backward(callback);
......
......@@ -35,25 +35,12 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
filterSizeY_.push_back(conf.filter_size_y());
filterPixels_.push_back(filterSize_.back() * filterSizeY_.back());
channels_.push_back(conf.channels());
imgSize_.push_back(conf.img_size());
imgPixels_.push_back(imgSize_.back() * imgSize_.back());
imgSizeH_.push_back(conf.img_size());
imgSizeW_.push_back(conf.img_size());
groups_.push_back(conf.groups());
filterChannels_.push_back(conf.filter_channels());
outputX_.push_back(conf.output_x());
outputs_.push_back(outputX_.back() * outputX_.back());
}
/* initialize the weightList */
CHECK(inputLayers_.size() == parameters_.size());
for (size_t i = 0; i < inputLayers_.size(); i++) {
size_t height, width;
height = filterPixels_[i] * filterChannels_[i];
width = numFilters_;
// create a new weight
CHECK_EQ(parameters_[i]->getSize(), width * height);
Weight* w = new Weight(height, width, parameters_[i]);
weights_.emplace_back(w);
outputH_.push_back(conf.output_x());
outputW_.push_back(conf.output_x());
}
/* initialize the biases_ */
......@@ -74,4 +61,34 @@ bool ConvBaseLayer::init(const LayerMap& layerMap,
return true;
}
size_t ConvBaseLayer::calOutputSize() {
auto clearAndReserve = [this](IntV* vec) {
vec->clear();
vec->reserve(this->inputLayers_.size());
};
clearAndReserve(&imgSizeH_);
clearAndReserve(&imgSizeW_);
clearAndReserve(&outputH_);
clearAndReserve(&outputW_);
size_t layerSize = 0;
for (size_t i = 0; i < inputLayers_.size(); i++) {
imgSizeH_.push_back(inputLayers_[i]->getOutput().getFrameHeight());
imgSizeW_.push_back(inputLayers_[i]->getOutput().getFrameWidth());
if (imgSizeH_[i] == 0)
imgSizeH_[i] = config_.inputs(i).conv_conf().img_size();
if (imgSizeW_[i] == 0)
imgSizeW_[i] = config_.inputs(i).conv_conf().img_size();
outputH_.push_back(
outputSize(imgSizeH_[i], filterSizeY_[i], paddingY_[i], strideY_[i]));
outputW_.push_back(
outputSize(imgSizeW_[i], filterSize_[i], padding_[i], stride_[i]));
CHECK_EQ(outputH_[i], outputH_[0]);
CHECK_EQ(outputW_[i], outputW_[0]);
}
getOutput().setFrameHeight(outputH_[0]);
getOutput().setFrameWidth(outputW_[0]);
layerSize = outputH_[0] * outputW_[0] * size_t(numFilters_);
return layerSize;
}
} // namespace paddle
......@@ -43,19 +43,18 @@ protected:
IntV filterSizeY_;
/// The spatial dimensions of the convolution input.
IntV channels_;
/// The spatial dimensions of input feature map.
IntV imgSize_;
/// The total pixel size of input feature map.
/// imgPixels_ = imgSizeX_ * imgSizeY_.
IntV imgPixels_;
/// The height of the input feature map.
IntV imgSizeH_;
/// The width of the input feature map.
IntV imgSizeW_;
/// filterPixels_ = filterSizeX_ * filterSizeY_.
IntV filterPixels_;
/// filterChannels_ = channels_/groups_.
IntV filterChannels_;
/// The spatial dimensions of output feature map.
IntV outputX_;
/// The spatial dimensions of output feature map.
IntV outputs_;
/// The height of the output feature map.
IntV outputH_;
/// The width of the output feature map.
IntV outputW_;
/// Group size, refer to grouped convolution in
/// Alex Krizhevsky's paper: when group=2, the first half of the
/// filters are only connected to the first half of the input channels,
......@@ -80,6 +79,13 @@ public:
virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
/**
* In this function, imgSizeH_ and imgSizeW_ are set according to the previous
* input layers. It then calculates outputH_ and outputW_ and sets them on the
* output argument.
*/
virtual size_t calOutputSize();
Weight& getWeight(int idx) { return *weights_[idx]; }
/**
......
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/utils/Stat.h"
#include "ConvProjection.h"
namespace paddle {
REGISTER_PROJECTION(conv, ConvProjection);
ThreadLocalD<std::vector<MemoryHandle*>> ConvProjection::convMem_;
ConvProjection::ConvProjection(const ProjectionConfig& config,
ParameterPtr parameter, bool useGpu)
: Projection(config, parameter, useGpu) {
CHECK(useGpu); // only support GPU
getConvParams();
initCudnn();
size_t height = filterH_ * filterW_ * channels_ / groups_;
size_t width = numFilters_;
weight_.reset(new Weight(height, width, parameter));
weightOffset_ = height * width / groups_;
}
void ConvProjection::getConvParams() {
const ConvConfig &conf = config_.conv_conf();
paddingH_ = conf.padding_y();
paddingW_ = conf.padding();
strideH_ = conf.stride_y();
strideW_ = conf.stride();
filterH_ = conf.filter_size_y();
filterW_ = conf.filter_size();
configImgH_ = conf.img_size();
configImgW_ = conf.img_size();
channels_ = conf.channels();
numFilters_ = config_.num_filters();
groups_ = conf.groups();
CHECK_EQ(channels_ % groups_, 0);
CHECK_EQ(numFilters_ % groups_, 0);
}
void ConvProjection::initCudnn() {
hl_create_filter_descriptor(&filterDesc_, channels_, numFilters_,
filterH_, filterW_);
hl_create_tensor_descriptor(&inputDesc_);
hl_create_tensor_descriptor(&outputDesc_);
hl_create_convolution_descriptor(&convDesc_, inputDesc_, filterDesc_,
paddingH_, paddingW_, strideH_, strideW_);
// initialize all to default algorithms
fwdAlgo_ = 0;
bwdFilterAlgo_ = 0;
bwdDataAlgo_ = 0;
fwdLimitBytes_ = 0;
bwdDataLimitBytes_ = 0;
bwdFilterLimitBytes_ = 0;
workSpaceInBytes_ = 0;
batchNum_ = 0;
isSelectAlgo_ = false;
}
void ConvProjection::reshapeTensorDesc(int batchSize) {
hl_tensor_reshape(inputDesc_, batchSize, channels_, imageH_, imageW_,
channels_ * imageH_ * imageW_, imageH_ * imageW_,
imageW_, 1);
hl_reset_convolution_descriptor(convDesc_, inputDesc_, filterDesc_,
paddingH_, paddingW_, strideH_, strideW_);
// The stride between two consecutive images in the output of ConvProjection
// may not equal the projection's own output width. For example, when layer
// ConcatenateLayer2 contains two ConvProjections, the stride is the
// output_size of ConcatenateLayer2, so the calculation of nStride differs
// from CudnnConvLayer. In fact, "nStride = out_->value->getStride()" alone
// would be sufficient.
size_t nStride = numFilters_ * outputH_ * outputW_;
if (out_->value->isContiguous()) {
CHECK_EQ(nStride, out_->value->getWidth());
} else {
nStride = out_->value->getStride();
}
hl_tensor_reshape(outputDesc_, batchSize, numFilters_, outputH_, outputW_,
nStride, outputH_ * outputW_, outputW_, 1);
}
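To make the stride comment above concrete, here is a minimal standalone sketch; the sizes are assumptions borrowed from the conv_projection test config later in this commit (two 16-filter 1x1 projections over a 16x16 input), used purely for illustration.

#include <cstdio>

int main() {
  // Two conv projections, each with 16 filters and a 16x16 output map,
  // concatenated by one concat layer (illustrative sizes only).
  int numFilters = 16, outputH = 16, outputW = 16;
  int projWidth = numFilters * outputH * outputW;   // 4096 columns per projection
  int concatWidth = 2 * projWidth;                  // 8192 columns in the concat output
  // A standalone projection output is contiguous, so the stride between two
  // consecutive samples equals its own width (projWidth). Inside the concat
  // layer each projection writes a sub-column block of the wider matrix, so
  // the correct stride is the concat width, i.e. out_->value->getStride().
  std::printf("contiguous nStride = %d, concatenated nStride = %d\n",
              projWidth, concatWidth);
  return 0;
}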
void ConvProjection::reshape(int batchSize) {
size_t width = calOutputSize();
CHECK_EQ(width, out_->value->getWidth());
isSelectAlgo_ = (batchSize == batchNum_);
batchNum_ = batchSize;
if (!isSelectAlgo_) {
reshapeTensorDesc(batchSize);
hl_conv_workspace(inputDesc_, outputDesc_, filterDesc_,
convDesc_, &fwdAlgo_, &fwdLimitBytes_,
&bwdDataAlgo_, &bwdDataLimitBytes_,
&bwdFilterAlgo_, &bwdFilterLimitBytes_);
size_t maxWorkSpace = 0;
maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_);
workSpaceInBytes_ = maxWorkSpace;
VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_
<< " / " << bwdDataAlgo_
<< " / " << bwdFilterAlgo_;
}
isSelectAlgo_ = true;
}
void ConvProjection::forward() {
int batchSize = in_->value->getHeight();
reshape(batchSize);
void* workSpace = NULL;
if (workSpaceInBytes_ > 0) {
workSpace = getSpaceBytes(workSpaceInBytes_);
}
for (int g = 0; g < groups_; ++g) {
REGISTER_TIMER_INFO("CudnnConvFwTimer", getName().c_str());
real *inputData = in_->value->getData() + g * inputOffset_;
real *wgtData = weight_->getW()->getData() + g * weightOffset_;
real *outData = out_->value->getData() + g * outputOffset_;
hl_convolution_forward(inputDesc_, inputData, outputDesc_,
outData, filterDesc_, wgtData,
convDesc_, workSpace,
fwdLimitBytes_, fwdAlgo_);
}
}
void ConvProjection::backward(const UpdateCallback& callback) {
REGISTER_TIMER_INFO("CudnnConvBpTimer", getName().c_str());
void* workSpace = NULL;
if (workSpaceInBytes_ > 0) {
workSpace = getSpaceBytes(workSpaceInBytes_);
}
for (int g = 0; g < groups_; ++g) {
real *outGrad = out_->grad->getData() + g * outputOffset_;
if (weight_->getWGrad()) {
real *inputData = in_->value->getData() + g * inputOffset_;
real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_;
hl_convolution_backward_filter(
inputDesc_, inputData, outputDesc_, outGrad, filterDesc_,
weightGrad, convDesc_, workSpace, bwdFilterLimitBytes_,
bwdFilterAlgo_);
}
MatrixPtr preGrad = in_->grad;
if (NULL != preGrad) {
real *inputGrad = preGrad->getData() + g * inputOffset_;
real *wgtData = weight_->getW()->getData() + g* weightOffset_;
hl_convolution_backward_data(
inputDesc_, inputGrad, outputDesc_, outGrad, filterDesc_,
wgtData, convDesc_, workSpace, bwdDataLimitBytes_,
bwdDataAlgo_);
}
}
weight_->getParameterPtr()->incUpdate(callback);
}
void* ConvProjection::getSpaceBytes(size_t size) {
std::vector<MemoryHandle*>& convMem = *convMem_;
if (convMem.empty()) {
int numDevices = hl_get_device_count();
convMem.resize(numDevices);
}
int devId = hl_get_device();
MemoryHandle** localMem = &(convMem[devId]);
if (NULL == *localMem || size > (*localMem)->getAllocSize()) {
*localMem = new GpuMemoryHandle(size);
}
return (*localMem)->getBuf();
}
ConvProjection::~ConvProjection() {
hl_destroy_tensor_descriptor(inputDesc_);
hl_destroy_tensor_descriptor(outputDesc_);
hl_destroy_filter_descriptor(filterDesc_);
hl_destroy_convolution_descriptor(convDesc_);
}
} // namespace paddle
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "Projection.h"
namespace paddle {
/**
* @brief Convolution projection that performs the same calculation as CudnnConvLayer.
*/
class ConvProjection : public Projection {
public:
/**
* Constructor.
*/
ConvProjection(const ProjectionConfig& config, ParameterPtr parameter,
bool useGpu);
~ConvProjection();
virtual void forward();
virtual void backward(const UpdateCallback& callback);
protected:
void getConvParams();
void initCudnn();
void reshapeTensorDesc(int batchSize);
void reshape(int batchSize);
int outputSize(int imageSize, int filterSize, int padding, int stride) {
return (imageSize - filterSize + 2 * padding) / stride + 1;
}
size_t calOutputSize() {
imageH_ = in_->getFrameHeight();
imageW_ = in_->getFrameWidth();
if (imageH_ == 0) imageH_ = configImgH_;
if (imageW_ == 0) imageW_ = configImgW_;
outputH_ = outputSize(imageH_, filterH_, paddingH_, strideH_);
outputW_ = outputSize(imageW_, filterW_, paddingW_, strideW_);
const_cast<Argument*>(out_)->setFrameHeight(outputH_);
const_cast<Argument*>(out_)->setFrameWidth(outputW_);
inputOffset_ = (channels_ / groups_) * imageH_ * imageW_;
outputOffset_ = (numFilters_ / groups_) * outputH_ * outputW_;
return outputH_ * outputW_ * numFilters_;
}
static void* getSpaceBytes(size_t size);
/// imageH_ and imageW_ are calculated from the input layer.
int imageH_, imageW_;
/// configImgH_ and configImgW_ are obtained from the config.
int configImgH_, configImgW_;
int outputH_, outputW_;
int channels_, numFilters_;
int paddingH_, paddingW_;
int strideH_, strideW_;
int filterH_, filterW_;
/// One group offset of input data.
int inputOffset_;
/// One group offset of output data.
int outputOffset_;
/// One group offset of weight.
int weightOffset_;
int groups_;
/// Cudnn tensor descriptor for input.
hl_tensor_descriptor inputDesc_;
/// Cudnn tensor descriptor for output.
hl_tensor_descriptor outputDesc_;
/// Cudnn tensor descriptor for filter.
hl_filter_descriptor filterDesc_;
/// Cudnn tensor descriptor for a convolution operation.
hl_convolution_descriptor convDesc_;
/// Record the algorithm for forward convolution, obtained from the cuDNN API
/// that searches for the best-suited algorithm.
int fwdAlgo_;
/// Record the algorithm for computing convolution gradient with respect to
/// filter coefficients.
int bwdFilterAlgo_;
/// Record the algorithm for computing convolution gradient with respect to
/// the input data.
int bwdDataAlgo_;
/// Amount of GPU memory needed as workspace to be able to execute a
/// forward convolution with the specified algo.
size_t fwdLimitBytes_;
/// Amount of GPU memory needed as workspace to be able to execute a
/// backwardData with the specified algo.
size_t bwdDataLimitBytes_;
/// Amount of GPU memory needed as workspace to be able to execute a
/// backwardFilter with the specified algo.
size_t bwdFilterLimitBytes_;
/// Size of total work space.
size_t workSpaceInBytes_;
/// Whether to call cuDNN api to choose conv algorithm.
bool isSelectAlgo_;
/// batchNum is used to record batch size. If the batch size is changed,
/// the selection algorithm will be called.
int batchNum_;
bool bias_;
std::unique_ptr<Weight> weight_;
static ThreadLocalD<std::vector<MemoryHandle*>> convMem_;
};
} // namespace paddle
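As a quick sanity check of the output-size formula used by outputSize() above, here is a minimal standalone sketch; the local helper name convOutputSize is ours and simply mirrors that formula, and the concrete numbers are taken from the conv projection test later in this commit (16x16 input, 2x3 filter, padding 0/1, stride 2).

#include <cassert>

// Mirror of the conv output-size formula:
// outputSize = (imageSize - filterSize + 2 * padding) / stride + 1.
static int convOutputSize(int imageSize, int filterSize, int padding, int stride) {
  return (imageSize - filterSize + 2 * padding) / stride + 1;
}

int main() {
  // Width:  (16 - 2 + 2*0) / 2 + 1 = 8
  assert(convOutputSize(16, 2, 0, 2) == 8);
  // Height: (16 - 3 + 2*1) / 2 + 1 = 8 (integer division)
  assert(convOutputSize(16, 3, 1, 2) == 8);
  return 0;
}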
......@@ -22,215 +22,64 @@ REGISTER_LAYER(cudnn_conv, CudnnConvLayer);
bool CudnnConvLayer::init(const LayerMap &layerMap,
const ParameterMap &parameterMap) {
ConvBaseLayer::init(layerMap, parameterMap);
if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;
CHECK(useGpu_) << "CudnnConvLayer only support gpu";
maxGroups_ = 0;
for (size_t i = 0; i < inputLayers_.size(); i++) {
CHECK_EQ(channels_[i] % groups_[i], 0);
CHECK_EQ(numFilters_ % groups_[i], 0);
hl_filter_descriptor filter;
hl_create_filter_descriptor(&filter, channels_[i] / groups_[i],
numFilters_ / groups_[i], filterSizeY_[i],
filterSize_[i]);
filterDesc_.push_back(filter);
hl_tensor_descriptor input;
hl_create_tensor_descriptor(&input);
inputDesc_.push_back(input);
hl_tensor_descriptor output;
int outputX =
outputSize(imgSize_[i], filterSize_[i], padding_[i], stride_[i]);
CHECK_EQ(outputX, outputX_[i]);
hl_create_tensor_descriptor(&output);
outputDesc_.push_back(output);
CHECK_EQ(inputLayers_.size(), parameters_.size());
projections_.reserve(inputLayers_.size());
projConf_.reserve(inputLayers_.size());
hl_convolution_descriptor conv;
hl_create_convolution_descriptor(&conv, input, filter, paddingY_[i],
padding_[i], strideY_[i], stride_[i]);
convDesc_.push_back(conv);
weightOffset_.push_back((numFilters_ / groups_[i]) *
(channels_[i] / groups_[i]) * filterPixels_[i]);
inputOffset_.push_back((channels_[i] / groups_[i]) * imgSize_[i] *
imgSize_[i]);
outputOffset_.push_back((numFilters_ / groups_[i]) * outputX_[i] *
outputX_[i]);
// initialize all to default algorithms
fwdAlgo_.push_back(0);
bwdFilterAlgo_.push_back(0);
bwdDataAlgo_.push_back(0);
fwdLimitBytes_.push_back(0);
bwdFilterLimitBytes_.push_back(0);
bwdDataLimitBytes_.push_back(0);
// cudnn streams per group equal to 1
if (groups_[i] > maxGroups_) {
maxGroups_ = groups_[i];
}
}
workSpaceInBytes_ = 0;
workSpaceData_ = NULL;
for (int i = 0; i < maxGroups_; ++i) {
workSpace_.push_back(NULL);
numFilters_ = config_.num_filters();
CHECK(config_.shared_biases());
for (size_t i = 0; i < inputLayers_.size(); i++) {
ProjectionConfig* conf = new ProjectionConfig();
conf->set_type("conv");
conf->set_num_filters(numFilters_);
ConvConfig* convConf = conf->mutable_conv_conf();
*convConf = *(config_.mutable_inputs(i)->mutable_conv_conf());
conf->set_input_size(getPrev(i)->getSize());
conf->set_output_size(getSize());
projConf_.emplace_back(conf);
projections_.emplace_back(Projection::create(*projConf_[i],
parameters_[i], useGpu_));
}
if (biases_.get() && sharedBiases_) {
hl_create_tensor_descriptor(&biasDesc_);
hl_create_tensor_descriptor(&outputDesc_);
hl_tensor_reshape(biasDesc_, 1, numFilters_ / groups_[0], 1, 1);
biasOffset_ = numFilters_ / groups_[0];
}
batchNum_ = 0;
isSelectAlgo_ = false;
return true;
}
void CudnnConvLayer::allocConvWorkSpace(size_t maxWorkSpace) {
size_t totalWorkSpace = maxWorkSpace * maxGroups_;
if (totalWorkSpace > workSpaceInBytes_) {
if (workSpaceInBytes_ != 0) {
hl_free_mem_device(workSpaceData_);
}
// total amount of storage needed over all groups
workSpaceData_ = hl_malloc_device(totalWorkSpace);
// update work space address for each group
for (int i = 0; i < maxGroups_; ++i) {
workSpace_[i] = reinterpret_cast<char *>(workSpaceData_)
+ i * maxWorkSpace;
}
workSpaceInBytes_ = totalWorkSpace;
}
}
void CudnnConvLayer::reshape(int batchSize) {
CHECK_NE(inputLayers_.size(), 0UL);
imageH_ = inputLayers_[0]->getOutput().getFrameHeight();
imageW_ = inputLayers_[0]->getOutput().getFrameWidth();
if (imageH_ == 0) imageH_ = imgSize_[0];
if (imageW_ == 0) imageW_ = imgSize_[0];
for (size_t i = 1; i < inputLayers_.size(); i++) {
int imageH = inputLayers_[i]->getOutput().getFrameHeight();
int imageW = inputLayers_[i]->getOutput().getFrameWidth();
if (imageH) {
CHECK_EQ(imageH_, imageH) << "Inputs must have same height.";
}
if (imageW) {
CHECK_EQ(imageW_, imageW) << "Inputs must have same width.";
}
}
outputH_ = outputSize(imageH_, filterSizeY_[0], paddingY_[0], strideY_[0]);
outputW_ = outputSize(imageW_, filterSize_[0], padding_[0], stride_[0]);
// check outputH & outputW
getOutput().setFrameHeight(outputH_);
getOutput().setFrameWidth(outputW_);
// if the batchSize remains the same, set isSelectAlgo_ true.
// Otherwise, set isSelectAlgo_ false and select algo again.
isSelectAlgo_ = (batchSize == batchNum_);
batchNum_ = batchSize;
size_t maxWorkSpace = 0;
for (size_t i = 0; i < inputLayers_.size(); i++) {
CHECK_EQ(inputLayers_[i]->getOutput().value->getWidth(),
(size_t)(channels_[i] * imageH_ * imageW_));
hl_tensor_reshape(inputDesc_[i], batchSize, channels_[i] / groups_[i],
imageH_, imageW_, channels_[i] * imageH_ * imageW_,
imageH_ * imageW_, imageW_, 1);
hl_tensor_reshape(outputDesc_[i], batchSize, numFilters_ / groups_[i],
outputH_, outputW_, numFilters_ * outputH_ * outputW_,
outputH_ * outputW_, outputW_, 1);
hl_reset_convolution_descriptor(convDesc_[i], inputDesc_[i],
filterDesc_[i], paddingY_[i],
padding_[i], strideY_[i], stride_[i]);
inputOffset_[i] = (channels_[i] / groups_[i]) * imageH_ * imageW_;
outputOffset_[i] = (numFilters_ / groups_[i]) * outputH_ * outputW_;
if (!isSelectAlgo_) {
hl_conv_workspace(inputDesc_[i], outputDesc_[i], filterDesc_[i],
convDesc_[i], &fwdAlgo_[i], &fwdLimitBytes_[i],
&bwdDataAlgo_[i], &bwdDataLimitBytes_[i],
&bwdFilterAlgo_[i], &bwdFilterLimitBytes_[i]);
maxWorkSpace = std::max(fwdLimitBytes_[i], bwdDataLimitBytes_[i]);
maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_[i]);
VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_[i]
<< " / " << bwdDataAlgo_[i]
<< " / " << bwdFilterAlgo_[i];
}
}
if (!isSelectAlgo_) {
allocConvWorkSpace(maxWorkSpace);
}
isSelectAlgo_ = true;
}
void CudnnConvLayer::forward(PassType passType) {
Layer::forward(passType);
int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
reshape(batchSize);
resetOutput(batchSize, outputH_ * outputW_ * numFilters_);
int batchSize = getInput(0).getBatchSize();
resetOutput(batchSize, calOutputSize());
for (size_t i = 0; i != inputLayers_.size(); ++i) {
REGISTER_TIMER_INFO("CudnnConvFwTimer", getName().c_str());
for (int g = 0; g < groups_[i]; ++g) {
real *inputData = getInputValue(i)->getData() + inputOffset_[i] * g;
real *wgtData = weights_[i]->getW()->getData() + weightOffset_[i] * g;
real *outData = getOutputValue()->getData() + outputOffset_[i] * g;
hl_convolution_forward(inputDesc_[i], inputData, outputDesc_[i],
outData, filterDesc_[i], wgtData,
convDesc_[i], workSpace_[g],
fwdLimitBytes_[i], fwdAlgo_[i]);
}
projections_[i]->forward(&getInput(i), &getOutput(), passType);
}
if (biases_) {
REGISTER_TIMER_INFO("CudnnConvBiasTimer", getName().c_str());
addBiases();
}
forwardActivation();
}
void CudnnConvLayer::addBiases() {
if (sharedBiases_) {
int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
hl_tensor_reshape(outputDesc_, batchSize, numFilters_ / groups_[0],
outputH_[0], outputW_[0], numFilters_ * outputH_[0] * outputW_[0],
outputH_[0] * outputW_[0], outputW_[0], 1);
outputOffset_ = getOutputValue()->getWidth() / groups_[0];
for (int g = 0; g < groups_[0]; ++g) {
real *biasData = biases_->getW()->getData() + biasOffset_ * g;
real *outData = getOutputValue()->getData() + outputOffset_[0] * g;
real *outData = getOutputValue()->getData() + outputOffset_ * g;
hl_convolution_forward_add_bias(biasDesc_, biasData,
outputDesc_[0], outData);
outputDesc_, outData);
}
} else {
LOG(FATAL) << "Not supported";
}
}
void CudnnConvLayer::bpropBiases() {
if (sharedBiases_) {
for (int g = 0; g < groups_[0]; ++g) {
real *biasGrad = biases_->getWGrad()->getData() + biasOffset_ * g;
real *outGrad = getOutputGrad()->getData() + outputOffset_[0] * g;
hl_convolution_backward_bias(biasDesc_, biasGrad,
outputDesc_[0], outGrad);
}
} else {
LOG(FATAL) << "Not supported";
}
forwardActivation();
}
void CudnnConvLayer::backward(const UpdateCallback &callback) {
......@@ -238,52 +87,23 @@ void CudnnConvLayer::backward(const UpdateCallback &callback) {
if (biases_ && biases_->getWGrad()) {
REGISTER_TIMER_INFO("CudnnConvBpBiasTimer", getName().c_str());
bpropBiases();
for (int g = 0; g < groups_[0]; ++g) {
real *biasGrad = biases_->getWGrad()->getData() + biasOffset_ * g;
real *outGrad = getOutputGrad()->getData() + outputOffset_ * g;
hl_convolution_backward_bias(biasDesc_, biasGrad, outputDesc_, outGrad);
}
biases_->getParameterPtr()->incUpdate(callback);
}
for (size_t i = 0; i != inputLayers_.size(); ++i) {
REGISTER_TIMER_INFO("CudnnConvBpTimer", getName().c_str());
for (int g = 0; g < groups_[i]; ++g) {
real *outGrad = getOutputGrad()->getData() + outputOffset_[i] * g;
if (weights_[i]->getWGrad()) {
real *inputData = getInputValue(i)->getData() + inputOffset_[i] * g;
real *weightGrad =
weights_[i]->getWGrad()->getData() + weightOffset_[i] * g;
hl_convolution_backward_filter(
inputDesc_[i], inputData, outputDesc_[i], outGrad, filterDesc_[i],
weightGrad, convDesc_[i], workSpace_[g], bwdFilterLimitBytes_[i],
bwdFilterAlgo_[i]);
}
MatrixPtr preGrad = getInputGrad(i);
if (NULL != preGrad) {
real *inputGrad = preGrad->getData() + inputOffset_[i] * g;
real *wgtData = weights_[i]->getW()->getData() + weightOffset_[i] * g;
hl_convolution_backward_data(
inputDesc_[i], inputGrad, outputDesc_[i], outGrad, filterDesc_[i],
wgtData, convDesc_[i], workSpace_[g], bwdDataLimitBytes_[i],
bwdDataAlgo_[i]);
}
}
weights_[i]->getParameterPtr()->incUpdate(callback);
projections_[i]->backward(callback);
}
}
CudnnConvLayer::~CudnnConvLayer() {
if (biasDesc_) {
if (biases_) {
hl_destroy_tensor_descriptor(biasDesc_);
}
for (size_t i = 0; i < inputDesc_.size(); i++) {
hl_destroy_tensor_descriptor(inputDesc_[i]);
hl_destroy_tensor_descriptor(outputDesc_[i]);
hl_destroy_filter_descriptor(filterDesc_[i]);
hl_destroy_convolution_descriptor(convDesc_[i]);
}
if (workSpaceInBytes_ != 0) {
hl_free_mem_device(workSpaceData_);
workSpaceInBytes_ = 0;
hl_destroy_tensor_descriptor(outputDesc_);
}
}
......
......@@ -17,12 +17,13 @@ limitations under the License. */
#include "ConvBaseLayer.h"
#include "paddle/math/Matrix.h"
#include "Projection.h"
#include <vector>
namespace paddle {
/**
* @brief A subclass of ConvBaseLayer by cuDNN implementation. It only
* @brief A 2-dimensional conv layer implemented by cuDNN. It only
* supports GPU mode. We automatically select CudnnConvLayer for GPU
* mode and ExpandConvLayer for CPU mode if you set the type to "conv".
* Users can also explicitly specify the type as "exconv" or "cudnn_conv" for
......@@ -31,81 +32,21 @@ namespace paddle {
* The config file api is img_conv_layer.
*/
class CudnnConvLayer : public ConvBaseLayer {
private:
/// resize Cudnn workspace size
void allocConvWorkSpace(size_t maxWorkSpace);
protected:
int imageH_, imageW_, outputH_, outputW_;
/// Cudnn tensor descriptor for bias.
std::vector<std::unique_ptr<ProjectionConfig>> projConf_;
std::vector<std::unique_ptr<Projection>> projections_;
hl_tensor_descriptor biasDesc_;
/// Cudnn tensor descriptor for input.
std::vector<hl_tensor_descriptor> inputDesc_;
/// Cudnn tensor descriptor for output.
std::vector<hl_tensor_descriptor> outputDesc_;
/// Cudnn tensor descriptor for filter.
std::vector<hl_filter_descriptor> filterDesc_;
/// Cudnn tensor descriptor for a convolution operation.
std::vector<hl_convolution_descriptor> convDesc_;
/// One sample offset of input data.
IntV inputOffset_;
/// One sample offset of output data.
IntV outputOffset_;
/// One group offset of weight.
IntV weightOffset_;
/// One group offset of bias.
hl_tensor_descriptor outputDesc_;
int biasOffset_;
/// Save the algorithm for forward convolution, which is obtained by cudnn
/// api to search the best suited algorithm.
std::vector<int> fwdAlgo_;
/// Save the algorithm for computing convolution gradient with respect to
/// filter coefficients.
std::vector<int> bwdFilterAlgo_;
/// Save the algorithm for computing convolution gradient with respect to
/// the output.
std::vector<int> bwdDataAlgo_;
/// Amount of GPU memory needed as workspace to be able to execute a
/// forward convolution with the specified algo.
std::vector<size_t> fwdLimitBytes_;
/// Amount of GPU memory needed as workspace to be able to execute a
/// backwardFilter with the specified algo.
std::vector<size_t> bwdFilterLimitBytes_;
/// Amount of GPU memory needed as workspace to be able to execute a
/// backwardData with the specified algo.
std::vector<size_t> bwdDataLimitBytes_;
/// Device work space address for each group.
std::vector<void*> workSpace_;
/// Max number of groups.
int maxGroups_;
/// Total work space address in device for all groups.
void* workSpaceData_;
/// Size of total work space.
size_t workSpaceInBytes_;
/// Whether the conv algorithm has been selected.
bool isSelectAlgo_;
/// batchNum is used to record batch size. If the batch size is changed,
/// the selection algorithm will be called.
int batchNum_;
int outputOffset_;
public:
explicit CudnnConvLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
~CudnnConvLayer();
/**
* Initialization. Initialize member variables and create tensor descriptors.
*/
bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
/**
* Reshape is done in each forward pass. It reshapes the tensor descriptors
* inputDesc_, outputDesc_, and convDesc_, and searches for the fastest algo,
* or the fastest algo within a given memory limit.
*/
void reshape(int batchSize);
void forward(PassType passType);
void backward(const UpdateCallback& callback);
void addBiases();
......
......@@ -37,32 +37,29 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
caffeMode_ = conf.caffe_mode();
}
/* initialize the weightList */
CHECK(inputLayers_.size() == parameters_.size());
for (size_t i = 0; i < inputLayers_.size(); i++) {
size_t height, width;
height = filterPixels_[i] * filterChannels_[i];
width = numFilters_;
// create a new weight
CHECK_EQ(parameters_[i]->getSize(), width * height);
Weight* w = new Weight(height, width, parameters_[i]);
weights_.emplace_back(w);
}
return true;
}
size_t ExpandConvLayer::getSize() {
size_t ExpandConvLayer::getOutputSize() {
CHECK_NE(inputLayers_.size(), 0UL);
imgSizeH_.clear();
imgSizeW_.clear();
outputH_.clear();
outputW_.clear();
size_t layerSize = ConvBaseLayer::calOutputSize();
subN_.clear();
size_t layerSize = 0;
for (size_t i = 0; i < inputLayers_.size(); i++) {
imgSizeH_.push_back(inputLayers_[i]->getOutput().getFrameHeight());
imgSizeW_.push_back(inputLayers_[i]->getOutput().getFrameWidth());
if (imgSizeH_[i] == 0) imgSizeH_[i] = imgSize_[i];
if (imgSizeW_[i] == 0) imgSizeW_[i] = imgSize_[i];
outputH_.push_back(
outputSize(imgSizeH_[i], filterSize_[i], padding_[i], stride_[i]));
outputW_.push_back(
outputSize(imgSizeW_[i], filterSize_[i], padding_[i], stride_[i]));
subN_.push_back(outputH_[i] * outputW_[i]);
CHECK(layerSize == 0 || subN_[i] * size_t(numFilters_) == layerSize);
layerSize = subN_[i] * numFilters_;
}
getOutput().setFrameHeight(outputH_[0]);
getOutput().setFrameWidth(outputW_[0]);
return layerSize;
}
......@@ -119,7 +116,7 @@ void ExpandConvLayer::expandFwdOnce(MatrixPtr image, int inIdx, int startIdx) {
}
void ExpandConvLayer::addSharedBias() {
size_t mapW = getSize() / numFilters_;
size_t mapW = getOutputValue()->getWidth() / numFilters_;
size_t mapH = getOutputValue()->getElementCnt() / mapW;
MatrixPtr out =
Matrix::create(getOutputValue()->getData(), mapH, mapW, false, useGpu_);
......@@ -158,7 +155,7 @@ void ExpandConvLayer::forward(PassType passType) {
* transOutValue correspond sample to one row */
int batchSize = inputLayers_[0]->getOutputValue()->getWidth();
batchSize = inputLayers_[0]->getOutputValue()->getHeight();
resetOutput(batchSize, getSize());
resetOutput(batchSize, getOutputSize());
MatrixPtr image = nullptr;
for (size_t i = 0; i != inputLayers_.size(); ++i) {
......@@ -183,7 +180,7 @@ void ExpandConvLayer::forward(PassType passType) {
}
void ExpandConvLayer::bpropSharedBias(MatrixPtr biases, MatrixPtr v) {
size_t mapW = getSize() / numFilters_;
size_t mapW = v->getWidth() / numFilters_;
size_t mapH = v->getElementCnt() / mapW;
MatrixPtr vTmp = Matrix::create(v->getData(), mapH, mapW, false, useGpu_);
......
......@@ -37,14 +37,6 @@ protected:
IntV subN_;
/// subK_ = channels_ * filterPixels_ * groups_.
IntV subK_;
/// The spatial dimensions of height of input feature map.
IntV imgSizeH_;
/// The spatial dimensions of width of input feature map.
IntV imgSizeW_;
/// The spatial dimensions of height of output feature map.
IntV outputH_;
/// The spatial dimensions of width of output feature map.
IntV outputW_;
/// Expand one sample at a time. shape:
/// (numChannels * filterPixels_, outputSizeH * outputSizeW)
MatrixPtr expandInput_;
......@@ -58,7 +50,7 @@ public:
bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
size_t getSize();
size_t getOutputSize();
/**
* Create or resize expandInput_.
......
......@@ -41,9 +41,13 @@ bool MixedLayer::init(const LayerMap& layerMap,
}
operators_.emplace_back(Operator::create(operator_conf, useGpu_));
}
/* initialize biases_ */
if (biasParameter_.get() != NULL) {
biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
sharedBias_ = config_.shared_biases();
size_t psize = config_.bias_size();
biases_ = std::unique_ptr<Weight>(
new Weight(1, psize, biasParameter_));
}
return true;
......@@ -119,12 +123,6 @@ void MixedLayer::forward(PassType passType) {
MatrixPtr outV = getOutputValue();
/* add the bias-vector */
if (biases_.get() != NULL) {
REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
outV->addBias(*(biases_->getW()), 1);
}
for (size_t i = 0; i != inputLayers_.size(); ++i) {
if (projections_[i]) {
projections_[i]->forward(&getInput(i), &output_, passType);
......@@ -140,6 +138,12 @@ void MixedLayer::forward(PassType passType) {
op->forward(ins, &output_, passType);
}
/* add the bias-vector */
if (biases_.get() != NULL) {
REGISTER_TIMER_INFO("FwBiasTimer", getName().c_str());
outV->addBias(*(biases_->getW()), 1, sharedBias_);
}
/* activation */ {
REGISTER_TIMER_INFO("FwAtvTimer", getName().c_str());
forwardActivation();
......@@ -154,7 +158,7 @@ void MixedLayer::backward(const UpdateCallback& callback) {
if (biases_ && biases_->getWGrad()) {
REGISTER_TIMER_INFO("BpBiasTimer", getName().c_str());
biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
biases_->getWGrad()->collectBias(*getOutputGrad(), 1, sharedBias_);
/* Increasing the number of gradient */
biases_->getParameterPtr()->incUpdate(callback);
......
......@@ -58,5 +58,6 @@ protected:
/// the matrix size of projection state
std::vector<int> projectionStateMatrixSize_;
std::unique_ptr<Weight> biases_;
bool sharedBias_;
};
} // namespace paddle
......@@ -20,6 +20,13 @@ add_unittest_without_exec(test_LayerGrad
add_test(NAME test_LayerGrad
COMMAND test_LayerGrad)
add_unittest_without_exec(test_ActivationGrad
test_ActivationGrad.cpp
LayerGradUtil.cpp
TestUtil.cpp)
add_test(NAME test_ActivationGrad
COMMAND test_ActivationGrad)
################## test_Evaluator #######################
add_unittest(test_Evaluator
test_Evaluator.cpp
......
......@@ -669,12 +669,14 @@ void testLayerGrad(TestConfig testConf, string testLayerName, size_t batchSize,
void testProjectionGrad(ProjectionConfig conf, InputType inputType,
size_t parameterSize, size_t batchSize, bool useGpu,
bool testState) {
bool testState, int biasSize, bool sharedBias) {
TestConfig config;
conf.set_name(conf.type());
config.layerConfig.set_type("mixed");
config.layerConfig.set_size(conf.output_size());
config.biasSize = config.layerConfig.size();
config.biasSize = biasSize == 0 ? config.layerConfig.size() : biasSize;
config.layerConfig.set_bias_size(config.biasSize);
config.layerConfig.set_shared_biases(sharedBias);
config.inputDefs.push_back(
{inputType, "layer_0", conf.input_size(), parameterSize});
*config.layerConfig.add_inputs()->mutable_proj_conf() = conf;
......
......@@ -217,7 +217,8 @@ void testLayerGrad(TestConfig testConf, string testLayerName, size_t batchSize,
void testProjectionGrad(ProjectionConfig conf, InputType inputType,
size_t parameterSize, size_t batchSize, bool useGpu,
bool testState = false);
bool testState = false, int biasSize = 0,
bool sharedBias = false);
void testOperatorGrad(TestConfig& config, OperatorConfig& operatorConf,
size_t batchSize, bool useGpu, bool testState = false);
......
#edit-mode: -*- python -*-
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
settings(batch_size=10)
data = data_layer(name ="input", size=8*16*16)
conv1 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
num_channels=8,
num_filters=16, stride=1,
bias_attr=False,
act=ReluActivation())
conv2 = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
num_channels=8,
num_filters=16, stride=1,
bias_attr=False,
act=ReluActivation())
concat = concat_layer(input=[conv1, conv2])
conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
num_channels=8,
num_filters=16, stride=1,
bias_attr=True,
act=LinearActivation())
outputs(concat, conv)
#edit-mode: -*- python -*-
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
settings(batch_size=10)
data = data_layer(name ="input", size=8*16*16)
proj1 = conv_projection(input=data, filter_size=1, filter_size_y=1,
num_channels=8, num_filters=16, stride=1)
proj2 = conv_projection(input=data, filter_size=1, filter_size_y=1,
num_channels=8, num_filters=16, stride=1)
concat = concat_layer(input=[proj1, proj2], bias_attr=False, act=ReluActivation())
proj = conv_projection(input=data, filter_size=1, filter_size_y=1,
num_channels=8, num_filters=16, stride=1)
with mixed_layer(bias_attr=True, act=LinearActivation()) as conv:
conv += proj
outputs(concat, conv)
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <vector>
#include <string>
#include "paddle/gserver/layers/DataLayer.h"
#include "ModelConfig.pb.h"
#include "paddle/trainer/Trainer.h"
#include "TestUtil.h"
#include "LayerGradUtil.h"
using namespace paddle; // NOLINT
using namespace std; // NOLINT
P_DECLARE_bool(use_gpu);
P_DECLARE_bool(thread_local_rand_use_global_seed);
void testActivation(const string& act) {
LOG(INFO) << "test activation: " << act;
size_t size = 10;
TestConfig config;
config.biasSize = 0;
config.layerConfig.set_type("addto");
config.layerConfig.set_size(size);
config.layerConfig.set_active_type(act);
config.inputDefs.push_back({INPUT_DATA, "layer_0", size, 0});
config.layerConfig.add_inputs();
for (auto useGpu : {false, true}) {
testLayerGrad(config,
act + "_activation",
100,
/* trans= */false,
useGpu,
/* useWeight */true);
}
}
TEST(Activation, activation) {
auto types = ActivationFunction::getAllRegisteredTypes();
std::set<string> excluded{"sequence_softmax"};
for (auto type : types) {
if (excluded.count(type)) continue;
testActivation(type);
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
FLAGS_thread_local_rand_use_global_seed = true;
srand(1);
return RUN_ALL_TESTS();
}
......@@ -152,6 +152,45 @@ TEST(Projection, identity) {
}
}
#ifndef PADDLE_ONLY_CPU
TEST(Projection, conv) {
const int NUM_FILTERS = 16;
const int FILTER_SIZE = 2;
const int FILTER_SIZE_Y = 3;
const int CHANNELS = 3;
const int IMAGE_SIZE = 16;
ProjectionConfig conf;
conf.set_type("conv");
conf.set_num_filters(NUM_FILTERS);
ConvConfig* conv = conf.mutable_conv_conf();
conv->set_filter_size(FILTER_SIZE);
conv->set_filter_size_y(FILTER_SIZE_Y);
conv->set_channels(CHANNELS);
conv->set_padding(0);
conv->set_padding_y(1);
conv->set_stride(2);
conv->set_stride_y(2);
conv->set_groups(1);
conv->set_filter_channels(conv->channels() / conv->groups());
conv->set_img_size(IMAGE_SIZE);
int outputSize = (2 * conv->padding() + conv->img_size() -
conv->filter_size()) / conv->stride() + 1;
int outputSizeY = (2 * conv->padding_y() + conv->img_size() -
conv->filter_size_y()) / conv->stride_y() + 1;
conv->set_output_x(outputSize);
conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS);
conf.set_output_size(outputSize * outputSizeY * NUM_FILTERS);
testProjectionGrad(conf, INPUT_DATA,
/* parameterSize */ NUM_FILTERS * CHANNELS * FILTER_SIZE * FILTER_SIZE_Y,
/* batchSize */ 100, true, false, NUM_FILTERS, true);
}
#endif
TEST(Layer, concat) {
TestConfig config;
config.biasSize = 0;
......
......@@ -236,6 +236,15 @@ TEST(Compare, img_pool) {
compareNetwork(config_file_a, config_file_b);
FLAGS_use_gpu = useGpu;
}
TEST(Compare, img_conv) {
std::string config_file_a = "./gserver/tests/img_conv_a.conf";
std::string config_file_b = "./gserver/tests/img_conv_b.conf";
bool useGpu = FLAGS_use_gpu;
FLAGS_use_gpu = true;
compareNetwork(config_file_a, config_file_b);
FLAGS_use_gpu = useGpu;
}
#endif
......
......@@ -353,6 +353,23 @@ TEST(PyDataProvider2, test_check) {
}
}
TEST(PyDataProvider2, multiThread) {
paddle::DataConfig config;
config.set_type("py2");
config.set_files(FLAGS_train_list.c_str());
config.set_load_data_module("test_PyDataProvider2");
config.set_load_data_object("test_dense_no_seq");
config.set_async_load_data(true);
std::unique_ptr<paddle::DataProvider> provider(
paddle::DataProvider::create(config, false));
provider->reset();
paddle::DataBatch batch;
provider->getNextBatch(100, &batch);
provider->reset();
provider.reset();
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
paddle::initMain(argc, argv);
......
......@@ -39,6 +39,46 @@ void gemm<double>(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
beta, C, ldc);
}
template<>
int getrf<float>(const CBLAS_ORDER order, const int M, const int N,
float *A, const int lda, int *ipiv) {
#ifdef PADDLE_USE_ATLAS
return clapack_sgetrf(order, M, N, A, lda, ipiv);
#else
return LAPACKE_sgetrf(order, M, N, A, lda, ipiv);
#endif
}
template<>
int getrf<double>(const CBLAS_ORDER order, const int M, const int N,
double *A, const int lda, int *ipiv) {
#ifdef PADDLE_USE_ATLAS
return clapack_dgetrf(order, M, N, A, lda, ipiv);
#else
return LAPACKE_dgetrf(order, M, N, A, lda, ipiv);
#endif
}
template<>
int getri<float>(const CBLAS_ORDER order, const int N, float *A,
const int lda, const int *ipiv) {
#ifdef PADDLE_USE_ATLAS
return clapack_sgetri(order, N, A, lda, ipiv);
#else
return LAPACKE_sgetri(order, N, A, lda, ipiv);
#endif
}
template<>
int getri<double>(const CBLAS_ORDER order, const int N, double *A,
const int lda, const int *ipiv) {
#ifdef PADDLE_USE_ATLAS
return clapack_dgetri(order, N, A, lda, ipiv);
#else
return LAPACKE_dgetri(order, N, A, lda, ipiv);
#endif
}
template<>
void axpy<float>(const int n, const float alpha, const float* x, float* y) {
cblas_saxpy(n, alpha, x, 1, y, 1);
......
......@@ -17,10 +17,18 @@ limitations under the License. */
#ifdef PADDLE_USE_MKL
#include <mkl.h>
#include <mkl_lapacke.h>
#else
extern "C" {
#include <cblas.h>
}
#ifdef PADDLE_USE_ATLAS
extern "C" {
#include <clapack.h>
}
#else
#include <lapacke.h>
#endif
#endif
#include <cmath>
......@@ -34,6 +42,14 @@ void gemm(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
const T* B, const int ldb,
const T beta, T* C, const int ldc);
template<class T>
int getrf(const CBLAS_ORDER Order, const int M, const int N,
T *A, const int lda, int *ipiv);
template<class T>
int getri(const CBLAS_ORDER Order, const int N, T *A,
const int lda, const int *ipiv);
template<class T>
void axpy(const int n, const T alpha, const T* x, T* y);
......
......@@ -336,11 +336,44 @@ void GpuMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
hl_matrix_transpose(data, dataTrans, height_, width_, lda, ldc);
}
MatrixPtr GpuMatrix::getInverse() {
MatrixPtr matInv;
inverse(matInv, true);
return matInv;
}
void GpuMatrix::inverse(MatrixPtr matInv, bool memAlloc) {
CHECK_EQ(height_, width_);
if (memAlloc) {
matInv = std::make_shared<GpuMatrix>(height_, width_);
} else {
CHECK(matInv != NULL);
}
real* data = getData();
real* dataInv = matInv->getData();
int lda = getStride();
int ldc = matInv->getStride();
hl_matrix_inverse(data, dataInv, height_, lda, ldc);
}
void GpuMatrix::addBias(Matrix& b, real scale) {
CHECK(b.getHeight() == 1) << "the Bias should be a vector";
BaseMatrix::addBias(b, scale);
}
void GpuMatrix::addSharedBias(Matrix& b, real scale) {
CHECK(b.getHeight() == 1) << "the Bias should be a vector";
CHECK_LE(b.getWidth(), getWidth());
CHECK_EQ(getWidth() % b.getWidth(), 0UL);
hl_matrix_add_shared_bias(getData(), b.getData(), b.getWidth(),
getHeight(), getWidth(), scale);
}
void GpuMatrix::collectBias(Matrix& a, real scale) {
CHECK_EQ(getHeight(), (size_t)1);
CHECK_EQ(width_, a.getWidth());
......@@ -355,6 +388,14 @@ void GpuMatrix::collectBias(Matrix& a, real scale) {
}
}
void GpuMatrix::collectSharedBias(Matrix& a, real scale) {
CHECK_EQ(getHeight(), (size_t)1);
CHECK_EQ(a.getWidth() % getWidth(), 0UL);
hl_matrix_collect_shared_bias(getData(), a.getData(), getWidth(),
a.getHeight(), a.getWidth(), scale);
}
void GpuMatrix::sequenceAvgForward(Matrix& a,
const IVector& startsPos,
int mode) {
......@@ -1483,6 +1524,47 @@ void CpuMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
}
}
MatrixPtr CpuMatrix::getInverse() {
MatrixPtr matInv;
inverse(matInv, true);
return matInv;
}
void CpuMatrix::inverse(MatrixPtr matInv, bool memAlloc) {
CHECK_EQ(height_, width_);
if (memAlloc) {
matInv = std::make_shared<CpuMatrix>(height_, width_);
} else {
CHECK(matInv != NULL);
}
CHECK_EQ(height_, matInv->getHeight());
CHECK_EQ(width_, matInv->getWidth());
matInv->copyFrom(*this);
real* data = getData();
real* dataInv = matInv->getData();
int ldc = matInv->getStride();
if (height_ == 1) {
CHECK_NE(*data, 0);
*dataInv = 1.0 / (*data);
return;
}
/* Compute the LU decomposition of the matrix */
std::vector<int> ipiv(height_);
CBLAS_ORDER order = (matInv->isTransposed() ? CblasColMajor : CblasRowMajor);
int info = getrf<real>(order, height_, height_, dataInv, ldc, ipiv.data());
CHECK_EQ(info, 0);
/* Compute the inverse of the matrix given its LU decomposition */
info = getri<real>(order, height_, dataInv, ldc, ipiv.data());
CHECK_EQ(info, 0);
}
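For readers unfamiliar with the getrf/getri pair, here is a minimal standalone sketch of the same LU-factorize-then-invert pattern written directly against LAPACKE (it assumes the non-ATLAS code path of MathFunctions above, where LAPACKE_dgetrf/LAPACKE_dgetri are available); it is an illustration, not part of the Paddle sources.

#include <lapacke.h>
#include <vector>
#include <cstdio>

int main() {
  const int n = 2;
  // Row-major 2x2 matrix [[4, 7], [2, 6]], determinant 10.
  std::vector<double> a = {4.0, 7.0,
                           2.0, 6.0};
  std::vector<lapack_int> ipiv(n);
  // LU factorization in place, then inversion from the LU factors.
  lapack_int info = LAPACKE_dgetrf(LAPACK_ROW_MAJOR, n, n, a.data(), n, ipiv.data());
  if (info == 0) {
    info = LAPACKE_dgetri(LAPACK_ROW_MAJOR, n, a.data(), n, ipiv.data());
  }
  if (info != 0) {
    std::fprintf(stderr, "inversion failed, info = %d\n", (int)info);
    return 1;
  }
  // Expected inverse: [[0.6, -0.7], [-0.2, 0.4]].
  std::printf("%g %g\n%g %g\n", a[0], a[1], a[2], a[3]);
  return 0;
}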
void CpuMatrix::convExpand(Matrix& feature, int feaImgHeight, int feaImgWidth,
int channels, int blockH, int blockW, int strideH,
int strideW, int paddingH, int paddingW,
......@@ -2046,6 +2128,24 @@ void CpuMatrix::addBias(Matrix& b, real scale) {
}
}
void CpuMatrix::addSharedBias(Matrix& b, real scale) {
CHECK_EQ(b.getHeight(), (size_t)1);
real* aData = getData();
real* bData = b.getData();
size_t numSamples = getHeight();
size_t channel = b.getWidth();
CHECK_EQ(getWidth() % channel, 0UL);
size_t dim = getWidth() / channel;
for (size_t i = 0; i < numSamples; i++) {
for (size_t c = 0; c < channel; c++) {
for (size_t j = 0; j < dim; j++) {
aData[i * getStride() + c * dim + j] += scale * bData[c];
}
}
}
}
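The loop above relies on a fixed memory layout: each output row holds `channel` contiguous blocks of `dim` values, and one bias entry is shared by the whole block of its channel. Below is a tiny standalone illustration of that indexing, using plain arrays with assumed sizes rather than Paddle matrices.

#include <cstdio>

int main() {
  const int numSamples = 1, channel = 2, dim = 3;
  double out[numSamples][channel * dim] = {{0, 0, 0, 0, 0, 0}};
  double bias[channel] = {0.5, -1.0};
  const double scale = 1.0;
  // Same indexing as addSharedBias: bias[c] is added to every element of
  // channel c's block of `dim` columns.
  for (int i = 0; i < numSamples; ++i)
    for (int c = 0; c < channel; ++c)
      for (int j = 0; j < dim; ++j)
        out[i][c * dim + j] += scale * bias[c];
  // Prints: 0.5 0.5 0.5 -1 -1 -1
  for (int k = 0; k < channel * dim; ++k) std::printf("%g ", out[0][k]);
  std::printf("\n");
  return 0;
}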
void CpuMatrix::collectBias(Matrix& a, real scale) {
CHECK_EQ(getHeight(), (size_t)1);
CHECK_EQ(width_, a.getWidth());
......@@ -2063,6 +2163,23 @@ void CpuMatrix::collectBias(Matrix& a, real scale) {
}
}
void CpuMatrix::collectSharedBias(Matrix& a, real scale) {
CHECK_EQ(getHeight(), (size_t)1);
real* B = getData();
real* A = a.getData();
size_t numSamples = a.getHeight();
size_t channel = getWidth();
CHECK_EQ(a.getWidth() % channel, 0UL);
size_t dim = a.getWidth() / channel;
for (size_t i = 0; i < numSamples; i++) {
for (size_t c = 0; c < channel; c++) {
for (size_t j = 0; j < dim; j++) {
B[c] += scale * A[i * channel * dim + c * dim + j];
}
}
}
}
void CpuMatrix::sequenceAvgForward(Matrix& a,
const IVector& startsPos,
int mode) {
......
......@@ -264,6 +264,15 @@ void ParameterServer2::setParameter(const SendParameterRequest& request,
std::vector<int64_t> blockIds;
blockIds.reserve(request.blocks_size());
int bufferIndex = 0;
if (!request.blocks().size()) {
LOG(WARNING)
<< "--ports_num or --ports_num_for_sparse might be too large, "
<< "or total dense parameter size or sparse parameters size "
<< "might be too small, this psever doesn't store any parameter.";
return;
}
for (const auto& block : request.blocks()) {
/// block size for parameter(e.g. 128 for sparse row, 1K for dense)
uint64_t blockSize = getParameterConfig(block).parameter_block_size();
......