diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index a6e45028ebc3f53ea20806f0dd2a7acc820607fe..3402223b044b8950e7772f4d87cc64e5772f8dcd 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -2,12 +2,12 @@ sha: c25201a00e6b0514370501050cf2a8538ac12270 hooks: - id: remove-crlf - files: (?!.*third_party)^.*$ + files: (?!.*third_party)^.*$ | (?!.*book)^.*$ - repo: https://github.com/reyoung/mirrors-yapf.git sha: v0.13.2 hooks: - id: yapf - files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$ # Bazel BUILD files follow Python syntax. + files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$ - repo: https://github.com/pre-commit/pre-commit-hooks sha: 7539d8bd1a00a3c1bfd34cdb606d3a6372e83469 hooks: @@ -15,7 +15,7 @@ - id: check-merge-conflict - id: check-symlinks - id: detect-private-key - files: (?!.*third_party)^.*$ + files: (?!.*third_party)^.*$ | (?!.*book)^.*$ - id: end-of-file-fixer - repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29 diff --git a/.travis.yml b/.travis.yml index 5d82d9729b75ef493a0bd03921c453f9a519c8cd..5a7f45a748ac7e81f3f90c245bcf2cd84c4e9027 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,17 +8,10 @@ sudo: required dist: trusty os: - linux - - osx env: - JOB=DOCS - JOB=BUILD_AND_TEST - JOB=PRE_COMMIT -matrix: - exclude: - - os: osx - env: JOB=DOCS # Only generate documentation in linux. - - os: osx - env: JOB=PRE_COMMIT # Only check pre-commit hook in linux addons: apt: @@ -52,11 +45,10 @@ before_install: fi fi fi - - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python # protobuf version. 
- - pip install numpy wheel 'protobuf==3.1' sphinx recommonmark sphinx_rtd_theme virtualenv pre-commit requests==2.9.2 LinkChecker + - pip install numpy wheel 'protobuf==3.1' sphinx recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker script: - paddle/scripts/travis/main.sh notifications: diff --git a/paddle/scripts/docker/Dockerfile.gpu b/Dockerfile similarity index 58% rename from paddle/scripts/docker/Dockerfile.gpu rename to Dockerfile index da20b2635e10e702d7c121a4f95e6a52a68487b0..536adb0716447aa8b8c10beef8b974ae3f016f05 100644 --- a/paddle/scripts/docker/Dockerfile.gpu +++ b/Dockerfile @@ -1,42 +1,58 @@ +# An image for building paddle binaries +# Use cuda devel base image for both cpu and gpu environment FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 MAINTAINER PaddlePaddle Authors ARG DEBIAN_FRONTEND=noninteractive ARG UBUNTU_MIRROR -RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' +RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' + +# ENV variables +ARG BUILD_WOBOQ +ARG BUILD_AND_INSTALL +ARG WITH_GPU +ARG WITH_AVX +ARG WITH_DOC +ARG WITH_STYLE_CHECK + +ENV BUILD_WOBOQ=${BUILD_WOBOQ:-OFF} +ENV BUILD_AND_INSTALL=${BUILD_AND_INSTALL:-OFF} +ENV WITH_GPU=${WITH_GPU:-OFF} +ENV WITH_AVX=${WITH_AVX:-ON} +ENV WITH_DOC=${WITH_DOC:-OFF} +ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} + +ENV HOME /root +# Add bash enhancements +COPY ./paddle/scripts/docker/root/ /root/ RUN apt-get update && \ apt-get install -y git python-pip python-dev openssh-server bison && \ apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \ apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \ apt-get install -y python-numpy python-matplotlib gcc g++ gfortran && \ - apt-get install -y automake clang-3.8 llvm-3.8 libclang-3.8-dev && \ + apt-get install -y automake locales clang-format-3.8 && \ apt-get clean -y +# git credential to skip password typing +RUN git config --global credential.helper store + +# Fix locales to en_US.UTF-8 +RUN localedef -i en_US -f UTF-8 en_US.UTF-8 + RUN pip install --upgrade pip && \ - pip install -U protobuf && \ + pip install -U 'protobuf==3.1.0' && \ pip install -U wheel pillow BeautifulSoup && \ pip install -U docopt PyYAML sphinx && \ - pip install -U sphinx_rtd_theme recommonmark jupyter + pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \ + pip install -U pre-commit 'requests==2.9.2' jupyter RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \ - cd cmake-3.4.1 && ./bootstrap && make -j4 && make install && \ + cd cmake-3.4.1 && ./bootstrap && make -j `nproc` && make install && \ cd .. && rm -rf cmake-3.4.1 -ARG BUILD_AND_INSTALL -ARG WITH_AVX -ARG WITH_DOC -ARG WITH_STYLE_CHECK +RUN apt-get install -y swig -ENV BUILD_AND_INSTALL=${BUILD_AND_INSTALL:-OFF} -ENV WITH_GPU=ON -ENV WITH_AVX=${WITH_AVX:-ON} -ENV WITH_DOC=${WITH_DOC:-ON} -ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} - -RUN mkdir /paddle -COPY . /paddle/ -RUN /paddle/paddle/scripts/docker/build.sh VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"] # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service @@ -46,12 +62,5 @@ RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config EXPOSE 22 -# Jupyter Notebook directory.
-RUN mkdir /notes/ -WORKDIR "/notes" -EXPOSE 8888 - -RUN mkdir -p /opt/bin -COPY ./paddle/scripts/docker/entrypoint /opt/bin/ - -CMD ["/opt/bin/entrypoint"] +# the development image does the build work by default +CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"] diff --git a/authors b/authors index ab4d3118ff1f7e94677c89073c4ea05bf991165e..daac4ec5d8173cba95df9f9b3c69c02b5256f5b2 100644 --- a/authors +++ b/authors @@ -29,13 +29,16 @@ Luo, Tao Lyu, Qin Mao, Hongyue Qian, Xiaojun +Qiao, Longfei Qi, Jun Qin, Duohao Shen, Guolong Shi, Guangchuan Song, Xiang +Wang, Helin Wang, Jiang Wang, Yanfei +Wang, Yi Wang, Yong Weng, Renliang Xu, Tianbing diff --git a/cmake/FindSphinx.cmake b/cmake/FindSphinx.cmake index d319442ef10b38b9edf5844e5540a92c7094c7ce..1c29cb22a31f1e41a6b5575837c6374175cfdea5 100644 --- a/cmake/FindSphinx.cmake +++ b/cmake/FindSphinx.cmake @@ -72,7 +72,7 @@ function( Sphinx_add_target target_name builder conf cache source destination ) ${source} ${destination} COMMENT "Generating sphinx documentation: ${builder}" - COMMAND ln -sf ${destination}/index_*.html ${destination}/index.html + COMMAND cd ${destination} && ln -s ./index_*.html index.html ) set_property( diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 26306f9849100d4463dde267acae5392cc81d7ac..235c95f017f2b6ef24195a0210ccafff36b6ed61 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -16,7 +16,8 @@ set(CBLAS_FOUND OFF) ## Find MKL First. -set(MKL_ROOT $ENV{MKLROOT} CACHE PATH "Folder contains MKL") +set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs") +set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL") find_path(MKL_INCLUDE_DIR mkl.h PATHS ${MKL_ROOT}/include) diff --git a/cmake/ccache.cmake b/cmake/ccache.cmake index 968d41801d73c4082d2673efe415c1cdd0305b5e..900f59d4cb83bc9ce1893b2d3bd95f5a08b164bb 100644 --- a/cmake/ccache.cmake +++ b/cmake/ccache.cmake @@ -1,9 +1,9 @@ # Use ccache if the ccache program is found -find_program(CCACHE_FOUND ccache) +find_program(CCACHE_PATH ccache) -if(CCACHE_FOUND) +if(CCACHE_PATH) message(STATUS "Ccache is found, use ccache to speed up compile.") - set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ccache) - set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ccache) -endif(CCACHE_FOUND) \ No newline at end of file + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE ${CCACHE_PATH}) + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK ${CCACHE_PATH}) +endif(CCACHE_PATH) diff --git a/cmake/coverallsGcovJsons.cmake b/cmake/coverallsGcovJsons.cmake index ae3530c3a0eeb79ddbcbf4f2e99be75aa7968a2f..ad9a10cb8616159b9e3aff445e698cb2edb92820 100644 --- a/cmake/coverallsGcovJsons.cmake +++ b/cmake/coverallsGcovJsons.cmake @@ -110,14 +110,13 @@ endmacro() # Get the coverage data. file(GLOB_RECURSE GCDA_FILES "${COV_PATH}" "*.gcda") -message("GCDA files:") +message("Process GCDA files:") +message("===============================") # Get a list of all the object directories needed by gcov # (The directories the .gcda files and .o files are found in) # and run gcov on those.
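# (gcov is invoked from within each object directory because it writes its .gcov output files into the current working directory)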
foreach(GCDA ${GCDA_FILES}) - message("Process: ${GCDA}") - message("------------------------------------------------------------------------------") get_filename_component(GCDA_DIR ${GCDA} PATH) # @@ -135,7 +134,7 @@ foreach(GCDA ${GCDA_FILES}) # If -p is not specified then the file is named only "the_file.c.gcov" # execute_process( - COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA} + COMMAND "${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA} >/dev/null" WORKING_DIRECTORY ${GCDA_DIR} ) endforeach() @@ -383,7 +382,6 @@ foreach(NOT_COVERED_SRC ${COVERAGE_SRCS_REMAINING}) set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]") # Generate the final JSON for this file. - message("Generate JSON for non-gcov file: ${NOT_COVERED_SRC}...") string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON) set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ") endforeach() diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 84f459033f06f89d3b150317793c7e62274468b2..446a7532c55bd3ca66662efe70db93551580b8cc 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -14,46 +14,58 @@ INCLUDE(ExternalProject) -SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/protobuf) -SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/protobuf) -SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" CACHE PATH "protobuf include directory." FORCE) +FIND_PACKAGE(Protobuf 3.1) -INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR}) +IF(PROTOBUF_FOUND) + EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION) + STRING(REGEX MATCH "[0-9]+.[0-9]+" PROTOBUF_VERSION "${PROTOBUF_VERSION}") + IF (${PROTOBUF_VERSION} VERSION_LESS "3.1.0") + SET(PROTOBUF_FOUND OFF) + ENDIF() +ENDIF(PROTOBUF_FOUND) + +IF(NOT PROTOBUF_FOUND) + SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/protobuf) + SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/protobuf) + SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" CACHE PATH "protobuf include directory." FORCE) + + IF(WIN32) + SET(PROTOBUF_LITE_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.lib" CACHE FILEPATH "protobuf lite library." FORCE) + SET(PROTOBUF_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.lib" CACHE FILEPATH "protobuf library." FORCE) + SET(PROTOBUF_PROTOC_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.lib" CACHE FILEPATH "protoc library." FORCE) + SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc.exe" CACHE FILEPATH "protobuf executable." FORCE) + ELSE(WIN32) + SET(PROTOBUF_LITE_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE) + SET(PROTOBUF_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE) + SET(PROTOBUF_PROTOC_LIBRARY + "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.a" CACHE FILEPATH "protoc library." FORCE) + SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc" CACHE FILEPATH "protobuf executable." FORCE) + ENDIF(WIN32) -IF(WIN32) - SET(PROTOBUF_LITE_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.lib" CACHE FILEPATH "protobuf lite library." FORCE) - SET(PROTOBUF_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.lib" CACHE FILEPATH "protobuf library." FORCE) - SET(PROTOBUF_PROTOC_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.lib" CACHE FILEPATH "protoc library." FORCE) - SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc.exe" CACHE FILEPATH "protobuf executable." 
FORCE) -ELSE(WIN32) - SET(PROTOBUF_LITE_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE) - SET(PROTOBUF_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE) - SET(PROTOBUF_PROTOC_LIBRARY - "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.a" CACHE FILEPATH "protoc library." FORCE) - SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc" CACHE FILEPATH "protobuf executable." FORCE) -ENDIF(WIN32) - -ExternalProject_Add( - protobuf - ${EXTERNAL_PROJECT_LOG_ARGS} - PREFIX ${PROTOBUF_SOURCES_DIR} - UPDATE_COMMAND "" - DEPENDS zlib - GIT_REPOSITORY "https://github.com/google/protobuf.git" - GIT_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546" - CONFIGURE_COMMAND - ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake - -Dprotobuf_BUILD_TESTS=OFF - -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT} - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=Release - -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=lib -) - -LIST(APPEND external_project_dependencies protobuf) + ExternalProject_Add( + protobuf + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${PROTOBUF_SOURCES_DIR} + UPDATE_COMMAND "" + DEPENDS zlib + GIT_REPOSITORY "https://github.com/google/protobuf.git" + GIT_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546" + CONFIGURE_COMMAND + ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake + -Dprotobuf_BUILD_TESTS=OFF + -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT} + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=Release + -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=lib + ) + + LIST(APPEND external_project_dependencies protobuf) +ENDIF(NOT PROTOBUF_FOUND) + +INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR}) diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index 6372a9a768e580f74f837ccb6c57d4f4395eb779..0accf1a8dd83560324716f0f4936be56dd7a9f1b 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -221,7 +221,3 @@ ENDIF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND) INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) - -MESSAGE("[Paddle] Python Executable: ${PYTHON_EXECUTABLE}") -MESSAGE("[Paddle] Python Include: ${PYTHON_INCLUDE_DIRS}") -MESSAGE("[Paddle] Python Libraries: ${PYTHON_LIBRARIES}") diff --git a/cmake/util.cmake b/cmake/util.cmake index 24ad5c815ca20d9b6b317b1be4d2dc93a9e06fba..3640e4651fdd8b491f63875a7ea886afcadf978a 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -71,21 +71,10 @@ function(link_paddle_exe TARGET_NAME) generate_rdma_links() endif() - if(WITH_METRIC) - if(WITH_GPU) - set(METRIC_LIBS paddle_metric_learning paddle_dserver_lib metric metric_cpu) - else() - set(METRIC_LIBS paddle_metric_learning paddle_dserver_lib metric_cpu) - endif() - else() - set(METRIC_LIBS "") - endif() - target_circle_link_libraries(${TARGET_NAME} ARCHIVE_START paddle_gserver paddle_function - ${METRIC_LIBS} ARCHIVE_END paddle_pserver paddle_trainer_lib @@ -95,7 +84,6 @@ function(link_paddle_exe TARGET_NAME) paddle_parameter paddle_proto paddle_cuda - ${METRIC_LIBS} ${EXTERNAL_LIBS} ${CMAKE_THREAD_LIBS_INIT} ${CMAKE_DL_LIBS} diff --git a/demo/image_classification/api_v2_resnet.py b/demo/image_classification/api_v2_resnet.py new file mode 100644 index 0000000000000000000000000000000000000000..19d20540780becf504973a23b50445d4b65dc2ef --- /dev/null +++ b/demo/image_classification/api_v2_resnet.py @@ -0,0 +1,74 @@ +# Copyright (c) 2016 PaddlePaddle Authors. 
All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.v2 as paddle + +__all__ = ['resnet_cifar10'] + + +def conv_bn_layer(input, + ch_out, + filter_size, + stride, + padding, + active_type=paddle.activation.Relu(), + ch_in=None): + tmp = paddle.layer.img_conv( + input=input, + filter_size=filter_size, + num_channels=ch_in, + num_filters=ch_out, + stride=stride, + padding=padding, + act=paddle.activation.Linear(), + bias_attr=False) + return paddle.layer.batch_norm(input=tmp, act=active_type) + + +def shortcut(ipt, n_in, n_out, stride): + if n_in != n_out: + return conv_bn_layer(ipt, n_out, 1, stride, 0, + paddle.activation.Linear()) + else: + return ipt + + +def basicblock(ipt, ch_out, stride): + ch_in = ch_out * 2 + tmp = conv_bn_layer(ipt, ch_out, 3, stride, 1) + tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, paddle.activation.Linear()) + short = shortcut(ipt, ch_in, ch_out, stride) + return paddle.layer.addto(input=[tmp, short], act=paddle.activation.Relu()) + + +def layer_warp(block_func, ipt, features, count, stride): + tmp = block_func(ipt, features, stride) + for i in range(1, count): + tmp = block_func(tmp, features, 1) + return tmp + + +def resnet_cifar10(ipt, depth=32): + # depth should be one of 20, 32, 44, 56, 110, 1202 + assert (depth - 2) % 6 == 0 + n = (depth - 2) / 6 + nStages = {16, 64, 128} + conv1 = conv_bn_layer( + ipt, ch_in=3, ch_out=16, filter_size=3, stride=1, padding=1) + res1 = layer_warp(basicblock, conv1, 16, n, 1) + res2 = layer_warp(basicblock, res1, 32, n, 2) + res3 = layer_warp(basicblock, res2, 64, n, 2) + pool = paddle.layer.img_pool( + input=res3, pool_size=8, stride=1, pool_type=paddle.pooling.Avg()) + return pool diff --git a/demo/image_classification/api_v2_train.py b/demo/image_classification/api_v2_train.py new file mode 100644 index 0000000000000000000000000000000000000000..53cffa6fb4e8b2e19725f4f44bf7b9ffffb25232 --- /dev/null +++ b/demo/image_classification/api_v2_train.py @@ -0,0 +1,92 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License + +import sys + +import paddle.v2 as paddle + +from api_v2_vgg import vgg_bn_drop + + +def main(): + datadim = 3 * 32 * 32 + classdim = 10 + + # PaddlePaddle init + paddle.init(use_gpu=False, trainer_count=1) + + image = paddle.layer.data( + name="image", type=paddle.data_type.dense_vector(datadim)) + + # Add neural network config + # option 1. resnet + # net = resnet_cifar10(image, depth=32) + # option 2. 
vgg + net = vgg_bn_drop(image) + + out = paddle.layer.fc(input=net, + size=classdim, + act=paddle.activation.Softmax()) + + lbl = paddle.layer.data( + name="label", type=paddle.data_type.integer_value(classdim)) + cost = paddle.layer.classification_cost(input=out, label=lbl) + + # Create parameters + parameters = paddle.parameters.create(cost) + + # Create optimizer + momentum_optimizer = paddle.optimizer.Momentum( + momentum=0.9, + regularization=paddle.optimizer.L2Regularization(rate=0.0002 * 128), + learning_rate=0.1 / 128.0, + learning_rate_decay_a=0.1, + learning_rate_decay_b=50000 * 100, + learning_rate_schedule='discexp', + batch_size=128) + + # End batch and end pass event handler + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "\nPass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + else: + sys.stdout.write('.') + sys.stdout.flush() + if isinstance(event, paddle.event.EndPass): + result = trainer.test( + reader=paddle.batch( + paddle.dataset.cifar.test10(), batch_size=128), + feeding={'image': 0, + 'label': 1}) + print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) + + # Create trainer + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=momentum_optimizer) + trainer.train( + reader=paddle.batch( + paddle.reader.shuffle( + paddle.dataset.cifar.train10(), buf_size=50000), + batch_size=128), + num_passes=5, + event_handler=event_handler, + feeding={'image': 0, + 'label': 1}) + + +if __name__ == '__main__': + main() diff --git a/demo/image_classification/api_v2_vgg.py b/demo/image_classification/api_v2_vgg.py new file mode 100644 index 0000000000000000000000000000000000000000..1e0e6b93adde30425f17aa9cd07542275f4fec37 --- /dev/null +++ b/demo/image_classification/api_v2_vgg.py @@ -0,0 +1,47 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import paddle.v2 as paddle + +__all__ = ['vgg_bn_drop'] + + +def vgg_bn_drop(input): + def conv_block(ipt, num_filter, groups, dropouts, num_channels=None): + return paddle.networks.img_conv_group( + input=ipt, + num_channels=num_channels, + pool_size=2, + pool_stride=2, + conv_num_filter=[num_filter] * groups, + conv_filter_size=3, + conv_act=paddle.activation.Relu(), + conv_with_batchnorm=True, + conv_batchnorm_drop_rate=dropouts, + pool_type=paddle.pooling.Max()) + + conv1 = conv_block(input, 64, 2, [0.3, 0], 3) + conv2 = conv_block(conv1, 128, 2, [0.4, 0]) + conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0]) + conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) + conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) + + drop = paddle.layer.dropout(input=conv5, dropout_rate=0.5) + fc1 = paddle.layer.fc(input=drop, size=512, act=paddle.activation.Linear()) + bn = paddle.layer.batch_norm( + input=fc1, + act=paddle.activation.Relu(), + layer_attr=paddle.attr.Extra(drop_rate=0.5)) + fc2 = paddle.layer.fc(input=bn, size=512, act=paddle.activation.Linear()) + return fc2 diff --git a/demo/image_classification/prediction.py b/demo/image_classification/prediction.py index 9a86aafcb2fa4d4354d1dd9443c1b73ddcda980b..49c0ff600c40e0222093ff0a8a2f7e8e38ccba29 100755 --- a/demo/image_classification/prediction.py +++ b/demo/image_classification/prediction.py @@ -126,7 +126,7 @@ class ImageClassifier(): # For oversampling, average predictions across crops. # If not, the shape of output[name]: (1, class_number), # the mean is also applicable. - return output[output_layer].mean(0) + return output[output_layer]['value'].mean(0) def predict(self, image=None, output_layer=None): assert isinstance(image, basestring) diff --git a/demo/introduction/api_train_v2.py b/demo/introduction/api_train_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..1ba971b3688ce3dec078998df2c0b183a4e449f8 --- /dev/null +++ b/demo/introduction/api_train_v2.py @@ -0,0 +1,58 @@ +import paddle.v2 as paddle +import paddle.v2.dataset.uci_housing as uci_housing + + +def main(): + # init + paddle.init(use_gpu=False, trainer_count=1) + + # network config + x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13)) + y_predict = paddle.layer.fc(input=x, + param_attr=paddle.attr.Param(name='w'), + size=1, + act=paddle.activation.Linear(), + bias_attr=paddle.attr.Param(name='b')) + y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1)) + cost = paddle.layer.mse_cost(input=y_predict, label=y) + + # create parameters + parameters = paddle.parameters.create(cost) + + # create optimizer + optimizer = paddle.optimizer.Momentum(momentum=0) + + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=optimizer) + + # event_handler to print training and testing info + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "Pass %d, Batch %d, Cost %f" % ( + event.pass_id, event.batch_id, event.cost) + + if isinstance(event, paddle.event.EndPass): + if (event.pass_id + 1) % 10 == 0: + result = trainer.test( + reader=paddle.batch( + uci_housing.test(), batch_size=2), + feeding={'x': 0, + 'y': 1}) + print "Test %d, %.2f" % (event.pass_id, result.cost) + + # training + trainer.train( + reader=paddle.batch( + paddle.reader.shuffle( + uci_housing.train(), buf_size=500), + batch_size=2), + feeding={'x': 0, + 'y': 1}, + event_handler=event_handler, + num_passes=30) + + +if __name__ == '__main__': + main() diff --git 
a/demo/introduction/trainer_config.py b/demo/introduction/trainer_config.py index ecafe955f9e5c1062168d5d7b6b4c639d6e72a99..651dfaa4b7b4873810a0b393655541a62d1a311b 100644 --- a/demo/introduction/trainer_config.py +++ b/demo/introduction/trainer_config.py @@ -34,5 +34,5 @@ y_predict = fc_layer( size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b')) -cost = regression_cost(input=y_predict, label=y) +cost = mse_cost(input=y_predict, label=y) outputs(cost) diff --git a/demo/mnist/.gitignore b/demo/mnist/.gitignore index 8bd9837523ccf98e6e72d5b82934b7b104816217..7e61d5e3a0cabd46d4185454d46610ac2ee2e63f 100644 --- a/demo/mnist/.gitignore +++ b/demo/mnist/.gitignore @@ -5,3 +5,6 @@ plot.png train.log *pyc .ipynb_checkpoints +params.pkl +params.tar +params.tar.gz diff --git a/demo/mnist/api_train.py b/demo/mnist/api_train.py index 8573d8143a085b8d2e0bcf7df17b1abe177029df..ea1caa7dd9653a2cc2860ace736fe3d25a3767e0 100644 --- a/demo/mnist/api_train.py +++ b/demo/mnist/api_train.py @@ -6,25 +6,15 @@ passed to C++ side of Paddle. The user api could be simpler and carefully designed. """ -import py_paddle.swig_paddle as api -from py_paddle import DataProviderConverter -import paddle.trainer.PyDataProvider2 as dp -import numpy as np import random -from mnist_util import read_from_mnist -from paddle.trainer_config_helpers import * -import paddle.v2 +import numpy as np +import paddle.v2 as paddle_v2 +import py_paddle.swig_paddle as api +from paddle.trainer_config_helpers import * +from py_paddle import DataProviderConverter -def network_config(): - imgs = data_layer(name='pixel', size=784) - hidden1 = fc_layer(input=imgs, size=200) - hidden2 = fc_layer(input=hidden1, size=200) - inference = fc_layer(input=hidden2, size=10, act=SoftmaxActivation()) - cost = classification_cost( - input=inference, label=data_layer( - name='label', size=10)) - outputs(cost) +from mnist_util import read_from_mnist def init_parameter(network): @@ -67,7 +57,7 @@ def input_order_converter(generator): def main(): api.initPaddle("-use_gpu=false", "-trainer_count=4") # use 4 cpu cores - optimizer = paddle.v2.optimizer.Adam( + optimizer = paddle_v2.optimizer.Adam( learning_rate=1e-4, batch_size=1000, model_average=ModelAverage(average_window=0.5), @@ -79,8 +69,20 @@ def main(): updater = optimizer.create_local_updater() assert isinstance(updater, api.ParameterUpdater) + # define network + images = paddle_v2.layer.data( + name='pixel', type=paddle_v2.data_type.dense_vector(784)) + label = paddle_v2.layer.data( + name='label', type=paddle_v2.data_type.integer_value(10)) + hidden1 = paddle_v2.layer.fc(input=images, size=200) + hidden2 = paddle_v2.layer.fc(input=hidden1, size=200) + inference = paddle_v2.layer.fc(input=hidden2, + size=10, + act=paddle_v2.activation.Softmax()) + cost = paddle_v2.layer.classification_cost(input=inference, label=label) + # Create Simple Gradient Machine. - model_config = parse_network_config(network_config) + model_config = paddle_v2.layer.parse_network(cost) m = api.GradientMachine.createFromConfigProto(model_config, api.CREATE_MODE_NORMAL, optimizer.enable_types()) @@ -97,8 +99,7 @@ def main(): # DataProvider Converter is a utility convert Python Object to Paddle C++ # Input. The input format is as same as Paddle's DataProvider. 
- converter = DataProviderConverter( - input_types=[dp.dense_vector(784), dp.integer_value(10)]) + converter = DataProviderConverter(input_types=[images.type, label.type]) train_file = './data/raw_data/train' test_file = './data/raw_data/t10k' diff --git a/demo/mnist/api_train_v2.py b/demo/mnist/api_train_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..6b95a88042a13a280bcb80f753b3887fcef37296 --- /dev/null +++ b/demo/mnist/api_train_v2.py @@ -0,0 +1,137 @@ +import paddle.v2 as paddle +import gzip + + +def softmax_regression(img): + predict = paddle.layer.fc(input=img, + size=10, + act=paddle.activation.Softmax()) + return predict + + +def multilayer_perceptron(img): + # The first fully-connected layer + hidden1 = paddle.layer.fc(input=img, size=128, act=paddle.activation.Relu()) + # The second fully-connected layer and the corresponding activation function + hidden2 = paddle.layer.fc(input=hidden1, + size=64, + act=paddle.activation.Relu()) + # The third fully-connected layer, note that the hidden size should be 10, + # which is the number of unique digits + predict = paddle.layer.fc(input=hidden2, + size=10, + act=paddle.activation.Softmax()) + return predict + + +def convolutional_neural_network(img): + # first conv layer + conv_pool_1 = paddle.networks.simple_img_conv_pool( + input=img, + filter_size=5, + num_filters=20, + num_channel=1, + pool_size=2, + pool_stride=2, + act=paddle.activation.Tanh()) + # second conv layer + conv_pool_2 = paddle.networks.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=50, + num_channel=20, + pool_size=2, + pool_stride=2, + act=paddle.activation.Tanh()) + # The first fully-connected layer + fc1 = paddle.layer.fc(input=conv_pool_2, + size=128, + act=paddle.activation.Tanh()) + # The softmax layer, note that the hidden size should be 10, + # which is the number of unique digits + predict = paddle.layer.fc(input=fc1, + size=10, + act=paddle.activation.Softmax()) + return predict + + +def main(): + paddle.init(use_gpu=False, trainer_count=1) + + # define network topology + images = paddle.layer.data( + name='pixel', type=paddle.data_type.dense_vector(784)) + label = paddle.layer.data( + name='label', type=paddle.data_type.integer_value(10)) + + # Here we can build the prediction network in different ways. Please + choose one by uncommenting the corresponding line.
+ predict = softmax_regression(images) + #predict = multilayer_perceptron(images) + #predict = convolutional_neural_network(images) + + cost = paddle.layer.classification_cost(input=predict, label=label) + + try: + with gzip.open('params.tar.gz', 'r') as f: + parameters = paddle.parameters.Parameters.from_tar(f) + except IOError: + parameters = paddle.parameters.create(cost) + + optimizer = paddle.optimizer.Momentum( + learning_rate=0.1 / 128.0, + momentum=0.9, + regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128)) + + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=optimizer) + + lists = [] + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 1000 == 0: + print "Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + + with gzip.open('params.tar.gz', 'w') as f: + parameters.to_tar(f) + + elif isinstance(event, paddle.event.EndPass): + result = trainer.test(reader=paddle.batch( + paddle.dataset.mnist.test(), batch_size=128)) + print "Test with Pass %d, Cost %f, %s\n" % ( + event.pass_id, result.cost, result.metrics) + lists.append((event.pass_id, result.cost, + result.metrics['classification_error_evaluator'])) + + trainer.train( + reader=paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=8192), + batch_size=128), + event_handler=event_handler, + num_passes=100) + + # find the best pass + best = sorted(lists, key=lambda list: float(list[1]))[0] + print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1]) + print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100) + + test_creator = paddle.dataset.mnist.test() + test_data = [] + for item in test_creator(): + test_data.append((item[0], )) + if len(test_data) == 100: + break + + # output is a softmax layer. It returns probabilities. + # Shape should be (100, 10) + probs = paddle.infer( + output_layer=predict, parameters=parameters, input=test_data) + print probs.shape + + +if __name__ == '__main__': + main() diff --git a/demo/model_zoo/resnet/classify.py b/demo/model_zoo/resnet/classify.py index 4631816c43ef48839df1863a0a86c3ab00924d3f..6074cc1d3a85e13e3e8d336d81e22104f9d8e7cf 100755 --- a/demo/model_zoo/resnet/classify.py +++ b/demo/model_zoo/resnet/classify.py @@ -156,7 +156,7 @@ class ImageClassifier(): # For oversampling, average predictions across crops. # If not, the shape of output[name]: (1, class_number), # the mean is also applicable. 
- res[name] = output[name].mean(0) + res[name] = output[name]['value'].mean(0) return res diff --git a/demo/recommendation/api_train_v2.py b/demo/recommendation/api_train_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..f6a061799e3ac50236a68beedaf700dd6c698a05 --- /dev/null +++ b/demo/recommendation/api_train_v2.py @@ -0,0 +1,125 @@ +import paddle.v2 as paddle +import cPickle +import copy + + +def main(): + paddle.init(use_gpu=False) + movie_title_dict = paddle.dataset.movielens.get_movie_title_dict() + uid = paddle.layer.data( + name='user_id', + type=paddle.data_type.integer_value( + paddle.dataset.movielens.max_user_id() + 1)) + usr_emb = paddle.layer.embedding(input=uid, size=32) + + usr_gender_id = paddle.layer.data( + name='gender_id', type=paddle.data_type.integer_value(2)) + usr_gender_emb = paddle.layer.embedding(input=usr_gender_id, size=16) + + usr_age_id = paddle.layer.data( + name='age_id', + type=paddle.data_type.integer_value( + len(paddle.dataset.movielens.age_table))) + usr_age_emb = paddle.layer.embedding(input=usr_age_id, size=16) + + usr_job_id = paddle.layer.data( + name='job_id', + type=paddle.data_type.integer_value(paddle.dataset.movielens.max_job_id( + ) + 1)) + + usr_job_emb = paddle.layer.embedding(input=usr_job_id, size=16) + + usr_combined_features = paddle.layer.fc( + input=[usr_emb, usr_gender_emb, usr_age_emb, usr_job_emb], + size=200, + act=paddle.activation.Tanh()) + + mov_id = paddle.layer.data( + name='movie_id', + type=paddle.data_type.integer_value( + paddle.dataset.movielens.max_movie_id() + 1)) + mov_emb = paddle.layer.embedding(input=mov_id, size=32) + + mov_categories = paddle.layer.data( + name='category_id', + type=paddle.data_type.sparse_binary_vector( + len(paddle.dataset.movielens.movie_categories()))) + + mov_categories_hidden = paddle.layer.fc(input=mov_categories, size=32) + + mov_title_id = paddle.layer.data( + name='movie_title', + type=paddle.data_type.integer_value_sequence(len(movie_title_dict))) + mov_title_emb = paddle.layer.embedding(input=mov_title_id, size=32) + mov_title_conv = paddle.networks.sequence_conv_pool( + input=mov_title_emb, hidden_size=32, context_len=3) + + mov_combined_features = paddle.layer.fc( + input=[mov_emb, mov_categories_hidden, mov_title_conv], + size=200, + act=paddle.activation.Tanh()) + + inference = paddle.layer.cos_sim( + a=usr_combined_features, b=mov_combined_features, size=1, scale=5) + cost = paddle.layer.mse_cost( + input=inference, + label=paddle.layer.data( + name='score', type=paddle.data_type.dense_vector(1))) + + parameters = paddle.parameters.create(cost) + + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=paddle.optimizer.Adam( + learning_rate=1e-4)) + feeding = { + 'user_id': 0, + 'gender_id': 1, + 'age_id': 2, + 'job_id': 3, + 'movie_id': 4, + 'category_id': 5, + 'movie_title': 6, + 'score': 7 + } + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "Pass %d Batch %d Cost %.2f" % ( + event.pass_id, event.batch_id, event.cost) + + trainer.train( + reader=paddle.batch( + paddle.reader.shuffle( + paddle.dataset.movielens.train(), buf_size=8192), + batch_size=256), + event_handler=event_handler, + feeding=feeding, + num_passes=1) + + user_id = 234 + movie_id = 345 + + user = paddle.dataset.movielens.user_info()[user_id] + movie = paddle.dataset.movielens.movie_info()[movie_id] + + feature = user.value() + movie.value() + + def reader(): + yield feature + + 
infer_dict = copy.copy(feeding) + del infer_dict['score'] + + prediction = paddle.infer( + output=inference, + parameters=parameters, + reader=paddle.batch( + reader, batch_size=32), + feeding=infer_dict) + print(prediction + 5) / 2 + + +if __name__ == '__main__': + main() diff --git a/demo/recommendation/trainer_config.py b/demo/recommendation/trainer_config.py index aabcd335253faf69c940024ac8098a54da030463..25f529d7d7c430f179107fb189ade34760ab309d 100755 --- a/demo/recommendation/trainer_config.py +++ b/demo/recommendation/trainer_config.py @@ -86,10 +86,7 @@ movie_feature = construct_feature("movie") user_feature = construct_feature("user") similarity = cos_sim(a=movie_feature, b=user_feature) if not is_predict: - outputs( - regression_cost( - input=similarity, label=data_layer( - 'rating', size=1))) + outputs(mse_cost(input=similarity, label=data_layer('rating', size=1))) define_py_data_sources2( 'data/train.list', diff --git a/demo/semantic_role_labeling/api_train_v2.py b/demo/semantic_role_labeling/api_train_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..036cad4b0a32357bb42580ef577a1eba558be8fe --- /dev/null +++ b/demo/semantic_role_labeling/api_train_v2.py @@ -0,0 +1,190 @@ +import sys +import math +import numpy as np +import paddle.v2 as paddle +import paddle.v2.dataset.conll05 as conll05 + + +def db_lstm(): + word_dict, verb_dict, label_dict = conll05.get_dict() + word_dict_len = len(word_dict) + label_dict_len = len(label_dict) + pred_len = len(verb_dict) + + mark_dict_len = 2 + word_dim = 32 + mark_dim = 5 + hidden_dim = 512 + depth = 8 + + #8 features + def d_type(size): + return paddle.data_type.integer_value_sequence(size) + + word = paddle.layer.data(name='word_data', type=d_type(word_dict_len)) + predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len)) + + ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len)) + ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_len)) + ctx_0 = paddle.layer.data(name='ctx_0_data', type=d_type(word_dict_len)) + ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_len)) + ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len)) + mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len)) + + target = paddle.layer.data(name='target', type=d_type(label_dict_len)) + + default_std = 1 / math.sqrt(hidden_dim) / 3.0 + + emb_para = paddle.attr.Param(name='emb', initial_std=0., learning_rate=0.) + std_0 = paddle.attr.Param(initial_std=0.) 
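+    # std_0 zero-initializes its parameters; std_default initializes with standard deviation 1/(3*sqrt(hidden_dim))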
+ std_default = paddle.attr.Param(initial_std=default_std) + + predicate_embedding = paddle.layer.embedding( + size=word_dim, + input=predicate, + param_attr=paddle.attr.Param( + name='vemb', initial_std=default_std)) + mark_embedding = paddle.layer.embedding( + size=mark_dim, input=mark, param_attr=std_0) + + word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] + emb_layers = [ + paddle.layer.embedding( + size=word_dim, input=x, param_attr=emb_para) for x in word_input + ] + emb_layers.append(predicate_embedding) + emb_layers.append(mark_embedding) + + hidden_0 = paddle.layer.mixed( + size=hidden_dim, + bias_attr=std_default, + input=[ + paddle.layer.full_matrix_projection( + input=emb, param_attr=std_default) for emb in emb_layers + ]) + + mix_hidden_lr = 1e-3 + lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0) + hidden_para_attr = paddle.attr.Param( + initial_std=default_std, learning_rate=mix_hidden_lr) + + lstm_0 = paddle.layer.lstmemory( + input=hidden_0, + act=paddle.activation.Relu(), + gate_act=paddle.activation.Sigmoid(), + state_act=paddle.activation.Sigmoid(), + bias_attr=std_0, + param_attr=lstm_para_attr) + + #stack L-LSTM and R-LSTM with direct edges + input_tmp = [hidden_0, lstm_0] + + for i in range(1, depth): + mix_hidden = paddle.layer.mixed( + size=hidden_dim, + bias_attr=std_default, + input=[ + paddle.layer.full_matrix_projection( + input=input_tmp[0], param_attr=hidden_para_attr), + paddle.layer.full_matrix_projection( + input=input_tmp[1], param_attr=lstm_para_attr) + ]) + + lstm = paddle.layer.lstmemory( + input=mix_hidden, + act=paddle.activation.Relu(), + gate_act=paddle.activation.Sigmoid(), + state_act=paddle.activation.Sigmoid(), + reverse=((i % 2) == 1), + bias_attr=std_0, + param_attr=lstm_para_attr) + + input_tmp = [mix_hidden, lstm] + + feature_out = paddle.layer.mixed( + size=label_dict_len, + bias_attr=std_default, + input=[ + paddle.layer.full_matrix_projection( + input=input_tmp[0], param_attr=hidden_para_attr), + paddle.layer.full_matrix_projection( + input=input_tmp[1], param_attr=lstm_para_attr) + ], ) + + crf_cost = paddle.layer.crf(size=label_dict_len, + input=feature_out, + label=target, + param_attr=paddle.attr.Param( + name='crfw', + initial_std=default_std, + learning_rate=mix_hidden_lr)) + + crf_dec = paddle.layer.crf_decoding( + name='crf_dec_l', + size=label_dict_len, + input=feature_out, + label=target, + param_attr=paddle.attr.Param(name='crfw')) + + return crf_cost, crf_dec + + +def load_parameter(file_name, h, w): + with open(file_name, 'rb') as f: + f.read(16) # skip header. 
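+        # the rest of the file holds h * w float32 values, read and reshaped into an (h, w) array below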
+ return np.fromfile(f, dtype=np.float32).reshape(h, w) + + +def main(): + paddle.init(use_gpu=False, trainer_count=1) + + # define network topology + crf_cost, crf_dec = db_lstm() + + # create parameters + parameters = paddle.parameters.create([crf_cost, crf_dec]) + + # create optimizer + optimizer = paddle.optimizer.Momentum( + momentum=0, + learning_rate=2e-2, + regularization=paddle.optimizer.L2Regularization(rate=8e-4), + model_average=paddle.optimizer.ModelAverage( + average_window=0.5, max_average_window=10000), ) + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "Pass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + + trainer = paddle.trainer.SGD(cost=crf_cost, + parameters=parameters, + update_equation=optimizer) + parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32)) + + trn_reader = paddle.batch( + paddle.reader.shuffle( + conll05.test(), buf_size=8192), batch_size=10) + + feeding = { + 'word_data': 0, + 'ctx_n2_data': 1, + 'ctx_n1_data': 2, + 'ctx_0_data': 3, + 'ctx_p1_data': 4, + 'ctx_p2_data': 5, + 'verb_data': 6, + 'mark_data': 7, + 'target': 8 + } + + trainer.train( + reader=trn_reader, + event_handler=event_handler, + num_passes=10000, + feeding=feeding) + + +if __name__ == '__main__': + main() diff --git a/demo/sentiment/dataprovider.py b/demo/sentiment/dataprovider.py index 00f72cecacb454a0dd1184fa2098be4543007de7..4b7f5d0e504aef3884a04cbed8c16503a4079772 100755 --- a/demo/sentiment/dataprovider.py +++ b/demo/sentiment/dataprovider.py @@ -32,4 +32,6 @@ def process(settings, file_name): word_slot = [ settings.word_dict[w] for w in words if w in settings.word_dict ] + if not word_slot: + continue yield word_slot, label diff --git a/demo/sentiment/predict.py b/demo/sentiment/predict.py index 8ec490f64691924013200a3d0038d39aa834b038..64c78e0d6b9297e7a321a4f070517593b0bfe332 100755 --- a/demo/sentiment/predict.py +++ b/demo/sentiment/predict.py @@ -138,7 +138,11 @@ def main(): batch = [] for line in sys.stdin: - batch.append([predict.get_index(line)]) + words = predict.get_index(line) + if words: + batch.append([words]) + else: + print('None of the words in [%s] are in the dictionary.' % line) if len(batch) == batch_size: predict.batch_predict(batch) batch = [] diff --git a/demo/sentiment/train_v2.py b/demo/sentiment/train_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..1c856556bd0cb32f60eba322469b3621c37e1349 --- /dev/null +++ b/demo/sentiment/train_v2.py @@ -0,0 +1,159 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +import sys +import paddle.v2 as paddle + + +def convolution_net(input_dim, class_dim=2, emb_dim=128, hid_dim=128): + data = paddle.layer.data("word", + paddle.data_type.integer_value_sequence(input_dim)) + emb = paddle.layer.embedding(input=data, size=emb_dim) + conv_3 = paddle.networks.sequence_conv_pool( + input=emb, context_len=3, hidden_size=hid_dim) + conv_4 = paddle.networks.sequence_conv_pool( + input=emb, context_len=4, hidden_size=hid_dim) + output = paddle.layer.fc(input=[conv_3, conv_4], + size=class_dim, + act=paddle.activation.Softmax()) + lbl = paddle.layer.data("label", paddle.data_type.integer_value(2)) + cost = paddle.layer.classification_cost(input=output, label=lbl) + return cost + + +def stacked_lstm_net(input_dim, + class_dim=2, + emb_dim=128, + hid_dim=512, + stacked_num=3): + """ + A wrapper for the sentiment classification task. + This network uses a bi-directional recurrent network + consisting of three LSTM layers. This configuration follows + the paper at the url below, but uses fewer layers. + http://www.aclweb.org/anthology/P15-1109 + + input_dim: the word dictionary dimension. + class_dim: number of categories. + emb_dim: dimension of word embedding. + hid_dim: dimension of hidden layer. + stacked_num: number of stacked lstm-hidden layers. + """ + assert stacked_num % 2 == 1 + + layer_attr = paddle.attr.Extra(drop_rate=0.5) + fc_para_attr = paddle.attr.Param(learning_rate=1e-3) + lstm_para_attr = paddle.attr.Param(initial_std=0., learning_rate=1.) + para_attr = [fc_para_attr, lstm_para_attr] + bias_attr = paddle.attr.Param(initial_std=0., l2_rate=0.) + relu = paddle.activation.Relu() + linear = paddle.activation.Linear() + + data = paddle.layer.data("word", + paddle.data_type.integer_value_sequence(input_dim)) + emb = paddle.layer.embedding(input=data, size=emb_dim) + + fc1 = paddle.layer.fc(input=emb, + size=hid_dim, + act=linear, + bias_attr=bias_attr) + lstm1 = paddle.layer.lstmemory( + input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr) + + inputs = [fc1, lstm1] + for i in range(2, stacked_num + 1): + fc = paddle.layer.fc(input=inputs, + size=hid_dim, + act=linear, + param_attr=para_attr, + bias_attr=bias_attr) + lstm = paddle.layer.lstmemory( + input=fc, + reverse=(i % 2) == 0, + act=relu, + bias_attr=bias_attr, + layer_attr=layer_attr) + inputs = [fc, lstm] + + fc_last = paddle.layer.pooling( + input=inputs[0], pooling_type=paddle.pooling.Max()) + lstm_last = paddle.layer.pooling( + input=inputs[1], pooling_type=paddle.pooling.Max()) + output = paddle.layer.fc(input=[fc_last, lstm_last], + size=class_dim, + act=paddle.activation.Softmax(), + bias_attr=bias_attr, + param_attr=para_attr) + + lbl = paddle.layer.data("label", paddle.data_type.integer_value(2)) + cost = paddle.layer.classification_cost(input=output, label=lbl) + return cost + + +if __name__ == '__main__': + # init + paddle.init(use_gpu=False) + + # data + print 'load dictionary...' + word_dict = paddle.dataset.imdb.word_dict() + dict_dim = len(word_dict) + class_dim = 2 + train_reader = paddle.batch( + paddle.reader.shuffle( + lambda: paddle.dataset.imdb.train(word_dict), buf_size=1000), + batch_size=100) + test_reader = paddle.batch( + lambda: paddle.dataset.imdb.test(word_dict), batch_size=100) + + feeding = {'word': 0, 'label': 1} + + # network config + # Please choose the way to build the network + # by uncommenting the corresponding line.
+ cost = convolution_net(dict_dim, class_dim=class_dim) + # cost = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3) + + # create parameters + parameters = paddle.parameters.create(cost) + + # create optimizer + adam_optimizer = paddle.optimizer.Adam( + learning_rate=2e-3, + regularization=paddle.optimizer.L2Regularization(rate=8e-4), + model_average=paddle.optimizer.ModelAverage(average_window=0.5)) + + # End batch and end pass event handler + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + print "\nPass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + else: + sys.stdout.write('.') + sys.stdout.flush() + if isinstance(event, paddle.event.EndPass): + result = trainer.test(reader=test_reader, feeding=feeding) + print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics) + + # create trainer + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=adam_optimizer) + + trainer.train( + reader=train_reader, + event_handler=event_handler, + feeding=feeding, + num_passes=2) diff --git a/demo/seqToseq/api_train_v2.py b/demo/seqToseq/api_train_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..5d138a8c4f91976d90b19441781248f7b67c854a --- /dev/null +++ b/demo/seqToseq/api_train_v2.py @@ -0,0 +1,146 @@ +import sys +import paddle.v2 as paddle + + +def seqToseq_net(source_dict_dim, target_dict_dim): + ### Network Architecture + word_vector_dim = 512 # dimension of word vector + decoder_size = 512 # dimension of hidden unit in GRU Decoder network + encoder_size = 512 # dimension of hidden unit in GRU Encoder network + + #### Encoder + src_word_id = paddle.layer.data( + name='source_language_word', + type=paddle.data_type.integer_value_sequence(source_dict_dim)) + src_embedding = paddle.layer.embedding( + input=src_word_id, + size=word_vector_dim, + param_attr=paddle.attr.ParamAttr(name='_source_language_embedding')) + src_forward = paddle.networks.simple_gru( + input=src_embedding, size=encoder_size) + src_backward = paddle.networks.simple_gru( + input=src_embedding, size=encoder_size, reverse=True) + encoded_vector = paddle.layer.concat(input=[src_forward, src_backward]) + + #### Decoder + with paddle.layer.mixed(size=decoder_size) as encoded_proj: + encoded_proj += paddle.layer.full_matrix_projection( + input=encoded_vector) + + backward_first = paddle.layer.first_seq(input=src_backward) + + with paddle.layer.mixed( + size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot: + decoder_boot += paddle.layer.full_matrix_projection( + input=backward_first) + + def gru_decoder_with_attention(enc_vec, enc_proj, current_word): + + decoder_mem = paddle.layer.memory( + name='gru_decoder', size=decoder_size, boot_layer=decoder_boot) + + context = paddle.networks.simple_attention( + encoded_sequence=enc_vec, + encoded_proj=enc_proj, + decoder_state=decoder_mem) + + with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs: + decoder_inputs += paddle.layer.full_matrix_projection(input=context) + decoder_inputs += paddle.layer.full_matrix_projection( + input=current_word) + + gru_step = paddle.layer.gru_step( + name='gru_decoder', + input=decoder_inputs, + output_mem=decoder_mem, + size=decoder_size) + + with paddle.layer.mixed( + size=target_dict_dim, + bias_attr=True, + act=paddle.activation.Softmax()) as out: + out += paddle.layer.full_matrix_projection(input=gru_step) + return out + + decoder_group_name = "decoder_group" 
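+    # wrap the encoder outputs as read-only StaticInput memories for the decoder's recurrent_group (see the comment below)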
+ group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True) + group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True) + group_inputs = [group_input1, group_input2] + + trg_embedding = paddle.layer.embedding( + input=paddle.layer.data( + name='target_language_word', + type=paddle.data_type.integer_value_sequence(target_dict_dim)), + size=word_vector_dim, + param_attr=paddle.attr.ParamAttr(name='_target_language_embedding')) + group_inputs.append(trg_embedding) + + # For a decoder equipped with an attention mechanism, in training, + # the target embedding (the ground truth) is the data input, + # while the encoded source sequence is accessed as an unbounded memory. + # Here, the StaticInput defines a read-only memory + # for the recurrent_group. + decoder = paddle.layer.recurrent_group( + name=decoder_group_name, + step=gru_decoder_with_attention, + input=group_inputs) + + lbl = paddle.layer.data( + name='target_language_next_word', + type=paddle.data_type.integer_value_sequence(target_dict_dim)) + cost = paddle.layer.classification_cost(input=decoder, label=lbl) + + return cost + + +def main(): + paddle.init(use_gpu=False, trainer_count=1) + + # source and target dict dim. + dict_size = 30000 + source_dict_dim = target_dict_dim = dict_size + + # define network topology + cost = seqToseq_net(source_dict_dim, target_dict_dim) + parameters = paddle.parameters.create(cost) + + # define the optimization method and trainer + optimizer = paddle.optimizer.Adam( + learning_rate=5e-5, + regularization=paddle.optimizer.L2Regularization(rate=1e-3)) + trainer = paddle.trainer.SGD(cost=cost, + parameters=parameters, + update_equation=optimizer) + + # define data reader + feeding = { + 'source_language_word': 0, + 'target_language_word': 1, + 'target_language_next_word': 2 + } + + wmt14_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.wmt14.train(dict_size=dict_size), buf_size=8192), + batch_size=5) + + # define event_handler callback + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 10 == 0: + print "\nPass %d, Batch %d, Cost %f, %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics) + else: + sys.stdout.write('.') + sys.stdout.flush() + + # start to train + trainer.train( + reader=wmt14_reader, + event_handler=event_handler, + num_passes=10000, + feeding=feeding) + + +if __name__ == '__main__': + main() diff --git a/demo/word2vec/train_v2.py b/demo/word2vec/train_v2.py new file mode 100644 index 0000000000000000000000000000000000000000..7d952b446f9db432062fc3305a6b65b0ad66dd47 --- /dev/null +++ b/demo/word2vec/train_v2.py @@ -0,0 +1,80 @@ +import math + +import paddle.v2 as paddle + +dictsize = 1953 +embsize = 32 +hiddensize = 256 +N = 5 + + +def wordemb(inlayer): + wordemb = paddle.layer.table_projection( + input=inlayer, + size=embsize, + param_attr=paddle.attr.Param( + name="_proj", + initial_std=0.001, + learning_rate=1, + l2_rate=0, )) + return wordemb + + +def main(): + paddle.init(use_gpu=False, trainer_count=1) + word_dict = paddle.dataset.imikolov.build_dict() + dict_size = len(word_dict) + firstword = paddle.layer.data( + name="firstw", type=paddle.data_type.integer_value(dict_size)) + secondword = paddle.layer.data( + name="secondw", type=paddle.data_type.integer_value(dict_size)) + thirdword = paddle.layer.data( + name="thirdw", type=paddle.data_type.integer_value(dict_size)) + fourthword = paddle.layer.data( + name="fourthw", type=paddle.data_type.integer_value(dict_size)) + nextword =
paddle.layer.data( + name="fifthw", type=paddle.data_type.integer_value(dict_size)) + + Efirst = wordemb(firstword) + Esecond = wordemb(secondword) + Ethird = wordemb(thirdword) + Efourth = wordemb(fourthword) + + contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth]) + hidden1 = paddle.layer.fc(input=contextemb, + size=hiddensize, + act=paddle.activation.Sigmoid(), + layer_attr=paddle.attr.Extra(drop_rate=0.5), + bias_attr=paddle.attr.Param(learning_rate=2), + param_attr=paddle.attr.Param( + initial_std=1. / math.sqrt(embsize * 8), + learning_rate=1)) + predictword = paddle.layer.fc(input=hidden1, + size=dict_size, + bias_attr=paddle.attr.Param(learning_rate=2), + act=paddle.activation.Softmax()) + + def event_handler(event): + if isinstance(event, paddle.event.EndIteration): + if event.batch_id % 100 == 0: + result = trainer.test( + paddle.batch( + paddle.dataset.imikolov.test(word_dict, N), 32)) + print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % ( + event.pass_id, event.batch_id, event.cost, event.metrics, + result.metrics) + + cost = paddle.layer.classification_cost(input=predictword, label=nextword) + parameters = paddle.parameters.create(cost) + adam_optimizer = paddle.optimizer.Adam( + learning_rate=3e-3, + regularization=paddle.optimizer.L2Regularization(8e-4)) + trainer = paddle.trainer.SGD(cost, parameters, adam_optimizer) + trainer.train( + paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32), + num_passes=30, + event_handler=event_handler) + + +if __name__ == '__main__': + main() diff --git a/doc/api/index_cn.rst b/doc/api/index_cn.rst index 3718cd73a2003b8ef6c406a9bd51dc68e76402dc..9be0b370ee5e301aee4a6e31b1cfa905754968e8 100644 --- a/doc/api/index_cn.rst +++ b/doc/api/index_cn.rst @@ -1,37 +1,9 @@ -API中文手册 -============ - -DataProvider API ----------------- - -.. toctree:: - :maxdepth: 1 - - data_provider/dataprovider_cn.rst - data_provider/pydataprovider2_cn.rst - -.. _api_trainer_config: - -Model Config API ----------------- - -.. toctree:: - :maxdepth: 1 - - trainer_config_helpers/optimizers.rst - trainer_config_helpers/data_sources.rst - trainer_config_helpers/layers.rst - trainer_config_helpers/activations.rst - trainer_config_helpers/poolings.rst - trainer_config_helpers/networks.rst - trainer_config_helpers/evaluators.rst - trainer_config_helpers/attrs.rst - - -Applications API ----------------- +API +=== .. toctree:: :maxdepth: 1 - predict/swig_py_paddle_cn.rst + 模型配置 + 数据访问 + 训练与应用 diff --git a/doc/api/index_en.rst b/doc/api/index_en.rst index 10c297a71d6988c002de868e804ed9ee2345fbd7..25c1dd00b9cbb3ab647e04cdc2b4c27c552a2332 100644 --- a/doc/api/index_en.rst +++ b/doc/api/index_en.rst @@ -1,37 +1,9 @@ API === -DataProvider API ----------------- - -.. toctree:: - :maxdepth: 1 - - data_provider/dataprovider_en.rst - data_provider/pydataprovider2_en.rst - -.. _api_trainer_config: - -Model Config API ----------------- - -.. toctree:: - :maxdepth: 1 - - trainer_config_helpers/optimizers.rst - trainer_config_helpers/data_sources.rst - trainer_config_helpers/layers.rst - trainer_config_helpers/activations.rst - trainer_config_helpers/poolings.rst - trainer_config_helpers/networks.rst - trainer_config_helpers/evaluators.rst - trainer_config_helpers/attrs.rst - - -Applications API ----------------- - .. 
toctree:: :maxdepth: 1 - predict/swig_py_paddle_en.rst + v2/model_configs.rst + v2/data.rst + v2/run_logic.rst diff --git a/doc/api/data_provider/dataprovider_cn.rst b/doc/api/v1/data_provider/dataprovider_cn.rst similarity index 100% rename from doc/api/data_provider/dataprovider_cn.rst rename to doc/api/v1/data_provider/dataprovider_cn.rst diff --git a/doc/api/data_provider/dataprovider_en.rst b/doc/api/v1/data_provider/dataprovider_en.rst similarity index 100% rename from doc/api/data_provider/dataprovider_en.rst rename to doc/api/v1/data_provider/dataprovider_en.rst diff --git a/doc/api/data_provider/pydataprovider2_cn.rst b/doc/api/v1/data_provider/pydataprovider2_cn.rst similarity index 100% rename from doc/api/data_provider/pydataprovider2_cn.rst rename to doc/api/v1/data_provider/pydataprovider2_cn.rst diff --git a/doc/api/data_provider/pydataprovider2_en.rst b/doc/api/v1/data_provider/pydataprovider2_en.rst similarity index 100% rename from doc/api/data_provider/pydataprovider2_en.rst rename to doc/api/v1/data_provider/pydataprovider2_en.rst diff --git a/doc/api/data_provider/src/mnist_config.py b/doc/api/v1/data_provider/src/mnist_config.py similarity index 100% rename from doc/api/data_provider/src/mnist_config.py rename to doc/api/v1/data_provider/src/mnist_config.py diff --git a/doc/api/data_provider/src/mnist_provider.dict.py b/doc/api/v1/data_provider/src/mnist_provider.dict.py similarity index 100% rename from doc/api/data_provider/src/mnist_provider.dict.py rename to doc/api/v1/data_provider/src/mnist_provider.dict.py diff --git a/doc/api/data_provider/src/mnist_train.txt b/doc/api/v1/data_provider/src/mnist_train.txt similarity index 100% rename from doc/api/data_provider/src/mnist_train.txt rename to doc/api/v1/data_provider/src/mnist_train.txt diff --git a/doc/api/data_provider/src/sentimental_config.py b/doc/api/v1/data_provider/src/sentimental_config.py similarity index 100% rename from doc/api/data_provider/src/sentimental_config.py rename to doc/api/v1/data_provider/src/sentimental_config.py diff --git a/doc/api/data_provider/src/sentimental_provider.py b/doc/api/v1/data_provider/src/sentimental_provider.py similarity index 100% rename from doc/api/data_provider/src/sentimental_provider.py rename to doc/api/v1/data_provider/src/sentimental_provider.py diff --git a/doc/api/data_provider/src/sentimental_train.txt b/doc/api/v1/data_provider/src/sentimental_train.txt similarity index 100% rename from doc/api/data_provider/src/sentimental_train.txt rename to doc/api/v1/data_provider/src/sentimental_train.txt diff --git a/doc/api/data_provider/src/train.list b/doc/api/v1/data_provider/src/train.list similarity index 100% rename from doc/api/data_provider/src/train.list rename to doc/api/v1/data_provider/src/train.list diff --git a/doc/api/v1/index_cn.rst b/doc/api/v1/index_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..3718cd73a2003b8ef6c406a9bd51dc68e76402dc --- /dev/null +++ b/doc/api/v1/index_cn.rst @@ -0,0 +1,37 @@ +API中文手册 +============ + +DataProvider API +---------------- + +.. toctree:: + :maxdepth: 1 + + data_provider/dataprovider_cn.rst + data_provider/pydataprovider2_cn.rst + +.. _api_trainer_config: + +Model Config API +---------------- + +.. 
toctree:: + :maxdepth: 1 + + trainer_config_helpers/optimizers.rst + trainer_config_helpers/data_sources.rst + trainer_config_helpers/layers.rst + trainer_config_helpers/activations.rst + trainer_config_helpers/poolings.rst + trainer_config_helpers/networks.rst + trainer_config_helpers/evaluators.rst + trainer_config_helpers/attrs.rst + + +Applications API +---------------- + +.. toctree:: + :maxdepth: 1 + + predict/swig_py_paddle_cn.rst diff --git a/doc/api/v1/index_en.rst b/doc/api/v1/index_en.rst new file mode 100644 index 0000000000000000000000000000000000000000..10c297a71d6988c002de868e804ed9ee2345fbd7 --- /dev/null +++ b/doc/api/v1/index_en.rst @@ -0,0 +1,37 @@ +API +=== + +DataProvider API +---------------- + +.. toctree:: + :maxdepth: 1 + + data_provider/dataprovider_en.rst + data_provider/pydataprovider2_en.rst + +.. _api_trainer_config: + +Model Config API +---------------- + +.. toctree:: + :maxdepth: 1 + + trainer_config_helpers/optimizers.rst + trainer_config_helpers/data_sources.rst + trainer_config_helpers/layers.rst + trainer_config_helpers/activations.rst + trainer_config_helpers/poolings.rst + trainer_config_helpers/networks.rst + trainer_config_helpers/evaluators.rst + trainer_config_helpers/attrs.rst + + +Applications API +---------------- + +.. toctree:: + :maxdepth: 1 + + predict/swig_py_paddle_en.rst diff --git a/doc/api/predict/src/predict_sample.py b/doc/api/v1/predict/src/predict_sample.py similarity index 100% rename from doc/api/predict/src/predict_sample.py rename to doc/api/v1/predict/src/predict_sample.py diff --git a/doc/api/predict/swig_py_paddle_cn.rst b/doc/api/v1/predict/swig_py_paddle_cn.rst similarity index 100% rename from doc/api/predict/swig_py_paddle_cn.rst rename to doc/api/v1/predict/swig_py_paddle_cn.rst diff --git a/doc/api/predict/swig_py_paddle_en.rst b/doc/api/v1/predict/swig_py_paddle_en.rst similarity index 100% rename from doc/api/predict/swig_py_paddle_en.rst rename to doc/api/v1/predict/swig_py_paddle_en.rst diff --git a/doc/api/trainer_config_helpers/activations.rst b/doc/api/v1/trainer_config_helpers/activations.rst similarity index 100% rename from doc/api/trainer_config_helpers/activations.rst rename to doc/api/v1/trainer_config_helpers/activations.rst diff --git a/doc/api/trainer_config_helpers/attrs.rst b/doc/api/v1/trainer_config_helpers/attrs.rst similarity index 100% rename from doc/api/trainer_config_helpers/attrs.rst rename to doc/api/v1/trainer_config_helpers/attrs.rst diff --git a/doc/api/trainer_config_helpers/data_sources.rst b/doc/api/v1/trainer_config_helpers/data_sources.rst similarity index 100% rename from doc/api/trainer_config_helpers/data_sources.rst rename to doc/api/v1/trainer_config_helpers/data_sources.rst diff --git a/doc/api/trainer_config_helpers/evaluators.rst b/doc/api/v1/trainer_config_helpers/evaluators.rst similarity index 100% rename from doc/api/trainer_config_helpers/evaluators.rst rename to doc/api/v1/trainer_config_helpers/evaluators.rst diff --git a/doc/api/trainer_config_helpers/layers.rst b/doc/api/v1/trainer_config_helpers/layers.rst similarity index 93% rename from doc/api/trainer_config_helpers/layers.rst rename to doc/api/v1/trainer_config_helpers/layers.rst index 8b0e553eacc932bc59062103ac6e6ac4245d03cb..24389c2d8574dfda4bec9298776aa6b1aee51535 100644 --- a/doc/api/trainer_config_helpers/layers.rst +++ b/doc/api/v1/trainer_config_helpers/layers.rst @@ -139,24 +139,12 @@ lstmemory :members: lstmemory :noindex: -lstm_step_layer ---------------- -.. 
automodule:: paddle.trainer_config_helpers.layers - :members: lstm_step_layer - :noindex: - grumemory --------- .. automodule:: paddle.trainer_config_helpers.layers :members: grumemory :noindex: -gru_step_layer ---------------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: gru_step_layer - :noindex: - Recurrent Layer Group ===================== @@ -172,6 +160,18 @@ recurrent_group :members: recurrent_group :noindex: +lstm_step_layer +--------------- +.. automodule:: paddle.trainer_config_helpers.layers + :members: lstm_step_layer + :noindex: + +gru_step_layer +--------------- +.. automodule:: paddle.trainer_config_helpers.layers + :members: gru_step_layer + :noindex: + beam_search ------------ .. automodule:: paddle.trainer_config_helpers.layers @@ -279,6 +279,12 @@ concat_layer :members: concat_layer :noindex: +seq_concat_layer +---------------- +.. automodule:: paddle.trainer_config_helpers.layers + :members: seq_concat_layer + :noindex: + Reshaping Layers ================ @@ -302,6 +308,18 @@ repeat_layer :members: repeat_layer :noindex: +rotate_layer +------------ +.. automodule:: paddle.trainer_config_helpers.layers + :members: rotate_layer + :noindex: + +seq_reshape_layer +----------------- +.. automodule:: paddle.trainer_config_helpers.layers + :members: seq_reshape_layer + :noindex: + Math Layers =========== @@ -414,6 +432,12 @@ multi_binary_label_cross_entropy :members: multi_binary_label_cross_entropy :noindex: +mse_cost +--------- +.. automodule:: paddle.trainer_config_helpers.layers + :members: mse_cost + :noindex: + huber_cost ---------- .. automodule:: paddle.trainer_config_helpers.layers @@ -432,6 +456,12 @@ rank_cost :members: rank_cost :noindex: +sum_cost +--------- +.. automodule:: paddle.trainer_config_helpers.layers + :members: sum_cost + :noindex: + crf_layer ----------------- .. automodule:: paddle.trainer_config_helpers.layers @@ -450,6 +480,12 @@ ctc_layer :members: ctc_layer :noindex: +warp_ctc_layer +-------------- +.. automodule:: paddle.trainer_config_helpers.layers + :members: warp_ctc_layer + :noindex: + nce_layer ----------- .. automodule:: paddle.trainer_config_helpers.layers @@ -462,12 +498,6 @@ hsigmoid :members: hsigmoid :noindex: -sum_cost ---------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: sum_cost - :noindex: - Check Layer ============ diff --git a/doc/api/trainer_config_helpers/networks.rst b/doc/api/v1/trainer_config_helpers/networks.rst similarity index 100% rename from doc/api/trainer_config_helpers/networks.rst rename to doc/api/v1/trainer_config_helpers/networks.rst diff --git a/doc/api/trainer_config_helpers/optimizers.rst b/doc/api/v1/trainer_config_helpers/optimizers.rst similarity index 100% rename from doc/api/trainer_config_helpers/optimizers.rst rename to doc/api/v1/trainer_config_helpers/optimizers.rst diff --git a/doc/api/trainer_config_helpers/poolings.rst b/doc/api/v1/trainer_config_helpers/poolings.rst similarity index 100% rename from doc/api/trainer_config_helpers/poolings.rst rename to doc/api/v1/trainer_config_helpers/poolings.rst diff --git a/doc/api/v2/config/activation.rst b/doc/api/v2/config/activation.rst new file mode 100644 index 0000000000000000000000000000000000000000..eca3ce03bcdc599edca802d8dfca48d4f28275a2 --- /dev/null +++ b/doc/api/v2/config/activation.rst @@ -0,0 +1,101 @@ +=========== +Activation +=========== + +Abs +=== + +.. automodule:: paddle.v2.activation + :members: Abs + :noindex: + +Exp +=== + +.. 
automodule:: paddle.v2.activation + :members: Exp + :noindex: + +Identity +======== + +.. automodule:: paddle.v2.activation + :members: Identity + :noindex: + +Linear +====== + +.. automodule:: paddle.v2.activation + :members: Linear + :noindex: + +Log +=== + +.. automodule:: paddle.v2.activation + :members: Log + :noindex: + +Square +====== + +.. automodule:: paddle.v2.activation + :members: Square + :noindex: + +Sigmoid +======= + +.. automodule:: paddle.v2.activation + :members: Sigmoid + :noindex: + +Softmax +======= + +.. automodule:: paddle.v2.activation + :members: Softmax + :noindex: + +SequenceSoftmax +=============== + +.. automodule:: paddle.v2.activation + :members: SequenceSoftmax + :noindex: + +Relu +==== + +.. automodule:: paddle.v2.activation + :members: Relu + :noindex: + +BRelu +===== + +.. automodule:: paddle.v2.activation + :members: BRelu + :noindex: + +SoftRelu +======== + +.. automodule:: paddle.v2.activation + :members: SoftRelu + :noindex: + +Tanh +==== + +.. automodule:: paddle.v2.activation + :members: Tanh + :noindex: + +STanh +===== + +.. automodule:: paddle.v2.activation + :members: STanh + :noindex: diff --git a/doc/api/v2/config/attr.rst b/doc/api/v2/config/attr.rst new file mode 100644 index 0000000000000000000000000000000000000000..a93f41b86779200d8bac651614f4d61f4895875f --- /dev/null +++ b/doc/api/v2/config/attr.rst @@ -0,0 +1,6 @@ +Parameter Attribute +=================== + +.. automodule:: paddle.v2.attr + :members: + :noindex: diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst new file mode 100644 index 0000000000000000000000000000000000000000..db33a20487e579cda67a01c52ee646829df0f4e6 --- /dev/null +++ b/doc/api/v2/config/layer.rst @@ -0,0 +1,487 @@ +.. _api_v2.layer: + +====== +Layers +====== + +Data layer +=========== + +.. _api_v2.layer_data: + +data +---- +.. automodule:: paddle.v2.layer + :members: data + :noindex: + +Fully Connected Layers +====================== + +.. _api_v2.layer_fc: + +fc +-- +.. automodule:: paddle.v2.layer + :members: fc + :noindex: + +selective_fc +------------ +.. automodule:: paddle.v2.layer + :members: selective_fc + :noindex: + +Conv Layers +=========== + +conv_operator +------------- +.. automodule:: paddle.v2.layer + :members: conv_operator + :noindex: + +conv_projection +--------------- +.. automodule:: paddle.v2.layer + :members: conv_projection + :noindex: + +conv_shift +---------- +.. automodule:: paddle.v2.layer + :members: conv_shift + :noindex: + +img_conv +-------- +.. automodule:: paddle.v2.layer + :members: img_conv + :noindex: + +.. _api_v2.layer_context_projection: + +context_projection +------------------ +.. automodule:: paddle.v2.layer + :members: context_projection + :noindex: + +Image Pooling Layer +=================== + +img_pool +-------- +.. automodule:: paddle.v2.layer + :members: img_pool + :noindex: + +spp +--- +.. automodule:: paddle.v2.layer + :members: spp + :noindex: + +maxout +------ +.. automodule:: paddle.v2.layer + :members: maxout + :noindex: + +Norm Layer +========== + +img_cmrnorm +----------- +.. automodule:: paddle.v2.layer + :members: img_cmrnorm + :noindex: + +batch_norm +---------- +.. automodule:: paddle.v2.layer + :members: batch_norm + :noindex: + +sum_to_one_norm +--------------- +.. automodule:: paddle.v2.layer + :members: sum_to_one_norm + :noindex: + +Recurrent Layers +================ + +recurrent +--------- +.. automodule:: paddle.v2.layer + :members: recurrent + :noindex: + +lstmemory +--------- +.. 
automodule:: paddle.v2.layer + :members: lstmemory + :noindex: + +grumemory +--------- +.. automodule:: paddle.v2.layer + :members: grumemory + :noindex: + +Recurrent Layer Group +===================== + +memory +------ +.. automodule:: paddle.v2.layer + :members: memory + :noindex: + +recurrent_group +--------------- +.. automodule:: paddle.v2.layer + :members: recurrent_group + :noindex: + +lstm_step +--------- +.. automodule:: paddle.v2.layer + :members: lstm_step + :noindex: + +gru_step +-------- +.. automodule:: paddle.v2.layer + :members: gru_step + :noindex: + +beam_search +------------ +.. automodule:: paddle.v2.layer + :members: beam_search + :noindex: + +get_output +---------- +.. automodule:: paddle.v2.layer + :members: get_output + :noindex: + +Mixed Layer +=========== + +.. _api_v2.layer_mixed: + +mixed +----- +.. automodule:: paddle.v2.layer + :members: mixed + :noindex: + +.. _api_v2.layer_embedding: + +embedding +--------- +.. automodule:: paddle.v2.layer + :members: embedding + :noindex: + +scaling_projection +------------------ +.. automodule:: paddle.v2.layer + :members: scaling_projection + :noindex: + +dotmul_projection +----------------- +.. automodule:: paddle.v2.layer + :members: dotmul_projection + :noindex: + +dotmul_operator +--------------- +.. automodule:: paddle.v2.layer + :members: dotmul_operator + :noindex: + +full_matrix_projection +---------------------- +.. automodule:: paddle.v2.layer + :members: full_matrix_projection + :noindex: + +identity_projection +------------------- +.. automodule:: paddle.v2.layer + :members: identity_projection + :noindex: + + +table_projection +---------------- +.. automodule:: paddle.v2.layer + :members: table_projection + :noindex: + +trans_full_matrix_projection +---------------------------- +.. automodule:: paddle.v2.layer + :members: trans_full_matrix_projection + :noindex: + +Aggregate Layers +================ + +.. _api_v2.layer_pooling: + +pooling +------- +.. automodule:: paddle.v2.layer + :members: pooling + :noindex: + +.. _api_v2.layer_last_seq: + +last_seq +-------- +.. automodule:: paddle.v2.layer + :members: last_seq + :noindex: + +.. _api_v2.layer_first_seq: + +first_seq +--------- +.. automodule:: paddle.v2.layer + :members: first_seq + :noindex: + +concat +------ +.. automodule:: paddle.v2.layer + :members: concat + :noindex: + +seq_concat +---------- +.. automodule:: paddle.v2.layer + :members: seq_concat + :noindex: + +Reshaping Layers +================ + +block_expand +------------ +.. automodule:: paddle.v2.layer + :members: block_expand + :noindex: + +.. _api_v2.layer_expand: + +expand +------ +.. automodule:: paddle.v2.layer + :members: expand + :noindex: + +repeat +------ +.. automodule:: paddle.v2.layer + :members: repeat + :noindex: + +rotate +------ +.. automodule:: paddle.v2.layer + :members: rotate + :noindex: + +seq_reshape +----------- +.. automodule:: paddle.v2.layer + :members: seq_reshape + :noindex: + +Math Layers +=========== + +addto +----- +.. automodule:: paddle.v2.layer + :members: addto + :noindex: + +linear_comb +----------- +.. automodule:: paddle.v2.layer + :members: linear_comb + :noindex: + +interpolation +------------- +.. automodule:: paddle.v2.layer + :members: interpolation + :noindex: + +bilinear_interp +--------------- +.. automodule:: paddle.v2.layer + :members: bilinear_interp + :noindex: + +power +----- +.. automodule:: paddle.v2.layer + :members: power + :noindex: + +scaling +------- +.. 
automodule:: paddle.v2.layer + :members: scaling + :noindex: + +slope_intercept +--------------- +.. automodule:: paddle.v2.layer + :members: slope_intercept + :noindex: + +tensor +------ +.. automodule:: paddle.v2.layer + :members: tensor + :noindex: + +.. _api_v2.layer_cos_sim: + +cos_sim +------- +.. automodule:: paddle.v2.layer + :members: cos_sim + :noindex: + +trans +----- +.. automodule:: paddle.v2.layer + :members: trans + :noindex: + +Sampling Layers +=============== + +maxid +----- +.. automodule:: paddle.v2.layer + :members: maxid + :noindex: + +sampling_id +----------- +.. automodule:: paddle.v2.layer + :members: sampling_id + :noindex: + +Slicing and Joining Layers +========================== + +pad +---- +.. automodule:: paddle.v2.layer + :members: pad + :noindex: + +.. _api_v2.layer_costs: + +Cost Layers +=========== + +cross_entropy_cost +------------------ +.. automodule:: paddle.v2.layer + :members: cross_entropy_cost + :noindex: + +cross_entropy_with_selfnorm_cost +-------------------------------- +.. automodule:: paddle.v2.layer + :members: cross_entropy_with_selfnorm_cost + :noindex: + +multi_binary_label_cross_entropy_cost +------------------------------------- +.. automodule:: paddle.v2.layer + :members: multi_binary_label_cross_entropy_cost + :noindex: + +huber_cost +---------- +.. automodule:: paddle.v2.layer + :members: huber_cost + :noindex: + +lambda_cost +----------- +.. automodule:: paddle.v2.layer + :members: lambda_cost + :noindex: + +rank_cost +--------- +.. automodule:: paddle.v2.layer + :members: rank_cost + :noindex: + +sum_cost +--------- +.. automodule:: paddle.v2.layer + :members: sum_cost + :noindex: + +crf +--- +.. automodule:: paddle.v2.layer + :members: crf + :noindex: + +crf_decoding +------------ +.. automodule:: paddle.v2.layer + :members: crf_decoding + :noindex: + +ctc +--- +.. automodule:: paddle.v2.layer + :members: ctc + :noindex: + +warp_ctc +-------- +.. automodule:: paddle.v2.layer + :members: warp_ctc + :noindex: + +nce +--- +.. automodule:: paddle.v2.layer + :members: nce + :noindex: + +hsigmoid +--------- +.. automodule:: paddle.v2.layer + :members: hsigmoid + :noindex: + +Check Layer +============ + +eos +--- +.. automodule:: paddle.v2.layer + :members: eos + :noindex: diff --git a/doc/api/v2/config/networks.rst b/doc/api/v2/config/networks.rst new file mode 100644 index 0000000000000000000000000000000000000000..6f209bc95bec7279051118bb857a96515e0371a9 --- /dev/null +++ b/doc/api/v2/config/networks.rst @@ -0,0 +1,117 @@ +======== +Networks +======== + +The v2.networks module contains pieces of neural network that combine multiple layers. + +NLP +=== + +sequence_conv_pool +------------------ +.. automodule:: paddle.v2.networks + :members: sequence_conv_pool + :noindex: + +.. _api_trainer_config_helpers_network_text_conv_pool: + +text_conv_pool +-------------- +.. automodule:: paddle.v2.networks + :members: text_conv_pool + :noindex: + +Images +====== + +img_conv_bn_pool +---------------- +.. automodule:: paddle.v2.networks + :members: img_conv_bn_pool + :noindex: + +img_conv_group +-------------- +.. automodule:: paddle.v2.networks + :members: img_conv_group + :noindex: + +.. _api_trainer_config_helpers_network_simple_img_conv_pool: + +simple_img_conv_pool +-------------------- +.. automodule:: paddle.v2.networks + :members: simple_img_conv_pool + :noindex: + +vgg_16_network +--------------- +.. 
automodule:: paddle.v2.networks + :members: vgg_16_network + :noindex: + +Recurrent +========= + +LSTM +---- + +lstmemory_unit +`````````````` +.. automodule:: paddle.v2.networks + :members: lstmemory_unit + :noindex: + +lstmemory_group +``````````````` +.. automodule:: paddle.v2.networks + :members: lstmemory_group + :noindex: + +simple_lstm +``````````` +.. automodule:: paddle.v2.networks + :members: simple_lstm + :noindex: + +bidirectional_lstm +`````````````````` +.. automodule:: paddle.v2.networks + :members: bidirectional_lstm + :noindex: + +GRU +--- + +gru_unit +```````` +.. automodule:: paddle.v2.networks + :members: gru_unit + :noindex: + +gru_group +````````` +.. automodule:: paddle.v2.networks + :members: gru_group + :noindex: + +simple_gru +`````````` +.. automodule:: paddle.v2.networks + :members: simple_gru + :noindex: + +simple_attention +---------------- +.. automodule:: paddle.v2.networks + :members: simple_attention + :noindex: + +Miscs +===== + +dropout_layer +-------------- +.. automodule:: paddle.v2.networks + :members: dropout_layer + :noindex: diff --git a/doc/api/v2/config/optimizer.rst b/doc/api/v2/config/optimizer.rst new file mode 100644 index 0000000000000000000000000000000000000000..ec6ba0aa46239f3806ca950e8863b953d0c4150b --- /dev/null +++ b/doc/api/v2/config/optimizer.rst @@ -0,0 +1,47 @@ +.. _api_v2.optimizer: + +========== +Optimizer +========== + +Momentum +======== +.. automodule:: paddle.v2.optimizer + :members: Momentum + :noindex: + +Adam +==== +.. automodule:: paddle.v2.optimizer + :members: Adam + :noindex: + +Adamax +====== +.. automodule:: paddle.v2.optimizer + :members: Adamax + :noindex: + +AdaGrad +======= +.. automodule:: paddle.v2.optimizer + :members: AdaGrad + :noindex: + +DecayedAdaGrad +============== +.. automodule:: paddle.v2.optimizer + :members: DecayedAdaGrad + :noindex: + +AdaDelta +======== +.. automodule:: paddle.v2.optimizer + :members: AdaDelta + :noindex: + +RMSProp +======= +.. automodule:: paddle.v2.optimizer + :members: RMSProp + :noindex: diff --git a/doc/api/v2/config/pooling.rst b/doc/api/v2/config/pooling.rst new file mode 100644 index 0000000000000000000000000000000000000000..d26b365c9284632210a1532853e39feedc70758b --- /dev/null +++ b/doc/api/v2/config/pooling.rst @@ -0,0 +1,46 @@ +======= +Pooling +======= + +BasePool +======== +.. automodule:: paddle.v2.pooling + :members: BasePool + :noindex: + +Avg +=== +.. automodule:: paddle.v2.pooling + :members: Avg + :noindex: + +Max +=== +.. automodule:: paddle.v2.pooling + :members: Max + :noindex: + +Sum +=== +.. automodule:: paddle.v2.pooling + :members: Sum + :noindex: + +SquareRootN +=========== +.. automodule:: paddle.v2.pooling + :members: SquareRootN + :noindex: + +CudnnAvg +======== +.. automodule:: paddle.v2.pooling + :members: CudnnAvg + :noindex: + +CudnnMax +======== +.. automodule:: paddle.v2.pooling + :members: CudnnMax + :noindex: + diff --git a/doc/api/v2/data.rst b/doc/api/v2/data.rst new file mode 100644 index 0000000000000000000000000000000000000000..b042320bc2922a1ddfa06b5d8479ac9134ae9d89 --- /dev/null +++ b/doc/api/v2/data.rst @@ -0,0 +1,108 @@ +======== +Datasets +======== + + +DataTypes +========= + +.. automodule:: paddle.v2.data_type + :members: + :noindex: + +DataFeeder +========== + +.. automodule:: paddle.v2.data_feeder + :members: + :noindex: + +Reader +====== + +.. automodule:: paddle.v2.reader + :members: + :noindex: + +.. automodule:: paddle.v2.reader.creator + :members: + :noindex: + +minibatch +========= + +.. 
automodule:: paddle.v2.minibatch
+   :members:
+   :noindex:
+
+Dataset
+=======
+
+.. automodule:: paddle.v2.dataset
+   :members:
+   :noindex:
+
+mnist
++++++
+
+.. automodule:: paddle.v2.dataset.mnist
+   :members:
+   :noindex:
+
+
+cifar
++++++
+
+.. automodule:: paddle.v2.dataset.cifar
+   :members:
+   :noindex:
+
+conll05
++++++++
+
+.. automodule:: paddle.v2.dataset.conll05
+   :members:
+   :noindex:
+
+imdb
+++++
+
+.. automodule:: paddle.v2.dataset.imdb
+   :members:
+   :noindex:
+
+imikolov
+++++++++
+
+.. automodule:: paddle.v2.dataset.imikolov
+   :members:
+   :noindex:
+
+movielens
++++++++++
+
+.. automodule:: paddle.v2.dataset.movielens
+   :members:
+   :noindex:
+
+sentiment
++++++++++
+
+.. automodule:: paddle.v2.dataset.sentiment
+   :members:
+   :noindex:
+
+uci_housing
++++++++++++
+
+.. automodule:: paddle.v2.dataset.uci_housing
+   :members:
+   :noindex:
+
+wmt14
++++++
+
+.. automodule:: paddle.v2.dataset.wmt14
+   :members:
+   :noindex:
+
diff --git a/doc/api/v2/model_configs.rst b/doc/api/v2/model_configs.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a5fae7e29e56d9e236d489353a5c8967d1954641
--- /dev/null
+++ b/doc/api/v2/model_configs.rst
@@ -0,0 +1,12 @@
+Model Configuration
+===================
+
+.. toctree::
+   :maxdepth: 1
+
+   config/activation.rst
+   config/layer.rst
+   config/optimizer.rst
+   config/pooling.rst
+   config/networks.rst
+   config/attr.rst
diff --git a/doc/api/v2/run_logic.rst b/doc/api/v2/run_logic.rst
new file mode 100644
index 0000000000000000000000000000000000000000..94921e1a7b9c0a95931136bfb65d2560dba8b8ee
--- /dev/null
+++ b/doc/api/v2/run_logic.rst
@@ -0,0 +1,27 @@
+======================
+Training and Inference
+======================
+
+Parameters
+==========
+
+.. automodule:: paddle.v2.parameters
+   :noindex:
+
+Trainer
+=======
+
+.. automodule:: paddle.v2.trainer
+   :noindex:
+
+Event
+=====
+
+.. automodule:: paddle.v2.event
+   :noindex:
+
+Inference
+=========
+
+.. autofunction:: paddle.v2.infer
+   :noindex:
diff --git a/doc/design/api.md b/doc/design/api.md
index dd4341b32490dc61a16b5580900cbfaa0dd70e2a..8185d2af0ea264a2e7b4e28b9ed05279e4a22014 100644
--- a/doc/design/api.md
+++ b/doc/design/api.md
@@ -2,140 +2,148 @@
 
 ## Ingredients
 
-As the first step of our design, we list important concepts in deep
-learning and try to figure their relationship, as shown below:
+Our design principle is to start from the essence: how can we
+allow users to express and solve their problems in terms of neural networks.
+Some essential concepts that our API has to provide include:
 
-```
-Model = {topology, parameters}
+1. A *topology* is an expression of *layers*.
 
-Evaluator = {Model*, activations}
-- forward
-- test(cost, ...)
+1. A layer could be any kind of computation, including *cost*.
 
-GradientMachine = {Evaluator*, gradients}
-- backward
+1. Some layers have parameters, some don't. Most costs don't have
+   parameters.
 
-Optimizer = {GradientMachine*}
-- train(cost, ...)
-- update
-- checkpoint
-```
+1. In some topologies, layers share parameters.  For
+   example,
+   [the network for training a ranking model](https://github.com/PaddlePaddle/Paddle/issues/1311#issuecomment-279121850).
 
-where the pair of curly braces `{` and `}` indicate *composition*, `*`
-indicates a *reference*, and `-` marks a "class method".
+1. At programming time, users specify topologies and possible sharing
+   of parameters, as the sketch below illustrates.  PaddlePaddle can figure out and create parameters
+   required (and possibly shared) by one or more topologies.
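+
+As a minimal sketch (not a settled API; the layer names and arguments
+below are only illustrative, in the spirit of the examples in the next
+section), a topology could be written as a plain Python expression:
+
+```python
+# Each call builds a layer; the whole expression is a topology.
+img = paddle.layer.data(input_name="image")        # a layer without parameters
+hidden = paddle.layer.fc(img, parameter_name="w")  # a layer with parameters
+label = paddle.layer.data(input_name="label")      # another parameter-free layer
+topology = paddle.layer.cross_entropy(hidden, label)  # a cost is a layer, too
+```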
-### Model
+## Starting from Examples
 
-We used to think that parameters are part of the topology (or layers).
-But that is not true because multiple layers could share the same
-parameter matrix.  An example is a network that compares two text
-segments in a semantic space:
+As a summarization
+of
+[our discussion](https://github.com/PaddlePaddle/Paddle/issues/1315),
+let us present two examples here:
 
-```
-          semantic
-text A -> projection ---\
-          layer A        \
-                          cosine
-                          similarity -> output
-                          layer
-          semantic       /
-text B -> projection ---/
-          layer B
-```
 
-In this network, the two semantic projection layers (A and B) share
-the same parameter matrix.
+### Example 1. Sharing Parameters between Layers
 
-For more information about our API that specifies topology and
-parameter sharing, please refer to [TODO: API].
+We use
+the
+[3-branch ranking](https://github.com/PaddlePaddle/Paddle/issues/1311#issuecomment-279121850) model
+in this example.  For your convenience, I copy-and-paste the model's
+topology as follows:
 
+```
+A -> f -\
+Q -> f --> cost
+B -> f -/
+```
 
-### Evaluator
-
-Supposed that we have a trained ranking model, we should be able to
-use it in our search engine.  The search engine's Web server is a
-concurrent program so to serve many HTTP requests simultaneously.  It
-doesn't make sense for each of these threads to have its own copy of the model because that would duplicate topologies and parameters.
-However, each thread should be able to record layer outputs, i.e.,
-activations, computed from an input, derived from the request.  With
-*Evaluator* that saves activations, we can write the over-simplified
-server program as:
+The following program trains the topology including the cost, and then
+uses the sub-network in the trained topology for inference:
 
 ```python
-m = paddle.model.load("trained.model")
-
-http.handle("/",
-            lambda req:
-                e = paddle.evaluator.create(m)
-                e.forward(req)
-                e.activation(layer="output")) # returns activations of layer "output"
+def f(inlayer):
+    e = paddle.layer.embedding(inlayer, parameter_name="embedding")
+    o = paddle.layer.softmax(e, parameter_name="semantic")
+    return o
+
+# Create 3 topologies (subnets); they share parameters because all
+# corresponding layers have the same parameter names.
+fA = f(paddle.layer.data(input_name="A"))
+fB = f(paddle.layer.data(input_name="B"))
+fQ = f(paddle.layer.data(input_name="Q"))
+
+topology = paddle.layer.less_than(
+    paddle.layer.cross_entropy(fA, fQ),
+    paddle.layer.cross_entropy(fB, fQ))
+
+# Derive parameters required in topology and create them in model.
+parameters = paddle.parameters.create(topology)
+
+# Estimate parameters used in topology from data.
+paddle.train(topology, parameters, reader=read_ranking_model_data)
+
+# Inference using fA (or fB or fQ, as they share their parameters).
+[testA, testB, testQ] = read_ranking_model_data()
+print "The semantic vector of testA: ", paddle.infer(fA, parameters, testA)
 ```
 
-### GradientMachine
-
-Similar to the evaluation, the training needs to compute gradients so
-to update model parameters.  Because an [optimizer](#optimizer) might
-run multiple simultaneous threads to update the same model, gradients
-should be separated from the model.  Because gradients are only used
-in training, but not serving, they should be separate from Evaluator.
-Hence the `GradientMachine`.
-
-### Optimizer
-
-None of Model, Evaluator, nor GradientMachine implements the training
-loop, hence Optimizer.  We can define a concurrent optimizer that runs
-multiple simultaneous threads to train a model -- just let each
-thread has its own GradientMachine object.
-
-Most models should be able to be trained using the
-`paddle.optimizer.SGD` by calling its `train` method.  Many
-customizations to the SGD algorithm happens with the update equation,
-e.g., momentum and the Adam SGD algorithm.  We make `train` calls
-`update` to do an update, so that we can derive a `paddle.optimizer.Adam`
-from `paddle.optimizer.SGD` by overrides only the `update` method.
+### Example 2. Sharing Parameters between "Models"
 
+We use [GAN](https://github.com/PaddlePaddle/book/tree/develop/gan) in
+this example.  In the following example program, `d0` and `d1`
+correspond to the two networks in the following figure:
 
-## Programming Interface
-
-A fictive example of PaddlePaddle program looks like the following:
+
 
 ```python
-import paddle
+def G(inlayer):
+    # over-simplified example, as G has only one layer:
+    return paddle.layer.fc(inlayer, parameter_name="G")
+
+def D(inlayer):
+    # again, over-simplified:
+    return paddle.layer.fc(inlayer, parameter_name="D")
+
+# Construct the first topology, which contains both D and G.
+# By learning this topology, we update parameters of G.
+d0 = paddle.layer.should_be_false(D(G(paddle.layer.data())))
+
+# Construct a second topology d1, which contains only D. By
+# training this topology, we update parameters of D.  Note
+# that d1 shares parameters with d0.
+d1 = paddle.layer.should_be_true(D(paddle.layer.data()))
+
+# Create parameters from a list of multiple topologies (models) for
+# the chance to share parameters between these topologies.
+parameters = paddle.parameters.create([d0, d1])
+
+# Iterative training of GAN.
+for ...:
+    train(d0, parameters, reader=read_from_rng, immutable_parameters={"D"})
+    train(d1, parameters, reader=read_from_realistic_images)
+
+# Use d1 for inference:
+print "D thinks a batch of images are realistic ", infer(d1, parameters, read_mnist_images)
+```
 
-def read(args):
-    f = open_file(args["filename"])
-    mb = read_a_minibatch(f)
-    end_pass = eof(f)
-    if end_pass:
-       f = open_file(args["filename"]) # rewind for reading again
-    yield mb, end_pass
-
-input = paddle.layer.data(...)
-intermediate = paddle.layers.fc(input)
-output = paddle.layer.softmax(intermediate)
+### Summarization
 
-model = paddle.model.create(output)
-paddle.train(model, data_provider=read)
-```
+The above two programs reveal some important design concerns:
 
-This shows some important part of a program:
+1. Users describe a topology as an expression of layers.  Every layer
+   has a *parameter name*.  If the users don't specify it explicitly, it's automatically generated as a unique name.  By
+   specifying the parameter name, users can specify the sharing of
+   parameters between layers and even between topologies.
 
-1. Define how to read (and augment) data by defining a function, in
-   this example, `read`, that `yields` a minibatch and a boolean flag
-   `eof_of_pass`.
+1. `paddle.parameters.create` figures out parameters required by one
+   or more topologies from parameter names of layers.  It creates these
+   parameters and returns a `ParameterSet` object, which is in essence
+   a map from *parameter names* to *parameters*.
 
-1. Define the topology, `input`, `intermediate`, and `output` in this
-   example.
+1. At training and inference time, `paddle.train` and `paddle.infer`
+   require both a topology and the parameter set that holds the parameters of that topology.  There are some reasons:
 
-1. Create parameters from the topology thus forms the model by calling
-   `paddel.model.create`.
+   1. This prevents users from forgetting to call
+      `paddle.parameters.create`.
+   1. `paddle.train` needs to know which parameter set to update.
+   1. Users could load another (pre-trained) parameter set and use it
+      with a topology in `train.infer`.
 
-1. Train the model by calling `paddle.train`.
+1. By specifying the `immutable_parameters` parameter of
+   `paddle.train`, we can forbid the update of these parameters.
 
-### Reader
+## Reader
 
 Not all programming frameworks allow users to define I/O functions.
 An example is Google MapReduce, which can only read from text,
@@ -145,91 +153,67 @@ readers and writers by deriving from base classes `Reader` and
 decide to provide the flexibility to users to define their readers.
 
-#### A Synthetic Data Reader
+There are some open questions here:
 
-Sometimes we want to test a topology and/or a training algorithm using
-synthetic data.  We can do this by defining the reader a synthesizer:
+1. **Should a reader return a Python dictionary?**
 
-```python
-def read(args):
-    x = sample_from_uniform(0.0, 1.0)
-    y = sample_from_gauss(2 * x, sigma)
-    yield {x, y}, False # no end-of-file so no end-of-pass
-```
+1. **How to map multiple outputs from a reader to multiple data layers?**
 
-#### A Reader for Online Learning
+1. **How to easily compose some existing readers to read more data and
+   feed a topology with more data layers?**
 
-Readers can also read an infinite data stream, e.g., a log stream from
-a search engine and collected by Kafka:
 
-```python
-def read(args):
-    log_stream = kafka.open_channel(args["kafka channel name"])
-    yeild log_stream.read(), False # no end-of-pass in online learning
-```
+## Training
 
-### Topology
-
-By default, layers don't have names. But if we want to refer to a
-layer later some time, for example, when we do serving using the model
-and wants activations/outputs of a layer, we should give it a name.
+The recommended way to train a model is to call `paddle.train`,
+which simply calls `paddle.trainer.Default`, a global variable of
+type `paddle.trainer.SGD`.  Equivalently, we can do
 
 ```python
-input = paddle.layer.data(...)
-intermediate = paddle.layer.fc(input, name="inter", ...)
-output = paddle.layer.softmax(intermediate, name="output", ...)
-
-m = paddle.model.create(output)
-e = paddle.evaluator.create(model)
-e.forward(read_an_input()) # compute activations of all layers.
-print e.activations(layer="inter") # retrieve the activations of layer "inter"
-print e.activations(layer="output") # retrieve the activations of layer "output"
+opt = paddle.trainer.SGD(..., paddle.updater.Adam(...))
+opt.train(topology, parameters, reader=read, ...)
 ```
 
-#### Sharing Parameters
+### Updater
 
-In [above section](#model) we shows a network whose two layers share
-the same parameter matrix.  To specify such cases, we give "parameter
-names" to layers.  If some layers have the same paraemter names,
-`paddle.model.create` creates a single parameter matrix for these
-layers:
+Please be aware that a trainer can accept an updater as its data
+member, where an updater is a class derived from
+`paddle.trainer.Updater`.  This is to make it easier to customize
+trainers, as discussed
+[here](https://github.com/PaddlePaddle/Paddle/issues/1319).
 
-```python
-text1 = paddle.layer.data(...)
-sematic1 = paddle.layer.fc(text1, ..., parameter_name="sematic_projection")
-text2 = paddle.layer.data(...)
-sematic2 = paddle.layer.fc(text2, ..., parameter_name="sematic_projection")
-out = paddle.layer.cosine(semantic1, semantic2)
-```
+### Event Handler
 
-We can also share parameter matrices between layers in different
-models.  To do this, we need an additional parameter that refers to a
-model:
+`paddle.train` and `paddle.trainer.XXX.train` take an optional
+parameter `event_handler`, which should be either `None` or a function
+that handles some events:
 
-```python
-model1_input = paddle.layer.data(...)
-model1_output = paddle.layer.softmax(model1_input, ...,
-                                     parameter_name="a_parameter_matrix")
-model1 = paddle.model.create(model1_output)
-
-# Another model
-model2_semantic = paddle.layer.fc(text2, ...,
-                                  parameter_name="a_parameter_matrix",
-                                  parameter_model=model1)
-```
+1. BeginTraining
+1. EndTraining
+1. BeginIteration
+1. EndIteration
+1. BeginPass
+1. EndPass
 
-### Training
+where EndPass is sent if and only if the reader yields
+`end_pass=True`.
 
-The recommended way to training a model is to call `paddle.train`,
-which simply calls `paddle.optimizer.Default`, a global variable of
-type `paddle.optimizer.SGD`.  Equivalently, we can do
+An example is as follows:
 
 ```python
-opt = paddle.optimizer.SGD(...)
-opt.train(model, reader=read, ...)
+def event_handler(event):
+    if isinstance(event, paddle.event.EndIteration):
+        print paddle.test(...)
+
+paddle.train(topology, parameters, reader, event_handler)
 ```
 
-#### Distributed Training
+If we are writing a PaddlePaddle program in and for IPython/Jupyter,
+we can use matplotlib in the event handler to plot a curve of
+cost/error versus iterations, as shown
+[here](https://blog.dominodatalab.com/interactive-dashboards-in-jupyter/).
+
+### Distributed Training
 
 If users want to do distributed training on a cluster, s/he should
 call `paddle.dist_train` and provides access tokens to the cluster as
@@ -240,8 +224,9 @@ access a Kubernetes cluster, s/he should be able to call
 
 ```python
 paddle.dist_train(model,
+                  trainer=paddle.trainer.SGD(...,
+                                             paddle.updater.Adam(...)),
                   reader=read,
-                  optimizer=paddle.optimizer.SGDOptimizer(...),
                   k8s_user="yi",
                   k8s_token="kube_cluster_tls.pem",
                   k8s_job="hello",
@@ -251,7 +236,7 @@ paddle.dist_train(model,
 
 The pseudo code if `paddle.dist_train` is as follows:
 
 ```python
-def dist_train():
+def dist_train(topology, parameters, trainer, reader, ...):
     if os.getenv("KUBERNETES_SERVICE_HOST") == None:
         image_name = k8s_user + '/' + k8s_job
         docker_build(image_name)
@@ -264,13 +249,13 @@
     elif rank < 15:
         parameter_server()
     else:
-        optimizer.train(model, reader=read)
+        trainer.train(model, reader=read)
 ```
 
 Please be aware that if a process is running on the Kubernetes
 cluster, it will have some environment variables pre-defined.
 
-If `dist_train` doesn't see these environment variables, it knowns
+If `dist_train` doesn't see these environment variables, it knows
 that it's running on users' personal computer, and it should work as
 a *launcher*.
 Otherwise, it knows that it's running on the cluster and need to
 figure out its role as either the master, or a trainer, or a
diff --git a/doc/design/multi_language_interface/why_plain_c.md b/doc/design/multi_language_interface/why_plain_c.md
new file mode 100644
index 0000000000000000000000000000000000000000..a3f41ca7b93de8a55d927c88812802ef12246182
--- /dev/null
+++ b/doc/design/multi_language_interface/why_plain_c.md
@@ -0,0 +1,118 @@
+# Paddle多语言接口实现
+## 背景
+
+Paddle需要一个多语言接口,这个接口需要做到:
+
+* 有标准的,良好的文档
+  * 例如Python可以使用[Sphinx](http://www.sphinx-doc.org/en/stable/)生成API文档,golang可以使用[GoDoc](https://godoc.org/golang.org/x/tools/cmd/godoc)生成文档。这都需要这个接口按照约定俗成的规则来注释完备。
+* 不同语言的接口适应不同语言的特性
+  * 例如Java与Python的错误处理是直接扔出来Exception,而对于golang错误处理应该使用返回值。
+
+## 基本要求
+
+Paddle的多语言接口实现包括以下几个方面:
+
+* 我们使用动态库来分发Paddle。在这个动态库中不嵌入任何其他语言的解释器,也不使用其他动态库。
+* 这个动态库使用C99标准的头文件导出一些函数,不使用/导出C++符号。
+* 不导出Paddle内部的结构体、类,仅仅使用`void*`指针作为类型的句柄(handler)。
+* 不使用SWIG这种代码生成器,而是手写多语言绑定。
+
+
+## 原因
+
+### 使用动态库来分发Paddle
+
+* Paddle的链接方式比较复杂
+  * 如果用户要把Paddle的静态库(libpaddle.a)链接到自己的程序里,得使用 `--whole-archive` (for GCC) 或者 `--force_load` (for Clang) 参数,来确保把 libpaddle.a 里所有的符号都写入自己的程序的二进制文件里。这是因为 Paddle 的源码里使用了[object factory design pattern](http://stackoverflow.com/a/1310326/724872)。
+* 编译型语言,例如C/C++使用静态库和动态库难度差不多。但是解释性语言,例如[Python](http://stackoverflow.com/questions/19560594/how-to-import-static-library-in-python)或者[Java](http://stackoverflow.com/questions/24493337/linking-static-library-with-jni),只能调用Paddle的动态库,否则得把Paddle静态库链接到解释器里。
+  * 解释性语言实际运行的二进制是解释器本身,如果调用静态库只能将静态库与解释器链接。例如对于Java来说,便是将静态库加入JVM中。这对于通常的Java的开发者来说,是不常见的做法。
+
+### 动态库中不嵌入任何其他语言的解释器
+
+* 目前Paddle的进程模型是C++内部驱动Python解释器进行模型配置解析和数据读取
+* 我们最终的动态库中不嵌入Python或者其他任何语言的解释器。模型配置解析,数据读取均交由其他语言完成
+
+现阶段Paddle有一个问题是,Paddle内嵌的Python解释器和外部使用的Python如果版本不同,会直接报错退出。
+
+### Paddle动态库中,不引用其他动态库
+
+* 即这个动态库是不依赖于其他任何文件的,可以在任何机器上执行的。
+
+### 这个动态库使用C99标准的头文件导出一些函数,不使用/导出C++符号
+
+* 由于C++编译器没有[名字修饰](https://en.wikipedia.org/wiki/Name_mangling#C.2B.2B)的规范,不同版本的编译器之间,对于同一段C++代码生成的符号可能不一致。而多语言接口需要直接读取生成的二进制(动态库),需要有稳定的导出符号。
+* C语言是有导出符号的标准的,并且在常见的平台上,都是ABI调用标准的。
+* 大多数语言都支持使用C语言API
+* 使用C99而不使用C89,是因为C99支持[Fixed-width integer types](https://en.wikipedia.org/wiki/C_data_types#Fixed-width_integer_types)和[Boolean type](https://en.wikipedia.org/wiki/C_data_types#Boolean_type)。
+* 使用C99而不使用C11的原因是,[C11](https://en.wikipedia.org/wiki/C11_(C_standard_revision))并没有Paddle特别需要的特性,且C99相对于C11使用更加广泛。
+
+### 不导出Paddle内部的结构体、类,仅仅使用`void*`指针作为类型的句柄(handler)
+
+* Paddle内部的类为C++书写,直接导出到C的接口比较困难。
+* 在C-API中使用`void*`来表示Paddle内部类。再在每一个API中自己检查类型。
+
+在C的头文件 `paddle_matrix.h` 中:
+
+```C
+#include <stdint.h>
+
+typedef void* paddle_matrix;
+typedef int paddle_error;
+
+#ifdef __cplusplus
+extern "C"
+#endif
+paddle_error paddle_matrix_shape(paddle_matrix matrix,
+                                 uint64_t* width,
+                                 uint64_t* height);
+```
+而在CPP里面实现这个C的接口,文件 `paddle_matrix.cpp`
+
+```cpp
+#include "paddle/math/matrix.hpp"
+extern "C"
+paddle_error paddle_matrix_shape(paddle_matrix matrix,
+                                 uint64_t *width,
+                                 uint64_t *height) {
+  auto m = (paddle::math::Matrix*)(matrix);
+  *width = m->width();
+  *height = m->height();
+  return 0;  // 这里假设返回0表示成功
+}
+```
+
+其中`paddle/math/matrix.hpp`文件内容为:
+
+```cpp
+namespace paddle {
+namespace math {
+
+class Matrix {
+  //...
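+  // 补充示意(假设):为使上面的C-API示例自洽,这里列出它所依赖的
+  // 两个访问器;实际的成员以Paddle源码为准。
+  uint64_t width() const;
+  uint64_t height() const;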
+};
+
+} // namespace math
+} // namespace paddle
+```
+
+### 不使用SWIG这种代码生成器,而是手写多语言绑定
+
+* [SWIG](http://www.swig.org/)是一个多语言接口的代码生成器。它的目标是使用C/C++写代码,SWIG直接读取C/C++的头文件,生成各种语言的绑定代码。
+  * 对于多语言接口,SWIG需要写一个interface文件。这个文件具有独特的语法,学习成本高。且增加一个第三方语言,就需要对这个第三方语言增加一些定义。有的时候,interface文件的写法非常[tricky](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/api/Paddle.swig#L36)。社区贡献代码学习成本高。
+  * SWIG暴露的接口保留了C++的接口样式,很难保证多语言代码风格的一致性。(函数命名,错误处理)
+    * 因为SWIG在第三方语言中暴露的函数名,类名和C++中完全一致。C++的命名风格并不能适应其他第三方语言。如果使用SWIG,我们需要在interface文件里,将大量的`SomeCppClass`重命名成`some_python_class`,或者`SomeGoTypes`。
+    * 对于不同语言,错误处理的方式也不尽相同。例如对于Java或者Python,最常见的错误处理方式是Exception,而对于Golang,错误处理方式是返回值。而SWIG只能简单的暴露C++接口,无法做到对于各种语言错误处理方式的适配。
+  * 对于大多数语言,直接使用C语言的.h并不困难。例如Python的[cffi](https://cffi.readthedocs.io/en/latest/overview.html#simple-example-abi-level-in-line)或者[Cython](http://cython.org/), golang的[cgo](https://golang.org/cmd/cgo/)。
+  * SWIG支持的语言或者解释器有局限。例如对于Python,使用SWIG只支持CPython解释器,而不支持PyPy解释器。
+
+
+## 原因列表
+
+| 结论 | 对比 | 原因 |
+|---| --- | --- |
+| 使用动态库 | 不使用静态库 | 解释型语言只能调用动态库,Paddle静态库链接复杂 |
+| 不嵌入其他语言解释器 | 不嵌入Python解释器 | Paddle C++目前嵌入Python解释器,会导致不同版本Python在一个进程里的bug |
+| 不引用其他动态库 | | Paddle一个动态库可以在任何Linux系统上运行 |
+| 使用C99做接口 | 不使用C++做接口 | C有标准的ABI,C99是目前C最广泛的使用标准,且C99支持bool类型和定长整数(uint64_t等)类型 |
+| 使用void*作为类句柄 | 不显式地写每个类具体包含什么 | 实现简单,并且让接口脱离实现细节 |
+| 手写多语言绑定 | 不使用SWIG | 使用SWIG需要多语言绑定的开发人员熟练掌握SWIG配置,社区参与困难。SWIG生成的代码不能保证多语言代码风格的一致性 |
+
+
+## 简单实现
+
+TBD
diff --git a/doc/design/reader/README.md b/doc/design/reader/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..f21f7af520df5171798326818ecb97c3bcd14a12
--- /dev/null
+++ b/doc/design/reader/README.md
@@ -0,0 +1,202 @@
+# Python Data Reader Design Doc
+
+At training and testing time, PaddlePaddle programs need to read data. To ease the users' work of writing data reading code, we define that
+
+- A *reader* is a function that reads data (from file, network, random number generator, etc) and yields data items.
+- A *reader creator* is a function that returns a reader function.
+- A *reader decorator* is a function, which accepts one or more readers, and returns a reader.
+- A *batch reader* is a function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items.
+
+and we provide a function that converts a reader into a batch reader, as well as frequently used reader creators and reader decorators.
+
+## Data Reader Interface
+
+Indeed, a *data reader* doesn't have to be a function that reads and yields data items. It can be any function with no parameters that creates an iterable (anything that can be used in `for x in iterable`):
+
+```
+iterable = data_reader()
+```
+
+Each element produced from the iterable should be a **single** entry of data, **not** a mini batch. That entry of data could be a single item, or a tuple of items. Each item should be of [supported type](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., a numpy 1d array of float32, an int, or a list of ints).
+
+An example implementation for a single-item data reader creator:
+
+```python
+def reader_creator_random_image(width, height):
+    def reader():
+        while True:
+            yield numpy.random.uniform(-1, 1, size=width*height)
+    return reader
+```
+
+An example implementation for a multiple-item data reader creator:
+```python
+def reader_creator_random_image_and_label(width, height, label):
+    def reader():
+        while True:
+            yield numpy.random.uniform(-1, 1, size=width*height), label
+    return reader
+```
+
+## Batch Reader Interface
+
+A *batch reader* can be any function with no parameters that creates an iterable (anything that can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list must be a tuple.
+
+Here are valid outputs:
+```python
+# a mini batch of three data items. Each data item consists of three columns of data, each of which is 1.
+[(1, 1, 1),
+(2, 2, 2),
+(3, 3, 3)]
+
+# a mini batch of three data items, each data item is a list (single column).
+[([1,1,1],),
+([2,2,2],),
+([3,3,3],)]
+```
+
+Please note that each item inside the list must be a tuple; below is an invalid output:
+```python
+ # wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],).
+ # Otherwise it's ambiguous whether [1,1,1] means a single column of data [1, 1, 1],
+ # or three columns of data, each of which is 1.
+[[1,1,1],
+[2,2,2],
+[3,3,3]]
+```
+
+It's easy to convert a reader to a batch reader:
+```python
+mnist_train = paddle.dataset.mnist.train()
+mnist_train_batch_reader = paddle.batch(mnist_train, 128)
+```
+
+It's also easy to create a custom batch reader:
+```python
+def custom_batch_reader():
+    while True:
+        batch = []
+        for i in xrange(128):
+            batch.append((numpy.random.uniform(-1, 1, 28*28),))  # note that it's a tuple being appended.
+        yield batch
+
+mnist_random_image_batch_reader = custom_batch_reader
+```
+
+## Usage
+
+The batch reader, the mapping from items read to data layers, the batch size, and the total number of passes are passed into `paddle.train`:
+
+```python
+# two data layers are created:
+image_layer = paddle.layer.data("image", ...)
+label_layer = paddle.layer.data("label", ...)
+
+# ...
+batch_reader = paddle.batch(paddle.dataset.mnist.train(), 128)
+paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...)
+```
+
+## Data Reader Decorator
+
+A *data reader decorator* takes one or more data readers and returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use the `@` syntax.
+
+Since we have a strict interface for data readers (no parameters, return a single data item), data readers can be used flexibly via data reader decorators. Following are a few examples:
+
+### Prefetch Data
+
+Since reading data may take time and training can not proceed without data, it is generally a good idea to prefetch the data.
+
+Use `paddle.reader.buffered` to prefetch data:
+
+```python
+buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100)
+```
+
+`buffered_reader` will try to buffer (prefetch) `100` data entries.
+
+### Compose Multiple Data Readers
+
+For example, we want to use a source of real images (reusing the mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661).
+
+We can do:
+
+```python
+def reader_creator_random_image(width, height):
+    def reader():
+        while True:
+            yield numpy.random.uniform(-1, 1, size=width*height)
+    return reader
+
+def reader_creator_bool(t):
+    def reader():
+        while True:
+            yield t
+    return reader
+
+true_reader = reader_creator_bool(True)
+false_reader = reader_creator_bool(False)
+
+reader = paddle.reader.compose(paddle.dataset.mnist.train(), reader_creator_random_image(20, 20), true_reader, false_reader)
+# Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry.
+# And we don't care about the second item at this time.
+paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...)
+```
+
+### Shuffle
+
+Given the shuffle buffer size `n`, `paddle.reader.shuffle` will return a data reader that buffers `n` data entries and shuffles them before a data entry is read.
+
+Example:
+```python
+reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512)
+```
+
+## Q & A
+
+### Why does a reader return only a single entry, rather than a mini batch?
+
+Always returning a single entry makes reusing existing data readers much easier (e.g., if an existing reader returns not a single entry but 3 entries, the training code would be more complex, because it would need to handle cases like a batch size of 2).
+
+We provide the function `paddle.batch` to turn a (single entry) reader into a batch reader.
+
+### Why do we need a batch reader? Isn't having train take a reader and batch_size as arguments sufficient?
+
+In most cases, having train take a reader and batch_size as arguments would be sufficient. However, sometimes users want to customize the order of data entries inside a mini batch, or even change the batch size dynamically.
+
+### Why use a dictionary rather than a list to provide the mapping?
+
+We decided to use a dictionary (`{"image":0, "label":1}`) instead of a list (`["image", "label"]`) because users can easily reuse an item (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or skip an item (e.g., using `{"image_a":0, "label":2}`).
+
+### How to create a custom data reader creator
+
+```python
+def image_reader_creator(image_path, label_path, n):
+    def reader():
+        f = open(image_path)
+        l = open(label_path)
+        images = numpy.fromfile(
+            f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32')
+        images = images / 255.0 * 2.0 - 1.0
+        labels = numpy.fromfile(l, 'ubyte', count=n).astype("int")
+        for i in xrange(n):
+            yield images[i, :], labels[i]  # a single entry of data is created each time
+        f.close()
+        l.close()
+    return reader
+
+# image_reader_creator creates a reader
+reader = image_reader_creator("/path/to/image_file", "/path/to/label_file", 1024)
+paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...)
+```
+
+### How is `paddle.train` implemented
+
+An example implementation of paddle.train could be:
+
+```python
+def train(batch_reader, mapping, batch_size, total_pass):
+    for pass_idx in range(total_pass):
+        for mini_batch in batch_reader():  # this loop will never end in online learning.
+            do_forward_backward(mini_batch, mapping)
+```
diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst
index 6d5367177da2af6276698f94f86664a5b506dca2..df5e172252277a881480cd2816eb901b711abe6b 100644
--- a/doc/faq/index_cn.rst
+++ b/doc/faq/index_cn.rst
@@ -286,3 +286,16 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字
 .. code-block:: bash
 
     paddle train --use_gpu=true --trainer_count=2 --gpu_id=2
+
+
+12. 训练过程中出现 :code:`Floating point exception`, 训练因此退出怎么办?
+------------------------------------------------------------------------
+
+Paddle二进制在运行时捕获了浮点数异常,只要出现浮点数异常(即训练过程中出现NaN或者Inf),立刻退出。浮点异常通常的原因是浮点数溢出、除零等问题。
+主要原因包括以下几个方面:
+
+* 训练过程中参数或者梯度的尺度过大,在参数累加、乘除等运算时导致了浮点数溢出。
+* 模型一直不收敛,发散到了一个数值特别大的地方。
+* 训练数据有问题,导致参数收敛到了一些奇异的情况。或者输入数据尺度过大,有些特征的取值达到数百万,这时进行矩阵乘法运算就可能导致浮点数溢出。
+
+主要的解决办法是减小学习率或者对数据进行归一化处理。
diff --git a/doc/getstarted/basic_usage/index_cn.rst b/doc/getstarted/basic_usage/index_cn.rst
index d01cdaaeb75ec7d02480eb9162cabaad2a947db9..428f58830e0b10c024f31238b7404c6df193eecd 100644
--- a/doc/getstarted/basic_usage/index_cn.rst
+++ b/doc/getstarted/basic_usage/index_cn.rst
@@ -55,7 +55,7 @@ PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍
     # 线性计算网络层: ȳ = wx + b
     ȳ = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
     # 计算误差函数,即 ȳ 和真实 y 之间的距离
-    cost = regression_cost(input= ȳ, label=y)
+    cost = mse_cost(input= ȳ, label=y)
     outputs(cost)
@@ -69,7 +69,7 @@
 
    - **数据层**:数据层 `data_layer` 是神经网络的入口,它读入数据并将它们传输到接下来的网络层。这里数据层有两个,分别对应于变量 `x` 和 `y`。
    - **全连接层**:全连接层 `fc_layer` 是基础的计算单元,这里利用它建模变量之间的线性关系。计算单元是神经网络的核心,PaddlePaddle支持大量的计算单元和任意深度的网络连接,从而可以拟合任意的函数来学习复杂的数据关系。
-   - **回归误差代价层**:回归误差代价层 `regression_cost` 是众多误差代价函数层的一种,它们在训练过程作为网络的出口,用来计算模型的误差,是模型参数优化的目标函数。
+   - **回归误差代价层**:回归误差代价层 `mse_cost` 是众多误差代价函数层的一种,它们在训练过程作为网络的出口,用来计算模型的误差,是模型参数优化的目标函数。
 
 定义了网络结构并保存为 `trainer_config.py` 之后,运行以下训练命令:
diff --git a/doc/getstarted/basic_usage/index_en.rst b/doc/getstarted/basic_usage/index_en.rst
index c10b897d4292d0c2b062b5c8e23466505afa408a..6775da20c2f51000f305b095d40abd27b8fa6c0e 100644
--- a/doc/getstarted/basic_usage/index_en.rst
+++ b/doc/getstarted/basic_usage/index_en.rst
@@ -49,7 +49,7 @@ To recover this relationship between ``X`` and ``Y``, we use a neural network wi
     x = data_layer(name='x', size=1)
     y = data_layer(name='y', size=1)
     y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b'))
-    cost = regression_cost(input=y_predict, label=y)
+    cost = mse_cost(input=y_predict, label=y)
     outputs(cost)
 
 Some of the most fundamental usages of PaddlePaddle are demonstrated:
diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
index 6b132d2a4d31ab85347bd41d0243ffee858ac909..af889ec9d1b4f43f8e4a266b21822f773ab62ec2 100644
--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -1,158 +1,153 @@
-安装PaddlePaddle的Docker镜像
-============================
+PaddlePaddle的Docker容器使用方式
+================================
 
-PaddlePaddle项目提供官方 `Docker `_ 镜像。Docker镜像是我们目前唯一官方支持的部署和运行方式。
+PaddlePaddle目前唯一官方支持的运行方式是Docker容器,因为Docker能在所有主要操作系统(包括Linux,Mac OS X和Windows)上运行。
 请注意,您需要更改 `Dockers设置 `_ 才能充分利用Mac OS X和Windows上的硬件资源。
 
-下述内容将分为如下几个类别描述。
-* PaddlePaddle提供的Docker镜像版本
-* 下载和运行Docker镜像
-* 注意事项
+纯CPU和GPU的docker镜像使用说明
+------------------------------
 
-PaddlePaddle提供的Docker镜像版本
---------------------------------
+对于每一个PaddlePaddle版本,我们都会发布两个Docker镜像:纯CPU的和GPU的。
+我们通过设置 `dockerhub.com `_ 自动生成最新的docker镜像:
+`paddledev/paddle:0.10.0rc1-cpu` 和 `paddledev/paddle:0.10.0rc1-gpu`。
 
-我们提供了12个 `Docker image `_ ,他们的image name都是 :code:`paddledev/paddle` ,tag分别为
+以交互容器方式运行纯CPU的镜像:
 
-+-----------------+------------------+------------------------+-----------------------+
-|                 | normal           |           devel        |          demo         |
-+=================+==================+========================+=======================+
-|       CPU       | cpu-latest       | cpu-devel-latest       | cpu-demo-latest       |
cpu-devel-latest | cpu-demo-latest | -+-----------------+------------------+------------------------+-----------------------+ -| GPU | gpu-latest | gpu-devel-latest | gpu-demo-latest | -+-----------------+------------------+------------------------+-----------------------+ -| CPU WITHOUT AVX | cpu-noavx-latest | cpu-noavx-devel-latest | cpu-noavx-demo-latest | -+-----------------+------------------+------------------------+-----------------------+ -| GPU WITHOUT AVX | gpu-noavx-latest | gpu-noavx-devel-latest | gpu-noavx-demo-latest | -+-----------------+------------------+------------------------+-----------------------+ +.. code-block:: bash -其中,横向包括三个版本,normal,devel和demo。 + docker run -it --rm paddledev/paddle:0.10.0rc1-cpu /bin/bash -* Normal: 正常的Docker image,只包括paddle的二进制 -* Devel: 包括Paddle的二进制、编译环境和源代码 -* Demo: 包括Paddle运行demo所需要的依赖 +或者,可以以后台进程方式运行容器: -纵向包括四个版本,他们是。 +.. code-block:: bash -* CPU: CPU版本。需要支持AVX指令集的CPU -* GPU: GPU版本。需要支持AVX指令集的CPU -* CPU WITHOUT AVX: CPU版本,不支持AVX指令集的CPU也可以运行 -* GPU WITHOUT AVX: GPU版本,不需要AVX指令集的CPU也可以运行。 + docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:0.10.0rc1-cpu -用户可以选择对应版本的docker image。使用如下脚本可以确定本机的CPU是否支持 :code:`AVX` 指令集\: +然后用密码 :code:`root` SSH进入容器: -.. code-block:: bash +.. code-block:: bash - if cat /proc/cpuinfo | grep -q avx ; then echo "Support AVX"; else echo "Not support AVX"; fi + ssh -p 2202 root@localhost -如果输出 :code:`Support AVX`,则可以选择上表中的AVX版本PaddlePaddle。否则需要选择非AVX的PaddlePaddle。选择普通CPU版本的devel版本的image,则可以使用 :code:`paddledev/paddle:cpu-devel-latest` 来引用这个image。 +SSH方式的一个优点是我们可以从多个终端进入容器。比如,一个终端运行vi,另一个终端运行Python。另一个好处是我们可以把PaddlePaddle容器运行在远程服务器上,并在笔记本上通过SSH与其连接。 -PaddlePaddle提供的镜像并不包含任何命令运行,想要运行PaddlePaddle,您需要进入镜像运行PaddlePaddle -程序或者自定义一个含有启动脚本的image。具体请参考注意事项中的 :code:`使用ssh访问PaddlePaddle镜像` -下载和运行Docker镜像 --------------------- +以上方法在GPU镜像里也能用-只是请不要忘记按装CUDA驱动,以及告诉Docker: -为了运行PaddlePaddle的docker镜像,您需要在机器中安装好Docker。安装Docker需要您的机器 -至少具有3.10以上的linux kernel。安装方法请参考 -`Docker的官方文档 `_ 。如果您使用 -mac osx或者是windows机器,请参考 -`mac osx的安装文档 `_ 和 -`windows 的安装文档 `_ 。 +.. code-block:: bash -您可以使用 :code:`docker pull` 命令预先下载镜像,也可以直接执行 -:code:`docker run` 命令运行镜像。执行方法如下: + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0rc1-gpu -.. code-block:: bash + +运行PaddlePaddle书籍 +--------------------- + +Jupyter Notebook是一个开源的web程序,大家可以通过它制作和分享带有代码、公式、图表、文字的交互式文档。用户可以通过网页浏览文档。 + +PaddlePaddle书籍是为用户和开发者制作的一个交互式的Jupyter Nodebook。 +如果您想要更深入了解deep learning,PaddlePaddle书籍一定是您最好的选择。 + +当您进入容器内之后,只用运行以下命令: + +.. code-block:: bash + + jupyter notebook + +然后在浏览器中输入以下网址: - $ docker run -it paddledev/paddle:cpu-latest +.. code-block:: text -即可启动和进入PaddlePaddle的container。如果运行GPU版本的PaddlePaddle,则需要先将 -cuda相关的Driver和设备映射进container中,脚本类似于 + http://localhost:8888/ -.. code-block:: bash +就这么简单,享受您的旅程! 
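To confirm that the notebook is wired up correctly, a minimal first cell can initialize PaddlePaddle. This is only a sketch, assuming the 0.10.0rc1 images ship the `paddle.v2` Python package:

```python
# Minimal sanity check inside the PaddlePaddle container (v2 API assumed).
import paddle.v2 as paddle

# CPU-only initialization; use_gpu=True would require the GPU image.
paddle.init(use_gpu=False, trainer_count=1)
print("PaddlePaddle is ready")
```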
- $ export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" - $ export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') - $ docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:gpu-latest -进入Docker container后,运行 :code:`paddle version` 即可打印出PaddlePaddle的版本和构建 -信息。安装完成的PaddlePaddle主体包括三个部分, :code:`paddle` 脚本, python的 -:code:`paddle` 包和 :code:`py_paddle` 包。其中\: +非AVX镜像 +--------- -* :code:`paddle` 脚本和 :code:`paddle` 的python包是PaddlePaddle的训练主要程序。使用 - :code:`paddle` 脚本可以启动PaddlePaddle的训练进程和pserver。而 :code:`paddle` 脚本 - 中的二进制使用了 :code:`paddle` 的python包来做配置文件解析等工作。 -* python包 :code:`py_paddle` 是一个swig封装的PaddlePaddle包,用来做预测和简单的定制化 - 训练。 +纯CPU镜像以及GPU镜像都会用到AVX指令集,但是2008年之前生产的旧电脑不支持AVX。以下指令能检查Linux电脑是否支持AVX: -注意事项 --------- +.. code-block:: bash -性能问题 -++++++++ + if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi -由于Docker是基于容器的轻量化虚拟方案,所以在CPU的运算性能上并不会有严重的影响。 -而GPU的驱动和设备全部映射到了容器内,所以GPU在运算性能上也不会有严重的影响。 +如果输出是No,我们就需要手动编译一个非AVX版本的镜像: -但是如果使用了高性能的网卡,例如RDMA网卡(RoCE 40GbE 或者 IB 56GbE),或者高性能的 -以太网卡 (10GbE)。推荐使用将本地网卡,即 "--net=host" 来进行训练。而不使用docker -的网桥来进行网络通信。 +.. code-block:: bash -远程访问问题和二次开发 -++++++++++++++++++++++ + cd ~ + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile . + docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu . -由于PaddlePaddle的Docker镜像并不包含任何预定义的运行命令。所以如果想要在后台启用ssh -远程访问,则需要进行一定的二次开发,将ssh装入系统内并开启远程访问。二次开发可以 -使用Dockerfile构建一个全新的docker image。需要参考 -`Dockerfile的文档 `_ 和 -`Dockerfile的最佳实践 `_ -两个文档。 -简单的含有ssh的Dockerfile如下: +通过Docker容器开发PaddlePaddle +------------------------------ -.. code-block:: bash +开发人员可以在Docker中开发PaddlePaddle。这样开发人员可以以一致的方式在不同的平台上工作 - Linux,Mac OS X和Windows。 - FROM paddledev/paddle:cpu-latest +1. 将开发环境构建为Docker镜像 + + .. code-block:: bash - MAINTAINER PaddlePaddle dev team + git clone --recursive https://github.com/PaddlePaddle/Paddle + cd Paddle + docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile . - RUN apt-get update - RUN apt-get install -y openssh-server - RUN mkdir /var/run/sshd - RUN echo 'root:root' | chpasswd - RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config - RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config + 请注意,默认情况下,:code:`docker build` 不会将源码导入到镜像中并编译它。如果我们想这样做,需要设置一个参数: - EXPOSE 22 + .. code-block:: bash - CMD ["/usr/sbin/sshd", "-D"] + docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile --build-arg BUILD_AND_INSTALL=ON . -使用该Dockerfile构建出镜像,然后运行这个container即可。相关命令为\: +2. 运行开发环境 -.. code-block:: bash + 当我们编译好了 :code:`paddle:dev`, 我们可以在docker容器里做开发,源代码可以通过挂载本地文件来被载入Docker的开发环境里面: + + .. code-block:: bash - # cd到含有Dockerfile的路径中 - $ docker build . -t paddle_ssh - # 运行这个container,将宿主机的8022端口映射到container的22端口上 - $ docker run -d -p 8022:22 --name paddle_ssh_machine paddle_ssh + docker run -d -p 2202:22 -v $PWD:/paddle paddle:dev -执行如下命令即可以关闭这个container,并且删除container中的数据\: + 以上代码会启动一个带有PaddlePaddle开发环境的docker容器,源代码会被挂载到 :code:`/paddle` 。 -.. code-block:: bash - - # 关闭container - $ docker stop paddle_ssh_machine - # 删除container - $ docker rm paddle_ssh_machine + 请注意, :code:`paddle:dev` 的默认入口是 :code:`sshd` 。以上的 :code:`docker run` 命令其实会启动一个在2202端口监听的SSHD服务器。这样,我们就能SSH进入我们的开发容器了: + + .. code-block:: bash + + ssh root@localhost -p 2202 + +3. 
在Docker开发环境中编译与安装PaddlePaddle代码
+
+   当在容器里面的时候,可以用脚本 :code:`paddle/scripts/docker/build.sh` 来编译、安装与测试PaddlePaddle:
+
+   .. code-block:: bash
+
+      /paddle/paddle/scripts/docker/build.sh
+
+   以上指令会在 :code:`/paddle/build` 中编译PaddlePaddle。通过以下指令可以运行单元测试:
+
+   .. code-block:: bash
+
+      cd /paddle/build
+      ctest
+
+
+文档
+----
+
+Paddle的Docker镜像带有一个通过 `woboq code browser
+`_ 生成的HTML版本的C++源代码,便于用户浏览C++源码。
 
-如果想要在外部机器访问这个container,即可以使用ssh访问宿主机的8022端口。用户名为
-root,密码也是root。命令为\:
+只要在Docker里启动PaddlePaddle的时候给它一个名字,就可以再运行另一个Nginx Docker镜像来服务HTML代码:
 
-.. code-block:: bash
+.. code-block:: bash
 
-    $ ssh -p 8022 root@YOUR_HOST_MACHINE
+    docker run -d --name paddle-cpu-doc paddle:0.10.0rc1-cpu
+    docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
 
-至此,您就可以远程的使用PaddlePaddle啦。
+接着我们就能够打开浏览器在 http://localhost:8088/paddle/ 浏览代码。
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index 5a1056e859a0c977c9cd365ff1e4ffe58596f41f..606746597acc0da00588b7eb05935f6c05c169f2 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -9,6 +9,100 @@ Please be aware that you will need to change `Dockers settings
 of your hardware resource on Mac OS X and Windows.
 
+Usage of CPU-only and GPU Images
+----------------------------------
+
+For each version of PaddlePaddle, we release 2 Docker images, a
+CPU-only one and a CUDA GPU one. We do so by configuring
+`dockerhub.com `_
+to automatically generate the latest docker images `paddledev/paddle:0.10.0rc1-cpu`
+and `paddledev/paddle:0.10.0rc1-gpu`.
+
+To run the CPU-only image as an interactive container:
+
+.. code-block:: bash
+
+    docker run -it --rm paddledev/paddle:0.10.0rc1-cpu /bin/bash
+
+Or, we can run it as a daemon container:
+
+.. code-block:: bash
+
+    docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:0.10.0rc1-cpu
+
+and SSH to this container using password :code:`root`:
+
+.. code-block:: bash
+
+    ssh -p 2202 root@localhost
+
+An advantage of using SSH is that we can connect to PaddlePaddle from
+more than one terminal. For example, one terminal running vi and
+another one running the Python interpreter. Another advantage is that we
+can run the PaddlePaddle container on a remote server and SSH to it
+from a laptop.
+
+The methods above work with the GPU image too -- just don't forget
+to install the CUDA driver and let Docker know about it:
+
+.. code-block:: bash
+
+    export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+    export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+    docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0rc1-gpu
+
+
+PaddlePaddle Book
+------------------
+
+The Jupyter Notebook is an open-source web application that allows
+you to create and share documents that contain live code, equations,
+visualizations and explanatory text, all in a single document.
+
+PaddlePaddle Book is an interactive Jupyter Notebook for users and developers.
+We have already exposed port 8888 for this book. If you want to
+dig deeper into deep learning, PaddlePaddle Book is definitely your best choice.
+
+Once you are inside the container, simply issue the command:
+
+.. code-block:: bash
+
+    jupyter notebook
+
+Then copy and paste the address into your local browser:
+
+.. code-block:: text
+
+    http://localhost:8888/
+
+That's all. Enjoy your journey!
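A reasonable first notebook cell sanity-checks the installation and pulls one mini-batch from a bundled dataset. The sketch below assumes the image provides the `paddle.v2` API and the `paddle.dataset.mnist` reader referenced elsewhere in this changeset:

```python
# Sketch: verify the install and inspect one mini-batch (v2 API assumed).
import paddle.v2 as paddle

paddle.init(use_gpu=False, trainer_count=1)

# mnist.train() yields (image, label) entries; paddle.batch groups them.
batch_reader = paddle.batch(paddle.dataset.mnist.train(), batch_size=128)
first_batch = next(iter(batch_reader()))
images, labels = zip(*first_batch)
print(len(images), images[0].shape)  # expected: 128 (784,)
```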
+ + +Non-AVX Images +-------------- + +Please be aware that the CPU-only and the GPU images both use the AVX +instruction set, but old computers produced before 2008 do not support +AVX. The following command checks if your Linux computer supports +AVX: + +.. code-block:: bash + + if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi + + +If it doesn't, we will need to build non-AVX images manually from +source code: + +.. code-block:: bash + + cd ~ + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile . + docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu . + + Development Using Docker ------------------------ @@ -42,7 +136,7 @@ Windows -- in a consistent way. .. code-block:: bash - docker run -d -p 2202:22 -v $PWD:/paddle paddle:dev + docker run -d -p 2202:22 -p 8888:8888 -v $PWD:/paddle paddle:dev This runs a container of the development environment Docker image with the local source tree mounted to :code:`/paddle` of the @@ -83,80 +177,6 @@ Windows -- in a consistent way. ctest -CPU-only and GPU Images ------------------------ - -For each version of PaddlePaddle, we release 2 Docker images, a -CPU-only one and a CUDA GPU one. We do so by configuring -`dockerhub.com `_ -automatically runs the following commands: - -.. code-block:: bash - - docker build -t paddle:cpu -f paddle/scripts/docker/Dockerfile . - docker build -t paddle:gpu -f paddle/scripts/docker/Dockerfile.gpu . - - -To run the CPU-only image as an interactive container: - -.. code-block:: bash - - docker run -it --rm paddledev/paddle:cpu-latest /bin/bash - -or, we can run it as a daemon container - -.. code-block:: bash - - docker run -d -p 2202:22 paddledev/paddle:cpu-latest - -and SSH to this container using password :code:`root`: - -.. code-block:: bash - - ssh -p 2202 root@localhost - -An advantage of using SSH is that we can connect to PaddlePaddle from -more than one terminals. For example, one terminal running vi and -another one running Python interpreter. Another advantage is that we -can run the PaddlePaddle container on a remote server and SSH to it -from a laptop. - - -Above methods work with the GPU image too -- just please don't forget -to install CUDA driver and let Docker knows about it: - -.. code-block:: bash - - export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" - export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') - docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:gpu-latest - - -Non-AVX Images --------------- - -Please be aware that the CPU-only and the GPU images both use the AVX -instruction set, but old computers produced before 2008 do not support -AVX. The following command checks if your Linux computer supports -AVX: - -.. code-block:: bash - - if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi - - -If it doesn't, we will need to build non-AVX images manually from -source code: - -.. code-block:: bash - - cd ~ - git clone https://github.com/PaddlePaddle/Paddle.git - cd Paddle - docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile . - docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu . - - Documentation ------------- @@ -171,7 +191,7 @@ container: .. 
code-block:: bash - docker run -d --name paddle-cpu-doc paddle:cpu + docker run -d --name paddle-cpu-doc paddle:0.10.0rc1-cpu docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst index bd3d0ec292057037414792b1ac176d12605b90d5..5b84eea491f874459ed2071e4c942657cdc9b18b 100644 --- a/doc/howto/index_cn.rst +++ b/doc/howto/index_cn.rst @@ -10,6 +10,7 @@ usage/cmd_parameter/index_cn.rst usage/concepts/use_concepts_cn.rst usage/cluster/cluster_train_cn.md + usage/k8s/k8s_basis_cn.md usage/k8s/k8s_cn.md usage/k8s/k8s_distributed_cn.md diff --git a/doc/howto/usage/cluster/cluster_train_cn.md b/doc/howto/usage/cluster/cluster_train_cn.md index acdcfa1c0047ced85c0a9c53d691edc0b4489336..274452fbf0c595ad7b4dbeffe85ad9038f12b458 100644 --- a/doc/howto/usage/cluster/cluster_train_cn.md +++ b/doc/howto/usage/cluster/cluster_train_cn.md @@ -6,7 +6,7 @@ 在本文中,我们将阐释如何在集群上运行分布式 Paddle 训练作业。我们将以[推荐系统](https://github.com/baidu/Paddle/tree/develop/demo/recommendation)为例创建分布式的单进程训练。 -在本文中使用的[脚本](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train)通过 SSH 运行分布式作业。 它们还可以供那些运行更复杂的集群管理系统(如 MPI 和 [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/k8s) )的用户参考。 +在本文中使用的[脚本](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train)通过 SSH 运行分布式作业。 它们还可以供那些运行更复杂的集群管理系统(如 MPI 和 [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s) )的用户参考。 ## 前提条件 diff --git a/doc/howto/usage/cluster/cluster_train_en.md b/doc/howto/usage/cluster/cluster_train_en.md index 30963dcd927250651f3ed0b39949f541cc28ed4a..c60876721cbf5565d6e48c8061811aacada748cd 100644 --- a/doc/howto/usage/cluster/cluster_train_en.md +++ b/doc/howto/usage/cluster/cluster_train_en.md @@ -2,7 +2,7 @@ In this article, we explain how to run distributed Paddle training jobs on clusters. We will create the distributed version of the single-process training example, [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation). -[Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH. They also work as a reference for users running more sophisticated cluster management systems like MPI and [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/cluster/k8s). +[Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH. They also work as a reference for users running more sophisticated cluster management systems like MPI and [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s). 
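The gist of such an SSH-based launcher can be sketched in a few lines of Python. This is illustrative only, not the actual cluster_train scripts; the hosts are placeholders, and the flags are the ones documented in this changeset's parameter table:

```python
# Illustrative sketch of an SSH launcher; hosts and paths are placeholders.
import subprocess

hosts = ["192.168.1.10", "192.168.1.11", "192.168.1.12"]
pservers = ",".join(hosts)

procs = []
for trainer_id, host in enumerate(hosts):
    # One parameter server and one trainer per node, launched over SSH.
    pserver_cmd = ("paddle pserver --port=7164 --ports_num=2 "
                   "--num_gradient_servers=%d" % len(hosts))
    trainer_cmd = ("paddle train --local=0 --trainer_id=%d --pservers=%s "
                   "--config=./trainer_config.py" % (trainer_id, pservers))
    procs.append(subprocess.Popen(["ssh", host, pserver_cmd]))
    procs.append(subprocess.Popen(["ssh", host, trainer_cmd]))

for p in procs:
    p.wait()
```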
## Prerequisite diff --git a/doc/howto/usage/cmd_parameter/arguments_cn.md b/doc/howto/usage/cmd_parameter/arguments_cn.md index 2e2a2fcc54a09f4f41e4ebbc317e1409591ddd9c..f7aa525054468670f59309ddf9206af55bb77869 100644 --- a/doc/howto/usage/cmd_parameter/arguments_cn.md +++ b/doc/howto/usage/cmd_parameter/arguments_cn.md @@ -228,16 +228,6 @@ √√ - -度量学习(metric learning)external -√√√√ - - - -data_server_port -√√ - - 参数服务器(PServer)start_pserver √√ diff --git a/doc/howto/usage/cmd_parameter/arguments_en.md b/doc/howto/usage/cmd_parameter/arguments_en.md index e5546f0ddc78a9f8bdc306a19c2fe9a415463e5a..d1963067bda949b11ececefed3db7db1432c6223 100644 --- a/doc/howto/usage/cmd_parameter/arguments_en.md +++ b/doc/howto/usage/cmd_parameter/arguments_en.md @@ -228,16 +228,6 @@ It looks like there are a lot of arguments. However, most of them are for develo √√ - -metric learningexternal -√√√√ - - - -data_server_port -√√ - - PServerstart_pserver √√ diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_cn.md b/doc/howto/usage/cmd_parameter/detail_introduction_cn.md index 3b573a324d541b024600a254d5266e517db229c5..b4625ba68cf23e5697554ba94efaf0b873f2c1de 100644 --- a/doc/howto/usage/cmd_parameter/detail_introduction_cn.md +++ b/doc/howto/usage/cmd_parameter/detail_introduction_cn.md @@ -180,15 +180,6 @@  - 用户可以自定义beam search的方法,编译成动态库,供PaddlePaddle加载。 该参数用于指定动态库路径. - 类型: string (默认: "", null). -## 度量学习(Metric Learning) -* `--external` - - 指示是否使用外部机器进行度量学习. - - 类型: bool (默认: 0). - -* `--data_server_port` - - 数据服务器(data server)的监听端口,主要用在度量学习中. - - 类型: int32 (默认: 21134). - ## 数据支持(DataProvider) * `--memory_threshold_on_load_data` diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_en.md b/doc/howto/usage/cmd_parameter/detail_introduction_en.md index 33b7ec0d51a96ee126197e7aa819fdae0d3dc353..b681ebc81a355dfc1a7638a4463dff6979929a45 100644 --- a/doc/howto/usage/cmd_parameter/detail_introduction_en.md +++ b/doc/howto/usage/cmd_parameter/detail_introduction_en.md @@ -184,15 +184,6 @@ - Specify shared dynamic library. It can be defined out of paddle by user. - type: string (default: "", null). -## Metric Learning -* `--external` - - Whether to use external machine for metric learning. - - type: bool (default: 0). - -* `--data_server_port` - - Listening port for dserver (data server), dserver is mainly used in metric learning. - - type: int32 (default: 21134). 
- ## DataProvider * `--memory_threshold_on_load_data` diff --git a/doc/howto/usage/k8s/k8s_basis_cn.md b/doc/howto/usage/k8s/k8s_basis_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..6278dacb17a378da660b2f5434247efd41c995fc --- /dev/null +++ b/doc/howto/usage/k8s/k8s_basis_cn.md @@ -0,0 +1,75 @@ +# Kubernetes 简介 + +[*Kubernetes*](http://kubernetes.io/)是Google开源的容器集群管理系统,其提供应用部署、维护、扩展机制等功能,利用Kubernetes能方便地管理跨机器运行容器化的应用。Kubernetes可以在物理机或虚拟机上运行,且支持部署到[AWS](http://kubernetes.io/docs/getting-started-guides/aws),[Azure](http://kubernetes.io/docs/getting-started-guides/azure/),[GCE](http://kubernetes.io/docs/getting-started-guides/gce)等多种公有云环境。介绍分布式训练之前,需要对[Kubernetes](http://kubernetes.io/)有一个基本的认识,下面先简要介绍一下本文用到的几个Kubernetes概念。 + +- [*Node*](http://kubernetes.io/docs/admin/node/) 表示一个Kubernetes集群中的一个工作节点,这个节点可以是物理机或者虚拟机,Kubernetes集群就是由node节点与master节点组成的。 + +- [*Pod*](http://kubernetes.io/docs/user-guide/pods/) 是一组(一个或多个)容器,pod是Kubernetes的最小调度单元,一个pod中的所有容器会被调度到同一个node上。Pod中的容器共享NET,PID,IPC,UTS等Linux namespace。由于容器之间共享NET namespace,所以它们使用同一个IP地址,可以通过*localhost*互相通信。不同pod之间可以通过IP地址访问。 + +- [*Job*](http://kubernetes.io/docs/user-guide/jobs/) 描述Kubernetes上运行的作业,一次作业称为一个job,通常每个job包括一个或者多个pods,job启动后会创建这些pod并开始执行一个程序,等待这个程序执行成功并返回0则成功退出,如果执行失败,也可以配置不同的重试机制。 + +- [*Volume*](http://kubernetes.io/docs/user-guide/volumes/) 存储卷,是pod内的容器都可以访问的共享目录,也是容器与node之间共享文件的方式,因为容器内的文件都是暂时存在的,当容器因为各种原因被销毁时,其内部的文件也会随之消失。通过volume,就可以将这些文件持久化存储。Kubernetes支持多种volume,例如hostPath(宿主机目录),gcePersistentDisk,awsElasticBlockStore等。 + +- [*Namespaces*](https://kubernetes.io/docs/user-guide/namespaces/) 命名空间,在kubernetes中创建的所有资源对象(例如上文的pod,job)等都属于一个命名空间,在同一个命名空间中,资源对象的名字是唯一的,不同空间的资源名可以重复,命名空间主要为了对象进行逻辑上的分组便于管理。本文只使用了默认命名空间。 + +- [*PersistentVolume*](https://kubernetes.io/docs/user-guide/persistent-volumes/): 和[*PersistentVolumeClaim*](https://kubernetes.io/docs/user-guide/persistent-volumes/#persistentvolumeclaims)结合,将外部的存储服务在Kubernetes中描述成为统一的资源形式,便于存储资源管理和Pod引用。 + +# 部署Kubernetes集群 + +Kubernetes提供了多种集群部署的方案,本文档内不重复介绍。这里给出集中常见的部署方法: + +- [*minikube*](https://kubernetes.io/docs/getting-started-guides/minikube/): 快速在本地启动一个单机的kubernetes服务器,便于本地验证和测试。 +- [*kubeadm*](http://kubernetes.io/docs/getting-started-guides/kubeadm/): 在不同操作系统,不同主机(Bare-Metal, AWS, GCE)条件下,快速部署集群。 +- [*AWS EC2*](https://kubernetes.io/docs/getting-started-guides/aws/): 在aws上快速部署集群。 +- [*Bare-Metal*](https://kubernetes.io/docs/getting-started-guides/centos/centos_manual_config/): 在物理机上手动部署。 + +可以参考[这个表格](https://kubernetes.io/docs/getting-started-guides/#table-of-solutions)选择适合您的场景的合适方案。 + +# 选择存储方案 + +容器不会保留在运行时生成的数据,job或者应用程序在容器中运行时生成的数据会在容器销毁时消失。为了完成分布式机器学习训练任务,需要有一个外部的存储服务来保存训练所需数据和训练输出。 +常见的可选存储服务包括: + +- [*NFS*](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/nfs): 可以将磁盘上某个目录共享给网络中其他机器访问。部署和配置比较简单,可以用于小量数据的验证。不提供分布式存储,高可用,冗余等功能。NFS的部署方法可以参考[这里](http://www.tecmint.com/how-to-setup-nfs-server-in-linux/)。 +- [*GlusterFS*](http://gluster.readthedocs.io/en/latest/Quick-Start-Guide/Quickstart/): 网络分布式文件系统,可以在Kubernetes中按照[这个](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/glusterfs)例子使用。 +- [*Ceph*](http://docs.ceph.com/docs/master/): 分布式文件系统,支持rbd,POSIX API接口(ceph fs)和对象存储API,参考[这里](https://kubernetes.io/docs/user-guide/volumes/#rbd)。 +- [*MooseFS*](https://moosefs.com/documentation.html): 一个分布式的存储系统。需要先挂载到服务器Node上再通过kubernetes hostPath Volume挂载到容器中。 + +# 配置kubectl + +## 安装kubectl +``` +# OS X +curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s 
https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/darwin/amd64/kubectl + +# Linux +curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl + +# Windows +curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/windows/amd64/kubectl.exe +``` + +## 配置kubectl访问你的kubernetes集群 + +编辑`~/.kube/config`这个配置文件,修改`Master-IP`的地址。如果使用SSL认证,则需要配置`certificate-authority`和`users`中的用户证书。如果是使用非SSL方式访问(比如通过8080端口),也可以去掉这些证书的配置。 +``` +apiVersion: v1 +clusters: +- cluster: + certificate-authority: /path/to/ca.crt + server: https://[Master-IP]:443 + name: minikube +contexts: +- context: + cluster: minikube + user: minikube + name: minikube +current-context: minikube +kind: Config +preferences: {} +users: +- name: minikube + user: + client-certificate: /path/to/apiserver.crt + client-key: /Users/wuyi/.minikube/apiserver.key +``` diff --git a/doc/howto/usage/k8s/k8s_distributed_cn.md b/doc/howto/usage/k8s/k8s_distributed_cn.md index 2063b98ca8aab9c348fe2b53bb1e6d96b7750dd3..3121b3f59df650c0a22d0bd305a6f793b202d30e 100644 --- a/doc/howto/usage/k8s/k8s_distributed_cn.md +++ b/doc/howto/usage/k8s/k8s_distributed_cn.md @@ -2,181 +2,96 @@ 前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里,我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练,文章 [Cluster Training](https://github.com/baidu/Paddle/blob/develop/doc/cluster/opensource/cluster_train.md)介绍了一种通过SSH远程分发任务,进行分布式训练的方法,与此不同的是,本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群,进行分布式训练的方案。 -## Kubernetes 基本概念 - -[*Kubernetes*](http://kubernetes.io/)是Google开源的容器集群管理系统,其提供应用部署、维护、 扩展机制等功能,利用Kubernetes能方便地管理跨机器运行容器化的应用。Kubernetes可以在物理机或虚拟机上运行,且支持部署到[AWS](http://kubernetes.io/docs/getting-started-guides/aws),[Azure](http://kubernetes.io/docs/getting-started-guides/azure/),[GCE](http://kubernetes.io/docs/getting-started-guides/gce)等多种公有云环境。介绍分布式训练之前,需要对[Kubernetes](http://kubernetes.io/)有一个基本的认识,下面先简要介绍一下本文用到的几个Kubernetes概念。 - -- [*Node*](http://kubernetes.io/docs/admin/node/) 表示一个Kubernetes集群中的一个工作节点,这个节点可以是物理机或者虚拟机,Kubernetes集群就是由node节点与master节点组成的。 - -- [*Pod*](http://kubernetes.io/docs/user-guide/pods/) 是一组(一个或多个)容器,pod是Kubernetes的最小调度单元,一个pod中的所有容器会被调度到同一个node上。Pod中的容器共享NET,PID,IPC,UTS等Linux namespace。由于容器之间共享NET namespace,所以它们使用同一个IP地址,可以通过*localhost*互相通信。不同pod之间可以通过IP地址访问。 - -- [*Job*](http://kubernetes.io/docs/user-guide/jobs/) 是Kubernetes上运行的作业,一次作业称为一个job,通常每个job包括一个或者多个pods。 - -- [*Volume*](http://kubernetes.io/docs/user-guide/volumes/) 存储卷,是pod内的容器都可以访问的共享目录,也是容器与node之间共享文件的方式,因为容器内的文件都是暂时存在的,当容器因为各种原因被销毁时,其内部的文件也会随之消失。通过volume,就可以将这些文件持久化存储。Kubernetes支持多种volume,例如hostPath(宿主机目录),gcePersistentDisk,awsElasticBlockStore等。 - -- [*Namespaces*](http://kubernetes.io/docs/user-guide/volumes/) 命名空间,在kubernetes中创建的所有资源对象(例如上文的pod,job)等都属于一个命名空间,在同一个命名空间中,资源对象的名字是唯一的,不同空间的资源名可以重复,命名空间主要为了对象进行逻辑上的分组便于管理。本文只使用了默认命名空间。 +有关Kubernetes相关概念以及如何搭建和配置Kubernetes集群,可以参考[k8s_basis](./k8s_basis_cn.md)。 ## 整体方案 -### 部署Kubernetes集群 - -首先,我们需要拥有一个Kubernetes集群,在这个集群中所有node与pod都可以互相通信。关于Kubernetes集群搭建,可以参考[官方文档](http://kubernetes.io/docs/getting-started-guides/kubeadm/),在以后的文章中我们也会介绍AWS上搭建的方案。本文假设大家能找到几台物理机,并且可以按照官方文档在上面部署Kubernetes。在本文的环境中,Kubernetes集群中所有node都挂载了一个[MFS](http://moosefs.org/)(Moose filesystem,一种分布式文件系统)共享目录,我们通过这个目录来存放训练文件与最终输出的模型。关于MFS的安装部署,可以参考[MooseFS 
documentation](https://moosefs.com/documentation.html)。在训练之前,用户将配置与训练数据切分好放在MFS目录中,训练时,程序从此目录拷贝文件到容器内进行训练,将结果保存到此目录里。整体的结构图如下: +在训练之前,用户将配置与训练数据切分好放在分布式文件系统预先分配好的目录中(不同的分布式文件系统,需要使用其制定的方式挂载后并导入数据),训练时,程序从此目录拷贝文件到容器内进行训练,将结果保存到此目录里。整体的结构图如下: ![paddle on kubernetes结构图](src/k8s-paddle-arch.png) -上图描述了一个3节点的分布式训练场景,Kubernetes集群的每个node上都挂载了一个MFS目录,这个目录可以通过volume的形式挂载到容器中。Kubernetes为这次训练创建了3个pod并且调度到了3个node上运行,每个pod包含一个PaddlePaddle容器。在容器创建后,会启动pserver与trainer进程,读取volume中的数据进行这次分布式训练。 - -### 使用 Job - -我们使用Kubernetes中的job这个概念来代表一次分布式训练。Job表示一次性作业,在作业完成后,Kubernetes会销毁job产生的容器并且释放相关资源。 - -在Kubernetes中,可以通过编写一个YAML文件,来描述这个job,在这个文件中,主要包含了一些配置信息,例如PaddlePaddle的节点个数,`paddle pserver`开放的端口个数与端口号,使用的网卡设备等,这些信息通过环境变量的形式传递给容器内的程序使用。 - -在一次分布式训练中,用户确定好本次训练需要的PaddlePaddle节点个数,将切分好的训练数据与配置文件上传到MFS共享目录中。然后编写这次训练的job YAML文件,提交给Kubernetes集群创建并开始作业。 - -### 创建PaddlePaddle节点 +上图描述了一个3节点的分布式训练场景,在每个Pod上都通过volume方式挂载分布式文件系统的一个目录用于保存训练数据和输出结果。Kubernetes为这次训练创建了3个pod并且调度到了3个node上运行,每个pod包含一个PaddlePaddle容器。在容器创建后,会启动pserver与trainer进程,读取volume中的数据进行这次分布式训练。 -当Kubernetes master收到请求,解析完YAML文件后,会创建出多个pod(个数为PaddlePaddle节点数),Kubernetes会把这些pod调度到集群的node上运行。一个pod就代表一个PaddlePaddle节点,当pod被成功分配到一台物理/虚拟机上后,Kubernetes会启动pod内的容器,这个容器会根据YAML文件中的环境变量,启动`paddle pserver`与`paddle train`进程。 +根据前文的描述,要在已有的Kubernetes集群上进行PaddlePaddle的分布式训练,按照下面步骤即可: -### 启动训练 - -在容器启动后,会通过脚本来启动这次分布式训练,我们知道`paddle train`进程启动时需要知道其他节点的IP地址以及本节点的trainer_id,由于PaddlePaddle本身不提供类似服务发现的功能,所以在本文的启动脚本中,每个节点会根据job name向Kubernetes apiserver查询这个job对应的所有pod信息(Kubernetes默认会在每个容器的环境变量中写入apiserver的地址)。 - -根据这些pod信息,就可以通过某种方式,为每个pod分配一个唯一的trainer_id。本文把所有pod的IP地址进行排序,将顺序作为每个PaddlePaddle节点的trainer_id。启动脚本的工作流程大致如下: - - 1. 查询Kubernetes apiserver获取pod信息,根据IP分配trainer_id - 1. 从MFS共享目录中拷贝训练文件到容器内 - 1. 根据环境变量,解析出`paddle pserver`与`paddle train`的启动参数,启动进程 - 1. 训练时,PaddlePaddle会自动将结果保存在trainer_id为0的节点上,将输出路径设置为MFS目录,保存输出的文件 - - -## 搭建过程 - -根据前文的描述,要在已有的Kubernetes集群上进行PaddlePaddle的分布式训练,主要分为以下几个步骤: - -1. 制作PaddlePaddle镜像 -1. 将训练文件与切分好的数据上传到共享存储 -1. 编写本次训练的YAML文件,创建一个Kubernetes job -1. 训练结束后查看输出结果 +1. [制作PaddlePaddle镜像](#制作镜像) +1. [将训练文件与切分好的数据上传到共享存储](#上传训练文件) +1. [编写本次训练的YAML文件,创建一个Kubernetes job](#创建Job) +1. 
[训练结束后查看输出结果](#查看输出) 下面就根据这几个步骤分别介绍。 - ### 制作镜像 PaddlePaddle镜像需要提供`paddle pserver`与`paddle train`进程的运行环境,用这个镜像创建的容器需要有以下两个功能: - 拷贝训练文件到容器内 - - 生成`paddle pserver`与`paddle train`进程的启动参数,并且启动训练 -因为官方镜像 `paddledev/paddle:cpu-latest` 内已经包含PaddlePaddle的执行程序但是还没上述功能,所以我们可以在这个基础上,添加启动脚本,制作新镜像来完成以上的工作。镜像的*Dockerfile*如下: - -```Dockerfile -FROM paddledev/paddle:cpu-latest - -MAINTAINER zjsxzong89@gmail.com - -COPY start.sh /root/ -COPY start_paddle.py /root/ -CMD ["bash"," -c","/root/start.sh"] -``` - -[start.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/k8s/start.sh)文件拷贝训练文件到容器内,然后执行[start_paddle.py](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/k8s/start_paddle.py)脚本启动训练,前文提到的获取其他节点IP地址,分配`trainer_id`等都在`start_paddle.py`脚本中完成。 - -`start_paddle.py`脚本开始时,会先进行参数的初始化与解析。 - -```python -parser = argparse.ArgumentParser(prog="start_paddle.py", - description='simple tool for k8s') - args, train_args_list = parser.parse_known_args() - train_args = refine_unknown_args(train_args_list) - train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2])) - podlist = getPodList() -``` - -然后通过函数`getPodList()`访问Kubernetes的接口来查询此job对应的所有pod信息。当所有pod都处于running状态(容器运行都运行)时,再通过函数`getIdMap(podlist)`获取trainer_id。 - -```python - podlist = getPodList() - # need to wait until all pods are running - while not isPodAllRunning(podlist): - time.sleep(10) - podlist = getPodList() - idMap = getIdMap(podlist) -``` - -在函数`getIdMap(podlist)`内部,我们通过读取`podlist`中每个pod的IP地址,将IP排序生成的序号作为trainer_id。 - -```python -def getIdMap(podlist): - ''' - generate tainer_id by ip - ''' - ips = [] - for pod in podlist["items"]: - ips.append(pod["status"]["podIP"]) - ips.sort() - idMap = {} - for i in range(len(ips)): - idMap[ips[i]] = i - return idMap -``` - -在得到`idMap`后,通过函数`startPaddle(idMap, train_args_dict)`构造`paddle pserver`与`paddle train`的启动参数并执行进程。 - -在函数`startPaddle`中,最主要的工作就是解析出`paddle pserver`与`paddle train`的启动参数。例如`paddle train`参数的解析,解析环境变量得到`PADDLE_NIC`,`PADDLE_PORT`,`PADDLE_PORTS_NUM`等参数,然后通过自身的IP地址在`idMap`中获取`trainerId`。 - -```python - program = 'paddle train' - args = " --nics=" + PADDLE_NIC - args += " --port=" + str(PADDLE_PORT) - args += " --ports_num=" + str(PADDLE_PORTS_NUM) - args += " --comment=" + "paddle_process_by_paddle" - ip_string = "" - for ip in idMap.keys(): - ip_string += (ip + ",") - ip_string = ip_string.rstrip(",") - args += " --pservers=" + ip_string - args_ext = "" - for key, value in train_args_dict.items(): - args_ext += (' --' + key + '=' + value) - localIP = socket.gethostbyname(socket.gethostname()) - trainerId = idMap[localIP] - args += " " + args_ext + " --trainer_id=" + \ - str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT -``` - -使用 `docker build` 构建镜像: +因为官方镜像 `paddledev/paddle:cpu-latest` 内已经包含PaddlePaddle的执行程序但是还没上述功能,所以我们可以在这个基础上,添加启动脚本,制作新镜像来完成以上的工作。参考镜像的[*Dockerfile*](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/usage/cluster/k8s/src/k8s_train/Dockerfile)。 ```bash -docker build -t your_repo/paddle:mypaddle . +$ cd doc/howto/usage/k8s/src/k8s_train +$ docker build -t [YOUR_REPO]/paddle:mypaddle . 
```
 
然后将构建成功的镜像上传到镜像仓库。
 
```bash
-docker push your_repo/paddle:mypaddle
+docker push [YOUR_REPO]/paddle:mypaddle
```
 
-注意上述命令中`your_repo`表示读者所使用的Docker镜像仓库地址,读者需要替换成自己使用的仓库地址。下文使用`your_repo/paddle:mypaddle`这个地址来表示此步骤所构建出的镜像。
+注意上述命令中`[YOUR_REPO]`表示读者所使用的Docker镜像仓库地址,读者需要替换成自己使用的仓库地址。下文使用`[YOUR_REPO]/paddle:mypaddle`这个地址来表示此步骤所构建出的镜像。
 
-### 上传训练文件
+### 准备训练数据
 
-本文使用PaddlePaddle官方的[recommendation demo](http://www.paddlepaddle.org/doc/demo/index.html#recommendation)作为这次训练的内容,我们将训练文件与数据放在一个job name命名的目录中,上传到MFS共享存储。完成后MFS上的文件内容大致如下:
+这里我们通过在Kubernetes集群上启动一个Job来下载并切割数据,也可以通过修改[k8s_train](./src/k8s_train/README.md)的内容来定制image。
 
-```bash
-[root@paddle-kubernetes-node0 mfs]# tree -d
+在启动Job之前,需要根据不同的分布式存储来绑定一个[persistentVolumeClaim](https://kubernetes.io/docs/user-guide/persistent-volumes/),生成的数据将会存储在这个volume下。
+
+```yaml
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-data
+spec:
+  template:
+    metadata:
+      name: pi
+    spec:
+      hostNetwork: true
+      containers:
+      - name: paddle-data
+        image: paddledev/paddle-tutorial:k8s_data
+        imagePullPolicy: Always
+        volumeMounts:
+        - mountPath: "/mnt"
+          name: nfs
+        env:
+        - name: OUT_DIR
+          value: /home/work/mfs/paddle-cluster-job
+        - name: SPLIT_COUNT
+          value: "3"
+      volumes:
+      - name: nfs
+        persistentVolumeClaim:
+          claimName: mfs
+      restartPolicy: Never
+```
+
+完成后volume中的文件内容大致如下:
+```bash
+[root@paddle-kubernetes-node0 nfsdir]$ tree -d
 .
-└── paddle-cluster-job
-    ├── data
-    │   ├── 0
-    │   │
-    │   ├── 1
-    │   │
-    │   └── 2
-    ├── output
-    └── recommendation
+`-- paddle-cluster-job
+    |-- 0
+    |   `-- data
+    |-- 1
+    |   `-- data
+    |-- 2
+    |   `-- data
+    |-- output
+    |-- quick_start
```
 
目录中paddle-cluster-job是本次训练对应的job name,本次训练要求有3个PaddlePaddle节点,文件夹0,1,2分别代表3个节点的trainer_id,其中存放各节点切分好的数据;quick_start文件夹内存放训练文件,output文件夹存放训练结果与日志。
@@ -205,7 +120,7 @@ spec:
       path: /home/work/mfs
   containers:
     - name: trainer
-      image: your_repo/paddle:mypaddle
+      image: [YOUR_REPO]/paddle:mypaddle
       command: ["bin/bash",  "-c", "/root/start.sh"]
       env:
       - name: JOB_NAME
@@ -236,15 +151,16 @@ spec:
 
`env`字段表示容器的环境变量,我们将`paddle`运行的一些参数通过这种方式传递到容器内。
 
-`JOB_PATH`表示共享存储挂载的路径,`JOB_NAME`表示job名字,`TRAIN_CONFIG_DIR`表示本次训练文件所在目录,这三个变量组合就可以找到本次训练需要的文件路径。
-
-`CONF_PADDLE_NIC`表示`paddle pserver`进程需要的`--nics`参数,即网卡名
-
-`CONF_PADDLE_PORT`表示`paddle pserver`的`--port`参数,`CONF_PADDLE_PORTS_NUM`则表示稠密更新的端口数量,也就是`--ports_num`参数。
-
-`CONF_PADDLE_PORTS_NUM_SPARSE`表示稀疏更新的端口数量,也就是`--ports_num_for_sparse`参数。
-
-`CONF_PADDLE_GRADIENT_NUM`表示训练节点数量,即`--num_gradient_servers`参数
+环境变量 | 说明
+--- | ---
+JOB_PATH | 共享存储挂载的路径
+JOB_NAME | Job的名字
+TRAIN_CONFIG_DIR | 本次训练文件所在目录,与JOB_PATH,JOB_NAME组合可以找到本次训练需要的文件路径
+CONF_PADDLE_NIC | `paddle pserver`进程需要的`--nics`参数,即网卡名
+CONF_PADDLE_PORT | `paddle pserver`的`--port`参数
+CONF_PADDLE_PORTS_NUM | 稠密更新的端口数量,即`--ports_num`参数
+CONF_PADDLE_PORTS_NUM_SPARSE | 稀疏更新的端口数量,即`--ports_num_for_sparse`参数
+CONF_PADDLE_GRADIENT_NUM | 训练节点数量,即`--num_gradient_servers`参数
 
这些参数的具体描述,读者可以查看[这里](http://www.paddlepaddle.org/doc/ui/cmd_argument/detail_introduction.html#parameter-server-and-distributed-communication)。
 
@@ -289,15 +205,15 @@ I1116 09:10:17.123121    50 Util.cpp:155] commandline:
     --ports_num=2 --comment=paddle_process_by_paddle
     --pservers=192.168.129.66,192.168.223.143,192.168.129.71
     --ports_num_for_sparse=2 --config=./trainer_config.py
-    --trainer_count=4 --num_passes=10 --use_gpu=0 
-    --log_period=50 --dot_period=10 --saving_period=1 
+    --trainer_count=4 --num_passes=10 --use_gpu=0
+    --log_period=50 --dot_period=10 --saving_period=1
     --local=0 --trainer_id=0
--save_dir=/home/jobpath/paddle-cluster-job/output I1116 09:10:17.123440 50 Util.cpp:130] Calling runInitFunctions I1116 09:10:17.123764 50 Util.cpp:143] Call runInitFunctions done. [WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config. [INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating] -[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__regression_cost_0__] +[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__mse_cost_0__] I1116 09:10:17.392917 50 Trainer.cpp:170] trainer mode: Normal I1116 09:10:17.613910 50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process I1116 09:10:17.680917 50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process @@ -310,3 +226,90 @@ I1116 09:10:18.019492 50 ParameterClient2.cpp:122] pserver 3 192.168.223.143: I1116 09:10:18.019716 50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164 I1116 09:10:18.019836 50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165 ``` + + +## 一些细节的补充 + +### 使用环境变量 + +使用容器方式运行训练任务的Kubernetes Job,通常会使用环境变量配置Job的配置信息`start_paddle.py`提供了一个启动脚本,将环境变量转换成paddle的命令行参数: +``` +API = "/api/v1/namespaces/" +JOBSELECTOR = "labelSelector=job-name=" +JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME") +JOB_PATH_OUTPUT = JOB_PATH + "/output" +JOBNAME = os.getenv("JOB_NAME") +NAMESPACE = os.getenv("JOB_NAMESPACE") +PADDLE_NIC = os.getenv("CONF_PADDLE_NIC") +PADDLE_PORT = os.getenv("CONF_PADDLE_PORT") +PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM") +PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE") +PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM") +``` + +### Pod间通信 +`start_paddle.py`脚本开始时,会先进行参数的初始化与解析。 + +```python +parser = argparse.ArgumentParser(prog="start_paddle.py", + description='simple tool for k8s') + args, train_args_list = parser.parse_known_args() + train_args = refine_unknown_args(train_args_list) + train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2])) + podlist = getPodList() +``` + +然后通过函数`getPodList()`访问Kubernetes的接口来查询此job对应的所有pod信息。当所有pod都处于running状态(容器运行都运行)时,再通过函数`getIdMap(podlist)`获取trainer_id。 + +```python + podlist = getPodList() + # need to wait until all pods are running + while not isPodAllRunning(podlist): + time.sleep(10) + podlist = getPodList() + idMap = getIdMap(podlist) +``` +* *注意*: `getPodList()`会获取当前namespace下的所有pod,如果已经有pod运行,可能会导致出错。这种集群节点管理方式会在将来使用[statfulsets](https://kubernetes.io/docs/concepts/abstractions/controllers/statefulsets/)代替。 + +在函数`getIdMap(podlist)`内部,我们通过读取`podlist`中每个pod的IP地址,将IP排序生成的序号作为trainer_id。 + +```python +def getIdMap(podlist): + ''' + generate tainer_id by ip + ''' + ips = [] + for pod in podlist["items"]: + ips.append(pod["status"]["podIP"]) + ips.sort() + idMap = {} + for i in range(len(ips)): + idMap[ips[i]] = i + return idMap +``` + +在得到`idMap`后,通过函数`startPaddle(idMap, train_args_dict)`构造`paddle pserver`与`paddle train`的启动参数并执行进程。 + +### 启动任务 + +在函数`startPaddle`中,最主要的工作就是解析出`paddle pserver`与`paddle train`的启动参数。例如`paddle train`参数的解析,解析环境变量得到`PADDLE_NIC`,`PADDLE_PORT`,`PADDLE_PORTS_NUM`等参数,然后通过自身的IP地址在`idMap`中获取`trainerId`。 + +```python + program = 'paddle train' + args = " --nics=" + PADDLE_NIC + args += " --port=" + str(PADDLE_PORT) + args += " --ports_num=" + str(PADDLE_PORTS_NUM) + args += " --comment=" + "paddle_process_by_paddle" + ip_string = "" + for ip in idMap.keys(): + ip_string += (ip + ",") + ip_string = 
ip_string.rstrip(",") + args += " --pservers=" + ip_string + args_ext = "" + for key, value in train_args_dict.items(): + args_ext += (' --' + key + '=' + value) + localIP = socket.gethostbyname(socket.gethostname()) + trainerId = idMap[localIP] + args += " " + args_ext + " --trainer_id=" + \ + str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT +``` diff --git a/doc/howto/usage/k8s/src/k8s-paddle-arch.png b/doc/howto/usage/k8s/src/k8s-paddle-arch.png index a8c64550b1fa7f41de1eaa9a037c65cddc0cd30e..2183a232ad402b76f82a67234a5c93e13ce97ac3 100644 Binary files a/doc/howto/usage/k8s/src/k8s-paddle-arch.png and b/doc/howto/usage/k8s/src/k8s-paddle-arch.png differ diff --git a/doc/howto/usage/k8s/src/k8s_train/start_paddle.py b/doc/howto/usage/k8s/src/k8s_train/start_paddle.py index f1a770ccb54fbd7d4c3cf6bf134d00d7bf5961ca..935c12bb67e1fe08bc135a7a2220fcd43c548482 100755 --- a/doc/howto/usage/k8s/src/k8s_train/start_paddle.py +++ b/doc/howto/usage/k8s/src/k8s_train/start_paddle.py @@ -132,7 +132,8 @@ def startPaddle(idMap={}, train_args_dict=None): logDir = JOB_PATH_OUTPUT + "/node_" + str(trainerId) if not os.path.exists(JOB_PATH_OUTPUT): os.makedirs(JOB_PATH_OUTPUT) - os.mkdir(logDir) + if not os.path.exists(logDir): + os.mkdir(logDir) copyCommand = 'cp -rf ' + JOB_PATH + \ "/" + str(trainerId) + "/data/*" + " ./data/" os.system(copyCommand) diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in index 418d718fbd9c61bff3acb9c2dab0638c0b650bab..6dc48704bc230bd1a573c4b4b2e7c07791e48ced 100644 --- a/doc/templates/conf.py.cn.in +++ b/doc/templates/conf.py.cn.in @@ -15,13 +15,19 @@ import sys import os, subprocess import shlex from recommonmark import parser, transform +try: + import py_paddle + import paddle + import paddle.v2 +except ImportError: + print("Must install paddle python package before generating documentation") + sys.exit(1) MarkdownParser = parser.CommonMarkParser AutoStructify = transform.AutoStructify # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. -sys.path.insert(0, '@PROJ_ROOT@/python') templates_path = ["@PROJ_ROOT@/doc_theme/templates"] # -- General configuration ------------------------------------------------ diff --git a/doc/templates/conf.py.en.in b/doc/templates/conf.py.en.in index e96c25cb75bee20d2e2949423d80ddab1d3450a1..b477f0120c4fa0544012080b7cfb8572d3c44b04 100644 --- a/doc/templates/conf.py.en.in +++ b/doc/templates/conf.py.en.in @@ -15,14 +15,20 @@ import sys import os, subprocess import shlex from recommonmark import parser, transform +try: + import py_paddle + import paddle + import paddle.v2 +except ImportError: + print("Must install paddle python package before generating documentation") + sys.exit(1) + MarkdownParser = parser.CommonMarkParser AutoStructify = transform.AutoStructify # If extensions (or modules to document with autodoc) are in another directory, # add these directories to sys.path here. If the directory is relative to the # documentation root, use os.path.abspath to make it absolute, like shown here. 
-sys.path.insert(0, '@PROJ_ROOT@/python') - templates_path = ["@PROJ_ROOT@/doc_theme/templates"] # -- General configuration ------------------------------------------------ diff --git a/doc/tutorials/quick_start/index_en.md b/doc/tutorials/quick_start/index_en.md index 70dec2eb2a8c397bc56b1e6f52a624a3a6877905..ca110431cf921ae0480d3fb2b17c58f90a84cc0e 100644 --- a/doc/tutorials/quick_start/index_en.md +++ b/doc/tutorials/quick_start/index_en.md @@ -156,14 +156,14 @@ define_py_data_sources2(train_list='data/train.list', obj="process", args={"dictionary": word_dict}) ``` -You can refer to the following link for more detailed examples and data formats: PyDataProvider2. +You can refer to the following link for more detailed examples and data formats: PyDataProvider2. ## Network Architecture We will describe four kinds of network architectures in this section.
![](./src/PipelineNetwork_en.jpg)
First, you will build a logistic regression model. Later, you will also get a chance to build other more powerful network architectures.
-For more detailed documentation, you could refer to: layer documentation. All configuration files are in `demo/quick_start` directory.
+For more detailed documentation, you could refer to: layer documentation. All configuration files are in `demo/quick_start` directory.

### Logistic Regression
The architecture is illustrated in the following picture:
@@ -366,7 +366,7 @@ You can use single layer LSTM model with Dropout for our text classification problem.
<br>
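For intuition, a single-layer LSTM with dropout in the v1 configuration API might look like the sketch below; helper names follow `trainer_config_helpers`, the dropout attribute is an assumption, and the actual `demo/quick_start` config may differ:

```python
# Sketch: single-layer LSTM with dropout for binary text classification.
# Treat lstm_cell_attr / drop_rate as assumptions about the v1 helper API.
data = data_layer(name="word", size=len(word_dict))
emb = embedding_layer(input=data, size=128)
lstm = simple_lstm(input=emb, size=128,
                   lstm_cell_attr=ExtraAttr(drop_rate=0.25))
lstm_max = pooling_layer(input=lstm, pooling_type=MaxPooling())
output = fc_layer(input=lstm_max, size=2, act=SoftmaxActivation())
outputs(classification_cost(input=output, label=data_layer(name="label", size=2)))
```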
## Optimization Algorithm -Optimization algorithms include Momentum, RMSProp, AdaDelta, AdaGrad, Adam, and Adamax. You can use Adam optimization method here, with L2 regularization and gradient clipping, because Adam has been proved to work very well for training recurrent neural network. +Optimization algorithms include Momentum, RMSProp, AdaDelta, AdaGrad, Adam, and Adamax. You can use Adam optimization method here, with L2 regularization and gradient clipping, because Adam has been proved to work very well for training recurrent neural network. ```python settings(batch_size=128, @@ -407,7 +407,7 @@ paddle train \ --init_model_path=./output/pass-0000x ``` -We will give an example of performing prediction using Recurrent model on a dataset with no labels. You can refer to Python Prediction API tutorial,or other demo for the prediction process using Python. You can also use the following script for inference or evaluation. +We will give an example of performing prediction using Recurrent model on a dataset with no labels. You can refer to Python Prediction API tutorial,or other demo for the prediction process using Python. You can also use the following script for inference or evaluation. inference script (predict.sh): diff --git a/doc_theme/static/css/override.css b/doc_theme/static/css/override.css index 438a87848a0176a7857177aeb672c59f35bd8d4b..09ecff688b9a2dae3d834572217922640c529c5e 100644 --- a/doc_theme/static/css/override.css +++ b/doc_theme/static/css/override.css @@ -1,3 +1,6 @@ +* { + font-family:"Roboto","Lato","proxima-nova","Helvetica Neue",Arial,sans-serif; +} body { padding-top: 80px; background-image: none !important; diff --git a/paddle/api/Arguments.cpp b/paddle/api/Arguments.cpp index 41beed38a87601cb57072c8966cd0fd2ea156524..d49b189e253f7a0792fe3f1fe7c8fdbb7071acd4 100644 --- a/paddle/api/Arguments.cpp +++ b/paddle/api/Arguments.cpp @@ -38,6 +38,13 @@ Arguments* Arguments::createByPaddleArgumentVector(void* ptr) { return args; } +Arguments* Arguments::createByPaddleArgument(const void* ptr) { + auto p = (paddle::Argument*)(ptr); + auto args = new Arguments(); + args->m->outputs.push_back(*p); + return args; +} + Matrix* Arguments::getSlotValue(size_t idx) const throw(RangeError) { auto& a = m->getArg(idx); return Matrix::createByPaddleMatrixPtr(&a.value); @@ -137,9 +144,7 @@ void Arguments::setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError) { a.cpuSequenceDims = m->cast(vec->getSharedPtr()); } -float Arguments::sumCosts() const { - return paddle::Argument::sumCosts(m->outputs); -} +float Arguments::sum() const { return paddle::Argument::sum(m->outputs); } int64_t Arguments::getBatchSize(size_t idx) const throw(RangeError) { auto& a = m->getArg(idx); diff --git a/paddle/api/Evaluator.cpp b/paddle/api/Evaluator.cpp index c30e09876397e37ef9ed4ec3200d1aa372ceb609..681e3a380912339c531c16c88f43255c2f34c32f 100644 --- a/paddle/api/Evaluator.cpp +++ b/paddle/api/Evaluator.cpp @@ -27,3 +27,18 @@ std::string Evaluator::toString() { m->rawPtr->printStats(sout); return sout.str(); } + +std::vector Evaluator::getNames() const { + std::vector retv; + m->rawPtr->getNames(&retv); + return retv; +} + +double Evaluator::getValue(const std::string name) const { + paddle::Error err; + double v = m->rawPtr->getValue(name, &err); + if (err) { + throw std::runtime_error(err.msg()); + } + return v; +} diff --git a/paddle/api/GradientMachine.cpp b/paddle/api/GradientMachine.cpp index 66115f8293b905809639afff779abfdb2bb3a54e..dcb5fe086fdccf8ec62ee52cbaaac4b7dbbe2f9d 100644 --- 
a/paddle/api/GradientMachine.cpp +++ b/paddle/api/GradientMachine.cpp @@ -142,14 +142,28 @@ Parameter* GradientMachine::getParameter(size_t i) throw(RangeError) { } } +size_t GradientMachine::getNonStaticParameterSize() const { + return m->machine->getNonStaticParameters().size(); +} + +Parameter* GradientMachine::getNonStaticParameter(size_t i) throw(RangeError) { + auto params = m->machine->getNonStaticParameters(); + if (i < params.size()) { + return Parameter::createFromSharedPtr( + &m->machine->getNonStaticParameters()[i]); + } else { + throw RangeError(); + } +} + void GradientMachine::randParameters() { m->machine->randParameters(); } -Matrix* GradientMachine::getLayerOutput(const std::string& layerName) const +Arguments* GradientMachine::getLayerOutput(const std::string& layerName) const throw(UnsupportError) { - auto nn = std::dynamic_pointer_cast(m->machine); + auto nn = m->machine; if (nn) { - auto mat = nn->getLayerOutput(layerName); - return Matrix::createByPaddleMatrixPtr(&mat); + auto arg = nn->getLayerOutput(layerName); + return Arguments::createByPaddleArgument(&arg); } else { throw UnsupportError(); } diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h index f5af8b0035b44d97832dd90ca2eeba079503715c..c4f5dca26cc6a5e9fdd23ee27b594ced29a25c7a 100644 --- a/paddle/api/PaddleAPI.h +++ b/paddle/api/PaddleAPI.h @@ -47,6 +47,9 @@ void setUseGpu(bool useGpu); /// Return true if this py_paddle is compiled in GPU Version bool isGpuVersion(); +/// Return FLAGS_trainer_count +int getTrainerCount(); + /// The Error of IO Operation. Such as file not found, etc. class IOError {}; @@ -450,10 +453,11 @@ public: IVector* vec) throw(RangeError); void setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError); - float sumCosts() const; + float sum() const; private: static Arguments* createByPaddleArgumentVector(void* ptr); + static Arguments* createByPaddleArgument(const void* ptr); void* getInternalArgumentsPtr() const; private: @@ -767,9 +771,12 @@ public: size_t getParameterSize() const; Parameter* getParameter(size_t i) throw(RangeError); + size_t getNonStaticParameterSize() const; + Parameter* getNonStaticParameter(size_t i) throw(RangeError); + void randParameters(); - Matrix* getLayerOutput(const std::string& layerName) const + Arguments* getLayerOutput(const std::string& layerName) const throw(UnsupportError); /** @@ -900,6 +907,10 @@ public: */ std::string toString(); + std::vector getNames() const; + + double getValue(const std::string name) const; + private: EvaluatorPrivate* m; @@ -952,7 +963,7 @@ public: Arguments* getForwardOutput(); - Matrix* getLayerOutput(const std::string& layerName); + Arguments* getLayerOutput(const std::string& layerName) const; }; /// the N-Best results generated from one input sequence. 
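In py_paddle terms, the net effect of the declarations above is that layer outputs now come back wrapped in `Arguments`. A hypothetical usage sketch (the layer name is a placeholder):

```python
# Hypothetical py_paddle usage of the changed API declared in PaddleAPI.h.
out_args = gradient_machine.getLayerOutput("__fc_layer_0__")  # now an Arguments
mat = out_args.getSlotValue(0)  # the layer's output matrix
total = out_args.sum()          # sum() replaces the old sumCosts()
```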
diff --git a/paddle/api/Trainer.cpp b/paddle/api/Trainer.cpp index d83dc380beeec3747451a483f4811eb833e8c226..84e4ca054abb0100a02c8a40e31c49c17684ef40 100644 --- a/paddle/api/Trainer.cpp +++ b/paddle/api/Trainer.cpp @@ -131,12 +131,11 @@ void Trainer::testOneDataBatch(size_t batchSize, const Arguments& args) { void TrainerPrivate::finishTestPeriod() { tester_->finishTestPeriod(); } void Trainer::finishTestPeriod() { m->finishTestPeriod(); } -Matrix* Trainer::getLayerOutput(const std::string& layerName) { - auto nn = std::dynamic_pointer_cast( - this->m->getGradientMachine()); +Arguments* Trainer::getLayerOutput(const std::string& layerName) const { + auto nn = this->m->getGradientMachine(); CHECK(nn) << "trainerInternal_.getGradientMachine() is not NeuralNetwork"; - auto m = nn->getLayerOutput(layerName); - return Matrix::createByPaddleMatrixPtr(&m); + auto arg = nn->getLayerOutput(layerName); + return Arguments::createByPaddleArgument(&arg); } void Trainer::forwardOneBatch(size_t batchSize) { diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp index 54d67aa62f4d87ad03282962c722019698dc621a..d369df5d4e04b4a8d822db0e72a8051150868ce6 100644 --- a/paddle/api/Util.cpp +++ b/paddle/api/Util.cpp @@ -54,5 +54,7 @@ bool isGpuVersion() { #endif } +int getTrainerCount() { return FLAGS_trainer_count; } + static_assert(NUM_PARAMETER_TYPES == paddle::NUM_PARAMETER_TYPES, "The Parameter Type should be same in core/api and core/common"); diff --git a/paddle/api/test/testArguments.py b/paddle/api/test/testArguments.py index a04a805d7a64ef906c8388f1241b9ef823e4d9e0..9fe44de94ea6ddb71d2dfbb2243fc86ede0d0531 100644 --- a/paddle/api/test/testArguments.py +++ b/paddle/api/test/testArguments.py @@ -22,7 +22,7 @@ class TestArguments(unittest.TestCase): args = swig_paddle.Arguments.createArguments(1) args.setSlotValue(0, m) - self.assertAlmostEqual(27.0, args.sumCosts()) + self.assertAlmostEqual(27.0, args.sum()) mat = args.getSlotValue(0) assert isinstance(mat, swig_paddle.Matrix) diff --git a/paddle/api/test/testMatrix.py b/paddle/api/test/testMatrix.py index 37666bdccc9aedfe8f8079124129aad2ade53a43..f08fbf3ccdf5d7c0a5c739868b1bcb516146c23d 100644 --- a/paddle/api/test/testMatrix.py +++ b/paddle/api/test/testMatrix.py @@ -68,7 +68,7 @@ class TestMatrix(unittest.TestCase): def test_numpyCpu(self): numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype="float32") - m = swig_paddle.Matrix.createCpuDenseFromNumpy(numpy_mat, copy=False) + m = swig_paddle.Matrix.createCpuDenseFromNumpy(numpy_mat, False) self.assertEqual((int(m.getHeight()), int(m.getWidth())), numpy_mat.shape) diff --git a/paddle/api/test/testTrain.py b/paddle/api/test/testTrain.py index a90d15c272a3a2b56e35c979e053deb2b54eebc1..7061a4c43bf01158b5f084d0c310dedd81773a04 100644 --- a/paddle/api/test/testTrain.py +++ b/paddle/api/test/testTrain.py @@ -89,9 +89,14 @@ def main(): except Exception as e: print e + ev = m.makeEvaluator() + ev.start() m.forwardBackward(inArgs, outArgs, swig_paddle.PASS_TRAIN, update_callback) - + m.eval(ev) + ev.finish() + for name in ev.getNames(): + print name, ev.getValue(name) for optimizer in optimizers: optimizer.finishBatch() diff --git a/paddle/api/test/testVector.py b/paddle/api/test/testVector.py index 1ab095c1d3d0d2c84d2d2f95a03f172b901de209..6339cf8542607bdda99eb9ccaa8b06480f144b78 100644 --- a/paddle/api/test/testVector.py +++ b/paddle/api/test/testVector.py @@ -43,7 +43,7 @@ class TestIVector(unittest.TestCase): def test_cpu_numpy(self): vec = np.array([1, 3, 4, 65, 78, 1, 4], dtype="int32") - iv = 
swig_paddle.IVector.createCpuVectorFromNumpy(vec, copy=False) + iv = swig_paddle.IVector.createCpuVectorFromNumpy(vec, False) self.assertEqual(vec.shape[0], int(iv.__len__())) vec[4] = 832 for i in xrange(len(iv)): @@ -106,7 +106,7 @@ class TestVector(unittest.TestCase): def testCpuNumpy(self): numpy_arr = np.array([1.2, 2.3, 3.4, 4.5], dtype="float32") - vec = swig_paddle.Vector.createCpuVectorFromNumpy(numpy_arr, copy=False) + vec = swig_paddle.Vector.createCpuVectorFromNumpy(numpy_arr, False) assert isinstance(vec, swig_paddle.Vector) numpy_arr[0] = 0.1 for n, v in zip(numpy_arr, vec): diff --git a/paddle/cuda/include/hl_matrix.h b/paddle/cuda/include/hl_matrix.h index 6f21b82afdc6cdde785fdd8f13eef17a0fdd6324..eb454c59c1e58cf2b4817b4cb3230b9d75e320ac 100644 --- a/paddle/cuda/include/hl_matrix.h +++ b/paddle/cuda/include/hl_matrix.h @@ -69,19 +69,6 @@ extern void hl_sequence_softmax_forward(real* A_d, const int* index, int numSequence); -/** - * @brief Matrix classification error. - * - * @param[in] A_d input matrix (M x N). - * @param[in] B_d input vector (M x 1). - * @param[out] C_d output vector (M x 1). - * @param[in] dimM matrix height. - * @param[in] dimN matrix width. - * - */ -extern void hl_matrix_classification_error( - real* A_d, int* B_d, real* C_d, int dimM, int dimN); - /** * @brief Matrix cross entropy. * diff --git a/paddle/cuda/include/hl_top_k.h b/paddle/cuda/include/hl_top_k.h index 77949ed295a6eaf7cc535853e53bef066ffac37c..79ae0d0e741de06e622454ccd220e2c749d795b3 100644 --- a/paddle/cuda/include/hl_top_k.h +++ b/paddle/cuda/include/hl_top_k.h @@ -58,4 +58,30 @@ extern void hl_sparse_matrix_top_k(real* topVal, int beamSize, int numSamples); -#endif /* HL_TOP_K_H_ */ +/** + * @brief Matrix classification error. + * + * @param[out] topVal top k element. + * @param[in] ldv leading dimension of topVal. + * @param[out] topIds top k index. + * @param[in] src input value. + * @param[in] lds leading dimension of src. + * @param[in] dim width of input value. + * @param[in] topkSize size of top k element. + * @param[in] numSamples height of input value. + * @param[in] label ground truth label. + * @param[out] recResult top-k classification error. 
+ * + */ +extern void hl_matrix_classification_error(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, + int dim, + int topkSize, + int numSamples, + int* label, + real* recResult); + +#endif // HL_TOP_K_H_ diff --git a/paddle/cuda/include/stub/hl_matrix_stub.h b/paddle/cuda/include/stub/hl_matrix_stub.h index f4e6461cdcf198637b2c96fee88d1de2766aaf18..127cb7e27983e8ff2c1ff6ef5108b5f8c5bd6ca5 100644 --- a/paddle/cuda/include/stub/hl_matrix_stub.h +++ b/paddle/cuda/include/stub/hl_matrix_stub.h @@ -35,8 +35,16 @@ inline void hl_sequence_softmax_forward(real* A_d, inline void hl_matrix_softmax_derivative( real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {} -inline void hl_matrix_classification_error( - real* A_d, int* B_d, real* C_d, int dimM, int dimN) {} +inline void hl_matrix_classification_error(real* topVal, + int ldv, + int* topIds, + real* src, + int lds, + int dim, + int topkSize, + int numSamples, + int* label, + real* recResult) {} inline void hl_matrix_cross_entropy( real* A_d, real* C_d, int* label_d, int dimM, int dimN) {} diff --git a/paddle/cuda/src/hl_cuda_matrix.cu b/paddle/cuda/src/hl_cuda_matrix.cu index 96c07d9c3b7a37daa9198fd7ea66b7d811600348..9bcc7fb7de44b2211db450fb164655f7947dcad9 100644 --- a/paddle/cuda/src/hl_cuda_matrix.cu +++ b/paddle/cuda/src/hl_cuda_matrix.cu @@ -265,59 +265,6 @@ void hl_matrix_softmax_derivative(real *grad_d, CHECK_SYNC("hl_matrix_softmax_derivative failed"); } -template -__global__ void KeMatrixClassificationError(real* in_A, - int* in_B, - real* out_C, - int dimN) { - __shared__ real max_s[blockSize]; - __shared__ int max_l[blockSize]; - const int tid = threadIdx.x; - const int rowId = blockIdx.x; - - max_s[tid] = -1e30f; - in_A += rowId * dimN; - real tmp; - for (int colId = tid; colId < dimN; colId += blockSize) { - tmp = in_A[colId]; - if (max_s[tid] < tmp) { - max_s[tid] = tmp; - max_l[tid] = colId; - } - } - __syncthreads(); - - for (int stride = blockSize/2; stride > 0; stride = stride/2) { - if (tid < stride) { - if (max_s[tid] < max_s[tid + stride]) { - max_s[tid] = max_s[tid + stride]; - max_l[tid] = max_l[tid + stride]; - } - } - __syncthreads(); - } - __syncthreads(); - - if (tid == 0) { - out_C[rowId] = (max_l[0] == in_B[rowId] ? 0 : 1.0f); - } -} - -void hl_matrix_classification_error(real* A_d, - int* B_d, - real* C_d, - int dimM, - int dimN) { - CHECK_NOTNULL(A_d); - CHECK_NOTNULL(B_d); - CHECK_NOTNULL(C_d); - - // each sample is calculated by one block - KeMatrixClassificationError<1024><<< dimM, 1024, 0, STREAM_DEFAULT >>> - (A_d, B_d, C_d, dimN); - CHECK_SYNC("hl_matrix_classification_error"); -} - __global__ void KeMatrixMultiBinaryCrossEntropy(real* output, real* entropy, int* row, diff --git a/paddle/cuda/src/hl_top_k.cu b/paddle/cuda/src/hl_top_k.cu index f0ef0cc3c51f9e7935dc3c40f630e4d70960802a..4f0bbfcf4e3aa51dd06acf254af65c62098a1df7 100644 --- a/paddle/cuda/src/hl_top_k.cu +++ b/paddle/cuda/src/hl_top_k.cu @@ -384,3 +384,81 @@ void hl_sparse_matrix_top_k(real* topVal, int ldv, CHECK_SYNC("hl_sparse_matrix_top_k failed"); } +/** + * Each block compute one sample. + * In a block: + * 1. every thread get top maxLength value; + * 2. merge to shTopK, block reduce and get max value; + * 3. go to the second setp, until one thread's topK value is null; + * 4. go to the first setp, until get the topK value. 
+/**
+ * Each block computes one sample.
+ * Within a block:
+ * 1. every thread gets its top maxLength values;
+ * 2. merge them into shTopK and block-reduce to get the maximum;
+ * 3. go back to step 2 until one thread's topK values are exhausted;
+ * 4. go back to step 1 until all top-k values have been found.
+ */
+template <int maxLength, int blockSize>
+__global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
+                                                int* topIds,
+                                                real* src, int lds,
+                                                int dim,
+                                                int beamSize,
+                                                int* label,
+                                                real* recResult) {
+  __shared__ Pair shTopK[blockSize];
+  __shared__ int maxId[blockSize / 2];
+  const int tid = threadIdx.x;
+  const int warp = threadIdx.x / 32;
+  src += blockIdx.x * lds;
+  topVal += blockIdx.x * ldv;
+  topIds += blockIdx.x * beamSize;
+
+  Pair topK[maxLength];  // NOLINT
+  int beam = maxLength;
+  Pair max;
+  bool isEmpty = false;
+  bool firstStep = true;
+  int topkSize = beamSize;
+
+  for (int k = 0; k < maxLength; k++) {
+    topK[k].set(-HL_FLOAT_MAX, -1);
+  }
+
+  while (beamSize) {
+    threadGetTopK<maxLength, blockSize>
+        (topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
+
+    shTopK[tid] = topK[0];
+    blockReduce<maxLength, blockSize>
+        (shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
+  }
+
+  __syncthreads();
+  if (tid == 0) {
+    for (int i = 0; i < topkSize; i++) {
+      if (*--topIds == label[blockIdx.x]) {
+        recResult[blockIdx.x] = 0;
+        break;
+      }
+      recResult[blockIdx.x] = 1.0f;
+    }
+  }
+}
+
+void hl_matrix_classification_error(real* topVal, int ldv,
+                                    int* topIds,
+                                    real* src, int lds,
+                                    int dim,
+                                    int topkSize,
+                                    int numSamples,
+                                    int* label,
+                                    real* recResult) {
+  CHECK_NOTNULL(topVal);
+  CHECK_NOTNULL(topIds);
+  CHECK_NOTNULL(src);
+
+  if (topkSize > dim) topkSize = dim;
+
+  dim3 threads(256, 1);
+  dim3 grid(numSamples, 1);
+  KeMatrixTopKClassificationError<5, 256>
+      <<< grid, threads, 0, STREAM_DEFAULT >>>
+      (topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult);
+
+  CHECK_SYNC("hl_matrix_top_k classification error failed");
+}
diff --git a/paddle/cuda/src/hl_warpctc_wrap.cc b/paddle/cuda/src/hl_warpctc_wrap.cc
index 55b940ca67acce2c7ee7c1ee286ab96240652274..f57efb2b46797c303d99a5468ad96163a3e74972 100644
--- a/paddle/cuda/src/hl_warpctc_wrap.cc
+++ b/paddle/cuda/src/hl_warpctc_wrap.cc
@@ -54,22 +54,26 @@ DYNAMIC_LOAD_WARPCTC_WRAP(get_workspace_size)
 #define WARPCTC_GET_VERSION dynload::get_warpctc_version
 #define WARPCTC_GET_STATUS_STRING dynload::ctcGetStatusString
 
+static int g_warpctcVersion = -1;
 #ifndef PADDLE_TYPE_DOUBLE
 #define WARPCTC_COMPUTE_LOSS dynload::compute_ctc_loss
 #define WARPCTC_GET_WORKSPACE_SIZE dynload::get_workspace_size
 #else
-#define WARPCTC_LOG_FATAL                                \
-  LOG(FATAL) << "warp-ctc [version " << g_warpctcVersion \
-             << "] Error: not support double precision."
-#define WARPCTC_COMPUTE_LOSS(...) WARPCTC_LOG_FATAL(__VA_ARGS__)
-#define WARPCTC_GET_WORKSPACE_SIZE(...) WARPCTC_LOG_FATAL(__VA_ARGS__)
+hl_warpctc_status_t fatal(...) {
+  LOG(FATAL) << "warp-ctc [version " << g_warpctcVersion
+             << "] Error: double precision is not supported.";
+  // Both compute_ctc_loss() and get_workspace_size() return a ctcStatus_t
+  // value, so this stand-in must do the same.
+  return CTC_STATUS_EXECUTION_FAILED;
+}
+#define WARPCTC_COMPUTE_LOSS fatal
+#define WARPCTC_GET_WORKSPACE_SIZE fatal
 #endif
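// Editor's note, not part of the patch; the call shapes below are
// hypothetical illustrations. Because fatal() is variadic, it can stand in
// for either wrapped entry point regardless of its argument list, and unlike
// the removed statement-style macro it still yields a ctcStatus_t inside an
// expression, so both of these compile under PADDLE_TYPE_DOUBLE and trip
// LOG(FATAL) at runtime:
//
//   CHECK_WARPCTC(WARPCTC_GET_WORKSPACE_SIZE(&sizeBytes, options));
//   CHECK_WARPCTC(WARPCTC_COMPUTE_LOSS(activations, gradients, labels));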
 /**
  * Check build-in warp-ctc function using glog and it also
  * support << operator for more details error info.
  */
-static int g_warpctcVersion = -1;
 #define CHECK_WARPCTC(warpctcStat)                \
   CHECK_EQ(CTC_STATUS_SUCCESS, warpctcStat)       \
       << "warp-ctc [version " << g_warpctcVersion \
diff --git a/paddle/function/CosSimOpGpu.cu b/paddle/function/CosSimOpGpu.cu
index 1dd733674fa0542c76070955ec63e008b083c7d2..c62ab39551f02288618244871ae31c6800df5b42 100644
--- a/paddle/function/CosSimOpGpu.cu
+++ b/paddle/function/CosSimOpGpu.cu
@@ -92,7 +92,6 @@ void CosSimForward(GpuMatrix& out_mat,
   CHECK(in1_mat.useGpu_ == true && in2_mat.useGpu_ == true)
       << "Matrix type are not GPU";
 
-  size_t num_samples = out_mat.getHeight();
   size_t dim = in1_mat.getWidth();
   real* out = out_mat.getData();
   const real* x = in1_mat.getData();
diff --git a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
index 05aa6c012ae2bc0afcbaf23f8ff78b3c782d050c..132119015f967c6e8d055792de8afe8450df5ec6 100644
--- a/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
+++ b/paddle/gserver/evaluators/CTCErrorEvaluator.cpp
@@ -20,7 +20,7 @@ namespace paddle {
 /**
  * calculate sequence-to-sequence edit distance
  */
-class CTCErrorEvaluator : public Evaluator {
+class CTCErrorEvaluator : public NotGetableEvaluator {
 private:
   MatrixPtr outActivations_;
   int numTimes_, numClasses_, numSequences_, blank_;
diff --git a/paddle/gserver/evaluators/Evaluator.cpp b/paddle/gserver/evaluators/Evaluator.cpp
index ae7508e2bb117a60492e0c28230f2fbb4b14915e..9db6d252d97bfeee3fe376bcda431fe94c65a678 100644
--- a/paddle/gserver/evaluators/Evaluator.cpp
+++ b/paddle/gserver/evaluators/Evaluator.cpp
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/gserver/evaluators/Evaluator.h"
-#include "paddle/utils/Stat.h"
-
 #include "paddle/gserver/gradientmachines/NeuralNetwork.h"
+#include "paddle/utils/Stat.h"
+#include "paddle/utils/StringUtil.h"
 
 DECLARE_int32(trainer_id);
 
@@ -39,6 +39,14 @@ void Evaluator::eval(const NeuralNetwork& nn) {
  */
 class ClassificationErrorEvaluator : public Evaluator {
 public:
+  /*
+  ClassificationErrorEvaluator() : totalScore2_(0) {}
+
+  virtual void start() {
+    Evaluator::start();
+    totalScore2_ = 0;
+  } */
+
   virtual void updateSamplesNum(const std::vector<Argument>& arguments) {
     if (3 == arguments.size()) {
       numSamples_ += arguments[2].value->getSum();
@@ -76,9 +84,11 @@ public:
                              1,
                              /* trans= */ false,
                              useGpu(arguments[0].deviceId));
+    errorMat->zeroMem();
+
     if (label != nullptr) {
-      errorMat->classificationError(*output, *label);
+      errorMat->classificationError(*output, *label, config_.top_k());
     } else if (dynamic_cast<CpuSparseMatrix*>(multiBinaryLabel.get()) ||
               dynamic_cast<GpuSparseMatrix*>(multiBinaryLabel.get())) {
       errorMat->classificationErrorMulti(
@@ -94,6 +104,16 @@ public:
     return errorMat;
   }
 
+  void printStats(std::ostream& os) const {
+    if (config_.top_k() == 1) {
+      os << config_.name() << "="
+         << (numSamples_ ? totalScore_ / numSamples_ : 0);
+    } else {
+      os << " top_" << config_.top_k()
+         << "_error=" << (numSamples_ ? totalScore_ / numSamples_ : 0);
+    }
+  }
+
   virtual real evalImp(std::vector<Argument>& arguments) {
     MatrixPtr errorMat = calcError(arguments);
     return errorMat->getSum();
@@ -102,6 +122,10 @@ public:
   virtual void distributeEval(ParameterClient2* client) {
     mergeResultsOfAllClients(client);
   }
+
+  // Evaluator interface
+protected:
+  std::string getTypeImpl() const { return "classification_error"; }
 };
 
 /**
@@ -140,6 +164,10 @@ public:
   virtual void distributeEval(ParameterClient2* client) {
     mergeResultsOfAllClients(client);
   }
+
+  // Evaluator interface
+protected:
+  std::string getTypeImpl() const { return "seq_classification_error"; }
 };
 REGISTER_EVALUATOR(seq_classification_error,
                    SequenceClassificationErrorEvaluator);
@@ -230,6 +258,10 @@ public:
 private:
   IVectorPtr cpuLabel_;
   MatrixPtr cpuWeight_;
+
+  // Evaluator interface
+protected:
+  std::string getTypeImpl() const { return "sum"; }
 };
 /**
  * @brief column sum Evaluator
@@ -337,10 +369,18 @@ public:
   }
 
 private:
-  ColumnSumEvaluator() {}
   int32_t colIdx_;
   size_t colNum_;
   MatrixPtr sum_; /* cpu matrix */
+
+  // Evaluator interface
+protected:
+  std::string getTypeImpl() const {
+    if (colIdx_ == -1)
+      return "last-column-sum";
+    else
+      return "column-sum";
+  }
 };
 
 void AucEvaluator::start() {
@@ -449,6 +489,16 @@ double AucEvaluator::calcAuc() const {
   }
 }
 
+real AucEvaluator::getValueImpl() const { return calcAuc(); }
+
+std::string AucEvaluator::getTypeImpl() const {
+  if (colIdx_ == -1) {
+    return "last-column-auc";
+  } else {
+    return "auc";
+  }
+}
+
 // class RankAucEvaluator
 REGISTER_EVALUATOR(rankauc, RankAucEvaluator);
 
@@ -528,12 +578,15 @@ double RankAucEvaluator::calcRankAuc(real* outputData,
                       : aucTmp / (clickSum * noClickSum);
 }
 
+std::string RankAucEvaluator::getTypeImpl() const { return "rankauc"; }
+
 // class PrecisionRecallEvaluator
 REGISTER_EVALUATOR(precision_recall, PrecisionRecallEvaluator);
 
 void PrecisionRecallEvaluator::start() {
   Evaluator::start();
   statsInfo_.clear();
+  values_.clear();
 }
 
 real PrecisionRecallEvaluator::evalImp(std::vector<Argument>& arguments) {
@@ -594,52 +647,23 @@ real PrecisionRecallEvaluator::evalImp(std::vector<Argument>& arguments) {
 }
 
 void PrecisionRecallEvaluator::printStats(std::ostream& os) const {
-  int label = config_.positive_label();
-  if (label != -1) {
-    CHECK(label >= 0 && label < (int)statsInfo_.size())
-        << "positive_label [" << label << "] should be in range [0, "
-        << statsInfo_.size() << ")";
-    double precision =
-        calcPrecision(statsInfo_[label].TP, statsInfo_[label].FP);
-    double recall = calcRecall(statsInfo_[label].TP, statsInfo_[label].FN);
-    os << "positive_label=" << label << " precision=" << precision
-       << " recall=" << recall
-       << " F1-score=" << calcF1Score(precision, recall);
-    return;
-  }
-
-  // micro average method: precision = (TP1+TP2)/(TP1+FP1+TP2+FP2)
-  // macro average method: precision = (precision1+precision2)/2
-  double microTotalTP = 0;
-  double microTotalFP = 0;
-  double microTotalFN = 0;
-  double macroAvgPrecision = 0;
-  double macroAvgRecall = 0;
-  size_t numLabels = statsInfo_.size();
-  for (size_t i = 0; i < numLabels; ++i) {
-    microTotalTP += statsInfo_[i].TP;
-    microTotalFP += statsInfo_[i].FP;
-    microTotalFN += statsInfo_[i].FN;
-    macroAvgPrecision += calcPrecision(statsInfo_[i].TP, statsInfo_[i].FP);
-    macroAvgRecall += calcRecall(statsInfo_[i].TP, statsInfo_[i].FN);
-  }
-  macroAvgPrecision /= numLabels;
-  macroAvgRecall /= numLabels;
-  double macroAvgF1Score = calcF1Score(macroAvgPrecision, macroAvgRecall);
-  os << "macro-average-precision=" << macroAvgPrecision
-     << " macro-average-recall=" << macroAvgRecall
-     << " macro-average-F1-score=" << macroAvgF1Score;
-
-  double microAvgPrecision = calcPrecision(microTotalTP, microTotalFP);
-  double microAvgRecall = calcPrecision(microTotalTP, microTotalFN);
-  double microAvgF1Score = calcF1Score(microAvgPrecision, microAvgRecall);
-  if (!isMultiBinaryLabel_) {
-    // precision and recall are equal in this case
-    os << " micro-average-precision=" << microAvgPrecision;
-  } else {
-    os << " micro-average-precision=" << microAvgPrecision
-       << " micro-average-recall=" << microAvgRecall
-       << " micro-average-F1-score=" << microAvgF1Score;
+  PrintStatsInfo info;
+  bool containMacroMicroInfo = getStatsInfo(&info);
+  os << "positive_label=" << config_.positive_label()
+     << " precision=" << info.precision << " recall=" << info.recall
+     << " F1-score=" << info.f1;
+  if (containMacroMicroInfo) {
+    os << "macro-average-precision=" << info.macroAvgPrecision
+       << " macro-average-recall=" << info.macroAvgRecall
+       << " macro-average-F1-score=" << info.macroAvgF1Score;
+    if (!isMultiBinaryLabel_) {
+      // precision and recall are equal in this case
+      os << " micro-average-precision=" << info.microAvgPrecision;
+    } else {
+      os << " micro-average-precision=" << info.microAvgPrecision
+         << " micro-average-recall=" << info.microAvgRecall
+         << " micro-average-F1-score=" << info.microAvgF1Score;
+    }
   }
 }
 
@@ -721,6 +745,60 @@ void PrecisionRecallEvaluator::calcStatsInfoMulti(const MatrixPtr& output,
   }
 }
 
+void PrecisionRecallEvaluator::storeLocalValues() const {
+  if (this->values_.size() == 0) {
+    PrintStatsInfo info;
+    bool containMacroMicroInfo = getStatsInfo(&info);
+    values_["precision"] = info.precision;
+    values_["recall"] = info.recall;
+    values_["F1-score"] = info.f1;
+    if (containMacroMicroInfo) {
+      values_["macro-average-precision"] = info.macroAvgPrecision;
+      values_["macro-average-recall"] = info.macroAvgRecall;
+      values_["macro-average-F1-score"] = info.macroAvgF1Score;
+      if (!isMultiBinaryLabel_) {
+        // precision and recall are equal in this case
+        values_["micro-average-precision"] = info.microAvgPrecision;
+      } else {
+        values_["micro-average-precision"] = info.microAvgPrecision;
+        values_["micro-average-recall"] = info.microAvgRecall;
+        values_["micro-average-F1-score"] = info.microAvgF1Score;
+      }
+    }
+  }
+}
+
+void PrecisionRecallEvaluator::getNames(std::vector<std::string>* names) {
+  this->storeLocalValues();
+  names->reserve(this->values_.size());
+  for (auto it = this->values_.begin(); it != this->values_.end(); ++it) {
+    names->push_back(this->config_.name() + "." + it->first);
+  }
+}
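Taken together, storeLocalValues() and getNames() let a caller harvest every precision/recall field generically instead of parsing printStats() output. A hedged usage sketch, an editor's illustration assuming an initialized `evaluator` pointer; the calls themselves are the ones added in this patch:

std::vector<std::string> names;
evaluator->getNames(&names);
for (const auto& name : names) {
  paddle::Error err;
  real value = evaluator->getValue(name, &err);
  if (err.isOK()) {
    // getType() resolves which concrete evaluator produced this field.
    LOG(INFO) << name << " [" << evaluator->getType(name, &err)
              << "] = " << value;
  }
}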
+
+real PrecisionRecallEvaluator::getValue(const std::string& name,
+                                        Error* err) const {
+  this->storeLocalValues();
+  std::vector<std::string> buffers;
+  paddle::str::split(name, '.', &buffers);
+  auto it = this->values_.find(buffers[buffers.size() - 1]);
+  if (it == this->values_.end()) {  // not found
+    *err = Error("No such key %s", name.c_str());
+    return .0f;
+  }
+
+  return it->second;
+}
+
+std::string PrecisionRecallEvaluator::getType(const std::string& name,
+                                              Error* err) const {
+  this->getValue(name, err);
+  if (!err->isOK()) {
+    return "";
+  }
+  return "precision_recall";
+}
+
 void PrecisionRecallEvaluator::distributeEval(ParameterClient2* client) {
   size_t size = 4 * statsInfo_.size();
   double* buf = new double[size];
@@ -740,6 +818,47 @@ void PrecisionRecallEvaluator::distributeEval(ParameterClient2* client) {
   delete[] buf;
 }
 
+bool PrecisionRecallEvaluator::getStatsInfo(
+    PrecisionRecallEvaluator::PrintStatsInfo* info) const {
+  int label = config_.positive_label();
+  if (label != -1) {
+    CHECK(label >= 0 && label < (int)statsInfo_.size())
+        << "positive_label [" << label << "] should be in range [0, "
+        << statsInfo_.size() << ")";
+    info->precision = calcPrecision(statsInfo_[label].TP, statsInfo_[label].FP);
+    info->recall = calcRecall(statsInfo_[label].TP, statsInfo_[label].FN);
+    info->f1 = calcF1Score(info->precision, info->recall);
+    return false;
+  }
+
+  // micro average method: precision = (TP1+TP2)/(TP1+FP1+TP2+FP2)
+  // macro average method: precision = (precision1+precision2)/2
+  double microTotalTP = 0;
+  double microTotalFP = 0;
+  double microTotalFN = 0;
+  info->macroAvgPrecision = 0;
+  info->macroAvgRecall = 0;
+  size_t numLabels = statsInfo_.size();
+  for (size_t i = 0; i < numLabels; ++i) {
+    microTotalTP += statsInfo_[i].TP;
+    microTotalFP += statsInfo_[i].FP;
+    microTotalFN += statsInfo_[i].FN;
+    info->macroAvgPrecision +=
+        calcPrecision(statsInfo_[i].TP, statsInfo_[i].FP);
+    info->macroAvgRecall += calcRecall(statsInfo_[i].TP, statsInfo_[i].FN);
+  }
+  info->macroAvgPrecision /= numLabels;
+  info->macroAvgRecall /= numLabels;
+  info->macroAvgF1Score =
+      calcF1Score(info->macroAvgPrecision, info->macroAvgRecall);
+
+  info->microAvgPrecision = calcPrecision(microTotalTP, microTotalFP);
+  info->microAvgRecall = calcPrecision(microTotalTP, microTotalFN);
+  info->microAvgF1Score =
+      calcF1Score(info->microAvgPrecision, info->microAvgRecall);
+  return true;
+}
+
 REGISTER_EVALUATOR(pnpair, PnpairEvaluator);
 void PnpairEvaluator::start() {
   Evaluator::start();
@@ -864,56 +983,35 @@ void PnpairEvaluator::calc(std::vector<PredictionResult>& predictArray) {
             << " calc total special pair: " << special;
 }
 
+std::string PnpairEvaluator::getTypeImpl() const { return "pnpair"; }
+
 ClassRegistrar<Evaluator> Evaluator::registrar_;
 Evaluator* Evaluator::create(const EvaluatorConfig& config) {
-  Evaluator* evaluator = nullptr;
-  if (config.type() == "classification_error") {
-    evaluator = new ClassificationErrorEvaluator();
-  } else if (config.type() == "sum") {
-    evaluator = new SumEvaluator();
-  } else if (config.type() == "last-column-sum") {
-    evaluator = new ColumnSumEvaluator(-1);
-  } else if (config.type() == "last-column-auc") {
-    evaluator = new AucEvaluator(-1);
-  } else {
-    evaluator = registrar_.createByType(config.type());
-  }
+  Evaluator* evaluator = registrar_.createByType(config.type());
   evaluator->init(config);
   return evaluator;
 }
+
+REGISTER_EVALUATOR(classification_error, ClassificationErrorEvaluator);
+REGISTER_EVALUATOR(sum, SumEvaluator);
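With create() reduced to a registrar lookup, adding an evaluator type no longer means editing the factory. An editor's sketch with a hypothetical MyEvaluator, not part of the patch; REGISTER_EVALUATOR covers default-constructible types, while the InitFunction block below handles constructors that take arguments:

// Hypothetical: expose MyEvaluator under the config type "my_metric".
REGISTER_EVALUATOR(my_metric, MyEvaluator);
// Equivalent explicit form, needed when constructor arguments are involved:
// Evaluator::registrar_.registerClass("my_metric",
//                                     [] { return new MyEvaluator(); });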
+static InitFunction __reg_type_auc_sum__([]() {
+  Evaluator::registrar_.registerClass(
+      "last-column-sum", [] { return new ColumnSumEvaluator(-1); });
+  Evaluator::registrar_.registerClass("last-column-auc",
+                                      [] { return new AucEvaluator(-1); });
+});
+
 /**
  * @brief print value of each layer.
  *
  * The config file api is value_printer_evaluator.
  */
-class ValuePrinter : public Evaluator {
+class ValuePrinter : public NotGetableEvaluator {
 public:
-  ValuePrinter() {}
-
   virtual void eval(const NeuralNetwork& nn) {
     for (const std::string& name : config_.input_layers()) {
-      const Argument& argu = nn.getLayer(name)->getOutput();
-      if (argu.value) {
-        std::ostringstream os;
-        argu.value->print(os);
-        LOG(INFO) << "layer=" << name << " value matrix:\n" << os.str();
-      }
-      if (argu.ids) {
-        std::ostringstream os;
-        argu.ids->print(os, argu.ids->getSize());
-        LOG(INFO) << "layer=" << name << " ids vector:\n" << os.str();
-      }
-      if (auto startPos = argu.sequenceStartPositions) {
-        std::ostringstream os;
-        startPos->getVector(false)->print(os, startPos->getSize());
-        LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str();
-      }
-      if (auto subStartPos = argu.subSequenceStartPositions) {
-        std::ostringstream os;
-        subStartPos->getVector(false)->print(os, subStartPos->getSize());
-        LOG(INFO) << "layer=" << name << " sub-sequence pos vector:\n"
-                  << os.str();
-      }
+      nn.getLayer(name)->getOutput().printValueString(LOG(INFO),
+                                                      "layer=" + name + " ");
     }
   }
 
@@ -922,15 +1020,14 @@ public:
   virtual real evalImp(std::vector<Argument>& arguments) { return 0; }
 };
 REGISTER_EVALUATOR(value_printer, ValuePrinter);
+
 /**
  * @brief print gradient of each layer.
  *
  * The config file api is gradient_printer_evaluator.
  */
-class GradientPrinter : public Evaluator {
+class GradientPrinter : public NotGetableEvaluator {
 public:
-  GradientPrinter() {}
-
   virtual void eval(const NeuralNetwork& nn) {
     for (const std::string& name : config_.input_layers()) {
       const Argument& argu = nn.getLayer(name)->getOutput();
@@ -939,11 +1036,6 @@ public:
         argu.grad->print(os);
         LOG(INFO) << "layer=" << name << " grad matrix:\n" << os.str();
       }
-      if (auto startPos = argu.sequenceStartPositions) {
-        std::ostringstream os;
-        startPos->getVector(false)->print(os, startPos->getSize());
-        LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str();
-      }
     }
   }
 
@@ -957,7 +1049,7 @@ REGISTER_EVALUATOR(gradient_printer, GradientPrinter);
  *
  * The config file api is maxid_printer_evaluator.
  */
-class MaxIdPrinter : public Evaluator {
+class MaxIdPrinter : public NotGetableEvaluator {
 private:
   IVectorPtr maxIds_;
   MatrixPtr maxValues_;
@@ -999,7 +1091,7 @@ REGISTER_EVALUATOR(max_id_printer, MaxIdPrinter);
  *
  * The config file api is maxframe_printer_evaluator.
  */
-class MaxFramePrinter : public Evaluator {
+class MaxFramePrinter : public NotGetableEvaluator {
 private:
   IVectorPtr maxIds_;
   MatrixPtr maxValues_;
@@ -1086,7 +1178,7 @@ REGISTER_EVALUATOR(max_frame_printer, MaxFramePrinter);
  * The config file api is seqtext_printer_evaluator.
 *
 */
-class SequenceTextPrinter : public Evaluator {
+class SequenceTextPrinter : public NotGetableEvaluator {
 private:
   /// dict_file, which contains a list of tokens
   std::vector<std::string> dict_;
@@ -1253,4 +1345,6 @@ public:
 };
 REGISTER_EVALUATOR(classification_error_printer, ClassificationErrorPrinter);
 
+std::string DummyEvaluator::getTypeImpl() const { return "dummy"; }
+
 }  // namespace paddle
diff --git a/paddle/gserver/evaluators/Evaluator.h b/paddle/gserver/evaluators/Evaluator.h
index 5770847309670ef1856cfb9255fa847c24513b56..b114500e2b7c1e460a02c78b99b5f1a8fb63b8c3 100644
--- a/paddle/gserver/evaluators/Evaluator.h
+++ b/paddle/gserver/evaluators/Evaluator.h
@@ -19,6 +19,7 @@ limitations under the License. */
 #include "paddle/parameter/Argument.h"
 #include "paddle/pserver/ParameterClient2.h"
 #include "paddle/utils/ClassRegistrar.h"
+#include "paddle/utils/Error.h"
 
 namespace paddle {
 
@@ -117,12 +118,105 @@ public:
 
   static ClassRegistrar<Evaluator> registrar_;
 
+  /**
+   * @brief getNames will return all field names of the current evaluator.
+   *
+   * The format of each name is `evaluator_name.evaluator_field`. If the
+   * evaluator has multiple fields, the names are `evaluator_name.field1` and
+   * so on. For example, the PrecisionRecallEvaluator contains `precision`
+   * and `recall` fields, so getNames will return
+   * `precision_recall_evaluator.precision`,
+   * `precision_recall_evaluator.recall`, etc.
+   *
+   * Also, if the current evaluator is a combined evaluator, getNames will
+   * return the names of all evaluators inside the combined evaluator.
+   *
+   * @param names [out]: the field names of the current evaluator.
+   * @note Never clear the names parameter inside getNames.
+   */
+  virtual void getNames(std::vector<std::string>* names) {
+    names->push_back(config_.name());
+  }
+
+  /**
+   * @brief getValue will return the current evaluated value of one field.
+   *
+   * @param name: The field name of the current evaluator.
+   * @param err [out]: The error state.
+   *
+   * @return The evaluated value (metric).
+   */
+  virtual real getValue(const std::string& name, Error* err) const {
+    if (name != config_.name()) {
+      *err = Error("no such name of evaluator %s", name.c_str());
+      return .0f;
+    }
+    return this->getValueImpl();
+  }
+
+  /**
+   * @brief getType will return the evaluator type by field name.
+   *
+   * The evaluator type is a string such as 'auc' or 'precision_recall'. In a
+   * combined evaluator, different names may yield different types, because
+   * each field could be computed by a different evaluator inside.
+   *
+   * @param name: The field name of the current evaluator.
+   * @param err [out]: The error state.
+   * @return the evaluator type string.
+   */
+  virtual std::string getType(const std::string& name, Error* err) const {
+    if (name != config_.name()) {
+      *err = Error("no such name of evaluator %s", name.c_str());
+      return std::string();
+    }
+    return this->getTypeImpl();
+  }
+
+protected:
+  /**
+   * @brief getValueImpl is the simplest way to define the getValue result.
+   * If this evaluator contains a single field and does not raise any error,
+   * just implement this method to produce the evaluated result (metric).
+   * @return Evaluated result (metric).
+   */
+  virtual real getValueImpl() const {
+    return numSamples_ != .0 ? totalScore_ / numSamples_ : .0;
+  }
+
+  /**
+   * @brief getTypeImpl is the simplest way to define the getType result. If
+   * this evaluator does not combine other evaluators, getType should simply
+   * return its own type.
+   * @return Evaluator type.
+   */
+  virtual std::string getTypeImpl() const { return "base"; }
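// Editor's note, not part of the patch: for implementers, the two hooks
// above mean a single-field evaluator is complete once it overrides
// getTypeImpl(); getNames()/getValue()/getType() then expose one field named
// config_.name(), valued totalScore_ / numSamples_. Hypothetical sketch:
//
//   class MyCountEvaluator : public Evaluator {
//   protected:
//     std::string getTypeImpl() const { return "my_count"; }
//   };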
+
 protected:
   EvaluatorConfig config_;
   double numSamples_;
   double totalScore_;
 };
 
+/**
+ * @brief The NotGetableEvaluator class is the base class of evaluators whose
+ * values cannot be retrieved at runtime. Most of them are printer
+ * evaluators, which are only used to debug a network configuration.
+ */
+class NotGetableEvaluator : public Evaluator {
+  // Evaluator interface
+public:
+  void getNames(std::vector<std::string>* names) {}
+
+  real getValue(const std::string& name, Error* err) const {
+    *err = Error("Not implemented");
+    return .0f;
+  }
+  std::string getType(const std::string& name, Error* err) const {
+    *err = Error("Not implemented");
+    return "";
+  }
+};
+
 class DummyEvaluator : public Evaluator {
 public:
   DummyEvaluator() {}
@@ -135,6 +229,10 @@ public:
   }
   virtual void finish() {}
   virtual void printStats(std::ostream&) const {}
+
+  // Evaluator interface
+protected:
+  std::string getTypeImpl() const;
 };
 /**
  * @brief evaluate AUC using colIdx-th column as prediction.
@@ -191,6 +289,11 @@ private:
   }
 
   double calcAuc() const;
+
+  // Evaluator interface
+protected:
+  real getValueImpl() const;
+  std::string getTypeImpl() const;
 };
 
 /**
@@ -223,6 +326,10 @@ private:
                       real* clickData,
                       real* pvData,
                       size_t size);
+
+  // Evaluator interface
+protected:
+  std::string getTypeImpl() const;
 };
 /**
  * @brief precision, recall and f1 score Evaluator
@@ -272,6 +379,20 @@ private:
   IVectorPtr cpuLabel_;
   MatrixPtr cpuWeight_;
 
+  struct PrintStatsInfo {
+    double precision;
+    double recall;
+    double f1;
+    double macroAvgPrecision;
+    double macroAvgRecall;
+    double macroAvgF1Score;
+    double microAvgPrecision;
+    double microAvgRecall;
+    double microAvgF1Score;
+  };
+
+  bool getStatsInfo(PrintStatsInfo* info) const;
+
   void calcStatsInfo(const MatrixPtr& output,
                      const IVectorPtr& label,
                      const MatrixPtr& weight);
@@ -303,6 +424,15 @@ private:
       return 0;
     }
   }
+
+  mutable std::unordered_map<std::string, real> values_;
+
+  void storeLocalValues() const;
+  // Evaluator interface
+public:
+  void getNames(std::vector<std::string>* names);
+  real getValue(const std::string& name, Error* err) const;
+  std::string getType(const std::string& name, Error* err) const;
 };
 
 /*
@@ -349,8 +479,7 @@ public:
   virtual void finish() { calc(predictArray_); }
 
   virtual void printStats(std::ostream& os) const {
-    os << " pos/neg"
-       << "=" << pairArray_[0] / ((pairArray_[1] <= 0) ? 1.0 : pairArray_[1]);
+    os << " pos/neg=" << this->getValueImpl();
   }
 
   virtual void distributeEval(ParameterClient2* client) {
@@ -366,6 +495,13 @@ private:
   IVectorPtr cpuLabel_;
   IVectorPtr cpuInfo_;
   MatrixPtr cpuWeight_;
+
+  // Evaluator interface
+protected:
+  real getValueImpl() const {
+    return pairArray_[0] / ((pairArray_[1] <= 0) ?
1.0 : pairArray_[1]); + } + std::string getTypeImpl() const; }; } // namespace paddle diff --git a/paddle/gserver/gradientmachines/GradientMachine.h b/paddle/gserver/gradientmachines/GradientMachine.h index 0829968d87c5dc7eeb2d1b70c758ff305d89496f..bc2f2f8563526aa045ea89f15152ee2d639b5774 100644 --- a/paddle/gserver/gradientmachines/GradientMachine.h +++ b/paddle/gserver/gradientmachines/GradientMachine.h @@ -134,6 +134,10 @@ public: backward(callback); } + virtual Argument getLayerOutput(const std::string& layerName) { + return *((Argument*)nullptr); + } + // see comment in Layer.h for the function with the same name virtual void resetState() {} diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp index 80f223824d8dccfb0e9386f4c076b28f9332a958..6ae60102b3e431727c0954e8b8073bfe0534f8ee 100644 --- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp @@ -24,9 +24,6 @@ limitations under the License. */ DEFINE_bool(allow_only_one_model_on_one_gpu, true, "If true, do not allow multiple models on one GPU device"); -#ifdef PADDLE_METRIC_LEARNING -DECLARE_bool(external); -#endif namespace paddle { @@ -45,11 +42,7 @@ MultiGradientMachine::MultiGradientMachine(const ModelConfig& config, trainerBarrier_(FLAGS_trainer_count), allBarrier_(FLAGS_trainer_count + 1), inArgsCopied_(false) { -#ifdef PADDLE_METRIC_LEARNING - isPassGrad_ = FLAGS_external; -#else isPassGrad_ = false; -#endif numThreads_ = FLAGS_trainer_count; if (useGpu) { //! TODO(yuyang18): When useGpu=false && paddle is not compiled with gpu, @@ -282,6 +275,18 @@ void MultiGradientMachine::forwardBackward(const std::vector& inArgs, backwardImp(callback); } +Argument MultiGradientMachine::getLayerOutput(const std::string& layerName) { + std::vector args; + args.reserve(threads_.size()); + + for (auto& thread : threads_) { + args.push_back(thread->getGradientMachine()->getLayerOutput(layerName)); + } + outLayerArgs_.concat(args, false /* use_gpu */, outArgStream_, passType_); + + return outLayerArgs_; +} + void MultiGradientMachine::backwardImp(const UpdateCallback& callback) { for (size_t i = 0; i < parameters_.size(); i++) { if (!parameters_[i]->useGpu() || parameters_[i]->isStatic()) continue; @@ -334,7 +339,9 @@ Evaluator* MultiGradientMachine::makeEvaluator() const { void MultiGradientMachine::eval(Evaluator* evaluator) const { for (auto& thread : threads_) { SetDevice device(thread->getDeviceId()); - thread->getGradientMachine()->eval(evaluator); + if (thread->hasInputData()) { + thread->getGradientMachine()->eval(evaluator); + } } } @@ -344,14 +351,19 @@ void MultiGradientMachine::getOutArgs(std::vector* outArgs, REGISTER_TIMER("waitOutArgs"); thread->waitOutArgsReady(); } - outArgs_.resize(threads_[0]->getOutArgs().size()); + + outArgs_.resize(threads_[threads_.size() - 1]->getOutArgs().size()); REGISTER_TIMER("copyOutArgs"); for (size_t i = 0; i < outArgs_.size(); ++i) { std::vector args; args.reserve(threads_.size()); for (auto& thread : threads_) { - args.push_back(thread->getOutArgs()[i]); + // If the thread input is empty, then the output is empty. 
+ auto tmp = thread->getOutArgs(); + if (tmp.size() > 0) { + args.push_back(tmp[i]); + } } outArgs_[i].concat(args, useGpu_, outArgStream_, passType); } @@ -522,7 +534,7 @@ void TrainerThread::prefetch() { void TrainerThread::forward() { if (!inArgsCopied_) { REGISTER_TIMER("copyInArgs"); - copyInArgs(); + batchSize_ = copyInArgs(); } else { inArgsCopied_ = false; } @@ -552,7 +564,12 @@ void TrainerThread::forward() { { REGISTER_TIMER("thread_forward"); - gradientMachine_->forward(inArgs_, &outArgs_, multiMachine_->getPassType()); + if (batchSize_ > 0) { + gradientMachine_->forward( + inArgs_, &outArgs_, multiMachine_->getPassType()); + } else { + outArgs_.clear(); + } } outArgsReadySem_.post(); } @@ -562,7 +579,13 @@ void TrainerThread::backward() { if (multiMachine_->isPassGrad()) { copyOutputGrad(); } - gradientMachine_->backward(backwardCallback_); + if (batchSize_ > 0) { + gradientMachine_->backward(backwardCallback_); + } else { + for (size_t i = parameters_.size(); i > 0; i--) { + backwardCallback(parameters_[i - 1].get()); + } + } if (multiMachine_->hasNonstaticCpuParamters()) { mergeCpuGradients(); } @@ -720,7 +743,7 @@ void TrainerThread::notifyValueReady(int paramId) { notifyValueDispatch(paramId); } -void TrainerThread::copyInArgs() { +int TrainerThread::copyInArgs() { const std::vector& fullInArgs = multiMachine_->getInArgs(); int numThreads = multiMachine_->getAllThreads().size(); int32_t numSequences = fullInArgs[0].getNumSequences(); @@ -736,7 +759,7 @@ void TrainerThread::copyInArgs() { } if (copySize == 0) { - return; + return 0; } for (size_t i = 0; i < fullInArgs.size(); i++) { @@ -746,6 +769,7 @@ void TrainerThread::copyInArgs() { copySize, FLAGS_parallel_nn ? false : multiMachine_->useGpu()); } + return copySize; } void TrainerThread::mergeCpuGradients() { diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.h b/paddle/gserver/gradientmachines/MultiGradientMachine.h index 9be15ef4bcf34f26b7eceb9047252e537f20a4a8..70203bbb97fe79d72fbc6bd2b5d427cb1de7b61f 100644 --- a/paddle/gserver/gradientmachines/MultiGradientMachine.h +++ b/paddle/gserver/gradientmachines/MultiGradientMachine.h @@ -189,6 +189,8 @@ public: PassType passType, const UpdateCallback& callback); + virtual Argument getLayerOutput(const std::string& layerName); + virtual void onPassEnd(); virtual void finish(); @@ -314,6 +316,8 @@ protected: std::vector outArgs_; hl_stream_t outArgStream_; + Argument outLayerArgs_; + /// ParameterType which needs to be merged from each GPU std::vector mergeTypes_; int numDevices_; /* number of gpu devices */ @@ -383,6 +387,9 @@ public: /// copy the output gradient from the main GradientMachine. void copyOutputGrad(); + /// Whether the thread has input data. 
+ bool hasInputData() { return batchSize_ != 0; } + protected: void mergeCpuGradients(); @@ -403,7 +410,7 @@ protected: void copyGradToBufferThread(); void gradCollectThread(); - void copyInArgs(); + int copyInArgs(); void forward(); void backward(); void backwardCallback(Parameter* para); @@ -463,6 +470,7 @@ protected: /// indicate whether inArgs is copied before forward() bool inArgsCopied_; + int batchSize_; }; } // namespace paddle diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp index 22051e07ee0026bc3c44a8767e265a56b415b8e4..4512aacc81f86bf87fc9ea30adcf081327663f16 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp +++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp @@ -293,11 +293,10 @@ void NeuralNetwork::backward(const UpdateCallback& callback) { } } -MatrixPtr NeuralNetwork::getLayerOutput(const std::string& layerName) { - auto it = layerMap_.find(layerName); - CHECK(it != layerMap_.end()) << "Cannot find layer: " << layerName; - return it->second->getOutputValue(); +Argument NeuralNetwork::getLayerOutput(const std::string& layerName) { + return getLayer(layerName)->getOutput(); } + void NeuralNetwork::onPassEnd() { for (auto& layer : layers_) { layer->onPassEnd(); @@ -306,7 +305,6 @@ void NeuralNetwork::onPassEnd() { class CombinedEvaluator : public Evaluator { public: - CombinedEvaluator() {} void addEvaluator(std::unique_ptr&& evaluator) { evaluators_.emplace_back(std::move(evaluator)); } @@ -346,6 +344,55 @@ public: protected: std::vector> evaluators_; + + // Evaluator interface +public: + /** + * @brief getNames will return all inside evaluators' names. + * @param names [out]: return names. + */ + void getNames(std::vector* names) { + for (auto& eval : evaluators_) { + eval->getNames(names); + } + } + + /** + * @brief getValue could get all inside evaluators' value. + */ + real getValue(const std::string& name, Error* err) const { + return this->getMethodHelper( + name, err, [&name, err](const std::unique_ptr& eval) { + return eval->getValue(name, err); + }); + } + + /** + * @brief getType could get all inside evaluators' type. 
+ */ + std::string getType(const std::string& name, Error* err) const { + return this->getMethodHelper( + name, err, [&name, err](const std::unique_ptr& eval) { + return eval->getType(name, err); + }); + } + +private: + template + T getMethodHelper(const std::string& name, + Error* err, + const std::function&)>& + callback) const { + for (auto& eval : evaluators_) { + std::vector names; + eval->getNames(&names); + if (std::find(names.begin(), names.end(), name) != names.end()) { + return callback(eval); + } + } + *err = Error("No such key %s", name.c_str()); + return T(); + } }; Evaluator* NeuralNetwork::makeEvaluator() const { diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h index 25af4abcf81700e200feea806fa3daed19df1275..e7b6c438407e7eab6eab1f6ed496f35caa9f2177 100644 --- a/paddle/gserver/gradientmachines/NeuralNetwork.h +++ b/paddle/gserver/gradientmachines/NeuralNetwork.h @@ -87,7 +87,8 @@ public: virtual void backward(const UpdateCallback& callback = nullptr); - MatrixPtr getLayerOutput(const std::string& layerName); + virtual Argument getLayerOutput(const std::string& layerName); + const LayerPtr& getLayer(const std::string& layerName) const { auto it = layerMap_.find(layerName); CHECK(it != layerMap_.end()) << "Unknown layer " << layerName; diff --git a/paddle/gserver/layers/CRFDecodingLayer.cpp b/paddle/gserver/layers/CRFDecodingLayer.cpp index fdb46aba68e924480a6595b02c04ff4d1edd914d..191176ce985a8e12e33562f0cab73da6bbe667e6 100644 --- a/paddle/gserver/layers/CRFDecodingLayer.cpp +++ b/paddle/gserver/layers/CRFDecodingLayer.cpp @@ -24,7 +24,7 @@ bool CRFDecodingLayer::init(const LayerMap& layerMap, return false; } crf_.reset(new LinearChainCRF( - numClasses_, parameter_->getBuf(PARAMETER_VALUE)->getData(), nullptr)); + numClasses_, parameter_->getBuf(PARAMETER_VALUE)->getData())); return true; } diff --git a/paddle/gserver/layers/CRFLayer.cpp b/paddle/gserver/layers/CRFLayer.cpp index 02b7aaf17e89d889ca0030f9de2b5d7431a28fd3..0b544420097e9150f8489731b6379dea633e992c 100644 --- a/paddle/gserver/layers/CRFLayer.cpp +++ b/paddle/gserver/layers/CRFLayer.cpp @@ -42,6 +42,7 @@ bool CRFLayer::init(const LayerMap& layerMap, CHECK_EQ(parameters_[0]->getSize(), numClasses_ * (numClasses_ + 2)); parameter_ = parameters_[0]; + weight_.reset(new Weight(numClasses_ + 2, numClasses_, parameter_)); // We don't need sequenceStartPositions because each sample of output_ is // for the cost of one sequence. @@ -69,11 +70,7 @@ void CRFLayer::forward(PassType passType) { for (size_t i = 0; i < numSequences; ++i) { if (i >= crfs_.size()) { - crfs_.emplace_back(numClasses_, - parameter_->getBuf(PARAMETER_VALUE)->getData(), - parameter_->getBuf(PARAMETER_GRADIENT) - ? parameter_->getBuf(PARAMETER_GRADIENT)->getData() - : nullptr); + crfs_.emplace_back(numClasses_, weight_->getW()->getData()); } output_.value->getData()[i] = crfs_[i].forward(output.value->getData() + numClasses_ * starts[i], @@ -93,22 +90,25 @@ void CRFLayer::backward(const UpdateCallback& callback) { const int* starts = label.sequenceStartPositions->getData(false); int numSequences = label.sequenceStartPositions->getSize() - 1; + bool needWGrad = weight_->getWGrad() ? 
true : false; for (int i = 0; i < numSequences; ++i) { crfs_[i].backward(output.value->getData() + numClasses_ * starts[i], - output.grad->getData() + numClasses_ * starts[i], label.ids->getData() + starts[i], - starts[i + 1] - starts[i]); - if (weightLayer_) { - real weight = getInputValue(*weightLayer_)->getElement(i, 0); - MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i + 1]); - grad->mulScalar(weight); + starts[i + 1] - starts[i], + needWGrad); + real instanceWeight = weightLayer_ + ? getInputValue(*weightLayer_)->getElement(i, 0) + : real(1.0f); + instanceWeight *= coeff_; + + MatrixPtr grad = output.grad->subRowMatrix(starts[i], starts[i + 1]); + grad->add(*crfs_[i].getXGrad(), real(1.0f), instanceWeight); + if (needWGrad) { + weight_->getWGrad()->add( + *crfs_[i].getWGrad(), real(1.0f), instanceWeight); } } - if (coeff_ != real(1.0f)) { - output.grad->mulScalar(coeff_); - } - parameter_->incUpdate(callback); } diff --git a/paddle/gserver/layers/CRFLayer.h b/paddle/gserver/layers/CRFLayer.h index de36a85083b6b293fd2d8522ec279a38cc4f8be3..00ec13cede97401b4c8a308df6fac27e47692146 100644 --- a/paddle/gserver/layers/CRFLayer.h +++ b/paddle/gserver/layers/CRFLayer.h @@ -38,8 +38,9 @@ protected: size_t numClasses_; ParameterPtr parameter_; std::vector crfs_; - LayerPtr weightLayer_; // weight for each sequence - real coeff_; // weight for the layer + LayerPtr weightLayer_; // weight for each sequence + std::unique_ptr weight_; // parameters + real coeff_; // weight for the layer }; } // namespace paddle diff --git a/paddle/gserver/layers/CosSimLayer.cpp b/paddle/gserver/layers/CosSimLayer.cpp index a6c0300acf6752a3536e7939577b561fd97d1eb8..57ba124e40cbd098fa8b0012ff31d6935b16862a 100644 --- a/paddle/gserver/layers/CosSimLayer.cpp +++ b/paddle/gserver/layers/CosSimLayer.cpp @@ -42,7 +42,7 @@ void CosSimLayer::forward(PassType passType) { /* malloc memory for the output_ if necessary */ int batchSize = getInputValue(0)->getHeight(); int size = getSize(); - CHECK_EQ(forward_.size(), 1) << "Only one forward function needed"; + CHECK_EQ(forward_.size(), 1UL) << "Only one forward function needed"; { REGISTER_TIMER_INFO("CosFwResetTimer", getName().c_str()); @@ -68,7 +68,7 @@ void CosSimLayer::forward(PassType passType) { void CosSimLayer::backward(const UpdateCallback& callback) { /* activation */ { REGISTER_TIMER_INFO("CosBpAtvTimer", getName().c_str()); - CHECK_EQ(backward_.size(), 1) << "Only one backward function needed"; + CHECK_EQ(backward_.size(), 1UL) << "Only one backward function needed"; const auto outG = this->getOutputGrad(); const auto outV = this->getOutputValue(); diff --git a/paddle/gserver/layers/CosSimVecMatLayer.cpp b/paddle/gserver/layers/CosSimVecMatLayer.cpp index aabafd473aa1e06a767d48d4c49b7b8662e992e7..0f887d8adfa053e8fe88ac4fa4e2a9ba08ac07b5 100644 --- a/paddle/gserver/layers/CosSimVecMatLayer.cpp +++ b/paddle/gserver/layers/CosSimVecMatLayer.cpp @@ -112,7 +112,7 @@ bool CosSimVecMatLayer::init(const LayerMap& layerMap, void CosSimVecMatLayer::forward(PassType passType) { Layer::forward(passType); - CHECK_EQ(forward_.size(), 1) << "Only one forward function needed"; + CHECK_EQ(forward_.size(), 1UL) << "Only one forward function needed"; MatrixPtr inV0 = getInputValue(0); MatrixPtr inV1 = getInputValue(1); @@ -145,7 +145,7 @@ void CosSimVecMatLayer::forward(PassType passType) { } void CosSimVecMatLayer::backward(const UpdateCallback& callback) { - CHECK_EQ(backward_.size(), 1) << "Only one forward function needed"; + CHECK_EQ(backward_.size(), 1UL) << 
"Only one forward function needed"; MatrixPtr inV0 = getInputValue(0); MatrixPtr inV1 = getInputValue(1); diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp index f76d41ad3e8a3b1730f9d50c0773ee4f61ddb541..125aaf947f3c9d976b117667d1d1b7700a029cc6 100644 --- a/paddle/gserver/layers/Layer.cpp +++ b/paddle/gserver/layers/Layer.cpp @@ -381,8 +381,7 @@ void Layer::backwardActivation() { void Layer::forwardDropOut() { auto& outV = getOutputValue(); - if (passType_ == PASS_TRAIN || passType_ == PASS_METRIC_TRAIN || - passType_ == PASS_METRIC_TRAIN_WITH_NOERROR) { + if (passType_ == PASS_TRAIN) { // new dropOutMask_ if dropOutMask_ is null ptr Matrix::resizeOrCreate(dropOutMask_, outV->getHeight(), diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h index 6dfd48fb96618102b71e9f6de79a348dc7f62647..7c4bea072157aac17787afab184b51c09ff656f2 100644 --- a/paddle/gserver/layers/Layer.h +++ b/paddle/gserver/layers/Layer.h @@ -311,6 +311,7 @@ public: return *output->second; } else { LOG(FATAL) << "No specific output " << str; + return *((Argument*)nullptr); } } } diff --git a/paddle/gserver/layers/LinearChainCRF.cpp b/paddle/gserver/layers/LinearChainCRF.cpp index b7f748f3bb8a419429956724131e81dfdbd274c6..dc3dc156792bdf32c3b948a292597d0e9eca5d8b 100644 --- a/paddle/gserver/layers/LinearChainCRF.cpp +++ b/paddle/gserver/layers/LinearChainCRF.cpp @@ -17,18 +17,12 @@ limitations under the License. */ namespace paddle { -LinearChainCRF::LinearChainCRF(int numClasses, real* para, real* grad) +LinearChainCRF::LinearChainCRF(int numClasses, real* para) : numClasses_(numClasses) { a_ = Matrix::create(para, 1, numClasses_); b_ = Matrix::create(para + numClasses_, 1, numClasses_); w_ = Matrix::create(para + 2 * numClasses_, numClasses_, numClasses_); - if (grad) { - da_ = Matrix::create(grad, 1, numClasses_); - db_ = Matrix::create(grad + numClasses_, 1, numClasses_); - dw_ = Matrix::create(grad + 2 * numClasses_, numClasses_, numClasses_); - } - ones_ = Matrix::create(1, numClasses_); ones_->one(); @@ -107,19 +101,24 @@ real LinearChainCRF::forward(real* x, int* s, int length) { return -ll; } -void LinearChainCRF::backward(real* x, real* dx, int* s, int length) { +void LinearChainCRF::backward(real* x, int* s, int length, bool needWGrad) { MatrixPtr matX = Matrix::create(x, length, numClasses_); - MatrixPtr matDX = Matrix::create(dx, length, numClasses_); - MatrixPtr matGrad = Matrix::create(length, numClasses_); + Matrix::resizeOrCreate(matGrad_, length, numClasses_); Matrix::resizeOrCreate(beta_, length, numClasses_); real* b = b_->getData(); - real* dw = dw_ ? 
dw_->getData() : nullptr; + if (needWGrad) { + Matrix::resizeOrCreate(matWGrad_, numClasses_ + 2, numClasses_); + matWGrad_->zeroMem(); + da_ = matWGrad_->subRowMatrix(0, 1); + db_ = matWGrad_->subRowMatrix(1, 2); + dw_ = matWGrad_->subRowMatrix(2, numClasses_ + 2); + } real* alpha = alpha_->getData(); real* beta = beta_->getData(); real* expW = expW_->getData(); real* expX = expX_->getData(); - real* grad = matGrad->getData(); + real* grad = matGrad_->getData(); for (int i = 0; i < numClasses_; ++i) { beta[(length - 1) * numClasses_ + i] = exp(b[i]); @@ -140,39 +139,38 @@ void LinearChainCRF::backward(real* x, real* dx, int* s, int length) { normalizeL1(beta + k * numClasses_, numClasses_); } - matGrad->dotMul(*alpha_, *beta_); - matGrad->rowNormalizeL1(*matGrad); + matGrad_->dotMul(*alpha_, *beta_); + matGrad_->rowNormalizeL1(*matGrad_); for (int k = 0; k < length; ++k) { grad[k * numClasses_ + s[k]] -= (real)1; } - matDX->add(*matGrad); - if (da_) { - da_->add(*matGrad->subMatrix(/* startRow= */ 0, /* numRows= */ 1)); - } - if (db_) { - db_->add(*matGrad->subMatrix(/* startRow= */ length - 1, 1)); - } - beta_->dotMul(*beta_, *expX_); - beta_->rowNormalizeL1(*beta_); + if (needWGrad) { + da_->add(*matGrad_->subMatrix(/* startRow= */ 0, /* numRows= */ 1)); + db_->add(*matGrad_->subMatrix(/* startRow= */ length - 1, 1)); - for (int k = 1; dw && k < length; ++k) { - real sum = 0; - for (int i = 0; i < numClasses_; ++i) { - for (int j = 0; j < numClasses_; ++j) { - sum += expW[i * numClasses_ + j] * alpha[(k - 1) * numClasses_ + i] * - beta[k * numClasses_ + j]; + beta_->dotMul(*beta_, *expX_); + beta_->rowNormalizeL1(*beta_); + + real* dw = dw_->getData(); + for (int k = 1; k < length; ++k) { + real sum = 0; + for (int i = 0; i < numClasses_; ++i) { + for (int j = 0; j < numClasses_; ++j) { + sum += expW[i * numClasses_ + j] * alpha[(k - 1) * numClasses_ + i] * + beta[k * numClasses_ + j]; + } } - } - sum = 1 / sum; - for (int i = 0; i < numClasses_; ++i) { - for (int j = 0; j < numClasses_; ++j) { - dw[i * numClasses_ + j] += sum * expW[i * numClasses_ + j] * - alpha[(k - 1) * numClasses_ + i] * - beta[k * numClasses_ + j]; + sum = 1 / sum; + for (int i = 0; i < numClasses_; ++i) { + for (int j = 0; j < numClasses_; ++j) { + dw[i * numClasses_ + j] += sum * expW[i * numClasses_ + j] * + alpha[(k - 1) * numClasses_ + i] * + beta[k * numClasses_ + j]; + } } + dw[s[k - 1] * numClasses_ + s[k]] -= (real)1; } - dw[s[k - 1] * numClasses_ + s[k]] -= (real)1; } } diff --git a/paddle/gserver/layers/LinearChainCRF.h b/paddle/gserver/layers/LinearChainCRF.h index a905bf803dd5443ef8d4ad7702720a50a5220a9a..8daf1e14a6fa98bef41f4f32bff439df8302adfd 100644 --- a/paddle/gserver/layers/LinearChainCRF.h +++ b/paddle/gserver/layers/LinearChainCRF.h @@ -21,7 +21,7 @@ namespace paddle { class LinearChainCRF { public: /** - * The size of para and grad must be \f$(numClasses + 2) * numClasses\f$. + * The size of para must be \f$(numClasses + 2) * numClasses\f$. * The first numClasses values of para are for starting weights (\f$a\f$). * The next numClasses values of para are for ending weights (\f$b\f$), * The remaning values are for transition weights (\f$w\f$). @@ -34,7 +34,7 @@ public: * all possible * sequences is \f$1\f$, and \f$x\f$ is the input feature to the CRF. */ - LinearChainCRF(int numClasses, real* para, real* grad); + LinearChainCRF(int numClasses, real* para); /** * Calculate the negative log likelihood of s given x. 
@@ -45,29 +45,45 @@ public: /** * Calculate the gradient with respect to x, a, b, and w. - * The gradient of x will be stored in dx. * backward() can only be called after a corresponding call to forward() with * the same x, s and length. - * @note The gradient is added to dx and grad (provided at constructor). + * The gradient with respect to a, b, and w will not be calculated if + * needWGrad is false. + * @note Please call getWGrad() and getXGrad() to get the gradient with + * respect to (a, b, w) and x respectively. */ - void backward(real* x, real* dx, int* s, int length); + void backward(real* x, int* s, int length, bool needWGrad); /** * Find the most probable sequence given x. The result will be stored in s. */ void decode(real* x, int* s, int length); + /* + * Return the gradient with respect to (a, b, w). It can only be called after + * a corresponding call to backward(). + */ + MatrixPtr getWGrad() { return matWGrad_; } + + /* + * Return the gradient with respect to x. It can only be called after a + * corresponding call to backward(). + */ + MatrixPtr getXGrad() { return matGrad_; } + protected: int numClasses_; MatrixPtr a_; MatrixPtr b_; MatrixPtr w_; + MatrixPtr matWGrad_; MatrixPtr da_; MatrixPtr db_; MatrixPtr dw_; MatrixPtr ones_; MatrixPtr expX_; + MatrixPtr matGrad_; MatrixPtr alpha_; MatrixPtr beta_; MatrixPtr maxX_; diff --git a/paddle/gserver/layers/PrintLayer.cpp b/paddle/gserver/layers/PrintLayer.cpp index 85f52ad5debd035c403c73afc7390904428e28a7..de198af111be4200dd1b240f6de9464e3f43b06d 100644 --- a/paddle/gserver/layers/PrintLayer.cpp +++ b/paddle/gserver/layers/PrintLayer.cpp @@ -19,38 +19,17 @@ namespace paddle { class PrintLayer : public Layer { public: explicit PrintLayer(const LayerConfig& config) : Layer(config) {} - void forward(PassType passType) override; - void backward(const UpdateCallback& callback) override {} -}; -void PrintLayer::forward(PassType passType) { - Layer::forward(passType); - for (size_t i = 0; i != inputLayers_.size(); ++i) { - const auto& argu = getInput(i); - const std::string& name = inputLayers_[i]->getName(); - if (argu.value) { - std::ostringstream os; - argu.value->print(os); - LOG(INFO) << "layer=" << name << " value matrix:\n" << os.str(); - } - if (argu.ids) { - std::ostringstream os; - argu.ids->print(os, argu.ids->getSize()); - LOG(INFO) << "layer=" << name << " ids vector:\n" << os.str(); - } - if (auto startPos = argu.sequenceStartPositions) { - std::ostringstream os; - startPos->getVector(false)->print(os, startPos->getSize()); - LOG(INFO) << "layer=" << name << " sequence pos vector:\n" << os.str(); - } - if (auto subStartPos = argu.subSequenceStartPositions) { - std::ostringstream os; - subStartPos->getVector(false)->print(os, subStartPos->getSize()); - LOG(INFO) << "layer=" << name << " sub-sequence pos vector:\n" - << os.str(); + void forward(PassType passType) override { + Layer::forward(passType); + for (size_t i = 0; i != inputLayers_.size(); ++i) { + getInput(i).printValueString(LOG(INFO), + "layer=" + inputLayers_[i]->getName() + " "); } } -} + + void backward(const UpdateCallback& callback) override {} +}; REGISTER_LAYER(print, PrintLayer); diff --git a/paddle/gserver/layers/SequenceConcatLayer.cpp b/paddle/gserver/layers/SequenceConcatLayer.cpp index b4677687a6cc7755fdb7584a9524de9b65a0c627..4b24d8f0c852e1bdc887d4ee1465b9ad05d210bb 100644 --- a/paddle/gserver/layers/SequenceConcatLayer.cpp +++ b/paddle/gserver/layers/SequenceConcatLayer.cpp @@ -21,9 +21,11 @@ namespace paddle { /** * A layer for concatenating 
the first sequence with the second sequence - * following the first - * Input: two sequences each containing some instances + * Input: two sequences each containing the same number of instances + * seq1 = [a1, a2, ..., an] + * seq2 = [b1, b2, ..., bn] * Output: a concatenated sequence of the two input sequences + * out = [a1, b1, a2, b2, ..., an, bn] */ class SequenceConcatLayer : public Layer { @@ -168,13 +170,17 @@ void SequenceConcatLayer::backward(const UpdateCallback& callback) { size_t rightNumIns = 0; for (size_t seqId = 0; seqId < numSequences1; ++seqId) { leftNumIns = starts1[seqId + 1] - starts1[seqId]; - inputGrad1->subMatrix(starts1[seqId], leftNumIns) - ->add(*(outputGrad->subMatrix(offset, leftNumIns))); + if (inputGrad1) { + inputGrad1->subMatrix(starts1[seqId], leftNumIns) + ->add(*(outputGrad->subMatrix(offset, leftNumIns))); + } offset += leftNumIns; rightNumIns = starts2[seqId + 1] - starts2[seqId]; - inputGrad2->subMatrix(starts2[seqId], rightNumIns) - ->add(*(outputGrad->subMatrix(offset, rightNumIns))); + if (inputGrad2) { + inputGrad2->subMatrix(starts2[seqId], rightNumIns) + ->add(*(outputGrad->subMatrix(offset, rightNumIns))); + } offset += rightNumIns; } } diff --git a/paddle/gserver/layers/SequenceReshapeLayer.cpp b/paddle/gserver/layers/SequenceReshapeLayer.cpp index 66f49159087ab9e2c83b1d74e9b4d9bfe4f49e79..433592953b220eda4db4634124a57a2074cef4c0 100644 --- a/paddle/gserver/layers/SequenceReshapeLayer.cpp +++ b/paddle/gserver/layers/SequenceReshapeLayer.cpp @@ -20,9 +20,12 @@ limitations under the License. */ namespace paddle { /** - * A layer for reshaping the sequence - * Input: a sequence - * Output: a sequence + * A layer for reshaping the sequence. Assume the input sequence has + * T instances, the dimension of each instance is M, and the input + * reshape_dim is N, then the output sequence has T*M/N instances, + * the dimension of each instance is N. + * + * Note that T*M/N must be an integer. 
*/ class SequenceReshapeLayer : public Layer { diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt index 0caa5e1e11e6d42fadfa87149814c4b77b3b6271..3c4128b5b8a0ea420bd3027b9a36e5f75087c3cb 100644 --- a/paddle/gserver/tests/CMakeLists.txt +++ b/paddle/gserver/tests/CMakeLists.txt @@ -18,6 +18,14 @@ add_unittest_without_exec(test_LayerGrad add_test(NAME test_LayerGrad COMMAND test_LayerGrad) +################ test_CRFLayerGrad #################### +add_unittest_without_exec(test_CRFLayerGrad + test_CRFLayerGrad.cpp + LayerGradUtil.cpp) +add_test(NAME test_CRFLayerGrad + COMMAND test_CRFLayerGrad) + + add_unittest_without_exec(test_ActivationGrad test_ActivationGrad.cpp LayerGradUtil.cpp) diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp index ae016e74eaa84f7c43a30c09c8c4577e25360c4e..7617af10ba719490d1b33dd297b070cd8c7c292c 100644 --- a/paddle/gserver/tests/LayerGradUtil.cpp +++ b/paddle/gserver/tests/LayerGradUtil.cpp @@ -24,7 +24,7 @@ real getCostSum(LayerPtr& testLayer, MatrixPtr weights) { if (weights) { outArgs[0].value->dotMul(*outArgs[0].value, *weights); } - return Argument::sumCosts(outArgs); + return Argument::sum(outArgs); } real getDiffAndPrint(real newCost1, @@ -241,7 +241,7 @@ void testBatchState(LayerPtr testLayer, std::vector args; args.push_back(out); - EXPECT_EQ(0, Argument::sumCosts(args)) << "testBatchState failed"; + EXPECT_EQ(0, Argument::sum(args)) << "testBatchState failed"; for (size_t seqId = 0; seqId < numSequences; ++seqId) { start[seqId] += seqLens[seqId]; } @@ -672,7 +672,7 @@ void testLayerGradKernel(TestConfig testConf, outArgs[0].value->dotMul(*testLayer->getOutput().value, *weights); } - real cost = Argument::sumCosts(outArgs); + real cost = Argument::sum(outArgs); LOG(INFO) << " cost " << cost; EXPECT_FALSE(std::isnan(cost)); diff --git a/paddle/gserver/tests/test_CRFLayerGrad.cpp b/paddle/gserver/tests/test_CRFLayerGrad.cpp new file mode 100644 index 0000000000000000000000000000000000000000..df14449291e9ec08f45718de07bbb101f6dbea58 --- /dev/null +++ b/paddle/gserver/tests/test_CRFLayerGrad.cpp @@ -0,0 +1,174 @@ +/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include <gtest/gtest.h>
+#include "ModelConfig.pb.h"
+#include "paddle/gserver/layers/DataLayer.h"
+#include "paddle/gserver/layers/LinearChainCRF.h"
+#include "paddle/trainer/Trainer.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
+
+static inline bool getNextSequence(std::vector<int>& seq, int numClasses) {
+  for (auto& v : seq) {
+    if (++v < numClasses) {
+      return true;
+    }
+    v = 0;
+  }
+  return false;
+}
+
+// log(exp(x) + exp(y))
+static inline real logSum(real x, real y) {
+  real maxValue = std::max(x, y);
+  if (std::isinf(maxValue)) {
+    return -std::numeric_limits<real>::infinity();
+  } else {
+    return maxValue + log(exp(x - maxValue) + exp(y - maxValue));
+  }
+}
+
+static inline std::vector<int> genRandLabels(int numClasses, int length) {
+  std::vector<int> labels(length);
+  for (int i = 0; i < length; ++i) {
+    labels[i] = rand() % numClasses;  // NOLINT
+  }
+  return labels;
+}
+
+TEST(CRFLayer, cost) {
+  const int numClasses = 4;
+  CpuVector para(numClasses * (numClasses + 2));
+  real* a = para.getData();
+  real* b = para.getData() + numClasses;
+  real* w = para.getData() + 2 * numClasses;
+  LinearChainCRF crf(4, para.getData());
+  for (int length : {1, 2, 3, 10}) {
+    for (int tries = 0; tries < 10; ++tries) {
+      CpuMatrix x(length, numClasses);
+      x.randomizeUniform();
+      para.randnorm(0, 2);
+
+      std::vector<int> goldenLabels = genRandLabels(numClasses, length);
+
+      real cost = crf.forward(x.getData(), goldenLabels.data(), length);
+
+      real logZ = -std::numeric_limits<real>::infinity();
+      real logNominator = -std::numeric_limits<real>::infinity();
+      std::vector<int> testResult(length, 0);
+      do {
+        real score = a[testResult.front()];
+        score += x.getElement(0, testResult.front());
+        for (int k = 1; k < length; ++k) {
+          score += x.getElement(k, testResult[k]) +
+                   w[numClasses * testResult[k - 1] + testResult[k]];
+        }
+        score += b[testResult.back()];
+        logZ = logSum(logZ, score);
+
+        if (goldenLabels == testResult) {
+          logNominator = score;
+        }
+      } while (getNextSequence(testResult, numClasses));
+
+      real trueCost = -logNominator + logZ;
+
+      real diff = fabs(trueCost - cost);
+      diff /= fabs(cost) < fabs(trueCost) ? fabs(cost) : fabs(trueCost);
+      VLOG(1) << "cost=" << cost << " trueCost=" << trueCost << " diff=" << diff
+              << std::endl;
+      if (typeid(real) == typeid(double)) {  // NOLINT
+        EXPECT_LE(diff, 1e-10);
+      } else {
+        EXPECT_LE(diff, 5e-3);
+      }
+    }
+  }
+}
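// Editor's note, not part of the patch: the brute-force check above depends
// on getNextSequence() enumerating label sequences odometer-style, with
// position 0 as the fastest-changing digit. For numClasses = 2 and
// length = 2, starting from {0, 0}, successive calls yield:
//   {0, 0} -> {1, 0} -> {0, 1} -> {1, 1} -> false (all numClasses^length
// sequences visited), so logZ really sums over the entire label space.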
+
+inline real epsilon() { return typeid(real) == typeid(double) ? 1e-10 : 0.06; }
+
+TestConfig initTestConfig(size_t numClasses, bool withWeight) {
+  TestConfig config;
+  config.layerConfig.set_type("crf");
+  config.layerConfig.set_size(numClasses);
+  config.biasSize = 0;
+
+  config.inputDefs.push_back({INPUT_SEQUENCE_DATA,
+                              "layer_0",
+                              numClasses,
+                              numClasses * (numClasses + 2)});
+  config.layerConfig.add_inputs();
+  config.inputDefs.push_back(
+      {INPUT_SEQUENCE_LABEL, "layer_label", numClasses, 0});
+  config.layerConfig.add_inputs();
+
+  if (withWeight) {
+    config.inputDefs.push_back({INPUT_DENSE_DIM_DATA, "layer_weight", 1, 0});
+    config.layerConfig.add_inputs();
+  }
+
+  return config;
+}
+
+TEST(Layer, CRFLayer) {
+  size_t numClasses = 10;
+  for (int tries = 0; tries < 5; ++tries) {
+    TestConfig config = initTestConfig(numClasses, /* withWeight= */ false);
+    for (int length : {1, 3, 100}) {
+      // GPU is not supported yet
+      testLayerGrad(config,
+                    "crf",
+                    length,
+                    /* trans= */ false,
+                    /* useGpu= */ false,
+                    /* useWeight= */ false,
+                    epsilon());
+    }
+  }
+}
+
+TEST(Layer, CRFLayerUseWeight) {
+  size_t numClasses = 10;
+  for (int tries = 0; tries < 5; ++tries) {
+    TestConfig config = initTestConfig(numClasses, /* withWeight= */ true);
+    for (int length : {1, 3, 100}) {
+      // GPU is not supported yet
+      testLayerGrad(config,
+                    "crf",
+                    length,
+                    /* trans= */ false,
+                    /* useGpu= */ false,
+                    /* useWeight= */ false,
+                    epsilon());
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  initMain(argc, argv);
+  hl_start();
+  hl_init(FLAGS_gpu_id);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/gserver/tests/test_Evaluator.cpp b/paddle/gserver/tests/test_Evaluator.cpp
index 8165eb8269336193858962edac4f9637c2fc1c2f..4f5fdbb37ce024e18b8d39c5dda74c69bf82166a 100644
--- a/paddle/gserver/tests/test_Evaluator.cpp
+++ b/paddle/gserver/tests/test_Evaluator.cpp
@@ -110,6 +110,18 @@ void testEvaluator(TestConfig testConf,
   testEvaluator->finish();
   LOG(INFO) << *testEvaluator;
 
+  std::vector<std::string> names;
+  testEvaluator->getNames(&names);
+  paddle::Error err;
+  for (auto& name : names) {
+    auto value = testEvaluator->getValue(name, &err);
+    ASSERT_TRUE(err.isOK());
+    LOG(INFO) << name << " " << value;
+    auto tp = testEvaluator->getType(name, &err);
+    ASSERT_TRUE(err.isOK());
+    ASSERT_EQ(testConf.evaluatorConfig.type(), tp);
+  }
+
   double totalScore2 = 0.0;
   if (testConf.testAccumulate) {
     testEvaluator->start();
@@ -129,6 +141,7 @@ void testEvaluatorAll(TestConfig testConf,
 TEST(Evaluator, classification_error) {
   TestConfig config;
   config.evaluatorConfig.set_type("classification_error");
+  config.evaluatorConfig.set_top_k(5);
 
   config.inputDefs.push_back({INPUT_DATA, "output", 50});
   config.inputDefs.push_back({INPUT_LABEL, "label", 50});
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 14d9db52470b2828186eca04d303135910489266..ceb69359c992128635c199e56805d3f603ca4271 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -276,27 +276,6 @@ TEST(Layer, AddtoLayer) {
   }
 }
 
-TEST(Layer, CRFLayer) {
-  TestConfig config;
-  config.layerConfig.set_type("crf");
-  config.layerConfig.set_size(10);
-  config.biasSize = 0;
-
-  config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 120});
-  config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_1", 10, 0});
-  config.layerConfig.add_inputs();
-  config.layerConfig.add_inputs();
-
-  // Not support GPU now
-  testLayerGrad(config,
-                "crf",
-                100,
-                /* trans
diff --git a/paddle/gserver/tests/test_Evaluator.cpp b/paddle/gserver/tests/test_Evaluator.cpp index 8165eb8269336193858962edac4f9637c2fc1c2f..4f5fdbb37ce024e18b8d39c5dda74c69bf82166a 100644 --- a/paddle/gserver/tests/test_Evaluator.cpp +++ b/paddle/gserver/tests/test_Evaluator.cpp @@ -110,6 +110,18 @@ void testEvaluator(TestConfig testConf, testEvaluator->finish(); LOG(INFO) << *testEvaluator; + std::vector<std::string> names; + testEvaluator->getNames(&names); + paddle::Error err; + for (auto& name : names) { + auto value = testEvaluator->getValue(name, &err); + ASSERT_TRUE(err.isOK()); + LOG(INFO) << name << " " << value; + auto tp = testEvaluator->getType(name, &err); + ASSERT_TRUE(err.isOK()); + ASSERT_EQ(testConf.evaluatorConfig.type(), tp); + } + double totalScore2 = 0.0; if (testConf.testAccumulate) { testEvaluator->start(); @@ -129,6 +141,7 @@ void testEvaluatorAll(TestConfig testConf, TEST(Evaluator, classification_error) { TestConfig config; config.evaluatorConfig.set_type("classification_error"); + config.evaluatorConfig.set_top_k(5); config.inputDefs.push_back({INPUT_DATA, "output", 50}); config.inputDefs.push_back({INPUT_LABEL, "label", 50}); diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index 14d9db52470b2828186eca04d303135910489266..ceb69359c992128635c199e56805d3f603ca4271 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -276,27 +276,6 @@ TEST(Layer, AddtoLayer) { } } -TEST(Layer, CRFLayer) { - TestConfig config; - config.layerConfig.set_type("crf"); - config.layerConfig.set_size(10); - config.biasSize = 0; - - config.inputDefs.push_back({INPUT_SEQUENCE_DATA, "layer_0", 10, 120}); - config.inputDefs.push_back({INPUT_SEQUENCE_LABEL, "layer_1", 10, 0}); - config.layerConfig.add_inputs(); - config.layerConfig.add_inputs(); - - // Not support GPU now - testLayerGrad(config, - "crf", - 100, - /* trans */ false, - /* useGpu */ false, - false /*useWeight*/, - 0.03 /*epsilon*/); -} - TEST(Layer, CTCLayer) { TestConfig config; config.layerConfig.set_type("ctc"); diff --git a/paddle/gserver/tests/test_LinearChainCRF.cpp b/paddle/gserver/tests/test_LinearChainCRF.cpp index f046cb0b289c9ce22b98f3200bf0a3f7d48d77f5..b37277054c58a5f71cc4649fc6c062ca8dc1d4c9 100644 --- a/paddle/gserver/tests/test_LinearChainCRF.cpp +++ b/paddle/gserver/tests/test_LinearChainCRF.cpp @@ -36,7 +36,7 @@ TEST(LinearChainCRF, decoding) { real* a = para.getData(); real* b = para.getData() + numClasses; real* w = para.getData() + 2 * numClasses; - LinearChainCRF crf(4, para.getData(), nullptr); + LinearChainCRF crf(4, para.getData()); for (int length : {1, 2, 3, 10}) { for (int tries = 0; tries < 10; ++tries) { CpuMatrix x(length, numClasses); diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 1964b2f8bfaebc49fe3073e03c949a8a9c3e385a..07450bfb0ef709840f7e8253e87c227276529a2a 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -732,6 +732,7 @@ void GpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) { size_t beam = maxVal.getWidth(); CHECK_EQ(maxIds.getSize(), numSamples * beam); CHECK_EQ(maxVal.getHeight(), numSamples); + CHECK_EQ(maxVal.getWidth(), beam); hl_matrix_top_k(maxVal.getData(), maxVal.getStride(), @@ -792,19 +793,32 @@ void GpuMatrix::maxoutBackward(Matrix& a, } /*calculate the error of classification */ -void GpuMatrix::classificationError(Matrix& output, IVector& label) { - auto output_ptr = dynamic_cast<GpuMatrix*>(&output); - auto label_ptr = dynamic_cast<GpuIVector*>(&label); - CHECK(output_ptr && label_ptr) << "Invalid argument pointer"; - - CHECK(height_ == output_ptr->height_ && width_ == 1) +void GpuMatrix::classificationError(Matrix& output, + IVector& label, + size_t topkSize) { + auto gpuOutput = dynamic_cast<GpuMatrix*>(&output); + auto gpuLabel = dynamic_cast<GpuIVector*>(&label); + size_t numSamples = this->getHeight(); + GpuMatrixPtr gpuTopVal = std::make_shared<GpuMatrix>(numSamples, topkSize); + GpuIVectorPtr gpuTopIds = std::make_shared<GpuIVector>(numSamples * topkSize); + + CHECK(gpuOutput && gpuLabel) << "Invalid argument pointer"; + CHECK(gpuTopVal && gpuTopIds) << "Allocate GPU memory failed"; + CHECK(gpuLabel->getSize() == numSamples) << "Vector size is not equal"; + CHECK(numSamples == gpuOutput->getHeight() && this->getWidth() == 1) << "Matrix dimensions are not equal"; - hl_matrix_classification_error((real*)output_ptr->data_, - (int*)label_ptr->getData(), - data_, - height_, - output_ptr->width_); + size_t dim = gpuOutput->getWidth(); + hl_matrix_classification_error(gpuTopVal->getData(), + gpuTopVal->getStride(), + gpuTopIds->getData(), + gpuOutput->getData(), + gpuOutput->getStride(), + dim, + topkSize, + numSamples, + gpuLabel->getData(), + this->getData()); } /* copy -log(output[i * width + label]) to this->data[i] */ @@ -3039,7 +3053,7 @@ void CpuMatrix::rowMax(Matrix& max) { max.maxRows(*this); } -/* get beam size of max ids and values */ +/* Get the top k elements of each row of this matrix */ void CpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) { CHECK(isContiguous()); CHECK(!maxIds.useGpu() && !maxVal.useGpu()) << "Matrix type are not equal"; @@ -3047,6 +3061,7 @@ void CpuMatrix::rowMax(IVector& maxIds, Matrix& maxVal) { size_t beam = maxVal.getWidth(); CHECK_EQ(maxIds.getSize(), numSamples * beam); CHECK_EQ(maxVal.getHeight(), numSamples); + CHECK_EQ(maxVal.getWidth(), beam); real* a = getData(); int* s = maxIds.getData(); @@ -3198,32 +3213,39 @@ void CpuMatrix::rowNormalizeL1(Matrix& out) { }
/* calculate classification error */ -void CpuMatrix::classificationError(Matrix& output, IVector& label) { - CHECK(dynamic_cast<CpuMatrix*>(&output)); - CHECK(dynamic_cast<CpuIVector*>(&label)); +void CpuMatrix::classificationError(Matrix& output, + IVector& label, + size_t topkSize) { + size_t numSamples = this->getHeight(); + auto cpuOutput = dynamic_cast<CpuMatrix*>(&output); + auto cpuLabel = dynamic_cast<CpuIVector*>(&label); + IVectorPtr cpuTopIds = std::make_shared<CpuIVector>(numSamples * topkSize); + MatrixPtr cpuTopVal = std::make_shared<CpuMatrix>(numSamples, topkSize); + + CHECK(cpuOutput && cpuLabel) << "Invalid argument pointer"; + CHECK(cpuTopIds && cpuTopVal) << "Allocate cpu memory failed"; + CHECK(cpuLabel->getSize() == numSamples) << "Vector size is not equal"; + CHECK(cpuOutput->getHeight() == numSamples && this->getWidth() == 1) + << "Matrix dimensions are not equal"; - CHECK_EQ(getWidth(), (size_t)1); - size_t numSamples = getHeight(); - CHECK_EQ(label.getSize(), numSamples); - CHECK_EQ(output.getHeight(), numSamples); + // take the top-k predictions of each sample + cpuOutput->rowMax(*cpuTopIds, *cpuTopVal); - size_t dim = output.getWidth(); - real* out = output.getData(); - int* lbl = label.getData(); - real maxData = 0.0; - int maxIndex = -1; + size_t dim = cpuOutput->getWidth(); + real* result = this->getData(); + int* ids = cpuTopIds->getData(); + int* lbl = cpuLabel->getData(); for (size_t i = 0; i < numSamples; ++i) { CHECK_GE(lbl[i], 0); CHECK_LT((size_t)lbl[i], dim); - maxData = out[i * dim]; - maxIndex = 0; - for (size_t j = 0; j < dim; ++j) { - if (maxData < out[i * dim + j]) { - maxIndex = j; - maxData = out[i * dim + j]; + + for (size_t j = 0; j < topkSize; ++j) { + if (ids[j + i * topkSize] == lbl[i]) { + result[i] = 0; + break; } + result[i] = 1.0f; } - getData()[i] = (maxIndex != lbl[i]); } } diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index ea4bbb86b057b526c5ea294b2cd835aef65de58d..d0ba2e93feabfcc11ac1d261bc40c9c6973a8c29 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -836,8 +836,11 @@ public: * output[i] = 1 if row i is an error. * * output[i] = 0 if row i is correct. + * */ - virtual void classificationError(Matrix& output, IVector& label) { + virtual void classificationError(Matrix& output, + IVector& label, + size_t topkSize = 1) { LOG(FATAL) << "Not implemented"; } @@ -1314,7 +1317,7 @@ public: void check(std::ostream& os, Matrix& refMat, bool printDiff = true); void randomizeUniform(); - void classificationError(Matrix& output, IVector& label); + void classificationError(Matrix& output, IVector& label, size_t topkSize = 1); void convExpand(Matrix& feature, int feaImgHeight, @@ -1739,7 +1742,7 @@ public: void randomizeUniform(); - void classificationError(Matrix& output, IVector& label); + void classificationError(Matrix& output, IVector& label, size_t topkSize = 1); void addByBitCode(size_t numClasses, const IVector& codes, const Matrix& vec);
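Both the GPU path (via hl_matrix_classification_error) and the CPU path above implement the same contract: this->data[i] becomes 0 when the true label is among the sample's topkSize highest-scoring classes and 1 otherwise, which is also what the classification_error evaluator's new top_k setting exercises. A compact numpy sketch of that contract (illustrative only, not the PaddlePaddle API):

```python
import numpy as np


def top_k_classification_error(output, label, k=1):
    """Per-sample top-k error: 0 if label is within the k best scores.

    output: (num_samples, dim) scores; label: (num_samples,) int class
    ids. Illustrative sketch of the semantics implemented above.
    """
    # column indices of the k largest entries in each row
    top_k_ids = np.argsort(-output, axis=1)[:, :k]
    hit = (top_k_ids == label.reshape(-1, 1)).any(axis=1)
    return (~hit).astype(np.float32)
```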
diff --git a/paddle/math/tests/test_RowBuffer.cpp b/paddle/math/tests/test_RowBuffer.cpp index 5f66f22ef73dcff1868c1a3e03139a680b1ce2b5..8cc4c69a1a4d8afec08bf7fb13408e135a06c09c 100644 --- a/paddle/math/tests/test_RowBuffer.cpp +++ b/paddle/math/tests/test_RowBuffer.cpp @@ -17,10 +17,10 @@ limitations under the License. */ TEST(RowBuffer, testAutoGrow) { paddle::RowBuffer buf(128); - ASSERT_EQ(128, buf.getWidth()); + ASSERT_EQ(128UL, buf.getWidth()); ASSERT_TRUE(buf.isAutoGrowth()); buf.resize(2); - ASSERT_EQ(2, buf.getRowCount()); + ASSERT_EQ(2UL, buf.getRowCount()); for (size_t i = 0; i < buf.getWidth() * 2; ++i) { buf.data()[i] = i; } @@ -35,7 +35,7 @@ TEST(RowBuffer, testAutoGrow) { data[i] = i; } - ASSERT_EQ(3, buf.getRowCount()); + ASSERT_EQ(3UL, buf.getRowCount()); for (size_t i = 0; i < buf.getRowCount() - 1; ++i) { for (size_t j = 0; j < buf.getWidth(); ++j) { ASSERT_NEAR(i * buf.getWidth() + j, buf.get(i)[j], 1e-5); @@ -51,7 +51,7 @@ TEST(RowBuffer, testWithMemBuf) { std::make_shared<paddle::CpuMemoryHandle>(128 * 2 * sizeof(real)); paddle::RowBuffer buf(mem, 128); ASSERT_TRUE(!buf.isAutoGrowth()); - ASSERT_EQ(2, buf.getRowCount()); + ASSERT_EQ(2UL, buf.getRowCount()); for (size_t i = 0; i < buf.getWidth() * 2; ++i) { buf.data()[i] = i; } diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp index 6caaea443c1df756bfeb775154e8a90400cc3211..08b64c1bb6f5d359a2d2164e723a76c5360168ee 100644 --- a/paddle/math/tests/test_matrixCompare.cpp +++ b/paddle/math/tests/test_matrixCompare.cpp @@ -764,7 +764,7 @@ TEST(Matrix, paramReluBackwardDiff) { } } -void testClassificationError(int numSamples, int dim) { +void testClassificationError(int numSamples, int dim, int topkSize) { MatrixPtr cpuError = std::make_shared<CpuMatrix>(numSamples, 1); MatrixPtr gpuError = std::make_shared<GpuMatrix>(numSamples, 1); MatrixPtr cpuOutput = std::make_shared<CpuMatrix>(numSamples, dim); @@ -777,17 +777,22 @@ void testClassificationError(int numSamples, int dim, gpuOutput->copyFrom(*cpuOutput); gpuLabel->copyFrom(*cpuLabel); - cpuError->classificationError(*cpuOutput, *cpuLabel); - gpuError->classificationError(*gpuOutput, *gpuLabel); + cpuError->classificationError(*cpuOutput, *cpuLabel, topkSize); + gpuError->classificationError(*gpuOutput, *gpuLabel, topkSize); TensorCheckEqual(*cpuError, *gpuError); } TEST(Matrix, classificationError) { - for (auto numSamples : {1, 10, 100, 1000, 70000}) { - for (auto dim : {1, 10, 100, 1000}) { - VLOG(3) << " numSamples=" << numSamples << " dim=" << dim; - testClassificationError(numSamples, dim); + for (auto numSamples : {1, 5, 31, 90, 150, 300}) { + for (auto dim : + {1, 5, 8, 10, 15, 64, 80, 120, 256, 300, 1280, 5120, 50000}) { + for (auto topkSize : {1, 5, 10, 20, 40, (int)rand() % dim + 1}) { + if (topkSize > dim) continue; + VLOG(3) << " sample= " << numSamples << " topkSize= " << topkSize + << " dim= " << dim; + testClassificationError(numSamples, dim, topkSize); + } } } }
diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp index 65d01a15718ae2bebd4869eff0e5407524bc0e7c..7a343cca33f5b420be6192231ac73ca1c2da5fb9 100644 --- a/paddle/parameter/Argument.cpp +++ b/paddle/parameter/Argument.cpp @@ -602,6 +602,44 @@ void Argument::degradeSequence(const Argument& input, bool useGpu) { tgtBuf[numSequences] = numSubSequences; } +void Argument::getValueString( + std::unordered_map<std::string, std::string>* out) const { + if (value) { + std::ostringstream os; + value->print(os); + out->insert({"value", os.str()}); + } + if (ids) { + std::ostringstream os; + ids->print(os, ids->getSize()); + out->insert({"ids", os.str()}); + } + if (sequenceStartPositions) { + std::ostringstream os; + sequenceStartPositions->getVector(false)->print( + os, sequenceStartPositions->getSize()); + out->insert({"sequence pos", os.str()}); + } + if (subSequenceStartPositions) { + std::ostringstream os; + subSequenceStartPositions->getVector(false)->print( + os, subSequenceStartPositions->getSize()); + out->insert({"sub-sequence pos", os.str()}); + } +} + +void Argument::printValueString(std::ostream& stream, + const std::string& prefix) const { + std::unordered_map<std::string, std::string> out; + getValueString(&out); + for (auto field : {"value", "ids", "sequence pos", "sub-sequence pos"}) { + auto it = out.find(field); + if (it != out.end()) { + stream << prefix << field << ":\n" << it->second; + } + } +} + void Argument::subArgFrom(const Argument& input, size_t offset, size_t height, diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h index afd2de0202bf0f14ec3d4c5b856455a3488e41f6..9ef44be0cb3b960db1e789f3f26bb66d1fe63c81 100644 --- a/paddle/parameter/Argument.h +++ b/paddle/parameter/Argument.h @@ -163,7 +163,7 @@ struct Argument { : sequenceStartPositions->getData(false); } - static inline real sumCosts(const std::vector<Argument>& arguments) { + static inline real sum(const std::vector<Argument>& arguments) { real cost = 0; for (auto& arg : arguments) { if (arg.value) { @@ -297,6 +297,23 @@ struct Argument { sequence has sub-sequence degrades to a sequence. */ void degradeSequence(const Argument& input, bool useGpu); + + /** + * @brief getValueString returns the argument's output as strings. There + * are several kinds of output; the keys of the output dictionary are + * 'value', 'ids', 'sequence pos', and 'sub-sequence pos'. + * @param out [out]: the return values. + */ + void getValueString(std::unordered_map<std::string, std::string>* out) const; + + /** + * @brief printValueString prints the argument's output in the order + * 'value', 'ids', 'sequence pos', 'sub-sequence pos'. + * @param stream: Output stream + * @param prefix: line prefix for printing. + */ + void printValueString(std::ostream& stream, + const std::string& prefix = "") const; }; } // namespace paddle diff --git a/paddle/parameter/Parameter.cpp b/paddle/parameter/Parameter.cpp index 29d6e20dc16968cdda3e79b66b0c81aaaf303ef4..1ccded818796798105a889df978618688b56ed36 100644 --- a/paddle/parameter/Parameter.cpp +++ b/paddle/parameter/Parameter.cpp @@ -375,10 +375,6 @@ bool Parameter::load(const std::string& filename) { std::ifstream fs(filename, std::ios_base::binary); if (!fs) { LOG(INFO) << "missing parameters [" << filename << "] while loading model."; - if (isStatic()) { - LOG(FATAL) << getName() << " is static but missing, not allowed."; - return false; - } if (kMissParameterFail == FLAGS_load_missing_parameter_strategy) { LOG(FATAL) << getName() << " missing, not allowed."; return false; diff --git a/paddle/pserver/BaseClient.h b/paddle/pserver/BaseClient.h index 11d7a147bf749ba2de0772b5efd5f73ab0ccdb1a..667bc451d16aa1436ac5d74dd96edbd70556edd0 100644 --- a/paddle/pserver/BaseClient.h +++ b/paddle/pserver/BaseClient.h @@ -30,9 +30,6 @@ namespace paddle { * the first solution arms with sendThreads_/recvThreads_ and sendJobQueue_/ * recvJobQueue_. the second solution use some shared thread pool to manage * connections.
- * In addition to pserver, metric learning also uses network to exchange - * features within multi-machines, so this class just abstracts some basic - * threads and queue buffer creation for them */ class BaseClient { protected: diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp index 856fa0ad1ab30e3fc554ac96dd3bed71b1548579..877cbb86ec112739a5c7eeee969ca48ef491ee87 100644 --- a/paddle/pserver/ParameterServer2.cpp +++ b/paddle/pserver/ParameterServer2.cpp @@ -367,11 +367,8 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, std::vector<Buffer>* outputBuffers) { VLOG(1) << "pserver: addGradient"; -/// forwardbackward delta from all trainers -/// indicate the fluctuation caused by forwardbackward. -#ifndef PADDLE_METRIC_LEARNING - // @TODO(yanfei): - // add support tuning forwardbackward balance for metric learning + // forwardbackward delta from all trainers + // indicate the fluctuation caused by forwardbackward. if (!numPassFinishClients_) { REGISTER_BARRIER_DELTA_SERVER_SET( *statSet_, @@ -381,7 +378,6 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, request.forwardbackward_time(), isSparseServer_ ? "_sparseUpdater" : "_denseUpdater"); } -#endif { /// approximately pure network overhead diff --git a/paddle/pserver/test/CMakeLists.txt b/paddle/pserver/test/CMakeLists.txt index 64654f67d0c2c82f05a5038fb33b220f3cff0f39..6e8f9c37f64b70921e09241089a5a480fd8ca47f 100644 --- a/paddle/pserver/test/CMakeLists.txt +++ b/paddle/pserver/test/CMakeLists.txt @@ -10,9 +10,11 @@ add_test(NAME socket_test add_unittest_without_exec(test_ProtoServer test_ProtoServer.cpp) -add_test(NAME test_ProtoServer - COMMAND ${PROJ_ROOT}/paddle/.set_port.sh -p port - ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoServer) +IF(NOT ON_TRAVIS) + add_test(NAME test_ProtoServer + COMMAND ${PROJ_ROOT}/paddle/.set_port.sh -p port + ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoServer) +ENDIF(NOT ON_TRAVIS) # TODO(yuyang18): Run test_ProtoServer when with rdma # add_test(NAME test_ProtoServerRDMA diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py index 21d1cb75f4d40e6ed011b33c6366c9d31c0fcc7c..6d6a406cf61d467cb2701ca5e85e99648eea36eb 100644 --- a/paddle/py_paddle/dataprovider_converter.py +++ b/paddle/py_paddle/dataprovider_converter.py @@ -16,20 +16,74 @@ import paddle.trainer.PyDataProvider2 as dp2 import collections import swig_paddle import numpy +import itertools __all__ = ['DataProviderConverter'] class IScanner(object): + """ + The scanner scans each Python object in two passes and then converts it to + a Paddle argument. + + In the first pass, `pre_scan` is invoked for every data instance, followed + by `finish_pre_scan` on the output arguments. The second pass does the + same with `scan` and `finish_scan`. + + During the first pass, a scanner may count the shape of the input matrix + and allocate memory for the argument; the data is then filled into the + argument in the second pass. + """ + def __init__(self, input_type, pos): self.input_type = input_type - assert isinstance(self.input_type, dp2.InputType) + if not isinstance(self.input_type, dp2.InputType): + raise ValueError("input type should be dataprovider2.InputType") self.pos = pos + # data_in_gpu indicates whether to create the argument on GPU + # in GPU mode. When using one thread (trainer_count=1), the + # trainer uses NeuralNetwork, which needs to create the argument + # on GPU before calling the forward function,
so data_in_gpu is set to True. + # Otherwise, the trainer uses MultiGradientMachine, which transfers + # data from CPU to GPU in the forward function, so data_in_gpu is + # False in that case. + self.data_in_gpu = swig_paddle.isUsingGpu( + ) and swig_paddle.getTrainerCount() == 1 + + def pre_scan(self, dat): + """ + First-pass scan method. Here the scanner can count the number of + data instances and compute the total memory this batch would use. + + :param dat: The python object. + """ + pass + + def finish_pre_scan(self, argument): + """ + Finish the first scan pass and allocate the memory. + + :param argument: Output arguments object. + :type argument: swig_paddle.Arguments + """ + pass def scan(self, dat): + """ + Second-pass scan method. Copies the data into the arguments. + + :param dat: The python object. + """ pass def finish_scan(self, argument): + """ + Finish the second pass. Finalize the resources, etc. + + :param argument: Output arguments object. + :type argument: swig_paddle.Arguments + """ pass @@ -41,19 +95,26 @@ class DenseScanner(IScanner): def __init__(self, input_type, pos): IScanner.__init__(self, input_type, pos) self.__mat__ = None + self.__height__ = 0 + + def pre_scan(self, dat): + self.__height__ += 1 + + def finish_pre_scan(self, argument): + self.__mat__ = numpy.ndarray( + shape=(self.__height__, self.input_type.dim), dtype=numpy.float32) + self.__height__ = 0 def scan(self, dat): - if self.__mat__ is None: - self.__mat__ = numpy.array([dat], dtype='float32') - else: - self.__mat__ = numpy.append(self.__mat__, [dat], axis=0) + self.__mat__[self.__height__] = dat + self.__height__ += 1 def finish_scan(self, argument): assert isinstance(argument, swig_paddle.Arguments) - assert isinstance(self.input_type, dp2.InputType) if self.__mat__.dtype != numpy.float32: self.__mat__ = self.__mat__.astype(numpy.float32) - m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True, False) + m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True, + self.data_in_gpu) argument.setSlotValue(self.pos, m) @@ -63,7 +124,6 @@ class SparseBinaryScanner(IScanner): self.__rows__ = [0] self.__cols__ = [] self.__height__ = 0 - self.__nnz__ = 0 self.__value__ = [] def scan(self, dat): @@ -76,11 +136,13 @@ class SparseBinaryScanner(IScanner): def finish_scan(self, argument): assert isinstance(argument, swig_paddle.Arguments) - assert isinstance(self.input_type, dp2.InputType) - m = swig_paddle.Matrix.createSparse(self.__height__, - self.input_type.dim, - len(self.__cols__), - len(self.__value__) == 0) + m = swig_paddle.Matrix.createSparse( + self.__height__, + self.input_type.dim, + len(self.__cols__), + len(self.__value__) == 0, + False, # trans + False) # TODO: support GPU assert isinstance(m, swig_paddle.Matrix) m.sparseCopyFrom(self.__rows__, self.__cols__, self.__value__) argument.setSlotValue(self.pos, m) @@ -104,7 +166,7 @@ class IndexScanner(IScanner): self.__ids__.append(dat) def finish_scan(self, argument): - ids = swig_paddle.IVector.create(self.__ids__) + ids = swig_paddle.IVector.create(self.__ids__, self.data_in_gpu) assert isinstance(argument, swig_paddle.Arguments) argument.setSlotIds(self.pos, ids) @@ -152,7 +214,14 @@ class DataProviderConverter(object): ] for each_sample in dat: - for each_step, scanner in zip(each_sample, scanners): + for each_step, scanner in itertools.izip(each_sample, scanners): + scanner.pre_scan(each_step) + + for scanner in scanners: + scanner.finish_pre_scan(argument) + + for each_sample in dat: + for each_step, scanner in itertools.izip(each_sample, scanners): scanner.scan(each_step)
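The two-pass protocol documented in IScanner's docstring is what makes DenseScanner fast: the first pass only counts rows, finish_pre_scan allocates the whole batch once, and the second pass writes each row in place, replacing the quadratic numpy.append pattern deleted above. A standalone sketch of the idea (not the py_paddle classes; the class name is invented for illustration):

```python
import numpy as np


class TwoPassDenseScanner(object):
    """Count rows first, allocate once, then fill in place.

    Standalone illustration of the pre_scan/finish_pre_scan/scan
    protocol described in IScanner's docstring above.
    """

    def __init__(self, dim):
        self.dim = dim
        self.height = 0
        self.mat = None

    def pre_scan(self, row):
        # pass 1: only count how many rows the batch will hold
        self.height += 1

    def finish_pre_scan(self):
        # allocate the full batch matrix exactly once
        self.mat = np.empty((self.height, self.dim), dtype=np.float32)
        self.height = 0

    def scan(self, row):
        # pass 2: copy each row into the preallocated matrix
        self.mat[self.height] = row
        self.height += 1
```

Compared with appending to a numpy array per sample, this touches each element once and avoids reallocating the matrix on every append.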
for scanner in diff --git a/paddle/py_paddle/util.py b/paddle/py_paddle/util.py index ce105d249aaf3e838443d3e0cf5996fe8c783a22..1c9455fab5f9c1179bddffb100cd53fe8adfb6b1 100644 --- a/paddle/py_paddle/util.py +++ b/paddle/py_paddle/util.py @@ -195,6 +195,12 @@ def __monkeypatch_gradient_machine__(): swig_paddle.GradientMachine.getParameters = getParameters + def getNonStaticParameters(self): + return (self.getNonStaticParameter(i) + for i in xrange(self.getNonStaticParameterSize())) + + swig_paddle.GradientMachine.getNonStaticParameters = getNonStaticParameters + def getLayerOutputs(self, layerNames): """ getLayerOutputs. get outputs of layers and return a numpy matrix dict. @@ -208,7 +214,7 @@ def __monkeypatch_gradient_machine__(): output = dict() for name in layerNames: - output[name] = __matrix_to_numpy__(self.getLayerOutput(name)) + output[name] = __arguments_to_numpy__(0, self.getLayerOutput(name)) return output swig_paddle.GradientMachine.getLayerOutputs = getLayerOutputs diff --git a/paddle/scripts/deb/build_scripts/.gitignore b/paddle/scripts/deb/build_scripts/.gitignore deleted file mode 100644 index 1521c8b7652b1eec8ed4fe50877aae880c758ee3..0000000000000000000000000000000000000000 --- a/paddle/scripts/deb/build_scripts/.gitignore +++ /dev/null @@ -1 +0,0 @@ -dist diff --git a/paddle/scripts/deb/build_scripts/Dockerfile b/paddle/scripts/deb/build_scripts/Dockerfile deleted file mode 100644 index db365a65b7d33429dc1260b2ce69d6dc46abe487..0000000000000000000000000000000000000000 --- a/paddle/scripts/deb/build_scripts/Dockerfile +++ /dev/null @@ -1,5 +0,0 @@ -FROM paddledev/paddle:gpu-latest -MAINTAINER PaddlePaddle Dev Team -COPY build.sh /root/ -CMD cd /root/ && bash build.sh - diff --git a/paddle/scripts/deb/build_scripts/build.sh b/paddle/scripts/deb/build_scripts/build.sh deleted file mode 100755 index d13dea514841b110c304b8aa0e65ad16e42c75f3..0000000000000000000000000000000000000000 --- a/paddle/scripts/deb/build_scripts/build.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash -set -e -apt-get update -apt-get install -y dh-make -cd ~ -mkdir -p ~/dist/gpu -mkdir -p ~/dist/cpu -mkdir -p ~/dist/cpu-noavx -mkdir -p ~/dist/gpu-noavx -cd paddle -mkdir build -cd build -cmake .. -DWITH_GPU=OFF -DWITH_SWIG_PY=ON -DWITH_AVX=ON -make -j `nproc` -cpack -D CPACK_GENERATOR='DEB' .. -mv *.deb ~/dist/cpu - -rm -rf * -cmake .. -DWITH_GPU=ON -DWITH_SWIG_PY=ON -DWITH_AVX=ON -DCUDNN_ROOT=/usr/ -make -j `nproc` -cpack -D CPACK_GENERATOR='DEB' .. -mv *.deb ~/dist/gpu - - -rm -rf * -cmake .. -DWITH_GPU=OFF -DWITH_SWIG_PY=ON -DWITH_AVX=OFF -make -j `nproc` -cpack -D CPACK_GENERATOR='DEB' .. -mv *.deb ~/dist/cpu-noavx - -rm -rf * -cmake .. -DWITH_GPU=ON -DWITH_SWIG_PY=ON -DWITH_AVX=OFF -DCUDNN_ROOT=/usr/ -make -j `nproc` -cpack -D CPACK_GENERATOR='DEB' .. -mv *.deb ~/dist/gpu-noavx diff --git a/paddle/scripts/deb/build_scripts/build_deb.sh b/paddle/scripts/deb/build_scripts/build_deb.sh deleted file mode 100755 index c38c6299f840345b7f6f6e0aad7482241d36198a..0000000000000000000000000000000000000000 --- a/paddle/scripts/deb/build_scripts/build_deb.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -set -e -docker build -t build_paddle_deb .
-rm -rf dist -mkdir -p dist -docker run -v$PWD/dist:/root/dist -v $PWD/../../../..:/root/paddle --name tmp_build_deb_container build_paddle_deb -docker rm tmp_build_deb_container -docker rmi build_paddle_deb diff --git a/paddle/scripts/deb/postinst b/paddle/scripts/deb/postinst deleted file mode 100644 index 1d2dd3171a132966832d87ae758d4e620475aed1..0000000000000000000000000000000000000000 --- a/paddle/scripts/deb/postinst +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash -set -e -echo "Post install paddle debian package." -echo "Install some python package used for paddle. You can run " -echo " pip install /usr/opt/paddle/share/wheels/*.whl to install them." -pip install /usr/opt/paddle/share/wheels/*.whl - diff --git a/paddle/scripts/docker/Dockerfile b/paddle/scripts/docker/Dockerfile deleted file mode 100644 index d4845a72b61833b356779d44de5163f161e0cd4d..0000000000000000000000000000000000000000 --- a/paddle/scripts/docker/Dockerfile +++ /dev/null @@ -1,57 +0,0 @@ -FROM ubuntu:14.04 -MAINTAINER PaddlePaddle Authors - -ARG DEBIAN_FRONTEND=noninteractive -ARG UBUNTU_MIRROR -RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' - -RUN apt-get update && \ - apt-get install -y git python-pip python-dev openssh-server bison && \ - apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \ - apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \ - apt-get install -y python-numpy python-matplotlib gcc g++ gfortran && \ - apt-get install -y automake clang-3.8 llvm-3.8 libclang-3.8-dev && \ - apt-get clean -y - -RUN pip install --upgrade pip && \ - pip install -U protobuf && \ - pip install -U wheel pillow BeautifulSoup && \ - pip install -U docopt PyYAML sphinx && \ - pip install -U sphinx_rtd_theme recommonmark jupyter - -RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \ - cd cmake-3.4.1 && ./bootstrap && make -j4 && make install && \ - cd .. && rm -rf cmake-3.4.1 - -ARG BUILD_AND_INSTALL -ARG WITH_AVX -ARG WITH_DOC -ARG WITH_STYLE_CHECK - -ENV BUILD_AND_INSTALL=${BUILD_AND_INSTALL:-OFF} -ENV WITH_GPU=OFF -ENV WITH_AVX=${WITH_AVX:-ON} -ENV WITH_DOC=${WITH_DOC:-ON} -ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} - -RUN mkdir /paddle -COPY . /paddle/ -RUN /paddle/paddle/scripts/docker/build.sh -VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"] - -# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service -RUN mkdir /var/run/sshd -RUN echo 'root:root' | chpasswd -RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config -RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config -EXPOSE 22 - -# Jupyter Notebook directory. -RUN mkdir /notes/ -WORKDIR "/notes" -EXPOSE 8888 - -RUN mkdir -p /opt/bin -COPY ./paddle/scripts/docker/entrypoint /opt/bin/ - -CMD ["/opt/bin/entrypoint"] diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8c35411fc390ef218e395c58808d644e7a35095e --- /dev/null +++ b/paddle/scripts/docker/README.md @@ -0,0 +1,155 @@ +# Building PaddlePaddle + +## Goals + +We want the building procedure to generate Docker images so that we can run PaddlePaddle applications on Kubernetes clusters. + +We want to build .deb packages so that enterprise users can run PaddlePaddle applications without Docker.
+ +We want to minimize the size of generated Docker images and .deb packages so as to reduce the download time. + +We want to encapsulate building tools and dependencies in a *development* Docker image so as to ease tool installation for developers. + +Developers use various editors (emacs, vim, Eclipse, Jupyter Notebook), so the development Docker image contains only building tools, not editing tools, and developers are supposed to git clone source code into their development computers and map the code into the development container. + +We want the procedure and tools to also work with testing, continuous integration, and releasing. + + +## Docker Images + +So we need two Docker images for each version of PaddlePaddle: + +1. `paddle:<version>-dev` + + This is a development image; it contains only the development tools and standardizes the building procedure. Users include: + + - developers -- no longer need to install development tools on the host, and can build their current work on the host (development computer). + - release engineers -- use this to build the official release from a certain branch/tag on Github.com. + - document writers / Website developers -- Our documents are in the source repo in the form of .md/.rst files and comments in source code. We need tools to extract the information, typeset, and generate Web pages. + + Of course, developers can install building tools on their development computers. But different versions of PaddlePaddle might require different sets or versions of building tools. Also, it makes collaborative debugging easier if all developers use a unified development environment. + + The development image should include the following tools: + + - gcc/clang + - nvcc + - Python + - sphinx + - woboq + - sshd + + Many developers work on a remote computer with GPU; they could ssh into the computer and `docker exec` into the development container. However, running `sshd` in the container allows developers to ssh into the container directly. + +1. `paddle:<version>` + + This is the production image, generated using the development image. This image might have multiple variants: + + - GPU/AVX `paddle:<version>-gpu` + - GPU/no-AVX `paddle:<version>-gpu-noavx` + - no-GPU/AVX `paddle:<version>` + - no-GPU/no-AVX `paddle:<version>-noavx` + + We allow users to choose between GPU and no-GPU because the GPU version image is much larger than the no-GPU version. + + We allow users the choice between AVX and no-AVX, because some cloud providers don't provide AVX-enabled VMs. + + +## Development Environment + +Here we describe how to use the above two images. We start by considering our daily development environment. + +Developers work on a computer, which is usually a laptop or desktop: + +<img src="doc/paddle-development-environment.png"/> + +or, they might rely on a more sophisticated box (like with GPUs): + +<img src="doc/paddle-development-environment-gpu.png"/> + +A principle here is that source code lies on the development computer (host) so that editors like Eclipse can parse the source code to support auto-completion. + + +## Usages + +### Build the Development Docker Image + +The following commands check out the source code to the host and build the development image `paddle:dev`: + +```bash +git clone https://github.com/PaddlePaddle/Paddle paddle +cd paddle +docker build -t paddle:dev . +``` + +The `docker build` command assumes that `Dockerfile` is in the root source tree. Note that in this design, this `Dockerfile` is the only one in our repo.
+ + +### Build PaddlePaddle from Source Code + +Given the development image `paddle:dev`, the following command builds PaddlePaddle from the source tree on the development computer (host): + +```bash +docker run -v $PWD:/paddle -e "GPU=OFF" -e "AVX=ON" -e "TEST=ON" paddle:dev +``` + +This command mounts the source directory on the host into `/paddle` in the container, so the default entry point of `paddle:dev`, `build.sh`, could build the source code with possible local changes. When it writes to `/paddle/build` in the container, it is in fact writing to `$PWD/build` on the host. + +`build.sh` builds the following: + +- PaddlePaddle binaries, +- `$PWD/build/paddle-<version>.deb` for production installation, and +- `$PWD/build/Dockerfile`, which builds the production Docker image. + + +### Build the Production Docker Image + +The following command builds the production image: + +```bash +docker build -t paddle -f build/Dockerfile . +``` + +This production image is minimal -- it includes binary `paddle`, the shared library `libpaddle.so`, and Python runtime. + +### Run PaddlePaddle Applications + +Again the development happens on the host. Suppose that we have a simple application program in `a.py`; we can test and run it using the production image: + +```bash +docker run -it -v $PWD:/work paddle /work/a.py +``` + +But this works only if all dependencies of `a.py` are in the production image. If this is not the case, we need to build a new Docker image from the production image with the additional dependencies installed. + +### Build and Run PaddlePaddle Applications + +We need a Dockerfile in https://github.com/paddlepaddle/book that builds Docker image `paddlepaddle/book:<version>`, based on the PaddlePaddle production image: + +``` +FROM paddlepaddle/paddle:<version> +RUN pip install -U matplotlib jupyter ... +COPY . /book +EXPOSE 8080 +CMD ["jupyter"] +``` + +The book image is an example of a PaddlePaddle application image. We can build it: + +```bash +git clone https://github.com/paddlepaddle/book +cd book +docker build -t book . +``` + +### Build and Run Distributed Applications + +In our [API design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md#distributed-training), we proposed an API that starts a distributed training job on a cluster. This API needs to build a PaddlePaddle application into a Docker image as above and call kubectl to run it on the cluster. It might need to generate a Dockerfile that looks like the one above and call `docker build`; a sketch of that step follows below. + +Of course, we can manually build an application image and launch the job using the kubectl tool: + +```bash +docker build -f some/Dockerfile -t myapp . +docker tag myapp me/myapp +docker push +kubectl ... +```
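To make the "generate a Dockerfile and call `docker build`" step in the distributed-training paragraph concrete, here is a hedged Python sketch of what such an API could do under the hood; the function name, generated file name, and base image are all hypothetical, not an API that exists in this repo:

```python
import os
import subprocess


def build_app_image(app_dir, base_image, tag):
    """Render a Dockerfile for a PaddlePaddle application and build it.

    Hypothetical helper: base_image would be one of the production
    images described above (e.g. a paddle:<version> variant).
    """
    dockerfile = "\n".join([
        "FROM %s" % base_image,       # hypothetical production image
        "COPY . /app",
        'CMD ["paddle", "version"]',  # placeholder entry point
    ])
    with open(os.path.join(app_dir, "Dockerfile.generated"), "w") as f:
        f.write(dockerfile)
    subprocess.check_call(
        ["docker", "build", "-f", "Dockerfile.generated", "-t", tag, "."],
        cwd=app_dir)
```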
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index 6197b41d6b5f191a452cbf32b47e0ff490b61046..c44874eede03a8b1060b15e175ad89622f925940 100755 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -7,19 +7,48 @@ function abort(){ trap 'abort' 0 set -e - +mkdir -p /paddle/dist/cpu +mkdir -p /paddle/dist/gpu +mkdir -p /paddle/dist/cpu-noavx +mkdir -p /paddle/dist/gpu-noavx +# Set BASE_IMAGE and DEB_PATH according to env variables +if [ ${WITH_GPU} == "ON" ]; then + BASE_IMAGE="nvidia/cuda:7.5-cudnn5-runtime-ubuntu14.04" + # additional packages to install when building gpu images + GPU_DOCKER_PKG="python-pip" + if [ ${WITH_AVX} == "ON" ]; then + DEB_PATH="dist/gpu/" + DOCKER_SUFFIX="gpu" + else + DEB_PATH="dist/gpu-noavx/" + DOCKER_SUFFIX="gpu-noavx" + fi +else + BASE_IMAGE="python:2.7.13-slim" + if [ ${WITH_AVX} == "ON" ]; then + DEB_PATH="dist/cpu/" + DOCKER_SUFFIX="cpu" + else + DEB_PATH="dist/cpu-noavx/" + DOCKER_SUFFIX="noavx" + fi +fi # If Dockerfile.* sets BUILD_AND_INSTALL to 'ON', it would have copied # source tree to /paddle, and this script should build it into # /paddle/build. -if [[ ${BUILD_AND_INSTALL:-ON} == 'ON' ]]; then +if [[ ${BUILD_AND_INSTALL:-OFF} == 'ON' ]]; then if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/lib/libcudnn.so fi mkdir -p /paddle/build # -p means no error if exists cd /paddle/build + # clean local cmake and third_party cache + if [ ${DELETE_BUILD_CACHE} == 'ON' ]; then + rm -rf * && rm -rf ../third_party + fi cmake .. \ - -DWITH_DOC=ON \ + -DWITH_DOC=${WITH_DOC:-OFF} \ -DWITH_GPU=${WITH_GPU:-OFF} \ -DWITH_AVX=${WITH_AVX:-OFF} \ -DWITH_SWIG_PY=ON \ @@ -28,30 +57,88 @@ if [[ ${BUILD_AND_INSTALL:-ON} == 'ON' ]]; then -DCMAKE_EXPORT_COMPILE_COMMANDS=ON make -j `nproc` make install + # generate deb package for current build + # FIXME(typhoonzero): should we remove paddle/scripts/deb ? + # FIXME: CPACK_DEBIAN_PACKAGE_DEPENDS removes all dev dependencies, must + # install them in docker + cpack -D CPACK_GENERATOR='DEB' -D CPACK_DEBIAN_PACKAGE_DEPENDS="" .. + mv /paddle/build/*.deb /paddle/${DEB_PATH} - # Install woboq_codebrowser. - git clone https://github.com/woboq/woboq_codebrowser /woboq - cd /woboq - cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ - -DCMAKE_BUILD_TYPE=Release \ - . - make - - export WOBOQ_OUT=/usr/share/nginx/html/paddle - export BUILD_DIR=/paddle/build - mkdir -p $WOBOQ_OUT - cp -rv /woboq/data $WOBOQ_OUT/../data - /woboq/generator/codebrowser_generator \ - -b /paddle/build \ - -a \ - -o $WOBOQ_OUT \ - -p paddle:/paddle - /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT - cd /woboq - make clean - - pip install /usr/local/opt/paddle/share/wheels/*.whl + if [[ ${BUILD_WOBOQ:-OFF} == 'ON' ]]; then + apt-get install -y clang-3.8 llvm-3.8 libclang-3.8-dev + # Install woboq_codebrowser. + git clone https://github.com/woboq/woboq_codebrowser /woboq + cd /woboq + cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ + -DCMAKE_BUILD_TYPE=Release \ + .
+ make + + export WOBOQ_OUT=/usr/share/nginx/html/paddle + export BUILD_DIR=/paddle/build + mkdir -p $WOBOQ_OUT + cp -rv /woboq/data $WOBOQ_OUT/../data + /woboq/generator/codebrowser_generator \ + -b /paddle/build \ + -a \ + -o $WOBOQ_OUT \ + -p paddle:/paddle + /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT + cd /woboq + make clean + fi + + pip install /usr/local/opt/paddle/share/wheels/py_paddle*linux*.whl + pip install /usr/local/opt/paddle/share/wheels/paddle*.whl paddle version + + if [[ ${DOCKER_BUILD:-FALSE} == 'TRUE' ]]; then + # reduce docker image size + rm -rf /paddle/build + rm -rf /usr/local/opt/paddle/share/wheels/ + fi fi +# generate production docker image Dockerfile +if [ ${USE_MIRROR} ]; then + MIRROR_UPDATE="sed 's@http:\/\/archive.ubuntu.com\/ubuntu\/@mirror:\/\/mirrors.ubuntu.com\/mirrors.txt@' -i /etc/apt/sources.list && \\" +else + MIRROR_UPDATE="\\" +fi + +cat > /paddle/build/Dockerfile.${DOCKER_SUFFIX} <<EOF +FROM ${BASE_IMAGE} + +# ENV variables +ARG WITH_AVX +ARG WITH_DOC +ARG WITH_STYLE_CHECK + +ENV WITH_GPU=${WITH_GPU} +ENV WITH_AVX=\${WITH_AVX:-ON} +ENV WITH_DOC=\${WITH_DOC:-OFF} +ENV WITH_STYLE_CHECK=\${WITH_STYLE_CHECK:-OFF} + +ENV HOME /root +ENV LANG en_US.UTF-8 + +# Fix locales to en_US.UTF-8 + +RUN ${MIRROR_UPDATE} + apt-get update && \ + apt-get install -y libgfortran3 ${GPU_DOCKER_PKG} && \ + apt-get clean -y && \ + pip install --upgrade pip && \ + pip install -U 'protobuf==3.1.0' requests +RUN pip install numpy +# Use a different deb file when building different types of images +ADD \$PWD/${DEB_PATH}*.deb /usr/local/opt/paddle/deb/ +RUN dpkg --force-all -i /usr/local/opt/paddle/deb/*.deb && rm -f /usr/local/opt/paddle/deb/*.deb + +ENV PATH="/usr/local/opt/paddle/bin/:${PATH}" +# default command shows the paddle version and exits +CMD ["paddle", "version"] +EOF + trap : 0 diff --git a/paddle/scripts/docker/doc/paddle-development-environment-gpu.graffle b/paddle/scripts/docker/doc/paddle-development-environment-gpu.graffle new file mode 100644 index 0000000000000000000000000000000000000000..4629f9b9da7ababdafa0b964db18a98a819c6a9e Binary files /dev/null and b/paddle/scripts/docker/doc/paddle-development-environment-gpu.graffle differ diff --git a/paddle/scripts/docker/doc/paddle-development-environment-gpu.png b/paddle/scripts/docker/doc/paddle-development-environment-gpu.png new file mode 100644 index 0000000000000000000000000000000000000000..61a96d7198d013f08f0f9c269cc352da5f7dd2e9 Binary files /dev/null and b/paddle/scripts/docker/doc/paddle-development-environment-gpu.png differ diff --git a/paddle/scripts/docker/doc/paddle-development-environment.graffle b/paddle/scripts/docker/doc/paddle-development-environment.graffle new file mode 100644 index 0000000000000000000000000000000000000000..5b164c4832809de94ead7309af49c579135d7f48 Binary files /dev/null and b/paddle/scripts/docker/doc/paddle-development-environment.graffle differ diff --git a/paddle/scripts/docker/doc/paddle-development-environment.png b/paddle/scripts/docker/doc/paddle-development-environment.png new file mode 100644 index 0000000000000000000000000000000000000000..707ed45a335a981c23b3533984045f53848b55e2 Binary files /dev/null and b/paddle/scripts/docker/doc/paddle-development-environment.png differ diff --git a/paddle/scripts/docker/entrypoint b/paddle/scripts/docker/entrypoint index 87083467f50acd689ce57b86951f5f7a03c6a58b..bc194bd909aa308fd5fe920c9319f62a0ec2dac7 100755 --- a/paddle/scripts/docker/entrypoint +++ b/paddle/scripts/docker/entrypoint @@ -1,8 +1,4 @@ #!/bin/bash -LOG=/var/log/all
-touch $LOG - -/usr/sbin/sshd -D >> $LOG & -jupyter notebook --ip=0.0.0.0 /notes/ >> $LOG & -tail -f $LOG +/usr/sbin/sshd -D & +jupyter notebook --ip=0.0.0.0 /paddle/book/ diff --git a/paddle/scripts/docker/root/.bashrc b/paddle/scripts/docker/root/.bashrc new file mode 100755 index 0000000000000000000000000000000000000000..4b3024e4e81a0fa206a796c12a8b9d72f1a8f5d9 --- /dev/null +++ b/paddle/scripts/docker/root/.bashrc @@ -0,0 +1,46 @@ +# Locales + +export LC_ALL=en_US.UTF-8 +export LANG=en_US.UTF-8 +export LANGUAGE=en_US.UTF-8 + +# Aliases + +alias rm='rm -i' +alias cp='cp -i' +alias mv='mv -i' + +alias ls='ls -hFG' +alias l='ls -lF' +alias ll='ls -alF' +alias lt='ls -ltrF' +alias ll='ls -alF' +alias lls='ls -alSrF' +alias llt='ls -altrF' + +# Colorize directory listing + +alias ls="ls -ph --color=auto" + +# Colorize grep + +if echo hello|grep --color=auto l >/dev/null 2>&1; then + export GREP_OPTIONS="--color=auto" GREP_COLOR="1;31" +fi + +# Shell + +export CLICOLOR="1" + +YELLOW="\[\033[1;33m\]" +NO_COLOUR="\[\033[0m\]" +GREEN="\[\033[1;32m\]" +WHITE="\[\033[1;37m\]" + +source ~/.scripts/git-prompt.sh + +export PS1="\[\033[1;33m\]λ $WHITE\h $GREEN\w$YELLOW\$(__git_ps1 \" \[\033[35m\]{\[\033[36m\]%s\[\033[35m\]}\")$NO_COLOUR " + +# Git + +source ~/.scripts/git-completion.sh diff --git a/paddle/scripts/docker/root/.gitconfig b/paddle/scripts/docker/root/.gitconfig new file mode 100755 index 0000000000000000000000000000000000000000..6c249803a50403b9b79e36a13abe7fe88a35729d --- /dev/null +++ b/paddle/scripts/docker/root/.gitconfig @@ -0,0 +1,43 @@ +[user] + name = + email = + +[alias] + st = status --branch --short + ci = commit + br = branch + co = checkout + df = diff + l = log --pretty=format:\"%h %ad | %s%d [%an]\" --graph --date=short + ll = log --stat + +[merge] + tool = vimdiff + +[core] + excludesfile = ~/.gitignore + editor = vim + +[color] + branch = auto + diff = auto + status = auto + +[color "branch"] + current = yellow reverse + local = yellow + remote = green + +[color "diff"] + meta = yellow bold + frag = magenta bold + old = red bold + new = green bold + +[color "status"] + added = yellow + changed = green + untracked = cyan + +[push] + default = matching \ No newline at end of file diff --git a/paddle/scripts/docker/root/.scripts/git-completion.sh b/paddle/scripts/docker/root/.scripts/git-completion.sh new file mode 100755 index 0000000000000000000000000000000000000000..bdddef5ac2faf50b47dd03539dae8912bec8a16c --- /dev/null +++ b/paddle/scripts/docker/root/.scripts/git-completion.sh @@ -0,0 +1,2663 @@ +#!bash +# +# bash/zsh completion support for core Git. +# +# Copyright (C) 2006,2007 Shawn O. Pearce +# Conceptually based on gitcompletion (http://gitweb.hawaga.org.uk/). +# Distributed under the GNU General Public License, version 2.0. +# +# The contained completion routines provide support for completing: +# +# *) local and remote branch names +# *) local and remote tag names +# *) .git/remotes file names +# *) git 'subcommands' +# *) tree paths within 'ref:path/to/file' expressions +# *) file paths within current working directory and index +# *) common --long-options +# +# To use these routines: +# +# 1) Copy this file to somewhere (e.g. ~/.git-completion.sh). +# 2) Add the following line to your .bashrc/.zshrc: +# source ~/.git-completion.sh +# 3) Consider changing your PS1 to also show the current branch, +# see git-prompt.sh for details. 
+ +case "$COMP_WORDBREAKS" in +*:*) : great ;; +*) COMP_WORDBREAKS="$COMP_WORDBREAKS:" +esac + +# __gitdir accepts 0 or 1 arguments (i.e., location) +# returns location of .git repo +__gitdir () +{ + if [ -z "${1-}" ]; then + if [ -n "${__git_dir-}" ]; then + echo "$__git_dir" + elif [ -n "${GIT_DIR-}" ]; then + test -d "${GIT_DIR-}" || return 1 + echo "$GIT_DIR" + elif [ -d .git ]; then + echo .git + else + git rev-parse --git-dir 2>/dev/null + fi + elif [ -d "$1/.git" ]; then + echo "$1/.git" + else + echo "$1" + fi +} + +# The following function is based on code from: +# +# bash_completion - programmable completion functions for bash 3.2+ +# +# Copyright © 2006-2008, Ian Macdonald +# © 2009-2010, Bash Completion Maintainers +# +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2, or (at your option) +# any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software Foundation, +# Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. +# +# The latest version of this software can be obtained here: +# +# http://bash-completion.alioth.debian.org/ +# +# RELEASE: 2.x + +# This function can be used to access a tokenized list of words +# on the command line: +# +# __git_reassemble_comp_words_by_ref '=:' +# if test "${words_[cword_-1]}" = -w +# then +# ... +# fi +# +# The argument should be a collection of characters from the list of +# word completion separators (COMP_WORDBREAKS) to treat as ordinary +# characters. +# +# This is roughly equivalent to going back in time and setting +# COMP_WORDBREAKS to exclude those characters. The intent is to +# make option types like --date= and : easy to +# recognize by treating each shell word as a single token. +# +# It is best not to set COMP_WORDBREAKS directly because the value is +# shared with other completion scripts. By the time the completion +# function gets called, COMP_WORDS has already been populated so local +# changes to COMP_WORDBREAKS have no effect. +# +# Output: words_, cword_, cur_. + +__git_reassemble_comp_words_by_ref() +{ + local exclude i j first + # Which word separators to exclude? + exclude="${1//[^$COMP_WORDBREAKS]}" + cword_=$COMP_CWORD + if [ -z "$exclude" ]; then + words_=("${COMP_WORDS[@]}") + return + fi + # List of word completion separators has shrunk; + # re-assemble words to complete. + for ((i=0, j=0; i < ${#COMP_WORDS[@]}; i++, j++)); do + # Append each nonempty word consisting of just + # word separator characters to the current word. + first=t + while + [ $i -gt 0 ] && + [ -n "${COMP_WORDS[$i]}" ] && + # word consists of excluded word separators + [ "${COMP_WORDS[$i]//[^$exclude]}" = "${COMP_WORDS[$i]}" ] + do + # Attach to the previous token, + # unless the previous token is the command name. + if [ $j -ge 2 ] && [ -n "$first" ]; then + ((j--)) + fi + first= + words_[$j]=${words_[j]}${COMP_WORDS[i]} + if [ $i = $COMP_CWORD ]; then + cword_=$j + fi + if (($i < ${#COMP_WORDS[@]} - 1)); then + ((i++)) + else + # Done. 
+ return + fi + done + words_[$j]=${words_[j]}${COMP_WORDS[i]} + if [ $i = $COMP_CWORD ]; then + cword_=$j + fi + done +} + +if ! type _get_comp_words_by_ref >/dev/null 2>&1; then +_get_comp_words_by_ref () +{ + local exclude cur_ words_ cword_ + if [ "$1" = "-n" ]; then + exclude=$2 + shift 2 + fi + __git_reassemble_comp_words_by_ref "$exclude" + cur_=${words_[cword_]} + while [ $# -gt 0 ]; do + case "$1" in + cur) + cur=$cur_ + ;; + prev) + prev=${words_[$cword_-1]} + ;; + words) + words=("${words_[@]}") + ;; + cword) + cword=$cword_ + ;; + esac + shift + done +} +fi + +__gitcompadd () +{ + local i=0 + for x in $1; do + if [[ "$x" == "$3"* ]]; then + COMPREPLY[i++]="$2$x$4" + fi + done +} + +# Generates completion reply, appending a space to possible completion words, +# if necessary. +# It accepts 1 to 4 arguments: +# 1: List of possible completion words. +# 2: A prefix to be added to each possible completion word (optional). +# 3: Generate possible completion matches for this word (optional). +# 4: A suffix to be appended to each possible completion word (optional). +__gitcomp () +{ + local cur_="${3-$cur}" + + case "$cur_" in + --*=) + ;; + *) + local c i=0 IFS=$' \t\n' + for c in $1; do + c="$c${4-}" + if [[ $c == "$cur_"* ]]; then + case $c in + --*=*|*.) ;; + *) c="$c " ;; + esac + COMPREPLY[i++]="${2-}$c" + fi + done + ;; + esac +} + +# Generates completion reply from newline-separated possible completion words +# by appending a space to all of them. +# It accepts 1 to 4 arguments: +# 1: List of possible completion words, separated by a single newline. +# 2: A prefix to be added to each possible completion word (optional). +# 3: Generate possible completion matches for this word (optional). +# 4: A suffix to be appended to each possible completion word instead of +# the default space (optional). If specified but empty, nothing is +# appended. +__gitcomp_nl () +{ + local IFS=$'\n' + __gitcompadd "$1" "${2-}" "${3-$cur}" "${4- }" +} + +# Generates completion reply with compgen from newline-separated possible +# completion filenames. +# It accepts 1 to 3 arguments: +# 1: List of possible completion filenames, separated by a single newline. +# 2: A directory prefix to be added to each possible completion filename +# (optional). +# 3: Generate possible completion matches for this word (optional). +__gitcomp_file () +{ + local IFS=$'\n' + + # XXX does not work when the directory prefix contains a tilde, + # since tilde expansion is not applied. + # This means that COMPREPLY will be empty and Bash default + # completion will be used. + __gitcompadd "$1" "${2-}" "${3-$cur}" "" + + # use a hack to enable file mode in bash < 4 + compopt -o filenames +o nospace 2>/dev/null || + compgen -f /non-existing-dir/ > /dev/null +} + +# Execute 'git ls-files', unless the --committable option is specified, in +# which case it runs 'git diff-index' to find out the files that can be +# committed. It return paths relative to the directory specified in the first +# argument, and using the options specified in the second argument. +__git_ls_files_helper () +{ + ( + test -n "${CDPATH+set}" && unset CDPATH + cd "$1" + if [ "$2" == "--committable" ]; then + git diff-index --name-only --relative HEAD + else + # NOTE: $2 is not quoted in order to support multiple options + git ls-files --exclude-standard $2 + fi + ) 2>/dev/null +} + + +# __git_index_files accepts 1 or 2 arguments: +# 1: Options to pass to ls-files (required). +# 2: A directory path (optional). 
+# If provided, only files within the specified directory are listed. +# Sub directories are never recursed. Path must have a trailing +# slash. +__git_index_files () +{ + local dir="$(__gitdir)" root="${2-.}" file + + if [ -d "$dir" ]; then + __git_ls_files_helper "$root" "$1" | + while read -r file; do + case "$file" in + ?*/*) echo "${file%%/*}" ;; + *) echo "$file" ;; + esac + done | sort | uniq + fi +} + +__git_heads () +{ + local dir="$(__gitdir)" + if [ -d "$dir" ]; then + git --git-dir="$dir" for-each-ref --format='%(refname:short)' \ + refs/heads + return + fi +} + +__git_tags () +{ + local dir="$(__gitdir)" + if [ -d "$dir" ]; then + git --git-dir="$dir" for-each-ref --format='%(refname:short)' \ + refs/tags + return + fi +} + +# __git_refs accepts 0, 1 (to pass to __gitdir), or 2 arguments +# presence of 2nd argument means use the guess heuristic employed +# by checkout for tracking branches +__git_refs () +{ + local i hash dir="$(__gitdir "${1-}")" track="${2-}" + local format refs + if [ -d "$dir" ]; then + case "$cur" in + refs|refs/*) + format="refname" + refs="${cur%/*}" + track="" + ;; + *) + for i in HEAD FETCH_HEAD ORIG_HEAD MERGE_HEAD; do + if [ -e "$dir/$i" ]; then echo $i; fi + done + format="refname:short" + refs="refs/tags refs/heads refs/remotes" + ;; + esac + git --git-dir="$dir" for-each-ref --format="%($format)" \ + $refs + if [ -n "$track" ]; then + # employ the heuristic used by git checkout + # Try to find a remote branch that matches the completion word + # but only output if the branch name is unique + local ref entry + git --git-dir="$dir" for-each-ref --shell --format="ref=%(refname:short)" \ + "refs/remotes/" | \ + while read -r entry; do + eval "$entry" + ref="${ref#*/}" + if [[ "$ref" == "$cur"* ]]; then + echo "$ref" + fi + done | sort | uniq -u + fi + return + fi + case "$cur" in + refs|refs/*) + git ls-remote "$dir" "$cur*" 2>/dev/null | \ + while read -r hash i; do + case "$i" in + *^{}) ;; + *) echo "$i" ;; + esac + done + ;; + *) + echo "HEAD" + git for-each-ref --format="%(refname:short)" -- "refs/remotes/$dir/" | sed -e "s#^$dir/##" + ;; + esac +} + +# __git_refs2 requires 1 argument (to pass to __git_refs) +__git_refs2 () +{ + local i + for i in $(__git_refs "$1"); do + echo "$i:$i" + done +} + +# __git_refs_remotes requires 1 argument (to pass to ls-remote) +__git_refs_remotes () +{ + local i hash + git ls-remote "$1" 'refs/heads/*' 2>/dev/null | \ + while read -r hash i; do + echo "$i:refs/remotes/$1/${i#refs/heads/}" + done +} + +__git_remotes () +{ + local i IFS=$'\n' d="$(__gitdir)" + test -d "$d/remotes" && ls -1 "$d/remotes" + for i in $(git --git-dir="$d" config --get-regexp 'remote\..*\.url' 2>/dev/null); do + i="${i#remote.}" + echo "${i/.url*/}" + done +} + +__git_list_merge_strategies () +{ + git merge -s help 2>&1 | + sed -n -e '/[Aa]vailable strategies are: /,/^$/{ + s/\.$// + s/.*:// + s/^[ ]*// + s/[ ]*$// + p + }' +} + +__git_merge_strategies= +# 'git merge -s help' (and thus detection of the merge strategy +# list) fails, unfortunately, if run outside of any git working +# tree. __git_merge_strategies is set to the empty string in +# that case, and the detection will be repeated the next time it +# is needed. 
+__git_compute_merge_strategies ()
+{
+  test -n "$__git_merge_strategies" ||
+  __git_merge_strategies=$(__git_list_merge_strategies)
+}
+
+__git_complete_revlist_file ()
+{
+  local pfx ls ref cur_="$cur"
+  case "$cur_" in
+  *..?*:*)
+    return
+    ;;
+  ?*:*)
+    ref="${cur_%%:*}"
+    cur_="${cur_#*:}"
+    case "$cur_" in
+    ?*/*)
+      pfx="${cur_%/*}"
+      cur_="${cur_##*/}"
+      ls="$ref:$pfx"
+      pfx="$pfx/"
+      ;;
+    *)
+      ls="$ref"
+      ;;
+    esac
+
+    case "$COMP_WORDBREAKS" in
+    *:*) : great ;;
+    *) pfx="$ref:$pfx" ;;
+    esac
+
+    __gitcomp_nl "$(git --git-dir="$(__gitdir)" ls-tree "$ls" 2>/dev/null \
+        | sed '/^100... blob /{
+                   s,^.* ,,
+                   s,$, ,
+               }
+               /^120000 blob /{
+                   s,^.* ,,
+                   s,$, ,
+               }
+               /^040000 tree /{
+                   s,^.* ,,
+                   s,$,/,
+               }
+               s/^.* //')" \
+      "$pfx" "$cur_" ""
+    ;;
+  *...*)
+    pfx="${cur_%...*}..."
+    cur_="${cur_#*...}"
+    __gitcomp_nl "$(__git_refs)" "$pfx" "$cur_"
+    ;;
+  *..*)
+    pfx="${cur_%..*}.."
+    cur_="${cur_#*..}"
+    __gitcomp_nl "$(__git_refs)" "$pfx" "$cur_"
+    ;;
+  *)
+    __gitcomp_nl "$(__git_refs)"
+    ;;
+  esac
+}
+
+
+# __git_complete_index_file requires 1 argument:
+# 1: the options to pass to ls-file
+#
+# The exception is --committable, which finds the files appropriate commit.
+__git_complete_index_file ()
+{
+  local pfx="" cur_="$cur"
+
+  case "$cur_" in
+  ?*/*)
+    pfx="${cur_%/*}"
+    cur_="${cur_##*/}"
+    pfx="${pfx}/"
+    ;;
+  esac
+
+  __gitcomp_file "$(__git_index_files "$1" "$pfx")" "$pfx" "$cur_"
+}
+
+__git_complete_file ()
+{
+  __git_complete_revlist_file
+}
+
+__git_complete_revlist ()
+{
+  __git_complete_revlist_file
+}
+
+__git_complete_remote_or_refspec ()
+{
+  local cur_="$cur" cmd="${words[1]}"
+  local i c=2 remote="" pfx="" lhs=1 no_complete_refspec=0
+  if [ "$cmd" = "remote" ]; then
+    ((c++))
+  fi
+  while [ $c -lt $cword ]; do
+    i="${words[c]}"
+    case "$i" in
+    --mirror) [ "$cmd" = "push" ] && no_complete_refspec=1 ;;
+    --all)
+      case "$cmd" in
+      push) no_complete_refspec=1 ;;
+      fetch)
+        return
+        ;;
+      *) ;;
+      esac
+      ;;
+    -*) ;;
+    *) remote="$i"; break ;;
+    esac
+    ((c++))
+  done
+  if [ -z "$remote" ]; then
+    __gitcomp_nl "$(__git_remotes)"
+    return
+  fi
+  if [ $no_complete_refspec = 1 ]; then
+    return
+  fi
+  [ "$remote" = "." ] && remote=
+  case "$cur_" in
+  *:*)
+    case "$COMP_WORDBREAKS" in
+    *:*) : great ;;
+    *) pfx="${cur_%%:*}:" ;;
+    esac
+    cur_="${cur_#*:}"
+    lhs=0
+    ;;
+  +*)
+    pfx="+"
+    cur_="${cur_#+}"
+    ;;
+  esac
+  case "$cmd" in
+  fetch)
+    if [ $lhs = 1 ]; then
+      __gitcomp_nl "$(__git_refs2 "$remote")" "$pfx" "$cur_"
+    else
+      __gitcomp_nl "$(__git_refs)" "$pfx" "$cur_"
+    fi
+    ;;
+  pull|remote)
+    if [ $lhs = 1 ]; then
+      __gitcomp_nl "$(__git_refs "$remote")" "$pfx" "$cur_"
+    else
+      __gitcomp_nl "$(__git_refs)" "$pfx" "$cur_"
+    fi
+    ;;
+  push)
+    if [ $lhs = 1 ]; then
+      __gitcomp_nl "$(__git_refs)" "$pfx" "$cur_"
+    else
+      __gitcomp_nl "$(__git_refs "$remote")" "$pfx" "$cur_"
+    fi
+    ;;
+  esac
+}
+
+__git_complete_strategy ()
+{
+  __git_compute_merge_strategies
+  case "$prev" in
+  -s|--strategy)
+    __gitcomp "$__git_merge_strategies"
+    return 0
+  esac
+  case "$cur" in
+  --strategy=*)
+    __gitcomp "$__git_merge_strategies" "" "${cur##--strategy=}"
+    return 0
+    ;;
+  esac
+  return 1
+}
+
+__git_commands () {
+  if test -n "${GIT_TESTING_COMMAND_COMPLETION:-}"
+  then
+    printf "%s" "${GIT_TESTING_COMMAND_COMPLETION}"
+  else
+    git help -a|egrep '^  [a-zA-Z0-9]'
+  fi
+}
+
+__git_list_all_commands ()
+{
+  local i IFS=" "$'\n'
+  for i in $(__git_commands)
+  do
+    case $i in
+    *--*) : helper pattern;;
+    *) echo $i;;
+    esac
+  done
+}
+
+__git_all_commands=
+__git_compute_all_commands ()
+{
+  test -n "$__git_all_commands" ||
+  __git_all_commands=$(__git_list_all_commands)
+}
+
+__git_list_porcelain_commands ()
+{
+  local i IFS=" "$'\n'
+  __git_compute_all_commands
+  for i in $__git_all_commands
+  do
+    case $i in
+    *--*) : helper pattern;;
+    applymbox) : ask gittus;;
+    applypatch) : ask gittus;;
+    archimport) : import;;
+    cat-file) : plumbing;;
+    check-attr) : plumbing;;
+    check-ignore) : plumbing;;
+    check-mailmap) : plumbing;;
+    check-ref-format) : plumbing;;
+    checkout-index) : plumbing;;
+    commit-tree) : plumbing;;
+    count-objects) : infrequent;;
+    credential-cache) : credentials helper;;
+    credential-store) : credentials helper;;
+    cvsexportcommit) : export;;
+    cvsimport) : import;;
+    cvsserver) : daemon;;
+    daemon) : daemon;;
+    diff-files) : plumbing;;
+    diff-index) : plumbing;;
+    diff-tree) : plumbing;;
+    fast-import) : import;;
+    fast-export) : export;;
+    fsck-objects) : plumbing;;
+    fetch-pack) : plumbing;;
+    fmt-merge-msg) : plumbing;;
+    for-each-ref) : plumbing;;
+    hash-object) : plumbing;;
+    http-*) : transport;;
+    index-pack) : plumbing;;
+    init-db) : deprecated;;
+    local-fetch) : plumbing;;
+    lost-found) : infrequent;;
+    ls-files) : plumbing;;
+    ls-remote) : plumbing;;
+    ls-tree) : plumbing;;
+    mailinfo) : plumbing;;
+    mailsplit) : plumbing;;
+    merge-*) : plumbing;;
+    mktree) : plumbing;;
+    mktag) : plumbing;;
+    pack-objects) : plumbing;;
+    pack-redundant) : plumbing;;
+    pack-refs) : plumbing;;
+    parse-remote) : plumbing;;
+    patch-id) : plumbing;;
+    peek-remote) : plumbing;;
+    prune) : plumbing;;
+    prune-packed) : plumbing;;
+    quiltimport) : import;;
+    read-tree) : plumbing;;
+    receive-pack) : plumbing;;
+    remote-*) : transport;;
+    repo-config) : deprecated;;
+    rerere) : plumbing;;
+    rev-list) : plumbing;;
+    rev-parse) : plumbing;;
+    runstatus) : plumbing;;
+    sh-setup) : internal;;
+    shell) : daemon;;
+    show-ref) : plumbing;;
+    send-pack) : plumbing;;
+    show-index) : plumbing;;
+    ssh-*) : transport;;
+    stripspace) : plumbing;;
+    symbolic-ref) : plumbing;;
+    tar-tree) : deprecated;;
+    unpack-file) : plumbing;;
+    unpack-objects) : plumbing;;
+    update-index) : plumbing;;
+    update-ref) : plumbing;;
+    update-server-info) : daemon;;
+    upload-archive) : plumbing;;
+    upload-pack) : plumbing;;
+    write-tree) : plumbing;;
+    var) : infrequent;;
+    verify-pack) : infrequent;;
+    verify-tag) : plumbing;;
+    *) echo $i;;
+    esac
+  done
+}
+
+__git_porcelain_commands=
+__git_compute_porcelain_commands ()
+{
+  __git_compute_all_commands
+  test -n "$__git_porcelain_commands" ||
+  __git_porcelain_commands=$(__git_list_porcelain_commands)
+}
+
+__git_pretty_aliases ()
+{
+  local i IFS=$'\n'
+  for i in $(git --git-dir="$(__gitdir)" config --get-regexp "pretty\..*" 2>/dev/null); do
+    case "$i" in
+    pretty.*)
+      i="${i#pretty.}"
+      echo "${i/ */}"
+      ;;
+    esac
+  done
+}
+
+__git_aliases ()
+{
+  local i IFS=$'\n'
+  for i in $(git --git-dir="$(__gitdir)" config --get-regexp "alias\..*" 2>/dev/null); do
+    case "$i" in
+    alias.*)
+      i="${i#alias.}"
+      echo "${i/ */}"
+      ;;
+    esac
+  done
+}
+
+# __git_aliased_command requires 1 argument
+__git_aliased_command ()
+{
+  local word cmdline=$(git --git-dir="$(__gitdir)" \
+    config --get "alias.$1")
+  for word in $cmdline; do
+    case "$word" in
+    \!gitk|gitk)
+      echo "gitk"
+      return
+      ;;
+    \!*) : shell command alias ;;
+    -*) : option ;;
+    *=*) : setting env ;;
+    git) : git itself ;;
+    *)
+      echo "$word"
+      return
+    esac
+  done
+}
+
+# __git_find_on_cmdline requires 1 argument
+__git_find_on_cmdline ()
+{
+  local word subcommand c=1
+  while [ $c -lt $cword ]; do
+    word="${words[c]}"
+    for subcommand in $1; do
+      if [ "$subcommand" = "$word" ]; then
+        echo "$subcommand"
+        return
+      fi
+    done
+    ((c++))
+  done
+}
+
+__git_has_doubledash ()
+{
+  local c=1
+  while [ $c -lt $cword ]; do
+    if [ "--" = "${words[c]}" ]; then
+      return 0
+    fi
+    ((c++))
+  done
+  return 1
+}
+
+# Try to count non option arguments passed on the command line for the
+# specified git command.
+# When options are used, it is necessary to use the special -- option to
+# tell the implementation were non option arguments begin.
+# XXX this can not be improved, since options can appear everywhere, as
+# an example:
+# git mv x -n y
+#
+# __git_count_arguments requires 1 argument: the git command executed.
+__git_count_arguments ()
+{
+  local word i c=0
+
+  # Skip "git" (first argument)
+  for ((i=1; i < ${#words[@]}; i++)); do
+    word="${words[i]}"
+
+    case "$word" in
+    --)
+      # Good; we can assume that the following are only non
+      # option arguments.
+      ((c = 0))
+      ;;
+    "$1")
+      # Skip the specified git command and discard git
+      # main options
+      ((c = 0))
+      ;;
+    ?*)
+      ((c++))
+      ;;
+    esac
+  done
+
+  printf "%d" $c
+}
+
+__git_whitespacelist="nowarn warn error error-all fix"
+
+_git_am ()
+{
+  local dir="$(__gitdir)"
+  if [ -d "$dir"/rebase-apply ]; then
+    __gitcomp "--skip --continue --resolved --abort"
+    return
+  fi
+  case "$cur" in
+  --whitespace=*)
+    __gitcomp "$__git_whitespacelist" "" "${cur##--whitespace=}"
+    return
+    ;;
+  --*)
+    __gitcomp "
+      --3way --committer-date-is-author-date --ignore-date
+      --ignore-whitespace --ignore-space-change
+      --interactive --keep --no-utf8 --signoff --utf8
+      --whitespace= --scissors
+      "
+    return
+  esac
+}
+
+_git_apply ()
+{
+  case "$cur" in
+  --whitespace=*)
+    __gitcomp "$__git_whitespacelist" "" "${cur##--whitespace=}"
+    return
+    ;;
+  --*)
+    __gitcomp "
+      --stat --numstat --summary --check --index
+      --cached --index-info --reverse --reject --unidiff-zero
+      --apply --no-add --exclude=
+      --ignore-whitespace --ignore-space-change
+      --whitespace= --inaccurate-eof --verbose
+      "
+    return
+  esac
+}
+
+_git_add ()
+{
+  case "$cur" in
+  --*)
+    __gitcomp "
+      --interactive --refresh --patch --update --dry-run
+      --ignore-errors --intent-to-add
+      "
+    return
+  esac
+
+  # XXX should we check for --update and --all options ?
+  __git_complete_index_file "--others --modified"
+}
+
+_git_archive ()
+{
+  case "$cur" in
+  --format=*)
+    __gitcomp "$(git archive --list)" "" "${cur##--format=}"
+    return
+    ;;
+  --remote=*)
+    __gitcomp_nl "$(__git_remotes)" "" "${cur##--remote=}"
+    return
+    ;;
+  --*)
+    __gitcomp "
+      --format= --list --verbose
+      --prefix= --remote= --exec=
+      "
+    return
+    ;;
+  esac
+  __git_complete_file
+}
+
+_git_bisect ()
+{
+  __git_has_doubledash && return
+
+  local subcommands="start bad good skip reset visualize replay log run"
+  local subcommand="$(__git_find_on_cmdline "$subcommands")"
+  if [ -z "$subcommand" ]; then
+    if [ -f "$(__gitdir)"/BISECT_START ]; then
+      __gitcomp "$subcommands"
+    else
+      __gitcomp "replay start"
+    fi
+    return
+  fi
+
+  case "$subcommand" in
+  bad|good|reset|skip|start)
+    __gitcomp_nl "$(__git_refs)"
+    ;;
+  *)
+    ;;
+  esac
+}
+
+_git_branch ()
+{
+  local i c=1 only_local_ref="n" has_r="n"
+
+  while [ $c -lt $cword ]; do
+    i="${words[c]}"
+    case "$i" in
+    -d|-m) only_local_ref="y" ;;
+    -r) has_r="y" ;;
+    esac
+    ((c++))
+  done
+
+  case "$cur" in
+  --set-upstream-to=*)
+    __gitcomp "$(__git_refs)" "" "${cur##--set-upstream-to=}"
+    ;;
+  --*)
+    __gitcomp "
+      --color --no-color --verbose --abbrev= --no-abbrev
+      --track --no-track --contains --merged --no-merged
+      --set-upstream-to= --edit-description --list
+      --unset-upstream
+      "
+    ;;
+  *)
+    if [ $only_local_ref = "y" -a $has_r = "n" ]; then
+      __gitcomp_nl "$(__git_heads)"
+    else
+      __gitcomp_nl "$(__git_refs)"
+    fi
+    ;;
+  esac
+}
+
+_git_bundle ()
+{
+  local cmd="${words[2]}"
+  case "$cword" in
+  2)
+    __gitcomp "create list-heads verify unbundle"
+    ;;
+  3)
+    # looking for a file
+    ;;
+  *)
+    case "$cmd" in
+    create)
+      __git_complete_revlist
+      ;;
+    esac
+    ;;
+  esac
+}
+
+_git_checkout ()
+{
+  __git_has_doubledash && return
+
+  case "$cur" in
+  --conflict=*)
+    __gitcomp "diff3 merge" "" "${cur##--conflict=}"
+    ;;
+  --*)
+    __gitcomp "
+      --quiet --ours --theirs --track --no-track --merge
+      --conflict= --orphan --patch
+      "
+    ;;
+  *)
+    # check if --track, --no-track, or --no-guess was specified
+    # if so, disable DWIM mode
+    local flags="--track --no-track --no-guess" track=1
+    if [ -n "$(__git_find_on_cmdline "$flags")" ]; then
+      track=''
+    fi
+    __gitcomp_nl "$(__git_refs '' $track)"
+    ;;
+  esac
+}
+
+_git_cherry ()
+{
+  __gitcomp "$(__git_refs)"
+}
+
+_git_cherry_pick ()
+{
+  local dir="$(__gitdir)"
+  if [ -f "$dir"/CHERRY_PICK_HEAD ]; then
+    __gitcomp "--continue --quit --abort"
+    return
+  fi
+  case "$cur" in
+  --*)
+    __gitcomp "--edit --no-commit --signoff --strategy= --mainline"
+    ;;
+  *)
+    __gitcomp_nl "$(__git_refs)"
+    ;;
+  esac
+}
+
+_git_clean ()
+{
+  case "$cur" in
+  --*)
+    __gitcomp "--dry-run --quiet"
+    return
+    ;;
+  esac
+
+  # XXX should we check for -x option ?
+  __git_complete_index_file "--others"
+}
+
+_git_clone ()
+{
+  case "$cur" in
+  --*)
+    __gitcomp "
+      --local
+      --no-hardlinks
+      --shared
+      --reference
+      --quiet
+      --no-checkout
+      --bare
+      --mirror
+      --origin
+      --upload-pack
+      --template=
+      --depth
+      --single-branch
+      --branch
+      "
+    return
+    ;;
+  esac
+}
+
+_git_commit ()
+{
+  case "$prev" in
+  -c|-C)
+    __gitcomp_nl "$(__git_refs)" "" "${cur}"
+    return
+    ;;
+  esac
+
+  case "$cur" in
+  --cleanup=*)
+    __gitcomp "default strip verbatim whitespace
+      " "" "${cur##--cleanup=}"
+    return
+    ;;
+  --reuse-message=*|--reedit-message=*|\
+  --fixup=*|--squash=*)
+    __gitcomp_nl "$(__git_refs)" "" "${cur#*=}"
+    return
+    ;;
+  --untracked-files=*)
+    __gitcomp "all no normal" "" "${cur##--untracked-files=}"
+    return
+    ;;
+  --*)
+    __gitcomp "
+      --all --author= --signoff --verify --no-verify
+      --edit --no-edit
+      --amend --include --only --interactive
+      --dry-run --reuse-message= --reedit-message=
+      --reset-author --file= --message= --template=
+      --cleanup= --untracked-files --untracked-files=
+      --verbose --quiet --fixup= --squash=
+      "
+    return
+  esac
+
+  if git rev-parse --verify --quiet HEAD >/dev/null; then
+    __git_complete_index_file "--committable"
+  else
+    # This is the first commit
+    __git_complete_index_file "--cached"
+  fi
+}
+
+_git_describe ()
+{
+  case "$cur" in
+  --*)
+    __gitcomp "
+      --all --tags --contains --abbrev= --candidates=
+      --exact-match --debug --long --match --always
+      "
+    return
+  esac
+  __gitcomp_nl "$(__git_refs)"
+}
+
+__git_diff_algorithms="myers minimal patience histogram"
+
+__git_diff_common_options="--stat --numstat --shortstat --summary
+      --patch-with-stat --name-only --name-status --color
+      --no-color --color-words --no-renames --check
+      --full-index --binary --abbrev --diff-filter=
+      --find-copies-harder
+      --text --ignore-space-at-eol --ignore-space-change
+      --ignore-all-space --exit-code --quiet --ext-diff
+      --no-ext-diff
+      --no-prefix --src-prefix= --dst-prefix=
+      --inter-hunk-context=
+      --patience --histogram --minimal
+      --raw --word-diff
+      --dirstat --dirstat= --dirstat-by-file
+      --dirstat-by-file= --cumulative
+      --diff-algorithm=
+"
+
+_git_diff ()
+{
+  __git_has_doubledash && return
+
+  case "$cur" in
+  --diff-algorithm=*)
+    __gitcomp "$__git_diff_algorithms" "" "${cur##--diff-algorithm=}"
+    return
+    ;;
+  --*)
+    __gitcomp "--cached --staged --pickaxe-all --pickaxe-regex
+      --base --ours --theirs --no-index
+      $__git_diff_common_options
+      "
+    return
+    ;;
+  esac
+  __git_complete_revlist_file
+}
+
+__git_mergetools_common="diffuse ecmerge emerge kdiff3 meld opendiff
+      tkdiff vimdiff gvimdiff xxdiff araxis p4merge bc3 codecompare
+"
+
+_git_difftool ()
+{
+  __git_has_doubledash && return
+
+  case "$cur" in
+  --tool=*)
+    __gitcomp "$__git_mergetools_common kompare" "" "${cur##--tool=}"
+    return
+    ;;
+  --*)
+    __gitcomp "--cached --staged --pickaxe-all --pickaxe-regex
+      --base --ours --theirs
+      --no-renames --diff-filter= --find-copies-harder
+      --relative --ignore-submodules
+      --tool="
+    return
+    ;;
+  esac
+  __git_complete_revlist_file
+}
+
+__git_fetch_options="
+  --quiet --verbose --append --upload-pack --force --keep --depth=
+  --tags --no-tags --all --prune --dry-run
+"
+
+_git_fetch ()
+{
+  case "$cur" in
+  --*)
+    __gitcomp "$__git_fetch_options"
+    return
+    ;;
+  esac
+  __git_complete_remote_or_refspec
+}
+
+__git_format_patch_options="
+  --stdout --attach --no-attach --thread --thread= --no-thread
+  --numbered --start-number --numbered-files --keep-subject --signoff
+  --signature --no-signature --in-reply-to= --cc= --full-index --binary
+  --not --all --cover-letter --no-prefix --src-prefix= --dst-prefix=
+  --inline --suffix= --ignore-if-in-upstream --subject-prefix=
+  --output-directory --reroll-count --to= --quiet --notes
+"
+
+_git_format_patch ()
+{
+  case "$cur" in
+  --thread=*)
+    __gitcomp "
+      deep shallow
+      " "" "${cur##--thread=}"
+    return
+    ;;
+  --*)
+    __gitcomp "$__git_format_patch_options"
+    return
+    ;;
+  esac
+  __git_complete_revlist
+}
+
+_git_fsck ()
+{
+  case "$cur" in
+  --*)
+    __gitcomp "
+      --tags --root --unreachable --cache --no-reflogs --full
+      --strict --verbose --lost-found
+      "
+    return
+    ;;
+  esac
+}
+
+_git_gc ()
+{
+  case "$cur" in
+  --*)
+    __gitcomp "--prune --aggressive"
+    return
+    ;;
+  esac
+}
+
+_git_gitk ()
+{
+  _gitk
+}
+
+__git_match_ctag() {
+  awk "/^${1//\//\\\/}/ { print \$1 }" "$2"
+}
+
+_git_grep ()
+{
+  __git_has_doubledash && return
+
+  case "$cur" in
+  --*)
+    __gitcomp "
+      --cached
+      --text --ignore-case --word-regexp --invert-match
+      --full-name --line-number
+      --extended-regexp --basic-regexp --fixed-strings
+      --perl-regexp
+      --files-with-matches --name-only
+      --files-without-match
+      --max-depth
+      --count
+      --and --or --not --all-match
+      "
+    return
+    ;;
+  esac
+
+  case "$cword,$prev" in
+  2,*|*,-*)
+    if test -r tags; then
+      __gitcomp_nl "$(__git_match_ctag "$cur" tags)"
+      return
+    fi
+    ;;
+  esac
+
+  __gitcomp_nl "$(__git_refs)"
+}
+
+_git_help ()
+{
+  case "$cur" in
+  --*)
+    __gitcomp "--all --info --man --web"
+    return
+    ;;
+  esac
+  __git_compute_all_commands
+  __gitcomp "$__git_all_commands $(__git_aliases)
+    attributes cli core-tutorial cvs-migration
+    diffcore gitk glossary hooks ignore modules
+    namespaces repository-layout tutorial tutorial-2
+    workflows
+    "
+}
+
+_git_init ()
+{
+  case "$cur" in
+  --shared=*)
+    __gitcomp "
+      false true umask group all world everybody
+      " "" "${cur##--shared=}"
+    return
+    ;;
+  --*)
+    __gitcomp "--quiet --bare --template= --shared --shared="
+    return
+    ;;
+  esac
+}
+
+_git_ls_files ()
+{
+  case "$cur" in
+  --*)
+    __gitcomp "--cached --deleted --modified --others --ignored
+      --stage --directory --no-empty-directory --unmerged
+      --killed --exclude= --exclude-from=
+      --exclude-per-directory= --exclude-standard
+      --error-unmatch --with-tree= --full-name
+      --abbrev --ignored --exclude-per-directory
+      "
+    return
+    ;;
+  esac
+
+  # XXX ignore options like --modified and always suggest all cached
+  # files.
+  __git_complete_index_file "--cached"
+}
+
+_git_ls_remote ()
+{
+  __gitcomp_nl "$(__git_remotes)"
+}
+
+_git_ls_tree ()
+{
+  __git_complete_file
+}
+
+# Options that go well for log, shortlog and gitk
+__git_log_common_options="
+  --not --all
+  --branches --tags --remotes
+  --first-parent --merges --no-merges
+  --max-count=
+  --max-age= --since= --after=
+  --min-age= --until= --before=
+  --min-parents= --max-parents=
+  --no-min-parents --no-max-parents
+"
+# Options that go well for log and gitk (not shortlog)
+__git_log_gitk_options="
+  --dense --sparse --full-history
+  --simplify-merges --simplify-by-decoration
+  --left-right --notes --no-notes
+"
+# Options that go well for log and shortlog (not gitk)
+__git_log_shortlog_options="
+  --author= --committer= --grep=
+  --all-match
+"
+
+__git_log_pretty_formats="oneline short medium full fuller email raw format:"
+__git_log_date_formats="relative iso8601 rfc2822 short local default raw"
+
+_git_log ()
+{
+  __git_has_doubledash && return
+
+  local g="$(git rev-parse --git-dir 2>/dev/null)"
+  local merge=""
+  if [ -f "$g/MERGE_HEAD" ]; then
+    merge="--merge"
+  fi
+  case "$cur" in
+  --pretty=*|--format=*)
+    __gitcomp "$__git_log_pretty_formats $(__git_pretty_aliases)
+      " "" "${cur#*=}"
+    return
+    ;;
+  --date=*)
+    __gitcomp "$__git_log_date_formats" "" "${cur##--date=}"
+    return
+    ;;
+  --decorate=*)
+    __gitcomp "long short" "" "${cur##--decorate=}"
+    return
+    ;;
+  --*)
+    __gitcomp "
+      $__git_log_common_options
+      $__git_log_shortlog_options
+      $__git_log_gitk_options
+      --root --topo-order --date-order --reverse
+      --follow --full-diff
+      --abbrev-commit --abbrev=
+      --relative-date --date=
+      --pretty= --format= --oneline
+      --cherry-pick
+      --graph
+      --decorate --decorate=
+      --walk-reflogs
+      --parents --children
+      $merge
+      $__git_diff_common_options
+      --pickaxe-all --pickaxe-regex
+      "
+    return
+    ;;
+  esac
+  __git_complete_revlist
+}
+
+__git_merge_options="
+  --no-commit --no-stat --log --no-log --squash --strategy
+  --commit --stat --no-squash --ff --no-ff --ff-only --edit --no-edit
+"
+
+_git_merge ()
+{
+  __git_complete_strategy && return
+
+  case "$cur" in
+  --*)
+    __gitcomp "$__git_merge_options"
+    return
+  esac
+  __gitcomp_nl "$(__git_refs)"
+}
+
+_git_mergetool ()
+{
+  case "$cur" in
+  --tool=*)
+    __gitcomp "$__git_mergetools_common tortoisemerge" "" "${cur##--tool=}"
+    return
+    ;;
+  --*)
+    __gitcomp "--tool="
+    return
+    ;;
+  esac
+}
+
+_git_merge_base ()
+{
+  __gitcomp_nl "$(__git_refs)"
+}
+
+_git_mv ()
+{
+  case "$cur" in
+  --*)
+    __gitcomp "--dry-run"
+    return
+    ;;
+  esac
+
+  if [ $(__git_count_arguments "mv") -gt 0 ]; then
+    # We need to show both cached and untracked files (including
+    # empty directories) since this may not be the last argument.
+    __git_complete_index_file "--cached --others --directory"
+  else
+    __git_complete_index_file "--cached"
+  fi
+}
+
+_git_name_rev ()
+{
+  __gitcomp "--tags --all --stdin"
+}
+
+_git_notes ()
+{
+  local subcommands='add append copy edit list prune remove show'
+  local subcommand="$(__git_find_on_cmdline "$subcommands")"
+
+  case "$subcommand,$cur" in
+  ,--*)
+    __gitcomp '--ref'
+    ;;
+  ,*)
+    case "$prev" in
+    --ref)
+      __gitcomp_nl "$(__git_refs)"
+      ;;
+    *)
+      __gitcomp "$subcommands --ref"
+      ;;
+    esac
+    ;;
+  add,--reuse-message=*|append,--reuse-message=*|\
+  add,--reedit-message=*|append,--reedit-message=*)
+    __gitcomp_nl "$(__git_refs)" "" "${cur#*=}"
+    ;;
+  add,--*|append,--*)
+    __gitcomp '--file= --message= --reedit-message=
+      --reuse-message='
+    ;;
+  copy,--*)
+    __gitcomp '--stdin'
+    ;;
+  prune,--*)
+    __gitcomp '--dry-run --verbose'
+    ;;
+  prune,*)
+    ;;
+  *)
+    case "$prev" in
+    -m|-F)
+      ;;
+    *)
+      __gitcomp_nl "$(__git_refs)"
+      ;;
+    esac
+    ;;
+  esac
+}
+
+_git_pull ()
+{
+  __git_complete_strategy && return
+
+  case "$cur" in
+  --*)
+    __gitcomp "
+      --rebase --no-rebase
+      $__git_merge_options
+      $__git_fetch_options
+      "
+    return
+    ;;
+  esac
+  __git_complete_remote_or_refspec
+}
+
+_git_push ()
+{
+  case "$prev" in
+  --repo)
+    __gitcomp_nl "$(__git_remotes)"
+    return
+  esac
+  case "$cur" in
+  --repo=*)
+    __gitcomp_nl "$(__git_remotes)" "" "${cur##--repo=}"
+    return
+    ;;
+  --*)
+    __gitcomp "
+      --all --mirror --tags --dry-run --force --verbose
+      --receive-pack= --repo= --set-upstream
+      "
+    return
+    ;;
+  esac
+  __git_complete_remote_or_refspec
+}
+
+_git_rebase ()
+{
+  local dir="$(__gitdir)"
+  if [ -d "$dir"/rebase-apply ] || [ -d "$dir"/rebase-merge ]; then
+    __gitcomp "--continue --skip --abort"
+    return
+  fi
+  __git_complete_strategy && return
+  case "$cur" in
+  --whitespace=*)
+    __gitcomp "$__git_whitespacelist" "" "${cur##--whitespace=}"
+    return
+    ;;
+  --*)
+    __gitcomp "
+      --onto --merge --strategy --interactive
+      --preserve-merges --stat --no-stat
+      --committer-date-is-author-date --ignore-date
+      --ignore-whitespace --whitespace=
+      --autosquash
+      "
+
+    return
+  esac
+  __gitcomp_nl "$(__git_refs)"
+}
+
+_git_reflog ()
+{
+  local subcommands="show delete expire"
+  local subcommand="$(__git_find_on_cmdline "$subcommands")"
+
+  if [ -z "$subcommand" ]; then
+    __gitcomp "$subcommands"
+  else
+    __gitcomp_nl "$(__git_refs)"
+  fi
+}
+
+__git_send_email_confirm_options="always never auto cc compose"
+__git_send_email_suppresscc_options="author self cc bodycc sob cccmd body all"
+
+_git_send_email ()
+{
+  case "$cur" in
+  --confirm=*)
+    __gitcomp "
+      $__git_send_email_confirm_options
+      " "" "${cur##--confirm=}"
+    return
+    ;;
+  --suppress-cc=*)
+    __gitcomp "
+      $__git_send_email_suppresscc_options
+      " "" "${cur##--suppress-cc=}"
+
+    return
+    ;;
+  --smtp-encryption=*)
+    __gitcomp "ssl tls" "" "${cur##--smtp-encryption=}"
+    return
+    ;;
+  --thread=*)
+    __gitcomp "
+      deep shallow
+      " "" "${cur##--thread=}"
+    return
+    ;;
+  --*)
+    __gitcomp "--annotate --bcc --cc --cc-cmd --chain-reply-to
+      --compose --confirm= --dry-run --envelope-sender
+      --from --identity
+      --in-reply-to --no-chain-reply-to --no-signed-off-by-cc
+      --no-suppress-from --no-thread --quiet
+      --signed-off-by-cc --smtp-pass --smtp-server
+      --smtp-server-port --smtp-encryption= --smtp-user
+      --subject --suppress-cc= --suppress-from --thread --to
+      --validate --no-validate
+      $__git_format_patch_options"
+    return
+    ;;
+  esac
+  __git_complete_revlist
+}
+
+_git_stage ()
+{
+  _git_add
+}
+
+__git_config_get_set_variables ()
+{
+  local prevword word config_file= c=$cword
+  while [ $c -gt 1 ]; do
+    word="${words[c]}"
+    case "$word" in
+    --system|--global|--local|--file=*)
+      config_file="$word"
+      break
+      ;;
+    -f|--file)
+      config_file="$word $prevword"
+      break
+      ;;
+    esac
+    prevword=$word
+    c=$((--c))
+  done
+
+  git --git-dir="$(__gitdir)" config $config_file --list 2>/dev/null |
+  while read -r line
+  do
+    case "$line" in
+    *.*=*)
+      echo "${line/=*/}"
+      ;;
+    esac
+  done
+}
+
+_git_config ()
+{
+  case "$prev" in
+  branch.*.remote|branch.*.pushremote)
+    __gitcomp_nl "$(__git_remotes)"
+    return
+    ;;
+  branch.*.merge)
+    __gitcomp_nl "$(__git_refs)"
+    return
+    ;;
+  branch.*.rebase)
+    __gitcomp "false true"
+    return
+    ;;
+  remote.pushdefault)
+    __gitcomp_nl "$(__git_remotes)"
+    return
+    ;;
+  remote.*.fetch)
+    local remote="${prev#remote.}"
+    remote="${remote%.fetch}"
+    if [ -z "$cur" ]; then
+      __gitcomp_nl "refs/heads/" "" "" ""
+      return
+    fi
+    __gitcomp_nl "$(__git_refs_remotes "$remote")"
+    return
+    ;;
+  remote.*.push)
+    local remote="${prev#remote.}"
+    remote="${remote%.push}"
+    __gitcomp_nl "$(git --git-dir="$(__gitdir)" \
+      for-each-ref --format='%(refname):%(refname)' \
+      refs/heads)"
+    return
+    ;;
+  pull.twohead|pull.octopus)
+    __git_compute_merge_strategies
+    __gitcomp "$__git_merge_strategies"
+    return
+    ;;
+  color.branch|color.diff|color.interactive|\
+  color.showbranch|color.status|color.ui)
+    __gitcomp "always never auto"
+    return
+    ;;
+  color.pager)
+    __gitcomp "false true"
+    return
+    ;;
+  color.*.*)
+    __gitcomp "
+      normal black red green yellow blue magenta cyan white
+      bold dim ul blink reverse
+      "
+    return
+    ;;
+  diff.submodule)
+    __gitcomp "log short"
+    return
+    ;;
+  help.format)
+    __gitcomp "man info web html"
+    return
+    ;;
+  log.date)
+    __gitcomp "$__git_log_date_formats"
+    return
+    ;;
+  sendemail.aliasesfiletype)
+    __gitcomp "mutt mailrc pine elm gnus"
+    return
+    ;;
+  sendemail.confirm)
+    __gitcomp "$__git_send_email_confirm_options"
+    return
+    ;;
+  sendemail.suppresscc)
+    __gitcomp "$__git_send_email_suppresscc_options"
+    return
+    ;;
+  --get|--get-all|--unset|--unset-all)
+    __gitcomp_nl "$(__git_config_get_set_variables)"
+    return
+    ;;
+  *.*)
+    return
+    ;;
+  esac
+  case "$cur" in
+  --*)
+    __gitcomp "
+      --system --global --local --file=
+      --list --replace-all
+      --get --get-all --get-regexp
+      --add --unset --unset-all
+      --remove-section --rename-section
+      "
+    return
+    ;;
+  branch.*.*)
+    local pfx="${cur%.*}." cur_="${cur##*.}"
+    __gitcomp "remote pushremote merge mergeoptions rebase" "$pfx" "$cur_"
+    return
+    ;;
+  branch.*)
+    local pfx="${cur%.*}." cur_="${cur#*.}"
+    __gitcomp_nl "$(__git_heads)" "$pfx" "$cur_" "."
+    return
+    ;;
+  guitool.*.*)
+    local pfx="${cur%.*}." cur_="${cur##*.}"
+    __gitcomp "
+      argprompt cmd confirm needsfile noconsole norescan
+      prompt revprompt revunmerged title
+      " "$pfx" "$cur_"
+    return
+    ;;
+  difftool.*.*)
+    local pfx="${cur%.*}." cur_="${cur##*.}"
+    __gitcomp "cmd path" "$pfx" "$cur_"
+    return
+    ;;
+  man.*.*)
+    local pfx="${cur%.*}." cur_="${cur##*.}"
+    __gitcomp "cmd path" "$pfx" "$cur_"
+    return
+    ;;
+  mergetool.*.*)
+    local pfx="${cur%.*}." cur_="${cur##*.}"
+    __gitcomp "cmd path trustExitCode" "$pfx" "$cur_"
+    return
+    ;;
+  pager.*)
+    local pfx="${cur%.*}." cur_="${cur#*.}"
+    __git_compute_all_commands
+    __gitcomp_nl "$__git_all_commands" "$pfx" "$cur_"
+    return
+    ;;
+  remote.*.*)
+    local pfx="${cur%.*}." cur_="${cur##*.}"
+    __gitcomp "
+      url proxy fetch push mirror skipDefaultUpdate
+      receivepack uploadpack tagopt pushurl
+      " "$pfx" "$cur_"
+    return
+    ;;
+  remote.*)
+    local pfx="${cur%.*}." cur_="${cur#*.}"
+    __gitcomp_nl "$(__git_remotes)" "$pfx" "$cur_" "."
+    return
+    ;;
+  url.*.*)
+    local pfx="${cur%.*}." cur_="${cur##*.}"
+    __gitcomp "insteadOf pushInsteadOf" "$pfx" "$cur_"
+    return
+    ;;
+  esac
+  __gitcomp "
+    add.ignoreErrors
+    advice.commitBeforeMerge
+    advice.detachedHead
+    advice.implicitIdentity
+    advice.pushNonFastForward
+    advice.resolveConflict
+    advice.statusHints
+    alias.
+    am.keepcr
+    apply.ignorewhitespace
+    apply.whitespace
+    branch.autosetupmerge
+    branch.autosetuprebase
+    browser.
+    clean.requireForce
+    color.branch
+    color.branch.current
+    color.branch.local
+    color.branch.plain
+    color.branch.remote
+    color.decorate.HEAD
+    color.decorate.branch
+    color.decorate.remoteBranch
+    color.decorate.stash
+    color.decorate.tag
+    color.diff
+    color.diff.commit
+    color.diff.frag
+    color.diff.func
+    color.diff.meta
+    color.diff.new
+    color.diff.old
+    color.diff.plain
+    color.diff.whitespace
+    color.grep
+    color.grep.context
+    color.grep.filename
+    color.grep.function
+    color.grep.linenumber
+    color.grep.match
+    color.grep.selected
+    color.grep.separator
+    color.interactive
+    color.interactive.error
+    color.interactive.header
+    color.interactive.help
+    color.interactive.prompt
+    color.pager
+    color.showbranch
+    color.status
+    color.status.added
+    color.status.changed
+    color.status.header
+    color.status.nobranch
+    color.status.untracked
+    color.status.updated
+    color.ui
+    commit.status
+    commit.template
+    core.abbrev
+    core.askpass
+    core.attributesfile
+    core.autocrlf
+    core.bare
+    core.bigFileThreshold
+    core.compression
+    core.createObject
+    core.deltaBaseCacheLimit
+    core.editor
+    core.eol
+    core.excludesfile
+    core.fileMode
+    core.fsyncobjectfiles
+    core.gitProxy
+    core.ignoreStat
+    core.ignorecase
+    core.logAllRefUpdates
+    core.loosecompression
+    core.notesRef
+    core.packedGitLimit
+    core.packedGitWindowSize
+    core.pager
+    core.preferSymlinkRefs
+    core.preloadindex
+    core.quotepath
+    core.repositoryFormatVersion
+    core.safecrlf
+    core.sharedRepository
+    core.sparseCheckout
+    core.symlinks
+    core.trustctime
+    core.warnAmbiguousRefs
+    core.whitespace
+    core.worktree
+    diff.autorefreshindex
+    diff.external
+    diff.ignoreSubmodules
+    diff.mnemonicprefix
+    diff.noprefix
+    diff.renameLimit
+    diff.renames
+    diff.statGraphWidth
+    diff.submodule
+    diff.suppressBlankEmpty
+    diff.tool
+    diff.wordRegex
+    diff.algorithm
+    difftool.
+    difftool.prompt
+    fetch.recurseSubmodules
+    fetch.unpackLimit
+    format.attach
+    format.cc
+    format.headers
+    format.numbered
+    format.pretty
+    format.signature
+    format.signoff
+    format.subjectprefix
+    format.suffix
+    format.thread
+    format.to
+    gc.
+    gc.aggressiveWindow
+    gc.auto
+    gc.autopacklimit
+    gc.packrefs
+    gc.pruneexpire
+    gc.reflogexpire
+    gc.reflogexpireunreachable
+    gc.rerereresolved
+    gc.rerereunresolved
+    gitcvs.allbinary
+    gitcvs.commitmsgannotation
+    gitcvs.dbTableNamePrefix
+    gitcvs.dbdriver
+    gitcvs.dbname
+    gitcvs.dbpass
+    gitcvs.dbuser
+    gitcvs.enabled
+    gitcvs.logfile
+    gitcvs.usecrlfattr
+    guitool.
+    gui.blamehistoryctx
+    gui.commitmsgwidth
+    gui.copyblamethreshold
+    gui.diffcontext
+    gui.encoding
+    gui.fastcopyblame
+    gui.matchtrackingbranch
+    gui.newbranchtemplate
+    gui.pruneduringfetch
+    gui.spellingdictionary
+    gui.trustmtime
+    help.autocorrect
+    help.browser
+    help.format
+    http.lowSpeedLimit
+    http.lowSpeedTime
+    http.maxRequests
+    http.minSessions
+    http.noEPSV
+    http.postBuffer
+    http.proxy
+    http.sslCAInfo
+    http.sslCAPath
+    http.sslCert
+    http.sslCertPasswordProtected
+    http.sslKey
+    http.sslVerify
+    http.useragent
+    i18n.commitEncoding
+    i18n.logOutputEncoding
+    imap.authMethod
+    imap.folder
+    imap.host
+    imap.pass
+    imap.port
+    imap.preformattedHTML
+    imap.sslverify
+    imap.tunnel
+    imap.user
+    init.templatedir
+    instaweb.browser
+    instaweb.httpd
+    instaweb.local
+    instaweb.modulepath
+    instaweb.port
+    interactive.singlekey
+    log.date
+    log.decorate
+    log.showroot
+    mailmap.file
+    man.
+    man.viewer
+    merge.
+    merge.conflictstyle
+    merge.log
+    merge.renameLimit
+    merge.renormalize
+    merge.stat
+    merge.tool
+    merge.verbosity
+    mergetool.
+    mergetool.keepBackup
+    mergetool.keepTemporaries
+    mergetool.prompt
+    notes.displayRef
+    notes.rewrite.
+    notes.rewrite.amend
+    notes.rewrite.rebase
+    notes.rewriteMode
+    notes.rewriteRef
+    pack.compression
+    pack.deltaCacheLimit
+    pack.deltaCacheSize
+    pack.depth
+    pack.indexVersion
+    pack.packSizeLimit
+    pack.threads
+    pack.window
+    pack.windowMemory
+    pager.
+    pretty.
+    pull.octopus
+    pull.twohead
+    push.default
+    rebase.autosquash
+    rebase.stat
+    receive.autogc
+    receive.denyCurrentBranch
+    receive.denyDeleteCurrent
+    receive.denyDeletes
+    receive.denyNonFastForwards
+    receive.fsckObjects
+    receive.unpackLimit
+    receive.updateserverinfo
+    remote.pushdefault
+    remotes.
+    repack.usedeltabaseoffset
+    rerere.autoupdate
+    rerere.enabled
+    sendemail.
+    sendemail.aliasesfile
+    sendemail.aliasfiletype
+    sendemail.bcc
+    sendemail.cc
+    sendemail.cccmd
+    sendemail.chainreplyto
+    sendemail.confirm
+    sendemail.envelopesender
+    sendemail.from
+    sendemail.identity
+    sendemail.multiedit
+    sendemail.signedoffbycc
+    sendemail.smtpdomain
+    sendemail.smtpencryption
+    sendemail.smtppass
+    sendemail.smtpserver
+    sendemail.smtpserveroption
+    sendemail.smtpserverport
+    sendemail.smtpuser
+    sendemail.suppresscc
+    sendemail.suppressfrom
+    sendemail.thread
+    sendemail.to
+    sendemail.validate
+    showbranch.default
+    status.relativePaths
+    status.showUntrackedFiles
+    status.submodulesummary
+    submodule.
+    tar.umask
+    transfer.unpackLimit
+    url.
+    user.email
+    user.name
+    user.signingkey
+    web.browser
+    branch. remote.
+  "
+}
+
+_git_remote ()
+{
+  local subcommands="add rename remove set-head set-branches set-url show prune update"
+  local subcommand="$(__git_find_on_cmdline "$subcommands")"
+  if [ -z "$subcommand" ]; then
+    __gitcomp "$subcommands"
+    return
+  fi
+
+  case "$subcommand" in
+  rename|remove|set-url|show|prune)
+    __gitcomp_nl "$(__git_remotes)"
+    ;;
+  set-head|set-branches)
+    __git_complete_remote_or_refspec
+    ;;
+  update)
+    local i c='' IFS=$'\n'
+    for i in $(git --git-dir="$(__gitdir)" config --get-regexp "remotes\..*" 2>/dev/null); do
+      i="${i#remotes.}"
+      c="$c ${i/ */}"
+    done
+    __gitcomp "$c"
+    ;;
+  *)
+    ;;
+  esac
+}
+
+_git_replace ()
+{
+  __gitcomp_nl "$(__git_refs)"
+}
+
+_git_reset ()
+{
+  __git_has_doubledash && return
+
+  case "$cur" in
+  --*)
+    __gitcomp "--merge --mixed --hard --soft --patch"
+    return
+    ;;
+  esac
+  __gitcomp_nl "$(__git_refs)"
+}
+
+_git_revert ()
+{
+  case "$cur" in
+  --*)
+    __gitcomp "--edit --mainline --no-edit --no-commit --signoff"
+    return
+    ;;
+  esac
+  __gitcomp_nl "$(__git_refs)"
+}
+
+_git_rm ()
+{
+  case "$cur" in
+  --*)
+    __gitcomp "--cached --dry-run --ignore-unmatch --quiet"
+    return
+    ;;
+  esac
+
+  __git_complete_index_file "--cached"
+}
+
+_git_shortlog ()
+{
+  __git_has_doubledash && return
+
+  case "$cur" in
+  --*)
+    __gitcomp "
+      $__git_log_common_options
+      $__git_log_shortlog_options
+      --numbered --summary
+      "
+    return
+    ;;
+  esac
+  __git_complete_revlist
+}
+
+_git_show ()
+{
+  __git_has_doubledash && return
+
+  case "$cur" in
+  --pretty=*|--format=*)
+    __gitcomp "$__git_log_pretty_formats $(__git_pretty_aliases)
+      " "" "${cur#*=}"
+    return
+    ;;
+  --diff-algorithm=*)
+    __gitcomp "$__git_diff_algorithms" "" "${cur##--diff-algorithm=}"
+    return
+    ;;
+  --*)
+    __gitcomp "--pretty= --format= --abbrev-commit --oneline
+      $__git_diff_common_options
+      "
+    return
+    ;;
+  esac
+  __git_complete_revlist_file
+}
+
+_git_show_branch ()
+{
+  case "$cur" in
+  --*)
+    __gitcomp "
+      --all --remotes --topo-order --current --more=
+      --list --independent --merge-base --no-name
+      --color --no-color
+      --sha1-name --sparse --topics --reflog
+      "
+    return
+    ;;
+  esac
+  __git_complete_revlist
+}
+
+_git_stash ()
+{
+  local save_opts='--keep-index --no-keep-index --quiet --patch'
+  local subcommands='save list show apply clear drop pop create branch'
+  local subcommand="$(__git_find_on_cmdline "$subcommands")"
+  if [ -z "$subcommand" ]; then
+    case "$cur" in
+    --*)
+      __gitcomp "$save_opts"
+      ;;
+    *)
+      if [ -z "$(__git_find_on_cmdline "$save_opts")" ]; then
+        __gitcomp "$subcommands"
+      fi
+      ;;
+    esac
+  else
+    case "$subcommand,$cur" in
+    save,--*)
+      __gitcomp "$save_opts"
+      ;;
+    apply,--*|pop,--*)
+      __gitcomp "--index --quiet"
+      ;;
+    show,--*|drop,--*|branch,--*)
+      ;;
+    show,*|apply,*|drop,*|pop,*|branch,*)
+      __gitcomp_nl "$(git --git-dir="$(__gitdir)" stash list \
+        | sed -n -e 's/:.*//p')"
+      ;;
+    *)
+      ;;
+    esac
+  fi
+}
+
+_git_submodule ()
+{
+  __git_has_doubledash && return
+
+  local subcommands="add status init deinit update summary foreach sync"
+  if [ -z "$(__git_find_on_cmdline "$subcommands")" ]; then
+    case "$cur" in
+    --*)
+      __gitcomp "--quiet --cached"
+      ;;
+    *)
+      __gitcomp "$subcommands"
+      ;;
+    esac
+    return
+  fi
+}
+
+_git_svn ()
+{
+  local subcommands="
+    init fetch clone rebase dcommit log find-rev
+    set-tree commit-diff info create-ignore propget
+    proplist show-ignore show-externals branch tag blame
+    migrate mkdirs reset gc
+    "
+  local subcommand="$(__git_find_on_cmdline "$subcommands")"
+  if [ -z "$subcommand" ]; then
+    __gitcomp "$subcommands"
+  else
+    local remote_opts="--username= --config-dir= --no-auth-cache"
+    local fc_opts="
+      --follow-parent --authors-file= --repack=
+      --no-metadata --use-svm-props --use-svnsync-props
+      --log-window-size= --no-checkout --quiet
+      --repack-flags --use-log-author --localtime
+      --ignore-paths= --include-paths= $remote_opts
+      "
+    local init_opts="
+      --template= --shared= --trunk= --tags=
+      --branches= --stdlayout --minimize-url
+      --no-metadata --use-svm-props --use-svnsync-props
+      --rewrite-root= --prefix= --use-log-author
+      --add-author-from $remote_opts
+      "
+    local cmt_opts="
+      --edit --rmdir --find-copies-harder --copy-similarity=
+      "
+
+    case "$subcommand,$cur" in
+    fetch,--*)
+      __gitcomp "--revision= --fetch-all $fc_opts"
+      ;;
+    clone,--*)
+      __gitcomp "--revision= $fc_opts $init_opts"
+      ;;
+    init,--*)
+      __gitcomp "$init_opts"
+      ;;
+    dcommit,--*)
+      __gitcomp "
+        --merge --strategy= --verbose --dry-run
+        --fetch-all --no-rebase --commit-url
+        --revision --interactive $cmt_opts $fc_opts
+        "
+      ;;
+    set-tree,--*)
+      __gitcomp "--stdin $cmt_opts $fc_opts"
+      ;;
+    create-ignore,--*|propget,--*|proplist,--*|show-ignore,--*|\
+    show-externals,--*|mkdirs,--*)
+      __gitcomp "--revision="
+      ;;
+    log,--*)
+      __gitcomp "
+        --limit= --revision= --verbose --incremental
+        --oneline --show-commit --non-recursive
+        --authors-file= --color
+        "
+      ;;
+    rebase,--*)
+      __gitcomp "
+        --merge --verbose --strategy= --local
+        --fetch-all --dry-run $fc_opts
+        "
+      ;;
+    commit-diff,--*)
+      __gitcomp "--message= --file= --revision= $cmt_opts"
+      ;;
+    info,--*)
+      __gitcomp "--url"
+      ;;
+    branch,--*)
+      __gitcomp "--dry-run --message --tag"
+      ;;
+    tag,--*)
+      __gitcomp "--dry-run --message"
+      ;;
+    blame,--*)
+      __gitcomp "--git-format"
+      ;;
+    migrate,--*)
+      __gitcomp "
+        --config-dir= --ignore-paths= --minimize
+        --no-auth-cache --username=
+        "
+      ;;
+    reset,--*)
+      __gitcomp "--revision= --parent"
+      ;;
+    *)
+      ;;
+    esac
+  fi
+}
+
+_git_tag ()
+{
+  local i c=1 f=0
+  while [ $c -lt $cword ]; do
+    i="${words[c]}"
+    case "$i" in
+    -d|-v)
+      __gitcomp_nl "$(__git_tags)"
+      return
+      ;;
+    -f)
+      f=1
+      ;;
+    esac
+    ((c++))
+  done
+
+  case "$prev" in
+  -m|-F)
+    ;;
+  -*|tag)
+    if [ $f = 1 ]; then
+      __gitcomp_nl "$(__git_tags)"
+    fi
+    ;;
+  *)
+    __gitcomp_nl "$(__git_refs)"
+    ;;
+  esac
+}
+
+_git_whatchanged ()
+{
+  _git_log
+}
+
+__git_main ()
+{
+  local i c=1 command __git_dir
+
+  while [ $c -lt $cword ]; do
+    i="${words[c]}"
+    case "$i" in
+    --git-dir=*) __git_dir="${i#--git-dir=}" ;;
+    --git-dir) ((c++)) ; __git_dir="${words[c]}" ;;
+    --bare) __git_dir="." ;;
+    --help) command="help"; break ;;
+    -c|--work-tree|--namespace) ((c++)) ;;
+    -*) ;;
+    *) command="$i"; break ;;
+    esac
+    ((c++))
+  done
+
+  if [ -z "$command" ]; then
+    case "$cur" in
+    --*) __gitcomp "
+      --paginate
+      --no-pager
+      --git-dir=
+      --bare
+      --version
+      --exec-path
+      --exec-path=
+      --html-path
+      --man-path
+      --info-path
+      --work-tree=
+      --namespace=
+      --no-replace-objects
+      --help
+      "
+      ;;
+    *) __git_compute_porcelain_commands
+      __gitcomp "$__git_porcelain_commands $(__git_aliases)" ;;
+    esac
+    return
+  fi
+
+  local completion_func="_git_${command//-/_}"
+  declare -f $completion_func >/dev/null && $completion_func && return
+
+  local expansion=$(__git_aliased_command "$command")
+  if [ -n "$expansion" ]; then
+    completion_func="_git_${expansion//-/_}"
+    declare -f $completion_func >/dev/null && $completion_func
+  fi
+}
+
+__gitk_main ()
+{
+  __git_has_doubledash && return
+
+  local g="$(__gitdir)"
+  local merge=""
+  if [ -f "$g/MERGE_HEAD" ]; then
+    merge="--merge"
+  fi
+  case "$cur" in
+  --*)
+    __gitcomp "
+      $__git_log_common_options
+      $__git_log_gitk_options
+      $merge
+      "
+    return
+    ;;
+  esac
+  __git_complete_revlist
+}
+
+if [[ -n ${ZSH_VERSION-} ]]; then
+  echo "WARNING: this script is deprecated, please see git-completion.zsh" 1>&2
+
+  autoload -U +X compinit && compinit
+
+  __gitcomp ()
+  {
+    emulate -L zsh
+
+    local cur_="${3-$cur}"
+
+    case "$cur_" in
+    --*=)
+      ;;
+    *)
+      local c IFS=$' \t\n'
+      local -a array
+      for c in ${=1}; do
+        c="$c${4-}"
+        case $c in
+        --*=*|*.) ;;
+        *) c="$c " ;;
+        esac
+        array[$#array+1]="$c"
+      done
+      compset -P '*[=:]'
+      compadd -Q -S '' -p "${2-}" -a -- array && _ret=0
+      ;;
+    esac
+  }
+
+  __gitcomp_nl ()
+  {
+    emulate -L zsh
+
+    local IFS=$'\n'
+    compset -P '*[=:]'
+    compadd -Q -S "${4- }" -p "${2-}" -- ${=1} && _ret=0
+  }
+
+  __gitcomp_file ()
+  {
+    emulate -L zsh
+
+    local IFS=$'\n'
+    compset -P '*[=:]'
+    compadd -Q -p "${2-}" -f -- ${=1} && _ret=0
+  }
+
+  _git ()
+  {
+    local _ret=1 cur cword prev
+    cur=${words[CURRENT]}
+    prev=${words[CURRENT-1]}
+    let cword=CURRENT-1
+    emulate ksh -c __${service}_main
+    let _ret && _default && _ret=0
+    return _ret
+  }
+
+  compdef _git git gitk
+  return
+fi
+
+__git_func_wrap ()
+{
+  local cur words cword prev
+  _get_comp_words_by_ref -n =: cur words cword prev
+  $1
+}
+
+# Setup completion for certain functions defined above by setting common
+# variables and workarounds.
+# This is NOT a public function; use at your own risk.
+__git_complete ()
+{
+  local wrapper="__git_wrap${2}"
+  eval "$wrapper () { __git_func_wrap $2 ; }"
+  complete -o bashdefault -o default -o nospace -F $wrapper $1 2>/dev/null \
+    || complete -o default -o nospace -F $wrapper $1
+}
+
+# wrapper for backwards compatibility
+_git ()
+{
+  __git_wrap__git_main
+}
+
+# wrapper for backwards compatibility
+_gitk ()
+{
+  __git_wrap__gitk_main
+}
+
+__git_complete git __git_main
+__git_complete gitk __gitk_main
+
+# The following are necessary only for Cygwin, and only are needed
+# when the user has tab-completed the executable name and consequently
+# included the '.exe' suffix.
+#
+if [ Cygwin = "$(uname -o 2>/dev/null)" ]; then
+__git_complete git.exe __git_main
+fi
diff --git a/paddle/scripts/docker/root/.scripts/git-prompt.sh b/paddle/scripts/docker/root/.scripts/git-prompt.sh
new file mode 100755
index 0000000000000000000000000000000000000000..576f4ec14c94a24ebffa9e2620acf881e6b5ddaa
--- /dev/null
+++ b/paddle/scripts/docker/root/.scripts/git-prompt.sh
@@ -0,0 +1,445 @@
+# bash/zsh git prompt support
+#
+# Copyright (C) 2006,2007 Shawn O. Pearce <spearce@spearce.org>
+# Distributed under the GNU General Public License, version 2.0.
+#
+# This script allows you to see repository status in your prompt.
+#
+# To enable:
+#
+#    1) Copy this file to somewhere (e.g. ~/.git-prompt.sh).
+#    2) Add the following line to your .bashrc/.zshrc:
+#        source ~/.git-prompt.sh
+#    3a) Change your PS1 to call __git_ps1 as
+#        command-substitution:
+#        Bash: PS1='[\u@\h \W$(__git_ps1 " (%s)")]\$ '
+#        ZSH:  setopt PROMPT_SUBST ; PS1='[%n@%m %c$(__git_ps1 " (%s)")]\$ '
+#        the optional argument will be used as format string.
+#    3b) Alternatively, for a slightly faster prompt, __git_ps1 can
+#        be used for PROMPT_COMMAND in Bash or for precmd() in Zsh
+#        with two parameters, <pre> and <post>, which are strings
+#        you would put in $PS1 before and after the status string
+#        generated by the git-prompt machinery.  e.g.
+#        Bash: PROMPT_COMMAND='__git_ps1 "\u@\h:\w" "\\\$ "'
+#          will show username, at-sign, host, colon, cwd, then
+#          various status string, followed by dollar and SP, as
+#          your prompt.
+#        ZSH:  precmd () { __git_ps1 "%n" ":%~$ " "|%s" }
+#          will show username, pipe, then various status string,
+#          followed by colon, cwd, dollar and SP, as your prompt.
+#        Optionally, you can supply a third argument with a printf
+#        format string to finetune the output of the branch status
+#
+# The repository status will be displayed only if you are currently in a
+# git repository. The %s token is the placeholder for the shown status.
+#
+# The prompt status always includes the current branch name.
+#
+# In addition, if you set GIT_PS1_SHOWDIRTYSTATE to a nonempty value,
+# unstaged (*) and staged (+) changes will be shown next to the branch
+# name.  You can configure this per-repository with the
+# bash.showDirtyState variable, which defaults to true once
+# GIT_PS1_SHOWDIRTYSTATE is enabled.
+#
+# You can also see if currently something is stashed, by setting
+# GIT_PS1_SHOWSTASHSTATE to a nonempty value. If something is stashed,
+# then a '$' will be shown next to the branch name.
+#
+# If you would like to see if there're untracked files, then you can set
+# GIT_PS1_SHOWUNTRACKEDFILES to a nonempty value. If there're untracked
+# files, then a '%' will be shown next to the branch name.  You can
+# configure this per-repository with the bash.showUntrackedFiles
+# variable, which defaults to true once GIT_PS1_SHOWUNTRACKEDFILES is
+# enabled.
+#
+# If you would like to see the difference between HEAD and its upstream,
+# set GIT_PS1_SHOWUPSTREAM="auto".  A "<" indicates you are behind, ">"
+# indicates you are ahead, "<>" indicates you have diverged and "="
+# indicates that there is no difference. You can further control
+# behaviour by setting GIT_PS1_SHOWUPSTREAM to a space-separated list
+# of values:
+#
+#     verbose       show number of commits ahead/behind (+/-) upstream
+#     legacy        don't use the '--count' option available in recent
+#                   versions of git-rev-list
+#     git           always compare HEAD to @{upstream}
+#     svn           always compare HEAD to your SVN upstream
+#
+# By default, __git_ps1 will compare HEAD to your SVN upstream if it can
+# find one, or @{upstream} otherwise.  Once you have set
+# GIT_PS1_SHOWUPSTREAM, you can override it on a per-repository basis by
+# setting the bash.showUpstream config variable.
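+#
+# For example (an illustrative addition, values assumed): setting
+# GIT_PS1_SHOWUPSTREAM="auto verbose" renders the divergence as counts
+# such as "u+2" when you are two commits ahead, whereas plain "auto"
+# shows only ">".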
+#
+# If you would like to see more information about the identity of
+# commits checked out as a detached HEAD, set GIT_PS1_DESCRIBE_STYLE
+# to one of these values:
+#
+#     contains      relative to newer annotated tag (v1.6.3.2~35)
+#     branch        relative to newer tag or branch (master~4)
+#     describe      relative to older annotated tag (v1.6.3.1-13-gdd42c2f)
+#     default       exactly matching tag
+#
+# If you would like a colored hint about the current dirty state, set
+# GIT_PS1_SHOWCOLORHINTS to a nonempty value. The colors are based on
+# the colored output of "git status -sb" and are available only when
+# using __git_ps1 for PROMPT_COMMAND or precmd.
+
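+# As a worked example drawn from the options above (a minimal sketch; the
+# location ~/.git-prompt.sh is an assumption, adjust to wherever this file
+# is installed), a .bashrc enabling the PROMPT_COMMAND form with dirty,
+# stash and upstream indicators could read:
+#
+#   source ~/.git-prompt.sh
+#   GIT_PS1_SHOWDIRTYSTATE=1
+#   GIT_PS1_SHOWSTASHSTATE=1
+#   GIT_PS1_SHOWUPSTREAM="auto"
+#   GIT_PS1_SHOWCOLORHINTS=1
+#   PROMPT_COMMAND='__git_ps1 "\u@\h:\w" "\\\$ "'
+#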
+# stores the divergence from upstream in $p
+# used by GIT_PS1_SHOWUPSTREAM
+__git_ps1_show_upstream ()
+{
+  local key value
+  local svn_remote svn_url_pattern count n
+  local upstream=git legacy="" verbose=""
+
+  svn_remote=()
+  # get some config options from git-config
+  local output="$(git config -z --get-regexp '^(svn-remote\..*\.url|bash\.showupstream)$' 2>/dev/null | tr '\0\n' '\n ')"
+  while read -r key value; do
+    case "$key" in
+    bash.showupstream)
+      GIT_PS1_SHOWUPSTREAM="$value"
+      if [[ -z "${GIT_PS1_SHOWUPSTREAM}" ]]; then
+        p=""
+        return
+      fi
+      ;;
+    svn-remote.*.url)
+      svn_remote[$((${#svn_remote[@]} + 1))]="$value"
+      svn_url_pattern+="\\|$value"
+      upstream=svn+git # default upstream is SVN if available, else git
+      ;;
+    esac
+  done <<< "$output"
+
+  # parse configuration values
+  for option in ${GIT_PS1_SHOWUPSTREAM}; do
+    case "$option" in
+    git|svn) upstream="$option" ;;
+    verbose) verbose=1 ;;
+    legacy)  legacy=1  ;;
+    esac
+  done
+
+  # Find our upstream
+  case "$upstream" in
+  git)    upstream="@{upstream}" ;;
+  svn*)
+    # get the upstream from the "git-svn-id: ..." in a commit message
+    # (git-svn uses essentially the same procedure internally)
+    local -a svn_upstream
+    svn_upstream=($(git log --first-parent -1 \
+          --grep="^git-svn-id: \(${svn_url_pattern#??}\)" 2>/dev/null))
+    if [[ 0 -ne ${#svn_upstream[@]} ]]; then
+      svn_upstream=${svn_upstream[${#svn_upstream[@]} - 2]}
+      svn_upstream=${svn_upstream%@*}
+      local n_stop="${#svn_remote[@]}"
+      for ((n=1; n <= n_stop; n++)); do
+        svn_upstream=${svn_upstream#${svn_remote[$n]}}
+      done
+
+      if [[ -z "$svn_upstream" ]]; then
+        # default branch name for checkouts with no layout:
+        upstream=${GIT_SVN_ID:-git-svn}
+      else
+        upstream=${svn_upstream#/}
+      fi
+    elif [[ "svn+git" = "$upstream" ]]; then
+      upstream="@{upstream}"
+    fi
+    ;;
+  esac
+
+  # Find how many commits we are ahead/behind our upstream
+  if [[ -z "$legacy" ]]; then
+    count="$(git rev-list --count --left-right \
+        "$upstream"...HEAD 2>/dev/null)"
+  else
+    # produce equivalent output to --count for older versions of git
+    local commits
+    if commits="$(git rev-list --left-right "$upstream"...HEAD 2>/dev/null)"
+    then
+      local commit behind=0 ahead=0
+      for commit in $commits
+      do
+        case "$commit" in
+        "<"*) ((behind++)) ;;
+        *)    ((ahead++))  ;;
+        esac
+      done
+      count="$behind  $ahead"
+    else
+      count=""
+    fi
+  fi
+
+  # calculate the result
+  if [[ -z "$verbose" ]]; then
+    case "$count" in
+    "") # no upstream
+      p="" ;;
+    "0  0") # equal to upstream
+      p="=" ;;
+    "0  "*) # ahead of upstream
+      p=">" ;;
+    *"  0") # behind upstream
+      p="<" ;;
+    *)      # diverged from upstream
+      p="<>" ;;
+    esac
+  else
+    case "$count" in
+    "") # no upstream
+      p="" ;;
+    "0  0") # equal to upstream
+      p=" u=" ;;
+    "0  "*) # ahead of upstream
+      p=" u+${count#0 }" ;;
+    *"  0") # behind upstream
+      p=" u-${count%  0}" ;;
+    *)      # diverged from upstream
+      p=" u+${count#* }-${count%  *}" ;;
+    esac
+  fi
+
+}
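+# (Worked example of the result, assuming a branch two commits ahead and
+# one behind its upstream: with "verbose" set, the function leaves
+# p=" u+2-1"; without it, p="<>".)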
+
+# Helper function that is meant to be called from __git_ps1.  It
+# injects color codes into the appropriate gitstring variables used
+# to build a gitstring.
+__git_ps1_colorize_gitstring ()
+{
+  if [[ -n ${ZSH_VERSION-} ]]; then
+    local c_red='%F{red}'
+    local c_green='%F{green}'
+    local c_lblue='%F{blue}'
+    local c_clear='%f'
+  else
+    # Using \[ and \] around colors is necessary to prevent
+    # issues with command line editing/browsing/completion!
+    local c_red='\[\e[31m\]'
+    local c_green='\[\e[32m\]'
+    local c_lblue='\[\e[1;34m\]'
+    local c_clear='\[\e[0m\]'
+  fi
+  local bad_color=$c_red
+  local ok_color=$c_green
+  local flags_color="$c_lblue"
+
+  local branch_color=""
+  if [ $detached = no ]; then
+    branch_color="$ok_color"
+  else
+    branch_color="$bad_color"
+  fi
+  c="$branch_color$c"
+
+  z="$c_clear$z"
+  if [ "$w" = "*" ]; then
+    w="$bad_color$w"
+  fi
+  if [ -n "$i" ]; then
+    i="$ok_color$i"
+  fi
+  if [ -n "$s" ]; then
+    s="$flags_color$s"
+  fi
+  if [ -n "$u" ]; then
+    u="$bad_color$u"
+  fi
+  r="$c_clear$r"
+}
+
+# __git_ps1 accepts 0 or 1 arguments (i.e., format string)
+# when called from PS1 using command substitution
+# in this mode it prints text to add to bash PS1 prompt (includes branch name)
+#
+# __git_ps1 requires 2 or 3 arguments when called from PROMPT_COMMAND (pc)
+# in that case it _sets_ PS1. The arguments are parts of a PS1 string.
+# when two arguments are given, the first is prepended and the second appended
+# to the state string when assigned to PS1.
+# The optional third parameter will be used as printf format string to further
+# customize the output of the git-status string.
+# In this mode you can request colored hints using GIT_PS1_SHOWCOLORHINTS=true
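+#
+# For instance (sketch): PS1='\w$(__git_ps1 " (%s)")\$ ' uses the
+# command-substitution form, while PROMPT_COMMAND='__git_ps1 "\w" "\\\$ "'
+# is the two-argument form that sets PS1 itself.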
+__git_ps1 ()
+{
+  local pcmode=no
+  local detached=no
+  local ps1pc_start='\u@\h:\w '
+  local ps1pc_end='\$ '
+  local printf_format=' (%s)'
+
+  case "$#" in
+    2|3)  pcmode=yes
+      ps1pc_start="$1"
+      ps1pc_end="$2"
+      printf_format="${3:-$printf_format}"
+    ;;
+    0|1)  printf_format="${1:-$printf_format}"
+    ;;
+    *)  return
+    ;;
+  esac
+
+  local repo_info rev_parse_exit_code
+  repo_info="$(git rev-parse --git-dir --is-inside-git-dir \
+    --is-bare-repository --is-inside-work-tree \
+    --short HEAD 2>/dev/null)"
+  rev_parse_exit_code="$?"
+
+  if [ -z "$repo_info" ]; then
+    if [ $pcmode = yes ]; then
+      #In PC mode PS1 always needs to be set
+      PS1="$ps1pc_start$ps1pc_end"
+    fi
+    return
+  fi
+
+  local short_sha
+  if [ "$rev_parse_exit_code" = "0" ]; then
+    short_sha="${repo_info##*$'\n'}"
+    repo_info="${repo_info%$'\n'*}"
+  fi
+  local inside_worktree="${repo_info##*$'\n'}"
+  repo_info="${repo_info%$'\n'*}"
+  local bare_repo="${repo_info##*$'\n'}"
+  repo_info="${repo_info%$'\n'*}"
+  local inside_gitdir="${repo_info##*$'\n'}"
+  local g="${repo_info%$'\n'*}"
+
+  local r=""
+  local b=""
+  local step=""
+  local total=""
+  if [ -d "$g/rebase-merge" ]; then
+    read b 2>/dev/null <"$g/rebase-merge/head-name"
+    read step 2>/dev/null <"$g/rebase-merge/msgnum"
+    read total 2>/dev/null <"$g/rebase-merge/end"
+    if [ -f "$g/rebase-merge/interactive" ]; then
+      r="|REBASE-i"
+    else
+      r="|REBASE-m"
+    fi
+  else
+    if [ -d "$g/rebase-apply" ]; then
+      read step 2>/dev/null <"$g/rebase-apply/next"
+      read total 2>/dev/null <"$g/rebase-apply/last"
+      if [ -f "$g/rebase-apply/rebasing" ]; then
+        read b 2>/dev/null <"$g/rebase-apply/head-name"
+        r="|REBASE"
+      elif [ -f "$g/rebase-apply/applying" ]; then
+        r="|AM"
+      else
+        r="|AM/REBASE"
+      fi
+    elif [ -f "$g/MERGE_HEAD" ]; then
+      r="|MERGING"
+    elif [ -f "$g/CHERRY_PICK_HEAD" ]; then
+      r="|CHERRY-PICKING"
+    elif [ -f "$g/REVERT_HEAD" ]; then
+      r="|REVERTING"
+    elif [ -f "$g/BISECT_LOG" ]; then
+      r="|BISECTING"
+    fi
+
+    if [ -n "$b" ]; then
+      :
+    elif [ -h "$g/HEAD" ]; then
+      # symlink symbolic ref
+      b="$(git symbolic-ref HEAD 2>/dev/null)"
+    else
+      local head=""
+      if ! read head 2>/dev/null <"$g/HEAD"; then
+        if [ $pcmode = yes ]; then
+          PS1="$ps1pc_start$ps1pc_end"
+        fi
+        return
+      fi
+      # is it a symbolic ref?
+      b="${head#ref: }"
+      if [ "$head" = "$b" ]; then
+        detached=yes
+        b="$(
+        case "${GIT_PS1_DESCRIBE_STYLE-}" in
+        (contains)
+          git describe --contains HEAD ;;
+        (branch)
+          git describe --contains --all HEAD ;;
+        (describe)
+          git describe HEAD ;;
+        (* | default)
+          git describe --tags --exact-match HEAD ;;
+        esac 2>/dev/null)" ||
+
+        b="$short_sha..."
+        b="($b)"
+      fi
+    fi
+  fi
+
+  if [ -n "$step" ] && [ -n "$total" ]; then
+    r="$r $step/$total"
+  fi
+
+  local w=""
+  local i=""
+  local s=""
+  local u=""
+  local c=""
+  local p=""
+
+  if [ "true" = "$inside_gitdir" ]; then
+    if [ "true" = "$bare_repo" ]; then
+      c="BARE:"
+    else
+      b="GIT_DIR!"
+    fi
+  elif [ "true" = "$inside_worktree" ]; then
+    if [ -n "${GIT_PS1_SHOWDIRTYSTATE-}" ] &&
+       [ "$(git config --bool bash.showDirtyState)" != "false" ]
+    then
+      git diff --no-ext-diff --quiet --exit-code || w="*"
+      if [ -n "$short_sha" ]; then
+        git diff-index --cached --quiet HEAD -- || i="+"
+      else
+        i="#"
+      fi
+    fi
+    if [ -n "${GIT_PS1_SHOWSTASHSTATE-}" ] &&
+       [ -r "$g/refs/stash" ]; then
+      s="$"
+    fi
+
+    if [ -n "${GIT_PS1_SHOWUNTRACKEDFILES-}" ] &&
+       [ "$(git config --bool bash.showUntrackedFiles)" != "false" ] &&
+       git ls-files --others --exclude-standard --error-unmatch -- '*' >/dev/null 2>/dev/null
+    then
+      u="%${ZSH_VERSION+%}"
+    fi
+
+    if [ -n "${GIT_PS1_SHOWUPSTREAM-}" ]; then
+      __git_ps1_show_upstream
+    fi
+  fi
+
+  local z="${GIT_PS1_STATESEPARATOR-" "}"
+
+  # NO color option unless in PROMPT_COMMAND mode
+  if [ $pcmode = yes ] && [ -n "${GIT_PS1_SHOWCOLORHINTS-}" ]; then
+    __git_ps1_colorize_gitstring
+  fi
+
+  local f="$w$i$s$u"
+  local gitstring="$c${b##refs/heads/}${f:+$z$f}$r$p"
+
+  if [ $pcmode = yes ]; then
+    if [[ -n ${ZSH_VERSION-} ]]; then
+      gitstring=$(printf -- "$printf_format" "$gitstring")
+    else
+      printf -v gitstring -- "$printf_format" "$gitstring"
+    fi
+    PS1="$ps1pc_start$gitstring$ps1pc_end"
+  else
+    printf -- "$printf_format" "$gitstring"
+  fi
+}
diff --git a/paddle/scripts/travis/before_install.osx.sh b/paddle/scripts/travis/before_install.osx.sh
deleted file mode 100755
index 80f031a74e7052d183b5ef21d432476ff1cce722..0000000000000000000000000000000000000000
--- a/paddle/scripts/travis/before_install.osx.sh
+++ /dev/null
@@ -1,4 +0,0 @@
-#!/bin/bash
-brew update
-brew tap homebrew/science
-brew install openblas swig md5sha1sum
diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh
index 5e6350b57458594163f23cca41a546d7bd9b1eda..7deb3e62e88de7e1306fcbfc5a28aa4372d678e6 100755
--- a/paddle/scripts/travis/build_and_test.sh
+++ b/paddle/scripts/travis/build_and_test.sh
@@ -2,18 +2,11 @@
 source ./common.sh
 
 NPROC=1
-if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
-  export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages
-  export PYTHONHOME=/opt/python/2.7.12
-  export PATH=/opt/python/2.7.12/bin:${PATH}
-  cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS}
-  NRPOC=`nproc`
-  make -j $NPROC
-  make coveralls
-  sudo make install
-elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
-  export PYTHONPATH=/usr/local/lib/python2.7/site-packages
-  cmake .. -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS}
-  NPROC=`sysctl -n hw.ncpu`
-  make -j $NPROC
-fi
+export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages
+export PYTHONHOME=/opt/python/2.7.12
+export PATH=/opt/python/2.7.12/bin:${PATH}
+cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS}
+NPROC=`nproc`
+make -j $NPROC
+make coveralls
+sudo make install
diff --git a/paddle/scripts/travis/docs.sh b/paddle/scripts/travis/docs.sh
index 6b43cad20b76e9abeb3cb10a726d3d8e3da5f8e2..53e998ef6c1b96d9e7d82b7effd12a27e6dc69f2 100755
--- a/paddle/scripts/travis/docs.sh
+++ b/paddle/scripts/travis/docs.sh
@@ -2,8 +2,12 @@
 
 # Add set -e, cd to directory.
 source ./common.sh
-
 # Compile Documentation only.
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_STYLE_CHECK=OFF ${EXTRA_CMAKE_OPTS}
+mkdir output
+make DESTDIR=./output install -j `nproc`
+pip install ./output/usr/local/opt/paddle/share/wheels/*
+rm -rf *
 cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DWITH_GPU=OFF -DWITH_DOC=ON ${EXTRA_CMAKE_OPTS}
 make paddle_docs paddle_docs_cn
 
@@ -25,26 +29,41 @@ TARGET_BRANCH="gh-pages"
 # Only deploy master branch to build latest documentation.
 SOURCE_BRANCH="master"
 
-# If is not a Github pull request, and in master branch.
-if [ "$TRAVIS_PULL_REQUEST" != "false" -o "$TRAVIS_BRANCH" != "$SOURCE_BRANCH"  ]; then
-  exit 0
-fi
-
 # Clone the repo to output directory
 git clone $REPO output
 cd output
 
-# checkout github page branch
-git checkout $TARGET_BRANCH || git checkout --orphan $TARGET_BRANCH
+function deploy_docs() {
+  SOURCE_BRANCH=$1
+  DIR=$2
+  # If is not a Github pull request
+  if [ "$TRAVIS_PULL_REQUEST" != "false" ]; then
+    exit 0
+  fi
+  # If it is not watched branch.
+  if [ "$TRAVIS_BRANCH" != "$SOURCE_BRANCH" ]; then
+    return
+  fi
 
-# remove old docs. mv new docs.
-rm -rf doc doc_cn
-mv ../doc/cn/html doc_cn
-mv ../doc/en/html doc
+  # checkout github page branch
+  git checkout $TARGET_BRANCH || git checkout --orphan $TARGET_BRANCH
+  
+  mkdir -p ${DIR}
+  # remove old docs. mv new docs.
+  set +e
+  rm -rf ${DIR}/doc ${DIR}/doc_cn
+  set -e
+  mv ../doc/cn/html ${DIR}/doc_cn
+  mv ../doc/en/html ${DIR}/doc
+  git add .
+}
+
+deploy_docs "master" "." 
+deploy_docs "develop" "./develop/"
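+# (Resulting layout, inferred from deploy_docs above: master's docs land in
+# ./doc and ./doc_cn at the root of the gh-pages tree, while develop's land
+# under ./develop/doc and ./develop/doc_cn.)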
 
 # Check is there anything changed.
 set +e
-git diff --exit-code >/dev/null
+git diff --cached --exit-code >/dev/null
 if [ $? -eq 0 ]; then
   echo "No changes to the output on this push; exiting."
   exit 0
@@ -57,7 +76,6 @@ if [ -n $SSL_KEY ]; then  # Only push updated docs for github.com/PaddlePaddle/P
   git config user.name "Travis CI"
   git config user.email "paddle-dev@baidu.com"
   git commit -m "Deploy to GitHub Pages: ${SHA}"
-
   # Set ssh private key
   openssl aes-256-cbc -K $SSL_KEY -iv $SSL_IV -in ../../paddle/scripts/travis/deploy_key.enc -out deploy_key -d
   chmod 600 deploy_key
diff --git a/paddle/setup.py.in b/paddle/setup.py.in
index c79666bc81b6f343f166422697cd3901ce8ff441..382d5be6ecfc26b4a524bb6a775bd1a805a34d96 100644
--- a/paddle/setup.py.in
+++ b/paddle/setup.py.in
@@ -55,6 +55,9 @@ elif is_osx == True:
 
 include_dirs = [np.get_include(), "../"]    # include numpy and paddle.
 
+os.environ["CC"] = "@CMAKE_C_COMPILER@"
+os.environ["CXX"] = "@CMAKE_CXX_COMPILER@"
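+# The @...@ placeholders are filled in by CMake when it configures this
+# setup.py.in; exporting them makes the extension build use the same
+# compilers as the C++ core (e.g. /usr/bin/cc and /usr/bin/c++ -- the
+# concrete paths are illustrative).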
+
 setup(name="py_paddle",
   version="@PADDLE_VERSION@",
   ext_modules=[
@@ -69,6 +72,7 @@ setup(name="py_paddle",
   packages=['py_paddle'],
   include_dirs = include_dirs,
   install_requires = [
+    'nltk>=3.2.2',
     'numpy>=1.8.0',      # The numpy is required.
     'protobuf>=3.0.0'    # The paddle protobuf version
   ],
diff --git a/paddle/trainer/Tester.cpp b/paddle/trainer/Tester.cpp
index 13aa28ae5d9699d267858d48e46797c756487ddd..80664fa877b324af73e3e3effa11e46eac6294e2 100644
--- a/paddle/trainer/Tester.cpp
+++ b/paddle/trainer/Tester.cpp
@@ -208,7 +208,7 @@ real Tester::forwardOneBatch(const DataBatch& dataBatch,
     return 0.0;  // In this case, there is no meaning to calculate cost
   }
 
-  return Argument::sumCosts(outArgs);
+  return Argument::sum(outArgs);
 }
 
 void Tester::testOnePassBatch(int passId) {
diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp
index bd84545375117b178d4324f0ad03f5bc35ae925d..b68e29cd5ea223272151e7a8b52d998832f47103 100644
--- a/paddle/trainer/Trainer.cpp
+++ b/paddle/trainer/Trainer.cpp
@@ -310,7 +310,7 @@ real Trainer::checkGradient() {
   std::vector<Argument> outArgs;
 
   trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC);
-  real cost = Argument::sumCosts(outArgs);
+  real cost = Argument::sum(outArgs);
   LOG(INFO) << "original cost=" << cost;
   trainerInternal_.getGradientMachine()->backward();
 
@@ -340,7 +340,7 @@ real Trainer::checkGradient() {
     parameter->getBuf(PARAMETER_VALUE)->copyFrom(newPara);
     parameter->setValueUpdated();
     trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC);
-    real newCost1 = Argument::sumCosts(outArgs);
+    real newCost1 = Argument::sum(outArgs);
 
     for (size_t i = 0; i < dim; ++i) {
       newp[i] = oldp[i] - step * d[i];
@@ -349,7 +349,7 @@ real Trainer::checkGradient() {
     parameter->getBuf(PARAMETER_VALUE)->copyFrom(newPara);
     parameter->setValueUpdated();
     trainerInternal_.getGradientMachine()->forward(inArgs, &outArgs, PASS_GC);
-    real newCost2 = Argument::sumCosts(outArgs);
+    real newCost2 = Argument::sum(outArgs);
 
     real trueDelta = 0.5 * (newCost1 - newCost2);
     real diff = (1e-20 + trueDelta) / (1e-20 + delta) - 1;
@@ -575,7 +575,7 @@ real Trainer::calcGradient(const DataBatch& dataBatch,
 
   trainerInternal_.getGradientMachine()->forwardBackward(
       inArgs, &outArgs, PASS_TRAIN);
-  real cost = Argument::sumCosts(outArgs);
+  real cost = Argument::sum(outArgs);
 
   offset = 0;
   for (auto& para : parameters) {
diff --git a/paddle/trainer/Trainer.h b/paddle/trainer/Trainer.h
index c8ee4726c24c335ceda22ea3a20049b01d11c149..fac589d1d711affcd008f90edf87d865c8362f69 100644
--- a/paddle/trainer/Trainer.h
+++ b/paddle/trainer/Trainer.h
@@ -30,10 +30,6 @@ limitations under the License. */
 #include "TrainerConfigHelper.h"
 #include "TrainerInternal.h"
 
-#ifdef PADDLE_METRIC_LEARNING
-#include "paddle/internals/metric_learning/MetricTrainer.h"
-#endif
-
 DECLARE_int32(num_passes);
 
 namespace paddle {
@@ -201,12 +197,8 @@ protected:
   // parameter util
   std::unique_ptr<ParameterUtil> paramUtil_;
 
-#ifdef PADDLE_METRIC_LEARNING
-  MetricTrainer trainerInternal_;
-#else
   // trainer Internal
   TrainerInternal trainerInternal_;
-#endif
 };
 
 }  // namespace paddle
diff --git a/paddle/trainer/TrainerInternal.cpp b/paddle/trainer/TrainerInternal.cpp
index f3b465b444167d4624a5e99c30e1257eda53ca2c..4c5d4a0913aaf3a9932b3d67806378ece4245304 100644
--- a/paddle/trainer/TrainerInternal.cpp
+++ b/paddle/trainer/TrainerInternal.cpp
@@ -134,7 +134,7 @@ void TrainerInternal::trainOneBatch(int64_t batchId,
   real cost = 0;
   {
     REGISTER_TIMER("sumCost");
-    cost = Argument::sumCosts(*outArgs);
+    cost = Argument::sum(*outArgs);
   }
 
   if (batchId % intconfig_->log_period == 0) {
diff --git a/paddle/utils/Error.h b/paddle/utils/Error.h
index 2b4fbef4e015e7c6895745f220bd444f3883c121..cda1b5c37dada8d0c6c77fc2fb03bb614d5301b5 100644
--- a/paddle/utils/Error.h
+++ b/paddle/utils/Error.h
@@ -37,10 +37,10 @@ namespace paddle {
  *
  * Error __must_check bar() {
  *   // do something.
- *   Status s = foo();  // invoke other method return status.
- *   if (!s) return s;
+ *   Error err = foo();  // invoke other method return status.
+ *   if (err) return err;
  *   // do something else.
- *   return Status();
+ *   return Error();
  * }
  * @endcode{cpp}
  *
@@ -53,8 +53,8 @@ namespace paddle {
  *
  * int foo(Error* error) {
  *   // Do something.
- *   Error s = bar();
- *   if (!s) {
+ *   Error err = bar();
+ *   if (err) {
- *     *error = s;
+ *     *error = err;
  *     return 0;
  *   }
@@ -68,10 +68,10 @@ namespace paddle {
  * }
  *
  * Error foobar() {
- *   Error s;
+ *   Error err;
  *   // do something.
- *   foo(&s);
- *   if (!s) return s;
+ *   foo(&err);
+ *   if (err) return err;
  * }
  * @endcode{cpp}
  *
@@ -112,16 +112,22 @@ public:
   }
 
   /**
-   * @brief operator bool, return True if there is no error.
+   * @brief operator bool, returns true if this Error contains an error.
    */
-  operator bool() const { return msg_ == nullptr; }
+  operator bool() const { return !this->isOK(); }
+
+  /**
+   * @brief isOK return True if there is no error.
+   * @return True if no error.
+   */
+  bool isOK() const { return msg_ == nullptr; }
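+
+  // Typical usage under the new semantics ("doSomething" stands for any
+  // function that returns Error):
+  //
+  //   Error err = doSomething();
+  //   if (err) LOG(ERROR) << err.msg();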
 
   /**
    * @brief check this status by glog.
    * @note It is a temp method used during cleaning Paddle code. It will be
    *       removed later.
    */
-  void check() const { CHECK(*this) << msg(); }
+  void check() const { CHECK(this->isOK()) << msg(); }
 
 private:
   std::shared_ptr<std::string> msg_;
diff --git a/paddle/utils/Flags.cpp b/paddle/utils/Flags.cpp
index e8f31bc811ac30d83e8203b784ee1f93a8d35d90..320f671ed97dbadc4fa1b4b52d5611cf9239e7dd 100644
--- a/paddle/utils/Flags.cpp
+++ b/paddle/utils/Flags.cpp
@@ -30,7 +30,6 @@ DEFINE_bool(parallel_nn,
 DEFINE_int32(trainer_count, 1, "Defined how many trainers to train");
 DEFINE_int32(gpu_id, 0, "Which gpu core to use");
 DEFINE_int32(port, 20134, "Listening port for pserver");
-DEFINE_int32(data_server_port, 21134, "Listening port for dserver");
 DEFINE_int32(ports_num,
              1,
              "Number of ports for sending dense parameter,"
diff --git a/paddle/utils/Flags.h b/paddle/utils/Flags.h
index 3e72f8356d883b353127ccae80f2881320d20b2b..dc4faef8331ed47b9ce3e952389b6469cd9fda2e 100644
--- a/paddle/utils/Flags.h
+++ b/paddle/utils/Flags.h
@@ -19,7 +19,6 @@ limitations under the License. */
 DECLARE_bool(parallel_nn);
 DECLARE_int32(async_count);
 DECLARE_int32(port);
-DECLARE_int32(data_server_port);
 DECLARE_bool(use_gpu);
 DECLARE_int32(gpu_id);
 DECLARE_int32(trainer_count);
diff --git a/paddle/utils/GlobalConstants.h b/paddle/utils/GlobalConstants.h
index 707346f2c76e59b50722f4f8805ebe56c3cf861b..0ec1c28dfbb2a7db9fa84c9eb2bc4dad806b78e9 100644
--- a/paddle/utils/GlobalConstants.h
+++ b/paddle/utils/GlobalConstants.h
@@ -23,11 +23,6 @@ enum PassType {
   PASS_TEST,    // Test pass
   PASS_GC,      // Gradient Check pass
   PASS_METRIC,  // pass for generate template output with no drop rate.
-  // pass for metric learning training with metric learning error, only used
-  // when we are doing KNN evaluation.
-  PASS_METRIC_TRAIN,
-  PASS_METRIC_TRAIN_WITH_NOERROR,  // Pass for metric learning training
-                                   // with no evaluation.
 };
 
 enum ParameterType {
diff --git a/paddle/utils/tests/test_Error.cpp b/paddle/utils/tests/test_Error.cpp
index 85156466e2cafd36d49941836c066a542dbbd60e..fdf326b17a1c8baa87e2a17fafae253565d1e699 100644
--- a/paddle/utils/tests/test_Error.cpp
+++ b/paddle/utils/tests/test_Error.cpp
@@ -18,17 +18,17 @@ limitations under the License. */
 
 TEST(Error, testAll) {
   paddle::Error error;
-  ASSERT_TRUE(error);
-  error = paddle::Error("I'm the error");
   ASSERT_FALSE(error);
+  error = paddle::Error("I'm the error");
+  ASSERT_TRUE(error);
   ASSERT_STREQ("I'm the error", error.msg());
 
   error = paddle::Error("error2");
-  ASSERT_FALSE(error);
+  ASSERT_TRUE(error);
   ASSERT_STREQ("error2", error.msg());
 
   int i = 3;
   auto error3 = paddle::Error("error%d", i);
-  ASSERT_FALSE(error3);
+  ASSERT_TRUE(error3);
   ASSERT_STREQ("error3", error3.msg());
 }
diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto
index be4634d5103c0f219389823d132b1977963017e1..65d5d50277b665e7c355202d6e8043f656ae92f1 100644
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
@@ -475,6 +475,10 @@ message EvaluatorConfig {
   // Used by ChunkEvaluator
   // chunk of these types are not counted
   repeated int32 excluded_chunk_types = 12;
+
+  // Used by ClassificationErrorEvaluator
+  // the number k in the top-k classification error
+  optional int32 top_k = 13 [default = 1];
 }
 
 message LinkConfig {
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index ee7a5bff84ca96ef1010fa7430356722f807fb0f..48e0a1993d07f801e65dfa54a991995c593fe475 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -4,7 +4,7 @@ set(OUTPUT_DIR
 file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
 file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
 file(GLOB UTILS_PY_FILES . ./paddle/utils/*.py)
-file(GLOB V2_PY_FILES . ./paddle/v2/*.py)
+file(GLOB_RECURSE V2_PY_FILES ./paddle/v2/ *.py)
 
 set(PY_FILES paddle/__init__.py
              ${TRAINER_PY_FILES}
@@ -24,6 +24,8 @@ add_custom_target(paddle_python ALL DEPENDS
     ${OUTPUT_DIR}/.timestamp)
 
 add_subdirectory(paddle/trainer_config_helpers/tests)
+add_subdirectory(paddle/v2/reader/tests)
+add_subdirectory(paddle/v2/tests)
 
 install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/dist/
     DESTINATION opt/paddle/share/wheels
diff --git a/python/paddle/trainer/PyDataProvider2.py b/python/paddle/trainer/PyDataProvider2.py
index bd24c68b6fe88eab03c814f8cac70db3880316f4..0e752c117c1ecfab72e2da2f830380e9524236e7 100644
--- a/python/paddle/trainer/PyDataProvider2.py
+++ b/python/paddle/trainer/PyDataProvider2.py
@@ -45,6 +45,23 @@ class CacheType(object):
 
 
 class InputType(object):
+    """
+    InputType is the base class for paddle input types.
+
+    ..  note::
+
+        This is a base class and should not be used by users directly.
+
+    :param dim: dimension of the input. If the input is an integer, it means
+                the value range. Otherwise, it means the size of the layer.
+    :type dim: int
+    :param seq_type: sequence type of input. 0 means it is not a sequence. 1
+                     means it is a variable length sequence. 2 means it is a
+                     nested sequence.
+    :type seq_type: int
+    :param type: data type of input.
+    :type type: int
+    """
     __slots__ = ['dim', 'seq_type', 'type']
 
     def __init__(self, dim, seq_type, tp):
@@ -54,19 +71,63 @@ class InputType(object):
 
 
 def dense_slot(dim, seq_type=SequenceType.NO_SEQUENCE):
+    """
+    Dense Vector. It means the input feature is a dense float vector. For
+    example, if the input is an image with 28*28 pixels, the input of the
+    Paddle neural network should be a dense vector with dimension 784.
+
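+    The example usage is:
+
+    ..  code-block:: python
+
+        # a 28*28 gray-scale image is fed as a 784-dimensional dense vector
+        data = dense_vector(784)
+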
+    :param dim: dimension of this vector.
+    :type dim: int
+    :param seq_type: sequence type of input.
+    :type seq_type: int
+    :return: An input type object.
+    :rtype: InputType
+    """
     return InputType(dim, seq_type, DataType.Dense)
 
 
 def sparse_non_value_slot(dim, seq_type=SequenceType.NO_SEQUENCE):
+    """
+    Sparse binary vector. It means the input feature is a sparse vector and
+    every element in this vector is either zero or one.
+
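+    The example usage is:
+
+    ..  code-block:: python
+
+        # a bag-of-words feature over a 10000-word vocabulary
+        data = sparse_binary_vector(10000)
+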
+    :param dim: dimension of this vector.
+    :type dim: int
+    :param seq_type: sequence type of this input.
+    :type seq_type: int
+    :return: An input type object.
+    :rtype: InputType
+    """
     return InputType(dim, seq_type, DataType.SparseNonValue)
 
 
 def sparse_value_slot(dim, seq_type=SequenceType.NO_SEQUENCE):
+    """
+    Sparse vector. It means the input feature is a sparse vector. Most of the
+    elements in this vector are zero; the others can be any float value.
+
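+    The example usage is:
+
+    ..  code-block:: python
+
+        # a weighted bag-of-words feature over a 10000-word vocabulary
+        data = sparse_vector(10000)
+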
+    :param dim: dimension of this vector.
+    :type dim: int
+    :param seq_type: sequence type of this input.
+    :type seq_type: int
+    :return: An input type object.
+    :rtype: InputType
+    """
     return InputType(dim, seq_type, DataType.SparseValue)
 
 
-def index_slot(dim, seq_type=SequenceType.NO_SEQUENCE):
-    return InputType(dim, seq_type, DataType.Index)
+def index_slot(value_range, seq_type=SequenceType.NO_SEQUENCE):
+    """
+    Data type of an integer.
+
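+    The example usage is:
+
+    ..  code-block:: python
+
+        # a class label in the range [0, 10)
+        label = integer_value(10)
+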
+    :param value_range: range of this integer. The valid values are in
+                        [0, value_range).
+    :type value_range: int
+    :param seq_type: sequence type of this input.
+    :type seq_type: int
+    :return: An input type object
+    :rtype: InputType
+    """
+    return InputType(value_range, seq_type, DataType.Index)
 
 
 dense_vector = dense_slot
@@ -76,6 +137,14 @@ integer_value = index_slot
 
 
 def dense_vector_sequence(dim):
+    """
+    Data type of a sequence of dense vectors.
+
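+    The example usage is:
+
+    ..  code-block:: python
+
+        # each timestep of the sequence is a 784-dimensional dense vector
+        data = dense_vector_sequence(784)
+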
+    :param dim: dimension of dense vector.
+    :type dim: int
+    :return: An input type object
+    :rtype: InputType
+    """
     return dense_vector(dim, seq_type=SequenceType.SEQUENCE)
 
 
@@ -84,6 +153,15 @@ def dense_vector_sub_sequence(dim):
 
 
 def sparse_binary_vector_sequence(dim):
+    """
+    Data type of a sequence of sparse binary vectors, in which every element
+    is either zero or one.
+
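+    The example usage is:
+
+    ..  code-block:: python
+
+        data = sparse_binary_vector_sequence(10000)
+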
+    :param dim: dimension of sparse vector.
+    :type dim: int
+    :return: An input type object
+    :rtype: InputType
+    """
     return sparse_binary_vector(dim, seq_type=SequenceType.SEQUENCE)
 
 
@@ -92,6 +170,15 @@ def sparse_binary_vector_sub_sequence(dim):
 
 
 def sparse_vector_sequence(dim):
+    """
+    Data type of a sequence of sparse vectors, in which most elements are zero
+    and the others can be any float value.
+
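+    The example usage is:
+
+    ..  code-block:: python
+
+        data = sparse_vector_sequence(10000)
+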
+    :param dim: dimension of sparse vector.
+    :type dim: int
+    :return: An input type object
+    :rtype: InputType
+    """
     return sparse_vector(dim, seq_type=SequenceType.SEQUENCE)
 
 
@@ -99,8 +186,14 @@ def sparse_vector_sub_sequence(dim):
     return sparse_vector(dim, seq_type=SequenceType.SUB_SEQUENCE)
 
 
-def integer_value_sequence(dim):
-    return integer_value(dim, seq_type=SequenceType.SEQUENCE)
+def integer_value_sequence(value_range):
+    """
+    Data type of a sequence of integers.
+
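+    The example usage is:
+
+    ..  code-block:: python
+
+        # a sequence of word ids, each in the range [0, 10000)
+        label = integer_value_sequence(10000)
+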
+    :param value_range: range of each element.
+    :type value_range: int
+    :return: An input type object
+    :rtype: InputType
+    """
+    return integer_value(value_range, seq_type=SequenceType.SEQUENCE)
 
 
 def integer_value_sub_sequence(dim):
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 575e1107413c22f1efe9c677093382a366fc3f67..efc9d98826742b482cb8e598d0e8544b2769a4ad 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1253,6 +1253,7 @@ def Evaluator(
         dict_file=None,
         result_file=None,
         num_results=None,
+        top_k=None,
         delimited=None,
         excluded_chunk_types=None, ):
     evaluator = g_config.model_config.evaluators.add()
@@ -1280,6 +1281,8 @@ def Evaluator(
         evaluator.result_file = result_file
     if num_results is not None:
         evaluator.num_results = num_results
+    if top_k is not None:
+        evaluator.top_k = top_k
     if delimited is not None:
         evaluator.delimited = delimited
 
@@ -2316,14 +2319,9 @@ def Generator(
 
 @config_layer('expand')
 class ExpandLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 trans_type='non-seq',
-                 device=None,
-                 bias=False):
+    def __init__(self, name, inputs, trans_type='non-seq', bias=False, **xargs):
         super(ExpandLayer, self).__init__(
-            name, 'expand', 0, inputs=inputs, device=device)
+            name, 'expand', 0, inputs=inputs, **xargs)
         config_assert(
             len(self.inputs) == 2, 'ExpandLayer takes 2 and only 2 inputs')
         self.config.trans_type = trans_type
@@ -2354,11 +2352,10 @@ class MaxLayer(LayerBase):
                  inputs,
                  trans_type='non-seq',
                  active_type='linear',
-                 device=None,
                  bias=False,
-                 output_max_index=None):
-        super(MaxLayer, self).__init__(
-            name, 'max', 0, inputs=inputs, device=device)
+                 output_max_index=None,
+                 **xargs):
+        super(MaxLayer, self).__init__(name, 'max', 0, inputs=inputs, **xargs)
         config_assert(len(self.inputs) == 1, 'MaxLayer must have 1 input')
         self.config.trans_type = trans_type
         self.config.active_type = active_type
@@ -2405,15 +2402,15 @@ class SequenceLastInstanceLayer(LayerBase):
                  inputs,
                  active_type='linear',
                  trans_type='non-seq',
-                 device=None,
-                 bias=False):
+                 bias=False,
+                 **xargs):
         super(SequenceLastInstanceLayer, self).__init__(
             name,
             'seqlastins',
             0,
             inputs=inputs,
-            device=device,
-            active_type=active_type)
+            active_type=active_type,
+            **xargs)
         config_assert(
             len(inputs) == 1, 'SequenceLastInstanceLayer must have 1 input')
         self.config.trans_type = trans_type
@@ -2425,39 +2422,29 @@ class SequenceLastInstanceLayer(LayerBase):
 
 @config_layer('seqfirstins')
 class SequenceFirstInstanceLayer(SequenceLastInstanceLayer):
-    def __init__(
-            self,
-            name,
-            inputs,
-            active_type='linear',
-            trans_type='non-seq',
-            device=None,
-            bias=False, ):
+    def __init__(self,
+                 name,
+                 inputs,
+                 active_type='linear',
+                 trans_type='non-seq',
+                 bias=False,
+                 **xargs):
         super(SequenceFirstInstanceLayer, self).__init__(
-            name,
-            inputs=inputs,
-            active_type=active_type,
-            device=device,
-            bias=bias)
+            name, inputs=inputs, active_type=active_type, bias=bias, **xargs)
         self.config.trans_type = trans_type
         self.config.select_first = True
 
 
 @config_layer('seqconcat')
 class SequenceConcatLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 active_type='linear',
-                 device=None,
-                 bias=False):
+    def __init__(self, name, inputs, active_type='linear', bias=False, **xargs):
         super(SequenceConcatLayer, self).__init__(
             name,
             'seqconcat',
             0,
             inputs=inputs,
-            device=device,
-            active_type=active_type)
+            active_type=active_type,
+            **xargs)
         config_assert(
             len(inputs) == 2, 'SequenceConcatLayer must have 2 inputs')
         for input_index in xrange(len(self.inputs)):
@@ -2473,15 +2460,15 @@ class SequenceReshapeLayer(LayerBase):
                  size,
                  inputs,
                  active_type='linear',
-                 device=None,
-                 bias=False):
+                 bias=False,
+                 **xargs):
         super(SequenceReshapeLayer, self).__init__(
             name,
             'seqreshape',
             size,
             inputs=inputs,
-            device=device,
-            active_type=active_type)
+            active_type=active_type,
+            **xargs)
         config_assert(
             len(inputs) == 1, 'SequenceReshapeLayer must have 1 inputs')
         self.set_layer_size(size)
@@ -2490,19 +2477,9 @@ class SequenceReshapeLayer(LayerBase):
 
 @config_layer('subseq')
 class SubSequenceLayer(LayerBase):
-    def __init__(self,
-                 name,
-                 inputs,
-                 active_type='linear',
-                 device=None,
-                 bias=False):
+    def __init__(self, name, inputs, active_type='linear', bias=False, **xargs):
         super(SubSequenceLayer, self).__init__(
-            name,
-            'subseq',
-            0,
-            inputs=inputs,
-            device=device,
-            active_type=active_type)
+            name, 'subseq', 0, inputs=inputs, active_type=active_type, **xargs)
         config_assert(len(inputs) == 3, 'SubSequenceLayer must have 3 inputs')
         input_layer0 = self.get_input_layer(0)
         size = input_layer0.size
@@ -2659,15 +2636,10 @@ class AverageLayer(LayerBase):
                  average_strategy='average',
                  trans_type='non-seq',
                  active_type='linear',
-                 device=None,
-                 bias=False):
+                 bias=False,
+                 **xargs):
         super(AverageLayer, self).__init__(
-            name,
-            'average',
-            0,
-            inputs=inputs,
-            device=device,
-            active_type=active_type)
+            name, 'average', 0, inputs=inputs, active_type=active_type, **xargs)
         self.config.average_strategy = average_strategy
         self.config.trans_type = trans_type
         config_assert(len(inputs) == 1, 'AverageLayer must have 1 input')
@@ -2691,9 +2663,9 @@ class CosSimLayer(LayerBase):
 
 @config_layer('tensor')
 class TensorLayer(LayerBase):
-    def __init__(self, name, size, inputs, device=None, bias=True, **xargs):
+    def __init__(self, name, size, inputs, bias=True, **xargs):
         super(TensorLayer, self).__init__(
-            name, 'tensor', size, inputs=inputs, device=device, **xargs)
+            name, 'tensor', size, inputs=inputs, **xargs)
         config_assert(len(self.inputs) == 2, 'TensorLayer must have 2 inputs')
         config_assert(size > 0, 'size must be positive')
         config_assert(inputs[1].parameter_name == None,
@@ -3044,7 +3016,7 @@ class CRFLayer(LayerBase):
         super(CRFLayer, self).__init__(name, 'crf', size, inputs, device=device)
         config_assert(2 <= len(self.inputs) <= 3,
                       'CRFLayer must have 2 or 3 inputs')
-        self.create_input_parameter(0, size * (size + 2), [size, size + 2])
+        self.create_input_parameter(0, size * (size + 2), [size + 2, size])
         self.config.coeff = coeff
 
 
@@ -3066,7 +3038,7 @@ class CRFDecodingLayer(LayerBase):
         config_assert(
             len(self.inputs) <= 2,
             'CRFDecodingLayer cannot have more than 2 inputs')
-        self.create_input_parameter(0, size * (size + 2), [size, size + 2])
+        self.create_input_parameter(0, size * (size + 2), [size + 2, size])
 
 
 @config_layer('ctc')
diff --git a/python/paddle/trainer_config_helpers/default_decorators.py b/python/paddle/trainer_config_helpers/default_decorators.py
index b7463a022a146d749711a55b278354b4cd90e907..69d860d9dab9c1d90e4d6a6940d66fcb551f6eb6 100644
--- a/python/paddle/trainer_config_helpers/default_decorators.py
+++ b/python/paddle/trainer_config_helpers/default_decorators.py
@@ -52,6 +52,10 @@ def wrap_param_default(param_names=None,
                     kwargs[name] = default_factory(func)
             return func(*args, **kwargs)
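+        # Preserve the decorated function's original argument spec so that
+        # its arguments can still be introspected after decoration.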
 
+        if hasattr(func, 'argspec'):
+            __wrapper__.argspec = func.argspec
+        else:
+            __wrapper__.argspec = inspect.getargspec(func)
         return __wrapper__
 
     return __impl__
diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py
index bd247ea9af9d8dfb2d476cdc62638bd65c11add5..567521ee9dbadb7a2502cfb9972ef0940e1e410a 100644
--- a/python/paddle/trainer_config_helpers/evaluators.py
+++ b/python/paddle/trainer_config_helpers/evaluators.py
@@ -71,6 +71,7 @@ def evaluator_base(
         result_file=None,
         num_results=None,
         delimited=None,
+        top_k=None,
         excluded_chunk_types=None, ):
     """
     Evaluator will evaluate the network status while training/testing.
@@ -104,12 +105,15 @@ def evaluator_base(
     :param weight: An input layer which is a weight for each sample.
                    Each evaluator may calculate differently to use this weight.
     :type weight: LayerOutput.
+    :param top_k: the number k in the top-k error rate
+    :type top_k: int
     """
     # inputs type assertions.
     assert classification_threshold is None or isinstance(
         classification_threshold, float)
     assert positive_label is None or isinstance(positive_label, int)
     assert num_results is None or isinstance(num_results, int)
+    assert top_k is None or isinstance(top_k, int)
 
     if not isinstance(input, list):
         input = [input]
@@ -130,6 +134,8 @@ def evaluator_base(
         dict_file=dict_file,
         result_file=result_file,
         delimited=delimited,
+        num_results=num_results,
+        top_k=top_k,
         excluded_chunk_types=excluded_chunk_types, )
 
 
@@ -139,6 +145,7 @@ def classification_error_evaluator(input,
                                    label,
                                    name=None,
                                    weight=None,
+                                   top_k=None,
                                    threshold=None):
     """
     Classification Error Evaluator. It will print error rate for classification.
@@ -167,6 +174,8 @@ def classification_error_evaluator(input,
                   then means not set weight. The larger weight it is, the more
                   important this sample is.
     :type weight: LayerOutput
+    :param top_k: the number k in the top-k error rate
+    :type top_k: int
     :param threshold: The classification threshold.
     :type threshold: float
     :return: None.
@@ -178,6 +187,7 @@ def classification_error_evaluator(input,
         input=input,
         label=label,
         weight=weight,
+        top_k=top_k,
         classification_threshold=threshold, )
 
 
diff --git a/python/paddle/trainer_config_helpers/layer_math.py b/python/paddle/trainer_config_helpers/layer_math.py
index 2d9e36f2b0d379d907634208a45c69efa9dbba3d..544b443825393c9a31c0375724d4ca63dac5c5eb 100644
--- a/python/paddle/trainer_config_helpers/layer_math.py
+++ b/python/paddle/trainer_config_helpers/layer_math.py
@@ -39,6 +39,7 @@ register_unary_math_op('abs', act.AbsActivation())
 register_unary_math_op('sigmoid', act.SigmoidActivation())
 register_unary_math_op('tanh', act.TanhActivation())
 register_unary_math_op('square', act.SquareActivation())
+register_unary_math_op('relu', act.ReluActivation())
 
 
 def add(layeroutput, other):
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 4087f3051e20fc6cf49e6840be22183714e1f12f..38972f8878d2544f67422d0f1d6fc85ee5a8bddf 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -14,6 +14,7 @@
 
 import functools
 import collections
+import inspect
 
 from paddle.trainer.config_parser import *
 from .activations import LinearActivation, SigmoidActivation, TanhActivation, \
@@ -37,6 +38,7 @@ __all__ = [
     "dotmul_projection",
     "dotmul_operator",
     "repeat_layer",
+    "seq_reshape_layer",
     "table_projection",
     "mixed_layer",
     "data_layer",
@@ -50,6 +52,7 @@ __all__ = [
     "cos_sim",
     "hsigmoid",
     "conv_projection",
+    "mse_cost",
     "regression_cost",
     'classification_cost',
     "LayerOutput",
@@ -59,6 +62,7 @@ __all__ = [
     'img_cmrnorm_layer',
     'addto_layer',
     'concat_layer',
+    'seq_concat_layer',
     'lstm_step_layer',
     'recurrent_group',
     'memory',
@@ -110,6 +114,8 @@ __all__ = [
     'priorbox_layer',
     'spp_layer',
     'pad_layer',
+    'eos_layer',
+    'layer_support',
 ]
 
 
@@ -124,6 +130,7 @@ class LayerType(object):
     GRUMEMORY = "gated_recurrent"
     SEQUENCE_LAST_INSTANCE = "seqlastins"
     SEQUENCE_FIRST_INSTANCE = "seqfirstins"
+    SEQUENCE_RESHAPE = "seqreshape"
     POOLING_MAX = "max"
     POOLING_AVG = 'average'
     FC_LAYER = "fc"
@@ -144,6 +151,7 @@ class LayerType(object):
 
     CONCAT_LAYER = 'concat'
     CONCAT_PROJ_LAYER = 'concat2'
+    SEQUENCE_CONCAT_LAYER = 'seqconcat'
 
     LSTM_STEP_LAYER = 'lstm_step'
     GRU_STEP_LAYER = 'gru_step'
@@ -318,6 +326,11 @@ def layer_support(*attrs):
                     val.check(method.__name__)
             return method(*args, **kwargs)
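+        # Keep the wrapped method's original argument spec available so that
+        # layer arguments can still be introspected after decoration.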
 
+        if hasattr(method, 'argspec'):
+            wrapper.argspec = method.argspec
+        else:
+            wrapper.argspec = inspect.getargspec(method)
+
         return wrapper
 
     return decorator
@@ -712,6 +725,7 @@ class MixedLayerType(LayerOutput):
         # update the size which might be computed inside MixedLayer
         # according to the operator's output size
         self.size = ml.config.size
+        self.finalized = True
 
 
 @wrap_name_default("mixed")
@@ -790,17 +804,16 @@ def data_layer(name, size, height=None, width=None, layer_attr=None):
 
     ..  code-block:: python
 
-        data = data_layer(name="input",
-                          size=1000)
+        data = data_layer(name="input", size=1000)
 
     :param name: Name of this data layer.
     :type name: basestring
     :param size: Size of this data layer.
     :type size: int
     :param height: Height of this data layer, used for image
-    :type size: int|None
+    :type height: int|None
     :param width: Width of this data layer, used for image
-    :type size: int|None
+    :type width: int|None
     :param layer_attr: Extra Layer Attribute.
     :type layer_attr: ExtraLayerAttribute.
     :return: LayerOutput object.
@@ -1291,6 +1304,12 @@ def last_seq(input,
     """
     Get Last Timestamp Activation of a sequence.
 
+    The simple usage is:
+
+    .. code-block:: python
+
+       seq = last_seq(input=layer)
+
     :param agg_level: Aggregated level
     :param name: Layer name.
     :type name: basestring
@@ -1329,6 +1348,12 @@ def first_seq(input,
     """
     Get First Timestamp Activation of a sequence.
 
+    The simple usage is:
+
+    .. code-block:: python
+
+       seq = first_seq(input=layer)
+
     :param agg_level: aggregation level
     :param name: Layer name.
     :type name: basestring
@@ -1429,7 +1454,7 @@ def repeat_layer(input, num_repeats, name=None, layer_attr=None):
 
     .. code-block:: python
 
-       expand = repeat_layer(layer, 4)
+       expand = repeat_layer(input=layer, num_repeats=4)
 
     :param input: Input layer
     :type input: LayerOutput
@@ -1456,6 +1481,61 @@ def repeat_layer(input, num_repeats, name=None, layer_attr=None):
         parents=[input])
 
 
+@wrap_name_default("seqreshape")
+@wrap_act_default(act=IdentityActivation())
+@wrap_bias_attr_default(has_bias=False)
+@layer_support()
+def seq_reshape_layer(input,
+                      reshape_size,
+                      act=None,
+                      name=None,
+                      layer_attr=None,
+                      bias_attr=None):
+    """
+    A layer for reshaping the sequence. Assume the input sequence has T instances,
+    the dimension of each instance is M, and the given reshape_size is N. Then
+    the output sequence has T*M/N instances, and the dimension of each instance
+    is N.
+
+    Note that T*M/N must be an integer.
+
+    The example usage is:
+
+    .. code-block:: python
+
+       reshape = seq_reshape_layer(input=layer, reshape_size=4)
+
+    :param input: Input layer.
+    :type input: LayerOutput
+    :param reshape_size: the size of reshaped sequence.
+    :type reshape_size: int
+    :param name: Layer name.
+    :type name: basestring
+    :param act: Activation type.
+    :type act: BaseActivation
+    :param layer_attr: extra layer attributes.
+    :type layer_attr: ExtraLayerAttribute.
+    :param bias_attr: The Bias Attribute. If no bias, then pass False or
+                      something not type of ParameterAttribute. None will get a
+                      default Bias.
+    :type bias_attr: ParameterAttribute or None or bool
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+
+    Layer(
+        inputs=[input.name],
+        name=name,
+        size=reshape_size,
+        type=LayerType.SEQUENCE_RESHAPE,
+        bias=ParamAttr.to_bias(bias_attr),
+        **ExtraAttr.to_kwargs(layer_attr))
+    return LayerOutput(
+        name=name,
+        size=reshape_size,
+        layer_type=LayerType.SEQUENCE_RESHAPE,
+        parents=[input])
+
+
 @wrap_name_default()
 @layer_support()
 def interpolation_layer(input, weight, name=None, layer_attr=None):
@@ -1746,6 +1826,12 @@ def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None):
     Note that the above computation is for one sample. Multiple samples are
     processed in one batch.
 
+    The example usage is:
+
+    .. code-block:: python
+
+       cos = cos_sim(a=layer1, b=layer2, size=3)
+
     :param name: layer name
     :type name: basestring
     :param a: input layer a
@@ -1907,6 +1993,16 @@ def img_conv_layer(input,
     pieces. First 256/4 = 64 channels will process by first 32 filters. The
     rest channels will be processed by rest group of filters.
 
+    The example usage is:
+
+    ..  code-block:: python
+
+        conv = img_conv_layer(input=data, filter_size=1, filter_size_y=1,
+                              num_channels=8,
+                              num_filters=16, stride=1,
+                              bias_attr=False,
+                              act=ReluActivation())
+
     :param name: Layer name.
     :type name: basestring
     :param input: Layer Input.
@@ -2046,6 +2142,34 @@ def img_pool_layer(input,
 
     .. _pooling: http://ufldl.stanford.edu/tutorial/supervised/Pooling/
 
+    - ceil_mode=True:
+
+    ..  math::
+
+        w = 1 + int(ceil(input\_width + 2 * padding - pool\_size) / float(stride))
+        h = 1 + int(ceil(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y))
+
+    - ceil_mode=False:
+
+    ..  math::
+
+        w = 1 + int(floor(input\_width + 2 * padding - pool\_size) / float(stride))
+        h = 1 + int(floor(input\_height + 2 * padding\_y - pool\_size\_y) / float(stride\_y))
+
+    The example usage is:
+
+    ..  code-block:: python
+
+        maxpool = img_pool_layer(input=conv,
+                                 pool_size=3,
+                                 pool_size_y=5,
+                                 num_channels=8,
+                                 stride=1,
+                                 stride_y=2,
+                                 padding=1,
+                                 padding_y=2,
+                                 pool_type=MaxPooling())
+
     :param padding: pooling padding width.
     :type padding: int
     :param padding_y: pooling padding height. It's equal to padding by default.
@@ -2072,19 +2196,6 @@ def img_pool_layer(input,
-    :param ceil_mode: Wether to use ceil mode to calculate output height and with.
-                      Defalut is True. If set false, Otherwise use floor.
+    :param ceil_mode: Whether to use ceil mode to calculate output height and
+                      width. Default is True; if set to False, floor mode is used.
 
-                      - ceil_mode=True:
-
-                      ..  math::
-
-                          w = 1 + int(ceil(input_width + 2 * padding - pool_size) / float(stride))
-                          h = 1 + int(ceil(input_height + 2 * padding_y - pool_size_y) / float(stride_y))
-
-                      - ceil_mode=False:
-
-                      ..  math::
-
-                          w = 1 + int(floor(input_width + 2 * padding - pool_size) / float(stride))
-                          h = 1 + int(floor(input_height + 2 * padding_y - pool_size_y) / float(stride_y))
     :type ceil_mode: bool
     :return: LayerOutput object.
     :rtype: LayerOutput
@@ -2146,6 +2257,15 @@ def spp_layer(input,
     The details please refer to
     `Kaiming He's paper `_.
 
+    The example usage is:
+
+    ..  code-block:: python
+
+        spp = spp_layer(input=data, 
+                        pyramid_height=2, 
+                        num_channels=16, 
+                        pool_type=MaxPooling())
+
     :param name: layer name.
     :type name: basestring
     :param input: layer's input.
@@ -2234,6 +2354,12 @@ def img_cmrnorm_layer(input,
     The details please refer to
     `Alex's paper `_.
 
+    The example usage is:
+
+    ..  code-block:: python
+    
+        norm = img_cmrnorm_layer(input=net, size=5)
+
     :param name: layer name.
     :type name: None|basestring
     :param input: layer's input.
@@ -2289,6 +2415,12 @@ def batch_norm_layer(input,
     The details of batch normalization please refer to this
     `paper `_.
 
+    The example usage is:
+
+    ..  code-block:: python
+    
+        norm = batch_norm_layer(input=net, act=ReluActivation())
+
     :param name: layer name.
     :type name: basestring
     :param input: batch normalization input. Better be linear activation.
@@ -2578,6 +2710,63 @@ def concat_layer(input, act=None, name=None, layer_attr=None, bias_attr=None):
         size=sz)
 
 
+@wrap_name_default("seqconcat")
+@wrap_act_default(act=IdentityActivation())
+@wrap_bias_attr_default(has_bias=False)
+@layer_support()
+def seq_concat_layer(a, b, act=None, name=None, layer_attr=None,
+                     bias_attr=None):
+    """
+    Concat sequence a with sequence b.
+
+    Inputs: 
+      - a = [a1, a2, ..., an]
+      - b = [b1, b2, ..., bn]
+      - Note that the length of a and b should be the same.
+        
+    Output: [a1, b1, a2, b2, ..., an, bn]
+
+    The example usage is:
+
+    ..  code-block:: python
+
+        concat = seq_concat_layer(a=layer1, b=layer2)
+
+    :param name: Layer name.
+    :type name: basestring
+    :param a: input sequence layer
+    :type a: LayerOutput
+    :param b: input sequence layer
+    :type b: LayerOutput
+    :param act: Activation type.
+    :type act: BaseActivation
+    :param layer_attr: Extra Layer Attribute.
+    :type layer_attr: ExtraLayerAttribute
+    :param bias_attr: The Bias Attribute. If no bias, then pass False or
+                      something not type of ParameterAttribute. None will get a
+                      default Bias.
+    :type bias_attr: ParameterAttribute or None or bool
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(a, LayerOutput) and isinstance(b, LayerOutput)
+    assert a.size == b.size
+    Layer(
+        name=name,
+        type=LayerType.SEQUENCE_CONCAT_LAYER,
+        inputs=[a.name, b.name],
+        active_type=act.name,
+        bias=ParamAttr.to_bias(bias_attr),
+        **ExtraLayerAttribute.to_kwargs(layer_attr))
+
+    return LayerOutput(
+        name,
+        layer_type=LayerType.SEQUENCE_CONCAT_LAYER,
+        parents=[a, b],
+        activation=act,
+        size=a.size)
+
+
 @wrap_name_default("memory", "memory_name")
 def memory(name,
            size,
@@ -3420,11 +3609,14 @@ def __cost_input__(input, label, weight=None):
 
 @wrap_name_default()
 @layer_support()
-def regression_cost(input, label, weight=None, name=None, layer_attr=None):
+def mse_cost(input, label, weight=None, name=None, layer_attr=None):
     """
-    Regression Layer.
+    Mean squared error cost:
+
+    ..  math::
+
+       \frac{1}{N}\sum_{i=1}^{N}(t_i - y_i)^2
 
-    TODO(yuyang18): Complete this method.
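+    The example usage is:
+
+    ..  code-block:: python
+
+       # 'prediction' and 'label' are LayerOutput objects from the network
+       cost = mse_cost(input=prediction, label=label)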
 
     :param name: layer name.
     :type name: basestring
@@ -3450,12 +3642,16 @@ def regression_cost(input, label, weight=None, name=None, layer_attr=None):
     return LayerOutput(name, LayerType.COST, parents=parents, size=1)
 
 
+regression_cost = mse_cost
+
+
 @wrap_name_default("cost")
 @layer_support()
 def classification_cost(input,
                         label,
                         weight=None,
                         name=None,
+                        top_k=None,
                         evaluator=classification_error_evaluator,
                         layer_attr=None):
     """
@@ -3470,6 +3666,8 @@ def classification_cost(input,
     :param weight: The weight affects the cost, namely the scale of cost.
                    It is an optional argument.
     :type weight: LayerOutput
+    :param top_k: the number k in the top-k error rate
+    :type top_k: int
     :param evaluator: Evaluator method.
     :param layer_attr: layer's extra attribute.
     :type layer_attr: ExtraLayerAttribute
@@ -3497,7 +3695,7 @@ def classification_cost(input,
         assert isinstance(e.for_classification, bool)
         assert e.for_classification
 
-        e(name=e.__name__, input=input, label=label, weight=weight)
+        e(name=e.__name__, input=input, label=label, weight=weight, top_k=top_k)
 
     if not isinstance(evaluator, collections.Sequence):
         evaluator = [evaluator]
@@ -3820,13 +4018,13 @@ def conv_shift_layer(a, b, name=None, layer_attr=None):
 
     .. code-block:: python
 
-       conv_shift = conv_shift_layer(input=[layer1, layer2])
+       conv_shift = conv_shift_layer(a=layer1, b=layer2)
 
     :param name: layer name
     :type name: basestring
     :param a: Input layer a.
     :type a: LayerOutput
-    :param b: input layer b
+    :param b: input layer b.
     :type b: LayerOutput
     :param layer_attr: layer's extra attribute.
     :type layer_attr: ExtraLayerAttribute
@@ -3918,8 +4116,8 @@ def tensor_layer(a,
 @wrap_act_default()
 @layer_support()
 def selective_fc_layer(input,
-                       select,
                        size,
+                       select=None,
                        act=None,
                        name=None,
                        pass_generation=False,
@@ -3946,6 +4144,7 @@ def selective_fc_layer(input,
     :type input: LayerOutput|list|tuple
     :param select: The select layer. The output of select layer should be a
                    sparse binary matrix, and treat as the mask of selective fc.
+                   If None, this layer behaves exactly like fc_layer.
     :type select: LayerOutput
     :param size: The layer dimension.
     :type size: int
@@ -4174,7 +4373,7 @@ def block_expand_layer(input,
 
     .. code-block:: python
 
-       block_expand = block_expand_layer(input,
+       block_expand = block_expand_layer(input=layer,
                                          num_channels=128,
                                          stride_x=1,
                                          stride_y=1,
@@ -4378,7 +4577,7 @@ def warp_ctc_layer(input,
         - You can set 'blank' to any value ranged in [0, num_classes], which
           should be consistent as that used in your labels.
         - As a native 'softmax' activation is interated to the warp-ctc library,
-         'linear' activation is expected instead in the 'input' layer.
+          'linear' activation is expected instead in the 'input' layer.
 
     The simple usage:
 
@@ -4511,6 +4710,13 @@ def crf_decoding_layer(input,
     this layer will also calculate error. output.value[i] is 1 for incorrect
     decoding or 0 for correct decoding.
 
+    The simple usage:
+
+    .. code-block:: python
+
+      crf_decoding = crf_decoding_layer(input=input,
+                                        size=label_dim)
+
     :param input: The first input layer.
     :type input: LayerOutput
     :param size: size of this layer.
diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
index ea46b557a26ce638742facda3eb6aa2feb4b2563..c9178e3c6a46a2d663ec368569e529e780b76a6f 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -4,6 +4,7 @@ test_sequence_pooling test_lstmemory_layer test_grumemory_layer
 last_first_seq test_expand_layer test_ntm_layers test_hsigmoid
 img_layers img_trans_layers util_layers simple_rnn_layers unused_layers test_cost_layers
 test_rnn_group shared_fc shared_lstm shared_gru test_cost_layers_with_weight
-test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops)
+test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
+test_seq_concat_reshape)
 
 export whole_configs=(test_split_datasource)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
index 3331c10d6497f58eb135208bd7abe48aacfb10ae..24c901c8ee3ab1c90fc14fbff761db06345a6313 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
@@ -7,8 +7,9 @@ x = layer_math.exp(x)
 x = layer_math.log(x)
 x = layer_math.abs(x)
 x = layer_math.sigmoid(x)
+x = layer_math.tanh(x)
 x = layer_math.square(x)
-x = layer_math.square(x)
+x = layer_math.relu(x)
 y = 1 + x
 y = y + 1
 y = x + y
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr
index da8da1b541f37a09654202f68232b99e4dac9f61..9b8a2ad9687d313e6c5017c2d7331eddf539af92 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/math_ops.protostr
@@ -65,13 +65,28 @@ layers {
     }
   }
 }
+layers {
+  name: "__tanh_0__"
+  type: "mixed"
+  size: 100
+  active_type: "tanh"
+  inputs {
+    input_layer_name: "__sigmoid_0__"
+    proj_conf {
+      type: "identity"
+      name: "___tanh_0__.w0"
+      input_size: 100
+      output_size: 100
+    }
+  }
+}
 layers {
   name: "__square_0__"
   type: "mixed"
   size: 100
   active_type: "square"
   inputs {
-    input_layer_name: "__sigmoid_0__"
+    input_layer_name: "__tanh_0__"
     proj_conf {
       type: "identity"
       name: "___square_0__.w0"
@@ -81,15 +96,15 @@ layers {
   }
 }
 layers {
-  name: "__square_1__"
+  name: "__relu_0__"
   type: "mixed"
   size: 100
-  active_type: "square"
+  active_type: "relu"
   inputs {
     input_layer_name: "__square_0__"
     proj_conf {
       type: "identity"
-      name: "___square_1__.w0"
+      name: "___relu_0__.w0"
       input_size: 100
       output_size: 100
     }
@@ -101,7 +116,7 @@ layers {
   size: 100
   active_type: ""
   inputs {
-    input_layer_name: "__square_1__"
+    input_layer_name: "__relu_0__"
   }
   slope: 1.0
   intercept: 1
@@ -123,7 +138,7 @@ layers {
   size: 100
   active_type: ""
   inputs {
-    input_layer_name: "__square_1__"
+    input_layer_name: "__relu_0__"
     proj_conf {
       type: "identity"
       name: "___mixed_0__.w0"
@@ -147,7 +162,7 @@ layers {
   size: 100
   active_type: ""
   inputs {
-    input_layer_name: "__square_1__"
+    input_layer_name: "__relu_0__"
   }
   slope: -1.0
   intercept: 0.0
@@ -339,8 +354,9 @@ sub_models {
   layer_names: "__log_0__"
   layer_names: "__abs_0__"
   layer_names: "__sigmoid_0__"
+  layer_names: "__tanh_0__"
   layer_names: "__square_0__"
-  layer_names: "__square_1__"
+  layer_names: "__relu_0__"
   layer_names: "__slope_intercept_layer_0__"
   layer_names: "__slope_intercept_layer_1__"
   layer_names: "__mixed_0__"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
index 10e59e21bc7a48bc53fb535f86f053c91f57c1df..05fd1c99d2db6e9faa3b3884ec9baf051791f9fe 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers.protostr
@@ -239,9 +239,9 @@ parameters {
   name: "___crf_layer_0__.w0"
   size: 24
   initial_mean: 0.0
-  initial_std: 0.5
-  dims: 4
+  initial_std: 0.408248290464
   dims: 6
+  dims: 4
   initial_strategy: 0
   initial_smart: true
 }
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
index 811b38ae4a51e8faedb59fea2b81a8be3cceeae6..3244181a63109335c4fba6ca4dd04ac8f0446313 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr
@@ -45,7 +45,7 @@ layers {
   coeff: 1.0
 }
 layers {
-  name: "__regression_cost_0__"
+  name: "__mse_cost_0__"
   type: "square_error"
   size: 1
   active_type: ""
@@ -84,7 +84,7 @@ input_layer_names: "input"
 input_layer_names: "label"
 input_layer_names: "weight"
 output_layer_names: "__cost_0__"
-output_layer_names: "__regression_cost_0__"
+output_layer_names: "__mse_cost_0__"
 evaluators {
   name: "classification_error_evaluator"
   type: "classification_error"
@@ -99,12 +99,12 @@ sub_models {
   layer_names: "weight"
   layer_names: "__fc_layer_0__"
   layer_names: "__cost_0__"
-  layer_names: "__regression_cost_0__"
+  layer_names: "__mse_cost_0__"
   input_layer_names: "input"
   input_layer_names: "label"
   input_layer_names: "weight"
   output_layer_names: "__cost_0__"
-  output_layer_names: "__regression_cost_0__"
+  output_layer_names: "__mse_cost_0__"
   evaluator_names: "classification_error_evaluator"
   is_recurrent_layer_group: false
 }
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr
new file mode 100644
index 0000000000000000000000000000000000000000..91284b4fb32fcfdbf6b9e7384ffe080574b78821
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_seq_concat_reshape.protostr
@@ -0,0 +1,51 @@
+type: "nn"
+layers {
+  name: "data1"
+  type: "data"
+  size: 30
+  active_type: ""
+}
+layers {
+  name: "data2"
+  type: "data"
+  size: 30
+  active_type: ""
+}
+layers {
+  name: "__seqconcat_0__"
+  type: "seqconcat"
+  size: 30
+  active_type: ""
+  inputs {
+    input_layer_name: "data1"
+  }
+  inputs {
+    input_layer_name: "data2"
+  }
+}
+layers {
+  name: "__seqreshape_0__"
+  type: "seqreshape"
+  size: 5
+  active_type: "linear"
+  inputs {
+    input_layer_name: "data1"
+  }
+}
+input_layer_names: "data1"
+input_layer_names: "data2"
+output_layer_names: "__seqconcat_0__"
+output_layer_names: "__seqreshape_0__"
+sub_models {
+  name: "root"
+  layer_names: "data1"
+  layer_names: "data2"
+  layer_names: "__seqconcat_0__"
+  layer_names: "__seqreshape_0__"
+  input_layer_names: "data1"
+  input_layer_names: "data2"
+  output_layer_names: "__seqconcat_0__"
+  output_layer_names: "__seqreshape_0__"
+  is_recurrent_layer_group: false
+}
+
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
index d30f70a55c5b1834074966dfb3f378e01447c8ab..1c0aa7f9b9ee45b9eaf82dc46a2648d834dcd4ad 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py
@@ -10,5 +10,5 @@ fc = fc_layer(input=data, size=10, act=SoftmaxActivation())
 outputs(
     classification_cost(
         input=fc, label=lbl, weight=wt),
-    regression_cost(
+    mse_cost(
         input=fc, label=lbl, weight=wt))
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py b/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c161ba805fb301e8feb8702ad61a8341df40e3f
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_seq_concat_reshape.py
@@ -0,0 +1,12 @@
+from paddle.trainer_config_helpers import *
+
+settings(batch_size=1000, learning_rate=1e-5)
+
+din1 = data_layer(name='data1', size=30)
+din2 = data_layer(name='data2', size=30)
+
+opts = []
+opts.append(seq_concat_layer(a=din1, b=din2))
+opts.append(seq_reshape_layer(input=din1, reshape_size=5))
+
+outputs(opts)
diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py
index b2ea87b086101d71e89c33ce7c1f4eb21afade5a..25526bf409cf82f26979a84700ce948ac969df0c 100644
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
@@ -11,7 +11,39 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 import optimizer
+import layer
+import activation
+import parameters
+import trainer
+import event
+import data_type
+import topology
+import data_feeder
+import networks
+from . import dataset
+from . import reader
+import attr
+import pooling
+import inference
+import py_paddle.swig_paddle as api
+import minibatch
+
+__all__ = [
+    'optimizer', 'layer', 'activation', 'parameters', 'init', 'trainer',
+    'event', 'data_type', 'attr', 'pooling', 'data_feeder', 'dataset', 'reader',
+    'topology', 'networks', 'infer'
+]
+
+
+def init(**kwargs):
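+    # Translate keyword arguments into gflags-style command line flags and
+    # pass them to the core, e.g. init(use_gpu=False, trainer_count=1)
+    # calls api.initPaddle('--use_gpu=False', '--trainer_count=1').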
+    args = []
+    for key in kwargs.keys():
+        args.append('--%s=%s' % (key, str(kwargs[key])))
+
+    api.initPaddle(*args)
+
 
-__all__ = ['optimizer']
+infer = inference.infer
+batch = minibatch.batch
diff --git a/python/paddle/v2/activation.py b/python/paddle/v2/activation.py
new file mode 100644
index 0000000000000000000000000000000000000000..21261a178203b633ca6cf59a5fc89edc24a868b9
--- /dev/null
+++ b/python/paddle/v2/activation.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.trainer_config_helpers.activations
+import copy
+
+__all__ = []
+
+suffix = 'Activation'
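+# Re-export every activation under a shorter name, e.g. TanhActivation
+# becomes paddle.v2.activation.Tanh.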
+for act in paddle.trainer_config_helpers.activations.__all__:
+    new_name = act[:-len(suffix)]
+    globals()[new_name] = copy.copy(
+        getattr(paddle.trainer_config_helpers.activations, act))
+    globals()[new_name].__name__ = new_name
+    __all__.append(new_name)
diff --git a/python/paddle/v2/attr.py b/python/paddle/v2/attr.py
new file mode 100644
index 0000000000000000000000000000000000000000..32f78614e7f8abe7cffdc7a50a9fa77f1fc1a780
--- /dev/null
+++ b/python/paddle/v2/attr.py
@@ -0,0 +1,27 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.trainer_config_helpers.attrs
+
+__all__ = [
+    "Param",
+    "Extra",
+]
+
+Param = paddle.trainer_config_helpers.attrs.ParameterAttribute
+Extra = paddle.trainer_config_helpers.attrs.ExtraLayerAttribute
+
+for each in paddle.trainer_config_helpers.attrs.__all__:
+    globals()[each] = getattr(paddle.trainer_config_helpers.attrs, each)
+    __all__.append(each)
diff --git a/python/paddle/v2/config_base.py b/python/paddle/v2/config_base.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ec1d7bbdf912b940ca4b8e7b20eb11310f0e74f
--- /dev/null
+++ b/python/paddle/v2/config_base.py
@@ -0,0 +1,154 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+import re
+from paddle.trainer_config_helpers.default_decorators import wrap_name_default
+import paddle.trainer_config_helpers as conf_helps
+
+
+class LayerType(type):
+    def __new__(cls, name, bases, attrs):
+        method_name = attrs.get('METHOD_NAME', None)
+        if method_name is not None:
+            method = getattr(conf_helps, method_name)
+            if method.__doc__ is not None:
+                mapper = attrs.get("__map_docstr__", None)
+                if mapper is not None:
+                    attrs['__doc__'] = LayerType.__map_docstr__(
+                        mapper(method.__doc__),
+                        method_name=method_name,
+                        name=name)
+                else:
+                    attrs['__doc__'] = LayerType.__map_docstr__(
+                        method.__doc__, method_name=method_name, name=name)
+        return super(LayerType, cls).__new__(cls, name, bases, attrs)
+
+    @staticmethod
+    def __map_docstr__(doc, name, method_name):
+        assert isinstance(doc, basestring)
+
+        # replace LayerOutput with paddle.v2.config_base.Layer
+        doc = doc.replace("LayerOutput", "paddle.v2.config_base.Layer")
+
+        doc = doc.replace('ParameterAttribute',
+                          'paddle.v2.attr.ParameterAttribute')
+
+        doc = re.sub(r'ExtraLayerAttribute[^\s]?',
+                     'paddle.v2.attr.ExtraAttribute', doc)
+
+        # xxx_layer to xxx
+        doc = re.sub(r"(?P[a-z]+)_layer", r"\g", doc)
+
+        # XxxxActivation to paddle.v2.Activation.Xxxx
+        doc = re.sub(r"(?P[A-Z][a-zA-Z]+)Activation",
+                     r"paddle.v2.Activation.\g", doc)
+
+        # TODO(yuyang18): Add more rules if needed.
+        return doc
+
+
+class Layer(object):
+    __metaclass__ = LayerType
+
+    def __init__(self, name=None, parent_layers=None):
+        assert isinstance(parent_layers, dict)
+        self.name = name
+        self.__contex__ = {}
+        self.__parent_layers__ = parent_layers
+
+    def to_proto(self, context):
+        """
+        function to set proto attribute
+        """
+        kwargs = dict()
+        for layer_name in self.__parent_layers__:
+            if not isinstance(self.__parent_layers__[layer_name],
+                              collections.Sequence):
+                v1_layer = self.__parent_layers__[layer_name].to_proto(
+                    context=context)
+            else:
+                v1_layer = map(lambda x: x.to_proto(context=context),
+                               self.__parent_layers__[layer_name])
+            kwargs[layer_name] = v1_layer
+
+        if self.context_name() is None:
+            return self.to_proto_impl(**kwargs)
+        elif self.context_name() not in context:
+            context[self.context_name()] = self.to_proto_impl(**kwargs)
+        self.__contex__ = context
+        if self.use_context_name():
+            return context[self.context_name()]
+        else:
+            return context[self.name]
+
+    def to_proto_impl(self, **kwargs):
+        raise NotImplementedError()
+
+    def context_name(self):
+        """
+        The context name is the key under which this layer's `to_proto_impl`
+        result is stored. If multiple layers share the same context name,
+        their `to_proto_impl` is invoked only once.
+        """
+        return self.name
+
+    def use_context_name(self):
+        return False
+
+    def calculate_size(self):
+        """
+        Lazily compute the size of this layer. It should only be called after
+        `to_proto_impl` of this layer has been invoked.
+        :return: the size of this layer's output.
+        """
+        return self.__contex__[self.context_name()].size
+
+
+def __convert_to_v2__(method_name, parent_names, is_default_name=True):
+    if is_default_name:
+        wrapper = wrap_name_default(name_prefix=method_name)
+    else:
+        wrapper = None
+
+    class V2LayerImpl(Layer):
+        METHOD_NAME = method_name
+
+        def __init__(self, **kwargs):
+            parent_layers = dict()
+            other_kwargs = dict()
+            for pname in parent_names:
+                if pname in kwargs:
+                    parent_layers[pname] = kwargs[pname]
+
+            for key in kwargs.keys():
+                if key not in parent_names:
+                    other_kwargs[key] = kwargs[key]
+
+            name = kwargs.get('name', None)
+            super(V2LayerImpl, self).__init__(name, parent_layers)
+            self.__other_kwargs__ = other_kwargs
+
+        if wrapper is not None:
+            __init__ = wrapper(__init__)
+
+        def to_proto_impl(self, **kwargs):
+            args = dict()
+            for each in kwargs:
+                args[each] = kwargs[each]
+            for each in self.__other_kwargs__:
+                args[each] = self.__other_kwargs__[each]
+            return getattr(conf_helps, method_name)(**args)
+
+    return V2LayerImpl
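+
+# A sketch of how this factory is meant to be used by the v2 layer package
+# (``fc_layer`` and its ``input``/``size`` arguments come from
+# trainer_config_helpers; ``some_layer`` is an illustrative parent Layer):
+#
+#     fc = __convert_to_v2__('fc_layer', parent_names=['input'])
+#     hidden = fc(input=some_layer, size=200)
+#     hidden.to_proto(context={})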
diff --git a/python/paddle/v2/data_feeder.py b/python/paddle/v2/data_feeder.py
new file mode 100644
index 0000000000000000000000000000000000000000..bda8e22fd282f8ff4a820e4ecb6b3bb421d57890
--- /dev/null
+++ b/python/paddle/v2/data_feeder.py
@@ -0,0 +1,112 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from py_paddle import DataProviderConverter
+
+import paddle.trainer.PyDataProvider2 as pydp2
+
+__all__ = ['DataFeeder']
+
+
+def default_feeding_map(data_types):
+    reader_dict = dict()
+    for i, tp in enumerate(data_types):
+        reader_dict[tp[0]] = i
+    return reader_dict
+
+
+class DataFeeder(DataProviderConverter):
+    """
+    DataFeeder converts the data returned by paddle.reader into a data
+    structure of Arguments which is defined in the API. The paddle.reader
+    usually returns a list of mini-batch data entries. Each data entry in
+    the list is one sample. Each sample is a list or a tuple with one
+    feature or multiple features. DataFeeder converts these mini-batch data
+    entries into Arguments in order to feed them to the C++ interface.
+
+    The example usage:
+
+    ..  code-block:: python
+
+        data_types = [('image', paddle.data_type.dense_vector(784)),
+                      ('label', paddle.data_type.integer_value(10))]
+        feeding = {'image': 0, 'label': 1}
+        feeder = DataFeeder(data_types=data_types, feeding=feeding)
+        minibatch_data = [
+            ([0.1] * 784, 5),  # first sample: a 784-d image and its label
+            ([0.2] * 784, 3)   # second sample
+        ]
+        # Each sample may also be a list instead of a tuple, for example:
+        #     [[0.1] * 784, 5]
+        arg = feeder(minibatch_data)
+
+    ..  note::
+
+        This module is for internal use only. Users should use the `reader`
+        interface.
+
+    :param data_types: A list to specify data name and type. Each item is
+                       a tuple of (data_name, data_type).
+    :type data_types: list
+    :param feeding: A dictionary to specify the position of each data
+                    in the input data.
+    :type feeding: dict
+    """
+
+    def __init__(self, data_types, feeding=None):
+        self.input_names = []
+        input_types = []
+        if feeding is None:
+            feeding = default_feeding_map(data_types)
+
+        self.feeding = feeding
+        for each in data_types:
+            self.input_names.append(each[0])
+            if not isinstance(each[1], pydp2.InputType):
+                raise TypeError("second item in each data_type should be an "
+                                "InputType")
+            input_types.append(each[1])
+        DataProviderConverter.__init__(self, input_types)
+
+    def __len__(self):
+        return len(self.input_names)
+
+    def convert(self, dat, argument=None):
+        """
+        :param dat: A list of mini-batch data. Each sample is a list or tuple
+                    with one feature or multiple features.
+
+        :type dat: list
+        :param argument: An Arguments object contains this mini-batch data with
+                         one or multiple features. The Arguments definition is
+                         in the API.
+        :type argument: py_paddle.swig_paddle.Arguments
+        """
+
+        def reorder_data(data):
+            retv = []
+            for each in data:
+                reorder = []
+                for name in self.input_names:
+                    reorder.append(each[self.feeding[name]])
+                retv.append(reorder)
+            return retv
+
+        return DataProviderConverter.convert(self, reorder_data(dat), argument)
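+
+# A reordering sketch under illustrative inputs: ``convert`` rearranges each
+# sample to match the order of ``data_types`` before the C++ conversion runs.
+#
+#     feeder = DataFeeder(
+#         data_types=[('image', paddle.v2.data_type.dense_vector(784)),
+#                     ('label', paddle.v2.data_type.integer_value(10))],
+#         feeding={'image': 1, 'label': 0})
+#     arg = feeder.convert([(5, [0.1] * 784)])  # reordered to [[0.1]*784, 5]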
diff --git a/python/paddle/v2/data_type.py b/python/paddle/v2/data_type.py
new file mode 100644
index 0000000000000000000000000000000000000000..d582f76ddf01ed3430a1d075624bbb8e0bf3f2a9
--- /dev/null
+++ b/python/paddle/v2/data_type.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.trainer.PyDataProvider2 as pydp2
+
+import_list = [
+    nm for nm in dir(pydp2)
+    if '_' in nm and nm[0] != '_' and ('value' in nm or 'vector' in nm)
+]
+import_list.extend(['InputType'])
+
+for nm in import_list:
+    globals()[nm] = getattr(pydp2, nm)
+
+__all__ = import_list
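+
+# A usage sketch; ``dense_vector`` and ``integer_value`` are among the names
+# the filter above re-exports from PyDataProvider2:
+#
+#     import paddle.v2.data_type as data_type
+#     image = data_type.dense_vector(784)
+#     label = data_type.integer_value(10)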
diff --git a/python/paddle/v2/dataset/__init__.py b/python/paddle/v2/dataset/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..80ff6295c34e853d8f69b9e78719af23a56d1fbb
--- /dev/null
+++ b/python/paddle/v2/dataset/__init__.py
@@ -0,0 +1,31 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Dataset package.
+"""
+
+import mnist
+import imikolov
+import imdb
+import cifar
+import movielens
+import conll05
+import uci_housing
+import sentiment
+import wmt14
+
+__all__ = [
+    'mnist', 'imikolov', 'imdb', 'cifar', 'movielens', 'conll05', 'sentiment',
+    'uci_housing', 'wmt14'
+]
diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9f7a830ee60a331b55a1e218923e690103e1c5b
--- /dev/null
+++ b/python/paddle/v2/dataset/cifar.py
@@ -0,0 +1,77 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+CIFAR dataset: https://www.cs.toronto.edu/~kriz/cifar.html
+
+This module downloads the CIFAR-10 and CIFAR-100 archives and parses their
+train/test splits into paddle reader creators.
+"""
+
+import cPickle
+import itertools
+import numpy
+import paddle.v2.dataset.common
+import tarfile
+
+__all__ = ['train100', 'test100', 'train10', 'test10']
+
+URL_PREFIX = 'https://www.cs.toronto.edu/~kriz/'
+CIFAR10_URL = URL_PREFIX + 'cifar-10-python.tar.gz'
+CIFAR10_MD5 = 'c58f30108f718f92721af3b95e74349a'
+CIFAR100_URL = URL_PREFIX + 'cifar-100-python.tar.gz'
+CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
+
+
+def reader_creator(filename, sub_name):
+    def read_batch(batch):
+        data = batch['data']
+        labels = batch.get('labels', batch.get('fine_labels', None))
+        assert labels is not None
+        for sample, label in itertools.izip(data, labels):
+            yield (sample / 255.0).astype(numpy.float32), int(label)
+
+    def reader():
+        with tarfile.open(filename, mode='r') as f:
+            names = (each_item.name for each_item in f
+                     if sub_name in each_item.name)
+
+            for name in names:
+                batch = cPickle.load(f.extractfile(name))
+                for item in read_batch(batch):
+                    yield item
+
+    return reader
+
+
+def train100():
+    return reader_creator(
+        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
+        'train')
+
+
+def test100():
+    return reader_creator(
+        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
+        'test')
+
+
+def train10():
+    return reader_creator(
+        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+        'data_batch')
+
+
+def test10():
+    return reader_creator(
+        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+        'test_batch')
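+
+# Usage sketch: each function above returns a reader creator, i.e. a callable
+# whose iteration yields (image, label) pairs.
+#
+#     for image, label in train10()():
+#         pass  # image: 3072 float32 values in [0, 1]; label: int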
diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..3021b68ddb02ecaa874e21681796c0912ad4cc06
--- /dev/null
+++ b/python/paddle/v2/dataset/common.py
@@ -0,0 +1,71 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import requests
+import hashlib
+import os
+import shutil
+import sys
+
+__all__ = ['DATA_HOME', 'download', 'md5file']
+
+DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset')
+
+if not os.path.exists(DATA_HOME):
+    os.makedirs(DATA_HOME)
+
+
+def md5file(fname):
+    hash_md5 = hashlib.md5()
+    f = open(fname, "rb")
+    for chunk in iter(lambda: f.read(4096), b""):
+        hash_md5.update(chunk)
+    f.close()
+    return hash_md5.hexdigest()
+
+
+def download(url, module_name, md5sum):
+    dirname = os.path.join(DATA_HOME, module_name)
+    if not os.path.exists(dirname):
+        os.makedirs(dirname)
+
+    filename = os.path.join(dirname, url.split('/')[-1])
+    if not (os.path.exists(filename) and md5file(filename) == md5sum):
+        print "Cache file %s not found, downloading %s" % (filename, url)
+        r = requests.get(url, stream=True)
+        total_length = r.headers.get('content-length')
+
+        if total_length is None:
+            with open(filename, 'wb') as f:
+                shutil.copyfileobj(r.raw, f)
+        else:
+            with open(filename, 'wb') as f:
+                dl = 0
+                total_length = int(total_length)
+                for data in r.iter_content(chunk_size=4096):
+                    dl += len(data)
+                    f.write(data)
+                    done = int(50 * dl / total_length)
+                    sys.stdout.write("\r[%s%s]" % ('=' * done,
+                                                   ' ' * (50 - done)))
+                    sys.stdout.flush()
+
+    return filename
+
+
+def dict_add(a_dict, ele):
+    if ele in a_dict:
+        a_dict[ele] += 1
+    else:
+        a_dict[ele] = 1
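+
+# Usage sketch with a hypothetical URL and checksum; ``download`` caches the
+# file under DATA_HOME/<module_name>/ and re-downloads on an MD5 mismatch:
+#
+#     path = download('http://example.com/data.tar.gz', 'example',
+#                     '0123456789abcdef0123456789abcdef')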
diff --git a/python/paddle/v2/dataset/conll05.py b/python/paddle/v2/dataset/conll05.py
new file mode 100644
index 0000000000000000000000000000000000000000..9eab49ee39325c1c60fc511e0bd834e83aa987f0
--- /dev/null
+++ b/python/paddle/v2/dataset/conll05.py
@@ -0,0 +1,198 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+CoNLL 2005 dataset. The PaddlePaddle semantic role labeling book chapter and
+demo use this dataset as an example. Because CoNLL 2005 is not freely
+available, the default download URL points to the CoNLL 2005 test set, which
+is public. Users can change the URL and MD5 to use their own copy of the
+dataset.
+
+TODO(yuyang18): Complete comments.
+"""
+
+import tarfile
+import gzip
+import itertools
+from common import download
+
+__all__ = ['test', 'get_dict', 'get_embedding']
+
+DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
+DATA_MD5 = '387719152ae52d60422c016e92a742fc'
+WORDDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/wordDict.txt'
+WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
+VERBDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/verbDict.txt'
+VERBDICT_MD5 = '0d2977293bbb6cbefab5b0f97db1e77c'
+TRGDICT_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/targetDict.txt'
+TRGDICT_MD5 = 'd8c7f03ceb5fc2e5a0fa7503a4353751'
+EMB_URL = 'http://paddlepaddle.bj.bcebos.com/demo/srl_dict_and_embedding/emb'
+EMB_MD5 = 'bf436eb0faa1f6f9103017f8be57cdb7'
+
+UNK_IDX = 0
+
+
+def load_dict(filename):
+    d = dict()
+    with open(filename, 'r') as f:
+        for i, line in enumerate(f):
+            d[line.strip()] = i
+    return d
+
+
+def corpus_reader(data_path, words_name, props_name):
+    """
+    Read one corpus and return a reader. Each element yielded by the reader
+    is a tuple of a sentence, a predicate and a label sequence. The sentence
+    is a list of words; the labels are a list of label strings.
+    :return: a reader over the corpus data.
+    :rtype: callable
+    """
+
+    def reader():
+        tf = tarfile.open(data_path)
+        wf = tf.extractfile(words_name)
+        pf = tf.extractfile(props_name)
+        with gzip.GzipFile(fileobj=wf) as words_file, gzip.GzipFile(
+                fileobj=pf) as props_file:
+            sentences = []
+            labels = []
+            one_seg = []
+            for word, label in itertools.izip(words_file, props_file):
+                word = word.strip()
+                label = label.strip().split()
+
+                if len(label) == 0:  # end of sentence
+                    for i in xrange(len(one_seg[0])):
+                        a_kind_label = [x[i] for x in one_seg]
+                        labels.append(a_kind_label)
+
+                    if len(labels) >= 1:
+                        verb_list = []
+                        for x in labels[0]:
+                            if x != '-':
+                                verb_list.append(x)
+
+                        for i, lbl in enumerate(labels[1:]):
+                            cur_tag = 'O'
+                            is_in_bracket = False
+                            lbl_seq = []
+                            verb_word = ''
+                            for l in lbl:
+                                if l == '*' and not is_in_bracket:
+                                    lbl_seq.append('O')
+                                elif l == '*' and is_in_bracket:
+                                    lbl_seq.append('I-' + cur_tag)
+                                elif l == '*)':
+                                    lbl_seq.append('I-' + cur_tag)
+                                    is_in_bracket = False
+                                elif l.find('(') != -1 and l.find(')') != -1:
+                                    cur_tag = l[1:l.find('*')]
+                                    lbl_seq.append('B-' + cur_tag)
+                                    is_in_bracket = False
+                                elif l.find('(') != -1 and l.find(')') == -1:
+                                    cur_tag = l[1:l.find('*')]
+                                    lbl_seq.append('B-' + cur_tag)
+                                    is_in_bracket = True
+                                else:
+                                    raise RuntimeError('Unexpected label: %s' %
+                                                       l)
+
+                            yield sentences, verb_list[i], lbl_seq
+
+                    sentences = []
+                    labels = []
+                    one_seg = []
+                else:
+                    sentences.append(word)
+                    one_seg.append(label)
+
+        pf.close()
+        wf.close()
+        tf.close()
+
+    return reader
+
+
+def reader_creator(corpus_reader,
+                   word_dict=None,
+                   predicate_dict=None,
+                   label_dict=None):
+    def reader():
+        for sentence, predicate, labels in corpus_reader():
+
+            sen_len = len(sentence)
+
+            verb_index = labels.index('B-V')
+            mark = [0] * len(labels)
+            if verb_index > 0:
+                mark[verb_index - 1] = 1
+                ctx_n1 = sentence[verb_index - 1]
+            else:
+                ctx_n1 = 'bos'
+
+            if verb_index > 1:
+                mark[verb_index - 2] = 1
+                ctx_n2 = sentence[verb_index - 2]
+            else:
+                ctx_n2 = 'bos'
+
+            mark[verb_index] = 1
+            ctx_0 = sentence[verb_index]
+
+            if verb_index < len(labels) - 1:
+                mark[verb_index + 1] = 1
+                ctx_p1 = sentence[verb_index + 1]
+            else:
+                ctx_p1 = 'eos'
+
+            if verb_index < len(labels) - 2:
+                mark[verb_index + 2] = 1
+                ctx_p2 = sentence[verb_index + 2]
+            else:
+                ctx_p2 = 'eos'
+
+            word_idx = [word_dict.get(w, UNK_IDX) for w in sentence]
+
+            ctx_n2_idx = [word_dict.get(ctx_n2, UNK_IDX)] * sen_len
+            ctx_n1_idx = [word_dict.get(ctx_n1, UNK_IDX)] * sen_len
+            ctx_0_idx = [word_dict.get(ctx_0, UNK_IDX)] * sen_len
+            ctx_p1_idx = [word_dict.get(ctx_p1, UNK_IDX)] * sen_len
+            ctx_p2_idx = [word_dict.get(ctx_p2, UNK_IDX)] * sen_len
+
+            pred_idx = [predicate_dict.get(predicate)] * sen_len
+            label_idx = [label_dict.get(w) for w in labels]
+
+            yield word_idx, ctx_n2_idx, ctx_n1_idx, \
+              ctx_0_idx, ctx_p1_idx, ctx_p2_idx, pred_idx, mark, label_idx
+
+    return reader
+
+
+def get_dict():
+    word_dict = load_dict(download(WORDDICT_URL, 'conll05st', WORDDICT_MD5))
+    verb_dict = load_dict(download(VERBDICT_URL, 'conll05st', VERBDICT_MD5))
+    label_dict = load_dict(download(TRGDICT_URL, 'conll05st', TRGDICT_MD5))
+    return word_dict, verb_dict, label_dict
+
+
+def get_embedding():
+    return download(EMB_URL, 'conll05st', EMB_MD5)
+
+
+def test():
+    word_dict, verb_dict, label_dict = get_dict()
+    reader = corpus_reader(
+        download(DATA_URL, 'conll05st', DATA_MD5),
+        words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
+        props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
+    return reader_creator(reader, word_dict, verb_dict, label_dict)
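+
+# Usage sketch: ``test()`` returns a reader creator over the public test
+# split; each sample is the nine-field tuple yielded by ``reader_creator``.
+#
+#     for sample in test()():
+#         words, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, pred, mark, label = sample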
diff --git a/python/paddle/v2/dataset/imdb.py b/python/paddle/v2/dataset/imdb.py
new file mode 100644
index 0000000000000000000000000000000000000000..76019d9f54020ff6f02c17eb6047cbd014a8ccf2
--- /dev/null
+++ b/python/paddle/v2/dataset/imdb.py
@@ -0,0 +1,125 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+IMDB dataset: http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz
+
+This module downloads the aclImdb archive and parses its train/test splits
+into paddle readers of (word-index list, label) samples.
+"""
+
+import paddle.v2.dataset.common
+import tarfile
+import Queue
+import re
+import string
+import threading
+
+__all__ = ['build_dict', 'train', 'test']
+
+URL = 'http://ai.stanford.edu/%7Eamaas/data/sentiment/aclImdb_v1.tar.gz'
+MD5 = '7c2ac02c03563afcf9b574c7e56c153a'
+
+
+# Read files that match pattern.  Tokenize and yield each file.
+def tokenize(pattern):
+    with tarfile.open(paddle.v2.dataset.common.download(URL, 'imdb',
+                                                        MD5)) as tarf:
+        # Note that we should use tarfile.next(), which does
+        # sequential access of member files, other than
+        # tarfile.extractfile, which does random access and might
+        # destroy hard disks.
+        tf = tarf.next()
+        while tf is not None:
+            if bool(pattern.match(tf.name)):
+                # newline and punctuations removal and ad-hoc tokenization.
+                yield tarf.extractfile(tf).read().rstrip("\n\r").translate(
+                    None, string.punctuation).lower().split()
+            tf = tarf.next()
+
+
+def build_dict(pattern, cutoff):
+    word_freq = {}
+    for doc in tokenize(pattern):
+        for word in doc:
+            paddle.v2.dataset.common.dict_add(word_freq, word)
+
+    # Not sure if we should prune less-frequent words here.
+    word_freq = filter(lambda x: x[1] > cutoff, word_freq.items())
+
+    dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
+    words, _ = list(zip(*dictionary))
+    word_idx = dict(zip(words, xrange(len(words))))
+    word_idx['<unk>'] = len(words)
+    return word_idx
+
+
+def reader_creator(pos_pattern, neg_pattern, word_idx, buffer_size):
+    UNK = word_idx['<unk>']
+
+    qs = [Queue.Queue(maxsize=buffer_size), Queue.Queue(maxsize=buffer_size)]
+
+    def load(pattern, queue):
+        for doc in tokenize(pattern):
+            queue.put(doc)
+        queue.put(None)
+
+    def reader():
+        # Create two threads that load positive and negative samples
+        # into qs.
+        t0 = threading.Thread(
+            target=load, args=(
+                pos_pattern,
+                qs[0], ))
+        t0.daemon = True
+        t0.start()
+
+        t1 = threading.Thread(
+            target=load, args=(
+                neg_pattern,
+                qs[1], ))
+        t1.daemon = True
+        t1.start()
+
+        # Read alternately from qs[0] and qs[1].
+        i = 0
+        doc = qs[i].get()
+        while doc is not None:
+            yield [word_idx.get(w, UNK) for w in doc], i % 2
+            i += 1
+            doc = qs[i % 2].get()
+
+        # Once one queue is exhausted, read the rest from the other queue.
+        i += 1
+        doc = qs[i % 2].get()
+        while doc is not None:
+            yield [word_idx.get(w, UNK) for w in doc], i % 2
+            doc = qs[i % 2].get()
+
+    return reader()
+
+
+def train(word_idx):
+    return reader_creator(
+        re.compile("aclImdb/train/pos/.*\.txt$"),
+        re.compile("aclImdb/train/neg/.*\.txt$"), word_idx, 1000)
+
+
+def test(word_idx):
+    return reader_creator(
+        re.compile("aclImdb/test/pos/.*\.txt$"),
+        re.compile("aclImdb/test/neg/.*\.txt$"), word_idx, 1000)
+
+
+def word_dict():
+    return build_dict(
+        re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
diff --git a/python/paddle/v2/dataset/imikolov.py b/python/paddle/v2/dataset/imikolov.py
new file mode 100644
index 0000000000000000000000000000000000000000..97c160f111d09d61eb860c7f02552e635f2400a7
--- /dev/null
+++ b/python/paddle/v2/dataset/imikolov.py
@@ -0,0 +1,91 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+imikolov's simple dataset: http://www.fit.vutbr.cz/~imikolov/rnnlm/
+
+This module downloads the PTB text from the simple-examples tarball and
+parses its train/valid splits into paddle readers of word n-grams.
+"""
+import paddle.v2.dataset.common
+import tarfile
+
+__all__ = ['train', 'test', 'build_dict']
+
+URL = 'http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz'
+MD5 = '30177ea32e27c525793142b6bf2c8e2d'
+
+
+def word_count(f, word_freq=None):
+    add = paddle.v2.dataset.common.dict_add
+    if word_freq is None:
+        word_freq = {}
+
+    for l in f:
+        for w in l.strip().split():
+            add(word_freq, w)
+        add(word_freq, '<s>')
+        add(word_freq, '<e>')
+
+    return word_freq
+
+
+def build_dict():
+    train_filename = './simple-examples/data/ptb.train.txt'
+    test_filename = './simple-examples/data/ptb.valid.txt'
+    with tarfile.open(
+            paddle.v2.dataset.common.download(
+                paddle.v2.dataset.imikolov.URL, 'imikolov',
+                paddle.v2.dataset.imikolov.MD5)) as tf:
+        trainf = tf.extractfile(train_filename)
+        testf = tf.extractfile(test_filename)
+        word_freq = word_count(testf, word_count(trainf))
+        if '<unk>' in word_freq:
+            # remove <unk> for now, since we will set it as last index
+            del word_freq['<unk>']
+
+        TYPO_FREQ = 50
+        word_freq = filter(lambda x: x[1] > TYPO_FREQ, word_freq.items())
+
+        word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
+        words, _ = list(zip(*word_freq_sorted))
+        word_idx = dict(zip(words, xrange(len(words))))
+        word_idx['<unk>'] = len(words)
+
+    return word_idx
+
+
+def reader_creator(filename, word_idx, n):
+    def reader():
+        with tarfile.open(
+                paddle.v2.dataset.common.download(
+                    paddle.v2.dataset.imikolov.URL, 'imikolov',
+                    paddle.v2.dataset.imikolov.MD5)) as tf:
+            f = tf.extractfile(filename)
+
+            UNK = word_idx['<unk>']
+            for l in f:
+                l = ['<s>'] + l.strip().split() + ['<e>']
+                if len(l) >= n:
+                    l = [word_idx.get(w, UNK) for w in l]
+                    for i in range(n, len(l) + 1):
+                        yield tuple(l[i - n:i])
+
+    return reader
+
+
+def train(word_idx, n):
+    return reader_creator('./simple-examples/data/ptb.train.txt', word_idx, n)
+
+
+def test(word_idx, n):
+    return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n)
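+
+# Usage sketch: each sample is an n-gram of word indices, padded with the
+# begin/end markers and mapped through the dictionary.
+#
+#     word_idx = build_dict()
+#     for gram in train(word_idx, 5)():
+#         pass  # gram: a 5-tuple of word indices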
diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..16f2fcb99de4cb1971a7375a97b5daa209ee95ef
--- /dev/null
+++ b/python/paddle/v2/dataset/mnist.py
@@ -0,0 +1,108 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+MNIST dataset.
+
+This module will download the dataset from http://yann.lecun.com/exdb/mnist/
+and parse the train set and test set into paddle reader creators.
+"""
+import paddle.v2.dataset.common
+import subprocess
+import numpy
+import platform
+__all__ = ['train', 'test']
+
+URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/'
+TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz'
+TEST_IMAGE_MD5 = '9fb629c4189551a2d022fa330f9573f3'
+TEST_LABEL_URL = URL_PREFIX + 't10k-labels-idx1-ubyte.gz'
+TEST_LABEL_MD5 = 'ec29112dd5afa0611ce80d1b7f02629c'
+TRAIN_IMAGE_URL = URL_PREFIX + 'train-images-idx3-ubyte.gz'
+TRAIN_IMAGE_MD5 = 'f68b3c2dcbeaaa9fbdd348bbdeb94873'
+TRAIN_LABEL_URL = URL_PREFIX + 'train-labels-idx1-ubyte.gz'
+TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432'
+
+
+def reader_creator(image_filename, label_filename, buffer_size):
+    def reader():
+        if platform.system() == 'Darwin':
+            zcat_cmd = 'gzcat'
+        elif platform.system() == 'Linux':
+            zcat_cmd = 'zcat'
+        else:
+            raise NotImplementedError()
+
+        # According to http://stackoverflow.com/a/38061619/724872, we
+        # cannot use standard package gzip here.
+        m = subprocess.Popen([zcat_cmd, image_filename], stdout=subprocess.PIPE)
+        m.stdout.read(16)  # skip some magic bytes
+
+        l = subprocess.Popen([zcat_cmd, label_filename], stdout=subprocess.PIPE)
+        l.stdout.read(8)  # skip some magic bytes
+
+        try:  # the caller may stop iterating early, so always clean up.
+            while True:
+                labels = numpy.fromfile(
+                    l.stdout, 'ubyte', count=buffer_size).astype("int")
+
+                if labels.size != buffer_size:
+                    break  # numpy.fromfile returns empty slice after EOF.
+
+                images = numpy.fromfile(
+                    m.stdout, 'ubyte', count=buffer_size * 28 * 28).reshape(
+                        (buffer_size, 28 * 28)).astype('float32')
+
+                images = images / 255.0 * 2.0 - 1.0
+
+                for i in xrange(buffer_size):
+                    yield images[i, :], int(labels[i])
+        finally:
+            m.terminate()
+            l.terminate()
+
+    return reader
+
+
+def train():
+    """
+    MNIST train set creator.
+
+    It returns a reader creator; each sample in the reader is the image
+    pixels scaled to [-1, 1] and a label in [0, 9].
+
+    :return: Train reader creator
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist',
+                                          TRAIN_IMAGE_MD5),
+        paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist',
+                                          TRAIN_LABEL_MD5), 100)
+
+
+def test():
+    """
+    MNIST test set creator.
+
+    It returns a reader creator; each sample in the reader is the image
+    pixels scaled to [-1, 1] and a label in [0, 9].
+
+    :return: Test reader creator.
+    :rtype: callable
+    """
+    return reader_creator(
+        paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist',
+                                          TEST_IMAGE_MD5),
+        paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist',
+                                          TEST_LABEL_MD5), 100)
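+
+# Usage sketch:
+#
+#     for image, label in train()():
+#         pass  # image: 784 float32 values in [-1, 1]; label: int in [0, 9]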
diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py
new file mode 100644
index 0000000000000000000000000000000000000000..25fd8227da2f219d75c6b830e65627ecf35be453
--- /dev/null
+++ b/python/paddle/v2/dataset/movielens.py
@@ -0,0 +1,209 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Movielens 1-M dataset.
+
+This module downloads the MovieLens 1-M archive and parses the user, movie
+and rating data into paddle reader creators.
+"""
+
+import zipfile
+from common import download
+import re
+import random
+import functools
+
+__all__ = [
+    'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id',
+    'age_table', 'movie_categories', 'max_job_id', 'user_info', 'movie_info'
+]
+
+age_table = [1, 18, 25, 35, 45, 50, 56]
+
+
+class MovieInfo(object):
+    def __init__(self, index, categories, title):
+        self.index = int(index)
+        self.categories = categories
+        self.title = title
+
+    def value(self):
+        return [
+            self.index, [CATEGORIES_DICT[c] for c in self.categories],
+            [MOVIE_TITLE_DICT[w.lower()] for w in self.title.split()]
+        ]
+
+    def __str__(self):
+        return "" % (
+            self.index, self.title, self.categories)
+
+    def __repr__(self):
+        return self.__str__()
+
+
+class UserInfo(object):
+    def __init__(self, index, gender, age, job_id):
+        self.index = int(index)
+        self.is_male = gender == 'M'
+        self.age = age_table.index(int(age))
+        self.job_id = int(job_id)
+
+    def value(self):
+        return [self.index, 0 if self.is_male else 1, self.age, self.job_id]
+
+    def __str__(self):
+        return "" % (
+            self.index, "M"
+            if self.is_male else "F", age_table[self.age], self.job_id)
+
+    def __repr__(self):
+        return str(self)
+
+
+MOVIE_INFO = None
+MOVIE_TITLE_DICT = None
+CATEGORIES_DICT = None
+USER_INFO = None
+
+
+def __initialize_meta_info__():
+    fn = download(
+        url='http://files.grouplens.org/datasets/movielens/ml-1m.zip',
+        module_name='movielens',
+        md5sum='c4d9eecfca2ab87c1945afe126590906')
+    global MOVIE_INFO
+    if MOVIE_INFO is None:
+        pattern = re.compile(r'^(.*)\((\d+)\)$')
+        with zipfile.ZipFile(file=fn) as package:
+            # Build the movie and user metadata tables from movies.dat and
+            # users.dat; the archive member list itself is not needed.
+            MOVIE_INFO = dict()
+            title_word_set = set()
+            categories_set = set()
+            with package.open('ml-1m/movies.dat') as movie_file:
+                for line in movie_file:
+                    movie_id, title, categories = line.strip().split('::')
+                    categories = categories.split('|')
+                    for c in categories:
+                        categories_set.add(c)
+                    title = pattern.match(title).group(1)
+                    MOVIE_INFO[int(movie_id)] = MovieInfo(
+                        index=movie_id, categories=categories, title=title)
+                    for w in title.split():
+                        title_word_set.add(w.lower())
+
+            global MOVIE_TITLE_DICT
+            MOVIE_TITLE_DICT = dict()
+            for i, w in enumerate(title_word_set):
+                MOVIE_TITLE_DICT[w] = i
+
+            global CATEGORIES_DICT
+            CATEGORIES_DICT = dict()
+            for i, c in enumerate(categories_set):
+                CATEGORIES_DICT[c] = i
+
+            global USER_INFO
+            USER_INFO = dict()
+            with package.open('ml-1m/users.dat') as user_file:
+                for line in user_file:
+                    uid, gender, age, job, _ = line.strip().split("::")
+                    USER_INFO[int(uid)] = UserInfo(
+                        index=uid, gender=gender, age=age, job_id=job)
+    return fn
+
+
+def __reader__(rand_seed=0, test_ratio=0.1, is_test=False):
+    fn = __initialize_meta_info__()
+    rand = random.Random(x=rand_seed)
+    with zipfile.ZipFile(file=fn) as package:
+        with package.open('ml-1m/ratings.dat') as rating:
+            for line in rating:
+                if (rand.random() < test_ratio) == is_test:
+                    uid, mov_id, rating, _ = line.strip().split("::")
+                    uid = int(uid)
+                    mov_id = int(mov_id)
+                    rating = float(rating) * 2 - 5.0
+
+                    mov = MOVIE_INFO[mov_id]
+                    usr = USER_INFO[uid]
+                    yield usr.value() + mov.value() + [[rating]]
+
+
+def __reader_creator__(**kwargs):
+    return lambda: __reader__(**kwargs)
+
+
+train = functools.partial(__reader_creator__, is_test=False)
+test = functools.partial(__reader_creator__, is_test=True)
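+
+# Usage sketch: ``train``/``test`` are reader creators, so call them twice.
+# Each sample is user features + movie features + [[rating]]:
+#
+#     for sample in train()():
+#         features, rating = sample[:-1], sample[-1][0]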
+
+
+def get_movie_title_dict():
+    __initialize_meta_info__()
+    return MOVIE_TITLE_DICT
+
+
+def __max_index_info__(a, b):
+    if a.index > b.index:
+        return a
+    else:
+        return b
+
+
+def max_movie_id():
+    __initialize_meta_info__()
+    return reduce(__max_index_info__, MOVIE_INFO.viewvalues()).index
+
+
+def max_user_id():
+    __initialize_meta_info__()
+    return reduce(__max_index_info__, USER_INFO.viewvalues()).index
+
+
+def __max_job_id_impl__(a, b):
+    if a.job_id > b.job_id:
+        return a
+    else:
+        return b
+
+
+def max_job_id():
+    __initialize_meta_info__()
+    return reduce(__max_job_id_impl__, USER_INFO.viewvalues()).job_id
+
+
+def movie_categories():
+    __initialize_meta_info__()
+    return CATEGORIES_DICT
+
+
+def user_info():
+    __initialize_meta_info__()
+    return USER_INFO
+
+
+def movie_info():
+    __initialize_meta_info__()
+    return MOVIE_INFO
+
+
+def unittest():
+    for train_count, _ in enumerate(train()()):
+        pass
+    for test_count, _ in enumerate(test()()):
+        pass
+
+    print train_count, test_count
+
+
+if __name__ == '__main__':
+    unittest()
diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py
new file mode 100644
index 0000000000000000000000000000000000000000..71689fd61b6b14a7b5072caff4e2fd48a7f74072
--- /dev/null
+++ b/python/paddle/v2/dataset/sentiment.py
@@ -0,0 +1,127 @@
+#!/usr/bin/env python
+# -*- coding:utf-8 -*-
+
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This module fetches and preprocesses the movie_reviews data set provided by
+NLTK, and parses it into train/test readers of (word-id list, label) samples.
+"""
+
+import collections
+from itertools import chain
+
+import nltk
+from nltk.corpus import movie_reviews
+
+import common
+
+__all__ = ['train', 'test', 'get_word_dict']
+NUM_TRAINING_INSTANCES = 1600
+NUM_TOTAL_INSTANCES = 2000
+
+
+def download_data_if_not_yet():
+    """
+    Download the data set if it has not been downloaded yet.
+    """
+    try:
+        # make sure that nltk can find the data
+        if common.DATA_HOME not in nltk.data.path:
+            nltk.data.path.append(common.DATA_HOME)
+        movie_reviews.categories()
+    except LookupError:
+        print "Downloading movie_reviews data set, please wait....."
+        nltk.download('movie_reviews', download_dir=common.DATA_HOME)
+        print "Download data set success....."
+        print "Path is " + nltk.data.find('corpora/movie_reviews').path
+
+
+def get_word_dict():
+    """
+    Sort the words by the frequency with which they occur in the samples.
+    :return:
+        words_freq_sorted
+    """
+    words_freq_sorted = list()
+    word_freq_dict = collections.defaultdict(int)
+    download_data_if_not_yet()
+
+    for category in movie_reviews.categories():
+        for field in movie_reviews.fileids(category):
+            for words in movie_reviews.words(field):
+                word_freq_dict[words] += 1
+    words_sort_list = word_freq_dict.items()
+    words_sort_list.sort(cmp=lambda a, b: b[1] - a[1])
+    for index, word in enumerate(words_sort_list):
+        words_freq_sorted.append((word[0], index))
+    return words_freq_sorted
+
+
+def sort_files():
+    """
+    Interleave the negative and positive sample files so that readers
+    alternate between the two classes.
+    :return:
+        files_list
+    """
+    neg_file_list = movie_reviews.fileids('neg')
+    pos_file_list = movie_reviews.fileids('pos')
+    files_list = list(chain.from_iterable(zip(neg_file_list, pos_file_list)))
+    return files_list
+
+
+def load_sentiment_data():
+    """
+    Load the data set
+    :return:
+        data_set
+    """
+    data_set = list()
+    download_data_if_not_yet()
+    words_ids = dict(get_word_dict())
+    for sample_file in sort_files():
+        words_list = list()
+        category = 0 if 'neg' in sample_file else 1
+        for word in movie_reviews.words(sample_file):
+            words_list.append(words_ids[word.lower()])
+        data_set.append((words_list, category))
+    return data_set
+
+
+def reader_creator(data):
+    """
+    Generate an iterator over the given data set.
+    :param data:
+        train data set or test data set
+    """
+    for each in data:
+        yield each[0], each[1]
+
+
+def train():
+    """
+    Default train set reader creator
+    """
+    data_set = load_sentiment_data()
+    return reader_creator(data_set[0:NUM_TRAINING_INSTANCES])
+
+
+def test():
+    """
+    Default test set reader creator
+    """
+    data_set = load_sentiment_data()
+    return reader_creator(data_set[NUM_TRAINING_INSTANCES:])
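+
+# Usage sketch: ``train()``/``test()`` return generators directly, and the
+# samples alternate between the two classes because of ``sort_files``.
+#
+#     for word_ids, category in test():
+#         pass  # category: 0 for 'neg' samples, 1 for 'pos' samples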
diff --git a/python/paddle/v2/dataset/tests/cifar_test.py b/python/paddle/v2/dataset/tests/cifar_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..e0e18229da7818be5752ee592e094a00da286ad9
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/cifar_test.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.cifar
+import unittest
+
+
+class TestCIFAR(unittest.TestCase):
+    def check_reader(self, reader):
+        sum = 0
+        label = 0
+        for l in reader():
+            self.assertEqual(l[0].size, 3072)
+            if l[1] > label:
+                label = l[1]
+            sum += 1
+        return sum, label
+
+    def test_test10(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.cifar.test10())
+        self.assertEqual(instances, 10000)
+        self.assertEqual(max_label_value, 9)
+
+    def test_train10(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.cifar.train10())
+        self.assertEqual(instances, 50000)
+        self.assertEqual(max_label_value, 9)
+
+    def test_test100(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.cifar.test100())
+        self.assertEqual(instances, 10000)
+        self.assertEqual(max_label_value, 99)
+
+    def test_train100(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.cifar.train100())
+        self.assertEqual(instances, 50000)
+        self.assertEqual(max_label_value, 99)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/common_test.py b/python/paddle/v2/dataset/tests/common_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..5babcef0eb4345d243904877d323c37d4889a643
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/common_test.py
@@ -0,0 +1,37 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.common
+import unittest
+import tempfile
+
+
+class TestCommon(unittest.TestCase):
+    def test_md5file(self):
+        _, temp_path = tempfile.mkstemp()
+        with open(temp_path, 'w') as f:
+            f.write("Hello\n")
+        self.assertEqual('09f7e02f1290be211da707a266f153b3',
+                         paddle.v2.dataset.common.md5file(temp_path))
+
+    def test_download(self):
+        yi_avatar = 'https://avatars0.githubusercontent.com/u/1548775?v=3&s=460'
+        self.assertEqual(
+            paddle.v2.dataset.common.DATA_HOME + '/test/1548775?v=3&s=460',
+            paddle.v2.dataset.common.download(
+                yi_avatar, 'test', 'f75287202d6622414c706c36c16f8e0d'))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/imdb_test.py b/python/paddle/v2/dataset/tests/imdb_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4d82f26895d77d05c6e936bd636b1239e1a0cd8
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/imdb_test.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.imdb
+import unittest
+import re
+
+TRAIN_POS_PATTERN = re.compile("aclImdb/train/pos/.*\.txt$")
+TRAIN_NEG_PATTERN = re.compile("aclImdb/train/neg/.*\.txt$")
+TRAIN_PATTERN = re.compile("aclImdb/train/.*\.txt$")
+
+TEST_POS_PATTERN = re.compile("aclImdb/test/pos/.*\.txt$")
+TEST_NEG_PATTERN = re.compile("aclImdb/test/neg/.*\.txt$")
+TEST_PATTERN = re.compile("aclImdb/test/.*\.txt$")
+
+
+class TestIMDB(unittest.TestCase):
+    word_idx = None
+
+    def test_build_dict(self):
+        if self.word_idx is None:
+            self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN,
+                                                              150)
+
+        self.assertEqual(len(self.word_idx), 7036)
+
+    def check_dataset(self, dataset, expected_size):
+        if self.word_idx is None:
+            self.word_idx = paddle.v2.dataset.imdb.build_dict(TRAIN_PATTERN,
+                                                              150)
+
+        sum = 0
+        for l in dataset(self.word_idx):
+            self.assertEqual(l[1], sum % 2)
+            sum += 1
+        self.assertEqual(sum, expected_size)
+
+    def test_train(self):
+        self.check_dataset(paddle.v2.dataset.imdb.train, 25000)
+
+    def test_test(self):
+        self.check_dataset(paddle.v2.dataset.imdb.test, 25000)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/imikolov_test.py b/python/paddle/v2/dataset/tests/imikolov_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..009e55243a594e5e235c36fb0223ec70754d17f3
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/imikolov_test.py
@@ -0,0 +1,26 @@
+import paddle.v2.dataset.imikolov
+import unittest
+
+WORD_DICT = paddle.v2.dataset.imikolov.build_dict()
+
+
+class TestMikolov(unittest.TestCase):
+    def check_reader(self, reader, n):
+        for l in reader():
+            self.assertEqual(len(l), n)
+
+    def test_train(self):
+        n = 5
+        self.check_reader(paddle.v2.dataset.imikolov.train(WORD_DICT, n), n)
+
+    def test_test(self):
+        n = 5
+        self.check_reader(paddle.v2.dataset.imikolov.test(WORD_DICT, n), n)
+
+    def test_total(self):
+        _, idx = zip(*WORD_DICT.items())
+        self.assertEqual(sorted(idx)[-1], len(WORD_DICT) - 1)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/mnist_test.py b/python/paddle/v2/dataset/tests/mnist_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d344cac3e7483a351033570fbec75a4d19f4a55
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/mnist_test.py
@@ -0,0 +1,44 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.v2.dataset.mnist
+import unittest
+
+
+class TestMNIST(unittest.TestCase):
+    def check_reader(self, reader):
+        sum = 0
+        label = 0
+        for l in reader():
+            self.assertEqual(l[0].size, 784)
+            if l[1] > label:
+                label = l[1]
+            sum += 1
+        return sum, label
+
+    def test_train(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.mnist.train())
+        self.assertEqual(instances, 60000)
+        self.assertEqual(max_label_value, 9)
+
+    def test_test(self):
+        instances, max_label_value = self.check_reader(
+            paddle.v2.dataset.mnist.test())
+        self.assertEqual(instances, 10000)
+        self.assertEqual(max_label_value, 9)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/tests/test_sentiment.py b/python/paddle/v2/dataset/tests/test_sentiment.py
new file mode 100644
index 0000000000000000000000000000000000000000..407405290734609059c1767600748d530e8a13a6
--- /dev/null
+++ b/python/paddle/v2/dataset/tests/test_sentiment.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+# -*- coding:utf-8 -*-
+
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import nltk
+import paddle.v2.dataset.sentiment as st
+from nltk.corpus import movie_reviews
+
+
+class TestSentimentMethods(unittest.TestCase):
+    def test_get_word_dict(self):
+        word_dict = st.get_word_dict()[0:10]
+        test_word_list = [(u',', 0), (u'the', 1), (u'.', 2), (u'a', 3),
+                          (u'and', 4), (u'of', 5), (u'to', 6), (u"'", 7),
+                          (u'is', 8), (u'in', 9)]
+        for idx, each in enumerate(word_dict):
+            self.assertEqual(each, test_word_list[idx])
+        self.assertTrue("/root/.cache/paddle/dataset" in nltk.data.path)
+
+    def test_sort_files(self):
+        last_label = ''
+        for sample_file in st.sort_files():
+            current_label = sample_file.split("/")[0]
+            self.assertNotEqual(current_label, last_label)
+            last_label = current_label
+
+    def test_data_set(self):
+        data_set = st.load_sentiment_data()
+        last_label = -1
+        for each in st.test():
+            self.assertNotEqual(each[1], last_label)
+            last_label = each[1]
+        self.assertEqual(len(data_set), st.NUM_TOTAL_INSTANCES)
+        self.assertEqual(len(list(st.train())), st.NUM_TRAINING_INSTANCES)
+        self.assertEqual(
+            len(list(st.test())),
+            (st.NUM_TOTAL_INSTANCES - st.NUM_TRAINING_INSTANCES))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py
new file mode 100644
index 0000000000000000000000000000000000000000..27f454b137e3a40febd19cf085e2f4034cc16b24
--- /dev/null
+++ b/python/paddle/v2/dataset/uci_housing.py
@@ -0,0 +1,91 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+UCI Housing dataset.
+
+TODO(yuyang18): Complete comments.
+"""
+
+import numpy as np
+import os
+from common import download
+
+__all__ = ['train', 'test']
+
+URL = 'https://archive.ics.uci.edu/ml/machine-learning-databases/housing/housing.data'
+MD5 = 'd4accdce7a25600298819f8e28e8d593'
+feature_names = [
+    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
+    'PTRATIO', 'B', 'LSTAT'
+]
+
+UCI_TRAIN_DATA = None
+UCI_TEST_DATA = None
+
+
+def feature_range(maximums, minimums):
+    import matplotlib
+    matplotlib.use('Agg')
+    import matplotlib.pyplot as plt
+    fig, ax = plt.subplots()
+    feature_num = len(maximums)
+    ax.bar(range(feature_num), maximums - minimums, color='r', align='center')
+    ax.set_title('feature scale')
+    plt.xticks(range(feature_num), feature_names)
+    plt.xlim([-1, feature_num])
+    fig.set_figheight(6)
+    fig.set_figwidth(10)
+    if not os.path.exists('./image'):
+        os.makedirs('./image')
+    fig.savefig('image/ranges.png', dpi=48)
+    plt.close(fig)
+
+
+def load_data(filename, feature_num=14, ratio=0.8):
+    global UCI_TRAIN_DATA, UCI_TEST_DATA
+    if UCI_TRAIN_DATA is not None and UCI_TEST_DATA is not None:
+        return
+
+    data = np.fromfile(filename, sep=' ')
+    data = data.reshape(data.shape[0] / feature_num, feature_num)
+    maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
+        axis=0) / data.shape[0]
+    feature_range(maximums[:-1], minimums[:-1])
+    for i in xrange(feature_num - 1):
+        data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i])
+    offset = int(data.shape[0] * ratio)
+    UCI_TRAIN_DATA = data[:offset]
+    UCI_TEST_DATA = data[offset:]
+
+
+def train():
+    global UCI_TRAIN_DATA
+    load_data(download(URL, 'uci_housing', MD5))
+
+    def reader():
+        for d in UCI_TRAIN_DATA:
+            yield d[:-1], d[-1:]
+
+    return reader
+
+
+def test():
+    global UCI_TEST_DATA
+    load_data(download(URL, 'uci_housing', MD5))
+
+    def reader():
+        for d in UCI_TEST_DATA:
+            yield d[:-1], d[-1:]
+
+    return reader
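+
+
+# A minimal usage sketch (illustrative only, not part of this module): each
+# sample is a (features, price) pair, where features is the 13-dimensional
+# normalized vector and price is a 1-element array.
+#
+#   import paddle.v2.dataset.uci_housing as uci_housing
+#   for features, price in uci_housing.train()():
+#       pass  # train() returns a reader; calling the reader yields samples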
diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5a16d51477f9cfbf0cd32af54098406fbbd2b41
--- /dev/null
+++ b/python/paddle/v2/dataset/wmt14.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+WMT14 dataset.
+"""
+import tarfile
+
+import paddle.v2.dataset.common
+
+__all__ = ['train', 'test', 'build_dict']
+
+URL_DEV_TEST = 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz'
+MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5'
+# This is a small dataset for testing. The original data is too large and will be added later.
+URL_TRAIN = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz'
+MD5_TRAIN = 'a755315dd01c2c35bde29a744ede23a6'
+
+START = "<s>"
+END = "<e>"
+UNK = "<unk>"
+UNK_IDX = 2
+
+
+def __read_to_dict__(tar_file, dict_size):
+    def __to_dict__(fd, size):
+        out_dict = dict()
+        for line_count, line in enumerate(fd):
+            if line_count < size:
+                out_dict[line.strip()] = line_count
+            else:
+                break
+        return out_dict
+
+    with tarfile.open(tar_file, mode='r') as f:
+        names = [
+            each_item.name for each_item in f
+            if each_item.name.endswith("src.dict")
+        ]
+        assert len(names) == 1
+        src_dict = __to_dict__(f.extractfile(names[0]), dict_size)
+        names = [
+            each_item.name for each_item in f
+            if each_item.name.endswith("trg.dict")
+        ]
+        assert len(names) == 1
+        trg_dict = __to_dict__(f.extractfile(names[0]), dict_size)
+        return src_dict, trg_dict
+
+
+def reader_creator(tar_file, file_name, dict_size):
+    def reader():
+        src_dict, trg_dict = __read_to_dict__(tar_file, dict_size)
+        with tarfile.open(tar_file, mode='r') as f:
+            names = [
+                each_item.name for each_item in f
+                if each_item.name.endswith(file_name)
+            ]
+            for name in names:
+                for line in f.extractfile(name):
+                    line_split = line.strip().split('\t')
+                    if len(line_split) != 2:
+                        continue
+                    src_seq = line_split[0]  # one source sequence
+                    src_words = src_seq.split()
+                    src_ids = [
+                        src_dict.get(w, UNK_IDX)
+                        for w in [START] + src_words + [END]
+                    ]
+
+                    trg_seq = line_split[1]  # one target sequence
+                    trg_words = trg_seq.split()
+                    trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words]
+
+                    # skip sequences longer than 80 tokens
+                    if len(src_ids) > 80 or len(trg_ids) > 80:
+                        continue
+                    trg_ids_next = trg_ids + [trg_dict[END]]
+                    trg_ids = [trg_dict[START]] + trg_ids
+
+                    yield src_ids, trg_ids, trg_ids_next
+
+    return reader
+
+
+def train(dict_size):
+    return reader_creator(
+        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
+        'train/train', dict_size)
+
+
+def test(dict_size):
+    return reader_creator(
+        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
+        'test/test', dict_size)
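+
+
+# A minimal usage sketch (illustrative only; dict_size=30000 is an arbitrary
+# example value): each sample is (src_ids, trg_ids, trg_ids_next), where
+# trg_ids starts with <s> and trg_ids_next ends with <e>.
+#
+#   import paddle.v2.dataset.wmt14 as wmt14
+#   for src_ids, trg_ids, trg_ids_next in wmt14.train(dict_size=30000)():
+#       pass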
diff --git a/python/paddle/v2/event.py b/python/paddle/v2/event.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ad52b8baa411269d29732685871a875df5185cc
--- /dev/null
+++ b/python/paddle/v2/event.py
@@ -0,0 +1,84 @@
+"""
+All training events.
+
+There are:
+
+* BeginIteration
+* EndIteration
+* BeginPass
+* EndPass
+
+TODO(yuyang18): Complete it!
+"""
+import py_paddle.swig_paddle as api
+
+__all__ = [
+    'EndIteration', 'BeginIteration', 'BeginPass', 'EndPass', 'TestResult'
+]
+
+
+class WithMetric(object):
+    def __init__(self, evaluator):
+        if not isinstance(evaluator, api.Evaluator):
+            raise TypeError("Evaluator should be api.Evaluator type")
+        self.__evaluator__ = evaluator
+
+    @property
+    def metrics(self):
+        names = self.__evaluator__.getNames()
+        retv = dict()
+        for each_name in names:
+            val = self.__evaluator__.getValue(each_name)
+            retv[each_name] = val
+        return retv
+
+
+class TestResult(WithMetric):
+    """
+    Result that trainer.test returns.
+    """
+
+    def __init__(self, evaluator, cost):
+        super(TestResult, self).__init__(evaluator)
+        self.cost = cost
+
+
+class BeginPass(object):
+    """
+    Event fired when one training pass starts.
+    """
+
+    def __init__(self, pass_id):
+        self.pass_id = pass_id
+
+
+class EndPass(WithMetric):
+    """
+    Event fired when one training pass completes.
+    """
+
+    def __init__(self, pass_id, evaluator):
+        self.pass_id = pass_id
+        WithMetric.__init__(self, evaluator)
+
+
+class BeginIteration(object):
+    """
+    Event fired when training of one batch starts.
+    """
+
+    def __init__(self, pass_id, batch_id):
+        self.pass_id = pass_id
+        self.batch_id = batch_id
+
+
+class EndIteration(WithMetric):
+    """
+    Event fired when training of one batch completes.
+    """
+
+    def __init__(self, pass_id, batch_id, cost, evaluator):
+        self.pass_id = pass_id
+        self.batch_id = batch_id
+        self.cost = cost
+        WithMetric.__init__(self, evaluator)
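+
+
+# A minimal event-handler sketch (illustrative only; it assumes the trainer
+# invokes such a callback with one event instance at a time):
+#
+#   def event_handler(event):
+#       if isinstance(event, EndIteration):
+#           print "pass %d, batch %d, cost %f" % (
+#               event.pass_id, event.batch_id, event.cost)
+#       elif isinstance(event, EndPass):
+#           print "pass %d done, metrics: %s" % (event.pass_id, event.metrics)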
diff --git a/python/paddle/v2/inference.py b/python/paddle/v2/inference.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec3c67d89548f68d705a9b5de80e28597e9829da
--- /dev/null
+++ b/python/paddle/v2/inference.py
@@ -0,0 +1,88 @@
+import numpy
+import py_paddle.swig_paddle as api
+import collections
+import topology
+import minibatch
+from data_feeder import DataFeeder
+
+__all__ = ['infer']
+
+
+class Inference(object):
+    def __init__(self, output_layer, parameters):
+        topo = topology.Topology(output_layer)
+        gm = api.GradientMachine.createFromConfigProto(
+            topo.proto(), api.CREATE_MODE_TESTING, [api.PARAMETER_VALUE])
+        for param in gm.getParameters():
+            val = param.getBuf(api.PARAMETER_VALUE)
+            name = param.getName()
+            assert isinstance(val, api.Vector)
+            val.copyFromNumpyArray(parameters.get(name).flatten())
+        self.__gradient_machine__ = gm
+        self.__data_types__ = topo.data_type()
+
+    def iter_infer(self, input, feeding=None):
+        feeder = DataFeeder(self.__data_types__, feeding)
+        batch_size = len(input)
+
+        def __reader_impl__():
+            for each_sample in input:
+                yield each_sample
+
+        reader = minibatch.batch(__reader_impl__, batch_size=batch_size)
+
+        self.__gradient_machine__.start()
+        for data_batch in reader():
+            yield self.__gradient_machine__.forwardTest(feeder(data_batch))
+        self.__gradient_machine__.finish()
+
+    def iter_infer_field(self, field, **kwargs):
+        for result in self.iter_infer(**kwargs):
+            yield [each_result[field] for each_result in result]
+
+    def infer(self, field='value', **kwargs):
+        retv = None
+        for result in self.iter_infer_field(field=field, **kwargs):
+            if retv is None:
+                # note: [[]] * n would alias one list n times; build
+                # independent lists instead
+                retv = [[] for _ in xrange(len(result))]
+            for i, item in enumerate(result):
+                retv[i].append(item)
+        retv = [numpy.concatenate(out) for out in retv]
+        if len(retv) == 1:
+            return retv[0]
+        else:
+            return retv
+
+
+def infer(output_layer, parameters, input, feeding=None, field='value'):
+    """
+    Infer a neural network by the given neural network output and parameters.
+    The user should pass a batch of input data in the `input` argument.
+
+    Example usages:
+
+    ..  code-block:: python
+
+        result = paddle.infer(prediction, parameters, input=SomeData)
+        print result
+
+    :param output_layer: output of the neural network that would be inferred
+    :type output_layer: paddle.v2.config_base.Layer
+    :param parameters: parameters of the neural network.
+    :type parameters: paddle.v2.parameters.Parameters
+    :param input: input data batch. Should be a python iterable object, and
+                  each element is one data sample.
+    :type input: collections.Iterable
+    :param feeding: feeding dictionary that maps data layer names to the index
+                    of each element in an input sample. By default it is
+                    generated from the input value.
+    :param field: The prediction field. It should be in [`value`, `ids`].
+                  `value` means return the prediction probabilities, `ids`
+                  means return the prediction labels. Default is `value`.
+    :type field: str
+    :return: a numpy array
+    :rtype: numpy.ndarray
+    """
+
+    inferer = Inference(output_layer=output_layer, parameters=parameters)
+    return inferer.infer(field=field, input=input, feeding=feeding)
diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..1e4efedde363f20fde168941adcb6e8a594b533a
--- /dev/null
+++ b/python/paddle/v2/layer.py
@@ -0,0 +1,498 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+`paddle.v2.layer` is a part of the model config packages in paddle.v2. In API
+v2, we want to make Paddle a plain Python package. The model config package
+defines how to configure a neural network topology in Paddle Python code.
+
+The primary usage is shown below.
+
+..  code-block:: python
+
+    import paddle.v2 as paddle
+
+    img = paddle.layer.data(name='img', type=paddle.data_type.dense_vector(784))
+    hidden = paddle.layer.fc(input=img, size=200)
+    prediction = paddle.layer.fc(input=hidden, size=10,
+                                 act=paddle.activation.Softmax())
+
+    # use the prediction instance where needed.
+    parameters = paddle.parameters.create(prediction)
+"""
+
+import collections
+import inspect
+from config_base import Layer, __convert_to_v2__
+import paddle.trainer_config_helpers as conf_helps
+from paddle.trainer_config_helpers.config_parser_utils import \
+    parse_network_config as __parse__
+from paddle.trainer_config_helpers.default_decorators import wrap_act_default
+from paddle.trainer_config_helpers.default_decorators import \
+    wrap_bias_attr_default
+from paddle.trainer_config_helpers.default_decorators import wrap_name_default
+from paddle.trainer_config_helpers.layers import layer_support
+from paddle.trainer.config_parser import \
+    RecurrentLayerGroupWithoutOutLinksBegin, RecurrentLayerGroupSetOutLink, \
+    RecurrentLayerGroupEnd, model_type
+
+import activation
+import re
+import data_type
+
+__all__ = ['parse_network', 'data']
+
+
+def parse_network(*outputs):
+    """
+    Parse all output layers and then generate a ModelConfig object.
+
+    ..  note::
+
+        This function is used internally by the paddle.v2 module. Users should
+        never invoke this method directly.
+
+    :param outputs: Output layers.
+    :type outputs: Layer
+    :return: A ModelConfig object instance.
+    :rtype: ModelConfig
+    """
+
+    def __real_func__():
+        """
+        __real_func__ is the function that config_parser.parse invokes. It is
+        a plain old paddle configuration function.
+        """
+        context = dict()
+        real_output = [each.to_proto(context=context) for each in outputs]
+        conf_helps.outputs(real_output)
+
+    return __parse__(__real_func__)
+
+
+"""
+Some layers need special configuration and cannot be converted with
+__convert_to_v2__, so we implement them as special LayerV2 classes instead.
+"""
+
+
+class DataLayerV2(Layer):
+    METHOD_NAME = 'data_layer'
+
+    def __init__(self, name, type, **kwargs):
+        assert isinstance(type, data_type.InputType)
+
+        self.type = type
+        self.__method_name__ = 'data_layer'
+        self.__kwargs__ = kwargs
+
+        super(DataLayerV2, self).__init__(name=name, parent_layers=dict())
+
+    def to_proto_impl(self, **kwargs):
+        args = dict()
+        args['size'] = self.type.dim
+        for each in kwargs:
+            args[each] = kwargs[each]
+        for each in self.__kwargs__:
+            args[each] = self.__kwargs__[each]
+        return getattr(conf_helps, self.__method_name__)(name=self.name, **args)
+
+    def __map_docstr__(doc):
+        doc = re.sub(r'(data = [^\)]+)\).*',
+                     "data = paddle.layer.data(name=\"input\", "
+                     "type=paddle.data_type.dense_vector(1000))", doc)
+
+        doc = re.sub(r':param size:.*',
+                     ':param type: Data type of this data layer', doc)
+        doc = re.sub(r':type size:.*',
+                     ":type size: paddle.v2.data_type.InputType", doc)
+        return doc
+
+
+class WithExtraParent(Layer):
+    def extra_parent(self):
+        return self.__extra_parent__
+
+    def __init__(self, name=None, parent_layers=None):
+        self.__extra_parent__ = []
+        super(WithExtraParent, self).__init__(
+            name=name, parent_layers=parent_layers)
+
+    def append_extra_parent(self, parent):
+        self.__extra_parent__.append(parent)
+
+    def to_proto(self, context):
+        """
+        function to set proto attribute
+        """
+        kwargs = dict()
+        for p in self.__extra_parent__:
+            p.to_proto(context=context)
+
+        for layer_name in self.__parent_layers__:
+            if not isinstance(self.__parent_layers__[layer_name],
+                              collections.Sequence):
+                v1_layer = self.__parent_layers__[layer_name].to_proto(
+                    context=context)
+            else:
+                v1_layer = map(lambda x: x.to_proto(context=context),
+                               self.__parent_layers__[layer_name])
+            kwargs[layer_name] = v1_layer
+
+        if self.context_name() is None:
+            return self.to_proto_impl(context=context, **kwargs)
+        elif self.context_name() not in context:
+            context[self.context_name()] = self.to_proto_impl(
+                context=context, **kwargs)
+
+        if self.use_context_name():
+            return context[self.context_name()]
+        else:
+            return context[self.name]
+
+
+class MemoryV2(WithExtraParent):
+    def __init__(self, name, **kwargs):
+        self.name = name
+        super(MemoryV2, self).__init__(name=name, parent_layers=dict())
+        self.__kwargs__ = kwargs
+        self.__boot_layer_name__ = None
+        if 'boot_layer' in kwargs:
+            begin_of_current_rnn = []
+            # TODO(yuyang18): Fix inspect, it could be wrong when user invoke a
+            # function inside step.
+            st = inspect.stack()
+            for i in xrange(len(st)):
+                locs = inspect.stack()[i][0].f_locals
+                keys = locs.keys()
+                for key in keys:
+                    val = locs[key]
+                    if isinstance(val, RecurrentLayerInput):
+                        begin_of_current_rnn.append(val)
+                    elif isinstance(val, collections.Sequence):
+                        for v in val:
+                            if isinstance(v, RecurrentLayerInput):
+                                begin_of_current_rnn.append(v)
+
+                if begin_of_current_rnn:
+                    break
+            assert len(begin_of_current_rnn) != 0
+            for extra in begin_of_current_rnn:
+                self.append_extra_parent(extra)
+                assert isinstance(extra, WithExtraParent)
+                extra.append_extra_parent(kwargs['boot_layer'])
+                self.__boot_layer_name__ = kwargs['boot_layer'].name
+
+    def to_proto_impl(self, context, **kwargs):
+        args = dict()
+        for each in kwargs:
+            args[each] = kwargs[each]
+        for each in self.__kwargs__:
+            args[each] = self.__kwargs__[each]
+
+        if self.__boot_layer_name__ is not None:
+            args['boot_layer'] = context[self.__boot_layer_name__]
+
+        size = args.get('size', None)
+        if size is not None:
+            if callable(size):
+                real_size = size()
+            else:
+                real_size = size
+            args['size'] = real_size
+        return conf_helps.memory(name=self.name, **args)
+
+    def context_name(self):
+        return self.name + "#memory"
+
+    def use_context_name(self):
+        """
+        The memory layer shares its name with another layer, so the context
+        name is used to look it up.
+        :return:
+        """
+        return True
+
+
+class LayerOutputV2(Layer):
+    """
+    LayerOutputV2 is used to store the result of LayerOutput in the v1 API.
+    It does not store its parents because layer_output has already been parsed.
+    """
+
+    def __init__(self, layer_output):
+        assert isinstance(layer_output, conf_helps.LayerOutput)
+        self.layer_output = layer_output
+        super(LayerOutputV2, self).__init__(
+            name=layer_output.name, parent_layers=dict())
+
+    def to_proto_impl(self):
+        return self.layer_output
+
+
+class StaticInputV2(object):
+    def __init__(self, input, is_seq=False, size=None):
+        assert isinstance(input, LayerV2)
+        self.name = input.name
+        self.input = input
+        self.is_seq = is_seq
+        self.size = size
+        # TODO(add size check)
+        # assert input.size is not None or size is not None
+
+
+class MixedLayerV2(Layer):
+    """
+    This class is used to support the `with` statement. Without it, the
+    following code could convert mixed_layer simply:
+
+        mixed = __convert_to_v2__(
+            'mixed_layer', name_prefix='mixed', parent_names=['input'])
+    """
+
+    class AddToSealedMixedLayerExceptionV2(Exception):
+        pass
+
+    def __init__(self,
+                 size=0,
+                 input=None,
+                 name=None,
+                 act=None,
+                 bias_attr=None,
+                 layer_attr=None):
+        self.__method_name__ = 'mixed_layer'
+        self.finalized = False
+        self.__inputs__ = []
+        if input is not None:
+            self.__inputs__ = input
+
+        other_kwargs = dict()
+        other_kwargs['name'] = name
+        other_kwargs['size'] = size
+        other_kwargs['act'] = act
+        other_kwargs['bias_attr'] = bias_attr
+        other_kwargs['layer_attr'] = layer_attr
+        parent_layers = {"input": self.__inputs__}
+        super(MixedLayerV2, self).__init__(name, parent_layers)
+        self.__other_kwargs__ = other_kwargs
+
+    def __iadd__(self, other):
+        if not self.finalized:
+            self.__inputs__.append(other)
+            return self
+        else:
+            raise MixedLayerV2.AddToSealedMixedLayerExceptionV2()
+
+    def __enter__(self):
+        assert len(self.__inputs__) == 0
+        return self
+
+    def __exit__(self, *args, **kwargs):
+        self.finalized = True
+
+    def to_proto_impl(self, **kwargs):
+        args = dict()
+        for each in kwargs:
+            args[each] = kwargs[each]
+        for each in self.__other_kwargs__:
+            args[each] = self.__other_kwargs__[each]
+        size = args.get('size', None)
+        if size is not None:
+            if callable(size):
+                real_size = size()
+            else:
+                real_size = size
+            args['size'] = real_size
+        return getattr(conf_helps, self.__method_name__)(**args)
+
+
+@wrap_name_default("mixed")
+@wrap_act_default(act=activation.Linear())
+@wrap_bias_attr_default(has_bias=False)
+@layer_support(conf_helps.layers.ERROR_CLIPPING, conf_helps.layers.DROPOUT)
+def mixed(size=0,
+          name=None,
+          input=None,
+          act=None,
+          bias_attr=False,
+          layer_attr=None):
+    return MixedLayerV2(size, input, name, act, bias_attr, layer_attr)
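+
+# A minimal usage sketch of the `with` form (illustrative only; `hidden` is an
+# assumed upstream layer, and full_matrix_projection is one of the
+# *_projection helpers converted at the bottom of this module):
+#
+#   with mixed(size=256) as m:
+#       m += full_matrix_projection(input=hidden)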
+
+
+class RecurrentLayerInput(WithExtraParent):
+    def __init__(self, recurrent_name, index, parent_layers):
+        assert len(parent_layers) == 1
+        self.__parents__ = parent_layers.values()[0]
+        super(RecurrentLayerInput, self).__init__(
+            name=self.__parents__[index].name, parent_layers=parent_layers)
+        self.__recurrent_name__ = recurrent_name
+
+    def context_name(self):
+        return self.__recurrent_name__ + ".begin"
+
+    def to_proto_impl(self, context, **kwargs):
+        model_type('recurrent_nn')
+        RecurrentLayerGroupWithoutOutLinksBegin(
+            name=self.__recurrent_name__,
+            in_links=map(lambda x: x.name, self.__parents__))
+        return self
+
+
+class RecurrentLayerOutput(Layer):
+    def __init__(self, recurrent_name, index, parent_layers):
+        assert len(parent_layers) == 1
+        self.__parents__ = parent_layers.values()[0]
+        super(RecurrentLayerOutput, self).__init__(
+            name=self.__parents__[index].name, parent_layers=parent_layers)
+        self.__recurrent_name__ = recurrent_name
+
+    def context_name(self):
+        return self.__recurrent_name__ + ".end"
+
+    def to_proto_impl(self, **kwargs):
+        for l in self.__parents__:
+            RecurrentLayerGroupSetOutLink(l.name)
+        RecurrentLayerGroupEnd(name=self.__recurrent_name__)
+
+
+LayerV2 = Layer
+data = DataLayerV2
+data.__name__ = 'data'
+AggregateLevel = conf_helps.layers.AggregateLevel
+ExpandLevel = conf_helps.layers.ExpandLevel
+memory = MemoryV2
+
+
+def __layer_name_mapping__(inname):
+    if inname in ['data_layer', 'memory', 'mixed_layer', 'recurrent_group']:
+        # Do Not handle these layers
+        return
+    elif inname == 'maxid_layer':
+        return 'max_id'
+    elif inname.endswith('memory') or inname.endswith(
+            '_seq') or inname.endswith('_sim') or inname == 'hsigmoid':
+        return inname
+    elif inname in [
+            'cross_entropy', 'multi_binary_label_cross_entropy',
+            'cross_entropy_with_selfnorm'
+    ]:
+        return inname + "_cost"
+    elif inname.endswith('_cost'):
+        return inname
+    elif inname.endswith("_layer"):
+        return inname[:-len("_layer")]
+
+
+def __layer_name_mapping_parent_names__(inname):
+    all_args = getattr(conf_helps, inname).argspec.args
+    return filter(
+        lambda x: x in ['input1', 'input2', 'label', 'input', 'a', 'b',
+                        'expand_as',
+                        'weights', 'vectors', 'weight', 'score', 'left',
+                        'right', 'output_mem'],
+        all_args)
+
+
+def __convert_layer__(_new_name_, _old_name_, _parent_names_):
+    global __all__
+    __all__.append(_new_name_)
+    # use the _new_name_ parameter, not the module-level loop variable below
+    globals()[_new_name_] = __convert_to_v2__(_old_name_, _parent_names_)
+    globals()[_new_name_].__name__ = _new_name_
+
+
+for each_layer_name in dir(conf_helps):
+    new_name = __layer_name_mapping__(each_layer_name)
+    if new_name is not None:
+        parent_names = __layer_name_mapping_parent_names__(each_layer_name)
+        assert len(parent_names) != 0, each_layer_name
+        __convert_layer__(new_name, each_layer_name, parent_names)
+
+del parent_names
+del new_name
+del each_layer_name
+
+
+@wrap_name_default()
+def recurrent_group(step, input, name=None):
+    if not isinstance(input, collections.Sequence):
+        input = [input]
+
+    non_static_inputs = filter(lambda x: not isinstance(x, StaticInputV2),
+                               input)
+    actual_input = [
+        RecurrentLayerInput(
+            recurrent_name=name,
+            index=i,
+            parent_layers={'recurrent_inputs': non_static_inputs})
+        for i in xrange(len(non_static_inputs))
+    ]
+
+    def __real_step__(*args):
+        rnn_input = list(args)
+        static_inputs = filter(lambda x: isinstance(x, StaticInputV2), input)
+        for static_input in static_inputs:
+            mem_name = "__%s_memory__" % static_input.input.name
+            mem = memory(
+                name=mem_name,
+                is_seq=static_input.is_seq,
+                size=static_input.input.calculate_size,
+                boot_layer=static_input.input)
+            with mixed(
+                    name=mem_name,
+                    size=static_input.input.calculate_size,
+                    act=activation.Identity()) as mix:
+                mix += identity_projection(input=mem)
+            rnn_input.insert(input.index(static_input), mix)
+        return step(*rnn_input)
+
+    actual_output = __real_step__(*actual_input)
+
+    if not isinstance(actual_output, collections.Sequence):
+        actual_output = [actual_output]
+
+    retv = [
+        RecurrentLayerOutput(
+            recurrent_name=name,
+            index=i,
+            parent_layers={'recurrent_outputs': actual_output})
+        for i in xrange(len(actual_output))
+    ]
+    if len(retv) == 1:
+        return retv[0]
+    else:
+        return retv
+
+
+__projection_names__ = filter(lambda x: x.endswith('_projection'),
+                              dir(conf_helps))
+
+__all__ += __projection_names__
+
+__operator_names__ = filter(lambda x: x.endswith('_operator'), dir(conf_helps))
+__all__ += __operator_names__
+
+# convert projection
+for prj in __projection_names__:
+    globals()[prj] = __convert_to_v2__(
+        prj, parent_names=['input'], is_default_name=False)
+    globals()[prj].__name__ = prj
+
+# convert operator
+operator_list = [
+    # [V1_method_name, parent_names],
+    ['dotmul_operator', ['a', 'b']],
+    ['conv_operator', ['img', 'filter']]
+]
+for op in operator_list:
+    globals()[op[0]] = __convert_to_v2__(
+        op[0], parent_names=op[1], is_default_name=False)
+    globals()[op[0]].__name__ = op[0]
diff --git a/python/paddle/v2/minibatch.py b/python/paddle/v2/minibatch.py
new file mode 100644
index 0000000000000000000000000000000000000000..317cf037c69f8639e3760fbfce20565127794fcb
--- /dev/null
+++ b/python/paddle/v2/minibatch.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = ['batch']
+
+
+def batch(reader, batch_size):
+    """
+    Create a batched reader.
+
+    :param reader: the data reader to read from.
+    :type reader: callable
+    :param batch_size: size of each mini-batch
+    :type batch_size: int
+    :return: the batched reader.
+    :rtype: callable
+    """
+
+    def batch_reader():
+        r = reader()
+        b = []
+        for instance in r:
+            b.append(instance)
+            if len(b) == batch_size:
+                yield b
+                b = []
+        if b:
+            yield b
+
+    return batch_reader
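+
+
+# A minimal usage sketch (illustrative only; some_reader is an assumed
+# instance-level reader):
+#
+#   batched = batch(some_reader, batch_size=128)
+#   for mini_batch in batched():
+#       pass  # mini_batch is a list of at most 128 instances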
diff --git a/python/paddle/v2/networks.py b/python/paddle/v2/networks.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e6644196c8242cc3fed7a4fb1503697e5b59ffb
--- /dev/null
+++ b/python/paddle/v2/networks.py
@@ -0,0 +1,46 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.trainer_config_helpers.networks as conf_nw
+import inspect
+from config_base import __convert_to_v2__
+
+__all__ = []
+
+
+def __initialize__():
+    for each_subnetwork in conf_nw.__all__:
+        if each_subnetwork in ['inputs', 'outputs']:
+            continue
+        func = getattr(conf_nw, each_subnetwork)
+        if hasattr(func, 'argspec'):
+            argspec = func.argspec
+        else:
+            argspec = inspect.getargspec(func)
+        if each_subnetwork == 'simple_attention':
+            parents = ['encoded_sequence', 'encoded_proj', 'decoder_state']
+        else:
+            parents = filter(lambda x: x.startswith('input'), argspec.args)
+        assert len(parents) != 0, each_subnetwork
+        v2_subnet = __convert_to_v2__(
+            each_subnetwork,
+            parent_names=parents,
+            is_default_name='name' in argspec.args)
+        globals()[each_subnetwork] = v2_subnet
+        globals()[each_subnetwork].__name__ = each_subnetwork
+        global __all__
+        __all__.append(each_subnetwork)
+
+
+__initialize__()
diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py
index aa2942bc9faeb2a353459cd619886f56ea32f450..1a01d95c205c0626374e1814a170ce2d58f23a60 100644
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
@@ -1,9 +1,17 @@
 import py_paddle.swig_paddle as swig_api
-import paddle.trainer_config_helpers.optimizers as v1_optimizers
+
 import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils
-import paddle.v2
+import paddle.trainer_config_helpers.optimizers as v1_optimizers
+"""
+Optimizers (update equations) for the SGD method.
+
+TODO(yuyang18): Complete comments.
+"""
 
-__all__ = ['Adam', 'Adamax']
+__all__ = [
+    'Momentum', 'Adam', 'Adamax', 'AdaGrad', 'DecayedAdaGrad', 'AdaDelta',
+    'RMSProp', 'ModelAverage', 'L2Regularization'
+]
 
 
 class Optimizer(object):
@@ -38,6 +46,14 @@ class Optimizer(object):
                                                              pass_num)
 
 
+class Momentum(Optimizer):
+    def __init__(self, momentum=None, sparse=False, **kwargs):
+        learning_method = v1_optimizers.MomentumOptimizer(
+            momentum=momentum, sparse=sparse)
+        super(Momentum, self).__init__(
+            learning_method=learning_method, **kwargs)
+
+
 class Adam(Optimizer):
     def __init__(self, beta1=0.9, beta2=0.999, epsilon=1e-8, **kwargs):
         learning_method = v1_optimizers.AdamOptimizer(
@@ -52,7 +68,45 @@ class Adamax(Optimizer):
         super(Adamax, self).__init__(learning_method=learning_method, **kwargs)
 
 
+class AdaGrad(Optimizer):
+    def __init__(self, **kwargs):
+        learning_method = v1_optimizers.AdaGradOptimizer()
+        super(AdaGrad, self).__init__(learning_method=learning_method, **kwargs)
+
+
+class DecayedAdaGrad(Optimizer):
+    def __init__(self, rho=0.95, epsilon=1e-06, **kwargs):
+        learning_method = v1_optimizers.DecayedAdaGradOptimizer(
+            rho=rho, epsilon=epsilon)
+        super(DecayedAdaGrad, self).__init__(
+            learning_method=learning_method, **kwargs)
+
+
+class AdaDelta(Optimizer):
+    def __init__(self, rho=0.95, epsilon=1e-06, **kwargs):
+        learning_method = v1_optimizers.AdaDeltaOptimizer(
+            rho=rho, epsilon=epsilon)
+        super(AdaDelta, self).__init__(
+            learning_method=learning_method, **kwargs)
+
+
+class RMSProp(Optimizer):
+    def __init__(self, rho=0.95, epsilon=1e-6, **kwargs):
+        learning_method = v1_optimizers.RMSPropOptimizer(
+            rho=rho, epsilon=epsilon)
+        super(RMSProp, self).__init__(learning_method=learning_method, **kwargs)
+
+
+ModelAverage = v1_optimizers.ModelAverage
+L2Regularization = v1_optimizers.L2Regularization
+
 if __name__ == '__main__':
     swig_api.initPaddle('--use_gpu=false')
-    opt = paddle.v2.optimizer.Adam()
-    print opt.enable_types()
+    for opt in [
+            Momentum(), Adam(), Adamax(), AdaGrad(), DecayedAdaGrad(),
+            AdaDelta(), RMSProp(), Adam(
+                model_average=ModelAverage(average_window=0.5),
+                regularization=L2Regularization(rate=0.5),
+                gradient_clipping_threshold=25)
+    ]:
+        print opt, opt.enable_types()
diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py
new file mode 100644
index 0000000000000000000000000000000000000000..05dc5c68dd97b00fb15b74564a32313430c45345
--- /dev/null
+++ b/python/paddle/v2/parameters.py
@@ -0,0 +1,336 @@
+import numpy as np
+import py_paddle.swig_paddle as api
+from paddle.proto.ParameterConfig_pb2 import ParameterConfig
+import struct
+import tarfile
+import cStringIO
+from topology import Topology
+
+__all__ = ['Parameters', 'create']
+
+
+def create(layers):
+    """
+    Create a parameter pool from a network topology.
+
+    :param layers: output layers of the network topology.
+    :return: parameter pool for the topology.
+    :rtype: Parameters
+    """
+    topology = Topology(layers)
+    pool = Parameters()
+    for param in topology.proto().parameters:
+        pool.__append_config__(param)
+    return pool
+
+
+class Parameters(object):
+    """
+    Parameters is a dictionary that contains Paddle's parameters. The key of
+    Parameters is the name of a parameter. The value of Parameters is a plain
+    :code:`numpy.ndarray` .
+
+    Basic usage is
+
+    ..  code-block:: python
+
+        data = paddle.layers.data(...)
+        ...
+        out = paddle.layers.fc(...)
+
+        parameters = paddle.parameters.create(out)
+
+        parameter_names = parameters.names()
+        fc_mat = parameters.get('fc')
+        print fc_mat
+    """
+
+    def __init__(self):
+        self.__param_conf__ = dict()
+        self.__gradient_machines__ = []
+        self.__tmp_params__ = []
+
+    def __append_config__(self, param_conf):
+        """
+        Append a parameter configuration. It is used to initialize Parameters
+        and should be invoked only in paddle.parameters.create.
+
+        :param param_conf: The parameter configuration in protobuf
+        :type param_conf: ParameterConfig
+        :return: Nothing
+        """
+
+        if not isinstance(param_conf, ParameterConfig):
+            raise ValueError("param_conf must be paddle.proto.ParameterConfig")
+
+        if param_conf.name in self.__param_conf__:
+            raise ValueError("duplicated parameter %s" % param_conf.name)
+
+        self.__param_conf__[param_conf.name] = param_conf
+
+    def keys(self):
+        """
+        keys are the names of each parameter.
+
+        :return: list of parameter name
+        :rtype: list
+        """
+        return self.__param_conf__.keys()
+
+    def names(self):
+        """
+        names of each parameter.
+
+        :return: list of parameter name
+        :rtype: list
+        """
+        return self.keys()
+
+    def has_key(self, key):
+        """
+        has_key returns True if there is a parameter whose name equals key.
+
+        :param key: Parameter name
+        :type key: basestring
+        :return: True if contains such key
+        """
+        return key in self.__param_conf__.keys()
+
+    def __iter__(self):
+        """
+        Return an iterator over parameter names. It is used by the `for`
+        loop and the `in` operator.
+
+        ..  code-block:: python
+
+            parameters = paddle.parameters.create(...)
+            if "fc_param" in parameters:
+                print 'OK'
+
+        :return: an iterator over parameter names
+        :rtype: iterator
+        """
+        return iter(self.__param_conf__)
+
+    def __getitem__(self, key):
+        """
+        Get parameter by parameter name. It uses Python dict syntax.
+
+        :note: It will always copy the parameter from C++ side.
+        :param key: Parameter name
+        :type key: basestring
+        :return: parameter value
+        :rtype: np.ndarray
+        """
+        shape = self.get_shape(key)
+
+        if len(self.__gradient_machines__) == 0:
+            # create new parameter in python numpy.
+            if len(self.__tmp_params__) != 0:
+                ret_list = [
+                    mat for name, mat in self.__tmp_params__ if name == key
+                ]
+                if len(ret_list) == 1:
+                    return ret_list[0]
+            return np.ndarray(shape=shape, dtype=np.float32)
+        else:
+            for each_gradient_machine in self.__gradient_machines__:
+                param = __get_parameter_in_gradient_machine__(
+                    each_gradient_machine, key)
+                # for simplify implementation now, we always copy from C++
+                assert isinstance(param, api.Parameter)
+                val = param.getBuf(api.PARAMETER_VALUE)
+                assert isinstance(val, api.Vector)
+                val = val.copyToNumpyArray()
+                return val
+                # else continue
+
+            raise RuntimeError("Unexpected branch")
+
+    def get_shape(self, key):
+        """
+        get shape of the parameter.
+
+        :param key: parameter name
+        :type key: basestring
+        :return: parameter's shape
+        :rtype: tuple
+        """
+        if not isinstance(key, basestring):
+            raise ValueError("parameter name should be string")
+        if not self.has_key(key):
+            raise ValueError("No such parameter %s" % key)
+        conf = self.__param_conf__[key]
+        return tuple(map(int, conf.dims))
+
+    def __setitem__(self, key, value):
+        """
+        Set parameter by parameter name & value. It uses Python dict syntax.
+
+        :note: It will always copy the parameter to C++ side.
+        :param key: Parameter name
+        :type key: basestring
+        :param value: Parameter matrix.
+        :type value: np.ndarray
+        :return: Nothing
+        """
+
+        if not isinstance(value, np.ndarray):
+            raise ValueError("Must return ndarray")
+        value = value.astype(dtype=np.float32)
+        shape = self.get_shape(key)
+        if value.shape != shape:
+            raise ValueError("Value shape mismatch, expect %s, got %s" %
+                             (shape, value.shape))
+
+        if len(self.__gradient_machines__) == 0:
+            self.__tmp_params__.append((key, value))
+        else:
+            for each_gradient_machine in self.__gradient_machines__:
+                __copy_parameter_to_gradient_machine__(each_gradient_machine,
+                                                       key, value)
+
+    def get(self, parameter_name):
+        """
+        Get parameter by parameter name.
+
+        :note: It will always copy the parameter from C++ side.
+        :param parameter_name: parameter name
+        :type parameter_name: basestring
+        :return: The parameter matrix.
+        :rtype: np.ndarray
+        """
+        return self.__getitem__(key=parameter_name)
+
+    def set(self, parameter_name, value):
+        """
+        Set parameter by parameter name & matrix.
+
+        :param parameter_name: parameter name
+        :type parameter_name: basestring
+        :param value: parameter matrix
+        :type value: np.ndarray
+        :return: Nothing.
+        """
+        self.__setitem__(key=parameter_name, value=value)
+
+    def append_gradient_machine(self, gradient_machine):
+        """
+        append gradient machine to parameters. This method is used internally in
+        Trainer.train.
+
+        :param gradient_machine: Paddle C++ GradientMachine object.
+        :type gradient_machine: api.GradientMachine
+        :return:
+        """
+
+        if not isinstance(gradient_machine, api.GradientMachine):
+            raise ValueError("gradient_machine should be api.GradientMachine")
+
+        if len(self.__tmp_params__) != 0:
+            for name, val in self.__tmp_params__:
+                try:
+                    __copy_parameter_to_gradient_machine__(gradient_machine,
+                                                           name, val)
+                except ValueError:
+                    # If no such parameter in gradient machine, then don't copy
+                    pass
+
+        self.__gradient_machines__.append(gradient_machine)
+
+    def serialize(self, name, f):
+        """
+        Serialize the parameter `name` into the file-like object `f`.
+
+        :param name: parameter name
+        :param f: file-like object to write to
+        :type f: file
+        :return: Nothing
+        """
+        param = self.get(name)
+        size = reduce(lambda a, b: a * b, param.shape)
+        f.write(struct.pack("IIQ", 0, 4, size))
+        param = param.astype(np.float32)
+        f.write(param.tobytes())
+
+    def deserialize(self, name, f):
+        """
+        Deserialize the parameter `name` from the file-like object `f`.
+
+        :param name: parameter name
+        :param f: file-like object to read from
+        :type f: file
+        :return: Nothing
+        """
+        f.read(16)  # header
+        arr = np.frombuffer(f.read(), dtype=np.float32)
+        self.set(name, arr.reshape(self.get_shape(name)))
+
+    def to_tar(self, f):
+        tar = tarfile.TarFile(fileobj=f, mode='w')
+        for nm in self.names():
+            buf = cStringIO.StringIO()
+            self.serialize(nm, buf)
+            tarinfo = tarfile.TarInfo(name=nm)
+            buf.seek(0)
+            tarinfo.size = len(buf.getvalue())
+            tar.addfile(tarinfo, buf)
+
+            conf = self.__param_conf__[nm]
+            confStr = conf.SerializeToString()
+            tarinfo = tarfile.TarInfo(name="%s.protobuf" % nm)
+            tarinfo.size = len(confStr)
+            buf = cStringIO.StringIO(confStr)
+            buf.seek(0)
+            tar.addfile(tarinfo, fileobj=buf)
+
+    @staticmethod
+    def from_tar(f):
+        params = Parameters()
+        tar = tarfile.TarFile(fileobj=f, mode='r')
+        for finfo in tar:
+            assert isinstance(finfo, tarfile.TarInfo)
+            if finfo.name.endswith('.protobuf'):
+                f = tar.extractfile(finfo)
+                conf = ParameterConfig()
+                conf.ParseFromString(f.read())
+                params.__append_config__(conf)
+
+        for param_name in params.names():
+            f = tar.extractfile(param_name)
+            params.deserialize(param_name, f)
+        return params
+
+
+def __get_parameter_in_gradient_machine__(gradient_machine, name):
+    """
+    Fetch the parameter named `name` from a gradient machine.
+
+    :param gradient_machine: Paddle C++ GradientMachine object.
+    :type gradient_machine: api.GradientMachine
+    :param name: parameter name
+    :return: the matching parameter
+    :rtype: api.Parameter
+    """
+    params = filter(lambda p: p.getName() == name,
+                    gradient_machine.getParameters())
+
+    if len(params) == 0:
+        raise ValueError("No such parameter")
+    elif len(params) > 1:
+        raise ValueError("More than one parameter named %s" % name)
+    else:
+        return params[0]
+
+
+def __copy_parameter_to_gradient_machine__(gradient_machine, name, arr):
+    """
+    Copy a python ndarray into the gradient machine.
+
+    :param gradient_machine: Paddle C++ GradientMachine object.
+    :type gradient_machine: api.GradientMachine
+    :param name: parameter name
+    :param arr: parameter value
+    :type arr: np.ndarray
+    :return: Nothing
+    """
+    param = __get_parameter_in_gradient_machine__(gradient_machine, name)
+    vec = param.getBuf(api.PARAMETER_VALUE)
+    assert isinstance(vec, api.Vector)
+    vec.copyFromNumpyArray(arr.flatten())
diff --git a/python/paddle/v2/pooling.py b/python/paddle/v2/pooling.py
new file mode 100644
index 0000000000000000000000000000000000000000..4881c27d1d6d3d926f12aab096f377164debf1ef
--- /dev/null
+++ b/python/paddle/v2/pooling.py
@@ -0,0 +1,26 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import paddle.trainer_config_helpers.poolings
+import copy
+
+__all__ = []
+suffix = 'Pooling'
+
+for name in paddle.trainer_config_helpers.poolings.__all__:
+    new_name = name[:-len(suffix)]
+    globals()[new_name] = copy.copy(
+        getattr(paddle.trainer_config_helpers.poolings, name))
+    globals()[new_name].__name__ = new_name
+    __all__.append(new_name)
diff --git a/python/paddle/v2/reader/__init__.py b/python/paddle/v2/reader/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3b059735a924d58714cd88a761eb83143f1192d6
--- /dev/null
+++ b/python/paddle/v2/reader/__init__.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+At training and testing time, PaddlePaddle programs need to read data. To
+ease users' work of writing data reading code, we define the following:
+
+- A *reader* is a function that reads data (from file, network, random number
+  generator, etc) and yields data items.
+- A *reader creator* is a function that returns a reader function.
+- A *reader decorator* is a function, which accepts one or more readers, and
+  returns a reader.
+- A *batch reader* is a function that reads data (from *reader*, file, network,
+  random number generator, etc) and yields a batch of data items.
+
+#####################
+Data Reader Interface
+#####################
+
+Indeed, *data reader* doesn't have to be a function that reads and yields data
+items. It can be any function with no parameters that creates an iterable
+(anything can be used in :code:`for x in iterable`)\:
+
+..  code-block:: python
+
+    iterable = data_reader()
+
+Each element produced from the iterable should be a **single** entry of data,
+**not** a mini batch. That entry of data could be a single item, or a tuple of
+items. Items should be of supported types (e.g., numpy 1d array of float32,
+int, list of int).
+
+An example implementation for single item data reader creator:
+
+..  code-block:: python
+
+    def reader_creator_random_image(width, height):
+        def reader():
+            while True:
+                yield numpy.random.uniform(-1, 1, size=width*height)
+        return reader
+
+An example implementation for multiple item data reader creator:
+
+..  code-block:: python
+
+    def reader_creator_random_image_and_label(width, height, label):
+        def reader():
+            while True:
+                yield numpy.random.uniform(-1, 1, size=width*height), label
+        return reader
+
+
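+An example of a *reader decorator* that turns a reader into a batch reader (a
+minimal sketch of the idea; :code:`paddle.v2.minibatch.batch` in this change
+is the real implementation):
+
+..  code-block:: python
+
+    def batched(reader, batch_size):
+        def batch_reader():
+            b = []
+            for e in reader():
+                b.append(e)
+                if len(b) == batch_size:
+                    yield b
+                    b = []
+            if b:
+                yield b
+        return batch_reader
+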
+TODO(yuyang18): Should we add whole design doc here?
+"""
+
+import decorator
+from decorator import *
+
+import creator
+
+__all__ = decorator.__all__ + ['creator']
diff --git a/python/paddle/v2/reader/creator.py b/python/paddle/v2/reader/creator.py
new file mode 100644
index 0000000000000000000000000000000000000000..07142056f872db5113acdd296b17c52b343c1be6
--- /dev/null
+++ b/python/paddle/v2/reader/creator.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+The creator package contains some simple reader creators, which can be used in
+user programs.
+"""
+
+__all__ = ['np_array', 'text_file']
+
+
+def np_array(x):
+    """
+    Creates a reader that yields the elements of x if it is a numpy vector,
+    the rows of x if it is a numpy matrix, or, in general, the sub-arrays
+    indexed along the first dimension.
+
+    :param x: the numpy array to create reader from.
+    :returns: data reader created from x.
+    """
+
+    def reader():
+        if x.ndim < 1:
+            # a 0-d array is not iterable; yield it once and stop
+            yield x
+            return
+
+        for e in x:
+            yield e
+
+    return reader
+
+
+def text_file(path):
+    """
+    Creates a data reader that outputs text line by line from the given text
+    file. The trailing newline ('\\\\n') of each line will be removed.
+
+    :param path: path of the text file.
+    :returns: data reader of text file
+    """
+
+    def reader():
+        f = open(path, "r")
+        for l in f:
+            yield l.rstrip('\n')
+        f.close()
+
+    return reader
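+
+
+# Minimal usage sketches (illustrative only; 'data.txt' is an arbitrary path):
+#
+#   import numpy
+#   r = np_array(numpy.array([[1, 2], [3, 4]]))  # yields [1, 2], then [3, 4]
+#   lines = text_file('data.txt')  # yields each line without its newline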
diff --git a/python/paddle/v2/reader/decorator.py b/python/paddle/v2/reader/decorator.py
new file mode 100644
index 0000000000000000000000000000000000000000..104ce9a0411413bb8fc65eedf5821f98d6acdba3
--- /dev/null
+++ b/python/paddle/v2/reader/decorator.py
@@ -0,0 +1,226 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+__all__ = [
+    'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
+    'ComposeNotAligned', 'firstn'
+]
+
+import itertools
+import random
+from Queue import Queue
+from threading import Thread
+
+
+def map_readers(func, *readers):
+    """
+    Creates a data reader that outputs the return value of the given function,
+    using the output of each input data reader as its arguments.
+
+    :param func: function to use. The type of func should be (Sample) => Sample
+    :type func: callable
+    :param readers: readers whose outputs will be used as arguments of func.
+    :return: the created data reader.
+    :rtype: callable
+    """
+
+    def reader():
+        rs = []
+        for r in readers:
+            rs.append(r())
+        for e in itertools.imap(func, *rs):
+            yield e
+
+    return reader
+
+
+def shuffle(reader, buf_size):
+    """
+    Creates a data reader whose data output is shuffled.
+
+    Output from the iterator created by the original reader will be buffered
+    into a shuffle buffer, and then shuffled. The size of the shuffle buffer
+    is determined by the argument buf_size.
+
+    :param reader: the original reader whose output will be shuffled.
+    :type reader: callable
+    :param buf_size: shuffle buffer size.
+    :type buf_size: int
+
+    :return: the new reader whose output is shuffled.
+    :rtype: callable
+    """
+
+    def data_reader():
+        buf = []
+        for e in reader():
+            buf.append(e)
+            if len(buf) >= buf_size:
+                random.shuffle(buf)
+                for b in buf:
+                    yield b
+                buf = []
+
+        if len(buf) > 0:
+            random.shuffle(buf)
+            for b in buf:
+                yield b
+
+    return data_reader
+
+
+def chain(*readers):
+    """
+    Creates a data reader whose output is the outputs of the input data
+    readers chained together.
+
+    If the input readers output the following data entries:
+    [0, 0, 0]
+    [1, 1, 1]
+    [2, 2, 2]
+    the chained reader will output:
+    [0, 0, 0, 1, 1, 1, 2, 2, 2]
+
+    :param readers: input readers.
+    :return: the new data reader.
+    :rtype: callable
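+
+    A usage sketch (reader_a and reader_b are assumed reader creators):
+
+    .. code-block:: python
+
+        combined = chain(reader_a, reader_b)
+        # combined() yields all of reader_a's samples, then reader_b's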
+    """
+
+    def reader():
+        rs = []
+        for r in readers:
+            rs.append(r())
+
+        for e in itertools.chain(*rs):
+            yield e
+
+    return reader
+
+
+class ComposeNotAligned(ValueError):
+    pass
+
+
+def compose(*readers, **kwargs):
+    """
+    Creates a data reader whose output is the combination of the outputs
+    of the input readers.
+
+    If the input readers output the following data entries:
+    (1, 2)    3    (4, 5)
+    the composed reader will output:
+    (1, 2, 3, 4, 5)
+
+    :param readers: readers that will be composed together.
+    :param check_alignment: if True, will check if input readers are aligned
+        correctly. If False, will not check alignment and trailing outputs
+        will be discarded. Defaults to True.
+    :type check_alignment: bool
+
+    :return: the new data reader.
+
+    :raises ComposeNotAligned: outputs of readers are not aligned.
+        Will not raise when check_alignment is set to False.
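+
+    A usage sketch (reader_a and reader_b are assumed reader creators):
+
+    .. code-block:: python
+
+        composed = compose(reader_a, reader_b)
+        # each output of composed() is one flattened tuple combining
+        # one sample from reader_a and one from reader_b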
+    """
+    check_alignment = kwargs.pop('check_alignment', True)
+
+    def make_tuple(x):
+        if isinstance(x, tuple):
+            return x
+        else:
+            return (x, )
+
+    def reader():
+        rs = []
+        for r in readers:
+            rs.append(r())
+        if not check_alignment:
+            for outputs in itertools.izip(*rs):
+                yield sum(map(make_tuple, outputs), ())
+        else:
+            for outputs in itertools.izip_longest(*rs):
+                for o in outputs:
+                    if o is None:
+                        # None will not be present if the readers are aligned
+                        raise ComposeNotAligned(
+                            "outputs of readers are not aligned.")
+                yield sum(map(make_tuple, outputs), ())
+
+    return reader
+
+
+def buffered(reader, size):
+    """
+    Creates a buffered data reader.
+
+    The buffered data reader reads data entries into a buffer in a
+    worker thread. Reading from the buffered data reader proceeds as
+    long as the buffer is not empty.
+
+    :param reader: the data reader to read from.
+    :type reader: callable
+    :param size: max buffer size.
+    :type size: int
+
+    :returns: the buffered data reader.
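+
+    A usage sketch (some_reader is an assumed reader creator):
+
+    .. code-block:: python
+
+        fast = buffered(some_reader, size=128)
+        # a worker thread keeps up to 128 entries pre-read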
+    """
+
+    class EndSignal():
+        pass
+
+    end = EndSignal()
+
+    def read_worker(r, q):
+        for d in r:
+            q.put(d)
+        q.put(end)
+
+    def data_reader():
+        r = reader()
+        q = Queue(maxsize=size)
+        t = Thread(
+            target=read_worker, args=(
+                r,
+                q, ))
+        t.daemon = True
+        t.start()
+        e = q.get()
+        while e != end:
+            yield e
+            e = q.get()
+
+    return data_reader
+
+
+def firstn(reader, n):
+    """
+    Limits the max number of samples that the reader can return.
+
+    :param reader: the data reader to read from.
+    :type reader: callable
+    :param n: the max number of samples to return.
+    :type n: int
+    :return: the decorated reader.
+    :rtype: callable
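+
+    A usage sketch (some_reader is an assumed reader creator):
+
+    .. code-block:: python
+
+        first_100 = firstn(some_reader, 100)
+        # first_100() yields at most 100 samples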
+    """
+
+    # TODO(yuyang18): Check whether simply dropping the reader cleans up
+    # the resources it opened.
+
+    def firstn_reader():
+        for i, item in enumerate(reader()):
+            if i == n:
+                break
+            yield item
+
+    return firstn_reader
diff --git a/python/paddle/v2/reader/tests/CMakeLists.txt b/python/paddle/v2/reader/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a51f700406b48f8186e45f1ced94765e343a8b5e
--- /dev/null
+++ b/python/paddle/v2/reader/tests/CMakeLists.txt
@@ -0,0 +1,3 @@
+add_test(NAME reader_tests
+  COMMAND bash ${PROJ_ROOT}/python/paddle/v2/reader/tests/run_tests.sh
+  ${PYTHON_EXECUTABLE})
diff --git a/python/paddle/v2/reader/tests/__init__.py b/python/paddle/v2/reader/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/python/paddle/v2/reader/tests/creator_test.py b/python/paddle/v2/reader/tests/creator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..9f8d7133b8694aae5541eff9576eaba8a31e77dc
--- /dev/null
+++ b/python/paddle/v2/reader/tests/creator_test.py
@@ -0,0 +1,40 @@
+# Copyright PaddlePaddle contributors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+import unittest
+
+import numpy as np
+
+import paddle.v2.reader.creator
+
+
+class TestNumpyArray(unittest.TestCase):
+    def test_numpy_array(self):
+        l = [[1, 2, 3], [4, 5, 6]]
+        x = np.array(l, np.int32)
+        reader = paddle.v2.reader.creator.np_array(x)
+        for idx, e in enumerate(reader()):
+            self.assertItemsEqual(e, l[idx])
+
+
+class TestTextFile(unittest.TestCase):
+    def test_text_file(self):
+        path = os.path.join(os.path.dirname(__file__), "test_data_creator.txt")
+        reader = paddle.v2.reader.creator.text_file(path)
+        for idx, e in enumerate(reader()):
+            self.assertEqual(e, str(idx * 2) + " " + str(idx * 2 + 1))
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/reader/tests/decorator_test.py b/python/paddle/v2/reader/tests/decorator_test.py
new file mode 100644
index 0000000000000000000000000000000000000000..734154b9790a4dc118d11992343648364c907305
--- /dev/null
+++ b/python/paddle/v2/reader/tests/decorator_test.py
@@ -0,0 +1,125 @@
+# Copyright PaddlePaddle contributors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import time
+import unittest
+
+import paddle.v2.reader
+
+
+def reader_creator_10(dur):
+    def reader():
+        for i in range(10):
+            # this sleep helps test paddle.v2.reader.buffered
+            time.sleep(dur)
+            yield i
+
+    return reader
+
+
+class TestMap(unittest.TestCase):
+    def test_map(self):
+        d = {"h": 0, "i": 1}
+
+        def tokenize(x):
+            return d[x]
+
+        def read():
+            yield "h"
+            yield "i"
+
+        r = paddle.v2.reader.map_readers(tokenize, read)
+        for i, e in enumerate(r()):
+            self.assertEqual(e, i)
+
+
+class TestBuffered(unittest.TestCase):
+    def test_read(self):
+        for size in range(20):
+            b = paddle.v2.reader.buffered(reader_creator_10(0), size)
+            c = 0
+            for i in b():
+                self.assertEqual(i, c)
+                c += 1
+            self.assertEqual(c, 10)
+
+    def test_buffering(self):
+        # each read has a 30ms delay.
+        b = paddle.v2.reader.buffered(reader_creator_10(0.03), 10)
+        last_time = time.time()
+        for idx, i in enumerate(b()):
+            elapsed_time = time.time() - last_time
+            if i == 0:
+                time.sleep(0.3)
+            else:
+                # read time should be short, meaning the data is already buffered.
+                self.assertLess(elapsed_time, 0.05)
+            last_time = time.time()
+
+
+class TestCompose(unittest.TestCase):
+    def test_compose(self):
+        reader = paddle.v2.reader.compose(
+            reader_creator_10(0), reader_creator_10(0))
+        for idx, e in enumerate(reader()):
+            self.assertEqual(e, (idx, idx))
+
+    def test_compose_not_aligned(self):
+        total = 0
+        reader = paddle.v2.reader.compose(
+            paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)),
+            reader_creator_10(0))
+        with self.assertRaises(paddle.v2.reader.ComposeNotAligned):
+            for e in reader():
+                total += 1
+        # expecting 10, not 20
+        self.assertEqual(total, 10)
+
+    def test_compose_not_aligned_no_check(self):
+        total = 0
+        reader = paddle.v2.reader.compose(
+            paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0)),
+            reader_creator_10(0),
+            check_alignment=False)
+        for e in reader():
+            total += 1
+        # expecting 10, not 20
+        self.assertEqual(total, 10)
+
+
+class TestChain(unittest.TestCase):
+    def test_chain(self):
+        c = paddle.v2.reader.chain(reader_creator_10(0), reader_creator_10(0))
+        idx = 0
+        for e in c():
+            self.assertEqual(e, idx % 10)
+            idx += 1
+        self.assertEqual(idx, 20)
+
+
+class TestShuffle(unittest.TestCase):
+    def test_shuffle(self):
+        case = [(0, True), (1, True), (10, False), (100, False)]
+        a = reader_creator_10(0)
+        for size, checkEq in case:
+            s = paddle.v2.reader.shuffle(a, size)
+            total = 0
+            for idx, e in enumerate(s()):
+                if checkEq:
+                    self.assertEqual(idx, e)
+                total += 1
+            self.assertEqual(total, 10)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/reader/tests/run_tests.sh b/python/paddle/v2/reader/tests/run_tests.sh
new file mode 100755
index 0000000000000000000000000000000000000000..a544a5636021bcf8bd9a35966c91ae343c149d14
--- /dev/null
+++ b/python/paddle/v2/reader/tests/run_tests.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+pushd `dirname $0` > /dev/null
+SCRIPTPATH=$PWD
+popd > /dev/null
+
+cd $SCRIPTPATH
+$1 -m pip install ../../../../../paddle/dist/*.whl
+
+test_list="creator_test.py decorator_test.py"
+
+export PYTHONPATH=$PWD/../../../../../python/
+
+for fn in $test_list
+do
+  echo "test $fn"
+  $1 $fn
+  if [ $? -ne 0 ]; then
+    exit 1
+  fi
+done
diff --git a/python/paddle/v2/reader/tests/test_data_creator.txt b/python/paddle/v2/reader/tests/test_data_creator.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a2a8d47d43868d369083808497697da79e620e31
--- /dev/null
+++ b/python/paddle/v2/reader/tests/test_data_creator.txt
@@ -0,0 +1,3 @@
+0 1
+2 3
+4 5
diff --git a/python/paddle/v2/tests/CMakeLists.txt b/python/paddle/v2/tests/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..572deaff356712cac23cd7911cdf289db100564c
--- /dev/null
+++ b/python/paddle/v2/tests/CMakeLists.txt
@@ -0,0 +1,16 @@
+add_test(NAME test_v2_api
+        COMMAND bash ${PROJ_ROOT}/python/paddle/v2/tests/run_tests.sh ${PYTHON_EXECUTABLE})
+
+add_test(NAME test_v2_layer
+        COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/v2/tests/test_layer.py
+        WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
+
+add_test(NAME test_v2_rnn_layer
+        COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/v2/tests/test_rnn_layer.py)
+
+add_test(NAME test_topology
+        COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/v2/tests/test_topology.py
+        WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
diff --git a/python/paddle/v2/tests/run_tests.sh b/python/paddle/v2/tests/run_tests.sh
new file mode 100755
index 0000000000000000000000000000000000000000..dda1b1bd222a9f226db1a4bd730e9637ab882196
--- /dev/null
+++ b/python/paddle/v2/tests/run_tests.sh
@@ -0,0 +1,36 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+pushd `dirname $0` > /dev/null
+SCRIPTPATH=$PWD
+popd > /dev/null
+
+cd $SCRIPTPATH
+
+$1 -m pip install ../../../../paddle/dist/*.whl
+
+test_list="test_data_feeder.py test_parameters.py"
+
+export PYTHONPATH=$PWD/../../../../python/
+
+for fn in $test_list
+do
+  echo "test $fn"
+  $1 $fn
+  if [ $? -ne 0 ]; then
+    exit 1
+  fi
+done
diff --git a/python/paddle/v2/tests/test_data_feeder.py b/python/paddle/v2/tests/test_data_feeder.py
new file mode 100644
index 0000000000000000000000000000000000000000..71eb3bf31425c22b47accc11c9550042e077ef12
--- /dev/null
+++ b/python/paddle/v2/tests/test_data_feeder.py
@@ -0,0 +1,243 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+
+import py_paddle.swig_paddle as api
+import numpy as np
+
+from paddle.v2 import data_type
+from paddle.v2.data_feeder import DataFeeder
+
+
+class DataFeederTest(unittest.TestCase):
+    def dense_reader(self, size):
+        data = np.random.random(size)
+        return data
+
+    def sparse_binary_reader(self, high, size_limit, non_empty=False):
+        num = np.random.randint(size_limit)  # num could be 0
+        while non_empty and num == 0:
+            num = np.random.randint(size_limit)
+        return np.random.randint(high, size=num).tolist()
+
+    def test_dense(self):
+        def compare(input):
+            feeder = DataFeeder([('image', data_type.dense_vector(784))],
+                                {'image': 0})
+            arg = feeder(input)
+            output = arg.getSlotValue(0).copyToNumpyMat()
+            input = np.array(input, dtype='float32')
+            self.assertAlmostEqual(input.all(), output.all())
+
+        # test numpy array
+        batch_size = 32
+        dim = 784
+        data = []
+        for i in xrange(batch_size):
+            each_sample = []
+            each_sample.append(self.dense_reader(dim))
+            data.append(each_sample)
+        compare(data)
+
+        # each feature is a list
+        data = []
+        for i in xrange(batch_size):
+            each_sample = []
+            each_sample.append(self.dense_reader(dim).tolist())
+            data.append(each_sample)
+        compare(data)
+
+        # test tuple
+        data = []
+        for i in xrange(batch_size):
+            each_sample = (self.dense_reader(dim).tolist(), )
+            data.append(each_sample)
+        compare(data)
+
+    def test_sparse_binary(self):
+        dim = 10000
+        batch_size = 32
+        data = []
+        for i in xrange(batch_size):
+            each_sample = []
+            each_sample.append(self.sparse_binary_reader(dim, 50))
+            data.append(each_sample)
+        feeder = DataFeeder([('input', data_type.sparse_binary_vector(dim))],
+                            {'input': 0})
+        arg = feeder(data)
+        output = arg.getSlotValue(0)
+        assert isinstance(output, api.Matrix)
+        for i in xrange(batch_size):
+            self.assertEqual(output.getSparseRowCols(i), data[i][0])
+
+    def test_sparse(self):
+        dim = 10000
+        batch_size = 32
+        v = []
+        w = []
+        data = []
+        for dat in xrange(batch_size):
+            each_sample = []
+            a = self.sparse_binary_reader(dim, 40, non_empty=True)
+            b = self.dense_reader(len(a)).tolist()
+            v.append(a)
+            w.append(np.array(b, dtype="float32"))
+            each_sample.append(zip(a, b))
+            data.append(each_sample)
+
+        feeder = DataFeeder([('input', data_type.sparse_vector(dim))],
+                            {'input': 0})
+        arg = feeder(data)
+        output = arg.getSlotValue(0)
+        assert isinstance(output, api.Matrix)
+        for i in xrange(batch_size):
+            self.assertEqual(output.getSparseRowCols(i), v[i])
+            cols_value = output.getSparseRowColsVal(i)
+            value = [val[1] for val in cols_value]
+            value = np.array(value, dtype="float32")
+            self.assertAlmostEqual(value.all(), w[i].all())
+
+    def test_integer(self):
+        value_range = 100
+        batch_size = 32
+        index = []
+        for i in xrange(batch_size):
+            each_sample = []
+            each_sample.append(np.random.randint(value_range))
+            index.append(each_sample)
+        feeder = DataFeeder([('input', data_type.integer_value(value_range))],
+                            {'input': 0})
+        arg = feeder(index)
+        output = arg.getSlotIds(0).copyToNumpyArray()
+        index = np.array(index, dtype='int')
+        self.assertEqual(output.all(), index.flatten().all())
+
+    def test_integer_sequence(self):
+        value_range = 10000
+        batch_size = 32
+        start = [0]
+        data = []
+        for i in xrange(batch_size):
+            each_sample = []
+            each_sample.append(
+                self.sparse_binary_reader(
+                    value_range, 30, non_empty=True))
+            data.append(each_sample)
+            start.append(len(each_sample[0]) + start[-1])
+        feeder = DataFeeder(
+            [('input', data_type.integer_value_sequence(value_range))],
+            {'input': 0})
+        arg = feeder(data)
+        output_data = arg.getSlotIds(0).copyToNumpyArray()
+        output_start = arg.getSlotSequenceStartPositions(0).copyToNumpyArray()
+
+        index = []
+        for dat in data:
+            index.extend(x for x in dat[0])  # only one feature, so dat[0]
+        index = np.array(index, dtype='int')
+        start = np.array(start, dtype='int')
+        self.assertEqual(output_data.all(), index.all())
+        self.assertEqual(output_start.all(), start.all())
+
+    def test_multiple_features(self):
+        batch_size = 2
+        data = []
+        for i in xrange(batch_size):
+            each_sample = []
+            each_sample.append(np.random.randint(10))
+            each_sample.append(
+                self.sparse_binary_reader(
+                    20000, 40, non_empty=True))
+            each_sample.append(self.dense_reader(100))
+            data.append(each_sample)
+
+        # test multiple features
+        data_types = [('fea0', data_type.dense_vector(100)),
+                      ('fea1', data_type.sparse_binary_vector(20000)),
+                      ('fea2', data_type.integer_value(10))]
+        feeder = DataFeeder(data_types, {'fea0': 2, 'fea1': 1, 'fea2': 0})
+        arg = feeder(data)
+        output_dense = arg.getSlotValue(0).copyToNumpyMat()
+        output_sparse = arg.getSlotValue(1)
+        output_index = arg.getSlotIds(2).copyToNumpyArray()
+        for i in xrange(batch_size):
+            self.assertEqual(output_dense[i].all(), data[i][2].all())
+            self.assertEqual(output_sparse.getSparseRowCols(i), data[i][1])
+            self.assertEqual(output_index[i], data[i][0])
+
+        # reader returns 3 features, but only use 2 features
+        data_types = [('fea0', data_type.dense_vector(100)),
+                      ('fea2', data_type.integer_value(10))]
+        feeder = DataFeeder(data_types, {'fea0': 2, 'fea2': 0})
+        arg = feeder(data)
+        output_dense = arg.getSlotValue(0).copyToNumpyMat()
+        output_index = arg.getSlotIds(1).copyToNumpyArray()
+        for i in xrange(batch_size):
+            self.assertEqual(output_dense[i].all(), data[i][2].all())
+            self.assertEqual(output_index[i], data[i][0])
+
+        # reader returns 3 features, one of which is duplicated
+        data_types = [('fea0', data_type.dense_vector(100)),
+                      ('fea1', data_type.sparse_binary_vector(20000)),
+                      ('fea2', data_type.integer_value(10)),
+                      ('fea3', data_type.dense_vector(100))]
+        feeder = DataFeeder(data_types,
+                            {'fea0': 2,
+                             'fea1': 1,
+                             'fea2': 0,
+                             'fea3': 2})
+        arg = feeder(data)
+        fea0 = arg.getSlotValue(0).copyToNumpyMat()
+        fea1 = arg.getSlotValue(1)
+        fea2 = arg.getSlotIds(2).copyToNumpyArray()
+        fea3 = arg.getSlotValue(3).copyToNumpyMat()
+        for i in xrange(batch_size):
+            self.assertEqual(fea0[i].all(), data[i][2].all())
+            self.assertEqual(fea1.getSparseRowCols(i), data[i][1])
+            self.assertEqual(fea2[i], data[i][0])
+            self.assertEqual(fea3[i].all(), data[i][2].all())
+
+    def test_multiple_features_tuple(self):
+        batch_size = 2
+        data = []
+        for i in xrange(batch_size):
+            a = np.random.randint(10)
+            b = self.sparse_binary_reader(20000, 40, non_empty=True)
+            c = self.dense_reader(100)
+            each_sample = (a, b, c)
+            data.append(each_sample)
+
+        # test multiple features
+        data_types = [('fea0', data_type.dense_vector(100)),
+                      ('fea1', data_type.sparse_binary_vector(20000)),
+                      ('fea2', data_type.integer_value(10))]
+        feeder = DataFeeder(data_types, {'fea0': 2, 'fea1': 1, 'fea2': 0})
+        arg = feeder(data)
+        out_dense = arg.getSlotValue(0).copyToNumpyMat()
+        out_sparse = arg.getSlotValue(1)
+        out_index = arg.getSlotIds(2).copyToNumpyArray()
+        for i in xrange(batch_size):
+            self.assertEqual(out_dense[i].all(), data[i][2].all())
+            self.assertEqual(out_sparse.getSparseRowCols(i), data[i][1])
+            self.assertEqual(out_index[i], data[i][0])
+
+
+if __name__ == '__main__':
+    api.initPaddle("--use_gpu=0")
+    suite = unittest.TestLoader().loadTestsFromTestCase(DataFeederTest)
+    unittest.TextTestRunner().run(suite)
+    if api.isGpuVersion():
+        api.setUseGpu(True)
+        unittest.main()
diff --git a/python/paddle/v2/tests/test_layer.py b/python/paddle/v2/tests/test_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ccd3d6913e1755a37b4da7c4f182147b880d3cb
--- /dev/null
+++ b/python/paddle/v2/tests/test_layer.py
@@ -0,0 +1,263 @@
+# Copyright PaddlePaddle contributors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import paddle.v2.activation as activation
+import paddle.v2.attr as attr
+import paddle.v2.data_type as data_type
+import paddle.v2.layer as layer
+import paddle.v2.pooling as pooling
+import paddle.v2.networks as networks
+
+pixel = layer.data(name='pixel', type=data_type.dense_vector(128))
+label = layer.data(name='label', type=data_type.integer_value(10))
+weight = layer.data(name='weight', type=data_type.dense_vector(10))
+score = layer.data(name='score', type=data_type.dense_vector(1))
+
+hidden = layer.fc(input=pixel,
+                  size=100,
+                  act=activation.Sigmoid(),
+                  param_attr=attr.Param(name='hidden'))
+inference = layer.fc(input=hidden, size=10, act=activation.Softmax())
+conv = layer.img_conv(
+    input=pixel,
+    filter_size=1,
+    filter_size_y=1,
+    num_channels=8,
+    num_filters=16,
+    act=activation.Linear())
+
+
+class ImageLayerTest(unittest.TestCase):
+    def test_conv_layer(self):
+        conv_shift = layer.conv_shift(a=pixel, b=score)
+        print layer.parse_network(conv, conv_shift)
+
+    def test_pooling_layer(self):
+        maxpool = layer.img_pool(
+            input=conv,
+            pool_size=2,
+            num_channels=16,
+            padding=1,
+            pool_type=pooling.Max())
+        spp = layer.spp(input=conv,
+                        pyramid_height=2,
+                        num_channels=16,
+                        pool_type=pooling.Max())
+        maxout = layer.maxout(input=conv, num_channels=16, groups=4)
+        print layer.parse_network(maxpool, spp, maxout)
+
+    def test_norm_layer(self):
+        norm1 = layer.img_cmrnorm(input=conv, size=5)
+        norm2 = layer.batch_norm(input=conv)
+        norm3 = layer.sum_to_one_norm(input=conv)
+        print layer.parse_network(norm1, norm2, norm3)
+
+
+class AggregateLayerTest(unittest.TestCase):
+    def test_aggregate_layer(self):
+        pool = layer.pooling(
+            input=pixel,
+            pooling_type=pooling.Avg(),
+            agg_level=layer.AggregateLevel.EACH_SEQUENCE)
+        last_seq = layer.last_seq(input=pixel)
+        first_seq = layer.first_seq(input=pixel)
+        concat = layer.concat(input=[last_seq, first_seq])
+        seq_concat = layer.seq_concat(a=last_seq, b=first_seq)
+        print layer.parse_network(pool, last_seq, first_seq, concat, seq_concat)
+
+
+class MathLayerTest(unittest.TestCase):
+    def test_math_layer(self):
+        addto = layer.addto(input=[pixel, pixel])
+        linear_comb = layer.linear_comb(weights=weight, vectors=hidden, size=10)
+        interpolation = layer.interpolation(
+            input=[hidden, hidden], weight=score)
+        bilinear = layer.bilinear_interp(input=conv, out_size_x=4, out_size_y=4)
+        power = layer.power(input=pixel, weight=score)
+        scaling = layer.scaling(input=pixel, weight=score)
+        slope = layer.slope_intercept(input=pixel)
+        tensor = layer.tensor(a=pixel, b=pixel, size=1000)
+        cos_sim = layer.cos_sim(a=pixel, b=pixel)
+        trans = layer.trans(input=tensor)
+        print layer.parse_network(addto, linear_comb, interpolation, power,
+                                  scaling, slope, tensor, cos_sim, trans)
+
+
+class ReshapeLayerTest(unittest.TestCase):
+    def test_reshape_layer(self):
+        block_expand = layer.block_expand(
+            input=conv, num_channels=4, stride_x=1, block_x=1)
+        expand = layer.expand(
+            input=weight,
+            expand_as=pixel,
+            expand_level=layer.ExpandLevel.FROM_TIMESTEP)
+        repeat = layer.repeat(input=pixel, num_repeats=4)
+        reshape = layer.seq_reshape(input=pixel, reshape_size=4)
+        rotate = layer.rotate(input=pixel, height=16, width=49)
+        print layer.parse_network(block_expand, expand, repeat, reshape, rotate)
+
+
+class RecurrentLayerTest(unittest.TestCase):
+    def test_recurrent_layer(self):
+        word = layer.data(name='word', type=data_type.integer_value(12))
+        recurrent = layer.recurrent(input=word)
+        lstm = layer.lstmemory(input=word)
+        gru = layer.grumemory(input=word)
+        print layer.parse_network(recurrent, lstm, gru)
+
+
+class CostLayerTest(unittest.TestCase):
+    def test_cost_layer(self):
+        cost1 = layer.classification_cost(input=inference, label=label)
+        cost2 = layer.classification_cost(
+            input=inference, label=label, weight=weight)
+        cost3 = layer.cross_entropy_cost(input=inference, label=label)
+        cost4 = layer.cross_entropy_with_selfnorm_cost(
+            input=inference, label=label)
+        cost5 = layer.mse_cost(input=inference, label=label)
+        cost6 = layer.mse_cost(input=inference, label=label, weight=weight)
+        cost7 = layer.multi_binary_label_cross_entropy_cost(
+            input=inference, label=label)
+        cost8 = layer.rank_cost(left=score, right=score, label=score)
+        cost9 = layer.lambda_cost(input=inference, score=score)
+        cost10 = layer.sum_cost(input=inference)
+        cost11 = layer.huber_cost(input=score, label=label)
+
+        print layer.parse_network(cost1, cost2)
+        print layer.parse_network(cost3, cost4)
+        print layer.parse_network(cost5, cost6)
+        print layer.parse_network(cost7, cost8, cost9, cost10, cost11)
+
+        crf = layer.crf(input=inference, label=label)
+        crf_decoding = layer.crf_decoding(input=inference, size=3)
+        ctc = layer.ctc(input=inference, label=label)
+        warp_ctc = layer.warp_ctc(input=pixel, label=label)
+        nce = layer.nce(input=inference, label=label, num_classes=3)
+        hsigmoid = layer.hsigmoid(input=inference, label=label, num_classes=3)
+
+        print layer.parse_network(crf, crf_decoding, ctc, warp_ctc, nce,
+                                  hsigmoid)
+
+
+class OtherLayerTest(unittest.TestCase):
+    def test_sampling_layer(self):
+        maxid = layer.max_id(input=inference)
+        sampling_id = layer.sampling_id(input=inference)
+        eos = layer.eos(input=maxid, eos_id=5)
+        print layer.parse_network(maxid, sampling_id, eos)
+
+    def test_slicing_joining_layer(self):
+        pad = layer.pad(input=conv, pad_c=[2, 3], pad_h=[1, 2], pad_w=[3, 1])
+        print layer.parse_network(pad)
+
+
+class ProjOpTest(unittest.TestCase):
+    def test_projection(self):
+        input = layer.data(name='data', type=data_type.dense_vector(784))
+        word = layer.data(
+            name='word', type=data_type.integer_value_sequence(10000))
+        fc0 = layer.fc(input=input, size=100, act=activation.Sigmoid())
+        fc1 = layer.fc(input=input, size=200, act=activation.Sigmoid())
+        mixed0 = layer.mixed(
+            size=256,
+            input=[
+                layer.full_matrix_projection(input=fc0),
+                layer.full_matrix_projection(input=fc1)
+            ])
+        with layer.mixed(size=200) as mixed1:
+            mixed1 += layer.full_matrix_projection(input=fc0)
+            mixed1 += layer.identity_projection(input=fc1)
+
+        table = layer.table_projection(input=word)
+        emb0 = layer.mixed(size=512, input=table)
+        with layer.mixed(size=512) as emb1:
+            emb1 += table
+
+        scale = layer.scaling_projection(input=fc0)
+        scale0 = layer.mixed(size=100, input=scale)
+        with layer.mixed(size=100) as scale1:
+            scale1 += scale
+
+        dotmul = layer.dotmul_projection(input=fc0)
+        dotmul0 = layer.mixed(size=100, input=dotmul)
+        with layer.mixed(size=100) as dotmul1:
+            dotmul1 += dotmul
+
+        context = layer.context_projection(input=fc0, context_len=5)
+        context0 = layer.mixed(size=100, input=context)
+        with layer.mixed(size=100) as context1:
+            context1 += context
+
+        conv = layer.conv_projection(
+            input=input,
+            filter_size=1,
+            num_channels=1,
+            num_filters=128,
+            stride=1,
+            padding=0)
+        conv0 = layer.mixed(input=conv, bias_attr=True)
+        with layer.mixed(bias_attr=True) as conv1:
+            conv1 += conv
+
+        print layer.parse_network(mixed0)
+        print layer.parse_network(mixed1)
+        print layer.parse_network(emb0)
+        print layer.parse_network(emb1)
+        print layer.parse_network(scale0)
+        print layer.parse_network(scale1)
+        print layer.parse_network(dotmul0)
+        print layer.parse_network(dotmul1)
+        print layer.parse_network(conv0)
+        print layer.parse_network(conv1)
+
+    def test_operator(self):
+        ipt0 = layer.data(name='data', type=data_type.dense_vector(784))
+        ipt1 = layer.data(name='word', type=data_type.dense_vector(128))
+        fc0 = layer.fc(input=ipt0, size=100, act=activation.Sigmoid())
+        fc1 = layer.fc(input=ipt0, size=100, act=activation.Sigmoid())
+
+        dotmul_op = layer.dotmul_operator(a=fc0, b=fc1)
+        dotmul0 = layer.mixed(input=dotmul_op)
+        with layer.mixed() as dotmul1:
+            dotmul1 += dotmul_op
+
+        conv = layer.conv_operator(
+            img=ipt0,
+            filter=ipt1,
+            filter_size=1,
+            num_channels=1,
+            num_filters=128,
+            stride=1,
+            padding=0)
+        conv0 = layer.mixed(input=conv)
+        with layer.mixed() as conv1:
+            conv1 += conv
+
+        print layer.parse_network(dotmul0)
+        print layer.parse_network(dotmul1)
+        print layer.parse_network(conv0)
+        print layer.parse_network(conv1)
+
+
+class NetworkTests(unittest.TestCase):
+    def test_vgg(self):
+        img = layer.data(name='pixel', type=data_type.dense_vector(784))
+        vgg_out = networks.small_vgg(
+            input_image=img, num_channels=1, num_classes=2)
+        print layer.parse_network(vgg_out)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/tests/test_parameters.py b/python/paddle/v2/tests/test_parameters.py
new file mode 100644
index 0000000000000000000000000000000000000000..ebb182caab6430862a8e4da2ae4ea6b1e72f726c
--- /dev/null
+++ b/python/paddle/v2/tests/test_parameters.py
@@ -0,0 +1,60 @@
+import unittest
+import sys
+
+try:
+    import py_paddle
+
+    del py_paddle
+except ImportError:
+    print >> sys.stderr, "It seems the SWIG bindings of Paddle are not " \
+                         "installed, so this unittest will not be run."
+    sys.exit(0)
+
+import paddle.v2.parameters as parameters
+from paddle.proto.ParameterConfig_pb2 import ParameterConfig
+import random
+import cStringIO
+import numpy
+
+
+def __rand_param_config__(name):
+    conf = ParameterConfig()
+    conf.name = name
+    size = 1
+    for i in xrange(2):
+        dim = random.randint(1, 1000)
+        conf.dims.append(dim)
+        size *= dim
+    conf.size = size
+    assert conf.IsInitialized()
+    return conf
+
+
+class TestParameters(unittest.TestCase):
+    def test_serialization(self):
+        params = parameters.Parameters()
+        params.__append_config__(__rand_param_config__("param_0"))
+        params.__append_config__(__rand_param_config__("param_1"))
+
+        for name in params.names():
+            param = params.get(name)
+            param[:] = numpy.random.uniform(
+                -1.0, 1.0, size=params.get_shape(name))
+            params.set(name, param)
+
+        tmp_file = cStringIO.StringIO()
+        params.to_tar(tmp_file)
+        tmp_file.seek(0)
+        params_dup = parameters.Parameters.from_tar(tmp_file)
+
+        self.assertEqual(params_dup.names(), params.names())
+
+        for name in params.names():
+            self.assertEqual(params.get_shape(name), params_dup.get_shape(name))
+            p0 = params.get(name)
+            p1 = params_dup.get(name)
+            self.assertTrue(numpy.isclose(p0, p1).all())
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/tests/test_rnn_layer.py b/python/paddle/v2/tests/test_rnn_layer.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fbbd20eb76bb9daab2bcf98c4adad989106a377
--- /dev/null
+++ b/python/paddle/v2/tests/test_rnn_layer.py
@@ -0,0 +1,155 @@
+# Copyright PaddlePaddle contributors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import difflib
+import unittest
+
+import paddle.trainer_config_helpers as conf_helps
+import paddle.v2.activation as activation
+import paddle.v2.data_type as data_type
+import paddle.v2.layer as layer
+from paddle.trainer_config_helpers.config_parser_utils import \
+    parse_network_config as parse_network
+
+
+class RNNTest(unittest.TestCase):
+    def test_simple_rnn(self):
+        dict_dim = 10
+        word_dim = 8
+        hidden_dim = 8
+
+        def parse_old_rnn():
+            def step(y):
+                mem = conf_helps.memory(name="rnn_state", size=hidden_dim)
+                out = conf_helps.fc_layer(
+                    input=[y, mem],
+                    size=hidden_dim,
+                    act=activation.Tanh(),
+                    bias_attr=True,
+                    name="rnn_state")
+                return out
+
+            def test():
+                data = conf_helps.data_layer(name="word", size=dict_dim)
+                embd = conf_helps.embedding_layer(input=data, size=word_dim)
+                conf_helps.recurrent_group(name="rnn", step=step, input=embd)
+
+            return str(parse_network(test))
+
+        def parse_new_rnn():
+            def new_step(y):
+                mem = layer.memory(name="rnn_state", size=hidden_dim)
+                out = layer.fc(input=[y, mem],
+                               size=hidden_dim,
+                               act=activation.Tanh(),
+                               bias_attr=True,
+                               name="rnn_state")
+                return out
+
+            data = layer.data(
+                name="word", type=data_type.integer_value(dict_dim))
+            embd = layer.embedding(input=data, size=word_dim)
+            rnn_layer = layer.recurrent_group(
+                name="rnn", step=new_step, input=embd)
+            return str(layer.parse_network(rnn_layer))
+
+        diff = difflib.unified_diff(parse_old_rnn().splitlines(1),
+                                    parse_new_rnn().splitlines(1))
+        print ''.join(diff)
+
+    def test_sequence_rnn_multi_input(self):
+        dict_dim = 10
+        word_dim = 8
+        hidden_dim = 8
+        label_dim = 3
+
+        def parse_old_rnn():
+            def test():
+                data = conf_helps.data_layer(name="word", size=dict_dim)
+                label = conf_helps.data_layer(name="label", size=label_dim)
+                emb = conf_helps.embedding_layer(input=data, size=word_dim)
+                boot_layer = conf_helps.data_layer(name="boot", size=10)
+                boot_layer = conf_helps.fc_layer(
+                    name='boot_fc', input=boot_layer, size=10)
+
+                def step(y, wid):
+                    z = conf_helps.embedding_layer(input=wid, size=word_dim)
+                    mem = conf_helps.memory(
+                        name="rnn_state",
+                        size=hidden_dim,
+                        boot_layer=boot_layer)
+                    out = conf_helps.fc_layer(
+                        input=[y, z, mem],
+                        size=hidden_dim,
+                        act=conf_helps.TanhActivation(),
+                        bias_attr=True,
+                        name="rnn_state")
+                    return out
+
+                out = conf_helps.recurrent_group(
+                    name="rnn", step=step, input=[emb, data])
+
+                rep = conf_helps.last_seq(input=out)
+                prob = conf_helps.fc_layer(
+                    size=label_dim,
+                    input=rep,
+                    act=conf_helps.SoftmaxActivation(),
+                    bias_attr=True)
+
+                conf_helps.outputs(
+                    conf_helps.classification_cost(
+                        input=prob, label=label))
+
+            return str(parse_network(test))
+
+        def parse_new_rnn():
+            data = layer.data(
+                name="word", type=data_type.dense_vector(dict_dim))
+            label = layer.data(
+                name="label", type=data_type.dense_vector(label_dim))
+            emb = layer.embedding(input=data, size=word_dim)
+            boot_layer = layer.data(
+                name="boot", type=data_type.dense_vector(10))
+            boot_layer = layer.fc(name='boot_fc', input=boot_layer, size=10)
+
+            def step(y, wid):
+                z = layer.embedding(input=wid, size=word_dim)
+                mem = layer.memory(
+                    name="rnn_state", size=hidden_dim, boot_layer=boot_layer)
+                out = layer.fc(input=[y, z, mem],
+                               size=hidden_dim,
+                               act=activation.Tanh(),
+                               bias_attr=True,
+                               name="rnn_state")
+                return out
+
+            out = layer.recurrent_group(
+                name="rnn", step=step, input=[emb, data])
+
+            rep = layer.last_seq(input=out)
+            prob = layer.fc(size=label_dim,
+                            input=rep,
+                            act=activation.Softmax(),
+                            bias_attr=True)
+
+            cost = layer.classification_cost(input=prob, label=label)
+
+            return str(layer.parse_network(cost))
+
+        diff = difflib.unified_diff(parse_old_rnn().splitlines(1),
+                                    parse_new_rnn().splitlines(1))
+        print ''.join(diff)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/tests/test_topology.py b/python/paddle/v2/tests/test_topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..5c6dbcdb4f49b960fb8b71aecbad4f013d2cd283
--- /dev/null
+++ b/python/paddle/v2/tests/test_topology.py
@@ -0,0 +1,84 @@
+# Copyright PaddlePaddle contributors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import paddle.v2.layer as layer
+import paddle.v2.topology as topology
+import paddle.v2.data_type as data_type
+import paddle.trainer_config_helpers as conf_helps
+import paddle.trainer.PyDataProvider2 as pydp2
+
+
+class TestTopology(unittest.TestCase):
+    def test_data_type(self):
+        pixel = layer.data(name='pixel', type=data_type.dense_vector(784))
+        label = layer.data(name='label', type=data_type.integer_value(10))
+        hidden = layer.fc(input=pixel,
+                          size=100,
+                          act=conf_helps.SigmoidActivation())
+        inference = layer.fc(input=hidden,
+                             size=10,
+                             act=conf_helps.SoftmaxActivation())
+        cost = layer.classification_cost(input=inference, label=label)
+        topo = topology.Topology(cost)
+        data_types = topo.data_type()
+        self.assertEqual(len(data_types), 2)
+        pixel_data_type = filter(lambda type: type[0] == "pixel", data_types)
+        self.assertEqual(len(pixel_data_type), 1)
+        pixel_data_type = pixel_data_type[0]
+        self.assertEqual(pixel_data_type[1].type, pydp2.DataType.Dense)
+        self.assertEqual(pixel_data_type[1].dim, 784)
+
+        label_data_type = filter(lambda type: type[0] == "label", data_types)
+        self.assertEqual(len(label_data_type), 1)
+        label_data_type = label_data_type[0]
+        self.assertEqual(label_data_type[1].type, pydp2.DataType.Index)
+        self.assertEqual(label_data_type[1].dim, 10)
+
+    def test_get_layer(self):
+        pixel = layer.data(name='pixel', type=data_type.dense_vector(784))
+        label = layer.data(name='label', type=data_type.integer_value(10))
+        hidden = layer.fc(input=pixel,
+                          size=100,
+                          act=conf_helps.SigmoidActivation())
+        inference = layer.fc(input=hidden,
+                             size=10,
+                             act=conf_helps.SoftmaxActivation())
+        cost = layer.classification_cost(input=inference, label=label)
+        topo = topology.Topology(cost)
+        pixel_layer = topo.get_layer("pixel")
+        label_layer = topo.get_layer("label")
+        self.assertEqual(pixel_layer, pixel)
+        self.assertEqual(label_layer, label)
+
+    def test_parse(self):
+        pixel = layer.data(name='pixel', type=data_type.dense_vector(784))
+        label = layer.data(name='label', type=data_type.integer_value(10))
+        hidden = layer.fc(input=pixel,
+                          size=100,
+                          act=conf_helps.SigmoidActivation())
+        inference = layer.fc(input=hidden,
+                             size=10,
+                             act=conf_helps.SoftmaxActivation())
+        maxid = layer.max_id(input=inference)
+        cost1 = layer.classification_cost(input=inference, label=label)
+        cost2 = layer.cross_entropy_cost(input=inference, label=label)
+
+        topology.Topology(cost2).proto()
+        topology.Topology([cost1]).proto()
+        topology.Topology([cost1, cost2]).proto()
+        topology.Topology([inference, maxid]).proto()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/topology.py b/python/paddle/v2/topology.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0679c5675b0c0f24f28f3df22efd4eb51ccbb3a
--- /dev/null
+++ b/python/paddle/v2/topology.py
@@ -0,0 +1,114 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+
+from paddle.proto.ModelConfig_pb2 import ModelConfig
+
+import layer as v2_layer
+from layer import WithExtraParent
+
+__all__ = ['Topology']
+
+
+def __flatten__(lis):
+    """
+    Given a list, possibly nested to any level, return it flattened.
+    """
+    new_lis = []
+    for item in lis:
+        if isinstance(item, collections.Sequence):
+            new_lis.extend(__flatten__(item))
+        else:
+            new_lis.append(item)
+    return new_lis
+
+
+def __bfs_travel__(callback, *layers):
+    layers = __flatten__(layers)
+    for each_layer in layers:
+        __break__ = callback(each_layer)
+        if __break__:
+            return
+        __layers__ = each_layer.__parent_layers__.values()
+        if isinstance(each_layer, WithExtraParent):
+            __layers__ = __layers__ + each_layer.extra_parent()
+        __bfs_travel__(callback, *__layers__)
+
+
+class Topology(object):
+    """
+    Topology stores the information about all layers and the network
+    configuration.
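+
+    An illustrative sketch (the layer names follow the tests in
+    python/paddle/v2/tests/test_topology.py):
+
+    .. code-block:: python
+
+        pixel = layer.data(name='pixel', type=data_type.dense_vector(784))
+        label = layer.data(name='label', type=data_type.integer_value(10))
+        inference = layer.fc(input=pixel, size=10,
+                             act=conf_helps.SoftmaxActivation())
+        cost = layer.classification_cost(input=inference, label=label)
+        topo = Topology(cost)
+        print topo.data_type()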
+    """
+
+    def __init__(self, layers):
+        if not isinstance(layers, collections.Sequence):
+            __check_layer_type__(layers)
+            layers = [layers]
+        for layer in layers:
+            __check_layer_type__(layer)
+        self.layers = layers
+        self.__model_config__ = v2_layer.parse_network(*layers)
+        assert isinstance(self.__model_config__, ModelConfig)
+
+    def proto(self):
+        return self.__model_config__
+
+    def get_layer(self, name):
+        """
+        Get the v2.Layer instance with the given layer name.
+
+        :param name: name of the layer to look up.
+        :return: the matching layer instance.
+        :raises ValueError: if no layer with the given name exists.
+        """
+        result_layer = [None]
+
+        def __impl__(l):
+            if l.name == name:
+                result_layer[0] = l
+                return True  # break
+            return False
+
+        __bfs_travel__(__impl__, *self.layers)
+        if result_layer[0] is None:
+            raise ValueError("No such layer %s" % name)
+        return result_layer[0]
+
+    def data_layers(self):
+        """
+        Get all data layers in the topology.
+
+        :return: a dict mapping layer name to data layer.
+        """
+        data_layers = dict()
+
+        def __impl__(l):
+            if isinstance(l, v2_layer.DataLayerV2):
+                data_layers[l.name] = l
+
+        __bfs_travel__(__impl__, *self.layers)
+        return data_layers
+
+    def data_type(self):
+        """
+        Get the data types of the input layers from the proto, e.g.:
+        [('image', dense_vector(768)), ('label', integer_value(10))]
+        """
+        data_layers = self.data_layers()
+        return [(nm, data_layers[nm].type)
+                for nm in self.proto().input_layer_names]
+
+
+def __check_layer_type__(layer):
+    if not isinstance(layer, v2_layer.LayerV2):
+        raise ValueError('layer should have type paddle.layer.Layer')
diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bd3e2c565ee00c91402e7dea36c7393fb1a9bdf
--- /dev/null
+++ b/python/paddle/v2/trainer.py
@@ -0,0 +1,152 @@
+import collections
+
+import py_paddle.swig_paddle as api
+
+from data_feeder import DataFeeder
+from topology import Topology
+from . import event as v2_event
+from . import optimizer as v2_optimizer
+from . import parameters as v2_parameters
+
+__all__ = ['SGD']
+"""
+Trainer package
+TODO(yuyang18): Complete comments.
+"""
+
+
+def default_event_handler(event):
+    """
+    Default event handler. It is intended to print some logs and save the
+    model, but is currently a no-op.
+
+    TODO(yuyang18): Complete it!
+    :param event: the event to handle.
+    :return: None
+    """
+    pass
+
+
+class SGD(object):
+    """
+    Simple SGD Trainer.
+    TODO(yuyang18): Complete comments
+
+    :param update_equation: The optimizer object.
+    :type update_equation: paddle.v2.optimizer.Optimizer
+    :param cost: Target cost that neural network should be optimized.
+    :type cost: paddle.v2.config_base.Layer
+    :param parameters: The parameters dictionary.
+    :type parameters: paddle.v2.parameters.Parameters
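+
+    A construction sketch (the optimizer settings are illustrative
+    assumptions):
+
+    .. code-block:: python
+
+        optimizer = paddle.v2.optimizer.Momentum(learning_rate=1e-3)
+        trainer = SGD(cost=cost, parameters=parameters,
+                      update_equation=optimizer)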
+    """
+
+    def __init__(self, cost, parameters, update_equation):
+
+        if not isinstance(parameters, v2_parameters.Parameters):
+            raise TypeError('parameters should be '
+                            'paddle.v2.parameters.Parameters')
+
+        if not isinstance(update_equation, v2_optimizer.Optimizer):
+            raise TypeError("update equation parameter must be "
+                            "paddle.v2.optimizer.Optimizer")
+        topology = Topology(cost)
+        self.__optimizer__ = update_equation
+        self.__topology__ = topology
+        self.__parameters__ = parameters
+        self.__topology_in_proto__ = topology.proto()
+        self.__data_types__ = topology.data_type()
+        gm = api.GradientMachine.createFromConfigProto(
+            self.__topology_in_proto__, api.CREATE_MODE_NORMAL,
+            self.__optimizer__.enable_types())
+        assert isinstance(gm, api.GradientMachine)
+        self.__gradient_machine__ = gm
+        self.__gradient_machine__.randParameters()
+        parameters.append_gradient_machine(gm)
+
+    def train(self, reader, num_passes=1, event_handler=None, feeding=None):
+        """
+        Training method. Trains num_passes passes over the input data.
+
+        :param reader: a reader creator returning the training data.
+        :param num_passes: the total number of training passes.
+        :param event_handler: event handler. A method that will be invoked
+                              when an event occurs.
+        :type event_handler: (BaseEvent) => None
+        :param feeding: a map from neural network input name to the index
+                        of that input in the entries the reader returns.
+        :type feeding: dict
+        :return: None
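+
+        A usage sketch (the reader and feeding map are assumptions that
+        must match the network's data layers):
+
+        .. code-block:: python
+
+            trainer.train(reader=train_reader,
+                          num_passes=10,
+                          feeding={'pixel': 0, 'label': 1})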
+        """
+        if event_handler is None:
+            event_handler = default_event_handler
+        __check_train_args__(**locals())
+
+        updater = self.__optimizer__.create_local_updater()
+        updater.init(self.__gradient_machine__)
+
+        self.__gradient_machine__.start()
+        batch_evaluator = self.__gradient_machine__.makeEvaluator()
+        assert isinstance(batch_evaluator, api.Evaluator)
+        pass_evaluator = self.__gradient_machine__.makeEvaluator()
+        assert isinstance(pass_evaluator, api.Evaluator)
+        out_args = api.Arguments.createArguments(0)
+        feeder = DataFeeder(self.__data_types__, feeding)
+        for pass_id in xrange(num_passes):
+            event_handler(v2_event.BeginPass(pass_id))
+            pass_evaluator.start()
+            updater.startPass()
+            for batch_id, data_batch in enumerate(reader()):
+                batch_evaluator.start()
+                event_handler(
+                    v2_event.BeginIteration(
+                        pass_id=pass_id, batch_id=batch_id))
+                pass_type = updater.startBatch(len(data_batch))
+                self.__gradient_machine__.forwardBackward(
+                    feeder(data_batch), out_args, pass_type)
+                self.__gradient_machine__.eval(pass_evaluator)
+                self.__gradient_machine__.eval(batch_evaluator)
+                for each_param in self.__gradient_machine__.getNonStaticParameters(
+                ):
+                    updater.update(each_param)
+                cost_sum = out_args.sum()
+                cost = cost_sum / len(data_batch)
+                updater.finishBatch(cost)
+                batch_evaluator.finish()
+                event_handler(
+                    v2_event.EndIteration(
+                        pass_id=pass_id,
+                        batch_id=batch_id,
+                        cost=cost,
+                        evaluator=batch_evaluator))
+
+            updater.finishPass()
+            pass_evaluator.finish()
+            event_handler(v2_event.EndPass(pass_id, evaluator=pass_evaluator))
+        self.__gradient_machine__.finish()
+
+    def test(self, reader, feeding=None):
+        feeder = DataFeeder(self.__data_types__, feeding)
+        evaluator = self.__gradient_machine__.makeEvaluator()
+        out_args = api.Arguments.createArguments(0)
+        evaluator.start()
+        total_cost = 0
+        num_samples = 0.0
+        for data_batch in reader():
+            num_samples += len(data_batch)
+            self.__gradient_machine__.forward(
+                feeder(data_batch), out_args, api.PASS_TEST)
+            total_cost += out_args.sum()
+            self.__gradient_machine__.eval(evaluator)
+
+        evaluator.finish()
+        return v2_event.TestResult(
+            evaluator=evaluator, cost=total_cost / num_samples)
+
+
+def __check_train_args__(reader, event_handler, **kwargs):
+    """
+    Check train function's argument types
+    """
+    if not callable(reader) or not isinstance(reader(), collections.Iterator):
+        raise TypeError('train_data_reader should be a function '
+                        'which can return an iterator')
+    if not callable(event_handler):
+        raise TypeError('event handler should be a function')
diff --git a/python/setup.py.in b/python/setup.py.in
index 1e1324eea825ab1945a38cb43eceec29a4ebb1a1..68ca35265cf13265ad0b171b0f70e20b83006ff9 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -5,7 +5,9 @@ packages=['paddle',
           'paddle.trainer',
           'paddle.trainer_config_helpers',
           'paddle.utils',
-          'paddle.v2']
+          'paddle.v2',
+          'paddle.v2.dataset',
+          'paddle.v2.reader']
 
 setup(name='paddle',
       version='${PADDLE_VERSION}',