Commit 4380e73f authored by Yu Yang

Merge branch 'develop' of github.com:baidu/Paddle into feature/c_api

......@@ -8,17 +8,10 @@ sudo: required
dist: trusty
os:
- linux
- osx
env:
- JOB=DOCS
- JOB=BUILD_AND_TEST
- JOB=PRE_COMMIT
matrix:
exclude:
- os: osx
env: JOB=DOCS # Only generate documentation in linux.
- os: osx
env: JOB=PRE_COMMIT # Only check pre-commit hook in linux
addons:
apt:
......@@ -52,9 +45,10 @@ before_install:
fi
fi
fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
- if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
- pip install numpy wheel protobuf sphinx recommonmark sphinx_rtd_theme virtualenv pre-commit requests==2.9.2 LinkChecker
# Paddle currently uses protobuf 3.1. Protobuf 3.2 breaks compatibility, so we pin the Python
# protobuf version.
- pip install numpy wheel 'protobuf==3.1' sphinx recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker
script:
- paddle/scripts/travis/main.sh
notifications:
......
......@@ -72,7 +72,7 @@ function( Sphinx_add_target target_name builder conf cache source destination )
${source}
${destination}
COMMENT "Generating sphinx documentation: ${builder}"
COMMAND ln -sf ${destination}/index_*.html ${destination}/index.html
COMMAND cd ${destination} && ln -s ./index_*.html index.html
)
set_property(
......
......@@ -16,7 +16,8 @@
set(CBLAS_FOUND OFF)
## Find MKL First.
set(MKL_ROOT $ENV{MKLROOT} CACHE PATH "Folder contains MKL")
set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs")
set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL")
find_path(MKL_INCLUDE_DIR mkl.h PATHS
${MKL_ROOT}/include)
......
......@@ -110,14 +110,13 @@ endmacro()
# Get the coverage data.
file(GLOB_RECURSE GCDA_FILES "${COV_PATH}" "*.gcda")
message("GCDA files:")
message("Process GCDA files:")
message("===============================")
# Get a list of all the object directories needed by gcov
# (The directories the .gcda files and .o files are found in)
# and run gcov on those.
foreach(GCDA ${GCDA_FILES})
message("Process: ${GCDA}")
message("------------------------------------------------------------------------------")
get_filename_component(GCDA_DIR ${GCDA} PATH)
#
......@@ -135,7 +134,7 @@ foreach(GCDA ${GCDA_FILES})
# If -p is not specified then the file is named only "the_file.c.gcov"
#
execute_process(
COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA}
COMMAND "${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA} >/dev/null"
WORKING_DIRECTORY ${GCDA_DIR}
)
endforeach()
......@@ -383,7 +382,6 @@ foreach(NOT_COVERED_SRC ${COVERAGE_SRCS_REMAINING})
set(GCOV_FILE_COVERAGE "${GCOV_FILE_COVERAGE}]")
# Generate the final JSON for this file.
message("Generate JSON for non-gcov file: ${NOT_COVERED_SRC}...")
string(CONFIGURE ${SRC_FILE_TEMPLATE} FILE_JSON)
set(JSON_GCOV_FILES "${JSON_GCOV_FILES}${FILE_JSON}, ")
endforeach()
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
......@@ -29,12 +29,14 @@ INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
ExternalProject_Add(
glog
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS gflags
GIT_REPOSITORY "https://github.com/google/glog.git"
PREFIX ${GLOG_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
CMAKE_ARGS -DCMAKE_POSITION_INDEPENDENT_CODE=ON
CMAKE_ARGS -DWITH_GFLAGS=OFF
CMAKE_ARGS -DWITH_GFLAGS=ON
CMAKE_ARGS -Dgflags_DIR=${GFLAGS_INSTALL_DIR}/lib/cmake/gflags
CMAKE_ARGS -DBUILD_TESTING=OFF
)
......
......@@ -14,50 +14,50 @@
INCLUDE(ExternalProject)
SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/protobuf)
SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/protobuf)
SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" CACHE PATH "protobuf include directory." FORCE)
FIND_PACKAGE(Protobuf)
INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
IF(NOT PROTOBUF_FOUND)
SET(PROTOBUF_SOURCES_DIR ${THIRD_PARTY_PATH}/protobuf)
SET(PROTOBUF_INSTALL_DIR ${THIRD_PARTY_PATH}/install/protobuf)
SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" CACHE PATH "protobuf include directory." FORCE)
IF(WIN32)
SET(PROTOBUF_LITE_LIBRARY
"${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.lib" CACHE FILEPATH "protobuf lite library." FORCE)
SET(PROTOBUF_LIBRARY
"${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.lib" CACHE FILEPATH "protobuf library." FORCE)
SET(PROTOBUF_PROTOC_LIBRARY
"${PROTOBUF_INSTALL_DIR}/lib/libprotoc.lib" CACHE FILEPATH "protoc library." FORCE)
SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc.exe" CACHE FILEPATH "protobuf executable." FORCE)
ELSE(WIN32)
SET(PROTOBUF_LITE_LIBRARY
"${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE)
SET(PROTOBUF_LIBRARY
"${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE)
SET(PROTOBUF_PROTOC_LIBRARY
"${PROTOBUF_INSTALL_DIR}/lib/libprotoc.a" CACHE FILEPATH "protoc library." FORCE)
SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc" CACHE FILEPATH "protobuf executable." FORCE)
ENDIF(WIN32)
IF(WIN32)
SET(PROTOBUF_LITE_LIBRARY
"${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.lib" CACHE FILEPATH "protobuf lite library." FORCE)
SET(PROTOBUF_LIBRARY
"${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.lib" CACHE FILEPATH "protobuf library." FORCE)
SET(PROTOBUF_PROTOC_LIBRARY
"${PROTOBUF_INSTALL_DIR}/lib/libprotoc.lib" CACHE FILEPATH "protoc library." FORCE)
SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc.exe" CACHE FILEPATH "protobuf executable." FORCE)
ELSE(WIN32)
IF(${HOST_SYSTEM} STREQUAL "centos")
SET(LIB "lib64")
ELSE()
SET(LIB "lib")
ENDIF()
SET(PROTOBUF_LITE_LIBRARY
"${PROTOBUF_INSTALL_DIR}/${LIB}/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE)
SET(PROTOBUF_LIBRARY
"${PROTOBUF_INSTALL_DIR}/${LIB}/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE)
SET(PROTOBUF_PROTOC_LIBRARY
"${PROTOBUF_INSTALL_DIR}/${LIB}/libprotoc.a" CACHE FILEPATH "protoc library." FORCE)
SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc" CACHE FILEPATH "protobuf executable." FORCE)
ENDIF(WIN32)
ExternalProject_Add(
protobuf
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${PROTOBUF_SOURCES_DIR}
UPDATE_COMMAND ""
DEPENDS zlib
GIT_REPOSITORY "https://github.com/google/protobuf.git"
GIT_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546"
CONFIGURE_COMMAND
${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake
-Dprotobuf_BUILD_TESTS=OFF
-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=lib
)
ExternalProject_Add(
protobuf
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${PROTOBUF_SOURCES_DIR}
UPDATE_COMMAND ""
DEPENDS zlib
GIT_REPOSITORY "https://github.com/google/protobuf.git"
GIT_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546"
CONFIGURE_COMMAND
${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake
-Dprotobuf_BUILD_TESTS=OFF
-DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=Release
-DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
)
LIST(APPEND external_project_dependencies protobuf)
ENDIF(NOT PROTOBUF_FOUND)
LIST(APPEND external_project_dependencies protobuf)
INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
......@@ -26,10 +26,10 @@ IF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
find_python_module(wheel REQUIRED)
find_python_module(google.protobuf REQUIRED)
FIND_PACKAGE(NumPy REQUIRED)
IF(${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, "
"please use pip to upgrade protobuf.")
ENDIF(${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
"please use pip to upgrade protobuf. pip install -U protobuf")
ENDIF()
ELSE(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
MESSAGE(FATAL_ERROR "Please install python 2.7 before building PaddlePaddle.")
##################################### PYTHON ########################################
......@@ -221,7 +221,3 @@ ENDIF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
MESSAGE("[Paddle] Python Executable: ${PYTHON_EXECUTABLE}")
MESSAGE("[Paddle] Python Include: ${PYTHON_INCLUDE_DIRS}")
MESSAGE("[Paddle] Python Libraries: ${PYTHON_LIBRARIES}")
......@@ -54,6 +54,7 @@ ExternalProject_Add(
CMAKE_ARGS -DWITH_GPU=${WITH_GPU}
CMAKE_ARGS -DWITH_OMP=${USE_OMP}
CMAKE_ARGS -DWITH_TORCH=OFF
CMAKE_ARGS -DCMAKE_DISABLE_FIND_PACKAGE_Torch=TRUE
CMAKE_ARGS -DBUILD_SHARED=ON
)
......
......@@ -12,6 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Detects the OS and sets appropriate variables.
# CMAKE_SYSTEM_NAME only gives us a coarse-grained name,
# but a name like "centos" is necessary in some scenarios
# to distinguish the system for customization.
#
# For instance, the protobuf libs path is <install_dir>/lib64
# on CentOS, but <install_dir>/lib on other systems.
IF(WIN32)
SET(HOST_SYSTEM "win32")
ELSE(WIN32)
......@@ -30,6 +38,10 @@ ELSE(WIN32)
SET(HOST_SYSTEM "debian")
ELSEIF(LINUX_ISSUE MATCHES "Ubuntu")
SET(HOST_SYSTEM "ubuntu")
ELSEIF(LINUX_ISSUE MATCHES "Red Hat")
SET(HOST_SYSTEM "redhat")
ELSEIF(LINUX_ISSUE MATCHES "Fedora")
SET(HOST_SYSTEM "fedora")
ENDIF()
ENDIF(EXISTS "/etc/issue")
......@@ -40,6 +52,10 @@ ELSE(WIN32)
ENDIF()
ENDIF(EXISTS "/etc/redhat-release")
IF(NOT HOST_SYSTEM)
SET(HOST_SYSTEM ${CMAKE_SYSTEM_NAME})
ENDIF()
ENDIF(APPLE)
ENDIF(WIN32)
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.v2 as paddle
__all__ = ['resnet_cifar10']
def conv_bn_layer(input,
ch_out,
filter_size,
stride,
padding,
active_type=paddle.activation.Relu(),
ch_in=None):
tmp = paddle.layer.img_conv(
input=input,
filter_size=filter_size,
num_channels=ch_in,
num_filters=ch_out,
stride=stride,
padding=padding,
act=paddle.activation.Linear(),
bias_attr=False)
return paddle.layer.batch_norm(input=tmp, act=active_type)
def shortcut(ipt, n_in, n_out, stride):
if n_in != n_out:
return conv_bn_layer(ipt, n_out, 1, stride, 0,
paddle.activation.Linear())
else:
return ipt
def basicblock(ipt, ch_out, stride):
ch_in = ch_out * 2
tmp = conv_bn_layer(ipt, ch_out, 3, stride, 1)
tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, paddle.activation.Linear())
short = shortcut(ipt, ch_in, ch_out, stride)
return paddle.layer.addto(input=[tmp, short], act=paddle.activation.Relu())
def layer_warp(block_func, ipt, features, count, stride):
tmp = block_func(ipt, features, stride)
for i in range(1, count):
tmp = block_func(tmp, features, 1)
return tmp
def resnet_cifar10(ipt, depth=32):
# depth should be one of 20, 32, 44, 56, 110, 1202
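    # e.g. depth=32 gives n = (32 - 2) / 6 = 5 basic blocks per stage.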
assert (depth - 2) % 6 == 0
n = (depth - 2) / 6
nStages = {16, 64, 128}
conv1 = conv_bn_layer(
ipt, ch_in=3, ch_out=16, filter_size=3, stride=1, padding=1)
res1 = layer_warp(basicblock, conv1, 16, n, 1)
res2 = layer_warp(basicblock, res1, 32, n, 2)
res3 = layer_warp(basicblock, res2, 64, n, 2)
pool = paddle.layer.img_pool(
input=res3, pool_size=8, stride=1, pool_type=paddle.pooling.Avg())
return pool
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
import sys
import paddle.v2 as paddle
from api_v2_vgg import vgg_bn_drop
from api_v2_resnet import resnet_cifar10
def main():
datadim = 3 * 32 * 32
classdim = 10
# PaddlePaddle init
paddle.init(use_gpu=True, trainer_count=1)
image = paddle.layer.data(
name="image", type=paddle.data_type.dense_vector(datadim))
# Add neural network config
# option 1. resnet
net = resnet_cifar10(image, depth=32)
# option 2. vgg
# net = vgg_bn_drop(image)
out = paddle.layer.fc(input=net,
size=classdim,
act=paddle.activation.Softmax())
lbl = paddle.layer.data(
name="label", type=paddle.data_type.integer_value(classdim))
cost = paddle.layer.classification_cost(input=out, label=lbl)
# Create parameters
parameters = paddle.parameters.create(cost)
# Create optimizer
momentum_optimizer = paddle.optimizer.Momentum(
momentum=0.9,
regularization=paddle.optimizer.L2Regularization(rate=0.0002 * 128),
learning_rate=0.1 / 128.0,
learning_rate_decay_a=0.1,
learning_rate_decay_b=50000 * 100,
learning_rate_schedule='discexp',
batch_size=128)
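    # Assumed semantics of the 'discexp' schedule (per Paddle's optimizer docs):
    # lr = learning_rate * learning_rate_decay_a ** floor(samples_seen / learning_rate_decay_b)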
# End batch and end pass event handler
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print "\nPass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
else:
sys.stdout.write('.')
sys.stdout.flush()
if isinstance(event, paddle.event.EndPass):
result = trainer.test(
reader=paddle.batch(
paddle.dataset.cifar.test10(), batch_size=128),
reader_dict={'image': 0,
'label': 1})
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
# Create trainer
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=momentum_optimizer)
trainer.train(
reader=paddle.batch(
paddle.reader.shuffle(
paddle.dataset.cifar.train10(), buf_size=50000),
batch_size=128),
num_passes=5,
event_handler=event_handler,
reader_dict={'image': 0,
'label': 1})
if __name__ == '__main__':
main()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.v2 as paddle
__all__ = ['vgg_bn_drop']
def vgg_bn_drop(input):
def conv_block(ipt, num_filter, groups, dropouts, num_channels=None):
return paddle.networks.img_conv_group(
input=ipt,
num_channels=num_channels,
pool_size=2,
pool_stride=2,
conv_num_filter=[num_filter] * groups,
conv_filter_size=3,
conv_act=paddle.activation.Relu(),
conv_with_batchnorm=True,
conv_batchnorm_drop_rate=dropouts,
pool_type=paddle.pooling.Max())
conv1 = conv_block(input, 64, 2, [0.3, 0], 3)
conv2 = conv_block(conv1, 128, 2, [0.4, 0])
conv3 = conv_block(conv2, 256, 3, [0.4, 0.4, 0])
conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0])
conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0])
drop = paddle.layer.dropout(input=conv5, dropout_rate=0.5)
fc1 = paddle.layer.fc(input=drop, size=512, act=paddle.activation.Linear())
bn = paddle.layer.batch_norm(
input=fc1,
act=paddle.activation.Relu(),
layer_attr=paddle.attr.Extra(drop_rate=0.5))
fc2 = paddle.layer.fc(input=bn, size=512, act=paddle.activation.Linear())
return fc2
......@@ -126,7 +126,7 @@ class ImageClassifier():
# For oversampling, average predictions across crops.
# If not oversampling, the shape of output[name] is (1, class_number);
# the mean is still applicable.
return output[output_layer].mean(0)
return output[output_layer]['value'].mean(0)
def predict(self, image=None, output_layer=None):
assert isinstance(image, basestring)
......
import paddle.v2 as paddle
import paddle.v2.dataset.uci_housing as uci_housing
def main():
# init
paddle.init(use_gpu=False, trainer_count=1)
# network config
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
y_predict = paddle.layer.fc(input=x,
param_attr=paddle.attr.Param(name='w'),
size=1,
act=paddle.activation.Linear(),
bias_attr=paddle.attr.Param(name='b'))
y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
cost = paddle.layer.regression_cost(input=y_predict, label=y)
# create parameters
parameters = paddle.parameters.create(cost)
# create optimizer
optimizer = paddle.optimizer.Momentum(momentum=0)
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=optimizer)
# event_handler to print training and testing info
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print "Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
if isinstance(event, paddle.event.EndPass):
result = trainer.test(
reader=paddle.reader.batched(
uci_housing.test(), batch_size=2),
reader_dict={'x': 0,
'y': 1})
if event.pass_id % 10 == 0:
print "Test %d, %s" % (event.pass_id, result.metrics)
# training
trainer.train(
reader=paddle.reader.batched(
paddle.reader.shuffle(
uci_housing.train(), buf_size=500),
batch_size=2),
reader_dict={'x': 0,
'y': 1},
event_handler=event_handler,
num_passes=30)
if __name__ == '__main__':
main()
......@@ -5,3 +5,6 @@ plot.png
train.log
*pyc
.ipynb_checkpoints
params.pkl
params.tar
params.tar.gz
......@@ -6,33 +6,15 @@ passed to C++ side of Paddle.
The user API could be simpler and more carefully designed.
"""
import py_paddle.swig_paddle as api
from py_paddle import DataProviderConverter
import paddle.trainer.PyDataProvider2 as dp
import numpy as np
import random
from mnist_util import read_from_mnist
from paddle.trainer_config_helpers import *
def optimizer_config():
settings(
learning_rate=1e-4,
learning_method=AdamOptimizer(),
batch_size=1000,
model_average=ModelAverage(average_window=0.5),
regularization=L2Regularization(rate=0.5))
import numpy as np
import paddle.v2 as paddle_v2
import py_paddle.swig_paddle as api
from paddle.trainer_config_helpers import *
from py_paddle import DataProviderConverter
def network_config():
imgs = data_layer(name='pixel', size=784)
hidden1 = fc_layer(input=imgs, size=200)
hidden2 = fc_layer(input=hidden1, size=200)
inference = fc_layer(input=hidden2, size=10, act=SoftmaxActivation())
cost = classification_cost(
input=inference, label=data_layer(
name='label', size=10))
outputs(cost)
from mnist_util import read_from_mnist
def init_parameter(network):
......@@ -75,19 +57,35 @@ def input_order_converter(generator):
def main():
api.initPaddle("-use_gpu=false", "-trainer_count=4") # use 4 cpu cores
# get enable_types for each optimizer.
# enable_types = [value, gradient, momentum, etc]
# For each optimizer(SGD, Adam), GradientMachine should enable different
# buffers.
opt_config_proto = parse_optimizer_config(optimizer_config)
opt_config = api.OptimizationConfig.createFromProto(opt_config_proto)
_temp_optimizer_ = api.ParameterOptimizer.create(opt_config)
enable_types = _temp_optimizer_.getParameterTypes()
optimizer = paddle_v2.optimizer.Adam(
learning_rate=1e-4,
batch_size=1000,
model_average=ModelAverage(average_window=0.5),
regularization=L2Regularization(rate=0.5))
# Create a local updater. "Local" means it does not run on a cluster.
# For cluster training, we can change this to createRemoteUpdater
# in the future.
updater = optimizer.create_local_updater()
assert isinstance(updater, api.ParameterUpdater)
# define network
images = paddle_v2.layer.data(
name='pixel', type=paddle_v2.data_type.dense_vector(784))
label = paddle_v2.layer.data(
name='label', type=paddle_v2.data_type.integer_value(10))
hidden1 = paddle_v2.layer.fc(input=images, size=200)
hidden2 = paddle_v2.layer.fc(input=hidden1, size=200)
inference = paddle_v2.layer.fc(input=hidden2,
size=10,
act=paddle_v2.activation.Softmax())
cost = paddle_v2.layer.classification_cost(input=inference, label=label)
# Create Simple Gradient Machine.
model_config = parse_network_config(network_config)
m = api.GradientMachine.createFromConfigProto(
model_config, api.CREATE_MODE_NORMAL, enable_types)
model_config = paddle_v2.layer.parse_network(cost)
m = api.GradientMachine.createFromConfigProto(model_config,
api.CREATE_MODE_NORMAL,
optimizer.enable_types())
# This type check is not useful. Only enable type hint in IDE.
# Such as PyCharm
......@@ -96,19 +94,12 @@ def main():
# Initialize Parameter by numpy.
init_parameter(network=m)
# Create a local updater. "Local" means it does not run on a cluster.
# For cluster training, we can change this to createRemoteUpdater
# in the future.
updater = api.ParameterUpdater.createLocalUpdater(opt_config)
assert isinstance(updater, api.ParameterUpdater)
# Initialize ParameterUpdater.
updater.init(m)
# DataProviderConverter is a utility that converts Python objects to Paddle's
# C++ input. The input format is the same as Paddle's DataProvider.
converter = DataProviderConverter(
input_types=[dp.dense_vector(784), dp.integer_value(10)])
converter = DataProviderConverter(input_types=[images.type, label.type])
train_file = './data/raw_data/train'
test_file = './data/raw_data/t10k'
......
import paddle.v2 as paddle
import gzip
def softmax_regression(img):
predict = paddle.layer.fc(input=img,
size=10,
act=paddle.activation.Softmax())
return predict
def multilayer_perceptron(img):
# The first fully-connected layer
hidden1 = paddle.layer.fc(input=img, size=128, act=paddle.activation.Relu())
# The second fully-connected layer and the according activation function
hidden2 = paddle.layer.fc(input=hidden1,
size=64,
act=paddle.activation.Relu())
# The third fully-connected layer, note that the hidden size should be 10,
# which is the number of unique digits
predict = paddle.layer.fc(input=hidden2,
size=10,
act=paddle.activation.Softmax())
return predict
def convolutional_neural_network(img):
# first conv layer
conv_pool_1 = paddle.networks.simple_img_conv_pool(
input=img,
filter_size=5,
num_filters=20,
num_channel=1,
pool_size=2,
pool_stride=2,
act=paddle.activation.Tanh())
# second conv layer
conv_pool_2 = paddle.networks.simple_img_conv_pool(
input=conv_pool_1,
filter_size=5,
num_filters=50,
num_channel=20,
pool_size=2,
pool_stride=2,
act=paddle.activation.Tanh())
# The first fully-connected layer
fc1 = paddle.layer.fc(input=conv_pool_2,
size=128,
act=paddle.activation.Tanh())
# The softmax layer, note that the hidden size should be 10,
# which is the number of unique digits
predict = paddle.layer.fc(input=fc1,
size=10,
act=paddle.activation.Softmax())
return predict
def main():
paddle.init(use_gpu=False, trainer_count=1)
# define network topology
images = paddle.layer.data(
name='pixel', type=paddle.data_type.dense_vector(784))
label = paddle.layer.data(
name='label', type=paddle.data_type.integer_value(10))
# Here we can build the prediction network in different ways. Please
# choose one by uncommenting the corresponding line.
predict = softmax_regression(images)
#predict = multilayer_perceptron(images)
#predict = convolutional_neural_network(images)
cost = paddle.layer.classification_cost(input=predict, label=label)
try:
with gzip.open('params.tar.gz', 'r') as f:
parameters = paddle.parameters.Parameters.from_tar(f)
except IOError:
parameters = paddle.parameters.create(cost)
optimizer = paddle.optimizer.Momentum(
learning_rate=0.1 / 128.0,
momentum=0.9,
regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128))
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=optimizer)
lists = []
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 1000 == 0:
result = trainer.test(reader=paddle.reader.batched(
paddle.dataset.mnist.test(), batch_size=256))
print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics,
result.metrics)
with gzip.open('params.tar.gz', 'w') as f:
parameters.to_tar(f)
elif isinstance(event, paddle.event.EndPass):
result = trainer.test(reader=paddle.reader.batched(
paddle.dataset.mnist.test(), batch_size=128))
print "Test with Pass %d, Cost %f, %s\n" % (
event.pass_id, result.cost, result.metrics)
lists.append((event.pass_id, result.cost,
result.metrics['classification_error_evaluator']))
trainer.train(
reader=paddle.batch(
paddle.reader.shuffle(
paddle.dataset.mnist.train(), buf_size=8192),
batch_size=128),
event_handler=event_handler,
num_passes=100)
# find the best pass
best = sorted(lists, key=lambda list: float(list[1]))[0]
print 'Best pass is %s, testing Avgcost is %s' % (best[0], best[1])
print 'The classification accuracy is %.2f%%' % (100 - float(best[2]) * 100)
# output is a softmax layer. It returns probabilities.
# Shape should be (100, 10)
probs = paddle.infer(
output=predict,
parameters=parameters,
reader=paddle.batch(
paddle.reader.firstn(
paddle.reader.map_readers(lambda item: (item[0], ),
paddle.dataset.mnist.test()),
n=100),
batch_size=32))
print probs.shape
if __name__ == '__main__':
main()
......@@ -156,7 +156,7 @@ class ImageClassifier():
# For oversampling, average predictions across crops.
# If not oversampling, the shape of output[name] is (1, class_number);
# the mean is still applicable.
res[name] = output[name].mean(0)
res[name] = output[name]['value'].mean(0)
return res
......
import sys
import math
import numpy as np
import paddle.v2 as paddle
import paddle.v2.dataset.conll05 as conll05
def db_lstm():
word_dict, verb_dict, label_dict = conll05.get_dict()
word_dict_len = len(word_dict)
label_dict_len = len(label_dict)
pred_len = len(verb_dict)
mark_dict_len = 2
word_dim = 32
mark_dim = 5
hidden_dim = 512
depth = 8
#8 features
def d_type(size):
return paddle.data_type.integer_value_sequence(size)
word = paddle.layer.data(name='word_data', type=d_type(word_dict_len))
predicate = paddle.layer.data(name='verb_data', type=d_type(pred_len))
ctx_n2 = paddle.layer.data(name='ctx_n2_data', type=d_type(word_dict_len))
ctx_n1 = paddle.layer.data(name='ctx_n1_data', type=d_type(word_dict_len))
ctx_0 = paddle.layer.data(name='ctx_0_data', type=d_type(word_dict_len))
ctx_p1 = paddle.layer.data(name='ctx_p1_data', type=d_type(word_dict_len))
ctx_p2 = paddle.layer.data(name='ctx_p2_data', type=d_type(word_dict_len))
mark = paddle.layer.data(name='mark_data', type=d_type(mark_dict_len))
target = paddle.layer.data(name='target', type=d_type(label_dict_len))
default_std = 1 / math.sqrt(hidden_dim) / 3.0
emb_para = paddle.attr.Param(name='emb', initial_std=0., learning_rate=0.)
std_0 = paddle.attr.Param(initial_std=0.)
std_default = paddle.attr.Param(initial_std=default_std)
predicate_embedding = paddle.layer.embedding(
size=word_dim,
input=predicate,
param_attr=paddle.attr.Param(
name='vemb', initial_std=default_std))
mark_embedding = paddle.layer.embedding(
size=mark_dim, input=mark, param_attr=std_0)
word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
emb_layers = [
paddle.layer.embedding(
size=word_dim, input=x, param_attr=emb_para) for x in word_input
]
emb_layers.append(predicate_embedding)
emb_layers.append(mark_embedding)
hidden_0 = paddle.layer.mixed(
size=hidden_dim,
bias_attr=std_default,
input=[
paddle.layer.full_matrix_projection(
input=emb, param_attr=std_default) for emb in emb_layers
])
mix_hidden_lr = 1e-3
lstm_para_attr = paddle.attr.Param(initial_std=0.0, learning_rate=1.0)
hidden_para_attr = paddle.attr.Param(
initial_std=default_std, learning_rate=mix_hidden_lr)
lstm_0 = paddle.layer.lstmemory(
input=hidden_0,
act=paddle.activation.Relu(),
gate_act=paddle.activation.Sigmoid(),
state_act=paddle.activation.Sigmoid(),
bias_attr=std_0,
param_attr=lstm_para_attr)
#stack L-LSTM and R-LSTM with direct edges
input_tmp = [hidden_0, lstm_0]
for i in range(1, depth):
mix_hidden = paddle.layer.mixed(
size=hidden_dim,
bias_attr=std_default,
input=[
paddle.layer.full_matrix_projection(
input=input_tmp[0], param_attr=hidden_para_attr),
paddle.layer.full_matrix_projection(
input=input_tmp[1], param_attr=lstm_para_attr)
])
lstm = paddle.layer.lstmemory(
input=mix_hidden,
act=paddle.activation.Relu(),
gate_act=paddle.activation.Sigmoid(),
state_act=paddle.activation.Sigmoid(),
reverse=((i % 2) == 1),
bias_attr=std_0,
param_attr=lstm_para_attr)
input_tmp = [mix_hidden, lstm]
feature_out = paddle.layer.mixed(
size=label_dict_len,
bias_attr=std_default,
input=[
paddle.layer.full_matrix_projection(
input=input_tmp[0], param_attr=hidden_para_attr),
paddle.layer.full_matrix_projection(
input=input_tmp[1], param_attr=lstm_para_attr)
], )
crf_cost = paddle.layer.crf(size=label_dict_len,
input=feature_out,
label=target,
param_attr=paddle.attr.Param(
name='crfw',
initial_std=default_std,
learning_rate=mix_hidden_lr))
crf_dec = paddle.layer.crf_decoding(
name='crf_dec_l',
size=label_dict_len,
input=feature_out,
label=target,
param_attr=paddle.attr.Param(name='crfw'))
return crf_cost, crf_dec
def load_parameter(file_name, h, w):
with open(file_name, 'rb') as f:
f.read(16) # skip header.
return np.fromfile(f, dtype=np.float32).reshape(h, w)
def main():
paddle.init(use_gpu=False, trainer_count=1)
# define network topology
crf_cost, crf_dec = db_lstm()
# create parameters
parameters = paddle.parameters.create([crf_cost, crf_dec])
# create optimizer
optimizer = paddle.optimizer.Momentum(
momentum=0,
learning_rate=2e-2,
regularization=paddle.optimizer.L2Regularization(rate=8e-4),
model_average=paddle.optimizer.ModelAverage(
average_window=0.5, max_average_window=10000), )
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print "Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
trainer = paddle.trainer.SGD(cost=crf_cost,
parameters=parameters,
update_equation=optimizer)
parameters.set('emb', load_parameter(conll05.get_embedding(), 44068, 32))
trn_reader = paddle.reader.batched(
paddle.reader.shuffle(
conll05.test(), buf_size=8192), batch_size=10)
reader_dict = {
'word_data': 0,
'ctx_n2_data': 1,
'ctx_n1_data': 2,
'ctx_0_data': 3,
'ctx_p1_data': 4,
'ctx_p2_data': 5,
'verb_data': 6,
'mark_data': 7,
'target': 8
}
trainer.train(
reader=trn_reader,
event_handler=event_handler,
num_passes=10000,
reader_dict=reader_dict)
if __name__ == '__main__':
main()
......@@ -32,4 +32,6 @@ def process(settings, file_name):
word_slot = [
settings.word_dict[w] for w in words if w in settings.word_dict
]
if not word_slot:
continue
yield word_slot, label
......@@ -138,7 +138,11 @@ def main():
batch = []
for line in sys.stdin:
batch.append([predict.get_index(line)])
words = predict.get_index(line)
if words:
batch.append([words])
else:
print('All the words in [%s] are not in the dictionary.' % line)
if len(batch) == batch_size:
predict.batch_predict(batch)
batch = []
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import paddle.trainer_config_helpers.attrs as attrs
from paddle.trainer_config_helpers.poolings import MaxPooling
import paddle.v2 as paddle
def convolution_net(input_dim,
class_dim=2,
emb_dim=128,
hid_dim=128,
is_predict=False):
data = paddle.layer.data("word",
paddle.data_type.integer_value_sequence(input_dim))
emb = paddle.layer.embedding(input=data, size=emb_dim)
conv_3 = paddle.networks.sequence_conv_pool(
input=emb, context_len=3, hidden_size=hid_dim)
conv_4 = paddle.networks.sequence_conv_pool(
input=emb, context_len=4, hidden_size=hid_dim)
output = paddle.layer.fc(input=[conv_3, conv_4],
size=class_dim,
act=paddle.activation.Softmax())
lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
cost = paddle.layer.classification_cost(input=output, label=lbl)
return cost
def stacked_lstm_net(input_dim,
class_dim=2,
emb_dim=128,
hid_dim=512,
stacked_num=3,
is_predict=False):
"""
A wrapper for the sentiment classification task.
This network uses a bi-directional recurrent network,
consisting of three LSTM layers. The configuration is based on
the paper at the following URL, but uses fewer layers.
http://www.aclweb.org/anthology/P15-1109
input_dim: word dictionary dimension.
class_dim: number of categories.
emb_dim: dimension of word embedding.
hid_dim: dimension of hidden layer.
stacked_num: number of stacked lstm-hidden layers.
is_predict: whether predicting or not.
Some layers are not needed in the network when predicting.
"""
assert stacked_num % 2 == 1
layer_attr = attrs.ExtraLayerAttribute(drop_rate=0.5)
fc_para_attr = attrs.ParameterAttribute(learning_rate=1e-3)
lstm_para_attr = attrs.ParameterAttribute(initial_std=0., learning_rate=1.)
para_attr = [fc_para_attr, lstm_para_attr]
bias_attr = attrs.ParameterAttribute(initial_std=0., l2_rate=0.)
relu = paddle.activation.Relu()
linear = paddle.activation.Linear()
data = paddle.layer.data("word",
paddle.data_type.integer_value_sequence(input_dim))
emb = paddle.layer.embedding(input=data, size=emb_dim)
fc1 = paddle.layer.fc(input=emb,
size=hid_dim,
act=linear,
bias_attr=bias_attr)
lstm1 = paddle.layer.lstmemory(
input=fc1, act=relu, bias_attr=bias_attr, layer_attr=layer_attr)
inputs = [fc1, lstm1]
for i in range(2, stacked_num + 1):
fc = paddle.layer.fc(input=inputs,
size=hid_dim,
act=linear,
param_attr=para_attr,
bias_attr=bias_attr)
lstm = paddle.layer.lstmemory(
input=fc,
reverse=(i % 2) == 0,
act=relu,
bias_attr=bias_attr,
layer_attr=layer_attr)
inputs = [fc, lstm]
fc_last = paddle.layer.pooling(input=inputs[0], pooling_type=MaxPooling())
lstm_last = paddle.layer.pooling(input=inputs[1], pooling_type=MaxPooling())
output = paddle.layer.fc(input=[fc_last, lstm_last],
size=class_dim,
act=paddle.activation.Softmax(),
bias_attr=bias_attr,
param_attr=para_attr)
lbl = paddle.layer.data("label", paddle.data_type.integer_value(2))
cost = paddle.layer.classification_cost(input=output, label=lbl)
return cost
if __name__ == '__main__':
# init
paddle.init(use_gpu=True, trainer_count=4)
# network config
print 'load dictionary...'
word_dict = paddle.dataset.imdb.word_dict()
dict_dim = len(word_dict)
class_dim = 2
# Please choose the way to build the network
# by uncommenting the corresponding line.
cost = convolution_net(dict_dim, class_dim=class_dim)
# cost = stacked_lstm_net(dict_dim, class_dim=class_dim, stacked_num=3)
# create parameters
parameters = paddle.parameters.create(cost)
# create optimizer
adam_optimizer = paddle.optimizer.Adam(
learning_rate=2e-3,
regularization=paddle.optimizer.L2Regularization(rate=8e-4),
model_average=paddle.optimizer.ModelAverage(average_window=0.5))
# End batch and end pass event handler
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
print "\nPass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
else:
sys.stdout.write('.')
sys.stdout.flush()
if isinstance(event, paddle.event.EndPass):
result = trainer.test(
reader=paddle.reader.batched(
lambda: paddle.dataset.imdb.test(word_dict),
batch_size=128),
reader_dict={'word': 0,
'label': 1})
print "\nTest with Pass %d, %s" % (event.pass_id, result.metrics)
# create trainer
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=adam_optimizer)
trainer.train(
reader=paddle.reader.batched(
paddle.reader.shuffle(
lambda: paddle.dataset.imdb.train(word_dict), buf_size=1000),
batch_size=100),
event_handler=event_handler,
reader_dict={'word': 0,
'label': 1},
num_passes=10)
import os
import paddle.v2 as paddle
from seqToseq_net_v2 import seqToseq_net_v2
# Data Definition.
# TODO: This code should be merged into the dataset package.
data_dir = "./data/pre-wmt14"
src_lang_dict = os.path.join(data_dir, 'src.dict')
trg_lang_dict = os.path.join(data_dir, 'trg.dict')
source_dict_dim = len(open(src_lang_dict, "r").readlines())
target_dict_dim = len(open(trg_lang_dict, "r").readlines())
def read_to_dict(dict_path):
with open(dict_path, "r") as fin:
out_dict = {
line.strip(): line_count
for line_count, line in enumerate(fin)
}
return out_dict
src_dict = read_to_dict(src_lang_dict)
trg_dict = read_to_dict(trg_lang_dict)
train_list = os.path.join(data_dir, 'train.list')
test_list = os.path.join(data_dir, 'test.list')
UNK_IDX = 2
START = "<s>"
END = "<e>"
def _get_ids(s, dictionary):
words = s.strip().split()
return [dictionary[START]] + \
[dictionary.get(w, UNK_IDX) for w in words] + \
[dictionary[END]]
def train_reader(file_name):
def reader():
with open(file_name, 'r') as f:
for line_count, line in enumerate(f):
line_split = line.strip().split('\t')
if len(line_split) != 2:
continue
src_seq = line_split[0] # one source sequence
src_ids = _get_ids(src_seq, src_dict)
trg_seq = line_split[1] # one target sequence
trg_words = trg_seq.split()
trg_ids = [trg_dict.get(w, UNK_IDX) for w in trg_words]
# remove sequence whose length > 80 in training mode
if len(src_ids) > 80 or len(trg_ids) > 80:
continue
trg_ids_next = trg_ids + [trg_dict[END]]
trg_ids = [trg_dict[START]] + trg_ids
yield src_ids, trg_ids, trg_ids_next
return reader
def main():
paddle.init(use_gpu=False, trainer_count=1)
# define network topology
cost = seqToseq_net_v2(source_dict_dim, target_dict_dim)
parameters = paddle.parameters.create(cost)
# define optimize method and trainer
optimizer = paddle.optimizer.Adam(learning_rate=1e-4)
trainer = paddle.trainer.SGD(cost=cost,
parameters=parameters,
update_equation=optimizer)
# define data reader
reader_dict = {
'source_language_word': 0,
'target_language_word': 1,
'target_language_next_word': 2
}
wmt14_reader = paddle.reader.batched(
paddle.reader.shuffle(
train_reader("data/pre-wmt14/train/train"), buf_size=8192),
batch_size=5)
# define event_handler callback
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 10 == 0:
print "Pass %d, Batch %d, Cost %f, %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics)
# start to train
trainer.train(
reader=wmt14_reader,
event_handler=event_handler,
num_passes=10000,
reader_dict=reader_dict)
if __name__ == '__main__':
main()
import paddle.v2 as paddle
def seqToseq_net_v2(source_dict_dim, target_dict_dim):
### Network Architecture
word_vector_dim = 512 # dimension of word vector
decoder_size = 512 # dimension of hidden unit in GRU Decoder network
encoder_size = 512 # dimension of hidden unit in GRU Encoder network
#### Encoder
src_word_id = paddle.layer.data(
name='source_language_word',
type=paddle.data_type.integer_value_sequence(source_dict_dim))
src_embedding = paddle.layer.embedding(
input=src_word_id,
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
src_forward = paddle.networks.simple_gru(
input=src_embedding, size=encoder_size)
src_backward = paddle.networks.simple_gru(
input=src_embedding, size=encoder_size, reverse=True)
encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
#### Decoder
with paddle.layer.mixed(size=decoder_size) as encoded_proj:
encoded_proj += paddle.layer.full_matrix_projection(
input=encoded_vector)
backward_first = paddle.layer.first_seq(input=src_backward)
with paddle.layer.mixed(
size=decoder_size, act=paddle.activation.Tanh()) as decoder_boot:
decoder_boot += paddle.layer.full_matrix_projection(
input=backward_first)
def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
decoder_mem = paddle.layer.memory(
name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
context = paddle.networks.simple_attention(
encoded_sequence=enc_vec,
encoded_proj=enc_proj,
decoder_state=decoder_mem)
with paddle.layer.mixed(size=decoder_size * 3) as decoder_inputs:
decoder_inputs += paddle.layer.full_matrix_projection(input=context)
decoder_inputs += paddle.layer.full_matrix_projection(
input=current_word)
gru_step = paddle.layer.gru_step(
name='gru_decoder',
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
with paddle.layer.mixed(
size=target_dict_dim,
bias_attr=True,
act=paddle.activation.Softmax()) as out:
out += paddle.layer.full_matrix_projection(input=gru_step)
return out
decoder_group_name = "decoder_group"
group_input1 = paddle.layer.StaticInputV2(input=encoded_vector, is_seq=True)
group_input2 = paddle.layer.StaticInputV2(input=encoded_proj, is_seq=True)
group_inputs = [group_input1, group_input2]
trg_embedding = paddle.layer.embedding(
input=paddle.layer.data(
name='target_language_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim)),
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
# For a decoder equipped with an attention mechanism, in training,
# the target embedding (the ground truth) is the data input,
# while the encoded source sequence is accessed as an unbounded memory.
# Here, the StaticInput defines a read-only memory
# for the recurrent_group.
decoder = paddle.layer.recurrent_group(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs)
lbl = paddle.layer.data(
name='target_language_next_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim))
cost = paddle.layer.classification_cost(input=decoder, label=lbl)
return cost
import math
import paddle.v2 as paddle
dictsize = 1953
embsize = 32
hiddensize = 256
N = 5
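# 5-gram: the model uses the 4 preceding words to predict the 5th.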
def wordemb(inlayer):
wordemb = paddle.layer.table_projection(
input=inlayer,
size=embsize,
param_attr=paddle.attr.Param(
name="_proj",
initial_std=0.001,
learning_rate=1,
l2_rate=0, ))
return wordemb
def main():
paddle.init(use_gpu=False, trainer_count=1)
word_dict = paddle.dataset.imikolov.build_dict()
dict_size = len(word_dict)
firstword = paddle.layer.data(
name="firstw", type=paddle.data_type.integer_value(dict_size))
secondword = paddle.layer.data(
name="secondw", type=paddle.data_type.integer_value(dict_size))
thirdword = paddle.layer.data(
name="thirdw", type=paddle.data_type.integer_value(dict_size))
fourthword = paddle.layer.data(
name="fourthw", type=paddle.data_type.integer_value(dict_size))
nextword = paddle.layer.data(
name="fifthw", type=paddle.data_type.integer_value(dict_size))
Efirst = wordemb(firstword)
Esecond = wordemb(secondword)
Ethird = wordemb(thirdword)
Efourth = wordemb(fourthword)
contextemb = paddle.layer.concat(input=[Efirst, Esecond, Ethird, Efourth])
hidden1 = paddle.layer.fc(input=contextemb,
size=hiddensize,
act=paddle.activation.Sigmoid(),
layer_attr=paddle.attr.Extra(drop_rate=0.5),
bias_attr=paddle.attr.Param(learning_rate=2),
param_attr=paddle.attr.Param(
initial_std=1. / math.sqrt(embsize * 8),
learning_rate=1))
predictword = paddle.layer.fc(input=hidden1,
size=dict_size,
bias_attr=paddle.attr.Param(learning_rate=2),
act=paddle.activation.Softmax())
def event_handler(event):
if isinstance(event, paddle.event.EndIteration):
if event.batch_id % 100 == 0:
result = trainer.test(
paddle.batch(
paddle.dataset.imikolov.test(word_dict, N), 32))
print "Pass %d, Batch %d, Cost %f, %s, Testing metrics %s" % (
event.pass_id, event.batch_id, event.cost, event.metrics,
result.metrics)
cost = paddle.layer.classification_cost(input=predictword, label=nextword)
parameters = paddle.parameters.create(cost)
adam_optimizer = paddle.optimizer.Adam(
learning_rate=3e-3,
regularization=paddle.optimizer.L2Regularization(8e-4))
trainer = paddle.trainer.SGD(cost, parameters, adam_optimizer)
trainer.train(
paddle.batch(paddle.dataset.imikolov.train(word_dict, N), 32),
num_passes=30,
event_handler=event_handler)
if __name__ == '__main__':
main()
API Manual (Chinese)
====================
API
===
DataProvider API
----------------
Model Config API
----------------
.. toctree::
:maxdepth: 1
data_provider/dataprovider_cn.rst
data_provider/pydataprovider2_cn.rst
v2/model_configs.rst
.. _api_trainer_config:
Model Config API
----------------
Data API
--------
.. toctree::
:maxdepth: 1
trainer_config_helpers/optimizers.rst
trainer_config_helpers/data_sources.rst
trainer_config_helpers/layers.rst
trainer_config_helpers/activations.rst
trainer_config_helpers/poolings.rst
trainer_config_helpers/networks.rst
trainer_config_helpers/evaluators.rst
trainer_config_helpers/attrs.rst
v2/data.rst
Applications API
----------------
Training API
------------
.. toctree::
:maxdepth: 1
.. toctree::
:maxdepth: 1
predict/swig_py_paddle_cn.rst
v2/run_logic.rst
\ No newline at end of file
API
===
DataProvider API
Model Config API
----------------
.. toctree::
:maxdepth: 1
data_provider/dataprovider_en.rst
data_provider/pydataprovider2_en.rst
.. _api_trainer_config:
v2/model_configs.rst
Model Config API
----------------
Data API
--------
.. toctree::
:maxdepth: 1
trainer_config_helpers/optimizers.rst
trainer_config_helpers/data_sources.rst
trainer_config_helpers/layers.rst
trainer_config_helpers/activations.rst
trainer_config_helpers/poolings.rst
trainer_config_helpers/networks.rst
trainer_config_helpers/evaluators.rst
trainer_config_helpers/attrs.rst
v2/data.rst
Train API
---------
Applications API
----------------
.. toctree::
:maxdepth: 1
.. toctree::
:maxdepth: 1
predict/swig_py_paddle_en.rst
v2/run_logic.rst
\ No newline at end of file
API Manual (Chinese)
====================
DataProvider API
----------------
.. toctree::
:maxdepth: 1
data_provider/dataprovider_cn.rst
data_provider/pydataprovider2_cn.rst
.. _api_trainer_config:
Model Config API
----------------
.. toctree::
:maxdepth: 1
trainer_config_helpers/optimizers.rst
trainer_config_helpers/data_sources.rst
trainer_config_helpers/layers.rst
trainer_config_helpers/activations.rst
trainer_config_helpers/poolings.rst
trainer_config_helpers/networks.rst
trainer_config_helpers/evaluators.rst
trainer_config_helpers/attrs.rst
Applications API
----------------
.. toctree::
:maxdepth: 1
predict/swig_py_paddle_cn.rst
API
===
DataProvider API
----------------
.. toctree::
:maxdepth: 1
data_provider/dataprovider_en.rst
data_provider/pydataprovider2_en.rst
.. _api_trainer_config:
Model Config API
----------------
.. toctree::
:maxdepth: 1
trainer_config_helpers/optimizers.rst
trainer_config_helpers/data_sources.rst
trainer_config_helpers/layers.rst
trainer_config_helpers/activations.rst
trainer_config_helpers/poolings.rst
trainer_config_helpers/networks.rst
trainer_config_helpers/evaluators.rst
trainer_config_helpers/attrs.rst
Applications API
----------------
.. toctree::
:maxdepth: 1
predict/swig_py_paddle_en.rst
......@@ -139,24 +139,12 @@ lstmemory
:members: lstmemory
:noindex:
lstm_step_layer
---------------
.. automodule:: paddle.trainer_config_helpers.layers
:members: lstm_step_layer
:noindex:
grumemory
---------
.. automodule:: paddle.trainer_config_helpers.layers
:members: grumemory
:noindex:
gru_step_layer
---------------
.. automodule:: paddle.trainer_config_helpers.layers
:members: gru_step_layer
:noindex:
Recurrent Layer Group
=====================
......@@ -172,6 +160,18 @@ recurrent_group
:members: recurrent_group
:noindex:
lstm_step_layer
---------------
.. automodule:: paddle.trainer_config_helpers.layers
:members: lstm_step_layer
:noindex:
gru_step_layer
---------------
.. automodule:: paddle.trainer_config_helpers.layers
:members: gru_step_layer
:noindex:
beam_search
------------
.. automodule:: paddle.trainer_config_helpers.layers
......@@ -279,6 +279,12 @@ concat_layer
:members: concat_layer
:noindex:
seq_concat_layer
----------------
.. automodule:: paddle.trainer_config_helpers.layers
:members: seq_concat_layer
:noindex:
Reshaping Layers
================
......@@ -302,6 +308,18 @@ repeat_layer
:members: repeat_layer
:noindex:
rotate_layer
------------
.. automodule:: paddle.trainer_config_helpers.layers
:members: rotate_layer
:noindex:
seq_reshape_layer
-----------------
.. automodule:: paddle.trainer_config_helpers.layers
:members: seq_reshape_layer
:noindex:
Math Layers
===========
......@@ -382,6 +400,15 @@ sampling_id_layer
:members: sampling_id_layer
:noindex:
Slicing and Joining Layers
==========================
pad_layer
-----------
.. automodule:: paddle.trainer_config_helpers.layers
:members: pad_layer
:noindex:
.. _api_trainer_config_helpers_layers_cost_layers:
Cost Layers
......@@ -441,6 +468,12 @@ ctc_layer
:members: ctc_layer
:noindex:
warp_ctc_layer
--------------
.. automodule:: paddle.trainer_config_helpers.layers
:members: warp_ctc_layer
:noindex:
nce_layer
-----------
.. automodule:: paddle.trainer_config_helpers.layers
......
================
Data Related API
================
#########
DataTypes
#########
.. automodule:: paddle.v2.data_type
:members:
##########
DataFeeder
##########
.. automodule:: paddle.v2.data_feeder
:members:
######
Reader
######
.. automodule:: paddle.v2.reader
:members:
.. automodule:: paddle.v2.reader.creator
:members:
#########
minibatch
#########
.. automodule:: paddle.v2.minibatch
:members:
#######
Dataset
#######
.. automodule:: paddle.v2.dataset
:members:
mnist
+++++
.. automodule:: paddle.v2.dataset.mnist
:members:
cifar
+++++
.. automodule:: paddle.v2.dataset.cifar
:members:
conll05
+++++++
.. automodule:: paddle.v2.dataset.conll05
:members:
imdb
++++
.. automodule:: paddle.v2.dataset.imdb
:members:
imikolov
++++++++
.. automodule:: paddle.v2.dataset.imikolov
:members:
movielens
+++++++++
.. automodule:: paddle.v2.dataset.movielens
:members:
sentiment
+++++++++
.. automodule:: paddle.v2.dataset.sentiment
:members:
uci_housing
+++++++++++
.. automodule:: paddle.v2.dataset.uci_housing
:members:
#########################
Configuration Related API
#########################
======
Layers
======
.. automodule:: paddle.v2.layer
:members:
==========
Attributes
==========
.. automodule:: paddle.v2.attr
:members:
===========
Activations
===========
.. automodule:: paddle.v2.activation
:members:
========
Poolings
========
.. automodule:: paddle.v2.pooling
:members:
========
Networks
========
.. automodule:: paddle.v2.networks
:members:
==========
Optimizers
==========
.. automodule:: paddle.v2.optimizer
:members:
###########
Trainer API
###########
==========
Parameters
==========
.. automodule:: paddle.v2.parameters
:members:
=======
Trainer
=======
.. automodule:: paddle.v2.trainer
:members:
=====
Event
=====
.. automodule:: paddle.v2.event
:members:
# PaddlePaddle Design Doc
## Ingredients
Our design principle is to start from the essence: how can we
allow users to express and solve their problems with neural networks?
Some essential concepts that our API has to provide include:
1. A *topology* is an expression of *layers*.
1. A layer could be any kind of computation, including *cost*.
1. Some layers have parameters, some don't. Most costs don't have
parameters.
1. In some topologies, layers share parameters. For
example,
[the network for training a ranking model](https://github.com/PaddlePaddle/Paddle/issues/1311#issuecomment-279121850).
1. At programming time, users specify topologies and possible sharing
of parameters. PaddlePaddle can figure out and create parameters
required (and possibly shared) by one or more topologies.
## Starting from Examples
As a summary of [our discussion](https://github.com/PaddlePaddle/Paddle/issues/1315),
let us present two examples here:
### Example 1. Sharing Parameters between Layers
We use
the
[3-branch ranking](https://github.com/PaddlePaddle/Paddle/issues/1311#issuecomment-279121850) model
in this example. For your convenience, I copy-paste the model's
topology as follows:
```
A -> f -\
Q -> f --> cost
B -> f -/
```
The following program trains the topology, including the cost, and then
uses the sub-network of the trained topology for inference:
```python
def f(x):
    e = paddle.layer.embedding(x, parameter_name="embedding")
    o = paddle.layer.softmax(e, parameter_name="semantic")
    return o

# Create 3 topologies (subnets); they share parameters because all
# corresponding layers have the same parameter names.
fA = f(paddle.layer.data(input_name="A"))
fB = f(paddle.layer.data(input_name="B"))
fQ = f(paddle.layer.data(input_name="Q"))

topology = paddle.layer.less_than(
    paddle.layer.cross_entropy(fA, fQ),
    paddle.layer.cross_entropy(fB, fQ))

# Derive parameters required in topology and create them in model.
parameters = paddle.parameters.create(topology)

# Estimate parameters used in topology from data.
paddle.train(topology, parameters, reader=read_ranking_model_data)

# Inference using fA (or fB or fQ, as they share their parameters).
[testA, testB, testQ] = read_ranking_model_data()
print "The semantic-vector of testA: ", paddle.infer(fA, parameters, testA)
```
### Example 2. Sharing Parameters between "Models"
We use [GAN](https://github.com/PaddlePaddle/book/tree/develop/gan) in
this example. In the following example program, `d0` and `d1`
correspond to the two networks in the following figure:
<img src="https://github.com/wangyang59/book/raw/00036f4b0da5225041a6824587c1a01cf20159b1/gan/image/gan_ig.png" width=400 />
```python
def G(x):
    # over-simplified example as G has only one layer:
    return paddle.layer.fc(x, parameter_name="G")

def D(x):
    # again, over-simplified:
    return paddle.layer.fc(x, parameter_name="D")

# Construct the first topology, which contains both D and G.
# By learning this topology, we update parameters of G.
d0 = paddle.layer.should_be_false(D(G(paddle.layer.data())))

# Construct a second topology d1, which contains only D. By
# training this topology, we update parameters of D. Note
# that d1 shares parameters with d0.
d1 = paddle.layer.should_be_true(D(paddle.layer.data()))

# Create parameters from a list of multiple topologies (models) for
# the chance to share parameters between these topologies.
parameters = paddle.parameters.create([d0, d1])

# Iterative training of GAN.
for ...:
    train(d0, parameters, reader=read_from_rng, immutable_parameters={"D"})
    train(d1, parameters, reader=read_from_realistic_images)

# Use d1 for inference:
print "D thinks a batch of images are realistic ", infer(d1, parameters, read_mnist_images)
```
### Summarization
The two programs above reveal some important design concerns:
1. Users describe a topology as an expression of layers. Every layer
has a *parameter name*. If the users don't specify it explicitly, it's automatically generated as a unique name. By
specifying the parameter name, users can specify the sharing of
parameters between layers and even between topologies.
1. `paddle.parameters.create` figures out the parameters required by one
or more topologies from the parameter names of layers. It creates these
parameters and returns a `ParameterSet` object, which is in essence
a map from *parameter names* to *parameters*; a minimal sketch of such a map follows this list.
1. At training and inference time, `paddle.train` and `paddle.infer`
require both a topology and the parameter set that holds the parameters of that topology. There are some reasons:
1. This prevents users from forgetting to call
`paddle.parameters.create`.
1. `paddle.train` needs to know which parameter set to update.
1. Users could load another (pre-trained) parameter set and use it
with a topology in `paddle.infer`.
1. By specifying the `immutable_parameters` parameter of
`paddle.train`, we can forbid the update of these parameters.
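To make the `ParameterSet` point concrete, here is a minimal sketch of a map from parameter names to parameters. The class body, the method names, and the use of numpy arrays are illustrative assumptions for this design discussion, not PaddlePaddle's actual implementation:

```python
import numpy as np

class ParameterSet(object):
    """A hypothetical map from parameter names to parameter values."""

    def __init__(self):
        self._params = {}  # parameter name -> numpy array

    def create(self, name, shape):
        # Layers that reuse a name get the same underlying array, which
        # is how parameter sharing could fall out of naming alone.
        if name not in self._params:
            self._params[name] = np.zeros(shape, dtype='float32')
        return self._params[name]

    def names(self):
        return list(self._params.keys())
```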
## Reader
Not all programming frameworks allow users to define I/O functions.
An example is Google MapReduce, which can only read from text,
SSTable, and RecordIO files. Hadoop MapReduce allows users to define
readers and writers by deriving from base classes `Reader` and
`Writer`. The former is less flexible but also less error-prone. We
decided to give users the flexibility to define their own readers.
There are some open questions here:
1. **Should a reader return a Python dictionary?**
1. **How to map multiple outputs from a reader to multiple data layers?**
1. **How to easily compose some existing readers to read more data and
feed a topology with more data layers?** (One possibility is sketched below.)
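As a thought experiment for the last question, a reader decorator that composes two readers could look like the sketch below; `compose` is a hypothetical name for this discussion, not an existing PaddlePaddle API:

```python
def compose(reader_a, reader_b):
    # A hypothetical reader decorator: pairs up the items produced by two
    # readers so that a topology with more data layers can consume both.
    def reader():
        for a, b in zip(reader_a(), reader_b()):
            yield a + b  # assumes each reader yields tuples of items
    return reader
```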
## Training
The recommended way to train a model is to call `paddle.train`,
which simply calls `paddle.trainer.Default`, a global variable of
type `paddle.trainer.SGD`. Equivalently, we can do
```python
opt = paddle.trainer.SGD(..., paddle.updater.Adam(...))
opt.train(topology, parameters, reader=read, ...)
```
### Updater
Please be aware that a trainer can accept an updater as its data
member, where an updater is a class derived from
`paddle.trainer.Updater`. This is to make it easier to customize
trainers, as discussed
[here](https://github.com/PaddlePaddle/Paddle/issues/1319).
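As a purely illustrative sketch, a customized updater could wrap another updater; the `update` method and its signature below are assumptions of this sketch, not a settled interface:
```python
class ClippingUpdater(paddle.trainer.Updater):
    # Hypothetical updater that clips gradients before delegating to an
    # inner updater. The update() signature is an assumption of this sketch.
    def __init__(self, inner, bound=1.0):
        self.inner = inner
        self.bound = bound

    def update(self, parameter, gradient):
        clipped = numpy.clip(gradient, -self.bound, self.bound)
        return self.inner.update(parameter, clipped)

opt = paddle.trainer.SGD(..., ClippingUpdater(paddle.updater.Adam(...)))
```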
### Event Handler
`paddle.train` and `paddle.trainer.XXX.train` take an optional
parameter `event_handler`, which should be either `None` or a function
that handles some events:
1. BeginTraining
1. EndTraining
1. BeginIteration
1. EndIteration
1. BeginPass
1. EndPass
where EndPass is sent if and only if the reader yields
`end_pass=True`.
An example is as follows:
```python
def event_handler(event):
    if isinstance(event, paddle.event.EndIteration):
        print paddle.test(...)

paddle.train(topology, parameters, reader, event_handler)
```
If we are writing a PaddlePaddle program in and for IPython/Jupyter,
we can use matplotlib in the event handler to plot a curve of
cost/error versus iterations, as shown
[here](https://blog.dominodatalab.com/interactive-dashboards-in-jupyter/).
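For example, a minimal sketch of such a plotting handler could look like the following; the `event.cost` attribute is an assumption of this sketch, not a settled part of the event API:
```python
import matplotlib.pyplot as plt

costs = []

def plotting_event_handler(event):
    # Assumption: EndIteration events carry the cost of the last batch.
    if isinstance(event, paddle.event.EndIteration):
        costs.append(event.cost)
        plt.clf()
        plt.plot(costs)  # cost versus iterations
        plt.xlabel("iteration")
        plt.ylabel("cost")
        plt.draw()

paddle.train(topology, parameters, reader, plotting_event_handler)
```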
### Distributed Training
If a user wants to do distributed training on a cluster, s/he should
call `paddle.dist_train` and provide access tokens to the cluster as
a parameter.
For example, if the user has a TLS certificate that allows him to
access a Kubernetes cluster, s/he should be able to call
```python
paddle.dist_train(model,
                  trainer=paddle.trainer.SGD(...,
                                             paddle.updater.Adam(...)),
                  reader=read,
                  k8s_user="yi",
                  k8s_token="kube_cluster_tls.pem",
                  k8s_job="hello",
                  num_parameter_servers=15)
```
The pseudo code of `paddle.dist_train` is as follows:
```python
def dist_train(topology, parameters, trainer, reader, ...):
    if os.getenv("KUBERNETES_SERVICE_HOST") is None:
        image_name = k8s_user + '/' + k8s_job
        docker_build(image_name)
        docker_push()
        kube_ctrl_start_job(image_name, k8s_user, k8s_token)
    else:
        rank = kube_list_containers_in_job_and_return_current_containers_rank()
        if rank == 0:
            master()
        elif rank <= num_parameter_servers:
            parameter_server()
        else:
            trainer.train(model, reader=read)
```
Please be aware that if a process is running on the Kubernetes
cluster, it will have some environment variables pre-defined.
If `dist_train` doesn't see these environment variables, it knows
that it's running on a user's personal computer, and it should work as a
*launcher*. Otherwise, it knows that it's running on the cluster and
needs to figure out its role as the master, a trainer, or a
parameter server.
# Python Data Reader Design Doc
At training and testing time, PaddlePaddle programs need to read data. To ease users' work of writing data-reading code, we define the following:
- A *reader* is a function that reads data (from file, network, random number generator, etc) and yields data items.
- A *reader creator* is a function that returns a reader function.
- A *reader decorator* is a function, which accepts one or more readers, and returns a reader.
- A *batch reader* is a function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items.
and we provide a function that converts a reader into a batch reader, along with frequently used reader creators and reader decorators.
## Data Reader Interface
Indeed, a *data reader* doesn't have to be a function that reads and yields data items. It can be any function with no parameters that creates an iterable (anything that can be used in `for x in iterable`):
```
iterable = data_reader()
```
Each element produced by the iterable should be a **single** entry of data, **not** a mini batch. An entry of data could be a single item or a tuple of items. Each item should be of a [supported type](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., a numpy 1D array of float32, an int, or a list of ints).
An example implementation of a single-item data reader creator:
```python
def reader_creator_random_image(width, height):
    def reader():
        while True:
            yield numpy.random.uniform(-1, 1, size=width*height)
    return reader
```
An example implementation of a multiple-item data reader creator:
```python
def reader_creator_random_image_and_label(width, height, label):
    def reader():
        while True:
            yield numpy.random.uniform(-1, 1, size=width*height), label
    return reader
```
## Batch Reader Interface
A *batch reader* can be any function with no parameters that creates an iterable (anything that can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list must be a tuple.
Here are valid outputs:
```python
# a mini batch of three data items. Each data item consists of three columns.
[(1, 1, 1),
(2, 2, 2),
(3, 3, 3)]
# a mini batch of three data items, each of which is a list (a single column).
[([1,1,1],),
 ([2,2,2],),
 ([3,3,3],)]
```
Please note that each item inside the list must be a tuple; below is an invalid output:
```python
# wrong: [1,1,1] needs to be inside a tuple: ([1,1,1],).
# Otherwise it's ambiguous whether [1,1,1] means a single column of data [1, 1, 1],
# or three columns of data, each of which is 1.
[[1,1,1],
[2,2,2],
[3,3,3]]
```
It's easy to convert a reader into a batch reader:
```python
mnist_train = paddle.dataset.mnist.train()
mnist_train_batch_reader = paddle.batch(mnist_train, 128)
```
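Conceptually, `paddle.batch` is itself a reader decorator. A minimal sketch of how such a function could be implemented (not necessarily PaddlePaddle's actual implementation) is:
```python
def batch(reader, batch_size):
    # Group consecutive entries of a (single-entry) reader into
    # lists of batch_size entries each.
    def batch_reader():
        b = []
        for entry in reader():
            b.append(entry)
            if len(b) == batch_size:
                yield b
                b = []
        if b:  # the last, possibly smaller, batch
            yield b
    return batch_reader
```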
It's also easy to create a custom batch reader:
```python
def custom_batch_reader():
    while True:
        batch = []
        for i in xrange(128):
            batch.append((numpy.random.uniform(-1, 1, 28*28),))  # note that it's a tuple being appended.
        yield batch

mnist_random_image_batch_reader = custom_batch_reader
```
## Usage
The batch reader, the mapping from items read to data layers, the batch size, and the total number of passes will be passed into `paddle.train`:
```python
# two data layers are created:
image_layer = paddle.layer.data("image", ...)
label_layer = paddle.layer.data("label", ...)
# ...
batch_reader = paddle.batch(paddle.dataset.mnist.train(), 128)
paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...)
```
## Data Reader Decorator
A *data reader decorator* takes one or more data readers and returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use the `@` syntax.
Since data readers have a strict interface (no parameters, return a single data entry), they can be used flexibly via data reader decorators. The following are a few examples:
### Prefetch Data
Since reading data may take time and training cannot proceed without data, it is generally a good idea to prefetch it.
Use `paddle.reader.buffered` to prefetch data:
```python
buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100)
```
`buffered_reader` will try to buffer (prefetch) `100` data entries.
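A straightforward way to implement such a decorator is a producer thread that fills a bounded queue while training consumes from it. The following is an illustrative sketch, not PaddlePaddle's actual implementation:
```python
import threading
import Queue  # Python 2, matching the code style of this doc

def buffered(reader, size):
    end = object()  # sentinel marking the end of the stream

    def buffered_reader():
        q = Queue.Queue(maxsize=size)

        def produce():
            for entry in reader():
                q.put(entry)  # blocks once `size` entries are buffered
            q.put(end)

        t = threading.Thread(target=produce)
        t.daemon = True
        t.start()
        entry = q.get()
        while entry is not end:
            yield entry
            entry = q.get()
    return buffered_reader
```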
### Compose Multiple Data Readers
For example, we want to use a source of real images (reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661).
We can do:
```python
def reader_creator_random_image(width, height):
    def reader():
        while True:
            yield numpy.random.uniform(-1, 1, size=width*height)
    return reader

def reader_creator_bool(t):
    def reader():
        while True:
            yield t
    return reader

true_reader = reader_creator_bool(True)
false_reader = reader_creator_bool(False)

reader = paddle.reader.compose(paddle.dataset.mnist.train(), reader_creator_random_image(20, 20), true_reader, false_reader)
# Index 1 is skipped because paddle.dataset.mnist.train() produces two items per data entry,
# and we don't care about the second item at this time.
paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...)
```
### Shuffle
Given a shuffle buffer size `n`, `paddle.reader.shuffle` will return a data reader that buffers `n` data entries and shuffles them before a data entry is read.
Example:
```python
reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512)
```
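A minimal sketch of such a decorator, which repeatedly fills a buffer of `n` entries, shuffles it, and drains it (again, illustrative rather than the actual implementation):
```python
import random

def shuffle(reader, buffer_size):
    def shuffled_reader():
        buf = []
        for entry in reader():
            buf.append(entry)
            if len(buf) >= buffer_size:
                random.shuffle(buf)
                for e in buf:
                    yield e
                buf = []
        random.shuffle(buf)  # flush whatever remains
        for e in buf:
            yield e
    return shuffled_reader
```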
## Q & A
### Why does a reader return only a single entry, and not a mini batch?
Returning a single entry makes reusing existing data readers much easier (e.g., if an existing reader returned 3 entries instead of a single entry, the training code would be more complex, because it would need to handle cases like a batch size of 2).
We provide the function `paddle.batch` to turn a (single-entry) reader into a batch reader.
### Why do we need a batch reader? Isn't it sufficient for train to take a reader and a batch_size as arguments?
In most cases, having train take a reader and a batch_size as arguments would be sufficient. However, sometimes users want to customize the order of data entries inside a mini batch, or even change the batch size dynamically.
### Why use a dictionary rather than a list to provide the mapping?
We decided to use a dictionary (`{"image":0, "label":1}`) instead of a list (`["image", "label"]`) because users can easily reuse items (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or skip items (e.g., using `{"image_a":0, "label":2}`).
### How to create a custom data reader creator
```python
def image_reader_creator(image_path, label_path, n):
    def reader():
        f = open(image_path)
        l = open(label_path)
        images = numpy.fromfile(
            f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32')
        images = images / 255.0 * 2.0 - 1.0
        labels = numpy.fromfile(l, 'ubyte', count=n).astype("int")
        for i in xrange(n):
            yield images[i, :], labels[i]  # a single entry of data is created each time
        f.close()
        l.close()
    return reader
# image_reader_creator creates a reader
reader = image_reader_creator("/path/to/image_file", "/path/to/label_file", 1024)
paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...)
```
### How is `paddle.train` implemented?
An example implementation of `paddle.train` could be:
```python
def train(batch_reader, mapping, batch_size, total_pass):
    for pass_idx in range(total_pass):
        for mini_batch in batch_reader():  # this loop will never end in online learning.
            do_forward_backward(mini_batch, mapping)
```
......@@ -12,7 +12,7 @@ PaddlePaddle项目提供官方 `Docker <https://www.docker.com/>`_ 镜像。Dock
Docker image versions provided by PaddlePaddle
----------------------------------------------
We provide 12 `Docker images <https://hub.docker.com/r/paddledev/paddle/tags/>`_ ; their image name is :code:`paddledev/paddle`, with the following tags:
+-----------------+------------------+------------------------+-----------------------+
| | normal | devel | demo |
......@@ -45,7 +45,7 @@ PaddlePaddle提供的Docker镜像版本
if cat /proc/cpuinfo | grep -q avx ; then echo "Support AVX"; else echo "Not support AVX"; fi
If the output is :code:`Support AVX`, you can choose an AVX build of PaddlePaddle from the table above; otherwise, choose a non-AVX build. To use the plain-CPU devel image, refer to it as :code:`paddledev/paddle:cpu-devel-latest`.
The images provided by PaddlePaddle do not run any command by default. To run PaddlePaddle, you need to enter the image and run a PaddlePaddle program, or customize an image with a startup script. For details, see the note on accessing the PaddlePaddle image via ssh (:code:`使用ssh访问PaddlePaddle镜像`).
......
......@@ -16,70 +16,71 @@ Developers can work on PaddlePaddle using Docker. This allows
developers to work on different platforms -- Linux, Mac OS X, and
Windows -- in a consistent way.
The general development workflow with Docker and CMake is as follows:
1. Get the source code of Paddle:
1. Build the Development Environment as a Docker Image
.. code-block:: bash
git clone --recursive https://github.com/PaddlePaddle/Paddle
cd Paddle
docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile .
2. Build a development Docker image :code:`paddle:dev` from the source
code. This image contains all the development tools and
dependencies of PaddlePaddle.
Note that by default :code:`docker build` wouldn't import source
tree into the image and build it. If we want to do that, we need
to set a build arg:
.. code-block:: bash
docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile --build-arg BUILD_AND_INSTALL=ON .
2. Run the Development Environment
Sometimes docker build might suffer from a slow network connection to the official Ubuntu apt-source servers. In such a case, we can specify an apt-source mirror server that is geographically nearer to us. In the following example, we specify an apt-source server that responds quickly in China. You can specify the Ubuntu mirror with :code:`--build-arg UBUNTU_MIRROR`, as in the example below.
Once we have the image :code:`paddle:dev`, we can use it to develop
Paddle by mounting the local source code tree into a container that
runs the image:
.. code-block:: bash
docker build \
--build-arg UBUNTU_MIRROR="http://mirrors.163.com" \
-t paddle:dev \
-f paddle/scripts/docker/Dockerfile .
docker run -d -p 2202:22 -v $PWD:/paddle paddle:dev
This runs a container of the development environment Docker image
with the local source tree mounted to :code:`/paddle` of the
container.
Note that the default entry-point of :code:`paddle:dev` is
:code:`sshd`, and the above :code:`docker run` command actually starts
an SSHD server listening on port 2202. This allows us to log into
this container with:
.. code-block:: bash
ssh root@localhost -p 2202
3. Run the image as a container and mounting local source code
directory into the container. This allows us to change the code on
the host and build it within the container.
Usually, I run the above commands on my Mac. I can also run them on a
GPU server :code:`xxx.yyy.zzz.www` and ssh from my Mac to it:
.. code-block:: bash
docker run \
-d \
--name paddle \
-p 2022:22 \
-v $PWD:/paddle \
paddle:dev
my-mac$ ssh root@xxx.yyy.zzz.www -p 2202
where :code:`-d` makes the container run in the background,
:code:`--name paddle` allows us to run an nginx container to serve
documents in this container, :code:`-p 2022:22` allows us to SSH
into this container, and :code:`-v $PWD:/paddle` shares the source code
on the host with the container.
3. Build and Install Using the Development Environment
4. SSH into the container:
Once I am in the container, I can use
:code:`paddle/scripts/docker/build.sh` to build, install, and test
Paddle:
.. code-block:: bash
ssh root@localhost -p 2022
/paddle/paddle/scripts/docker/build.sh
5. We can edit the source code in the container or on the host. Then
   we can build it using CMake:
This builds everything about Paddle in :code:`/paddle/build`. And
we can run unit tests there:
.. code-block:: bash
cd /paddle # where paddle source code has been mounted into the container
mkdir -p build
cd build
cmake -DWITH_TESTING=ON ..
make -j `nproc`
CTEST_OUTPUT_ON_FAILURE=1 ctest
cd /paddle/build
ctest
CPU-only and GPU Images
......
......@@ -10,6 +10,7 @@
usage/cmd_parameter/index_cn.rst
usage/concepts/use_concepts_cn.rst
usage/cluster/cluster_train_cn.md
usage/k8s/k8s_basis_cn.md
usage/k8s/k8s_cn.md
usage/k8s/k8s_distributed_cn.md
......
......@@ -6,7 +6,7 @@
In this article, we explain how to run distributed Paddle training jobs on clusters, taking the [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation) demo as an example to create a distributed version of its single-process training.
The [scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH. They can also serve as a reference for users running more sophisticated cluster management systems such as MPI and [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s).
## Prerequisites
......
......@@ -2,7 +2,7 @@
In this article, we explain how to run distributed Paddle training jobs on clusters. We will create the distributed version of the single-process training example, [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation).
[Scripts](https://github.com/baidu/Paddle/tree/develop/paddle/scripts/cluster_train) used in this article launch distributed jobs via SSH. They also work as a reference for users running more sophisticated cluster management systems like MPI and [Kubernetes](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/howto/usage/k8s).
## Prerequisite
......
......@@ -127,11 +127,6 @@
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">allow_inefficient_sparse_update</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">start_pass</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
......
......@@ -127,11 +127,6 @@ It looks like there are a lot of arguments. However, most of them are for develo
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">allow_inefficient_sparse_update</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr>
<tr>
<td class="left">start_pass</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
......
......@@ -306,10 +306,6 @@
- Show log details for the sparse parameter distribution in the pserver.
- Type: bool (default: 0).
* `--allow_inefficient_sparse_update`
- Indicates whether to allow inefficient sparse updates.
- Type: bool (default: 0).
* `--check_sparse_distribution_batches`
- Run a check of the sparse parameter distribution every this many batches.
- Type: int32 (default: 100).
......
......@@ -310,10 +310,6 @@
- show log details for sparse parameter distribution in pserver.
- type: bool (default: 0).
* `--allow_inefficient_sparse_update`
- Whether to allow inefficient sparse update.
- type: bool (default: 0).
* `--check_sparse_distribution_batches`
- Running sparse parameter distribution check every so many batches.
- type: int32 (default: 100).
......
# Introduction to Kubernetes
[*Kubernetes*](http://kubernetes.io/) is a container cluster management system open-sourced by Google. It provides mechanisms for application deployment, maintenance, and scaling, and makes it easy to manage containerized applications running across machines. Kubernetes can run on physical or virtual machines and supports deployment to public clouds such as [AWS](http://kubernetes.io/docs/getting-started-guides/aws), [Azure](http://kubernetes.io/docs/getting-started-guides/azure/), and [GCE](http://kubernetes.io/docs/getting-started-guides/gce). Before introducing distributed training, you need a basic understanding of [Kubernetes](http://kubernetes.io/), so we first briefly introduce the Kubernetes concepts used in this article.
- A [*Node*](http://kubernetes.io/docs/admin/node/) is a worker node in a Kubernetes cluster; it can be a physical or virtual machine. A Kubernetes cluster consists of worker nodes plus a master node.
- A [*Pod*](http://kubernetes.io/docs/user-guide/pods/) is a group of one or more containers and is Kubernetes' smallest scheduling unit; all containers in a pod are scheduled onto the same node. Containers in a pod share the NET, PID, IPC, UTS, and other Linux namespaces. Because they share the NET namespace, they use the same IP address and can talk to each other via *localhost*; different pods can reach each other by IP address.
- A [*Job*](http://kubernetes.io/docs/user-guide/jobs/) describes a batch workload on Kubernetes. Each job usually consists of one or more pods; after the job starts, it creates these pods and runs a program in each, waiting for the program to finish successfully with exit status 0. If a run fails, different retry policies can be configured.
- A [*Volume*](http://kubernetes.io/docs/user-guide/volumes/) is a shared directory that all containers in a pod can access, and is also how containers share files with the node. Files inside a container are ephemeral: when the container is destroyed for any reason, its files disappear with it; volumes persist them. Kubernetes supports many volume types, e.g., hostPath (a host directory), gcePersistentDisk, and awsElasticBlockStore.
- [*Namespaces*](https://kubernetes.io/docs/user-guide/namespaces/): every resource object created in Kubernetes (such as the pods and jobs above) belongs to a namespace. Within one namespace, resource names are unique; names may repeat across namespaces. Namespaces group objects logically for easier management. This article uses only the default namespace.
- [*PersistentVolume*](https://kubernetes.io/docs/user-guide/persistent-volumes/): combined with a [*PersistentVolumeClaim*](https://kubernetes.io/docs/user-guide/persistent-volumes/#persistentvolumeclaims), it describes external storage services as a uniform resource in Kubernetes, making storage easier to manage and to reference from pods.
# Deploying a Kubernetes Cluster
Kubernetes offers many cluster deployment options, which this document does not repeat. Here are several common deployment methods:
- [*minikube*](https://kubernetes.io/docs/getting-started-guides/minikube/): quickly start a single-machine Kubernetes server locally, convenient for local verification and testing.
- [*kubeadm*](http://kubernetes.io/docs/getting-started-guides/kubeadm/): quickly deploy a cluster across different operating systems and hosts (Bare-Metal, AWS, GCE).
- [*AWS EC2*](https://kubernetes.io/docs/getting-started-guides/aws/): quickly deploy a cluster on AWS.
- [*Bare-Metal*](https://kubernetes.io/docs/getting-started-guides/centos/centos_manual_config/): deploy manually on physical machines.
You can consult [this table](https://kubernetes.io/docs/getting-started-guides/#table-of-solutions) to choose the solution that fits your scenario.
# Choosing a Storage Solution
Containers do not preserve data generated at runtime: data produced by a job or application running in a container disappears when the container is destroyed. To complete a distributed machine learning training task, an external storage service is needed to hold the training data and the training output.
Common storage options include:
- [*NFS*](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/nfs): shares a directory on disk with other machines on the network. Simple to deploy and configure, and suitable for validation with small amounts of data; it provides no distributed storage, high availability, or redundancy. See [here](http://www.tecmint.com/how-to-setup-nfs-server-in-linux/) for how to deploy NFS.
- [*GlusterFS*](http://gluster.readthedocs.io/en/latest/Quick-Start-Guide/Quickstart/): a networked distributed file system; it can be used in Kubernetes following [this](https://github.com/kubernetes/kubernetes/tree/master/examples/volumes/glusterfs) example.
- [*Ceph*](http://docs.ceph.com/docs/master/): a distributed file system supporting rbd, a POSIX API (ceph fs), and an object storage API; see [here](https://kubernetes.io/docs/user-guide/volumes/#rbd).
- [*MooseFS*](https://moosefs.com/documentation.html): a distributed storage system. It must first be mounted on the server node and then mounted into containers via a Kubernetes hostPath volume.
# Configuring kubectl
## Installing kubectl
```
# OS X
curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/darwin/amd64/kubectl
# Linux
curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/linux/amd64/kubectl
# Windows
curl -LO https://storage.googleapis.com/kubernetes-release/release/$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)/bin/windows/amd64/kubectl.exe
```
## Configuring kubectl to access your Kubernetes cluster
Edit the configuration file `~/.kube/config` and change the `Master-IP` address. If you use SSL authentication, you also need to configure `certificate-authority` and the user certificates under `users`. If you access the cluster without SSL (e.g., via port 8080), you can remove these certificate settings.
```
apiVersion: v1
clusters:
- cluster:
    certificate-authority: /path/to/ca.crt
    server: https://[Master-IP]:443
  name: minikube
contexts:
- context:
    cluster: minikube
    user: minikube
  name: minikube
current-context: minikube
kind: Config
preferences: {}
users:
- name: minikube
  user:
    client-certificate: /path/to/apiserver.crt
    client-key: /Users/wuyi/.minikube/apiserver.key
doc/howto/usage/k8s/src/create_efs.png (image updated: 244.5 KB → 236.1 KB)
apiVersion: batch/v1
kind: Job
metadata:
  name: paddle-cluster-job
spec:
  parallelism: 3
  completions: 3
  template:
    metadata:
      name: paddle-cluster-job
    spec:
      volumes:
      - name: jobpath
        hostPath:
          path: /home/work/paddle_output
      containers:
      - name: trainer
        image: registry.baidu.com/public/paddle:mypaddle
        command: ["bin/bash", "-c", "/root/start.sh"]
        env:
        - name: JOB_NAME
          value: paddle-cluster-job
        - name: JOB_PATH
          value: /home/jobpath
        - name: JOB_NAMESPACE
          value: default
        - name: TRAIN_CONFIG_DIR
          value: recommendation
        - name: CONF_PADDLE_NIC
          value: eth0
        - name: CONF_PADDLE_PORT
          value: "7164"
        - name: CONF_PADDLE_PORTS_NUM
          value: "2"
        - name: CONF_PADDLE_PORTS_NUM_SPARSE
          value: "2"
        - name: CONF_PADDLE_GRADIENT_NUM
          value: "3"
        volumeMounts:
        - name: jobpath
          mountPath: /home/jobpath
      restartPolicy: Never
FROM alpine
RUN apk update && apk upgrade && apk add coreutils
ADD quick_start /quick_start
ADD get_data.sh /bin/
RUN chmod +x /bin/get_data.sh
ENTRYPOINT ["/bin/get_data.sh"]
To build the PaddlePaddle data preparation image used in the tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../../k8s_aws_en.md), run the following commands:
```
cp -r ../../../../../../demo/quick_start .
docker build . -t prepare-data-image-name
```
#!/bin/sh
out_dir=$OUT_DIR
split_count=$SPLIT_COUNT
set -e
mkdir -p $out_dir
cp -r /quick_start $out_dir/
mkdir -p $out_dir/0/data
cd $out_dir/0/data
wget http://paddlepaddle.bj.bcebos.com/demo/quick_start_preprocessed_data/preprocessed_data.tar.gz
tar zxvf preprocessed_data.tar.gz
rm preprocessed_data.tar.gz
split -d --number=l/$split_count -a 5 train.txt train.
mv train.00000 train.txt
cd $out_dir
end=$(expr $split_count - 1)
for i in $(seq 1 $end); do
    mkdir -p $i/data
    cp -r 0/data/* $i/data
    mv $i/data/train.`printf %05d $i` $i/data/train.txt
done
FROM paddledev/paddle:cpu-latest
COPY start.sh /root/
COPY start_paddle.py /root/
RUN chmod +x /root/start.sh
CMD ["bash"," -c","/root/start.sh"]
To build the PaddlePaddle training image used in the tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../../k8s_aws_en.md), run the following command:
```
docker build . -t train-image-name
```
#!/bin/sh
set -eu
jobconfig=${JOB_PATH}"/"${JOB_NAME}"/"${TRAIN_CONFIG_DIR}
cd /root
cp -rf $jobconfig .
cd $TRAIN_CONFIG_DIR
cp -rf $jobconfig/* .
python /root/start_paddle.py \
    --dot_period=10 \
    --ports_num=$CONF_PADDLE_PORTS_NUM \
    --ports_num_for_sparse=$CONF_PADDLE_PORTS_NUM_SPARSE \
    --log_period=50 \
    --num_passes=10 \
    --trainer_count=$TRAINER_COUNT \
    --saving_period=1 \
    --local=0 \
    --config=trainer_config.lr.py \
    --use_gpu=0
......@@ -23,7 +23,6 @@ import argparse
API = "/api/v1/namespaces/"
JOBSELECTOR = "labelSelector=job-name="
JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME")
JOB_PATH_DATA = JOB_PATH + "/data"
JOB_PATH_OUTPUT = JOB_PATH + "/output"
JOBNAME = os.getenv("JOB_NAME")
NAMESPACE = os.getenv("JOB_NAMESPACE")
......@@ -33,6 +32,8 @@ PADDLE_PORTS_NUM = os.getenv("CONF_PADDLE_PORTS_NUM")
PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE")
PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM")
tokenpath = '/var/run/secrets/kubernetes.io/serviceaccount/token'
def refine_unknown_args(cmd_args):
'''
......@@ -64,6 +65,7 @@ def isPodAllRunning(podlist):
for pod in podlist["items"]:
if pod["status"]["phase"] == "Running":
running += 1
print "waiting for pods running, require:", require, "running:", running
if require == running:
return True
return False
......@@ -79,8 +81,17 @@ def getPodList():
pod = API + NAMESPACE + "/pods?"
job = JOBNAME
return requests.get(apiserver + pod + JOBSELECTOR + job,
verify=False).json()
if os.path.isfile(tokenpath):
tokenfile = open(tokenpath, mode='r')
token = tokenfile.read()
Bearer = "Bearer " + token
headers = {"Authorization": Bearer}
return requests.get(apiserver + pod + JOBSELECTOR + job,
headers=headers,
verify=False).json()
else:
return requests.get(apiserver + pod + JOBSELECTOR + job,
verify=False).json()
def getIdMap(podlist):
......@@ -121,9 +132,10 @@ def startPaddle(idMap={}, train_args_dict=None):
logDir = JOB_PATH_OUTPUT + "/node_" + str(trainerId)
if not os.path.exists(JOB_PATH_OUTPUT):
os.makedirs(JOB_PATH_OUTPUT)
os.mkdir(logDir)
copyCommand = 'cp -rf ' + JOB_PATH_DATA + \
"/" + str(trainerId) + " ./data"
if not os.path.exists(logDir):
os.mkdir(logDir)
copyCommand = 'cp -rf ' + JOB_PATH + \
"/" + str(trainerId) + "/data/*" + " ./data/"
os.system(copyCommand)
startPserver = 'nohup paddle pserver' + \
" --port=" + str(PADDLE_PORT) + \
......@@ -136,9 +148,9 @@ def startPaddle(idMap={}, train_args_dict=None):
print startPserver
os.system(startPserver)
# wait until pservers completely start
time.sleep(10)
startTrainer = program + args + " > " + \
logDir + "/train.log 2>&1 < /dev/null"
time.sleep(20)
startTrainer = program + args + " 2>&1 | tee " + \
logDir + "/train.log"
print startTrainer
os.system(startTrainer)
......@@ -152,7 +164,7 @@ if __name__ == '__main__':
podlist = getPodList()
# need to wait until all pods are running
while not isPodAllRunning(podlist):
time.sleep(10)
time.sleep(20)
podlist = getPodList()
idMap = getIdMap(podlist)
startPaddle(idMap, train_args_dict)
......@@ -15,13 +15,19 @@ import sys
import os, subprocess
import shlex
from recommonmark import parser, transform
try:
import py_paddle
import paddle
import paddle.v2
except ImportError:
print("Must install paddle python package before generating documentation")
sys.exit(1)
MarkdownParser = parser.CommonMarkParser
AutoStructify = transform.AutoStructify
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
sys.path.insert(0, '@PROJ_ROOT@/python')
templates_path = ["@PROJ_ROOT@/doc_theme/templates"]
# -- General configuration ------------------------------------------------
......
......@@ -15,14 +15,20 @@ import sys
import os, subprocess
import shlex
from recommonmark import parser, transform
try:
import py_paddle
import paddle
import paddle.v2
except ImportError:
print("Must install paddle python package before generating documentation")
sys.exit(1)
MarkdownParser = parser.CommonMarkParser
AutoStructify = transform.AutoStructify
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
sys.path.insert(0, '@PROJ_ROOT@/python')
templates_path = ["@PROJ_ROOT@/doc_theme/templates"]
# -- General configuration ------------------------------------------------
......
......@@ -156,14 +156,14 @@ define_py_data_sources2(train_list='data/train.list',
obj="process",
args={"dictionary": word_dict})
```
You can refer to the following link for more detailed examples and data formats: <a href = "../../api/v1/data_provider/pydataprovider2_en.html">PyDataProvider2</a>.
## Network Architecture
We will describe four kinds of network architectures in this section.
<center> ![](./src/PipelineNetwork_en.jpg) </center>
First, you will build a logistic regression model. Later, you will also get chance to build other more powerful network architectures.
For more detailed documentation, you could refer to: <a href = "../../api/trainer_config_helpers/layers.html">layer documentation</a>. All configuration files are in `demo/quick_start` directory.
For more detailed documentation, you could refer to: <a href = "../../api/v1/trainer_config_helpers/layers.html">layer documentation</a>. All configuration files are in `demo/quick_start` directory.
### Logistic Regression
The architecture is illustrated in the following picture:
......@@ -366,7 +366,7 @@ You can use single layer LSTM model with Dropout for our text classification pro
<br>
## Optimization Algorithm
<a href = "../../api/trainer_config_helpers/optimizers.html">Optimization algorithms</a> include Momentum, RMSProp, AdaDelta, AdaGrad, Adam, and Adamax. You can use Adam optimization method here, with L2 regularization and gradient clipping, because Adam has been proved to work very well for training recurrent neural network.
<a href = "../../api/v1/trainer_config_helpers/optimizers.html">Optimization algorithms</a> include Momentum, RMSProp, AdaDelta, AdaGrad, Adam, and Adamax. You can use Adam optimization method here, with L2 regularization and gradient clipping, because Adam has been proved to work very well for training recurrent neural network.
```python
settings(batch_size=128,
......@@ -407,7 +407,7 @@ paddle train \
--init_model_path=./output/pass-0000x
```
We will give an example of performing prediction using a Recurrent model on a dataset with no labels. You can refer to the <a href = "../../api/v1/predict/swig_py_paddle_en.html">Python Prediction API</a> tutorial, or other <a href = "../../tutorials/index_en.html">demos</a>, for the prediction process using Python. You can also use the following script for inference or evaluation.
inference script (predict.sh):
......
......@@ -38,6 +38,13 @@ Arguments* Arguments::createByPaddleArgumentVector(void* ptr) {
return args;
}
Arguments* Arguments::createByPaddleArgument(const void* ptr) {
auto p = (paddle::Argument*)(ptr);
auto args = new Arguments();
args->m->outputs.push_back(*p);
return args;
}
Matrix* Arguments::getSlotValue(size_t idx) const throw(RangeError) {
auto& a = m->getArg(idx);
return Matrix::createByPaddleMatrixPtr(&a.value);
......@@ -137,9 +144,7 @@ void Arguments::setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError) {
a.cpuSequenceDims = m->cast<paddle::IVector>(vec->getSharedPtr());
}
float Arguments::sumCosts() const {
return paddle::Argument::sumCosts(m->outputs);
}
float Arguments::sum() const { return paddle::Argument::sum(m->outputs); }
int64_t Arguments::getBatchSize(size_t idx) const throw(RangeError) {
auto& a = m->getArg(idx);
......
......@@ -27,3 +27,18 @@ std::string Evaluator::toString() {
m->rawPtr->printStats(sout);
return sout.str();
}
std::vector<std::string> Evaluator::getNames() const {
std::vector<std::string> retv;
m->rawPtr->getNames(&retv);
return retv;
}
double Evaluator::getValue(const std::string name) const {
paddle::Error err;
double v = m->rawPtr->getValue(name, &err);
if (err) {
throw std::runtime_error(err.msg());
}
return v;
}
......@@ -142,14 +142,28 @@ Parameter* GradientMachine::getParameter(size_t i) throw(RangeError) {
}
}
size_t GradientMachine::getNonStaticParameterSize() const {
return m->machine->getNonStaticParameters().size();
}
Parameter* GradientMachine::getNonStaticParameter(size_t i) throw(RangeError) {
auto params = m->machine->getNonStaticParameters();
if (i < params.size()) {
return Parameter::createFromSharedPtr(
&m->machine->getNonStaticParameters()[i]);
} else {
throw RangeError();
}
}
void GradientMachine::randParameters() { m->machine->randParameters(); }
Matrix* GradientMachine::getLayerOutput(const std::string& layerName) const
Arguments* GradientMachine::getLayerOutput(const std::string& layerName) const
throw(UnsupportError) {
auto nn = std::dynamic_pointer_cast<paddle::NeuralNetwork>(m->machine);
auto nn = m->machine;
if (nn) {
auto mat = nn->getLayerOutput(layerName);
return Matrix::createByPaddleMatrixPtr(&mat);
auto arg = nn->getLayerOutput(layerName);
return Arguments::createByPaddleArgument(&arg);
} else {
throw UnsupportError();
}
......
......@@ -47,6 +47,9 @@ void setUseGpu(bool useGpu);
/// Return true if this py_paddle is compiled in GPU Version
bool isGpuVersion();
/// Return FLAGS_trainer_count
int getTrainerCount();
/// The Error of IO Operation. Such as file not found, etc.
class IOError {};
......@@ -450,10 +453,11 @@ public:
IVector* vec) throw(RangeError);
void setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError);
float sumCosts() const;
float sum() const;
private:
static Arguments* createByPaddleArgumentVector(void* ptr);
static Arguments* createByPaddleArgument(const void* ptr);
void* getInternalArgumentsPtr() const;
private:
......@@ -767,9 +771,12 @@ public:
size_t getParameterSize() const;
Parameter* getParameter(size_t i) throw(RangeError);
size_t getNonStaticParameterSize() const;
Parameter* getNonStaticParameter(size_t i) throw(RangeError);
void randParameters();
Matrix* getLayerOutput(const std::string& layerName) const
Arguments* getLayerOutput(const std::string& layerName) const
throw(UnsupportError);
/**
......@@ -900,6 +907,10 @@ public:
*/
std::string toString();
std::vector<std::string> getNames() const;
double getValue(const std::string name) const;
private:
EvaluatorPrivate* m;
......@@ -952,7 +963,7 @@ public:
Arguments* getForwardOutput();
Matrix* getLayerOutput(const std::string& layerName);
Arguments* getLayerOutput(const std::string& layerName) const;
};
/// the N-Best results generated from one input sequence.
......
......@@ -131,12 +131,11 @@ void Trainer::testOneDataBatch(size_t batchSize, const Arguments& args) {
void TrainerPrivate::finishTestPeriod() { tester_->finishTestPeriod(); }
void Trainer::finishTestPeriod() { m->finishTestPeriod(); }
Matrix* Trainer::getLayerOutput(const std::string& layerName) {
auto nn = std::dynamic_pointer_cast<paddle::NeuralNetwork>(
this->m->getGradientMachine());
Arguments* Trainer::getLayerOutput(const std::string& layerName) const {
auto nn = this->m->getGradientMachine();
CHECK(nn) << "trainerInternal_.getGradientMachine() is not NeuralNetwork";
auto m = nn->getLayerOutput(layerName);
return Matrix::createByPaddleMatrixPtr(&m);
auto arg = nn->getLayerOutput(layerName);
return Arguments::createByPaddleArgument(&arg);
}
void Trainer::forwardOneBatch(size_t batchSize) {
......
......@@ -54,5 +54,7 @@ bool isGpuVersion() {
#endif
}
int getTrainerCount() { return FLAGS_trainer_count; }
static_assert(NUM_PARAMETER_TYPES == paddle::NUM_PARAMETER_TYPES,
"The Parameter Type should be same in core/api and core/common");
......@@ -22,7 +22,7 @@ class TestArguments(unittest.TestCase):
args = swig_paddle.Arguments.createArguments(1)
args.setSlotValue(0, m)
self.assertAlmostEqual(27.0, args.sumCosts())
self.assertAlmostEqual(27.0, args.sum())
mat = args.getSlotValue(0)
assert isinstance(mat, swig_paddle.Matrix)
......
......@@ -68,7 +68,7 @@ class TestMatrix(unittest.TestCase):
def test_numpyCpu(self):
numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype="float32")
m = swig_paddle.Matrix.createCpuDenseFromNumpy(numpy_mat, copy=False)
m = swig_paddle.Matrix.createCpuDenseFromNumpy(numpy_mat, False)
self.assertEqual((int(m.getHeight()), int(m.getWidth())),
numpy_mat.shape)
......
......@@ -89,9 +89,14 @@ def main():
except Exception as e:
print e
ev = m.makeEvaluator()
ev.start()
m.forwardBackward(inArgs, outArgs, swig_paddle.PASS_TRAIN,
update_callback)
m.eval(ev)
ev.finish()
for name in ev.getNames():
print name, ev.getValue(name)
for optimizer in optimizers:
optimizer.finishBatch()
......
......@@ -43,7 +43,7 @@ class TestIVector(unittest.TestCase):
def test_cpu_numpy(self):
vec = np.array([1, 3, 4, 65, 78, 1, 4], dtype="int32")
iv = swig_paddle.IVector.createCpuVectorFromNumpy(vec, copy=False)
iv = swig_paddle.IVector.createCpuVectorFromNumpy(vec, False)
self.assertEqual(vec.shape[0], int(iv.__len__()))
vec[4] = 832
for i in xrange(len(iv)):
......@@ -106,7 +106,7 @@ class TestVector(unittest.TestCase):
def testCpuNumpy(self):
numpy_arr = np.array([1.2, 2.3, 3.4, 4.5], dtype="float32")
vec = swig_paddle.Vector.createCpuVectorFromNumpy(numpy_arr, copy=False)
vec = swig_paddle.Vector.createCpuVectorFromNumpy(numpy_arr, False)
assert isinstance(vec, swig_paddle.Vector)
numpy_arr[0] = 0.1
for n, v in zip(numpy_arr, vec):
......
......@@ -69,19 +69,6 @@ extern void hl_sequence_softmax_forward(real* A_d,
const int* index,
int numSequence);
/**
* @brief Matrix classification error.
*
* @param[in] A_d input matrix (M x N).
* @param[in] B_d input vector (M x 1).
* @param[out] C_d output vector (M x 1).
* @param[in] dimM matrix height.
* @param[in] dimN matrix width.
*
*/
extern void hl_matrix_classification_error(
real* A_d, int* B_d, real* C_d, int dimM, int dimN);
/**
* @brief Matrix cross entropy.
*
......@@ -188,48 +175,6 @@ extern void hl_param_relu_backward_diff(real* grad_o,
int width,
int height,
int partial_sum);
/**
* @brief cos sim forward
*
* @param[out] output output data
* @param[in] input1 input1 data(matrix)
* @param[in] input2 input2 data(matrix or vector)
* @param[in] width matrix width
* @param[in] input1_height input1_height
* @param[in] input2_height input2_height
* @param[in] scale scale factor
*/
extern void hl_cossim(real* output,
real* input1,
real* input2,
int width,
int input1_height,
int input2_height,
real scale);
/**
* @brief cos sim derivate
*
* @param[in] grad output grad
* @param[in] output output data
* @param[in] prevOutX input1 data
* @param[in] prevOutY input2 data
* @param[out] prevGradX input1 grad
* @param[out] prevGradY input2 grad
* @param[in] width matrix width
* @param[in] input1_height input1 height
* @param[in] input2_height input2 height
* @param[in] scale scale factor
*/
extern void hl_cossim_derivative(real* grad,
real* output,
real* prevOutX,
real* prevOutY,
real* prevGradX,
real* prevGradY,
int width,
int input1_height,
int input2_height,
real scale);
/**
* @brief Matrix addition: A_d[i][j] += scale * B_d[j/channel].
......@@ -267,4 +212,16 @@ extern void hl_matrix_collect_shared_bias(real* B_d,
const int dimN,
real scale);
/**
* @brief Matrix rotation in 90 degrees
*
* @param[in] mat input matrix (M x N).
* @param[out] matRot output matrix (N x M).
* @param[in] dimM input matrix height.
* @param[in] dimN input matrix width.
* @param[in] clockWise rotation direction
*/
extern void hl_matrix_rotate(
real* mat, real* matRot, int dimM, int dimN, bool clockWise);
#endif /* HL_MATRIX_H_ */
......@@ -58,4 +58,30 @@ extern void hl_sparse_matrix_top_k(real* topVal,
int beamSize,
int numSamples);
#endif /* HL_TOP_K_H_ */
/**
* @brief Matrix classification error.
*
* @param[out] topVal top k element.
* @param[in] ldv leading dimension of topVal.
* @param[out] topIds top k index.
* @param[in] src input value.
* @param[in] lds leading dimension of src.
* @param[in] dim width of input value.
* @param[in] topkSize size of top k element.
* @param[in] numSamples height of input value.
* @param[in] label ground truth label.
* @param[out] recResult top-k classification error.
*
*/
extern void hl_matrix_classification_error(real* topVal,
int ldv,
int* topIds,
real* src,
int lds,
int dim,
int topkSize,
int numSamples,
int* label,
real* recResult);
#endif // HL_TOP_K_H_
......@@ -35,8 +35,16 @@ inline void hl_sequence_softmax_forward(real* A_d,
inline void hl_matrix_softmax_derivative(
real* grad_d, real* output_d, real* sftmaxSum_d, int dimM, int dimN) {}
inline void hl_matrix_classification_error(
real* A_d, int* B_d, real* C_d, int dimM, int dimN) {}
inline void hl_matrix_classification_error(real* topVal,
int ldv,
int* topIds,
real* src,
int lds,
int dim,
int topkSize,
int numSamples,
int* label,
real* recResult) {}
inline void hl_matrix_cross_entropy(
real* A_d, real* C_d, int* label_d, int dimM, int dimN) {}
......@@ -74,25 +82,6 @@ inline void hl_param_relu_backward_diff(real* grad_o,
int height,
int partial_sum) {}
inline void hl_cossim(real* output,
real* input1,
real* input2,
int width,
int input1_height,
int input2_height,
real scale) {}
inline void hl_cossim_derivative(real* grad,
real* output,
real* prevOutX,
real* prevOutY,
real* prevGradX,
real* prevGradY,
int width,
int input1_height,
int input2_height,
real scale) {}
inline void hl_matrix_add_shared_bias(real* A_d,
real* B_d,
const int channel,
......@@ -106,4 +95,8 @@ inline void hl_matrix_collect_shared_bias(real* B_d,
const int dimM,
const int dimN,
real scale) {}
inline void hl_matrix_rotate(
real* mat, real* matRot, int dimM, int dimN, bool clockWise) {}
#endif // HL_MATRIX_STUB_H_
......@@ -265,59 +265,6 @@ void hl_matrix_softmax_derivative(real *grad_d,
CHECK_SYNC("hl_matrix_softmax_derivative failed");
}
template<int blockSize>
__global__ void KeMatrixClassificationError(real* in_A,
int* in_B,
real* out_C,
int dimN) {
__shared__ real max_s[blockSize];
__shared__ int max_l[blockSize];
const int tid = threadIdx.x;
const int rowId = blockIdx.x;
max_s[tid] = -1e30f;
in_A += rowId * dimN;
real tmp;
for (int colId = tid; colId < dimN; colId += blockSize) {
tmp = in_A[colId];
if (max_s[tid] < tmp) {
max_s[tid] = tmp;
max_l[tid] = colId;
}
}
__syncthreads();
for (int stride = blockSize/2; stride > 0; stride = stride/2) {
if (tid < stride) {
if (max_s[tid] < max_s[tid + stride]) {
max_s[tid] = max_s[tid + stride];
max_l[tid] = max_l[tid + stride];
}
}
__syncthreads();
}
__syncthreads();
if (tid == 0) {
out_C[rowId] = (max_l[0] == in_B[rowId] ? 0 : 1.0f);
}
}
void hl_matrix_classification_error(real* A_d,
int* B_d,
real* C_d,
int dimM,
int dimN) {
CHECK_NOTNULL(A_d);
CHECK_NOTNULL(B_d);
CHECK_NOTNULL(C_d);
// each sample is calculated by one block
KeMatrixClassificationError<1024><<< dimM, 1024, 0, STREAM_DEFAULT >>>
(A_d, B_d, C_d, dimN);
CHECK_SYNC("hl_matrix_classification_error");
}
__global__ void KeMatrixMultiBinaryCrossEntropy(real* output,
real* entropy,
int* row,
......@@ -584,177 +531,6 @@ void hl_param_relu_backward_diff(real* grad_o,
CHECK_SYNC("hl_param_relu_backward_diff failed");
}
template<int blockSize>
__global__ void KeCosSim(real* output,
real* input1,
real* input2,
int width,
int input1_height,
int input2_height,
real scale) {
const int ty = blockIdx.y;
int tid = threadIdx.x;
__shared__ real xx[blockSize];
__shared__ real yy[blockSize];
__shared__ real xy[blockSize];
xx[tid] = 0.0;
yy[tid] = 0.0;
xy[tid] = 0.0;
__syncthreads();
input1 += ty * width;
if (input2_height > 1) {
input2 += ty * width;
}
for (int index = tid; index < width; index += blockSize) {
real x = input1[index];
real y = input2[index];
xx[tid] += x * x;
yy[tid] += y * y;
xy[tid] += x * y;
}
__syncthreads();
for (int s = blockSize / 2; s > 0; s >>= 1) {
if (tid < s) {
xx[tid] += xx[tid + s];
yy[tid] += yy[tid + s];
xy[tid] += xy[tid + s];
}
__syncthreads();
}
if (tid == 0) {
output[ty] = scale * xy[0] / (sqrt(xx[0]) * sqrt(yy[0]));
}
}
void hl_cossim(real* output,
real* input1,
real* input2,
int width,
int input1_height,
int input2_height,
real scale) {
CHECK_NOTNULL(output);
CHECK_NOTNULL(input1);
CHECK_NOTNULL(input2);
const int blockSize = 256;
dim3 threads(blockSize, 1);
dim3 grid(1, input1_height);
KeCosSim<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>
(output, input1, input2, width, input1_height, input2_height, scale);
CHECK_SYNC("hl_cossim failed");
}
template<int blockSize>
__global__ void KeCosSimDerivative(real* grad,
real* output,
real* prevOutX,
real* prevOutY,
real* prevGradX,
real* prevGradY,
int width,
int input1_height,
int input2_height,
real scale) {
const int ty = blockIdx.y;
int tid = threadIdx.x;
__shared__ real xx[blockSize];
__shared__ real yy[blockSize];
__shared__ real xy[blockSize];
xx[tid] = 0.0;
yy[tid] = 0.0;
xy[tid] = 0.0;
__syncthreads();
prevOutX += ty * width;
prevGradX += ty * width;
if (input2_height > 1) {
prevOutY += ty * width;
prevGradY += ty * width;
}
for (int index = tid; index < width; index += blockSize) {
real x = prevOutX[index];
real y = prevOutY[index];
xx[tid] += x * x;
yy[tid] += y * y;
xy[tid] += x * y;
}
__syncthreads();
for (int s = blockSize / 2; s > 0; s >>= 1) {
if (tid < s) {
xx[tid] += xx[tid + s];
yy[tid] += yy[tid + s];
xy[tid] += xy[tid + s];
}
__syncthreads();
}
if (xy[0] == 0) {
real reciprocal = 1.0 / (sqrt(xx[0]) * sqrt(yy[0]));
for (int index = tid; index < width; index += blockSize) {
prevGradX[index] +=
scale * grad[ty] * prevOutY[index] * reciprocal;
if (input2_height > 1) {
prevGradY[index] +=
scale * grad[ty] * prevOutX[index] * reciprocal;
} else {
paddle::paddleAtomicAdd(prevGradY + index,
scale * grad[ty] * prevOutX[index] * reciprocal);
}
}
} else {
real reciprocalXY = 1.0 / xy[0];
real reciprocalSquareSumX = 1.0 / xx[0];
real reciprocalSquareSumY = 1.0 / yy[0];
for (int index = tid; index < width; index += blockSize) {
prevGradX[index] += output[ty] * grad[ty] *
(prevOutY[index] * reciprocalXY -
prevOutX[index] * reciprocalSquareSumX);
if (input2_height > 1) {
prevGradY[index] += output[ty] * grad[ty] *
(prevOutX[index] * reciprocalXY -
prevOutY[index] * reciprocalSquareSumY);
} else {
paddle::paddleAtomicAdd(prevGradY + index, output[ty] * grad[ty] *
(prevOutX[index] * reciprocalXY -
prevOutY[index] * reciprocalSquareSumY));
}
}
}
}
void hl_cossim_derivative(real* grad,
real* output,
real* prevOutX,
real* prevOutY,
real* prevGradX,
real* prevGradY,
int width,
int input1_height,
int input2_height,
real scale) {
CHECK_NOTNULL(grad);
CHECK_NOTNULL(output);
CHECK_NOTNULL(prevOutX);
CHECK_NOTNULL(prevOutY);
CHECK_NOTNULL(prevGradX);
CHECK_NOTNULL(prevGradY);
const int blockSize = 256;
dim3 threads(blockSize, 1);
dim3 grid(1, input1_height);
KeCosSimDerivative<blockSize><<<grid, threads, 0, STREAM_DEFAULT>>>
(grad, output, prevOutX, prevOutY, prevGradX, prevGradY, width,
input1_height, input2_height, scale);
CHECK_SYNC("hl_cossim_derivate failed");
}
__global__ void KeMatrixAddSharedBias(real* A,
real* B,
const int channel,
......@@ -840,3 +616,28 @@ void hl_matrix_collect_shared_bias(real* B_d,
(B_d, A_d, channel, dimM, dimN, dim, limit, scale);
CHECK_SYNC("hl_matrix_collect_shared_bias failed");
}
__global__ void keMatrixRotate(real* mat, real* matRot,
int dimM, int dimN, bool clockWise) {
int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx < dimM * dimN) {
int i = idx / dimN;
int j = idx % dimN;
if (clockWise) {
matRot[j * dimM + i] = mat[(dimM - i - 1) * dimN + j];
} else {
matRot[j * dimM + i] = mat[i * dimN + (dimN - j - 1)];
}
}
}
void hl_matrix_rotate(real *mat, real* matRot,
int dimM, int dimN, bool clockWise) {
CHECK_NOTNULL(mat);
CHECK_NOTNULL(matRot);
const int threads = 512;
const int blocks = DIVUP(dimM * dimN, threads);
keMatrixRotate<<< blocks, threads, 0, STREAM_DEFAULT >>>
(mat, matRot, dimM, dimN, clockWise);
CHECK_SYNC("hl_matrix_rotate failed");
}
......@@ -384,3 +384,81 @@ void hl_sparse_matrix_top_k(real* topVal, int ldv,
CHECK_SYNC("hl_sparse_matrix_top_k failed");
}
/**
 * Each block computes one sample.
 * In a block:
 * 1. every thread gets its top maxLength values;
 * 2. merge into shTopK, block-reduce and take the max value;
 * 3. go to the second step until one thread's topK values are exhausted;
 * 4. go back to the first step until all topK values are obtained.
*/
template<int maxLength, int blockSize>
__global__ void KeMatrixTopKClassificationError(real* topVal, int ldv,
int * topIds,
real* src, int lds,
int dim,
int beamSize,
int* label,
real* recResult) {
__shared__ Pair shTopK[blockSize];
__shared__ int maxId[blockSize / 2];
const int tid = threadIdx.x;
const int warp = threadIdx.x / 32;
src += blockIdx.x * lds;
topVal += blockIdx.x * ldv;
topIds += blockIdx.x * beamSize;
Pair topK[maxLength]; // NOLINT
int beam = maxLength;
Pair max;
bool isEmpty = false;
bool firstStep = true;
int topkSize = beamSize;
for (int k = 0; k < maxLength; k++) {
topK[k].set(-HL_FLOAT_MAX, -1);
}
while (beamSize) {
threadGetTopK<maxLength, blockSize>
(topK, beam, beamSize, src, firstStep, isEmpty, max, dim, tid);
shTopK[tid] = topK[0];
blockReduce<maxLength, blockSize>
(shTopK, maxId, topK, &topVal, &topIds, beam, beamSize, tid, warp);
}
__syncthreads();
if (tid == 0) {
for (int i = 0; i < topkSize; i++) {
if (*--topIds == label[blockIdx.x]) {
recResult[blockIdx.x] = 0;
break;
}
recResult[blockIdx.x] = 1.0f;
}
}
}
void hl_matrix_classification_error(real* topVal, int ldv,
int* topIds,
real* src, int lds,
int dim,
int topkSize,
int numSamples,
int* label,
real* recResult) {
CHECK_NOTNULL(topVal);
CHECK_NOTNULL(topIds);
CHECK_NOTNULL(src);
if (topkSize > dim) topkSize = dim;
dim3 threads(256, 1);
dim3 grid(numSamples, 1);
KeMatrixTopKClassificationError<5, 256>
<<< grid, threads, 0, STREAM_DEFAULT >>>
(topVal, ldv, topIds, src, lds, dim, topkSize, label, recResult);
CHECK_SYNC("hl_matrix_top_k classification error failed");
}