Commit d539e780 authored by Yu Yang

Merge commit '12b61934'

*.DS_Store *.DS_Store
build/ build/
*.user
.vscode
.idea
\ No newline at end of file
language: cpp
cache: ccache
sudo: required
dist: trusty
env:
- JOB=DOCS
- JOB=BUILD_AND_TEST
addons:
apt:
packages:
- gcc-4.8
- g++-4.8
- wget
- git
- build-essential
- libatlas-base-dev
- python
- python-pip
- python2.7-dev
- m4
- libprotobuf-dev
- doxygen
- protobuf-compiler
- python-protobuf
- python-numpy
- python-wheel
- libgoogle-glog-dev
- libgflags-dev
- libgtest-dev
before_install:
- pip install wheel protobuf sphinx breathe recommonmark
- sudo paddle/scripts/travis/before_install.sh
script:
- paddle/scripts/travis/main.sh
notifications:
email:
on_success: change
on_failure: always
...@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8) ...@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 2.8)
project(paddle CXX C) project(paddle CXX C)
set(PADDLE_MAJOR_VERSION 0) set(PADDLE_MAJOR_VERSION 0)
set(PADDLE_MINOR_VERSION 8) set(PADDLE_MINOR_VERSION 8)
set(PADDLE_PATCH_VERSION 0b) set(PADDLE_PATCH_VERSION 0b1)
set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION}) set(PADDLE_VERSION ${PADDLE_MAJOR_VERSION}.${PADDLE_MINOR_VERSION}.${PADDLE_PATCH_VERSION})
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake") set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
...@@ -14,8 +14,10 @@ find_package(CUDA QUIET) ...@@ -14,8 +14,10 @@ find_package(CUDA QUIET)
find_package(Protobuf REQUIRED) find_package(Protobuf REQUIRED)
find_package(PythonLibs 2.7 REQUIRED) find_package(PythonLibs 2.7 REQUIRED)
find_package(PythonInterp 2.7 REQUIRED) find_package(PythonInterp 2.7 REQUIRED)
find_package(NumPy) find_package(ZLIB REQUIRED)
find_package(NumPy REQUIRED)
find_package(Threads REQUIRED) find_package(Threads REQUIRED)
find_package(AVX QUIET)
find_package(Glog) find_package(Glog)
find_package(Gflags QUIET) find_package(Gflags QUIET)
find_package(GTest) find_package(GTest)
...@@ -27,7 +29,7 @@ find_program(M4_EXECUTABLE m4) ...@@ -27,7 +29,7 @@ find_program(M4_EXECUTABLE m4)
option(WITH_DSO "Compile PaddlePaddle with dynamic linked libraries" ON) option(WITH_DSO "Compile PaddlePaddle with dynamic linked libraries" ON)
option(WITH_GPU "Compile PaddlePaddle with gpu" ${CUDA_FOUND}) option(WITH_GPU "Compile PaddlePaddle with gpu" ${CUDA_FOUND})
option(WITH_DOUBLE "Compile PaddlePaddle with double precision, otherwise use single precision" OFF) option(WITH_DOUBLE "Compile PaddlePaddle with double precision, otherwise use single precision" OFF)
option(WITH_AVX "Compile PaddlePaddle with avx intrinsics" ON) # TODO(yuyang18): Check AVX is supported or not as default value option(WITH_AVX "Compile PaddlePaddle with avx intrinsics" ${AVX_FOUND})
option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON) option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
option(WITH_STYLE_CHECK "Style Check for PaddlePaddle" ${PYTHONINTERP_FOUND}) option(WITH_STYLE_CHECK "Style Check for PaddlePaddle" ${PYTHONINTERP_FOUND})
option(WITH_RDMA "Compile PaddlePaddle with rdma support" OFF) option(WITH_RDMA "Compile PaddlePaddle with rdma support" OFF)
...@@ -37,6 +39,7 @@ option(WITH_TIMER "Compile PaddlePaddle use timer" OFF) ...@@ -37,6 +39,7 @@ option(WITH_TIMER "Compile PaddlePaddle use timer" OFF)
option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND}) option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND})
option(WITH_DOC "Compile PaddlePaddle with documentation" OFF) option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ${SWIG_FOUND}) option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ${SWIG_FOUND})
option(ON_TRAVIS "Running test on travis-ci or not." OFF)
if(NOT CMAKE_BUILD_TYPE) if(NOT CMAKE_BUILD_TYPE)
set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING set(CMAKE_BUILD_TYPE "RelWithDebInfo" CACHE STRING
"Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel" "Choose the type of build, options are: Debug Release RelWithDebInfo MinSizeRel"
...@@ -99,8 +102,8 @@ if(NOT WITH_TIMER) ...@@ -99,8 +102,8 @@ if(NOT WITH_TIMER)
endif(NOT WITH_TIMER) endif(NOT WITH_TIMER)
if(WITH_AVX) if(WITH_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mavx") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAGS}")
else(WITH_AVX) else(WITH_AVX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -msse3")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse3") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse3")
......
# PaddlePaddle # PaddlePaddle
[![Build Status](https://travis-ci.org/baidu/Paddle.svg?branch=master)](https://travis-ci.org/baidu/Paddle)
Welcome to the PaddlePaddle GitHub.
The software will be released on Sept. 30 with full documentation and installation support.
A pre-release version is available now for those who are eager to take a look.
PaddlePaddle (PArallel Distributed Deep LEarning) is an easy-to-use, PaddlePaddle (PArallel Distributed Deep LEarning) is an easy-to-use,
efficient, flexible and scalable deep learning platform, which is originally efficient, flexible and scalable deep learning platform, which is originally
......
# This file is used to check the level of AVX support on your machine
# so that PaddlePaddle can unleash the vectorization power of multicore.
INCLUDE(CheckCXXSourceRuns)
SET(FIND_AVX_10)
SET(FIND_AVX_20)
SET(AVX_FLAGS)
SET(AVX_FOUND)
# Check AVX 2
SET(CMAKE_REQUIRED_FLAGS)
IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
SET(CMAKE_REQUIRED_FLAGS "-mavx2")
ELSEIF(MSVC AND NOT CMAKE_CL_64) # reserve for WINDOWS
SET(CMAKE_REQUIRED_FLAGS "/arch:AVX2")
ENDIF()
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4);
__m256i result = _mm256_abs_epi32 (a);
return 0;
}" FIND_AVX_20)
# Check AVX
SET(CMAKE_REQUIRED_FLAGS)
IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
SET(CMAKE_REQUIRED_FLAGS "-mavx")
ELSEIF(MSVC AND NOT CMAKE_CL_64)
SET(CMAKE_REQUIRED_FLAGS "/arch:AVX")
endif()
CHECK_CXX_SOURCE_RUNS("
#include <immintrin.h>
int main()
{
__m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f);
__m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f);
__m256 result = _mm256_add_ps (a, b);
return 0;
}" FIND_AVX_10)
IF(${FIND_AVX_20})
IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
SET(AVX_FLAGS "${AVX_FLAGS} -mavx2")
ELSEIF(MSVC)
SET(AVX_FLAGS "${AVX_FLAGS} /arch:AVX2")
ENDIF()
ENDIF()
IF(${FIND_AVX_10})
IF(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID MATCHES "Clang")
SET(AVX_FLAGS "${AVX_FLAGS} -mavx")
ELSEIF(MSVC)
SET(AVX_FLAGS "${AVX_FLAGS} /arch:AVX")
ENDIF()
ENDIF()
IF("${FIND_AVX_10}" OR "${FIND_AVX_20}")
SET(AVX_FOUND TRUE)
MESSAGE(STATUS "Found CPU support for ${AVX_FLAGS}.")
ENDIF()
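The probe above only chooses the build default; if it cannot run on your machine (for example when cross compiling), the result can still be overridden at configure time. A minimal sketch using the `WITH_AVX` option defined in CMakeLists.txt:

```bash
# force the SSE3 code path even if the build host supports AVX
cmake .. -DWITH_AVX=OFF
```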
...@@ -65,12 +65,14 @@ set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas") ...@@ -65,12 +65,14 @@ set(OPENBLAS_ROOT $ENV{OPENBLAS_ROOT} CACHE PATH "Folder contains Openblas")
set(OPENBLAS_INCLUDE_SEARCH_PATHS set(OPENBLAS_INCLUDE_SEARCH_PATHS
${OPENBLAS_ROOT}/include ${OPENBLAS_ROOT}/include
/usr/include /usr/include
/usr/include/openblas) /usr/include/openblas
/usr/local/opt/openblas/include)
set(OPENBLAS_LIB_SEARCH_PATHS set(OPENBLAS_LIB_SEARCH_PATHS
${OPENBLAS_ROOT}/lib ${OPENBLAS_ROOT}/lib
/usr/lib /usr/lib
/usr/lib/blas/openblas /usr/lib/blas/openblas
/usr/lib/openblas) /usr/lib/openblas
/usr/local/opt/openblas/lib)
find_path(OPENBLAS_INC_DIR NAMES cblas.h find_path(OPENBLAS_INC_DIR NAMES cblas.h
PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS}) PATHS ${OPENBLAS_INCLUDE_SEARCH_PATHS})
......
...@@ -15,7 +15,7 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS ...@@ -15,7 +15,7 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
$ENV{CUDNN_ROOT}/lib64 $ENV{CUDNN_ROOT}/lib64
$ENV{CUDNN_ROOT}/lib $ENV{CUDNN_ROOT}/lib
/usr/lib) /usr/lib)
find_library(CUDNN_LIBRARY NAMES libcudnn.so # libcudnn_static.a find_library(CUDNN_LIBRARY NAMES libcudnn.so libcudnn.dylib # libcudnn_static.a
PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist} PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist}
NO_DEFAULT_PATH NO_DEFAULT_PATH
DOC "Path to cuDNN library.") DOC "Path to cuDNN library.")
......
...@@ -8,7 +8,7 @@ include(CheckCXXSymbolExists) ...@@ -8,7 +8,7 @@ include(CheckCXXSymbolExists)
# is_c: is C flag or C++ flag, bool type. # is_c: is C flag or C++ flag, bool type.
# src_list: The list name which the flag name will be append to. # src_list: The list name which the flag name will be append to.
# flag_name: the flag name for compiler, such as '-Werror' '-Wall' etc # flag_name: the flag name for compiler, such as '-Werror' '-Wall' etc
# rest arguments: not used. # rest arguments: not used.
function(safe_set_flag is_c src_list flag_name) function(safe_set_flag is_c src_list flag_name)
string(REPLACE "-" "_" safe_name ${flag_name}) string(REPLACE "-" "_" safe_name ${flag_name})
string(REPLACE "=" "_" safe_name ${safe_name}) string(REPLACE "=" "_" safe_name ${safe_name})
...@@ -44,7 +44,7 @@ CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS) ...@@ -44,7 +44,7 @@ CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS)
if(NOT UINT64_MAX_EXISTS) if(NOT UINT64_MAX_EXISTS)
set(CMAKE_REQUIRED_DEFINITIONS -D__STDC_LIMIT_MACROS) set(CMAKE_REQUIRED_DEFINITIONS -D__STDC_LIMIT_MACROS)
CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS_HERE) CHECK_CXX_SYMBOL_EXISTS(UINT64_MAX "stdint.h" UINT64_MAX_EXISTS_HERE)
if(UINT64_MAX_EXISTS_HERE) if(UINT64_MAX_EXISTS_HERE)
set(CMAKE_REQUIRED_DEFINITIONS) set(CMAKE_REQUIRED_DEFINITIONS)
add_definitions(-D__STDC_LIMIT_MACROS) add_definitions(-D__STDC_LIMIT_MACROS)
else() else()
...@@ -74,13 +74,37 @@ endforeach() ...@@ -74,13 +74,37 @@ endforeach()
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc. # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here. # So, don't set these flags here.
function(specify_cuda_arch cuda_version cuda_arch)
if(${cuda_version} VERSION_GREATER "8.0")
foreach(capability 61 62)
if(${cuda_arch} STREQUAL ${capability})
list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
endif()
endforeach()
elseif(${cuda_version} VERSION_GREATER "7.0" AND ${cuda_arch} STREQUAL "53")
list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
endif()
endfunction()
# Common gpu architectures: Kepler, Maxwell
foreach(capability 30 35 50) foreach(capability 30 35 50)
list(APPEND __arch_flags "-gencode arch=compute_${capability},code=sm_${capability}") list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}")
endforeach() endforeach()
if (CUDA_VERSION VERSION_GREATER "7.0") if (CUDA_VERSION VERSION_GREATER "7.0")
list(APPEND __arch_flags "-gencode arch=compute_52,code=sm_52") list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52")
endif() endif()
# Modern gpu architectures: Pascal
if (CUDA_VERSION VERSION_GREATER "8.0")
list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60")
endif()
# Custom gpu architecture
set(CUDA_ARCH)
if(CUDA_ARCH)
specify_cuda_arch(${CUDA_VERSION} ${CUDA_ARCH})
endif()
set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS})
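Since the custom architecture is only consumed by the `if(CUDA_ARCH)` block above, one way to supply it is as a CMake cache entry at configure time. A sketch, assuming the value is passed on the command line (5.3 is one of the capabilities `specify_cuda_arch` accepts):

```bash
cmake .. -DCUDA_ARCH=53
```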
# Some common routine for paddle compile. # Some common routine for paddle compile.
# target_circle_link_libraries # target_circle_link_libraries
# Link libraries to target which has circle dependencies. # Link libraries to target which has circle dependencies.
# #
# First Argument: target name want to be linked with libraries # First Argument: target name want to be linked with libraries
# Rest Arguments: libraries which link together. # Rest Arguments: libraries which link together.
function(target_circle_link_libraries TARGET_NAME) function(target_circle_link_libraries TARGET_NAME)
if(APPLE)
set(LIBS)
set(inArchive OFF)
set(libsInArgn)
foreach(arg ${ARGN})
if(${arg} STREQUAL "ARCHIVE_START")
set(inArchive ON)
elseif(${arg} STREQUAL "ARCHIVE_END")
set(inArchive OFF)
else()
if(inArchive)
list(APPEND LIBS "-Wl,-force_load")
endif()
list(APPEND LIBS ${arg})
list(APPEND libsInArgn ${arg})
endif()
endforeach()
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
list(APPEND LIBS "-undefined dynamic_lookup")
endif()
list(REVERSE libsInArgn)
target_link_libraries(${TARGET_NAME}
${LIBS}
${libsInArgn})
else() # LINUX
set(LIBS)
foreach(arg ${ARGN})
if(${arg} STREQUAL "ARCHIVE_START")
list(APPEND LIBS "-Wl,--whole-archive")
elseif(${arg} STREQUAL "ARCHIVE_END")
list(APPEND LIBS "-Wl,--no-whole-archive")
else()
list(APPEND LIBS ${arg})
endif()
endforeach()
target_link_libraries(${TARGET_NAME}
"-Wl,--start-group"
${LIBS}
"-Wl,--end-group")
endif()
endfunction() endfunction()
# compile_cu_as_cpp # compile_cu_as_cpp
...@@ -41,20 +80,20 @@ function(link_paddle_exe TARGET_NAME) ...@@ -41,20 +80,20 @@ function(link_paddle_exe TARGET_NAME)
if(PADDLE_WITH_INTERNAL) if(PADDLE_WITH_INTERNAL)
set(INTERAL_LIBS paddle_internal_gserver paddle_internal_parameter) set(INTERAL_LIBS paddle_internal_gserver paddle_internal_parameter)
target_circle_link_libraries(${TARGET_NAME} target_circle_link_libraries(${TARGET_NAME}
-Wl,--whole-archive ARCHIVE_START
paddle_internal_gserver paddle_internal_gserver
paddle_internal_owlqn paddle_internal_owlqn
-Wl,--no-whole-archive ARCHIVE_END
paddle_internal_parameter) paddle_internal_parameter)
else() else()
set(INTERAL_LIBS "") set(INTERAL_LIBS "")
endif() endif()
target_circle_link_libraries(${TARGET_NAME} target_circle_link_libraries(${TARGET_NAME}
-Wl,--whole-archive ARCHIVE_START
paddle_gserver paddle_gserver
${METRIC_LIBS} ${METRIC_LIBS}
-Wl,--no-whole-archive ARCHIVE_END
paddle_pserver paddle_pserver
paddle_trainer_lib paddle_trainer_lib
paddle_network paddle_network
...@@ -67,9 +106,9 @@ function(link_paddle_exe TARGET_NAME) ...@@ -67,9 +106,9 @@ function(link_paddle_exe TARGET_NAME)
${PROTOBUF_LIBRARY} ${PROTOBUF_LIBRARY}
${CMAKE_THREAD_LIBS_INIT} ${CMAKE_THREAD_LIBS_INIT}
${CBLAS_LIBS} ${CBLAS_LIBS}
${CMAKE_DL_LIBS} ${ZLIB_LIBRARIES}
${INTERAL_LIBS} ${INTERAL_LIBS}
-lz) ${CMAKE_DL_LIBS})
if(WITH_PYTHON) if(WITH_PYTHON)
target_link_libraries(${TARGET_NAME} target_link_libraries(${TARGET_NAME}
......
...@@ -20,9 +20,8 @@ from optparse import OptionParser ...@@ -20,9 +20,8 @@ from optparse import OptionParser
import paddle.utils.image_util as image_util import paddle.utils.image_util as image_util
from py_paddle import swig_paddle, util from py_paddle import swig_paddle, DataProviderConverter
from py_paddle import DataProviderWrapperConverter from paddle.trainer.PyDataProvider2 import dense_vector
from paddle.trainer.PyDataProviderWrapper import DenseSlot
from paddle.trainer.config_parser import parse_config from paddle.trainer.config_parser import parse_config
logging.basicConfig(format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s') logging.basicConfig(format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
...@@ -75,8 +74,8 @@ class ImageClassifier(): ...@@ -75,8 +74,8 @@ class ImageClassifier():
self.network.loadParameters(self.model_dir) self.network.loadParameters(self.model_dir)
data_size = 3 * self.crop_dims[0] * self.crop_dims[1] data_size = 3 * self.crop_dims[0] * self.crop_dims[1]
slots = [DenseSlot(data_size)] slots = [dense_vector(data_size)]
self.converter = util.DataProviderWrapperConverter(False, slots) self.converter = DataProviderConverter(slots)
def get_data(self, img_path): def get_data(self, img_path):
""" """
......
...@@ -14,8 +14,6 @@ ...@@ -14,8 +14,6 @@
# limitations under the License. # limitations under the License.
set -e set -e
export PYTHONPATH=$PYTHONPATH:../../
data_dir=./data/cifar-out data_dir=./data/cifar-out
python preprocess.py -i $data_dir -s 32 -c 1 python preprocess.py -i $data_dir -s 32 -c 1
...@@ -22,9 +22,8 @@ from optparse import OptionParser ...@@ -22,9 +22,8 @@ from optparse import OptionParser
import paddle.utils.image_util as image_util import paddle.utils.image_util as image_util
from py_paddle import swig_paddle, util from py_paddle import swig_paddle, DataProviderConverter
from py_paddle import DataProviderWrapperConverter from paddle.trainer.PyDataProvider2 import dense_vector
from paddle.trainer.PyDataProviderWrapper import DenseSlot
from paddle.trainer.config_parser import parse_config from paddle.trainer.config_parser import parse_config
logging.basicConfig(format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s') logging.basicConfig(format='[%(levelname)s %(asctime)s %(filename)s:%(lineno)s] %(message)s')
...@@ -85,9 +84,8 @@ class ImageClassifier(): ...@@ -85,9 +84,8 @@ class ImageClassifier():
self.network.loadParameters(self.model_dir) self.network.loadParameters(self.model_dir)
data_size = 3 * self.crop_dims[0] * self.crop_dims[1] data_size = 3 * self.crop_dims[0] * self.crop_dims[1]
slots = [DenseSlot(data_size)] slots = [dense_vector(data_size)]
self.converter = DataProviderConverter(slots)
def get_data(self, img_path): def get_data(self, img_path):
""" """
......
#!/bin/sh #!/bin/bash
# Copyright (c) 2016 Baidu, Inc. All Rights Reserved # Copyright (c) 2016 Baidu, Inc. All Rights Reserved
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
...@@ -33,7 +33,7 @@ test_num=$((min_len/10)) ...@@ -33,7 +33,7 @@ test_num=$((min_len/10))
if [ $test_num -gt 12500 ];then if [ $test_num -gt 12500 ];then
test_num=12500 test_num=12500
fi fi
train_num=((min_len-test_num)) train_num=$((min_len-test_num))
head -n$train_num pos.shuffed >train.pos head -n$train_num pos.shuffed >train.pos
head -n$train_num neg.shuffed >train.neg head -n$train_num neg.shuffed >train.neg
......
...@@ -12,15 +12,9 @@ ...@@ -12,15 +12,9 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
try:
import cPickle as pickle
except ImportError:
import pickle
from paddle.trainer.PyDataProvider2 import * from paddle.trainer.PyDataProvider2 import *
import common_utils # parse import common_utils # parse
def hook(settings, meta, **kwargs): def hook(settings, meta, **kwargs):
""" """
Init hook is invoked before process data. It will set obj.slots and store Init hook is invoked before process data. It will set obj.slots and store
...@@ -47,7 +41,6 @@ def hook(settings, meta, **kwargs): ...@@ -47,7 +41,6 @@ def hook(settings, meta, **kwargs):
settings.input_types = headers settings.input_types = headers
settings.meta = meta settings.meta = meta
@provider(init_hook=hook, cache=CacheType.CACHE_PASS_IN_MEM) @provider(init_hook=hook, cache=CacheType.CACHE_PASS_IN_MEM)
def process(settings, filename): def process(settings, filename):
with open(filename, 'r') as f: with open(filename, 'r') as f:
......
...@@ -15,12 +15,12 @@ ...@@ -15,12 +15,12 @@
import os import os
import numpy as np import numpy as np
from optparse import OptionParser from optparse import OptionParser
from py_paddle import swig_paddle, util, DataProviderWrapperConverter from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProviderWrapper import IndexSlot from paddle.trainer.PyDataProvider2 import integer_value_sequence
from paddle.trainer.config_parser import parse_config from paddle.trainer.config_parser import parse_config
""" """
Usage: run following command to show help message. Usage: run following command to show help message.
python predict.py -h python predict.py -h
""" """
UNK_IDX = 0 UNK_IDX = 0
...@@ -43,16 +43,22 @@ class Prediction(): ...@@ -43,16 +43,22 @@ class Prediction():
conf = parse_config( conf = parse_config(
train_conf, train_conf,
'dict_len=' + str(len_dict) + 'dict_len=' + str(len_dict) +
',label_len=' + str(len_label) + ',label_len=' + str(len_label) +
',is_predict=True') ',is_predict=True')
self.network = swig_paddle.GradientMachine.createFromConfigProto( self.network = swig_paddle.GradientMachine.createFromConfigProto(
conf.model_config) conf.model_config)
self.network.loadParameters(model_dir) self.network.loadParameters(model_dir)
slots = [
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(len_dict),
integer_value_sequence(2)
]
self.converter = DataProviderConverter(slots)
def load_dict_label(self, dict_file, label_file): def load_dict_label(self, dict_file, label_file):
""" """
...@@ -109,7 +115,7 @@ class Prediction(): ...@@ -109,7 +115,7 @@ class Prediction():
def option_parser(): def option_parser():
usage = ("python predict.py -c config -w model_dir " usage = ("python predict.py -c config -w model_dir "
"-d word dictionary -l label_file -i input_file") "-d word dictionary -l label_file -i input_file")
parser = OptionParser(usage="usage: %s [options]" % usage) parser = OptionParser(usage="usage: %s [options]" % usage)
parser.add_option( parser.add_option(
......
...@@ -15,13 +15,13 @@ ...@@ -15,13 +15,13 @@
import os import os
import numpy as np import numpy as np
from optparse import OptionParser from optparse import OptionParser
from py_paddle import swig_paddle, util, DataProviderWrapperConverter from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProviderWrapper import IndexSlot from paddle.trainer.PyDataProvider2 import integer_value_sequence
from paddle.trainer.config_parser import parse_config from paddle.trainer.config_parser import parse_config
""" """
Usage: run following command to show help message. Usage: run following command to show help message.
python predict.py -h python predict.py -h
""" """
class SentimentPrediction(): class SentimentPrediction():
...@@ -46,8 +46,8 @@ class SentimentPrediction(): ...@@ -46,8 +46,8 @@ class SentimentPrediction():
conf = parse_config(train_conf, "is_predict=1") conf = parse_config(train_conf, "is_predict=1")
self.network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config) self.network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config)
self.network.loadParameters(self.model_dir) self.network.loadParameters(self.model_dir)
slots = [IndexSlot(self.dict_dim)] slots = [integer_value_sequence(self.dict_dim)]
self.converter = util.DataProviderWrapperConverter(True, slots) self.converter = DataProviderConverter(slots)
def load_dict(self): def load_dict(self):
""" """
......
...@@ -65,7 +65,7 @@ def bidirectional_lstm_net(input_dim, ...@@ -65,7 +65,7 @@ def bidirectional_lstm_net(input_dim,
bi_lstm = bidirectional_lstm(input=emb, size=lstm_dim) bi_lstm = bidirectional_lstm(input=emb, size=lstm_dim)
dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5) dropout = dropout_layer(input=bi_lstm, dropout_rate=0.5)
output = fc_layer(input=dropout, size=class_dim, output = fc_layer(input=dropout, size=class_dim,
act_type=SoftmaxActivation()) act=SoftmaxActivation())
if not is_predict: if not is_predict:
lbl = data_layer("label", 1) lbl = data_layer("label", 1)
......
...@@ -128,12 +128,16 @@ def gru_encoder_decoder(data_conf, ...@@ -128,12 +128,16 @@ def gru_encoder_decoder(data_conf,
return out return out
decoder_group_name = "decoder_group" decoder_group_name = "decoder_group"
group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
StaticInput(input=encoded_proj,is_seq=True)]
if not is_generating: if not is_generating:
trg_embedding = embedding_layer( trg_embedding = embedding_layer(
input=data_layer(name='target_language_word', input=data_layer(name='target_language_word',
size=target_dict_dim), size=target_dict_dim),
size=word_vector_dim, size=word_vector_dim,
param_attr=ParamAttr(name='_target_language_embedding')) param_attr=ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
# For decoder equipped with attention mechanism, in training, # For decoder equipped with attention mechanism, in training,
# target embeding (the groudtruth) is the data input, # target embeding (the groudtruth) is the data input,
...@@ -142,22 +146,13 @@ def gru_encoder_decoder(data_conf, ...@@ -142,22 +146,13 @@ def gru_encoder_decoder(data_conf,
# for the recurrent_group. # for the recurrent_group.
decoder = recurrent_group(name=decoder_group_name, decoder = recurrent_group(name=decoder_group_name,
step=gru_decoder_with_attention, step=gru_decoder_with_attention,
input=group_inputs)
lbl = data_layer(name='target_language_next_word', lbl = data_layer(name='target_language_next_word',
size=target_dict_dim) size=target_dict_dim)
cost = classification_cost(input=decoder, label=lbl, ) cost = classification_cost(input=decoder, label=lbl)
outputs(cost) outputs(cost)
else: else:
gen_inputs = [StaticInput(input=encoded_vector,
is_seq=True),
StaticInput(input=encoded_proj,
is_seq=True), ]
# In generation, the decoder predicts a next target word based on # In generation, the decoder predicts a next target word based on
# the encoded source sequence and the last generated target word. # the encoded source sequence and the last generated target word.
...@@ -171,16 +166,18 @@ def gru_encoder_decoder(data_conf, ...@@ -171,16 +166,18 @@ def gru_encoder_decoder(data_conf,
size=target_dict_dim, size=target_dict_dim,
embedding_name='_target_language_embedding', embedding_name='_target_language_embedding',
embedding_size=word_vector_dim) embedding_size=word_vector_dim)
gen_inputs.append(trg_embedding) group_inputs.append(trg_embedding)
beam_gen = beam_search(name=decoder_group_name, beam_gen = beam_search(name=decoder_group_name,
step=gru_decoder_with_attention, step=gru_decoder_with_attention,
input=gen_inputs, input=group_inputs,
id_input=data_layer(name="sent_id",
size=1),
dict_file=trg_dict_path,
bos_id=0, bos_id=0,
eos_id=1, eos_id=1,
beam_size=beam_size, beam_size=beam_size,
max_length=max_length, max_length=max_length)
result_file=gen_trans_file)
seqtext_printer_evaluator(input=beam_gen,
id_input=data_layer(name="sent_id", size=1),
dict_file=trg_dict_path,
result_file=gen_trans_file)
outputs(beam_gen) outputs(beam_gen)
if(NOT DEFINED SPHINX_THEME) if(NOT DEFINED SPHINX_THEME)
set(SPHINX_THEME default) set(SPHINX_THEME default)
endif() endif()
...@@ -46,4 +43,4 @@ sphinx_add_target(paddle_docs ...@@ -46,4 +43,4 @@ sphinx_add_target(paddle_docs
add_dependencies(paddle_docs add_dependencies(paddle_docs
gen_proto_py gen_proto_py
paddle_doxygen_docs) paddle_doxygen_docs)
\ No newline at end of file
...@@ -30,7 +30,7 @@ Then at the :code:`process` function, each :code:`yield` function will return th ...@@ -30,7 +30,7 @@ Then at the :code:`process` function, each :code:`yield` function will return th
yield src_ids, trg_ids, trg_ids_next yield src_ids, trg_ids, trg_ids_next
For more details description of how to write a data provider, please refer to :doc:`Python Data Provider <../py_data_provider_wrapper>`. The full data provider file is located at :code:`demo/seqToseq/dataprovider.py`. For more details description of how to write a data provider, please refer to `PyDataProvider2 <../../ui/data_provider/index.html>`_. The full data provider file is located at :code:`demo/seqToseq/dataprovider.py`.
=============================================== ===============================================
Configure Recurrent Neural Network Architecture Configure Recurrent Neural Network Architecture
...@@ -106,7 +106,7 @@ We will use the sequence to sequence model with attention as an example to demon ...@@ -106,7 +106,7 @@ We will use the sequence to sequence model with attention as an example to demon
In this model, the source sequence :math:`S = \{s_1, \dots, s_T\}` is encoded with a bidirectional gated recurrent neural networks. The hidden states of the bidirectional gated recurrent neural network :math:`H_S = \{H_1, \dots, H_T\}` is called *encoder vector* The decoder is a gated recurrent neural network. When decoding each token :math:`y_t`, the gated recurrent neural network generates a set of weights :math:`W_S^t = \{W_1^t, \dots, W_T^t\}`, which are used to compute a weighted sum of the encoder vector. The weighted sum of the encoder vector is utilized to condition the generation of the token :math:`y_t`. In this model, the source sequence :math:`S = \{s_1, \dots, s_T\}` is encoded with a bidirectional gated recurrent neural networks. The hidden states of the bidirectional gated recurrent neural network :math:`H_S = \{H_1, \dots, H_T\}` is called *encoder vector* The decoder is a gated recurrent neural network. When decoding each token :math:`y_t`, the gated recurrent neural network generates a set of weights :math:`W_S^t = \{W_1^t, \dots, W_T^t\}`, which are used to compute a weighted sum of the encoder vector. The weighted sum of the encoder vector is utilized to condition the generation of the token :math:`y_t`.
The encoder part of the model is listed below. It calls :code:`grumemory` to represent gated recurrent neural network. It is the recommended way of using recurrent neural network if the network architecture is simple, because it is faster than :code:`recurrent_group`. We have implemented most of the commonly used recurrent neural network architectures, you can refer to :doc:`Layers <../trainer_config_helpers/layers>` for more details. The encoder part of the model is listed below. It calls :code:`grumemory` to represent gated recurrent neural network. It is the recommended way of using recurrent neural network if the network architecture is simple, because it is faster than :code:`recurrent_group`. We have implemented most of the commonly used recurrent neural network architectures, you can refer to `Layers <../../ui/api/trainer_config_helpers/layers_index.html>`_ for more details.
We also project the encoder vector to :code:`decoder_size` dimensional space, get the first instance of the backward recurrent network, and project it to :code:`decoder_size` dimensional space: We also project the encoder vector to :code:`decoder_size` dimensional space, get the first instance of the backward recurrent network, and project it to :code:`decoder_size` dimensional space:
...@@ -143,11 +143,15 @@ The decoder uses :code:`recurrent_group` to define the recurrent neural network. ...@@ -143,11 +143,15 @@ The decoder uses :code:`recurrent_group` to define the recurrent neural network.
.. code-block:: python .. code-block:: python
group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
StaticInput(input=encoded_proj,is_seq=True)]
trg_embedding = embedding_layer( trg_embedding = embedding_layer(
input=data_layer(name='target_language_word', input=data_layer(name='target_language_word',
size=target_dict_dim), size=target_dict_dim),
size=word_vector_dim, size=word_vector_dim,
param_attr=ParamAttr(name='_target_language_embedding')) param_attr=ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
# For decoder equipped with attention mechanism, in training, # For decoder equipped with attention mechanism, in training,
# target embedding (the groudtruth) is the data input, # target embedding (the groudtruth) is the data input,
# while encoded source sequence is accessed to as an unbounded memory. # while encoded source sequence is accessed to as an unbounded memory.
...@@ -156,13 +160,7 @@ The decoder uses :code:`recurrent_group` to define the recurrent neural network. ...@@ -156,13 +160,7 @@ The decoder uses :code:`recurrent_group` to define the recurrent neural network.
# All sequence inputs should have the same length. # All sequence inputs should have the same length.
decoder = recurrent_group(name=decoder_group_name, decoder = recurrent_group(name=decoder_group_name,
step=gru_decoder_with_attention, step=gru_decoder_with_attention,
input=group_inputs)
The implementation of the step function is listed as below. First, it defines the **memory** of the decoder network. Then it defines attention, gated recurrent unit step function, and the output function: The implementation of the step function is listed as below. First, it defines the **memory** of the decoder network. Then it defines attention, gated recurrent unit step function, and the output function:
...@@ -205,22 +203,23 @@ After training the model, we can use it to generate sequences. A common practice ...@@ -205,22 +203,23 @@ After training the model, we can use it to generate sequences. A common practice
* use :code:`GeneratedInput` for trg_embedding. :code:`GeneratedInput` computes the embedding of the generated token at the last time step for the input at the current time step. * use :code:`GeneratedInput` for trg_embedding. :code:`GeneratedInput` computes the embedding of the generated token at the last time step for the input at the current time step.
* use :code:`beam_search` function. This function needs to set: * use :code:`beam_search` function. This function needs to set:
- :code:`id_input`: the integer ID of the data, used to identify the corresponding output in the generated files.
- :code:`dict_file`: the dictionary file for converting word id to word.
- :code:`bos_id`: the start token. Every sentence starts with the start token. - :code:`bos_id`: the start token. Every sentence starts with the start token.
- :code:`eos_id`: the end token. Every sentence ends with the end token. - :code:`eos_id`: the end token. Every sentence ends with the end token.
- :code:`beam_size`: the beam size used in beam search. - :code:`beam_size`: the beam size used in beam search.
- :code:`max_length`: the maximum length of the generated sentences. - :code:`max_length`: the maximum length of the generated sentences.
- :code:`result_file`: the path of the generation result file.
* use :code:`seqtext_printer_evaluator` to print text according to index matrix and dictionary. This function needs to set:
- :code:`id_input`: the integer ID of the data, used to identify the corresponding output in the generated files.
- :code:`dict_file`: the dictionary file for converting word id to word.
- :code:`result_file`: the path of the generation result file.
The code is listed below: The code is listed below:
.. code-block:: python .. code-block:: python
group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
StaticInput(input=encoded_proj,is_seq=True)]
# In generation, decoder predicts a next target word based on # In generation, decoder predicts a next target word based on
# the encoded source sequence and the last generated target word. # the encoded source sequence and the last generated target word.
# The encoded source sequence (encoder's output) must be specified by # The encoded source sequence (encoder's output) must be specified by
...@@ -231,21 +230,22 @@ The code is listed below: ...@@ -231,21 +230,22 @@ The code is listed below:
size=target_dict_dim, size=target_dict_dim,
embedding_name='_target_language_embedding', embedding_name='_target_language_embedding',
embedding_size=word_vector_dim) embedding_size=word_vector_dim)
gen_inputs.append(trg_embedding) group_inputs.append(trg_embedding)
beam_gen = beam_search(name=decoder_group_name, beam_gen = beam_search(name=decoder_group_name,
step=gru_decoder_with_attention, step=gru_decoder_with_attention,
input=gen_inputs, input=group_inputs,
id_input=data_layer(name="sent_id",
size=1),
dict_file=trg_dict_path,
bos_id=0, # Beginnning token. bos_id=0, # Beginnning token.
eos_id=1, # End of sentence token. eos_id=1, # End of sentence token.
beam_size=beam_size, beam_size=beam_size,
max_length=max_length, max_length=max_length)
result_file=gen_trans_file)
seqtext_printer_evaluator(input=beam_gen,
id_input=data_layer(name="sent_id", size=1),
dict_file=trg_dict_path,
result_file=gen_trans_file)
outputs(beam_gen) outputs(beam_gen)
Notice that this generation technique is only useful for decoder like generation process. If you are working on sequence tagging tasks, please refer to :doc:`Semantic Role Labeling Demo <../../../demo/semantic_role_labeling>` for more details. Notice that this generation technique is only useful for decoder like generation process. If you are working on sequence tagging tasks, please refer to `Semantic Role Labeling Demo <../../demo/semantic_role_labeling/index.html>`_ for more details.
The full configuration file is located at :code:`demo/seqToseq/seqToseq_net.py`. The full configuration file is located at :code:`demo/seqToseq/seqToseq_net.py`.
Installing from Sources
=======================

* [1. Download and Setup](#download)
* [2. Requirements](#requirements)
* [3. Build on Ubuntu](#ubuntu)
* [4. Build on Mac OS X](#mac)

## <span id="download">Download and Setup</span>

You can download PaddlePaddle from the [github source](https://github.com/gangliao/Paddle).

```bash
git clone https://github.com/baidu/Paddle paddle
cd paddle
```

## <span id="requirements">Requirements</span>

To compile the source code, your computer must be equipped with GCC >= 4.6 or the Clang compiler.

### Dependencies

- **CMake**: version >= 2.8
- **BLAS**: MKL, OpenBlas or ATLAS
- **protobuf**: version >= 2.4, **Note: 3.x is not supported**
- **python**: only python 2.7 is supported currently
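A quick way to check what your machine already provides (ordinary version queries, not part of the original guide):

```bash
gcc --version     # needs >= 4.6 (or use clang)
cmake --version   # needs >= 2.8
python --version  # needs 2.7.x
protoc --version  # needs >= 2.4, but not 3.x
```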
### Options
PaddlePaddle supports some build options. To enable it, first you need to install the related libraries.
<style type="text/css">
.tg {border-collapse:collapse;border-spacing:0;border-color:#ccc;}
.tg td{font-family:Arial, sans-serif;font-size:14px;padding:10px 5px;border-style:solid;border-width:0px;overflow:hidden;word-break:normal;border-color:#ccc;color:#333;background-color:#fff;border-top-width:1px;border-bottom-width:1px;}
.tg th{font-family:Arial, sans-serif;font-size:14px;font-weight:normal;padding:10px 5px;border-style:solid;border-width:0px;overflow:hidden;word-break:normal;border-color:#ccc;color:#333;background-color:#f0f0f0;border-top-width:1px;border-bottom-width:1px;}
.tg .tg-yw4l{vertical-align:top}
.tg .tg-9hbo{font-weight:bold;vertical-align:top}
</style>
<table class="tg">
<tr>
<th class="tg-yw4l">Optional</th>
<th class="tg-yw4l">Description</th>
</tr>
<tr>
<td class="tg-9hbo">WITH_GPU</td>
<td class="tg-yw4l">Compile with GPU mode.</td>
</tr>
<tr>
<td class="tg-9hbo">WITH_DOUBLE</td>
<td class="tg-yw4l">Compile with double precision floating-point, default: single precision.</td>
</tr>
<tr>
<td class="tg-9hbo">WITH_GLOG</td>
<td class="tg-yw4l">Compile with glog. If not found, default: an internal log implementation.</td>
</tr>
<tr>
<td class="tg-9hbo">WITH_GFLAGS</td>
<td class="tg-yw4l">Compile with gflags. If not found, default: an internal flag implementation.</td>
</tr>
<tr>
<td class="tg-9hbo">WITH_TESTING</td>
<td class="tg-yw4l">Compile with gtest for PaddlePaddle's unit testing.</td>
</tr>
<tr>
<td class="tg-9hbo">WITH_DOC</td>
<td class="tg-yw4l">Compile to generate PaddlePaddle's docs, default: disabled (OFF)</td>
</tr>
<tr>
<td class="tg-9hbo">WITH_SWIG_PY</td>
<td class="tg-yw4l">Compile with python predict API, default: disabled (OFF).</td>
</tr>
<tr>
<td class="tg-9hbo">WITH_STYLE_CHECK</td>
<td class="tg-yw4l">Compile with code style check, default: enabled (ON).</td>
</tr>
</table>
**Note:**
- The GPU version works best with Cuda Toolkit 7.5 and cuDNN v5.
- Other versions like Cuda Toolkit 6.5, 7.0, 8.0 and cuDNN v2, v3, v4 are also supported.
- **To utilize cuDNN v5, Cuda Toolkit 7.5 is prerequisite and vice versa.**
As a simple example, consider the following:
1. **Python Dependencies(optional)**
To compile PaddlePaddle with the python predict API, make sure swig is installed and set `-DWITH_SWIG_PY=ON` as follows:
```bash
# install swig on ubuntu
sudo apt-get install swig
# install swig on Mac OS X
brew install swig
# activate swig in cmake
cmake .. -DWITH_SWIG_PY=ON
```

2. **Doc Dependencies(optional)**

To generate PaddlePaddle's documentation, install dependencies and set `-DWITH_DOC=ON` as follows:

```bash
pip install 'sphinx>=1.4.0'
pip install sphinx_rtd_theme breathe recommonmark

# install doxygen on Ubuntu
sudo apt-get install doxygen
# install doxygen on Mac OS X
brew install doxygen

# activate docs in cmake
cmake .. -DWITH_DOC=ON
```
## <span id="ubuntu">Build on Ubuntu 14.04</span>
### Install Dependencies

- **CPU Dependencies**

```bash
# necessary
sudo apt-get update
sudo apt-get install -y g++ make cmake build-essential libatlas-base-dev python python-pip libpython-dev m4 libprotobuf-dev protobuf-compiler python-protobuf python-numpy git
# optional
sudo apt-get install libgoogle-glog-dev
sudo apt-get install libgflags-dev
sudo apt-get install libgtest-dev
sudo pip install wheel
pushd /usr/src/gtest
cmake .
make
sudo cp *.a /usr/lib
popd
```

- **GPU Dependencies (optional)**

To build the GPU version, you will need the following installed:

1. a CUDA-capable GPU
2. A supported version of Linux with a gcc compiler and toolchain
3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)

The CUDA development environment relies on tight integration with the host development environment,
including the host compiler and C runtime libraries, and is therefore only supported on
distribution versions that have been qualified for this CUDA Toolkit release.

After downloading the cuDNN library, issue the following commands:

```bash
sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local
sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
```

Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc.

```bash
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
export PATH=/usr/local/cuda/bin:$PATH
```
### Build and Install
As usual, the best option is to create a build folder under the paddle project directory.
```bash
mkdir build && cd build
cmake ..
```
CMake first checks PaddlePaddle's dependencies in the system default path. After installing some optional
libraries, the corresponding build option will be set automatically (for instance, glog, gtest and gflags).
If a dependency is still not found, you can manually set it based on the CMake error information from your screen.
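For example, the cuDNN and OpenBLAS find modules shown earlier read `CUDNN_ROOT` and `OPENBLAS_ROOT` from the environment, so a non-standard install can be pointed at like this (paths are placeholders):

```bash
CUDNN_ROOT=/opt/cudnn OPENBLAS_ROOT=/opt/openblas cmake ..
```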
As a simple example, consider the following:
- **Only CPU**
```bash
cmake .. -DWITH_GPU=OFF -DWITH_DOC=OFF
```
- **GPU**
```bash
cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF
```
- **GPU with doc and swig**
```bash
cmake .. -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON
```
Finally, you can build PaddlePaddle:
```bash
# you can add build options here, such as:
cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF -DCMAKE_INSTALL_PREFIX=<path to install>
# please use sudo make install, if you want to install PaddlePaddle into the system
make -j `nproc` && make install
# set PaddlePaddle installation path in ~/.bashrc
export PATH=<path to install>/bin:$PATH
```

**Note:**

If you set `WITH_SWIG_PY=ON`, the related python dependencies also need to be installed.
Otherwise, PaddlePaddle will automatically install the python dependencies
the first time you run a paddle command, such as `paddle version` or `paddle train`.
This may require sudo privileges:

```bash
# you can run
sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
# or just run
sudo paddle version
```
## <span id="mac">Building on Mac OS X</span>

### Prerequisites

This guide is based on Mac OS X 10.11 (El Capitan). Note that if you are running an up to date version of OS X,
you will already have Python 2.7.10 and Numpy 1.8 installed.

The best option is to use the package manager homebrew to handle installations and upgrades for you.
To install [homebrew](http://brew.sh/), first open a terminal window (you can find Terminal in the Utilities folder in Applications), and issue the command:

```bash
# install brew
/usr/bin/ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
# install pip
easy_install pip
```
### Install Dependencies

- **CPU Dependencies**

```bash
# Install fundamental dependencies
brew install glog gflags cmake protobuf openblas

# Install google test on Mac OS X
# Download gtest 1.7.0
wget https://github.com/google/googletest/archive/release-1.7.0.tar.gz
tar -xzf release-1.7.0.tar.gz && cd googletest-release-1.7.0
# Build gtest
mkdir build && cd build && cmake ..
make
# Install gtest library
sudo cp -r ../include/gtest /usr/local/include/
sudo cp lib*.a /usr/local/lib
```
- **GPU Dependencies(optional)**

To build the GPU version, you will need the following installed:

1. a CUDA-capable GPU
2. Mac OS X 10.11 or later
3. the Clang compiler and toolchain installed using Xcode
4. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
5. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)

The CUDA development environment relies on tight integration with the host development environment,
including the host compiler and C runtime libraries, and is therefore only supported on
distribution versions that have been qualified for this CUDA Toolkit release.

1. After downloading the cuDNN library, issue the following commands:

```bash
sudo tar -xzf cudnn-7.5-osx-x64-v5.0-ga.tgz -C /usr/local
sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
```

2. Then you need to set DYLD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc.
```bash
export DYLD_LIBRARY_PATH=/usr/local/cuda/lib:$DYLD_LIBRARY_PATH
export PATH=/usr/local/cuda/bin:$PATH
```

### Build and Install

As usual, the best option is to create a build folder under the paddle project directory.

```bash
mkdir build && cd build
cmake ..
```
CMake first checks PaddlePaddle's dependencies in the system default path. After installing some optional
libraries, the corresponding build option will be set automatically (for instance, glog, gtest and gflags).
If a dependency is still not found, you can manually set it based on the CMake error information from your screen.

As a simple example, consider the following:

- **Only CPU**

```bash
cmake .. -DWITH_GPU=OFF -DWITH_DOC=OFF
```

- **GPU**

```bash
cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF
```

- **GPU with doc and swig**

```bash
cmake .. -DWITH_GPU=ON -DWITH_DOC=ON -DWITH_SWIG_PY=ON
```

Finally, you can build PaddlePaddle:

```bash
# you can add build options here, such as:
cmake .. -DWITH_GPU=ON -DWITH_DOC=OFF -DCMAKE_INSTALL_PREFIX=<installation path>
# please use sudo make install, if you want to install PaddlePaddle into the system
make -j `nproc` && make install
# set PaddlePaddle installation path in ~/.bashrc
export PATH=<installation path>/bin:$PATH
```

**Note:**

If you set `WITH_SWIG_PY=ON`, the related python dependencies also need to be installed.
Otherwise, PaddlePaddle will automatically install the python dependencies
the first time you run a paddle command, such as `paddle version` or `paddle train`.
This may require sudo privileges:

```bash
# you can run
sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
# or just run
sudo paddle version
```
...@@ -25,9 +25,12 @@ repo or just head straight to the command line: ...@@ -25,9 +25,12 @@ repo or just head straight to the command line:
```shell ```shell
# Clone your fork to your local machine # Clone your fork to your local machine
git clone git@github.com:USERNAME/paddle.git git clone https://github.com/USERNAME/Paddle.git
```
Then you can start to develop by making a local developement branch
```shell
git checkout -b MY_COOL_STUFF_BRANCH origin/master
``` ```
Then you can start to develop.
## Commit ## Commit
...@@ -45,14 +48,14 @@ are the details if any. ...@@ -45,14 +48,14 @@ are the details if any.
## Keeping Fork Up to Date ## Keeping Fork Up to Date
Before pull your request, you shold sync you code from the latest PaddlePaddle. Before pull your request, you should sync your code from the latest PaddlePaddle.
To do this, you'll need to add a remote at first: To do this, you'll need to add a remote at first:
```shell ```shell
# see the current configured remote repository # see the current configured remote repository
git remote -v git remote -v
# add upstream repository # add upstream repository
git remote add upstream https://github.com/paddle/paddle.git git remote add upstream https://github.com/baidu/Paddle.git
# verify the new upstream # verify the new upstream
git remote -v git remote -v
``` ```
...@@ -60,8 +63,7 @@ git remote -v ...@@ -60,8 +63,7 @@ git remote -v
Update your fork with the latest upstream changes: Update your fork with the latest upstream changes:
```shell ```shell
git fetch upstream git pull --rebase upstream HEAD
git pull upstream master
``` ```
If there are no unique commits locally, git will simply perform a fast-forward. If there are no unique commits locally, git will simply perform a fast-forward.
...@@ -74,10 +76,26 @@ Now, your local master branch is up-to-date with everything modified upstream. ...@@ -74,10 +76,26 @@ Now, your local master branch is up-to-date with everything modified upstream.
```shell ```shell
# push to your repository in Github # push to your repository in Github
git push origin master git push origin HEAD
``` ```
## Pull Request ## Pull Request
Go to the page for your fork on GitHub, select your development branch, Go to the page for your fork on GitHub, select your development branch,
and click the **pull request button**. and click the **pull request button**.
## Update your pull request with the latest version

During the code review, your pull request may become stale because of new commits in
baidu/Paddle. GitHub allows automatic updates if there is no conflict. You can do this
by clicking the "Update Branch" button on your pull request page. However, in the case
of a conflict, you need to do the update manually. You need to do the following on
your local repository:
```shell
git checkout MY_COOL_STUFF_BRANCH
git pull --rebase upstream HEAD
# You may need to resolve the conflict according to the git prompt.
# Make and test your code.
git push -f origin HEAD
```
Now your Pull Request is updated with the latest version.
Docker installation guide
=========================

PaddlePaddle provides some pre-compiled binaries, including Docker images and ubuntu deb packages. Contributions of installation packages for more linux distributions (such as ubuntu, centos, debian, gentoo and so on) are welcome. We recommend using Docker images to deploy PaddlePaddle.
## Docker installation
Docker is a tool designed to make it easier to create, deploy, and run applications by using containers.
### PaddlePaddle Docker images
There are six Docker images:
- paddledev/paddle:cpu-latest: PaddlePaddle CPU binary image.
- paddledev/paddle:gpu-latest: PaddlePaddle GPU binary image.
- paddledev/paddle:cpu-devel-latest: PaddlePaddle CPU binary image plus source code.
- paddledev/paddle:gpu-devel-latest: PaddlePaddle GPU binary image plus source code.
- paddledev/paddle:cpu-demo-latest: PaddlePaddle CPU binary image plus source code and demo
- paddledev/paddle:gpu-demo-latest: PaddlePaddle GPU binary image plus source code and demo
Tags with latest will be replaced by a released version.
### Download and Run Docker images
You have to install Docker on a machine with linux kernel version 3.10+ first. You can refer to the official guide https://docs.docker.com/engine/installation/ for further information.

You can use ```docker pull``` to download images first, or just launch a container with ```docker run```:
```bash
docker run -it paddledev/paddle:cpu-latest
```
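If you prefer to fetch the image before launching it, the pull form looks like this (image name taken from the list above):

```bash
docker pull paddledev/paddle:cpu-latest
```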
If you want to launch a container with GPU support, you need to set some environment variables at the same time:
```bash
export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:gpu-latest
```
### Notice
#### Performance
Since Docker is based on lightweight virtual containers, CPU computing performance is well maintained. The GPU driver and devices are all mapped into the container, so GPU computing performance is not seriously affected either.
If you use a high-performance NIC, such as RDMA (RoCE 40GbE or IB 56GbE) or Ethernet (10GbE), it is recommended to use the `--net=host` option, as shown below.
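For example, using the CPU image named above:

```bash
# share the host's network stack with the container for maximum throughput
docker run --net=host -it paddledev/paddle:cpu-latest
```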
#### Remote access
If you want to enable SSH access in the background, you need to build an image by yourself. Please refer to the official guide https://docs.docker.com/engine/reference/builder/ for further information.
The following is a simple Dockerfile with SSH:
```dockerfile
FROM paddledev/paddle
MAINTAINER PaddlePaddle dev team <paddle-dev@baidu.com>
RUN apt-get update
RUN apt-get install -y openssh-server
RUN mkdir /var/run/sshd
RUN echo 'root:root' | chpasswd
RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
EXPOSE 22
CMD ["/usr/sbin/sshd", "-D"]
```
Then you can build an image from the Dockerfile and launch a container:
```bash
# cd into Dockerfile directory
docker build . -t paddle_ssh
# run container, and map host machine port 8022 to container port 22
docker run -d -p 8022:22 --name paddle_ssh_machine paddle_ssh
```
Now you can SSH into the container on port 8022; the username is root, and the password is also root:
```bash
ssh -p 8022 root@YOUR_HOST_MACHINE
```
You can stop and delete the container as follows:
```bash
# stop
docker stop paddle_ssh_machine
# delete
docker rm paddle_ssh_machine
```
...@@ -5,9 +5,11 @@ Install PaddlePaddle
----------------------

.. toctree::
   :maxdepth: 1
   :glob:

   install_*
   internal/install_from_jumbo.md

Build from Source
-----------------
...@@ -15,20 +17,24 @@ Build from Source
If you want to hack and contribute to the PaddlePaddle source code, the following guides can help you\:

.. toctree::
   :maxdepth: 1
   :glob:

   build_from_source.md
   contribute_to_paddle.md

Docker and Debian Package installation
--------------------------------------

Note: The installation packages are still in pre-release
state and your experience of installation may not be smooth.

If you want to pack a docker image, the following guides can help you\:

.. toctree::
   :maxdepth: 1
   :glob:

   docker_install.md
   ubuntu_install.md
Debian Package installation guide
=================================
## Debian Package installation
Currently, PaddlePaddle only provides Ubuntu 14.04 deb packages.
There are two package versions, CPU and GPU. The download address is:
https://github.com/baidu/Paddle/releases/tag/V0.8.0b0
After downloading PaddlePaddle deb packages, you can run:
```bash
dpkg -i paddle-0.8.0b-cpu.deb
apt-get install -f
```
If you use the GPU version of the deb package, you need to install the CUDA toolkit and cuDNN, and set the related environment variables (such as LD_LIBRARY_PATH) first. It is normal for `dpkg -i` to report missing-dependency errors; `apt-get install -f` will install the dependencies and continue installing paddle.
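For example, assuming the CUDA toolkit and cuDNN are installed under `/usr/local/cuda` (adjust the paths to your setup):

```bash
# make the CUDA toolkit and cuDNN libraries visible to paddle at runtime
export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
```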
**Note**
The PaddlePaddle packages only support x86 CPUs with AVX instructions. If your CPU does not support AVX, you have to download the source code and build from it.
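For example, on Linux you can check for AVX support like this:

```bash
# prints a count greater than 0 if the CPU advertises the avx flag
grep -c avx /proc/cpuinfo
```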
...@@ -5,3 +5,4 @@ Cluster Train
   :glob:

   opensource/cluster_train.md
   internal/index.md
...@@ -23,6 +23,8 @@ AutoStructify = transform.AutoStructify
# documentation root, use os.path.abspath to make it absolute, like shown here.
sys.path.insert(0, '@PROJ_ROOT@/python')
templates_path = ["@PROJ_ROOT@/doc/templates"]

# -- Doxygen Settings
breathe_projects = {
    'paddle': '@PADDLE_DOXYGEN_OUTPUT@/xml'
...@@ -66,8 +68,6 @@ extensions = [
autodoc_member_order = 'bysource'

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
......
...@@ -93,7 +93,7 @@ where `train.sh` is almost the same as `demo/seqToseq/translation/train.sh`, the
- `--init_model_path`: path of the initialization model, here is `data/paraphrase_model`
- `--load_missing_parameter_strategy`: what to do when a model file is missing; here a normal distribution is used to initialize the parameters other than the embedding layer

For users who want to understand the dataset format, model architecture and training procedure in detail, please refer to [Text generation Tutorial](../text_generation/text_generation.md).

## Optional Function ##
### Embedding Parameters Observation
......
Image Classification Tutorial
=============================
This tutorial will guide you through training a convolutional neural network to classify objects using the CIFAR-10 image classification dataset.
As shown in the following figure, the convolutional neural network can recognize the main object in images and output the classification result.
...@@ -172,7 +173,7 @@ python -m paddle.utils.plotcurve -i $log > plot.png
- The script `plotcurve.py` requires the python module of `matplotlib`, so if it fails, maybe you need to install `matplotlib`.

After training finishes, the training and testing error curves will be saved to `plot.png` using the `plotcurve.py` script. An example of the plot is shown below:

<center>![Training and testing curves.](./plot.png)</center>
......
# Model Zoo - ImageNet #
[ImageNet](http://www.image-net.org/) is a popular dataset for generic object classification. This tutorial provides convolutional neural network (CNN) models for ImageNet.

## ResNet Introduction
...@@ -48,11 +48,11 @@ We present three ResNet models, which are converted from the models provided by
## ResNet Model

See ```demo/model_zoo/resnet/resnet.py```. This config contains networks of 50, 101 and 152 layers. You can specify the layer number by adding an argument like ```--config_args=layer_num=50``` to the command line.

### Network Visualization

You can get a diagram of the ResNet network by running the following commands. The script generates a dot file and then converts it to a PNG file using the draw_dot tool installed on our server. If you cannot access the server, just install graphviz to convert the dot file.

```
cd demo/model_zoo/resnet
...@@ -165,7 +165,7 @@ We provide both C++ and Python interfaces to extract features. The following exa
### C++ Interface

First, specify the image data list in `define_py_data_sources2` in the config, see the example `demo/model_zoo/resnet/resnet.py`.

```
train_list = 'train.list' if not is_test else None
...@@ -190,8 +190,7 @@ Second, specify layers to extract features in `Outputs()` of `resnet.py`. For ex
Outputs("res5_3_branch2c_conv", "res5_3_branch2c_bn")
```

Third, specify the model path and output directory in `extract_fea_c++.sh`, and then run the following commands.

```
cd demo/model_zoo/resnet
...@@ -9,7 +9,7 @@ There are several examples and demos here.
* [Sentiment Analysis](sentiment_analysis/index.rst)
* [Text Generation](text_generation/index.rst)
* [Semantic Role Labeling](semantic_role_labeling/index.rst)

## Recommendation
...@@ -19,6 +19,3 @@ There are several examples and demos here.
## Model Zoo
* [ImageNet: ResNet](imagenet_model/resnet_model.md)
* [Embedding: Chinese Word](embedding_model/index.md)
...@@ -59,7 +59,7 @@ To build your text classification system, your code will need to perform five st
## Preprocess data into standardized format

In this example, you are going to use [Amazon electronic product review dataset](http://jmcauley.ucsd.edu/data/amazon/) to build a bunch of deep neural network models for text classification. Each text in this dataset is a product review. This dataset has two categories: “positive” and “negative”. Positive means the reviewer likes the product, while negative means the reviewer does not like the product.

`demo/quick_start` in the [source code](https://github.com/baidu/Paddle) provides scripts for downloading data and preprocessing data as shown below. The data process takes several minutes (about 3 minutes on our machine).

```bash
cd demo/quick_start
...@@ -157,9 +157,7 @@ define_py_data_sources2(train_list='data/train.list',
                        obj="process",
                        args={"dictionary": word_dict})
```
You can refer to the following link for more detailed examples and data formats: <a href = "../../ui/data_provider/pydataprovider2.html">PyDataProvider2</a>.
## Network Architecture
You will describe four kinds of network architectures in this section.
...@@ -425,7 +423,7 @@ paddle train \
mv rank-00000 result.txt
```
There are several differences between training and inference network configurations.
- You do not need labels during inference.
- Outputs need to be specified to the classification probability layer (the output of softmax layer), or the id of maximum probability (`max_id` layer). An example to output the id and probability is given in the code snippet.
- batch_size = 1.
......
...@@ -219,9 +219,9 @@ The network structure is shown below.
The demo's neural network config file "trainer_config.py" is shown below.

.. literalinclude:: ../../../demo/recommendation/trainer_config.py
   :language: python
   :lines: 15-

In this :code:`trainer_config.py`, we just map each feature type to
a feature vector. The following shows how to map each feature to a vector.
...@@ -257,15 +257,15 @@ In these networks, we use several APIs in `trainer_config_helpers
* Text Convolution Pooling Layer, `text_conv_pool
  <../../ui/api/trainer_config_helpers/networks.html
  #trainer_config_helpers.networks.text_conv_pool>`_
* Declare Python Data Sources, `define_py_data_sources2
  <../../ui/api/trainer_config_helpers/data_sources.html>`_

Data Provider
'''''''''''''

.. literalinclude:: ../../../demo/recommendation/dataprovider.py
   :language: python
   :lines: 15-

The data provider just reads meta.bin and the rating file, yielding each sample for training.
In this :code:`dataprovider.py`, we should set\:
...@@ -274,7 +274,7 @@ In this :code:`dataprovider.py`, we should set\:
* use_seq\: Whether this :code:`dataprovider.py` is in sequence mode or not.
* process\: Return each sample of data to :code:`paddle`.

For details of the data provider, see the document `here <../../ui/data_provider/pydataprovider2.html>`_.

Train
`````
...@@ -283,15 +283,15 @@ After preparing data, configuring network, writing data provider, now we can run paddl
The run.sh is shown as follows:

.. literalinclude:: ../../../demo/recommendation/run.sh
   :language: bash
   :lines: 16-

It just starts a paddle training process, writes the log to `log.txt`,
then prints it on screen.

For each command line argument in :code:`run.sh`, please refer to the `command line
arguments <../../ui/index.html#command-line-argument>`_ page. A short description of these arguments follows.

* config\: Tell paddle which file is the neural network configuration.
* save_dir\: Tell paddle to save the model into './output'.
...@@ -303,8 +303,6 @@ arguments <TBD>`_ page. The short description of these arguments is shown as fol
* dot_period\: Print a :code:`.` after training :code:`dot_period` batches.
* num_passes\: Train at most :code:`num_passes` passes.

If the training process starts successfully, the output looks like the following:

.. code-block:: text
......
Semantic Role Labeling Tutorial
===============================

.. toctree::
   :maxdepth: 3

   semantic_role_labeling.md
# Semantic Role Labeling Tutorial #

Semantic role labeling (SRL) is a form of shallow semantic parsing whose goal is to discover the predicate-argument structure of each predicate in a given input sentence. SRL is useful as an intermediate step in a wide range of natural language processing tasks, such as information extraction, automatic document categorization and question answering. An instance is as follows [1]:

[ <sub>A0</sub> He ] [ <sub>AM-MOD</sub> would ][ <sub>AM-NEG</sub> n’t ] [ <sub>V</sub> accept] [ <sub>A1</sub> anything of value ] from [<sub>A2</sub> those he was writing about ].
...@@ -12,12 +12,10 @@ Semantic role labeling (SRL) is a form of shallow semantic parsing whose goal is
- AM-MOD: modal
- AM-NEG: negation

Given the verb "accept", the chunks in the sentence would play certain semantic roles. Here, the label scheme is from the Penn Proposition Bank.

To date, most of the successful SRL systems are built on top of some form of parsing results where pre-defined feature templates over the syntactic structure are used. This tutorial will present an end-to-end system using deep bidirectional long short-term memory (DB-LSTM)[2] for solving the SRL task, which largely outperforms the previous state-of-the-art systems. The system regards the SRL task as a sequence labeling problem.

## Data Description
The relevant paper[2] takes the data set in the CoNLL-2005&2012 Shared Task for training and testing. According to the data license, the demo adopts the test data set of CoNLL-2005, which can be obtained from the website.
...@@ -36,7 +34,6 @@ src.dict:the dictionary of words in sentences
tgt.dict:the labels dictionary
feature: the extracted features from data set
```
## Training
### DB-LSTM
...@@ -49,8 +46,6 @@ The following figure shows a temporal expanded 2-layer DB-LSTM network.
![pic](./network_arch.png)
</center>
### Features
Two input features play an essential role in this pipeline: predicate (pred) and argument (argu). Two other features, predicate context (ctx-p) and region mark (mr), are also adopted, because a single predicate word cannot exactly describe the predicate information, especially when the same word appears more than once in a sentence. With the predicate context, the ambiguity can be largely eliminated. Similarly, we use region mark m<sub>r</sub> = 1 to denote an argument position if it is located in the predicate context region, or m<sub>r</sub> = 0 if it is not. These four simple features are all we need for our SRL system. Features of one sample with context size set to 1 are shown as follows [2]:
<center>
...@@ -130,7 +125,6 @@ paddle train \
2>&1 | tee 'train.log'
```
- \--config=./db_lstm.py : network config file.
- \--save_dir=./output: output path to save models.
- \--trainer_count=4 : set thread number (or GPU count).
...@@ -183,12 +177,7 @@ python predict.py
After prediction, the result is saved in `predict.res`.

## Reference
[1] Martha Palmer, Dan Gildea, and Paul Kingsbury. The Proposition Bank: An Annotated Corpus of Semantic Roles, Computational Linguistics, 31(1), 2005.

[2] Zhou, Jie, and Wei Xu. "End-to-end learning of semantic role labeling using recurrent neural networks." Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015.
Utils
=======

Memory Handle
--------------
.. doxygenfile:: paddle/math/MemoryHandle.h
......
{# layout.html #}
{# Import the theme's layout. #}
{% extends "!layout.html" %}
{%- block extrahead %}
<script>
var _hmt = _hmt || [];
(function() {
var hm = document.createElement("script");
hm.src = "//hm.baidu.com/hm.js?b9a314ab40d04d805655aab1deee08ba";
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(hm, s);
})();
</script>
{% endblock %}
...@@ -12,6 +12,13 @@ AbsActivation
   :members: AbsActivation
   :noindex:

ExpActivation
=============

.. automodule:: paddle.trainer_config_helpers.activations
   :members: ExpActivation
   :noindex:

IdentityActivation
==================
......
...@@ -82,12 +82,6 @@ img_cmrnorm_layer
   :members: img_cmrnorm_layer
   :noindex:

batch_norm_layer
---------------------
.. automodule:: paddle.trainer_config_helpers.layers
...@@ -175,6 +169,12 @@ dotmul_projection
   :members: dotmul_projection
   :noindex:

dotmul_operator
---------------
.. automodule:: paddle.trainer_config_helpers.layers
   :members: dotmul_operator
   :noindex:

full_matrix_projection
----------------------
.. automodule:: paddle.trainer_config_helpers.layers
...@@ -251,10 +251,10 @@ addto_layer
   :members: addto_layer
   :noindex:

linear_comb_layer
-----------------
.. automodule:: paddle.trainer_config_helpers.layers
   :members: linear_comb_layer
   :noindex:

interpolation_layer
...@@ -286,7 +286,13 @@ tensor_layer
.. automodule:: paddle.trainer_config_helpers.layers
   :members: tensor_layer
   :noindex:

cos_sim
-------
.. automodule:: paddle.trainer_config_helpers.layers
   :members: cos_sim
   :noindex:

trans_layer
------------
.. automodule:: paddle.trainer_config_helpers.layers
...@@ -347,12 +353,6 @@ rank_cost
   :members: rank_cost
   :noindex:

crf_layer
-----------------
.. automodule:: paddle.trainer_config_helpers.layers
......
...@@ -4,6 +4,12 @@ BaseSGDOptimizer
   :members: BaseSGDOptimizer
   :noindex:

MomentumOptimizer
=================

.. automodule:: paddle.trainer_config_helpers.optimizers
   :members: MomentumOptimizer
   :noindex:

AdamOptimizer
=============
.. automodule:: paddle.trainer_config_helpers.optimizers
......
DataProvider Introduction
=========================
DataProvider is a module that loads training or testing data into CPU or GPU
memory for the following training or testing process.
...@@ -10,7 +10,7 @@ customized, sacrificing only a little efficiency. This is extremely
useful when you have to dynamically generate certain kinds of data according to,
for example, the training performance.

Besides, users can also customize a C++ :code:`DataProvider` for more
complex usage, or for higher efficiency.

The following parameters are required to be defined in the PaddlePaddle network
......
...@@ -17,24 +17,23 @@ how to write a simple PyDataProvider.
MNIST is a handwriting classification data set. It contains 70,000 digital
grayscale images. Labels of the training sample range from 0 to 9. All the
images have been size-normalized and centered into images with the same size
of 28 x 28 pixels.

A small part of the original data as an example is shown below:

.. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_train.txt

Each line of the data contains two parts, separated by :code:`;`. The first part is
the label of an image. The second part contains 28x28 pixel float values.

Just write the path of the above data into train.list. It looks like this:

.. literalinclude:: ../../../doc_cn/ui/data_provider/train.list

The corresponding dataprovider is shown below:

.. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_provider.py

The first line imports the PyDataProvider2 package.
The main function is the process function, which has two parameters.
...@@ -45,8 +44,8 @@ This parameter is passed to the process function by PaddlePaddle.
:code:`@provider` is a Python
`Decorator <http://www.learnpython.org/en/Decorators>`_ .
It sets some properties to DataProvider, and constructs a real PaddlePaddle
DataProvider from a very simple user implemented python function. It does not
matter if you are not familiar with `Decorator`_. You can keep it simple by
just taking :code:`@provider` as a fixed mark above the provider function you
implemented.
...@@ -59,9 +58,9 @@ document of `input_types`_ for more details.
The process method is the core part to construct a real DataProvider in
PaddlePaddle. It implements how to open the text file, how to read one sample
from the original text file, convert them into `input_types`_, and give them
back to PaddlePaddle process at line 23.

Note that data yielded by the process function must follow the same order that
`input_types`_ are defined.
...@@ -75,7 +74,20 @@ you can take this as an example.

.. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_config.py

Here we specify training data by :code:`train.list`, and no testing data is specified.
The method which actually provides data is :code:`process`.

Users can also use another style to provide data, which defines the
:code:`data_layer`'s name explicitly when yielding. For example,
the :code:`dataprovider` is shown below.

.. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_provider.dict.py
   :linenos:

If the user does not give the :code:`data_layer`'s name, PaddlePaddle will roughly use
the order of the :code:`data_layer` definitions to determine which feature goes to
which :code:`data_layer`. This order may not be correct, so defining the
:code:`data_layer` names explicitly is the recommended way to provide data.
Now, this simple example of using PyDataProvider is finished.
The only thing that the user should know is how to generate **one sample** from
...@@ -94,7 +106,7 @@ DataProvider for the sequential model
-------------------------------------
A sequence model takes sequences as its input. A sequence is made up of several
timesteps. The so-called timestep does not necessarily have anything to do
with time. It can also mean that the order of the data is taken into
consideration in model design and training.
For example, a sentence can be interpreted as a kind of sequence data in NLP
tasks.
...@@ -111,7 +123,7 @@ The corresponding data provider can be found in the path below:

.. literalinclude:: ../../../doc_cn/ui/data_provider/sentimental_provider.py

This data provider for the sequential model is a little more complex than that
for the MNIST dataset.
A new initialization method is introduced here.
The method :code:`on_init` is configured to DataProvider by :code:`@provider`'s
...@@ -153,49 +165,29 @@ Please refer to the following section reference for details.
Reference
---------

@provider
+++++++++

.. autofunction:: paddle.trainer.PyDataProvider2.provider
input_types
+++++++++++

PaddlePaddle has four data types, and three sequence types.

The four data types are:

* :code:`dense_vector`: dense float vector.
* :code:`sparse_binary_vector`: sparse binary vector; most of the values are 0, and
  the non-zero elements are fixed to 1.
* :code:`sparse_float_vector`: sparse float vector; most of the values are 0, and some
  non-zero elements can be any float value. They are given by the user.
* :code:`integer`: an integer scalar, that is especially used for a label or word index.

The three sequence types are:

* :code:`SequenceType.NO_SEQUENCE` means the sample is not a sequence.
* :code:`SequenceType.SEQUENCE` means the sample is a sequence.
* :code:`SequenceType.SUB_SEQUENCE` means it is a nested sequence; each timestep of
  the input sequence is also a sequence.

Different input types have different input formats. Their formats are shown
...@@ -215,36 +207,39 @@ in the above table.

where f represents a float value, i represents an integer value.
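For instance, a minimal provider declaring a 784-dimensional dense feature followed
by an integer label, reusing the MNIST-style ';'-separated line format described
earlier on this page (a sketch, not the shipped provider), could look like:

.. code-block:: python

   from paddle.trainer.PyDataProvider2 import provider, dense_vector, integer_value

   # samples must be yielded in the same order as input_types is defined:
   # first the dense vector, then the integer label in [0, 10)
   @provider(input_types=[dense_vector(784), integer_value(10)])
   def process(settings, filename):
       for line in open(filename):
           label, pixels = line.split(';')
           yield [float(v) for v in pixels.split()], int(label)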
.. _settings:

init_hook
+++++++++

init_hook is a function that is invoked once the data provider is initialized.
Its parameters are as follows:

* The first parameter is a settings object, which is the same as the :code:`settings`
  in the :code:`process` method. The object contains several attributes, including:

  * :code:`settings.input_types`: the input types. Reference `input_types`_.
  * :code:`settings.logger`: a logging object.

* The rest of the parameters are keyword arguments, made up of PaddlePaddle
  pre-defined parameters and user-defined parameters.

  * PaddlePaddle-defined parameters include:

    * :code:`is_train` is a bool parameter that indicates whether the DataProvider is used
      for training or testing.
    * :code:`file_list` is the list of all files.

  * User-defined parameters (args) can be set in the training configuration.

Note, PaddlePaddle reserves the right to add pre-defined parameters, so please
use :code:`**kwargs` in init_hook to ensure compatibility by accepting the
parameters which your init_hook does not use.
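As an illustration only (the hook name :code:`my_init_hook` and the user-defined
:code:`label_names` argument are hypothetical; :code:`init_hook` itself is the
:code:`@provider` parameter described above), an init_hook might look like:

.. code-block:: python

   from paddle.trainer.PyDataProvider2 import provider, integer_value

   def my_init_hook(settings, is_train, file_list, label_names=None, **kwargs):
       # stash user-defined and PaddlePaddle-defined information on settings
       settings.label_names = label_names
       settings.logger.info("init: %d files, is_train=%s" % (len(file_list), is_train))

   @provider(input_types=[integer_value(10)], init_hook=my_init_hook)
   def process(settings, filename):
       for line in open(filename):
           # use the information stored by the hook to produce each sample
           yield [settings.label_names.index(line.strip())]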
cache
+++++

DataProvider provides two simple cache strategies. They are:

* :code:`CacheType.NO_CACHE` means no data is cached; data is read at runtime by
  the user implemented python module every pass.
* :code:`CacheType.CACHE_PASS_IN_MEM` means the first pass reads data by the user
  implemented python module, and the rest of the passes will directly read data from
  memory.
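For example, a provider that keeps its data in memory after the first pass might be
declared like this (a sketch; it assumes :code:`CacheType` is importable from
:code:`PyDataProvider2` alongside the other names used on this page):

.. code-block:: python

   from paddle.trainer.PyDataProvider2 import provider, integer_value, CacheType

   # the first pass runs this function; later passes read the cached data
   @provider(input_types=[integer_value(10)],
             cache=CacheType.CACHE_PASS_IN_MEM)
   def process(settings, filename):
       for line in open(filename):
           yield [int(line.strip())]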
...@@ -7,7 +7,7 @@
## API Reference

* [Model Config Interface](api/trainer_config_helpers/index.md)

## Command Line Argument
......
...@@ -12,8 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from py_paddle import swig_paddle, DataProviderConverter
from paddle.trainer.PyDataProvider2 import dense_vector
from paddle.trainer.config_parser import parse_config

TEST_DATA = [[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
...@@ -89,12 +89,12 @@ TEST_DATA = [[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
def main():
    conf = parse_config("./mnist_model/trainer_config.py", "")
    print conf.data_config.load_data_args
    network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config)
    assert isinstance(network, swig_paddle.GradientMachine)  # For code hint.
    network.loadParameters("./mnist_model/")
    converter = DataProviderConverter([dense_vector(784)])
    inArg = converter(TEST_DATA)
    print network.forwardTest(inArg)
......
...@@ -10,27 +10,35 @@ SWIG. The main steps of predict values in python are:
* Predict

Here is a sample python script that shows the typical prediction process for the
MNIST classification problem. A complete sample code could be found at
:code:`src_root/doc/ui/predict/predict_sample.py`.

.. literalinclude:: ./predict_sample.py
   :language: python
   :lines: 15-18,90-100,101-104

The module that does most of the job is py_paddle.swig_paddle; it's
generated by SWIG and has complete documentation. For more details you can use
python's :code:`help()` function. Let's walk through the above python script:

* At the beginning, use :code:`swig_paddle.initPaddle()` to initialize
  PaddlePaddle with command line arguments; for more about command line arguments
  see `Command Line Arguments <../cmd_argument/detail_introduction.html>`_.
* Parse the configuration file that is used in training with :code:`parse_config()`.
  Because the data to predict with always has no label, and the output of prediction
  is normally the output layer rather than the cost layer, you should modify
  the configuration file accordingly before using it in the prediction work.
* Create a neural network with
  :code:`swig_paddle.GradientMachine.createFromConfigProto()`, which takes the
  parsed configuration :code:`conf.model_config` as argument. Then load the
  trained parameters from the model with :code:`network.loadParameters()`.
* Create a data converter object of utility class :code:`DataProviderConverter`.

  - Note: As swig_paddle can only accept C++ matrices, we offer a utility
    class DataProviderConverter that can accept the same input data as
    PyDataProvider2; for more information please refer to the document
    of `PyDataProvider2 <../data_provider/pydataprovider2.html>`_.
* Do the prediction with :code:`forwardTest()`, which takes the converted
  input data and outputs the activations of the output layer.

Here is a typical output:
......
RNN Configuration
=================

.. toctree::
   :maxdepth: 3

* `RNN Configuration <../../../doc/algorithm/rnn/rnn.html>`_
Compile and Install
===================

PaddlePaddle provides several pre-compiled binaries for installation, including Docker images and Ubuntu deb packages. We recommend using Docker images to deploy the environment, and contributions of more installation packages are welcome.

Note: The installation packages are still in pre-release state and your experience of installation may not be smooth.

.. toctree::
   :maxdepth: 1
   :glob:

   Source code download (internal) <../build/internal/download_paddle_source_zh_cn.rst>
   Install with Jumbo (internal) <../build/internal/install_from_jumbo.rst>
   Build and install from source (internal) <../build/internal/build_from_source_zh_cn.rst>
   install/docker_install.rst
   install/ubuntu_install.rst
   cmake/index.rst
Install PaddlePaddle
====================

PaddlePaddle provides several pre-compiled binaries for installation, including Docker images and Ubuntu deb packages. Contributions of more installation packages are welcome. We recommend using Docker images to deploy the PaddlePaddle environment.

Note: The installation packages are still in pre-release state and your experience of installation may not be smooth.

.. toctree::

   docker_install.rst
   ubuntu_install.rst
...@@ -4,10 +4,8 @@
PaddlePaddle currently supports installation via deb packages on Ubuntu 14.04. More installation packages will be provided in the near future.
Contributions of installation packages for other distributions (e.g. Ubuntu, CentOS, Debian, Gentoo) are welcome.

The PaddlePaddle Ubuntu installation packages come in two versions, a CPU version and a GPU version. Their download address is\:

https://github.com/baidu/Paddle/releases/tag/V0.8.0b0

Note that the PaddlePaddle installation packages currently only support
`AVX <https://en.wikipedia.org/wiki/Advanced_Vector_Extensions>`_
...@@ -21,8 +19,10 @@ The PaddlePaddle Ubuntu installation packages come in two versions, a CPU version and a GPU version
dpkg -i paddle-0.8.0b-cpu.deb
apt-get install -f

It is normal for :code:`dpkg -i` to report some missing-dependency errors;
:code:`apt-get install -f` will then continue installing PaddlePaddle.

Note that if you use the GPU version of PaddlePaddle, please install CUDA 7.5 and cuDNN 5 locally,
and set the corresponding environment variables (LD_LIBRARY_PATH and so on).

Possible problems
-----------------
......
Cluster Training
================

* `Cluster Training <../../doc/cluster/index.html>`_

.. toctree::
   :maxdepth: 2
   :glob:

   Cluster Training (internal) <internal/index.md>
...@@ -22,6 +22,7 @@ AutoStructify = transform.AutoStructify
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
sys.path.insert(0, '@PROJ_ROOT@/python')
templates_path = ["@PROJ_ROOT@/doc/templates"]

# -- General configuration ------------------------------------------------
...@@ -51,9 +52,6 @@ table_styling_embed_css = True
autodoc_member_order = 'bysource'

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
# source_suffix = ['.rst', '.md']
......
...@@ -21,5 +21,6 @@
Common Models
'''''''''''''
* `ImageNet: ResNet <../../doc/demo/imagenet_model/resnet_model.html>`_
* `Embedding: Chinese Word <../../doc/demo/embedding_model/index.html>`_
...@@ -4,7 +4,7 @@
## Install

First, please refer to the <a href = "../../build_and_install/index.html">installation tutorial</a> to install PaddlePaddle.

## Overview
...@@ -32,7 +32,7 @@
## Data Preparation
In this problem, we use the [Amazon electronic product review dataset](http://jmcauley.ucsd.edu/data/amazon/)
to classify reviews into positive (positive samples) and negative (negative samples) reviews. `demo/quick_start` in the [source code](https://github.com/baidu/Paddle) provides the data download script
and the preprocessing script.

```bash
...@@ -134,8 +134,8 @@ define_py_data_sources2(train_list='data/train.list',
* obj="process": specifies the function that generates the data
* args={"dictionary": word_dict}: extra arguments; here the dictionary is specified

For more detailed use cases, please refer to the document <a href = "../../../doc/ui/data_provider/python_case.html">Python Use Case</a>;
for the data format and detailed documentation, please refer to <a href = "../../../doc/ui/data_provider/pydataprovider2.html">
PyDataProviderWrapper</a>.

## Network Architecture
...@@ -143,8 +143,8 @@ PyDataProviderWrapper</a>.
<center> ![](./PipelineNetwork.jpg) </center>

We will start from a basic logistic regression network and gradually show more advanced functionality. For more detailed network configuration
links, please refer to the <a href = "../../../doc/layer.html">Layer documentation</a>.
All configurations are in the `demo/quick_start` directory of the [source code](https://github.com/baidu/Paddle); we first list the logistic regression network.

### Logistic Regression
...@@ -350,7 +350,7 @@ lstm = simple_lstm(input=emb, size=lstm_size)
<br>

## Optimization Algorithm
The <a href = "../../../doc/ui/trainer_config_helpers_api.html#module-paddle.trainer_config_helpers.optimizers">optimization algorithms</a> include
Momentum, RMSProp, AdaDelta, AdaGrad, ADAM, Adamax, etc. Here the Adam optimization method is used, together with L2 regularization and gradient clipping.

```python
...@@ -375,7 +375,7 @@ paddle train \
--num_passes=15 \
--use_gpu=false
```
Multi-machine distributed training is not covered here; you can refer to the <a href = "../../cluster/index.html">distributed training</a> demo to learn how to train on multiple machines.

## Prediction
You can use the trained model to evaluate a validation set with labels, or to predict a test set without labels.
...@@ -407,7 +407,7 @@ paddle train \
mv rank-00000 result.txt
```
Here `output/pass-00003` is used as an example for prediction; users can choose the model with the best test result according to the training log. Differences from the training network configuration: no label-related layers are needed, outputs is specified as the probability layer (softmax output),
batch_size is set to 1, no label data is passed, and the prediction data is specified by the location of test_list.
The prediction results are saved as text in `result.txt`, one line per sample, in the following format:
......
新写Layer
=========
* `新写Layer <../../../doc/dev/new_layer/index.html>`_
PaddlePaddle Documentation
==========================

User Guide
----------

* [Quick Start](demo/quick_start/index.md)
* [Build and Install](build_and_install/index.rst)
* [User Interface](ui/index.rst)
* [Examples](demo/index.rst)
* [Model Configuration](ui/model.rst)
* [Cluster Training](cluster/index.rst)

Development Guide
-----------------

* [Writing New Layers](dev/new_layer/index.rst)

Algorithm Tutorials
-------------------

* [RNN Configuration](algorithm/rnn/rnn.rst)
PaddlePaddle Documentation
==========================

User Guide
----------

* `Quick Start <demo/quick_start/index.html>`_
* `Build and Install <build_and_install/index.html>`_
* `User Interface <ui/index.html>`_
* `Examples <demo/index.html>`_
* `Model Configuration <../doc/ui/api/trainer_config_helpers/index.html>`_
* `Cluster Training <cluster/index.html>`_

Development Guide
-----------------

* `Writing New Layers <../doc/dev/new_layer/index.html>`_

Algorithm Tutorials
-------------------

* `RNN Configuration <../doc/algorithm/rnn/rnn.html>`_
Introduction to the PaddlePaddle DataProvider
=============================================

The DataProvider is the module in PaddlePaddle responsible for supplying data. Its job is to load training data into main or GPU memory so the neural network can be trained. For simple cases, users can customize the data-feeding process in Python with :code:`PyDataProvider`; for more complex cases, or for higher efficiency, a :code:`DataProvider` can also be implemented on the C++ side.

PaddlePaddle requires the user to specify, in the network configuration (trainer_config.py), which DataProvider to use and its parameters, along with the training file list (train.list) and the test file list (test.list).

Both train.list and test.list are local files (we recommend placing them directly in the training directory and referencing them by relative path). If test.list is not set, or is set to None, no testing is performed during training; otherwise testing runs during training according to the test mode specified by the command-line arguments, which guards against overfitting.

Normally train.list and test.list are plain text files with one data file per line. The data files live on local disk, and their absolute paths, or paths relative to the PaddlePaddle working directory, are written into train.list and test.list. The lists may also hold HDFS file paths, database connection strings, and so on; the DataProvider must then implement how each such entry is accessed.

For concrete DataProvider usage and how to implement a new DataProvider, see the following articles:
.. toctree::
......
@@ -4,3 +4,5 @@ define_py_data_sources2(train_list='train.list',
                        test_list=None,
                        module='mnist_provider',
                        obj='process')
img = data_layer(name='pixel', size=784)
label = data_layer(name='label', size=10)
from paddle.trainer.PyDataProvider2 import *


# Define a py data provider
@provider(input_types=[
    dense_vector(28 * 28),
    integer_value(10)
])
def process(settings, filename):  # settings is not used currently.
    f = open(filename, 'r')  # open one of the training files
    for line in f:  # read each line
        label, pixel = line.split(';')
        # split the line into the features and the label
        pixels_str = pixel.split(' ')
        pixels_float = []
        for each_pixel_str in pixels_str:
            pixels_float.append(float(each_pixel_str))
        # hand one sample to paddle, keyed by data_layer name
        yield {"pixel": pixels_float, 'label': int(label)}
    f.close()  # close the file
@@ -56,6 +56,14 @@
This states that the training data is 'train.list' and that there is no test data; the DataProvider referenced is the 'process' function in the 'mnist_provider' module.
Using the :code:`data_layer` names from the model configuration, the user can also specify the mapping of returned data explicitly. For example:

.. literalinclude:: mnist_provider.dict.py
   :linenos:

If the mapping is not given explicitly, PaddlePaddle roughly infers it from the declaration order of the layers, and that inference may be wrong, so explicitly mapping return values to data layers is recommended.
This concludes the simple PyDataProvider example. To send data to PaddlePaddle, the user only needs to know how to read **one** sample from **one file**; the PaddlePaddle process takes care of
@@ -116,16 +124,16 @@
Reference
---------
@provider
+++++++++
:code:`@provider` is a Python `Decorator`_ that marks a function as a PyDataProvider. Its parameters include:

* `input_types`_ : the input data format. See `input_types`_ for the available formats.
* should_shuffle: whether this DataProvider shuffles its data. If unset, data is shuffled during training and not shuffled during testing by default.
* min_pool_size: the minimum number of samples the DataProvider buffers in memory, which is also the shuffle granularity PaddlePaddle can guarantee. Setting it to -1 preloads all data into memory.
* pool_size: the number of samples the DataProvider buffers in memory. Setting it to -1 leaves the buffer size unbounded.
* can_over_batch_size: whether Paddle may buffer slightly more than pool_size samples. This avoids many deadlock problems, so True is generally recommended.
@@ -133,9 +141,11 @@
  counts as one batch size, but sometimes, to balance computation, one sample can be counted as several batch sizes.
* cache: the data caching strategy; see `cache`_ .
* init_hook: a function called at initialization time; see `init_hook`_ .
* check: if True, the data is validated against input_types.
* check_fail_continue: if True, samples that fail the check are discarded and training continues; it has no effect when check is False.
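A hedged sketch of how these arguments combine (the values and the MNIST-style parsing are illustrative):

.. code-block:: python

    from paddle.trainer.PyDataProvider2 import *

    @provider(input_types=[dense_vector(28 * 28), integer_value(10)],
              should_shuffle=True,        # shuffle while training
              pool_size=10000,            # buffer at most ~10000 samples
              can_over_batch_size=True,   # may slightly exceed pool_size
              cache=CacheType.CACHE_PASS_IN_MEM,
              check=False)
    def process(settings, filename):
        with open(filename) as f:
            for line in f:
                label, pixels = line.split(';')
                yield [[float(x) for x in pixels.split()], int(label)]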
input_types
+++++++++++
@@ -169,16 +179,11 @@
Here f denotes a float and i denotes an integer.
init_hook
+++++++++
init_hook accepts a function that is called when the DataProvider is initialized. Its parameters are:

* The first parameter is the settings object, the same object as the first parameter of process. Its attributes include:

  * settings.input_types: sets the input types; see `input_types`_ .
  * settings.logger: a logging object.
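A hedged sketch of an init_hook (the dictionary argument is an illustrative assumption passed in via args):

.. code-block:: python

    from paddle.trainer.PyDataProvider2 import *

    def on_init(settings, dictionary, **kwargs):
        # choose the input types at initialization time
        settings.input_types = [
            integer_value_sequence(len(dictionary)),
            integer_value(2)]
        settings.logger.info("dictionary size: %d" % len(dictionary))

    @provider(init_hook=on_init)
    def process(settings, filename):
        ...  # read samples from filename and yield them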
@@ -192,8 +197,6 @@
Note that PaddlePaddle reserves the right to add parameters, so init_hook should use :code:`**kwargs` to accept currently unused arguments and stay compatible.
cache
+++++
@@ -202,3 +205,55 @@
* CacheType.NO_CACHE: caches nothing; data is read from the Python side on every pass.
* CacheType.CACHE_PASS_IN_MEM: the first pass reads data from the Python side; subsequent passes read directly from memory.
Notes
-----

Potential memory leaks
++++++++++++++++++++++

PaddlePaddle passes every line of train.list to the process function, creating one generator per line. If train.list contains 100 training files, 100 generators are created; by itself this is not a serious problem.

However, if every training sample is its own file and there are very many samples, a huge number of generators is created. A generator consumes almost no memory before it is called, but once it has been called it holds on to its current context, and that context can be very large. Moreover, a generator must be called at least twice before it knows it is exhausted, so even if process contains a single yield, the memory is only released after the same generator has been randomly selected a second time.

.. code-block:: python

    def func():
        yield 0

    f = func()  # create a generator
    tmp = next(f)  # the first call returns 0
    tmp = next(f)  # only the second call raises StopIteration

Calling the generators in order avoids this problem.

Best practice is therefore not to put every sample into train.list directly: put the sample locations into a separate text file and write that file's path into train.list, or keep as few variable references as possible inside the Python generator's context. For example:

.. code-block:: python

    def real_process(fn):
        # ... read from fn
        return result  # when the function returns, Python can release its internal references

    def process(fn):
        yield real_process(fn)

This issue lies in the logic PyDataProvider uses to read data and essentially cannot be fixed wholesale.

Running out of memory
+++++++++++++++++++++

PyDataProvider2 uses as much memory as it can, so on machines with little memory it is recommended to set the :code:`pool_size` variable. It should be larger than the training batch size and, as long as memory allows, the larger the better.
@@ -5,6 +5,7 @@
''''''''
.. toctree::
   :maxdepth: 1

   data_provider/index.rst
......
Model Configuration
===================
* `Model Config Interface <../../doc/ui/api/trainer_config_helpers/index.html>`_
@@ -9,22 +9,30 @@
* Prepare the data
* Predict
Typical prediction code is shown below, using MNIST handwritten digit recognition as the example; the complete code is in :code:`src_root/doc/ui/predict/predict_sample.py`.

.. literalinclude:: ../../../doc/ui/predict/predict_sample.py
   :language: python
   :lines: 15-18,90-100,101-104
The main package is py_paddle.swig_paddle, which is comparatively well documented; you can query the documentation with Python's :code:`help()` function. The main steps are:

* At the start of the program, initialize PaddlePaddle by passing command-line arguments to :code:`swig_paddle.initPaddle()`. For details see `command-line arguments <../cmd_argument/detail_introduction.html>`_ .
* Next, parse the training configuration file with :code:`parse_config()`. Note that prediction data usually carries no label, and the prediction network usually outputs its last layer directly instead of a cost layer as during training, so the configuration file used for prediction needs corresponding changes.
* Create the neural network from the parsed configuration with :code:`swig_paddle.GradientMachine.createFromConfigProto()`.
* Create a :code:`DataProviderConverter` object, converter.

  - The raw data swig_paddle accepts is a C++ Matrix, i.e. a float array written directly into memory. That interface is not user friendly, so we provide the utility class DataProviderConverter, which accepts the same input data as PyDataProvider2; for details see the `PyDataProvider2 documentation <../../../doc/ui/data_provider/pydataprovider2.html>`_ .
* Finally, use :code:`forwardTest()` to extract the output of the network's output layer directly. Typical output looks like:
@@ -37,4 +45,4 @@
.. code-block:: text

       2.70634608e-08, 3.48565123e-08, 5.25639710e-09,
       4.48684503e-08]], dtype=float32)}]
Here value is the output of the softmax layer; since two samples were fed in, the output value contains two vectors.
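Putting the steps together, a hedged, self-contained sketch (the config path, parameter directory, and all-zero input sample are illustrative assumptions):

.. code-block:: python

    from py_paddle import swig_paddle, DataProviderConverter
    from paddle.trainer.PyDataProvider2 import dense_vector
    from paddle.trainer.config_parser import parse_config

    # initialize PaddlePaddle with command-line flags
    swig_paddle.initPaddle("--use_gpu=0")

    # parse a prediction-oriented config (assumed to honor is_predict)
    conf = parse_config("trainer_config.py", "is_predict=1")

    # build the network and load trained parameters (illustrative path)
    network = swig_paddle.GradientMachine.createFromConfigProto(conf.model_config)
    network.loadParameters("output/pass-00003")

    # convert plain Python lists into paddle's internal Arguments
    converter = DataProviderConverter([dense_vector(784)])
    data = [[[0.0] * 784]]  # one sample, one dense slot (all zeros)

    output = network.forwardTest(converter(data))
    print(output[0]["value"])  # per-sample softmax probabilities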
@@ -40,3 +40,4 @@ HPPL_ERROR_LOG
unittest.list
proto
dist
setup.py
@@ -22,15 +22,21 @@
# It is the same as PYTHONPATH=${YOUR_PYTHON_PATH}:$PYTHONPATH {exec...}
#
PYPATH="" if ! python -c "import paddle" >/dev/null 2>/dev/null; then
set -x PYPATH=""
while getopts "d:" opt; do set -x
case $opt in while getopts "d:" opt; do
d) case $opt in
PYPATH=$OPTARG d)
;; PYPATH=$OPTARG
esac ;;
done esac
shift $(($OPTIND - 1)) done
export PYTHONPATH=$PYPATH shift $(($OPTIND - 1))
$@ export PYTHONPATH=$PYPATH
$@
else
echo "paddle package is already in your PYTHONPATH. But unittest need a clean environment."
echo "Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'"
exit 1
fi
@@ -7,6 +7,9 @@ add_subdirectory(pserver)
add_subdirectory(trainer)
add_subdirectory(scripts)
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
               ${CMAKE_CURRENT_SOURCE_DIR}/setup.py)
if(WITH_PREDICT_SDK)
  add_subdirectory(predict)
endif()
......
@@ -110,8 +110,8 @@ IVector* Arguments::getSlotSequenceStartPositions(size_t idx) const
  }
}
IVector* Arguments::getSlotSubSequenceStartPositions(size_t idx) const
    throw(RangeError) {
  auto& a = m->getArg(idx);
  if (a.subSequenceStartPositions) {
    return IVector::createByPaddleVectorPtr(
@@ -129,7 +129,7 @@ void Arguments::setSlotSequenceStartPositions(size_t idx,
}
void Arguments::setSlotSubSequenceStartPositions(
    size_t idx, IVector *vec) throw(RangeError) {
  auto& a = m->getArg(idx);
  auto& v = m->cast<paddle::IVector>(vec->getSharedPtr());
  a.subSequenceStartPositions = std::make_shared<paddle::ICpuGpuVector>(v);
......
@@ -20,6 +20,7 @@ limitations under the License. */
#include <string>
#include <vector>
#include "paddle/utils/GlobalConstants.h"
#include "paddle/utils/TypeDefs.h"
/// Import PaddlePaddle's enumeration into global namespace.
using namespace paddle::enumeration_wrapper;  // NOLINT
@@ -392,7 +393,7 @@ public:
  void setSlotSequenceStartPositions(size_t idx,
                                     IVector* vec) throw(RangeError);
  void setSlotSubSequenceStartPositions(size_t idx,
                                        IVector* vec) throw(RangeError);
  void setSlotSequenceDim(size_t idx, IVector* vec) throw(RangeError);

private:
......
@@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/utils/Util.h"
#include "paddle/utils/PythonUtil.h"
#include "paddle/utils/Flags.h"
#include "paddle/utils/Excepts.h"
#include "paddle/parameter/Parameter.h"
#include <fenv.h>
......
@@ -15,6 +15,19 @@
try:
    from paddle_api_config import *
    import os.path
    import platform

    system = platform.system().lower()
    is_osx = (system == 'darwin')
    is_win = (system == 'windows')
    is_lin = (system == 'linux')

    if is_lin:
        whole_start = "-Wl,--whole-archive"
        whole_end = "-Wl,--no-whole-archive"
    elif is_osx:
        whole_start = ""
        whole_end = ""
    LIB_DIRS = ["math", 'utils', 'parameter', "gserver", "api", "cuda", "pserver", "trainer"]
    PARENT_LIB_DIRS = ['proto']
@@ -56,9 +69,9 @@ try:
        def libs_str(self):
            libs = [
                whole_start,
                "-lpaddle_gserver",
                whole_end,
                "-lpaddle_pserver",
                "-lpaddle_trainer_lib",
                "-lpaddle_network",
......
set(AVX_SOURCES
    src/hl_math.cc
    src/hl_avx_functions.cc)

set(CUDA_SOURCES
    src/hl_time.cc
    src/hl_cpu_functions.cc
    ${AVX_SOURCES})
set(CUDA_CXX_WITH_GPU_SOURCES
    src/hl_cuda_cublas.cc
@@ -12,7 +15,7 @@ set(CUDA_CXX_WITH_GPU_SOURCES
set_source_files_properties(${CUDA_CXX_WITH_GPU_SOURCES}
                            PROPERTIES COMPILE_FLAGS "-D__NVCC__")

set_source_files_properties(${AVX_SOURCES}
                            PROPERTIES COMPILE_FLAGS "-mavx")

set(CUDA_DSO_SOURCES
@@ -73,4 +76,3 @@ endif()
add_style_check_target(paddle_cuda ${CUDA_SOURCES})
add_style_check_target(paddle_cuda ${CUDA_HEADERS})
# TODO(yuyang18): Format hppl style
@@ -321,13 +321,14 @@ extern const char* hl_get_device_error_string(size_t err);
extern int hl_get_device_last_error();
/**
 * @brief Check whether a cuda event is ready.
 *
 * @param[in]  event   cuda event to query.
 *
 * @return  true if the cuda event is ready, false otherwise.
 */
extern bool hl_cuda_event_is_ready(hl_event_t event);
/**
 * @brief hppl device synchronization.
......
@@ -16,26 +16,37 @@ limitations under the License. */
#ifndef HL_DEVICE_FUNCTIONS_CUH_
#define HL_DEVICE_FUNCTIONS_CUH_
namespace paddle {

template <class T>
inline __device__ T paddleAtomicAdd(T* address, T val);

template <>
inline __device__ float paddleAtomicAdd(float* address, float val) {
  return atomicAdd(address, val);
}

template <>
inline __device__ double paddleAtomicAdd(double* address, double val) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
  // Native double-precision atomicAdd exists on compute capability >= 6.0.
  return atomicAdd(address, val);
#else
  // Emulate double-precision atomicAdd with a compare-and-swap loop.
  // NOLINTNEXTLINE
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;  // NOLINT

  do {
    assumed = old;
    old = atomicCAS(address_as_ull,
                    assumed,
                    __double_as_longlong(val +
                                         __longlong_as_double(assumed)));
  } while (assumed != old);

  return __longlong_as_double(old);
#endif
}

}  // namespace paddle

#endif /* HL_DEVICE_FUNCTIONS_CUH_ */
@@ -192,10 +192,10 @@ __global__ void KeLstmBackward(Op op,
  if (isBatch) {
    if (value.prevStateValue) {
      if (grad.checkIgGrad) paddle::paddleAtomicAdd(grad.checkIgGrad+frameIdx, rCheckIGrad);
      if (grad.checkFgGrad) paddle::paddleAtomicAdd(grad.checkFgGrad+frameIdx, rCheckFGrad);
    }
    if (grad.checkOgGrad) paddle::paddleAtomicAdd(grad.checkOgGrad+frameIdx, rCheckOGrad);
  } else {
    if (value.prevStateValue) {
      if (grad.checkIgGrad) grad.checkIgGrad[frameIdx] += rCheckIGrad;
......
@@ -27,6 +27,8 @@ typedef float4 vecType;
typedef double2 vecType;
#endif
#else
#include <mmintrin.h>
#include <xmmintrin.h>
#include <emmintrin.h>
#ifndef HPPL_TYPE_DOUBLE
typedef __m128 vecType;
......
@@ -25,6 +25,9 @@ limitations under the License. */
#define VECTOR_LEN 4
#define VECTOR_SET _mm_set_ps1
#else
#if defined(__APPLE__) || defined(__OSX__)
#define _mm_set_pd1 _mm_set1_pd
#endif
/* number of double in vector */
#define VECTOR_LEN 2
#define VECTOR_SET _mm_set_pd1
......
@@ -89,7 +89,7 @@ inline const char* hl_get_device_error_string() { return NULL; }
inline const char* hl_get_device_error_string(size_t err) { return NULL; }

inline bool hl_cuda_event_is_ready(hl_event_t event) { return true; }

inline void hl_device_synchronize() {}
......
@@ -261,11 +261,7 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) {
  struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
  hl_event_t hl_event = &hl_event_st;
  while (!hl_cuda_event_is_ready(hl_event)) {}

  KeVectorSum<128><<< grid, threads, 0, STREAM_DEFAULT >>>
    (A_d, t_resource.gpu_mem, dimM);
@@ -275,7 +271,10 @@ void hl_vector_sum(real *A_d, real *C_h, int dimM) {
  hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
  hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);

  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
  cudaError_t err = (cudaError_t)hl_get_device_last_error();
  CHECK_EQ(cudaSuccess, err)
    << "CUDA error: " << hl_get_device_error_string((size_t)err);
}
template <int blockSize>
@@ -317,11 +316,7 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) {
  struct _hl_event_st hl_event_st = {.cu_event = t_resource.event};
  hl_event_t hl_event = &hl_event_st;
  while (!hl_cuda_event_is_ready(hl_event)) {}

  KeVectorAbsSum<128><<< grid, threads, 0, STREAM_DEFAULT >>>
    (A_d, t_resource.gpu_mem, dimM);
@@ -331,5 +326,8 @@ void hl_vector_abs_sum(real *A_d, real *C_h, int dimM) {
  hl_memcpy_async(C_h, t_resource.cpu_mem, sizeof(real), HPPL_STREAM_DEFAULT);
  hl_stream_record_event(HPPL_STREAM_DEFAULT, hl_event);

  hl_stream_synchronize(HPPL_STREAM_DEFAULT);
  cudaError_t err = (cudaError_t)hl_get_device_last_error();
  CHECK_EQ(cudaSuccess, err)
    << "CUDA error: " << hl_get_device_error_string((size_t)err);
}
@@ -217,7 +217,7 @@ void hl_matrix_mul(real *A_d, hl_trans_op_t transa,
  } else {
    LOG(FATAL) << "parameter transa error!";
  }
  CHECK_EQ(stat, CUBLAS_STATUS_SUCCESS) << hl_cublas_get_error_string(stat);
  CHECK_SYNC("hl_matrix_mul failed");
}
@@ -266,7 +266,7 @@ void hl_matrix_mul_vector(real *A_d, hl_trans_op_t trans,
    LOG(FATAL) << "parameter transa error!";
  }
  CHECK_EQ(stat, CUBLAS_STATUS_SUCCESS) << hl_cublas_get_error_string(stat);
  CHECK_SYNC("hl_matrix_mul_vector");
}
......
@@ -150,7 +150,7 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R3(DYNAMIC_LOAD_CUDNN_WRAP)
// APIs available after R4:
#if CUDNN_VERSION >= 4007
#define CUDNN_DNN_ROUTINE_EACH_AFTER_R4(__macro)   \
  __macro(cudnnBatchNormalizationForwardTraining)  \
  __macro(cudnnBatchNormalizationForwardInference) \
@@ -999,7 +999,7 @@ void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
                                    double epsilon,
                                    real *savedMean,
                                    real *savedVar) {
#if CUDNN_VERSION >= 4007
  if ((NULL != runningMean && NULL == runningInvVar) ||
      (NULL == runningMean && NULL != runningInvVar)) {
    LOG(FATAL) << "runningMean and runningInvVar can be NULL "
@@ -1024,7 +1024,7 @@ void hl_batch_norm_forward_training(hl_tensor_descriptor inputDesc,
  CHECK_SYNC("hl_batch_norm_forward_training failed");
#else
  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. "
             << "But cudnn lib version is " << g_cudnn_lib_version;
#endif
}
@@ -1039,7 +1039,7 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
                                     real *estimatedMean,
                                     real *estimatedInvVar,
                                     double epsilon) {
#if CUDNN_VERSION >= 4007
  cudnnTensorDescriptor_t xDesc = GET_TENSOR_DESCRIPTOR(inputDesc);
  cudnnTensorDescriptor_t yDesc = GET_TENSOR_DESCRIPTOR(outputDesc);
  cudnnTensorDescriptor_t bnDesc = GET_TENSOR_DESCRIPTOR(bnParamDesc);
@@ -1053,7 +1053,7 @@ void hl_batch_norm_forward_inference(hl_tensor_descriptor inputDesc,
  CHECK_SYNC("hl_batch_norm_forward_inference failed");
#else
  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. "
             << "But cudnn lib version is " << g_cudnn_lib_version;
#endif
}
@@ -1071,7 +1071,7 @@ void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
                            double epsilon,
                            real *savedMean,
                            real *savedInvVar) {
#if CUDNN_VERSION >= 4007
  if ((NULL != savedMean && NULL == savedInvVar) ||
      (NULL == savedMean && NULL != savedInvVar)) {
    LOG(FATAL) << "savedMean and savedVar can be NULL "
@@ -1087,16 +1087,14 @@ void hl_batch_norm_backward(hl_tensor_descriptor inputDesc,
  cudnnBatchNormMode_t mode = CUDNN_BATCHNORM_SPATIAL;
  CHECK_CUDNN(dynload::cudnnBatchNormalizationBackward(
      t_resource.cudnn_handle, mode, &alpha, &beta,
      &alpha, &beta,
      xDesc, input, dyDesc, outGrad, dxDesc, inGrad,
      bnDesc, scale, scaleGrad, biasGrad, epsilon,
      savedMean, savedInvVar));
  CHECK_SYNC("hl_batch_norm_backward failed");
#else
  LOG(FATAL) << "CudnnBatchNorm requires cudnn version >= 4007. "
             << "But cudnn lib version is " << g_cudnn_lib_version;
#endif
}
@@ -209,7 +209,18 @@ __thread cudaStream_t default_stream = 0;
__thread bool g_sync_flag = true;
bool hl_start_flag = false;

inline pid_t gettid() {
#if defined(__APPLE__) || defined(__OSX__)
  pid_t tid = syscall(SYS_thread_selfid);
#else
#ifndef __NR_gettid
#define __NR_gettid 224
#endif
  pid_t tid = syscall(__NR_gettid);
#endif
  CHECK_NE(tid, -1);
  return tid;
}
void hl_init(int device) {
  CHECK(hl_start_flag)
@@ -751,11 +762,12 @@ void hl_set_device_flags_block() {
                        cudaDeviceScheduleBlockingSync));
}

bool hl_cuda_event_is_ready(hl_event_t event) {
  cudaError_t err = dynload::cudaEventQuery(event->cu_event);
  CHECK(cudaSuccess == err || cudaErrorNotReady == err);

  if (cudaErrorNotReady == err) {
    return false;
  }
  return true;
}
@@ -564,11 +564,11 @@ __global__ void KeLstmBackward(real *gateValue,
  /* TODO: Temporary save & merger in another kernel */
  if (frameIdy == 1) {
    if (checkIgGrad) paddle::paddleAtomicAdd(checkIgGrad+frameIdx, rCheckGrad);
  } else if (frameIdy == 2) {
    if (checkFgGrad) paddle::paddleAtomicAdd(checkFgGrad+frameIdx, rCheckGrad);
  } else if (frameIdy == 3) {
    if (checkOgGrad) paddle::paddleAtomicAdd(checkOgGrad+frameIdx, rCheckGrad);
  }
}
......
@@ -19,6 +19,7 @@ limitations under the License. */
#include "hl_matrix_apply.cuh"
#include "hl_sequence.h"
#include "paddle/utils/Logging.h"
#include "hl_device_functions.cuh"

DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
DEFINE_MATRIX_TERNARY_PARAMETER_OP(_add, TWO_PARAMETER, c = p1*a + p2*b);
@@ -266,25 +267,21 @@ template<int blockSize>
__global__ void KeMatrixClassificationError(real* in_A,
                                            int* in_B,
                                            real* out_C,
                                            int dimN) {
  __shared__ real max_s[blockSize];
  __shared__ int max_l[blockSize];
  const int tid = threadIdx.x;
  const int rowId = blockIdx.x;

  max_s[tid] = -1e30f;
  in_A += rowId * dimN;
  real tmp;
  for (int colId = tid; colId < dimN; colId += blockSize) {
    tmp = in_A[colId];
    if (max_s[tid] < tmp) {
      max_s[tid] = tmp;
      max_l[tid] = colId;
    }
  }
  __syncthreads();
@@ -300,7 +297,7 @@ __global__ void KeMatrixClassificationError(real* in_A,
  __syncthreads();

  if (tid == 0) {
    out_C[rowId] = (max_l[0] == in_B[rowId] ? 0 : 1.0f);
  }
}
} }
@@ -313,12 +310,9 @@ void hl_matrix_classification_error(real* A_d,
  CHECK_NOTNULL(B_d);
  CHECK_NOTNULL(C_d);

  // each sample is calculated by one block
  KeMatrixClassificationError<1024><<< dimM, 1024, 0, STREAM_DEFAULT >>>
    (A_d, B_d, C_d, dimN);
  CHECK_SYNC("hl_matrix_classification_error");
}
@@ -629,7 +623,7 @@ __global__ void KeCosSimDerivative(real* grad,
      prevGradY[index] +=
        scale * grad[ty] * prevOutX[index] * reciprocal;
    } else {
      paddle::paddleAtomicAdd(prevGradY + index,
        scale * grad[ty] * prevOutX[index] * reciprocal);
    }
  }
@@ -646,7 +640,7 @@ __global__ void KeCosSimDerivative(real* grad,
        (prevOutX[index] * reciprocalXY -
         prevOutY[index] * reciprocalSquareSumY);
    } else {
      paddle::paddleAtomicAdd(prevGradY + index, output[ty] * grad[ty] *
        (prevOutX[index] * reciprocalXY -
         prevOutY[index] * reciprocalSquareSumY));
    }
  }
......
@@ -362,7 +362,7 @@ __global__ void KeMatrixAddRows(real* output,
      if (AddRow == 0) {
        outputData[i] += tableData[i];
      } else {
        paddle::paddleAtomicAdd(&tableData[i], outputData[i]);
      }
    }
  }
}
......
@@ -280,7 +280,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d,
  if (index_n_t < dimN) {
    real tmp;
    tmp = alpha*a_r*b_r[n];
    paddle::paddleAtomicAdd(C_d_r, tmp);
    C_d_r += CU_CSC_MUL_DENSE_THREAD_X;
    index_n_t += CU_CSC_MUL_DENSE_THREAD_X;
  }
@@ -328,7 +328,7 @@ __global__ void KeSMatrixCscMulDense(real *C_d,
  if (index_n_t < dimN) {
    real tmp;
    tmp = alpha*a_r*b_r[n];
    paddle::paddleAtomicAdd(C_d_r, tmp);
    C_d_r += CU_CSC_MUL_DENSE_THREAD_X;
    index_n_t += CU_CSC_MUL_DENSE_THREAD_X;
  }
@@ -629,7 +629,7 @@ __global__ void KeSMatrixDenseMulCsr(real *C_d,
  for (int n=0; n < CU_DM_CSR_N; n++) {
    if (index_m_t++ < dimM) {
      tmp = alpha * b_r * a_r[n];
      paddle::paddleAtomicAdd(C_d_r, tmp);
      C_d_r += dimN;
    }
  }
@@ -660,7 +660,7 @@ __global__ void KeSMatrixDenseMulCsr(real *C_d,
  for (int n=0; n < CU_DM_CSR_N; n++) {
    if (index_m_t++ < dimM) {
      tmp = alpha * b_r * a_r[n];
      paddle::paddleAtomicAdd(C_d_r, tmp);
      C_d_r += dimN;
    }
  }
@@ -912,7 +912,7 @@ __global__ void KeSMatrixCsrColumnSum(real* a_val, real* csr_val,
  for (int idx = gid; idx < dimNNZ; idx += gridDim.x * blockDim.x) {
    int colIdx = csr_col[idx];
    real val = csr_val[idx];
    paddle::paddleAtomicAdd(a_val + colIdx, val);
  }
}
......
@@ -69,23 +69,40 @@ static inline void GetDsoHandleWithSearchPath(
  CHECK(nullptr != *dso_handle)
    << "For Gpu version of PaddlePaddle, it couldn't find CUDA library: "
    << dlPath.c_str() << ". Please make sure you already specify its path. "
    << "Note: for training data on Cpu using Gpu version of PaddlePaddle, "
    << "you must specify libcudart via export LD_LIBRARY_PATH for Linux or "
    << "export DYLD_LIBRARY_PATH for MAC OS.";
}

void GetCublasDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
  GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcublas.dylib", dso_handle);
#else
  GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcublas.so", dso_handle);
#endif
}

void GetCudnnDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
  GetDsoHandleWithSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", dso_handle);
#else
  GetDsoHandleWithSearchPath(FLAGS_cudnn_dir, "libcudnn.so", dso_handle);
#endif
}

void GetCudartDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
  GetDsoHandleWithSearchPath("", "libcudart.dylib", dso_handle);
#else
  GetDsoHandleWithSearchPath("", "libcudart.so", dso_handle);
#endif
}

void GetCurandDsoHandle(void** dso_handle) {
#if defined(__APPLE__) || defined(__OSX__)
  GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcurand.dylib", dso_handle);
#else
  GetDsoHandleWithSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
#endif
}
@@ -35,7 +35,7 @@ __global__ void KeMatrixAddRows(real* output, int ldo,
  real *tab = table + tableId * ldt;
  for (int i = idx; i < dim; i += blockDimX) {
    if (AddRow) {
      paddle::paddleAtomicAdd(&tab[i], out[i]);
    } else {
      out[i] += tab[i];
    }
......
@@ -149,9 +149,13 @@ void DoubleBuffer::startAsyncLoad() {
  taskReadySem_.post();
}

ClassRegistrar<DataProvider, DataConfig, ModelConfig, bool>
    DataProvider::registrar_;

DataProvider* DataProvider::create(const DataConfig& config,
                                   const ModelConfig& modelConfig,
                                   bool useGpu) {
  return registrar_.createByType(config.type(), config, modelConfig, useGpu);
}

REGISTER_DATA_PROVIDER(simple, SimpleDataProvider);
......
@@ -39,15 +39,30 @@ limitations under the License. */
#include "paddle/parameter/Argument.h"

namespace paddle {
/**
 * @def REGISTER_DATA_PROVIDER
 * @brief Macro for registering a data provider. The class type should contain
 *        a constructor with parameter (DataConfig, bool).
 */
#define REGISTER_DATA_PROVIDER(__type_name, __class_name)\
  static InitFunction __reg_type_##__type_name([]() {\
    DataProvider::registrar_.registerClass(\
      #__type_name, \
      [](DataConfig conf, ModelConfig, bool useGpu) -> DataProvider* { \
        DataProvider* dp = new __class_name (conf, useGpu);\
        return dp;\
      });\
  })

/**
 * @def REGISTER_DATA_PROVIDER_EX
 * @brief Macro for registering a data provider, which contains a constructor
 *        with parameter (DataConfig, ModelConfig, bool).
 */
#define REGISTER_DATA_PROVIDER_EX(__type_name, __class_name) \
  static InitFunction __reg_type_##__type_name([] { \
    DataProvider::registrar_.registerClass<__class_name>(#__type_name); \
  })
class DataBatch;
class BufferBatch;
@@ -285,10 +300,18 @@ */
*/ */
class DataProvider { class DataProvider {
public: public:
static ClassRegistrar<DataProvider, DataConfig, bool> registrar_; static ClassRegistrar<DataProvider, DataConfig, ModelConfig, bool> registrar_;
static DataProvider* create(const DataConfig& config, static DataProvider* create(const DataConfig& config,
const ModelConfig& modelConfig,
bool useGpu = FLAGS_use_gpu); bool useGpu = FLAGS_use_gpu);
  /**
   * @brief This create overload is only used by unit tests.
   */
  inline static DataProvider* create(const DataConfig &config, bool useGpu) {
    return create(config, ModelConfig(), useGpu);
  }
  DataProvider(const DataConfig& config, bool useGpu)
      : config_(config),
        skipShuffle_(false),
@@ -336,13 +359,13 @@ public:
   * @note return -1 to indicate unlimited number of samples.
   */
  virtual int64_t getSize() = 0;

  /**
   * @brief Get next batch training samples internally
   * @param[in]  size   size of training samples to get
   * @param[out] batch  a batch of training samples
   * @return actual size of obtained training samples
   */
  virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch) = 0;

protected:
......
@@ -65,7 +65,8 @@ void DataProviderGroup<T>::reset() {
  provider_ = nullptr;

  // shuffle file list
  std::shuffle(fileList_.begin(), fileList_.end(),
               ThreadLocalRandomEngine::get());

  startLoader();
  DataProvider::reset();
......
@@ -22,7 +22,9 @@ namespace paddle {
using namespace std;

MultiDataProvider::MultiDataProvider(const DataConfig& config,
                                     const ModelConfig& modelConfig,
                                     bool useGpu)
    : DataProvider(config, useGpu) {
  bool atLeastOneMainDataFlag = false;
  totalDataRatio_ = 0;
@@ -58,7 +60,9 @@ MultiDataProvider::MultiDataProvider(const DataConfig& config,
      subConfig.set_async_load_data(false);
    }
    subDataProviders_[i] =
        std::unique_ptr<DataProvider>(DataProvider::create(subConfig,
                                                           modelConfig,
                                                           useGpu_));
  }
}
@@ -116,6 +120,6 @@ int64_t MultiDataProvider::getNextBatchInternal(int64_t size,
  return batch->getSize();
}

REGISTER_DATA_PROVIDER_EX(multi, MultiDataProvider);
}  // namespace paddle
@@ -24,7 +24,9 @@ protected:
  std::vector<std::unique_ptr<DataProvider>> subDataProviders_;

public:
  MultiDataProvider(const DataConfig& config,
                    const ModelConfig& modelConfig,
                    bool useGpu);
  ~MultiDataProvider() {}
  virtual void reset();
  virtual void shuffle();
......
@@ -374,7 +374,8 @@ void ProtoDataProvider::reset() {
}

void ProtoDataProvider::shuffle() {
  std::shuffle(shuffledSequenceIds_.begin(), shuffledSequenceIds_.end(),
               ThreadLocalRandomEngine::get());
}

/*
......
@@ -17,6 +17,8 @@ limitations under the License. */
#include "paddle/utils/PythonUtil.h"
#include <fenv.h>
#include "paddle/utils/Util.h"
#include "paddle/utils/Excepts.h"

namespace paddle {
@@ -44,7 +46,6 @@ PyDataProvider::PyDataProvider(const DataConfig& config, bool useGpu,
}

void PyDataProvider::loadData(const std::vector<std::string>& fileList) {
  VLOG(1) << "module:" << pyModuleName_ << " class:" << pyClassName_;
  classInstance_ =
      createPythonClass(pyModuleName_, pyClassName_, fileList, pyUserArgs_);
@@ -55,7 +56,7 @@ void PyDataProvider::loadData(const std::vector<std::string>& fileList) {
  std::string headerInfo =
      std::string(PyString_AsString(obj.get()), PyString_Size(obj.get()));
  parseHeaderData(headerInfo);
  feenableexcept(FE_INVALID | FE_DIVBYZERO | FE_OVERFLOW);
}

void PyDataProvider::parseHeaderData(const std::string& headerData) {
......