Merge branch 'develop' of github.com:PaddlePaddle/Paddle into fix_404_dist_train

ee3483b0 · Yancey1989 · 10acacf1 · 36444461 · ee3483b0 · ee3483b0
192 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,6 +16,8 @@ cmake_minimum_required(VERSION 3.0)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
+SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
+SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 include(system)
@@ -54,6 +56,7 @@ option(WITH_C_API       "Compile PaddlePaddle with C-API(Prediction)"   OFF)
 option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
 option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
 option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
+option(WITH_DISTRIBUTE  "Compile with grpc distributed support"         OFF)
 option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
 # CMAKE_BUILD_TYPE
@@ -67,9 +70,6 @@ if(ANDROID OR IOS)
    if(ANDROID)
        if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16")
            message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16")
-        elseif(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
-            # TODO: support glog for Android api 16 ~ 19 in the future
-            message(WARNING "Using the unofficial git repository <https://github.com/Xreki/glog.git> instead")
        endif()
    endif()
@@ -83,6 +83,8 @@ if(ANDROID OR IOS)
        "Disable RDMA when cross-compiling for Android and iOS" FORCE)
    set(WITH_MKL OFF CACHE STRING
        "Disable MKL when cross-compiling for Android and iOS" FORCE)
+    set(WITH_GOLANG OFF CACHE STRING
+        "Disable golang when cross-compiling for Android and iOS" FORCE)
    # Compile PaddlePaddle mobile inference library
    if (NOT WITH_C_API)

--- a/benchmark/paddle/image/googlenet.py
+++ b/benchmark/paddle/image/googlenet.py
@@ -6,10 +6,21 @@ width = 224
 num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
 use_gpu = get_config_arg('use_gpu', bool, True)
+is_infer = get_config_arg("is_infer", bool, False)
-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+args = {
+    'height': height,
+    'width': width,
+    'color': True,
+    'num_class': num_class,
+    'is_infer': is_infer
+}
 define_py_data_sources2(
-    "train.list", None, module="provider", obj="process", args=args)
+    "train.list" if not is_infer else None,
+    "test.list" if is_infer else None,
+    module="provider",
+    obj="process",
+    args=args)
 settings(
    batch_size=batch_size,
@@ -146,7 +157,6 @@ def inception(name, input, channels, \
    return cat
-lab = data_layer(name="label", size=1000)
 data = data_layer(name="input", size=3 * height * width)
 # stage 1
@@ -224,6 +234,10 @@ pool5 = img_pool_layer(
 dropout = dropout_layer(name="dropout", input=pool5, dropout_rate=0.4)
 out3 = fc_layer(
    name="output3", input=dropout, size=1000, act=SoftmaxActivation())
-loss3 = cross_entropy(name='loss3', input=out3, label=lab)
-outputs(loss3)
+if is_infer:
+    outputs(out3)
+else:
+    lab = data_layer(name="label", size=num_class)
+    loss3 = cross_entropy(name='loss3', input=out3, label=lab)
+    outputs(loss3)
--- a/benchmark/paddle/image/provider.py
+++ b/benchmark/paddle/image/provider.py
@@ -13,14 +13,20 @@ def initHook(settings, height, width, color, num_class, **kwargs):
        settings.data_size = settings.height * settings.width * 3
    else:
        settings.data_size = settings.height * settings.width
+    settings.is_infer = kwargs.get('is_infer', False)
+    if settings.is_infer:
+        settings.slots = [dense_vector(settings.data_size)]
+    else:
        settings.slots = [dense_vector(settings.data_size), integer_value(1)]
 @provider(
    init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
 def process(settings, file_list):
-    for i in xrange(1024):
+    for i in xrange(2560 if settings.is_infer else 1024):
        img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
+        if settings.is_infer:
+            yield img.astype('float32')
+        else:
            lab = random.randint(0, settings.num_class - 1)
            yield img.astype('float32'), int(lab)
--- a/benchmark/paddle/image/resnet.py
+++ b/benchmark/paddle/image/resnet.py
@@ -6,11 +6,21 @@ width = 224
 num_class = 1000
 batch_size = get_config_arg('batch_size', int, 64)
 layer_num = get_config_arg("layer_num", int, 50)
-is_test = get_config_arg("is_test", bool, False)
+is_infer = get_config_arg("is_infer", bool, False)
-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+args = {
+    'height': height,
+    'width': width,
+    'color': True,
+    'num_class': num_class,
+    'is_infer': is_infer
+}
 define_py_data_sources2(
-    "train.list", None, module="provider", obj="process", args=args)
+    "train.list" if not is_infer else None,
+    "test.list" if is_infer else None,
+    module="provider",
+    obj="process",
+    args=args)
 settings(
    batch_size=batch_size,
@@ -45,7 +55,10 @@ def conv_bn_layer(name,
        act=LinearActivation(),
        bias_attr=False)
    return batch_norm_layer(
-        name=name + "_bn", input=tmp, act=active_type, use_global_stats=is_test)
+        name=name + "_bn",
+        input=tmp,
+        act=active_type,
+        use_global_stats=is_infer)
 def bottleneck_block(name, input, num_filters1, num_filters2):
@@ -207,7 +220,9 @@ elif layer_num == 152:
 else:
    print("Wrong layer number.")
-lbl = data_layer(name="label", size=num_class)
+if is_infer:
-loss = cross_entropy(name='loss', input=resnet, label=lbl)
+    outputs(resnet)
-inputs(img, lbl)
+else:
-outputs(loss)
+    lbl = data_layer(name="label", size=num_class)
+    loss = cross_entropy(name='loss', input=resnet, label=lbl)
+    outputs(loss)
--- a/benchmark/paddle/image/run_mkldnn_infer.sh
+++ b/benchmark/paddle/image/run_mkldnn_infer.sh
+set -e
+function clock_to_seconds() {
+  hours=`echo $1 | awk -F ':' '{print $1}'`
+  mins=`echo $1 | awk -F ':' '{print $2}'`
+  secs=`echo $1 | awk -F ':' '{print $3}'`
+  echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
+}
+function infer() {
+  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
+  topology=$1
+  layer_num=$2
+  bs=$3
+  use_mkldnn=$4
+  if [ $4 == "True" ]; then
+    thread=1
+    log="logs/infer-${topology}-${layer_num}-mkldnn-${bs}.log"
+  elif [ $4 == "False" ]; then
+    thread=`nproc`
+    if [ $thread -gt $bs ]; then
+      thread=$bs
+    fi
+    log="logs/infer-${topology}-${layer_num}-${thread}mklml-${bs}.log"
+  else
+    echo "Wrong input $4, use True or False."
+    exit 0
+  fi
+  models_in="models/${topology}-${layer_num}/pass-00000/"
+  if [ ! -d $models_in ]; then
+    echo "Training model ${topology}_${layer_num}"
+    paddle train --job=train \
+      --config="${topology}.py" \
+      --use_mkldnn=True \
+      --use_gpu=False \
+      --trainer_count=1 \
+      --num_passes=1 \
+      --save_dir="models/${topology}-${layer_num}" \
+      --config_args="batch_size=128,layer_num=${layer_num}" \
+      > /dev/null 2>&1
+    echo "Done"
+  fi
+  log_period=$((256 / bs))
+  paddle train --job=test \
+    --config="${topology}.py" \
+    --use_mkldnn=$use_mkldnn \
+    --use_gpu=False \
+    --trainer_count=$thread \
+    --log_period=$log_period \
+    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \
+    --init_model_path=$models_in \
+    2>&1 | tee ${log}
+  # calculate the last 5 logs period time of 1280 samples,
+  # the time before are burning time.
+  start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
+  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
+  start_sec=`clock_to_seconds $start`
+  end_sec=`clock_to_seconds $end`
+  fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'`
+  echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
+  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
+}
+if [ ! -f "train.list" ]; then
+  echo " " > train.list
+fi
+if [ ! -f "test.list" ]; then
+  echo " " > test.list
+fi
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+if [ ! -d "models" ]; then
+  mkdir -p models
+fi
+# inference benchmark
+for use_mkldnn in True False; do
+  for batchsize in 1 2 4 8 16; do
+    infer googlenet v1 $batchsize $use_mkldnn
+    infer resnet 50 $batchsize $use_mkldnn
+    infer vgg 19 $batchsize $use_mkldnn
+  done
+done
--- a/benchmark/paddle/image/run_mkldnn.sh
+++ b/benchmark/paddle/image/run_mkldnn.sh
@@ -8,13 +8,13 @@ function train() {
  use_mkldnn=$4
  if [ $4 == "True" ]; then
    thread=1
-    log="logs/${topology}-${layer_num}-mkldnn-${bs}.log"
+    log="logs/train-${topology}-${layer_num}-mkldnn-${bs}.log"
  elif [ $4 == "False" ]; then
    thread=`nproc`
    # each trainer_count use only 1 core to avoid conflict
-    log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log"
+    log="logs/train-${topology}-${layer_num}-${thread}mklml-${bs}.log"
  else
-    echo "Wrong input $3, use True or False."
+    echo "Wrong input $4, use True or False."
    exit 0
  fi
  args="batch_size=${bs},layer_num=${layer_num}"
@@ -30,13 +30,14 @@ function train() {
    2>&1 | tee ${log} 
 }
-if [ ! -d "train.list" ]; then
+if [ ! -f "train.list" ]; then
  echo " " > train.list
 fi
 if [ ! -d "logs" ]; then
  mkdir logs
 fi
+# training benchmark
 for use_mkldnn in True False; do
  for batchsize in 64 128 256; do
    train vgg 19 $batchsize $use_mkldnn

--- a/benchmark/paddle/image/vgg.py
+++ b/benchmark/paddle/image/vgg.py
@@ -6,10 +6,21 @@ width = 224
 num_class = 1000
 batch_size = get_config_arg('batch_size', int, 64)
 layer_num = get_config_arg('layer_num', int, 19)
+is_infer = get_config_arg("is_infer", bool, False)
-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+args = {
+    'height': height,
+    'width': width,
+    'color': True,
+    'num_class': num_class,
+    'is_infer': is_infer
+}
 define_py_data_sources2(
-    "train.list", None, module="provider", obj="process", args=args)
+    "train.list" if not is_infer else None,
+    "test.list" if is_infer else None,
+    module="provider",
+    obj="process",
+    args=args)
 settings(
    batch_size=batch_size,
@@ -98,6 +109,9 @@ elif layer_num == 19:
 else:
    print("Wrong layer number.")
-lab = data_layer('label', num_class)
+if is_infer:
-loss = cross_entropy(input=vgg, label=lab)
+    outputs(vgg)
-outputs(loss)
+else:
+    lab = data_layer('label', num_class)
+    loss = cross_entropy(input=vgg, label=lab)
+    outputs(loss)
--- a/cmake/external/cares.cmake
+++ b/cmake/external/cares.cmake
@@ -13,7 +13,7 @@
 # limitations under the License.
 #
-IF(MOBILE_INFERENCE)
+IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE)
    return()
 ENDIF()

--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -26,12 +26,21 @@ ENDIF(WIN32)
 INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
+IF(ANDROID AND ${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
+  # Using the unofficial glog for Android API < 21
+  SET(GLOG_REPOSITORY "https://github.com/Xreki/glog.git")
+  SET(GLOG_TAG "8a547150548b284382ccb6582408e9140ff2bea8")
+ELSE()
+  SET(GLOG_REPOSITORY "https://github.com/google/glog.git")
+  SET(GLOG_TAG "v0.3.5")
+ENDIF()
 ExternalProject_Add(
    extern_glog
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS gflags
-    GIT_REPOSITORY  "https://github.com/google/glog.git"
+    GIT_REPOSITORY  ${GLOG_REPOSITORY}
-    GIT_TAG         v0.3.5
+    GIT_TAG         ${GLOG_TAG}
    PREFIX          ${GLOG_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}

--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
@@ -13,7 +13,7 @@
 # limitations under the License.
 #
-IF(MOBILE_INFERENCE)
+IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE)
    return()
 ENDIF()
@@ -23,6 +23,11 @@ SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc)
 SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
 SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
 SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
+IF(APPLE)
+  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
+ELSE()
+  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin)
+ENDIF()
 ExternalProject_Add(
    extern_grpc
@@ -33,7 +38,11 @@ ExternalProject_Add(
    UPDATE_COMMAND  ""
    CONFIGURE_COMMAND ""
    BUILD_IN_SOURCE 1
-    BUILD_COMMAND   make
+    # NOTE(yuyang18):
+    # Disable -Werror, otherwise the compile will fail in MacOS.
+    # It seems that we cannot configure that by make command.
+    # Just dry run make command and remove `-Werror`, then use a shell to run make commands
+    BUILD_COMMAND  ${BUILD_CMD}
    INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install
 )
@@ -55,4 +64,3 @@ SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION
 include_directories(${GRPC_INCLUDE_DIR})
 ADD_DEPENDENCIES(grpc++_unsecure extern_grpc)
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -15,7 +15,18 @@
 INCLUDE(ExternalProject)
 # Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp
 FIND_PACKAGE(Protobuf QUIET)
-SET(PROTOBUF_FOUND "OFF")
+macro(UNSET_VAR VAR_NAME)
+    UNSET(${VAR_NAME} CACHE)
+    UNSET(${VAR_NAME})
+endmacro()
+UNSET_VAR(PROTOBUF_INCLUDE_DIR)
+UNSET_VAR(PROTOBUF_FOUND)
+UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE)
+UNSET_VAR(PROTOBUF_PROTOC_LIBRARY)
+UNSET_VAR(PROTOBUF_LITE_LIBRARY)
+UNSET_VAR(PROTOBUF_LIBRARY)
+UNSET_VAR(PROTOBUF_INCLUDE_DIR)
+UNSET_VAR(Protobuf_PROTOC_EXECUTABLE)
 if(NOT COMMAND protobuf_generate_python)  # before cmake 3.4, protobuf_genrerate_python is not defined.
    function(protobuf_generate_python SRCS)
@@ -110,7 +121,6 @@ macro(PROMPT_PROTOBUF_LIB)
    # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`.
    # make `protobuf_generate_cpp` happy.
    SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE})
    FOREACH(dep ${protobuf_DEPS})
        ADD_DEPENDENCIES(protobuf ${dep})
        ADD_DEPENDENCIES(protobuf_lite ${dep})
@@ -128,11 +138,11 @@ endmacro()
 set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf")
 if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
-    find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include)
+    find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH)
-    find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib)
+    find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
-    find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib)
+    find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
-    find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib)
+    find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
-    find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin)
+    find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH)
    if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE)
        message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.")
        SET_PROTOBUF_VERSION()
@@ -178,14 +188,26 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
        SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}")
    ENDIF()
+    SET(PROTOBUF_REPO "https://github.com/google/protobuf.git")
+    SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546")
+    IF(MOBILE_INFERENCE)
+        # The reason why the official version is not used is described in
+        # https://github.com/PaddlePaddle/Paddle/issues/6114
+        SET(PROTOBUF_REPO "https://github.com/qingqing01/protobuf.git")
+        SET(PROTOBUF_TAG "v3.2.0")
+        IF(NOT BUILD_FOR_HOST)
+            SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-Dprotobuf_BUILD_PROTOC_BINARIES=OFF")
+        ENDIF()
+    ENDIF()
    ExternalProject_Add(
        ${TARGET_NAME}
        ${EXTERNAL_PROJECT_LOG_ARGS}
        PREFIX          ${PROTOBUF_SOURCES_DIR}
        UPDATE_COMMAND  ""
        DEPENDS         zlib
-        GIT_REPOSITORY  "https://github.com/google/protobuf.git"
+        GIT_REPOSITORY  ${PROTOBUF_REPO}
-        GIT_TAG         "9f75c5aa851cd877fb0d93ccc31b8567a6706546"
+        GIT_TAG         ${PROTOBUF_TAG}
        CONFIGURE_COMMAND
        ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake
            ${OPTIONAL_ARGS}
@@ -203,7 +225,11 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
    )
 ENDFUNCTION()
-SET(PROTOBUF_VERSION 3.1)
+IF(NOT MOBILE_INFERENCE)
+    SET(PROTOBUF_VERSION 3.1)
+ELSE()
+    SET(PROTOBUF_VERSION 3.2)
+ENDIF()
 IF(CMAKE_CROSSCOMPILING)
    build_protobuf(protobuf_host TRUE)
    LIST(APPEND external_project_dependencies protobuf_host)

--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -111,6 +111,8 @@ set(COMMON_FLAGS
    -Wno-error=sign-compare
    -Wno-error=unused-local-typedefs
    -Wno-error=parentheses-equality # Warnings in pybind11
+    -Wno-error=ignored-attributes  # Warnings in Eigen, gcc 6.3
+    -Wno-error=terminate  # Warning in PADDLE_ENFORCE
 )
 set(GPU_COMMON_FLAGS

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -227,8 +227,8 @@ function(cc_test TARGET_NAME)
    set(multiValueArgs SRCS DEPS)
    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_executable(${TARGET_NAME} ${cc_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
+    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
-    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
+    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
    add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
  endif()
 endfunction(cc_test)
@@ -288,8 +288,8 @@ function(nv_test TARGET_NAME)
    set(multiValueArgs SRCS DEPS)
    cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main)
+    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
-    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main)
+    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
    add_test(${TARGET_NAME} ${TARGET_NAME})
  endif()
 endfunction(nv_test)
@@ -505,12 +505,12 @@ function(grpc_library TARGET_NAME)
  set_source_files_properties(
    ${grpc_grpc_srcs}
    PROPERTIES
-    COMPILE_FLAGS  "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+    COMPILE_FLAGS  "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
  cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}")
  set_source_files_properties(
    ${grpc_library_SRCS}
    PROPERTIES
-    COMPILE_FLAGS  "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+    COMPILE_FLAGS  "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
  cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}")
 endfunction()
--- a/doc/api/index_en.rst
+++ b/doc/api/index_en.rst
@@ -7,3 +7,4 @@ API
    v2/model_configs.rst
    v2/data.rst
    v2/run_logic.rst
+    v2/fluid.rst
--- a/doc/api/v2/fluid.rst
+++ b/doc/api/v2/fluid.rst
+======================
+Fluid
+======================
+..  toctree::
+    :maxdepth: 1
+    fluid/layers.rst
+    fluid/data_feeder.rst
+    fluid/executor.rst
+    fluid/initializer.rst
+    fluid/evaluator.rst
+    fluid/nets.rst
+    fluid/optimizer.rst
+    fluid/param_attr.rst
+    fluid/profiler.rst
+    fluid/regularizer.rst
--- a/doc/api/v2/fluid/data_feeder.rst
+++ b/doc/api/v2/fluid/data_feeder.rst
+===========
+DataFeeder
+===========
+DataFeeder
+-----------
+..  automodule:: paddle.v2.fluid.data_feeder
+    :members: DataFeeder
+    :noindex:
--- a/doc/api/v2/fluid/evaluator.rst
+++ b/doc/api/v2/fluid/evaluator.rst
+===========
+Evaluator
+===========
+Evaluator
+-----------
+..  automodule:: paddle.v2.fluid.evaluator
+    :members: Evaluator
+    :noindex:
--- a/doc/api/v2/fluid/executor.rst
+++ b/doc/api/v2/fluid/executor.rst
+===========
+Executor
+===========
+Executor
+-----------
+..  automodule:: paddle.v2.fluid.executor
+    :members: Executor
+    :noindex:
--- a/doc/api/v2/fluid/initializer.rst
+++ b/doc/api/v2/fluid/initializer.rst
+===========
+Initializer
+===========
+Initializer
+-----------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: Initializer
+    :noindex:
+ConstantInitializer
+-------------------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: ConstantInitializer
+    :noindex:
+UniformInitializer
+------------------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: UniformInitializer
+    :noindex:
+NormalInitializer
+-----------------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: NormalInitializer
+    :noindex:
+XavierInitializer
+-----------------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: XavierInitializer
+    :noindex:
+MSRAInitializer
+---------------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: MSRAInitializer
+    :noindex:
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
+==========
+Layers
+==========
+fc
+---
+..  autofunction:: paddle.v2.fluid.layers.fc
+    :noindex:
+embedding
+---------
+..  autofunction:: paddle.v2.fluid.layers.embedding
+    :noindex:
+dynamic_lstm
+------------
+..  autofunction:: paddle.v2.fluid.layers.dynamic_lstm
+    :noindex:
+data
+---------
+..  autofunction:: paddle.v2.fluid.layers.data
+    :noindex:
+mean
+---------
+..  autofunction:: paddle.v2.fluid.layers.mean
+    :noindex:
+mul
+---------
+..  autofunction:: paddle.v2.fluid.layers.mul
+    :noindex:
+elementwise_add
+---------------
+..  autofunction:: paddle.v2.fluid.layers.elementwise_add
+    :noindex:
+elementwise_div
+---------------
+..  autofunction:: paddle.v2.fluid.layers.elementwise_div
+    :noindex:
+dropout
+---------
+..  autofunction:: paddle.v2.fluid.layers.dropout
+    :noindex:
+reshape
+---------
+..  autofunction:: paddle.v2.fluid.layers.reshape
+    :noindex:
+sigmoid
+---------
+..  autofunction:: paddle.v2.fluid.layers.sigmoid
+    :noindex:
+scale
+---------
+..  autofunction:: paddle.v2.fluid.layers.scale
+    :noindex:
+reshape
+---------
+..  autofunction:: paddle.v2.fluid.layers.reshape
+    :noindex:
+transpose
+---------
+..  autofunction:: paddle.v2.fluid.layers.transpose
+    :noindex:
+sigmoid_cross_entropy_with_logits
+---------
+..  autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits
+    :noindex:
+cast
+---------
+..  autofunction:: paddle.v2.fluid.layers.cast
+    :noindex:
+concat
+---------
+..  autofunction:: paddle.v2.fluid.layers.concat
+    :noindex:
+sums
+---------
+..  autofunction:: paddle.v2.fluid.layers.sums
+    :noindex:
+linear_chain_crf
+---------
+..  autofunction:: paddle.v2.fluid.layers.linear_chain_crf
+    :noindex:
+assign
+---------
+..  autofunction:: paddle.v2.fluid.layers.embedding
+    :noindex:
+split_lod_tensor
+---------
+..  autofunction:: paddle.v2.fluid.layers.split_lod_tensor
+    :noindex:
+merge_lod_tensor
+---------
+..  autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
+    :noindex:
+cos_sim
+---------
+..  autofunction:: paddle.v2.fluid.layers.cos_sim
+    :noindex:
+cross_entropy
+---------
+..  autofunction:: paddle.v2.fluid.layers.cross_entropy
+    :noindex:
+square_error_cost
+---------
+..  autofunction:: paddle.v2.fluid.layers.square_error_cost
+    :noindex:
+accuracy
+---------
+..  autofunction:: paddle.v2.fluid.layers.accuracy
+    :noindex:
+sequence_conv
+---------
+..  autofunction:: paddle.v2.fluid.layers.sequence_conv
+    :noindex:
+conv2d
+---------
+..  autofunction:: paddle.v2.fluid.layers.conv2d
+    :noindex:
+sequence_pool
+---------
+..  autofunction:: paddle.v2.fluid.layers.sequence_pool
+    :noindex:
+pool2d
+---------
+..  autofunction:: paddle.v2.fluid.layers.pool2d
+    :noindex:
+batch_norm
+---------
+..  autofunction:: paddle.v2.fluid.layers.batch_norm
+    :noindex:
+beam_search_decode
+---------
+..  autofunction:: paddle.v2.fluid.layers.beam_search_decode
+    :noindex:
+lstm
+---------
+..  autofunction:: paddle.v2.fluid.layers.lstm
+    :noindex:
+lod_rank_table
+---------
+..  autofunction:: paddle.v2.fluid.layers.lod_rank_table
+    :noindex:
+max_sequence_len
+---------
+..  autofunction:: paddle.v2.fluid.layers.max_sequence_len
+    :noindex:
+topk
+---------
+..  autofunction:: paddle.v2.fluid.layers.topk
+    :noindex:
+lod_tensor_to_array
+---------
+..  autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
+    :noindex:
+array_to_lod_tensor
+---------
+..  autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
+    :noindex:
+fill_constant
+---------
+..  autofunction:: paddle.v2.fluid.layers.fill_constant
+    :noindex:
+fill_constant_batch_size_like
+---------
+..  autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
+    :noindex:
+ones
+---------
+..  autofunction:: paddle.v2.fluid.layers.ones
+    :noindex:
+zeros
+---------
+..  autofunction:: paddle.v2.fluid.layers.zeros
+    :noindex:
+increment
+---------
+..  autofunction:: paddle.v2.fluid.layers.increment
+    :noindex:
+array_write
+---------
+..  autofunction:: paddle.v2.fluid.layers.array_write
+    :noindex:
+create_array
+---------
+..  autofunction:: paddle.v2.fluid.layers.create_array
+    :noindex:
+less_than
+---------
+..  autofunction:: paddle.v2.fluid.layers.less_than
+    :noindex:
+array_read
+---------
+..  autofunction:: paddle.v2.fluid.layers.array_read
+    :noindex:
+shrink_memory
+---------
+..  autofunction:: paddle.v2.fluid.layers.shrink_memory
+    :noindex:
+array_length
+---------
+..  autofunction:: paddle.v2.fluid.layers.array_length
+    :noindex:
+conv2d_transpose
+---------
+..  autofunction:: paddle.v2.fluid.layers.conv2d_transpose
+    :noindex:
--- a/doc/api/v2/fluid/nets.rst
+++ b/doc/api/v2/fluid/nets.rst
+===========
+Nets
+===========
+simple_img_conv_pool
+-----------
+..  autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
+    :noindex:
+img_conv_group
+-----------
+..  autofunction:: paddle.v2.fluid.nets.img_conv_group
+    :noindex:
+sequence_conv_pool
+-----------
+..  autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
+    :noindex:
--- a/doc/api/v2/fluid/optimizer.rst
+++ b/doc/api/v2/fluid/optimizer.rst
+===========
+Optimizer
+===========
+Optimizer
+-----------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: Optimizer
+    :noindex:
+SGDOptimizer
+-----------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: SGDOptimizer
+    :noindex:
+MomentumOptimizer
+-----------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: MomentumOptimizer
+    :noindex:
+AdagradOptimizer
+-----------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: AdagradOptimizer
+    :noindex:
+AdamOptimizer
+-----------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: AdamOptimizer
+    :noindex:
+AdamaxOptimizer
+-----------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: AdamaxOptimizer
+    :noindex:
+DecayedAdagradOptimizer
+-----------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: DecayedAdagradOptimizer
+    :noindex:
--- a/doc/api/v2/fluid/param_attr.rst
+++ b/doc/api/v2/fluid/param_attr.rst
+===========
+ParamAttr
+===========
+ParamAttr
+-----------
+..  automodule:: paddle.v2.fluid.param_attr
+    :members: ParamAttr
+    :noindex:
--- a/doc/api/v2/fluid/profiler.rst
+++ b/doc/api/v2/fluid/profiler.rst
+===========
+Profiler
+===========
+Profiler
+-----------
+..  autofunction:: paddle.v2.fluid.profiler.cuda_profiler
+    :noindex:
--- a/doc/api/v2/fluid/regularizer.rst
+++ b/doc/api/v2/fluid/regularizer.rst
+===========
+Regularizer
+===========
+WeightDecayRegularizer
+-----------
+..  automodule:: paddle.v2.fluid.regularizer
+    :members: WeightDecayRegularizer
+    :noindex:
+L2DecayRegularizer
+-----------
+..  automodule:: paddle.v2.fluid.regularizer
+    :members: L2DecayRegularizer
+    :noindex:
+L1DecayRegularizer
+-----------
+..  automodule:: paddle.v2.fluid.regularizer
+    :members: L1DecayRegularizer
--- a/doc/design/evaluator.md
+++ b/doc/design/evaluator.md
 ## Evaluator Design
-### The Problem
+### Problem Statement
-During training or serving, we provide the evaluation function to measure the model performance, e.g., accuracy, precision. In the operator based framework design, the data go through the network pipeline batch by batch. As a result, inside the operator, we only can calculate one minibatch metrics. We need to provide a mechanism to calculate the metrics for each N pass/batch the user wanted.
+During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy, precision, etc. In the operator based framework design, the data passes through the network pipeline batch by batch. As a result, inside the operator, we only calculate the metrics for one minibatch. Thus, we need to provide a mechanism to calculate the metrics for each N pass/batch the user wants.
 ### Evaluator Design
-Currently, every operation is expressed in the graph. we divide the evaluator process into three steps.
+Currently, every operation is expressed in the graph. We divide the evaluator process into three steps.
 1. Initialize the metric state and add it into the block.
-2. Calculate the statistic of the metric state in every mini-batch. The single operator is only responsible for calculating necessary statistics for one mini-batch. For example, accuracy operator only calculate a minibatch data if run once.
+2. Calculate the concerned metrics for every mini-batch. The single evaluator operator is only responsible for calculating the necessary statistics for one mini-batch. For example, the accuracy operator only calculates the accuracy for a minibatch data if run once.
 3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the value from different devices.
 ### Implementation
-This design is shown in python API. 
+This design is shown in the Python API. 
-Each metric operator need to caculate the metric statistic and return the batch aware states, Python side responsible for accumulate the states for each pass. 
+Each metric operator needs to caculate the metric statistic and return the batch-aware states. Python side is responsible for accumulating the states for each pass. 
 ```python

--- a/doc/design/float16.md
+++ b/doc/design/float16.md
@@ -28,6 +28,51 @@ The goal of float16 is to serve as a key for the executor to find and run the co
 - [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors.
 - [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU).
+### CUDA version issue
+There are currently three versions of CUDA that supports `__half` data type, namely, CUDA 7.5, 8.0, and 9.0. 
+CUDA 7.5 and 8.0 define `__half` as a simple struct that has a `uint16_t` data (see [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/9212ab5a3ddbe48f30ef373f9c1fb546804c7a8c/include/isaac/external/CUDA/cuda_fp16.h)) as follows:
+```
+typedef struct __align__(2) {
+   unsigned short x;
+} __half;
+typedef __half half;
+```
+This struct does not define any overloaded arithmetic operators. So you have to directly use `__hadd` instead of `+` to correctly add two half types:
+```
+__global__ void Add() {
+  half a, b, c;
+  c = __hadd(a, b); // correct
+  c = a + b; // compiler error: no operator "+" matches these operands
+}
+```
+CUDA 9.0 provides a major update to the half data type. The related code can be found in the updated [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.h) and the newly added [`cuda_fp16.hpp`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.hpp).
+Essentially, CUDA 9.0 renames the original `__half` type in 7.5 and 8.0 as `__half_raw`, and defines a new `__half` class type that has constructors, conversion operators, and also provides overloaded arithmetic operators such as follows:
+```
+typedef struct __CUDA_ALIGN__(2) {
+    unsigned short x;
+} __half_raw;
+struct __CUDA_ALIGN__(2) __half {
+protected:
+    unsigned short __x;
+public:
+    // constructors and conversion operators from/to 
+    // __half_raw and other built-in data types
+}
+typedef __half half;
+__device__ __forceinline__ 
+__half operator+(const __half &lh, const __half &rh) { 
+    return __hadd(lh, rh); 
+}
+// Other overloaded operators
+``` 
+This new design makes `c = a + b` work correctly for CUDA half data type. 
 ## Implementation
 The float16 class holds a 16-bit `uint16_t` data internally.

--- a/doc/design/mkldnn/README.MD
+++ b/doc/design/mkldnn/README.MD
 # Intel® MKL-DNN on PaddlePaddle: Design Doc
-我们计划将Intel深度神经网络数学库(**MKL-DNN**\[[1](#references)\])集成到PaddlePaddle，充分展现英特尔平台的优势，有效提升PaddlePaddle在英特尔架构上的性能。
+我们计划将英特尔深度神经网络数学库[Intel MKL-DNN](https://github.com/01org/mkl-dnn)
+(Intel Math Kernel Library for Deep Neural Networks)集成到PaddlePaddle，
+充分展现英特尔平台的优势，有效提升PaddlePaddle在英特尔架构上的性能。
-我们短期内的基本目标是：
+<div align="center">
+<img src="image/overview.png"><br/>
+Figure 1. PaddlePaddle on IA
+</div>
+近期目标
- 完成常用layer的MKL-DNN实现。
+- 完成常用Layer的MKL-DNN实现。
 - 完成常见深度神经网络VGG，GoogLeNet 和 ResNet的MKL-DNN实现。
+目前的优化，主要针对PaddlePaddle在重构之前的代码框架以及V1的API。
+具体的完成状态可以参见[这里](https://github.com/PaddlePaddle/Paddle/projects/21)。
 ## Contents
 - [Overview](#overview)
 - [Actions](#actions)
 	- [CMake](#cmake)
+ 	- [Matrix](#matrix)
 	- [Layers](#layers)
 	- [Activations](#activations)
-	- [Weights](#weights)
+	- [Parameters](#parameters)
+	- [Gradients](#gradients)
 	- [Unit Tests](#unit-tests)
-	- [Protobuf Messages](#protobuf-messages)
 	- [Python API](#python-api)
-	- [Demos](#demos)
 	- [Benchmarking](#benchmarking)
 	- [Others](#others)
 - [Design Concerns](#design-concerns)
 ## Overview
-我们会把MKL-DNN作为第三方库集成进PaddlePaddle，整体框架图
+我们会把MKL-DNN会作为第三方库集成进PaddlePaddle，与其他第三方库一样，会在编译PaddlePaddle的时候下载并编译MKL-DNN。
+同时，为了进一步提升PaddlePaddle在基本数学运算的计算速度，我们也将MKLML即(MKL small library\[[1](#references)\])
+作为另一个第三方库集成进PaddlePaddle，它只会包括生成好的动态库和头文件。
+MKL，MKLML以及MKL-DNN三者关系如下表：
+| Name        |  Open Source     | License     | Descriptions  |
+| :---------- | :--------------- | :---------- | :------------ |
+|   MKL       |     No           | Proprietary | Accelerate math processing routines | 
+|   MKLML     |     No           | Proprietary | Small package of MKL, especially for Machine Learning |
+|   MKL-DNN   |     Yes          | Apache 2.0  | Accelerate primitives processing routines especially for Deep Neural Networks  |
+MKLML可以与MKL-DNN共同使用，以此达到最好的性能。
 <div align="center">
-<img src="image/overview.png" width=350><br/>
+<img src="image/engine.png"><br/>
-Figure 1. PaddlePaddle on IA.
+Figure 2. PaddlePaddle with MKL Engines
 </div>
 ## Actions
-我们把集成方案大致分为了如下几个方面。
+添加的相关文件和目录结构如下：
+```txt
+PaddlePaddle/Paddle
+├── ...
+├── cmake/
+│   ├── external/
+│   │   ├── ...
+│   │   ├── mkldnn.cmake
+│   │   └── mklml.cmake
+└── paddle/
+    ├── ...
+    ├── math/
+    │   ├── ...
+    │   └── MKLDNNMatrix.*
+    └── gserver/
+        ├── ...
+        ├── layers/
+        │   ├── ...
+        │   └── MKLDNN*Layer.*
+        ├── activations/
+        │   ├── ...
+        │   └── MKLDNNActivations.*
+        └── tests/
+            ├── ...
+            ├── MKLDNNTester.*
+            └── test_MKLDNN.cpp
+```
 ### CMake
-我们会在`CMakeLists.txt`中会给用户添加一个`WITH_MKL`的开关，他是负责`WITH_MKLML`和`WITH_MKLDNN`的总开关。
+在`CMakeLists.txt`中提供一个与MKL有关的总开关：`WITH_MKL`，它负责决定编译时是否使用MKLML和MKL-DNN
-当打开`WITH_MKL`时，会开启MKLML的功能，作为PaddlePaddle的CBLAS和LAPACK库，同时会开启Intel OpenMP用于提高MKLML的性能。 如果系统支持AVX2指令集及以上，同时会开启MKL-DNN功能。
+- `WITH_MKLML` 控制是否使用MKLML库。 
+当打开`WITH_MKL`时，会自动使用MKLML库作为PaddlePaddle的CBLAS和LAPACK库，同时会开启Intel OpenMP用于提高MKLML的性能。
+编译时会把对应的头文件和库放在`build/third_party/install/mklml/*`目录下对应的地方。
+MKLML的库目前都是动态库，主要包括`libiomp5.so`和`libmklml_intel.so`。
+- `WITH_MKLDNN` 控制是否使用MKL-DNN。
+当开启`WITH_MKL`时，会自动根据硬件配置[[2](#references)]选择是否编译MKL-DNN。
+编译时会把对应的头文件和库放在`build/third_party/install/mkldnn/*`目录下对应的地方。
+MKL-DNN的库目前只有动态库`libmkldnn.so`。
-当关闭`WITH_MKL`时，MKLML和MKL-DNN功能会同时关闭。
+### Matrix
+目前在PaddlePaddle中数据都是以`NCHW`的格式存储，但是在MKL-DNN中的排列方式不止这一种。
+所以我们定义了一个`MKLDNNMatrix`用于管理MKL-DNN数据的不同格式以及相互之间的转换。
-所以，我们会在`cmake/external`目录新建`mkldnn.cmake`和`mklml.cmake`文件，它们会在编译PaddlePaddle的时候下载对应的软件包，并放到PaddlePaddle的third party目录中。
+<div align="center">
+<img src="image/matrix.png"><br/>
+Figure 3. MKLDNNMatrix
+</div>
 ### Layers
-所有MKL-DNN相关的C++ layers，都会按照PaddlePaddle的目录结构存放在
+所有MKL-DNN的Layers都会继承于`MKLDNNLayer`，该类继承于PaddlePaddle的基类`Layer`。
-`paddle/gserver/layers`中，并且文件名都会一以*MKLDNN*开头。
+在`MKLDNNLayer`中会提供一些必要的接口和函数，并且会写好`forward`和`backward`的基本逻辑，
+子类只需要使用定义好的接口，实现具体的函数功能即可。
+<div align="center">
+<img src="image/layers.png"><br/>
+Figure 4. MKLDNNLayer
+</div>
+每个MKLDNNLayer都包含用于内部存储和外部存储的一系列MKLDNNMatrix：
-所有MKL-DNN的layers都会继承于一个叫做`MKLDNNLayer`的父类，该父类继承于PaddlePaddle的基类`Layer`。
+- 内部存储（internel memory）：`inVal_`,`inGrad_`,`outVal_`和`outGrad_`，分别代表输入数据，输入梯度，输出数据和输出梯度。
+- 外部存储（external memory）：都是以ext开头，比如`extInVal_`和`extInGrad_`，它们主要是用于，
+当数据格式与PaddlePaddle默认的`NCHW`格式不匹配时，转换内存的工作。
+需要注意的是，PaddlePaddle的activation会直接使用`output_.value`和`output_.grad`，
+所以`extOutVal_`和`extOutGrad_`必须分别与`output_.value`和`output_.grad`共享内存，
+如果不需要外部存储用于转换，那么对应的内部存储也会与它们共享内存。
+- 转换函数（resetXXX）： 包括`resetInValue`，`resetInGrad`，`resetOutValue`和`resetOutGrad`，
+表示对输入数据，输入梯度，输出数据和输出梯度的转换。
+这些函数会根据输入参数重新设置内部和外部存储，当然这两者也可以相等，即表示不需要转换。
-在`MKLDNNLayer`中会提供一些必要的接口和函数，并且会写好`forward`和`backward`的基本逻辑。部分函数定义为纯虚函数，子类只需要实现这些函数即可。
+注意：每个`MKLDNNlayer`的子类只需要使用内部存储就可以了，所有外部的转换工作都会在reset系列函数中都准备好。
 ### Activations
-由于在PaddlePaddle中，激活函数是独立于layer概念的，所以会在`paddle/gserver/activations`目录下添加`MKLDNNActivation.h`和`MKLDNNActivation.cpp`文件用于定义和使用MKL-DNN的接口。
+在重构前的PaddlePaddle中，激活函数是独立于`Layer`的概念，并且输入输出都是共用一块内存，
+所以添加了对应的`MKLDNNActivation`来实现，方式类似于`MKLDNNLayer`。
+### Parameters
+对于有参数的层，我们会保证`MKLDNNLayer`使用的参数与PaddlePaddle申请的buffer共用一块内存。
+如果存在数据排列格式不一样的情况时，我们会在网络训练之前把格式转换为MKL-DNN希望的格式，
+在训练结束的时候再保存为PaddlePaddle的格式，但是整个训练过程中不需要任何转换。
+这样既使得最终保存的参数格式与PaddlePaddle一致，又可以避免不必要的转换。
+### Gradients
+由于MKL-DNN的操作都是直接覆盖的形式，也就是说输出的结果不会在原来的数据上累加，
+这样带来的好处就是不需要一直清空memory，节省了不必要的操作。
+但是注意的是，当网络出现分支且在`backward`的时候，需要累加不同Layer传过来的梯度。
+所以在`MKLDNNlayer`中实现了一个merge的方法，此时每个小分支的`Input Gradient`
+会先临时保存在`MKLDNNMatrix`中，由分支处的Layer负责求和，并把结果放到当前层的`output_.grad`中。
+所以整体上，在实现每个子类的时候就不需要关心分支的事情了。
-### Weights
+<div align="center">
-由于有些layer是含有参数的，我们会尽量让MKL-DNN的参数与PaddlePaddle中`parameter`共享一块内存。
+<img src="image/gradients.png"><br/>
-同时，由于MKL-DNN在训练时使用的参数layout可能与PaddlePaddle默认的`nchw`不一致，我们会在网络训练的开始和结束时分别转换这个layout，使得最终保存的参数格式与PaddlePaddle一致。
+Figure 5. Merge Gradients
+</div>
 ### Unit Tests
-会在`paddle/gserver/test`目录下添加`test_MKLDNN.cpp`和`MKLDNNTester.*`用于MKL-DNN的测试。
+我们会添加`test_MKLDNN.cpp`和`MKLDNNTester.*`用于MKL-DNN的测试。
-测试分为每个layer(或activation)的单元测试和简单网络的整体测试。
+测试分为每个Layer（或Activation）的单元测试和简单网络的整体测试。
 每个测试会对比PaddlePaddle中CPU算出的结果与MKL-DNN的结果，小于某个比较小的阈值认为通过。
-### Protobuf Messages
-根据具体layer的需求可能会在`proto/ModelConfig.proto`里面添加必要的选项。
 ### Python API
 目前只考虑**v1 API**。
@@ -80,41 +172,40 @@ if use_mkldnn
    self.layer_type = mkldnn_*
 ```
-所有MKL-DNN的layer type会以*mkldnn_*开头，以示区分。 
+所有MKL-DNN的`layer_type`会以*mkldnn_*开头，这些会在`MKLDNN*Layer`注册layer的时候保证，以示区分。 
-并且可能在`python/paddle/trainer_config_helper`目录下的`activations.py `和`layers.py`里面添加必要的MKL-DNN的接口。
-### Demos
+同时,会在`paddle/utils.Flags`中添加一个`use_mkldnn`的flag，用于选择是否使用MKL-DNN的相关功能。
-会在`v1_api_demo`目录下添加一个`mkldnn`的文件夹，里面放入一些用于MKL-DNN测试的demo脚本。
 ### Benchmarking
-会添加`benchmark/paddle/image/run_mkldnn.sh`，用于测试使用MKL-DNN之后的性能。
+会添加相应的脚本在[这里](https://github.com/PaddlePaddle/Paddle/tree/develop/benchmark/paddle/image)，用于测试和对比在使用MKL-DNN前后的CNN网络性能。
+测试的性能对比结果会在[IntelOptimizedPaddle.md](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md)
 ### Others
-1. 如果在使用MKL-DNN的情况下，会把CPU的Buffer对齐为64。
+1. 如果在使用MKL-DNN的情况下，会把CPU的Buffer对齐为4096，具体可以参考MKL-DNN中的[memory](https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp#L673)。
 2. 深入PaddlePaddle，寻找有没有其他可以优化的可能，进一步优化。比如可能会用OpenMP改进SGD的更新性能。
 ## Design Concerns
-为了更好的符合PaddlePaddle的代码风格\[[2](#references)\]，同时又尽可能少的牺牲MKL-DNN的性能\[[3](#references)\]。
+为了更好的符合PaddlePaddle的代码风格\[[3](#references)\]，同时又尽可能少的牺牲MKL-DNN的性能\[[4](#references)\]。
 我们总结出一些特别需要注意的点：
-1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数，我们决定使用已有的`deviceId_`变量来区分layer的属性，定义`-2`为`MKLDNNLayer`特有的设备ID。
+1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数，
+我们决定使用已有的`deviceId_`变量来区分layer的属性，定义`-2`为`MKLDNNLayer`特有的设备ID。
 2. 重写父类Layer的**init**函数，修改`deviceId_`为`-2`，代表这个layer是用于跑在MKL-DNN的环境下。
-3. 创建`MKLDNNMatrix`，同时继承`CpuMatrix`和`mkldnn::memory`。用于管理MKL-DNN会用到的相关memory函数、接口以及会用的到格式信息。
+3. 创建`MKLDNNBase`，定义一些除了layer和memory相关的类和函数。
-4. 创建`MKLDNNBase`，定义一些除了layer和memory相关的类和函数。包括MKL-DNN会用到`MKLDNNStream`和`CPUEngine`，和未来可能还会用到`FPGAEngine`等。
+包括MKL-DNN会用到`MKLDNNStream`和`CPUEngine`，和未来可能还会用到`FPGAEngine`等。
-5. 每个`MKLDNNlayer`都会有`inVal_`,`inGrad_`,`outVal_`和`outGrad_`，分别代表input value， input gradient，output value和output gradient。他们会存放MKL-DNN用到的internal memory。同时还会定义以*ext*开头的`MKLDNNMatrix`(表示external的memory)，主要是在格式与PaddlePaddle默认的`nchw`格式不匹配时，用于转换内存的工作。必要的转换函数也会在`MKLDNNLayer`中提前定义好，每个子类只需要调用定义好的reset buffer函数即可。
+4. 如果MKL-DNN layer的后面接有cpu device，那么就会使`output_.value`与`extOutVal_`共享内存，
-6. 每个`MKLDNNlayer`的resetbuffer相关的函数（包括reset input、output的Value和grad），他们会根据输入参数reset internal和external的memory，当然这两者也可以相等，即表示不需要转换。只需要把握一个原则，每个`MKLDNNlayer`的子类，只需要使用internal的memory就可以了，所有external的转换工作在父类的reset函数中都提前准备好了。
+同时数据格式就是`NCHW`，这样下一个cpu device就能拿到正确的数据。
-7. 一般来说，external的memory会尽量与PaddlePaddle中的`value`和`grad`共享内存。同时每个`MKLDNNLayer`中的external output value和gradient(也就是`extOutVal_`和`extOutGrad_`)必须分别与`output_.value`和`output_.grad`共享内存，因为PaddlePaddle的activation会直接使用`output_.value`和`output_.grad`。如果不需要external的buffer用于转换，那么internal的buffer也会与他们共享内存。
+在有普通的CPU layer时， `extOutVal_`和`extOutGrad_`的格式始终是`NCHW`或者`NC`。
-8. 如果MKL-DNN layer的后面接有cpu device，那么就会使`output_.value`与`extOutVal_`共享内存，同时数据格式就是`nchw`，这样下一个cpu device就能拿到正确的数据。在有cpu device的时候，external的memory的格式始终是`nchw`或者`nc`。
-9. 由于MKL-DNN的输出操作都是覆盖data的，不是在原来的数据上累加，所以当网络出现分支时，在`backward`时会需要merge不同layer的梯度。`MKLDNNlayer`中会实现merge的方法，此时每个小分支的input gradient会先临时保存在一个`MKLDNNMatrix`中，由分支处的layer负责求和，并把结果放到这个layer的`output_.grad`中。所以整体上，每个子类并不会需要关心分支的事情，也是在父类都实现好了。
-10. 在原来的`FLAGS`中添加一个`use_mkldnn`的flag，用于选择是否使用MKL-DNN的相关功能。
 ## References
+1. [MKL small library](https://github.com/01org/mkl-dnn#linking-your-application)是[Intel MKL](https://software.intel.com/en-us/mkl)的一个子集。
-1. [Intel Math Kernel Library for Deep Neural Networks (Intel MKL-DNN)](https://github.com/01org/mkl-dnn "Intel MKL-DNN")
+主要包括了深度学习相关的数学原语与操作，一般由MKL-DNN在发布[新版本](https://github.com/01org/mkl-dnn/releases)时一起更新。
-2. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。但是在PaddlePaddle中，无论是重构前的layer还是重构后的op，都不会想要知道next layer/op的信息。
+2. [MKL-DNN System Requirements](https://github.com/01org/mkl-dnn#system-requirements)。
-3. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的CUDNN部分使用的也是`NCHW`，所以不存在这个问题)，所以需要引入一个转换方法，并且只需要在必要的时候转换这种格式，才能更好的发挥MKL-DNN的性能。
+目前在PaddlePaddle中，仅会在支持AVX2指令集及以上的机器才使用MKL-DNN。
+3. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。
+但是在PaddlePaddle中，无论是重构前的layer还是重构后的op，都不会想要知道next layer/op的信息。
+4. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的cuDNN部分使用的也是`NCHW`，所以不存在这个问题)。
+所以需要引入一个转换方法，并且只需要在必要的时候转换这种格式，才能更好的发挥MKL-DNN的性能。
--- a/doc/design/mkldnn/image/engine.png
+++ b/doc/design/mkldnn/image/engine.png
--- a/doc/design/mkldnn/image/gradients.png
+++ b/doc/design/mkldnn/image/gradients.png
--- a/doc/design/mkldnn/image/layers.png
+++ b/doc/design/mkldnn/image/layers.png
--- a/doc/design/mkldnn/image/matrix.png
+++ b/doc/design/mkldnn/image/matrix.png
--- a/doc/design/mkldnn/image/overview.png
+++ b/doc/design/mkldnn/image/overview.png
--- a/doc/design/refactor/distributed_architecture.md
+++ b/doc/design/refactor/distributed_architecture.md
@@ -2,106 +2,70 @@
 ## Abstract
-PaddlePaddle v0.10.0 uses the "trainer-parameter server"
+PaddlePaddle version 0.10.0 uses the "trainer-parameter server" architecture. We run multiple instances of trainers (where each trainer runs the same model) and parameter servers for distributed training. This architecture serves well, but has few limitations:
-architecture. We run multiple replicated instances of trainers (runs
-the same code written by the user) and parameter servers for
-distributed training. This architecture served us well, but has some
-limitations:
-1. Need to write special code to handle tasks which should only be run
+1. There is a need to write special code that handles tasks which should only be run on a single trainer. E.g., initializing the model, saving the model etc.
-  by a single trainer. E.g., initializing model and saving model.
-2. Model parallelism is hard: need to write if-else branches conditioned
+2. Model parallelism is hard: It would need all the if-else branches conditioned on the trainer ID to partition the model onto the trainers, and eventually manually writing out the inter-model-shard communication code to communicate between different trainers.
-  on the trainer ID to partition model onto each trainer, and manually
-  write the inter-model-shard communication code.
-3. The user can not directly specify the parameter update rule: need
+3. The user can not directly specify the parameter update rule: This would need to modify the parameter server code and compile a new binary. This makes things more complicated for researchers: A lot of extra effort is required to make this work. Besides, the training job submission program may not allow running arbitrary binaries.
-   to modify the parameter server C++ code and compile a new
-   binary. This adds complication for researchers: A lot of extra
-   effort is required. Besides, the training job submission program
-   may not allow running arbitrary binaries.
-This design doc discusses PaddlePaddle's new distributed training
+This design doc discusses PaddlePaddle's new distributed training architecture that addresses the above mentioned limitations.
-architecture that addresses the above limitations.
 ## Analysis
-We will assume the user writes the trainer program by Python, the same
+The assumption is that the user writes the trainer program in either Python or C++.
-analysis holds if the trainer program is written in C++.
 ### Limitation 1
-If we look at the Python code that the user writes, there are two
+There are two basic functionalities in the trainer program:
-kinds of functionalities:
- The training logic such as load / save model and print log.
+1. The training logic such as loading / saving the model and printing out the logs.
- The neural network definition such as the definition of the data
+2. The neural network definition such as the definition of the data layer, the fully connected layer, the cost function and the
-  layer, the fully connected layer, the cost function and the
  optimizer.
-When we training with PaddlePaddle v0.10.0 distributedly, multiple
+When we train using PaddlePaddle v0.10.0 in a distributed fashion, multiple instances of the same Python code are run on different nodes, hence both: the
-replicated Python instances are running on different nodes: both the
+training logic as well as the neural network computation logic, is replicated.
-training logic and the neural network computation is replicated.
-The tasks that should only run once all belong to the training logic,
+The tasks that only need to be run once belong to the training logic. Hence if we only replicate the neural network computation part, and do **not**
-if we only replicate the neural network computation, but do **not**
+replicate the training logic, the limitation mentioned above can be avoided.
-replicate the training logic, the limitation could be solved.
 ### Limitation 2
-Model parallelism means running a single model on multiple nodes by
+Model parallelism means that a single model is partitioned into different components and each node runs one of the component separately. This comes at the extra cost of managing the
-partitioning the model onto different nodes and managing the
+inter-model-shard communication between nodes.
-inter-model-shard communications.
-PaddlePaddle should be able to modify the nerual network computation
+PaddlePaddle should ideally be able to modify the neural network computation and figure out the support for model parallelism automatically. However, the
-definition to support model parallelism automatically. However, the
+computation is only specified in Python code which sits outside of PaddlePaddle, hence PaddlePaddle can not support the feature in this setup.
-computation is only specified in Python code, and PaddlePaddle can not
-modify Python code.
-Just like compiler uses a intermediate representation (IR) so that
+Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows:
-programmer does not need to manually optimize their code in most of
-the cases - the compiler will optimize the IR:
 <img src="src/compiler.png"/>
-We can have our own IR too: PaddlePaddle can support model parallel by
+PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component:
-converting the IR so the user no longer need to manually do it in
-Python:
 <img src="src/paddle-compile.png"/>
-The IR for PaddlePaddle after refactor is called `Block`, it specifies
+The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the computation dependency graph and the variables used in the computation.
-the computation dependency graph and the variables used in the
-computation.
 ### Limitation 3
-The user can not directly specify the parameter update rule for the
+The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly.
-parameter server because the parameter server does not use the same
-computation definition as the trainer. Instead, the update rule is
-baked in the parameter server. The user can not specify the update
-rule in the same way of specifying the trainer computation.
-This could be fixed by making the parameter server run the same
+This could be fixed by making the parameter server run the same computation definition as the trainer (the user's Python module). For a detailed explanation, refer to this document -
-computation definition as the trainer. For a detailed explanation,
-please
-see
 [Design Doc: Operation Graph Based Parameter Server](./parameter_server.md)
 ## Distributed Training Architecture
-The new distributed training architecture can address the above
+The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so:
-limitations. Below is the illustration:
 <img src="src/distributed_architecture.png"/>
-The architecture includes major components: *PaddlePaddle Python*,
+The major components in the architecture are: *PaddlePaddle Python*, *PaddlePaddle converter* and *PaddlePaddle runtime*.
-*PaddlePaddle converter* and *PaddlePaddle runtime*:
 ### PaddlePaddle Python
-PaddlePaddle Python is the Python library that user's Python trainer
+PaddlePaddle Python is the Python library that user's Python code invokes, to read the data. build the neural network topology, start training, etc.
-invoke to build the neural network topology, start training, etc.
 ```Python
 paddle.init()
@@ -117,102 +81,60 @@ for i in range(1000):
 	print cost_val
 ```
-The code above is a typical Python trainer code, the neural network
+The above code is what a typical Python trainer code is, the neural network topology is built using the helper functions such as `paddle.layer.fc`. Training is done by calling `session.eval` iteratively.
-topology is built using helper functions such as
-`paddle.layer.fc`. The training is done by calling `session.eval`
-iteratively.
 #### session.eval
-As shown in the graph, `session.eval` sends the IR and the evaluation
+As shown in the graph, `session.eval` sends the IR and the evaluation inputs or targets to the PaddlePaddle cluster for evaluation.
-inputs/targets to the PaddlePaddle cluster for evaluation. The
+The targets can be any variable in the computation graph. When the target is say, the `optimizer` variable, the neural network will be optimized once. When the target is the `cost` variable, `session.eval` returns the cost value. Based on what the target is, an appropriate action is taken.
-targets can be any variable in the computation graph. When the target
-is the `optimizer` variable, the neural network will be optimized
-once. When the target is the `cost` variable, `session.eval` returns
-the cost value.
-The Python `session` is a wrapper of the C++ `Session` class. For more
+The Python `session` is a wrapper of the C++ `Session` class. For more information about `Session`, refer to this document - [Design Doc: Session](./session.md).
-information about `Session`, please
-see [Design Doc: Session](./session.md).
 ### PaddlePaddle Converter
-PaddlePaddle converter automatically converts the IR in the request
+The PaddlePaddle converter automatically converts the IR in the request (IR and evaluation inputs/targets) from PaddlePaddle Python to partitioned IRs and dispatches the new IRs and evaluation inputs/targets to different PaddlePaddle runtimes. Below are the steps that are followed :
-(IR and evaluation inputs/targets) from PaddlePaddle Python to new
-partitioned IRs and dispatch the new IRs and evaluation inputs/targets
-to different PaddlePaddle runtimes. Below are the steps:
-1. Add `feed` OP that feeds the eval inputs, and `fetch` OP that
+1. Add a `feed` OP that feeds the eval inputs, and a `fetch` OP that fetches the eval targets to the IR.
-   fetches the eval targets to the IR.
-1. Extract a new computation (sub)graph with `feed` and `fetch` OP as
+2. Extract a new computation (sub)graph with the `feed` and `fetch` OPs as the boundary. The runtime does not need to run the OP that is not dependent on the `fetch` OP.
-   the boundary. The runtime does not need to run the OP that is not
-   dependent by the `fetch` OP.
-1. Optimizes the computation graph.
+3. Optimize the computation graph.
-1. Place the OPs in the graph onto different devices on different
+4. Place the OPs in the graph onto different devices on different PaddlePaddle runtime according to a placement algorithm and the device constraints specified by the user.
-   PaddlePaddle runtime according to a placement algorithm and device
-   constraint specified by the user.
-1. Partition the graph according to runtime boundaries and add `send` /
+5. Partition the graph according to runtime boundaries and add `send` / `recv` OP pair on the runtime boundaries.
-   `recv` OP pair on the runtime boundaries.
-1. Dispatch the partitioned graph to different PaddlePaddle runtimes.
+6. Dispatch the partitioned graph to different PaddlePaddle runtimes.
-1. PaddlePaddle runtimes with the `fetch` OP reports evaluation
+7. PaddlePaddle runtimes with the `fetch` OP reports evaluation results back to the converter, the converter reports the evaluation results back to the PaddlePaddle Python.
-   results back to the converter, the convert reports the evaluation
-   results back to the PaddlePaddle Python.
 The output IRs will be cached to optimize the conversion latency.
 #### Placement Algorithm
-Our first implementation will only support "trainer-parameter server"
+Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible.
-placement: the parameters, initializers, and optimizers are placed on
-the PaddlePaddle runtimes with the parameter server role. And
-everything else will be placed on the PaddlePaddle runtimes with the
-trainer role. This has the same functionality of our
-"trainer-parameter server" architecture of PaddlePaddle v0.10.0, but
-is more general and flexible.
-In the future, we will implement the general placement algorithm,
+In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm.
-which makes placements according to the input IR, and a model of
-device computation time and device communication time. Model
-parallelism requires the general placement algorithm.
 ### PaddlePaddle Runtime
-The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and
+The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and runs the IR. The runtime does not need to do OP placement since it is already done by the converter.
-runs the IR. The runtime does not need to do OP placement since it's
-already done by the converter.
 ### Local Training Architecture
-The local training architecture will be the same as the distributed
+The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
-training architecture, the differences are everything runs locally,
-and there is just one PaddlePaddle runtime:
 <img src="src/local_architecture.png"/>
 ### Training Data
-In PaddlePaddle v0.10.0, training data is typically read
+In PaddlePaddle v0.10.0, training data is typically read with a [data reader](../reader/README.md) from Python. This approach is no longer efficient when training in a distributed fashion since the Python process no longer runs on the same node with the trainer processes. The Python reader will need to read from the distributed filesystem (assuming it has the required access) and send to the trainers, doubling the network traffic.
-with [data reader](../reader/README.md) from Python. This approach is
-no longer efficient when training distributedly since the Python
+When doing distributed training, the user can still use Python data reader: the training data are sent with `session.eval`. However this should be used for debugging purpose only. The users are encouraged to use the read data OPs.
-process no longer runs on the same node with the trainer processes,
-the Python reader will need to read from the distributed filesystem
-(assuming it has the access) and send to the trainers, doubling the
-network traffic.
-When doing distributed training, the user can still use Python data
-reader: the training data are sent with `session.eval`. However should
-be used for debugging purpose only. The users are encouraged to use
-the read data OPs.
 ## References:

--- a/doc/getstarted/build_and_install/build_from_source_cn.rst
+++ b/doc/getstarted/build_and_install/build_from_source_cn.rst
-从源码编译PaddlePaddle
+从源码编译
 ======================
 .. _build_step:
@@ -7,8 +7,11 @@
 ----------------
 PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译工具。
-我们推荐您使用PaddlePaddle编译环境镜像完成编译，这样可以免去单独安装编译依赖的步骤，可选的不同编译环境
+我们推荐您使用PaddlePaddle Docker编译环境镜像完成编译，这样可以免去单独安装编译依赖的步骤，可选的不同编译环境Docker镜像
 可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到。
+如果您选择不使用Docker镜像，则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。
 编译PaddlePaddle，需要执行：
 .. code-block:: bash
@@ -23,7 +26,6 @@ PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译
   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
   make
 编译完成后会在build/python/dist目录下生成输出的whl包，可以选在在当前机器安装也可以拷贝到目标机器安装：
 .. code-block:: bash
@@ -31,7 +33,33 @@ PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译
   pip install python/dist/*.whl
-.. _build_step:
+.. _run_test:
+执行单元测试
+----------------
+如果您期望在编译完成后立即执行所有的单元测试，可以按照下面的方法：
+使用Docker的情况下，设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后，立即执行单元测试。
+开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。
+.. code-block:: bash
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+如果不使用Docker，可以执行ctest命令即可：
+.. code-block:: bash
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
+   make
+   ctest
+   # 指定执行其中一个单元测试 test_mul_op
+   ctest -R test_mul_op
+.. _compile_deps:
 编译依赖
 ----------------

--- a/doc/getstarted/build_and_install/build_from_source_en.rst
+++ b/doc/getstarted/build_and_install/build_from_source_en.rst
-Build PaddlePaddle from Sources
+Build from Sources
 ==========================
 .. _build_step:
@@ -9,14 +9,18 @@ How To Build
 PaddlePaddle mainly uses `CMake <https://cmake.org>`_ and GCC, G++ as compile
 tools. We recommend you to use our pre-built Docker image to run the build
 to avoid installing dependencies by yourself. We have several build environment
-Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_.
+Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ .
+If you choose not to use Docker image for your build, you need to install the
+below `Compile Dependencies`_ before run the build.
 Then run:
 .. code-block:: bash
   git clone https://github.com/PaddlePaddle/Paddle.git
   cd Paddle
-   # run the following command to build CPU-Only binaries if you are using docker
+   # run the following command to build a CPU-Only binaries if you are using docker
   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
   # else run these commands
   mkdir build
@@ -32,7 +36,35 @@ machine or copy it to the target machine.
   pip install python/dist/*.whl
-.. _build_step:
+.. _run_test:
+Run Tests
+----------------
+If you wish to run the tests, you may follow the below steps:
+When using Docker, set :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` will run test immediately after the build.
+Set :code:`WITH_GPU=ON` Can also run tests on GPU.
+.. code-block:: bash
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+If you don't use Docker, just run ctest will start the tests:
+.. code-block:: bash
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=ON ..
+   make
+   ctest
+   # run a single test like test_mul_op
+   ctest -R test_mul_op
+.. _compile_deps:
 Compile Dependencies
 ----------------

--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
-使用Docker安装运行PaddlePaddle
+使用Docker安装运行
 ================================
 使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境即可运行。并且也可以在Windows的docker中运行。

--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
-PaddlePaddle in Docker Containers
+Run in Docker Containers
 =================================
 Run PaddlePaddle in Docker container so that you don't need to care about

--- a/doc/getstarted/build_and_install/pip_install_cn.rst
+++ b/doc/getstarted/build_and_install/pip_install_cn.rst
-使用pip安装PaddlePaddle
+使用pip安装
 ================================
 PaddlePaddle可以使用常用的Python包管理工具
@@ -34,7 +34,7 @@ PaddlePaddle可以使用常用的Python包管理工具
   :align: center
 ..  csv-table:: 各个版本最新的whl包
-    :header: "版本说明", "cp27-cp27mu", "cp27-cp27mu", "C-API"
+    :header: "版本说明", "cp27-cp27mu", "cp27-cp27m", "C-API"
    :widths: 1, 3, 3, 3
    "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"

--- a/doc/getstarted/build_and_install/pip_install_en.rst
+++ b/doc/getstarted/build_and_install/pip_install_en.rst
-Install PaddlePaddle Using pip
+Install Using pip
 ================================
 You can use current widely used Python package management
@@ -37,7 +37,7 @@ If the links below shows up the login form, just click "Log in as guest" to star
   :align: center
 ..  csv-table:: whl package of each version
-    :header: "version", "cp27-cp27mu", "cp27-cp27mu", "C-API"
+    :header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API"
    :widths: 1, 3, 3, 3
    "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"

--- a/doc/howto/dev/write_docs_cn.rst
+++ b/doc/howto/dev/write_docs_cn.rst
@@ -3,12 +3,64 @@
 ##################
 PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成，生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。
+也可以利用PaddlePaddle 工具来编译文档，这个情况下所有的文件会存在整理过的的文件目录 .ppo_workspace/content 下
 如何构建文档
 ============
-PaddlePaddle的文档构建有两种方式。
+PaddlePaddle的文档构建有三种方式。
+使用PaddlePaddle.org工具
+--------------
+这个是目前推荐的使用方法。除了可以自动编译文档，也可以直接在网页预览文档。
+文件工具是使用Docker，需要在系统里先安装好Docker工具包。Docker安装请参考Docker的官网。安装好Docker之后及可用以下命令启动工具
+..  code-block:: bash
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+    # Clone the content repositories
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+    # Please specify the working directory through -v
+    docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
+注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令
+之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档
+编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
+如果不想使用 Docker，你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。
+..  code-block:: bash
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+    # Clone the content repositories and PaddlePaddle.org
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+    git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git
+    # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd
+    export CONTENT_DIR=<path_to_paddlepaddle_working_directory>
+    export ENV=''
+    cd PaddlePaddle.org/portal/
+    pip install -r requirements.txt
+    python manage.py runserver
+工具服务器将读取环境变量 CONTENT_DIR 搜索代码库。请指定的PaddlePaddle工作目录给环境变量 CONTENT_DIR。
+之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档。
+编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
+想了解更多PaddlePaddle.org工具的详细信息，可以 `点击这里 <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.cn.md>`_ 。
 使用Docker构建
 --------------
@@ -47,17 +99,12 @@ PaddlePaddle的文档构建有两种方式。
 PaddlePaddle文档使用 `sphinx`_ 自动生成，用户可以参考sphinx教程进行书写。
-如何更新文档主题
+如何更新www.paddlepaddle.org
-================
-PaddlePaddle文档主题在 `TO_YOUR_PADDLE_CLONE_PATH/doc_theme` 文件夹下，包含所有和前端网页设计相关的文件。
-如何更新doc.paddlepaddle.org
 ============================
-更新的文档以PR的形式提交到github中，提交方式参见 `贡献文档 <http://doc.paddlepaddle.org/develop/doc_cn/howto/dev/contribute_to_paddle_cn.html>`_ 。
+更新的文档以PR的形式提交到github中，提交方式参见 `贡献文档 <http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html>`_ 。
-目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 <http://doc.paddlepaddle.org/develop/doc_cn/>`_ 和
+目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ 和
-`英文文档 <http://doc.paddlepaddle.org/develop/doc/>`_ 。
+`英文文档 <http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html>`_ 。
 ..  _cmake: https://cmake.org/

--- a/doc/howto/dev/write_docs_en.rst
+++ b/doc/howto/dev/write_docs_en.rst
+##################
+Contribute Documentation
+##################
+PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``.
+Both are compiled by `cmake`_ and `sphinx`_ , the compiled documentations will be stored under ``doc`` and ``doc_cn`` directories.
+When using the PaddlePaddle.org to compile documentations, the compiled documentations will be stored under a consolidated directory: .ppo_workspace/content
+How to Build Documentations
+============
+We recommend using PaddlePaddle.org tool to build documentation
+Use PaddlePaddle.org tool
+--------------
+This is the recommended method to build documentation. It can compile documentation and preview the documentation in a web browser.
+The tool uses Docker, please install it on your system. Please check Docker official website on how to install Docker. You may use the following commands to activate the tool
+..  code-block:: bash
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+    # Clone the content repositories. You may only clone the contents you need
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+    # Please specify the working directory through -v
+    docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
+Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run command
+Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation
+The compiled documentations will be stored in <paddlepaddle working directory>/.ppo_workspace/content
+If you don't wish to use Docker, you can also activate the tool through Django. Use the following the commands to set up
+..  code-block:: bash
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+    # Clone the content repositories and PaddlePaddle.org
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+    git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git
+    # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd
+    export CONTENT_DIR=<path_to_paddlepaddle_working_directory>
+    export ENV=''
+    cd PaddlePaddle.org/portal/
+    pip install -r requirements.txt
+    python manage.py runserver
+Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation
+The compiled documentations will be stored in <paddlepaddle working directory>/.ppo_workspace/content
+If you want to learn more on the PaddlePaddle.org, please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.md>`_ 。
+How to write Documentations
+============
+PaddlePaddle uses `sphinx`_ to compile documentations，Please check sphinx official website for more detail.
+How to update www.paddlepaddle.org
+============================
+Please create PRs and submit them to github, please check `Contribute Code <http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html>`_ 。
+PaddlePaddle develop branch will update the documentation once the PR is merged. User may check latest `Chinese Docs <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ and
+`English Docs <http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html>`_ 。
+..  _cmake: https://cmake.org/
+..  _sphinx: http://www.sphinx-doc.org/en/1.4.8/
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -19,7 +19,7 @@
 ..  toctree::
  :maxdepth: 1
-  dev/build_cn.rst
+  dev/contribute_to_paddle_cn.md
  dev/write_docs_cn.rst
 模型配置

--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -18,9 +18,9 @@ Development
 ..  toctree::
  :maxdepth: 1
-  dev/build_en.rst
  dev/new_layer_en.rst
  dev/contribute_to_paddle_en.md
+  dev/write_docs_en.rst
 Configuration
 -------------

--- a/doc/howto/optimization/cpu_profiling.md
+++ b/doc/howto/optimization/cpu_profiling.md
-此教程会介绍如何使用Python的cProfile包，与Python库yep，google perftools来运行性能分析(Profiling)与调优。
+This tutorial introduces techniques we use to profile and tune the
+CPU performance of PaddlePaddle.  We will use Python packages
+`cProfile` and `yep`, and Google's `perftools`.
-运行性能分析可以让开发人员科学的，有条不紊的对程序进行性能优化。性能分析是性能调优的基础。因为在程序实际运行中，真正的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。
+Profiling is the process that reveals performance bottlenecks,
+which could be very different from what's in the developers' mind.
+Performance tuning is done to fix these bottlenecks. Performance optimization
+repeats the steps of profiling and tuning alternatively.
-性能优化的步骤，通常是循环重复若干次『性能分析 --> 寻找瓶颈 ---> 调优瓶颈 --> 性能分析确认调优效果』。其中性能分析是性能调优的至关重要的量化指标。
+PaddlePaddle users program AI applications by calling the Python API, which calls
+into `libpaddle.so.` written in C++.  In this tutorial, we focus on
+the profiling and tuning of
-Paddle提供了Python语言绑定。用户使用Python进行神经网络编程，训练，测试。Python解释器通过`pybind`和`swig`调用Paddle的动态链接库，进而调用Paddle C++部分的代码。所以Paddle的性能分析与调优分为两个部分:
+1. the Python code and
+1. the mixture of Python and C++ code.
-* Python代码的性能分析
+## Profiling the Python Code
-* Python与C++混合代码的性能分析
+### Generate the Performance Profiling File
-## Python代码的性能分析
+We can use Python standard
+package, [`cProfile`](https://docs.python.org/2/library/profile.html),
-### 生成性能分析文件
+to generate Python profiling file.  For example:
-Python标准库中提供了性能分析的工具包，[cProfile](https://docs.python.org/2/library/profile.html)。生成Python性能分析的命令如下:
 ```bash
 python -m cProfile -o profile.out main.py
 ```
-其中`-o`标识了一个输出的文件名，用来存储本次性能分析的结果。如果不指定这个文件，`cProfile`会打印一些统计信息到`stdout`。这不方便我们进行后期处理(进行`sort`, `split`, `cut`等等)。
+where `main.py` is the program we are going to profile, `-o` specifies
+the output file.  Without `-o`, `cProfile` would outputs to standard
-### 查看性能分析文件
+output.
-当main.py运行完毕后，性能分析结果文件`profile.out`就生成出来了。我们可以使用[cprofilev](https://github.com/ymichael/cprofilev)来查看性能分析结果。`cprofilev`是一个Python的第三方库。使用它会开启一个HTTP服务，将性能分析结果以网页的形式展示出来。
+### Look into the Profiling File
-使用`pip install cprofilev`安装`cprofilev`工具。安装完成后，使用如下命令开启HTTP服务
+`cProfile` generates `profile.out` after `main.py` completes. We can
+use [`cprofilev`](https://github.com/ymichael/cprofilev) to look into
+the details:
 ```bash
 cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
 ```
-其中`-a`标识HTTP服务绑定的IP。使用`0.0.0.0`允许外网访问这个HTTP服务。`-p`标识HTTP服务的端口。`-f`标识性能分析的结果文件。`main.py`标识被性能分析的源文件。
+where `-a` specifies the HTTP IP, `-p` specifies the port, `-f`
+specifies the profiling file, and `main.py` is the source file.
-访问对应网址，即可显示性能分析的结果。性能分析结果格式如下:
+Open the Web browser and points to the local IP and the specifies
+port, we will see the output like the following:
-```text
+```
   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.284    0.284   29.514   29.514 main.py:1(<module>)
     4696    0.128    0.000   15.748    0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run)
@@ -44,23 +54,23 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
        1    0.144    0.144    6.534    6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14(<module>)
 ```
-每一列的含义是:
+where each line corresponds to Python function, and the meaning of
+each column is as follows:
-| 列名 | 含义 |
+| column | meaning |
 | --- | --- |
-| ncalls | 函数的调用次数 |
+| ncalls | the number of calls into a function |
-| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 |
+| tottime | the total execution time of the function, not including the
-| percall | tottime的每次调用平均时间 |
+ execution time of other functions called by the function |
-| cumtime | 函数总时间。包含这个函数调用其他函数的时间 |
+| percall | tottime divided by ncalls |
-| percall | cumtime的每次调用平均时间 |
+| cumtime | the total execution time of the function, including the execution time of other functions being called |
-| filename:lineno(function) | 文件名, 行号，函数名 |
+| percall | cumtime divided by ncalls |
+| filename:lineno(function) | where the function is defined |
+### Identify Performance Bottlenecks
-### 寻找性能瓶颈
+Usually, `tottime` and the related `percall` time is what we want to
+focus on. We can sort above profiling file by tottime:
-通常`tottime`和`cumtime`是寻找瓶颈的关键指标。这两个指标代表了某一个函数真实的运行时间。
-将性能分析结果按照tottime排序，效果如下:
 ```text
     4696   12.040    0.003   12.040    0.003 {built-in method run}
@@ -68,12 +78,15 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
   107991    0.676    0.000    1.519    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__)
     4697    0.626    0.000    2.291    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)
        1    0.618    0.618    0.618    0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1(<module>)
 ```
-可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python与C++混合代码的性能分析`来进行调优。而`sync_with_cpp`函数的总共耗时很长，每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息，了解其调用关系。
+We can see that the most time-consuming function is the `built-in
+method run`, which is a C++ function in `libpaddle.so`.  We will
+explain how to profile C++ code in the next section.  At this 
+moment, let's look into the third function `sync_with_cpp`, which is a
+Python function.  We can click it to understand more about it:
-```text
+```
 Called By:
   Ordered by: internal time
@@ -92,72 +105,93 @@ Called:
   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
 ```
-通常观察热点函数间的调用关系，和对应行的代码，就可以了解到问题代码在哪里。当我们做出性能修正后，再次进行性能分析(profiling)即可检查我们调优后的修正是否能够改善程序的性能。
+The lists of the callers of `sync_with_cpp` might help us understand
+how to improve the function definition.
+## Profiling Python and C++ Code
+### Generate the Profiling File
-## Python与C++混合代码的性能分析
+To profile a mixture of Python and C++ code, we can use a Python
+package, `yep`, that can work with Google's `perftools`, which is a
+commonly-used profiler for C/C++ code.
-### 生成性能分析文件
+In Ubuntu systems, we can install `yep` and `perftools` by running the
+following commands:
-C++的性能分析工具非常多。常见的包括`gprof`, `valgrind`, `google-perftools`。但是调试Python中使用的动态链接库与直接调试原始二进制相比增加了很多复杂度。幸而Python的一个第三方库`yep`提供了方便的和`google-perftools`交互的方法。于是这里使用`yep`进行Python与C++混合代码的性能分析
-使用`yep`前需要安装`google-perftools`与`yep`包。ubuntu下安装命令为
 ```bash
+apt update
 apt install libgoogle-perftools-dev
 pip install yep
 ```
-安装完毕后，我们可以通过
+Then we can run the following command
 ```bash
 python -m yep -v main.py
 ```
-生成性能分析文件。生成的性能分析文件为`main.py.prof`。
+to generate the profiling file.  The default filename is
+`main.py.prof`.
+Please be aware of the `-v` command line option, which prints the
+analysis results after generating the profiling file.  By examining the
+ the print result, we'd know that if we stripped debug
+information from `libpaddle.so` at build time.  The following hints
+help make sure that the analysis results are readable:
-命令行中的`-v`指定在生成性能分析文件之后，在命令行显示分析结果。我们可以在命令行中简单的看一下生成效果。因为C++与Python不同，编译时可能会去掉调试信息，运行时也可能因为多线程产生混乱不可读的性能分析结果。为了生成更可读的性能分析结果，可以采取下面几点措施:
+1. Use GCC command line option `-g` when building `libpaddle.so` so to
+   include the debug information.  The standard building system of
+   PaddlePaddle is CMake, so you might want to set
+   `CMAKE_BUILD_TYPE=RelWithDebInfo`.
-1. 编译时指定`-g`生成调试信息。使用cmake的话，可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。
+1. Use GCC command line option `-O2` or `-O3` to generate optimized
-2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。
+   binary code. It doesn't make sense to profile `libpaddle.so`
-3. 运行性能分析的时候，先从单线程开始，再开启多线程，进而多机。毕竟如果单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。
+   without optimization, because it would anyway run slowly.
-### 查看性能分析文件
+1. Profiling the single-threaded binary file before the
+   multi-threading version, because the latter often generates tangled
+   profiling analysis result.  You might want to set environment
+   variable `OMP_NUM_THREADS=1` to prevents OpenMP from automatically
+   starting multiple threads.
-在运行完性能分析后，会生成性能分析结果文件。我们可以使用[pprof](https://github.com/google/pprof)来显示性能分析结果。注意，这里使用了用`Go`语言重构后的`pprof`，因为这个工具具有web服务界面，且展示效果更好。
+### Examining the Profiling File
-安装`pprof`的命令和一般的`Go`程序是一样的，其命令如下:
+The tool we used to examine the profiling file generated by
+`perftools` is [`pprof`](https://github.com/google/pprof), which
+provides a Web-based GUI like `cprofilev`.
+We can rely on the standard Go toolchain to retrieve the source code
+of `pprof` and build it:
 ```bash
 go get github.com/google/pprof
 ```
-进而我们可以使用如下命令开启一个HTTP服务:
+Then we can use it to profile `main.py.prof` generated in the previous
+section:
 ```bash
 pprof -http=0.0.0.0:3213 `which python`  ./main.py.prof
 ```
-这行命令中，`-http`指开启HTTP服务。`which python`会产生当前Python二进制的完整路径，进而指定了Python可执行文件的路径。`./main.py.prof`输入了性能分析结果。
+Where `-http` specifies the IP and port of the HTTP service.
+Directing our Web browser to the service, we would see something like
-访问对应的网址，我们可以查看性能分析的结果。结果如下图所示:
+the following:
 ![result](./pprof_1.png)
+### Identifying the Performance Bottlenecks
-### 寻找性能瓶颈
+Similar to how we work with `cprofilev`, we'd focus on `tottime` and
+`cumtime`.
-与寻找Python代码的性能瓶颈类似，寻找Python与C++混合代码的性能瓶颈也是要看`tottime`和`cumtime`。而`pprof`展示的调用图也可以帮助我们发现性能中的问题。
-例如下图中，
 ![kernel_perf](./pprof_2.png)
-在一次训练中，乘法和乘法梯度的计算占用2%-4%左右的计算时间。而`MomentumOp`占用了17%左右的计算时间。显然，`MomentumOp`的性能有问题。
+We can see that the execution time of multiplication and the computing
+of the gradient of multiplication takes 2% to 4% of the total running
-在`pprof`中，对于性能的关键路径都做出了红色标记。先检查关键路径的性能问题，再检查其他部分的性能问题，可以更有次序的完成性能的优化。
+time, and `MomentumOp` takes about 17%. Obviously, we'd want to
+optimize `MomentumOp`.
-## 总结
-至此，两种性能分析的方式都介绍完毕了。希望通过这两种性能分析的方式，Paddle的开发人员和使用人员可以有次序的，科学的发现和解决性能问题。
+`pprof` would mark performance critical parts of the program in
+red. It's a good idea to follow the hints.
--- a/doc/howto/optimization/cpu_profiling_cn.md
+++ b/doc/howto/optimization/cpu_profiling_cn.md
+此教程会介绍如何使用Python的cProfile包、Python库yep、Google perftools来进行性能分析 (profiling) 与调优（performance tuning）。
+Profling 指发现性能瓶颈。系统中的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。Tuning 指消除瓶颈。性能优化的过程通常是不断重复地 profiling 和 tuning。
+PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大部分 Python API 调用用 C++ 写的 libpaddle.so。所以 PaddlePaddle 的性能分析与调优分为两个部分:
+* Python 代码的性能分析
+* Python 与 C++ 混合代码的性能分析
+## Python代码的性能分析
+### 生成性能分析文件
+Python标准库中提供了性能分析的工具包，[cProfile](https://docs.python.org/2/library/profile.html)。生成Python性能分析的命令如下:
+```bash
+python -m cProfile -o profile.out main.py
+```
+其中 `main.py` 是我们要分析的程序，`-o`标识了一个输出的文件名，用来存储本次性能分析的结果。如果不指定这个文件，`cProfile`会打印到标准输出。
+### 查看性能分析文件
+`cProfile` 在main.py 运行完毕后输出`profile.out`。我们可以使用[`cprofilev`](https://github.com/ymichael/cprofilev)来查看性能分析结果。`cprofilev`是一个Python的第三方库。使用它会开启一个HTTP服务，将性能分析结果以网页的形式展示出来：
+```bash
+cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
+```
+其中`-a`标识HTTP服务绑定的IP。使用`0.0.0.0`允许外网访问这个HTTP服务。`-p`标识HTTP服务的端口。`-f`标识性能分析的结果文件。`main.py`标识被性能分析的源文件。
+用Web浏览器访问对应网址，即可显示性能分析的结果：
+```
+   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+        1    0.284    0.284   29.514   29.514 main.py:1(<module>)
+     4696    0.128    0.000   15.748    0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run)
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+        1    0.144    0.144    6.534    6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14(<module>)
+```
+每一列的含义是:
+| 列名 | 含义 |
+| --- | --- |
+| ncalls | 函数的调用次数 |
+| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 |
+| percall | tottime的每次调用平均时间 |
+| cumtime | 函数总时间。包含这个函数调用其他函数的时间 |
+| percall | cumtime的每次调用平均时间 |
+| filename:lineno(function) | 文件名, 行号，函数名 |
+### 寻找性能瓶颈
+通常`tottime`和`cumtime`是寻找瓶颈的关键指标。这两个指标代表了某一个函数真实的运行时间。
+将性能分析结果按照tottime排序，效果如下:
+```text
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+   300005    0.874    0.000    1.681    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader)
+   107991    0.676    0.000    1.519    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__)
+     4697    0.626    0.000    2.291    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)
+        1    0.618    0.618    0.618    0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1(<module>)
+```
+可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长，每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息，了解其调用关系。
+```text
+Called By:
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+Function                                                                                                 was called by...
+                                                                                                             ncalls  tottime  cumtime
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)  <-    4697    0.626    2.291  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)  <-    4696    0.019    2.316  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone)
+                                                                                                                  1    0.000    0.001  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward)
+Called:
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+```
+通常观察热点函数间的调用关系，和对应行的代码，就可以了解到问题代码在哪里。当我们做出性能修正后，再次进行性能分析(profiling)即可检查我们调优后的修正是否能够改善程序的性能。
+## Python与C++混合代码的性能分析
+### 生成性能分析文件
+C++的性能分析工具非常多。常见的包括`gprof`, `valgrind`, `google-perftools`。但是调试Python中使用的动态链接库与直接调试原始二进制相比增加了很多复杂度。幸而Python的一个第三方库`yep`提供了方便的和`google-perftools`交互的方法。于是这里使用`yep`进行Python与C++混合代码的性能分析
+使用`yep`前需要安装`google-perftools`与`yep`包。ubuntu下安装命令为
+```bash
+apt update
+apt install libgoogle-perftools-dev
+pip install yep
+```
+安装完毕后，我们可以通过
+```bash
+python -m yep -v main.py
+```
+生成性能分析文件。生成的性能分析文件为`main.py.prof`。
+命令行中的`-v`指定在生成性能分析文件之后，在命令行显示分析结果。我们可以在命令行中简单的看一下生成效果。因为C++与Python不同，编译时可能会去掉调试信息，运行时也可能因为多线程产生混乱不可读的性能分析结果。为了生成更可读的性能分析结果，可以采取下面几点措施:
+1. 编译时指定`-g`生成调试信息。使用cmake的话，可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。
+2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。
+3. 运行性能分析的时候，先从单线程开始，再开启多线程，进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。
+### 查看性能分析文件
+在运行完性能分析后，会生成性能分析结果文件。我们可以使用[`pprof`](https://github.com/google/pprof)来显示性能分析结果。注意，这里使用了用`Go`语言重构后的`pprof`，因为这个工具具有web服务界面，且展示效果更好。
+安装`pprof`的命令和一般的`Go`程序是一样的，其命令如下:
+```bash
+go get github.com/google/pprof
+```
+进而我们可以使用如下命令开启一个HTTP服务:
+```bash
+pprof -http=0.0.0.0:3213 `which python`  ./main.py.prof
+```
+这行命令中，`-http`指开启HTTP服务。`which python`会产生当前Python二进制的完整路径，进而指定了Python可执行文件的路径。`./main.py.prof`输入了性能分析结果。
+访问对应的网址，我们可以查看性能分析的结果。结果如下图所示:
+![result](./pprof_1.png)
+### 寻找性能瓶颈
+与寻找Python代码的性能瓶颈类似，寻找Python与C++混合代码的性能瓶颈也是要看`tottime`和`cumtime`。而`pprof`展示的调用图也可以帮助我们发现性能中的问题。
+例如下图中，
+![kernel_perf](./pprof_2.png)
+在一次训练中，乘法和乘法梯度的计算占用2%-4%左右的计算时间。而`MomentumOp`占用了17%左右的计算时间。显然，`MomentumOp`的性能有问题。
+在`pprof`中，对于性能的关键路径都做出了红色标记。先检查关键路径的性能问题，再检查其他部分的性能问题，可以更有次序的完成性能的优化。
--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
@@ -4,6 +4,16 @@ else ()
  set(PADDLE_FLOAT_TYPE float)
 endif()
+execute_process(
+  COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
+  WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
+  OUTPUT_VARIABLE PADDLE_GIT_COMMIT
+  RESULT_VARIABLE PADDLE_GIT_COMMIT_RESULT
+  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+if(NOT PADDLE_GIT_COMMIT)
+  set(PADDLE_GIT_COMMIT "no commit information")
+endif()
 # config.h used for C-API. It will store Paddle building configuration as a
 # header. Make user just include PaddleCAPI.h then can get building
 # configuration without explicitly set -DPADDLE_WITH_DOUBLE when building their

--- a/paddle/capi/config.h.in
+++ b/paddle/capi/config.h.in
@@ -3,6 +3,9 @@
 typedef @PADDLE_FLOAT_TYPE@ paddle_real;
+#define __PADDLE_VERSION__  "@PADDLE_VERSION@"
+#define __PADDLE_COMMIT__   "@PADDLE_GIT_COMMIT@"
 // Since we only support linux and macos in compile, always use clang or
 // gcc 4.8+. DLL_IMPORT/DLL_EXPORT is as simple as below.
 #define PD_API __attribute__((visibility("default")))

--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -27,6 +27,18 @@
 namespace paddle {
 namespace framework {
+static std::unordered_set<std::string>* g_ctrl_flow_ops_ = nullptr;
+// Control Flow operators's backward is significantly different from
+// computational operators. Hack Code here.
+// We should design a better way to backward CtrlFlowOps.
+static std::unordered_set<std::string>& CtrlFlowOps() {
+  if (g_ctrl_flow_ops_ == nullptr) {
+    g_ctrl_flow_ops_ =
+        new std::unordered_set<std::string>{"increment", "lod_rank_table"};
+  }
+  return *g_ctrl_flow_ops_;
+}
 static inline std::unique_ptr<OperatorBase> CreateGradOp(
    const OperatorBase& op, const std::unordered_set<std::string>& no_grad_set,
    std::unordered_map<std::string, std::string>* grad_to_var) {
@@ -288,12 +300,24 @@ static void CreateGradVarInBlock(
  for (size_t op_index = grad_op_start_index; op_index < ops.size();
       ++op_index) {
    std::unordered_set<std::string> new_vars;
+    auto& ctrl_flow_ops = CtrlFlowOps();
    ForEachVarName(ops[op_index]->Outputs(),
                   [&](const std::string& grad_var_name) {
+                     if (ctrl_flow_ops.find(ops[op_index]->Type()) !=
+                         ctrl_flow_ops.end()) {
+                       if (block_desc->HasVarRecursive(grad_var_name)) {
+                         return false;
+                       }
+                     } else {
                       if (block_desc->HasVar(grad_var_name)) {
                         return false;
                       }
+                     }
+                     if (grad_var_name == framework::kEmptyVarName) {
+                       return false;
+                     }
                     auto var = block_desc->Var(grad_var_name);
+                     VLOG(10) << "Creating Variable " << grad_var_name;
                     new_vars.insert(var->Name());
                     auto it = param_name_map.find(grad_var_name);
                     if (it == param_name_map.end()) {
@@ -333,14 +357,25 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
  // All input gradients of forwarding operator do not need to calculate.
  const std::vector<std::string>& inputs = op_desc->InputArgumentNames();
  if (AllGradInSet(inputs, *no_grad_vars)) {
+    VLOG(10) << "Drop operator  " << op_desc->Type();
    return grad_op_descs;  // empty vector
  }
  // All output gradients of forwarding operator do not need to calculate.
  const std::vector<std::string>& outputs = op_desc->OutputArgumentNames();
  if (AllGradInSet(outputs, *no_grad_vars)) {
+    VLOG(10) << "Drop operator " << op_desc->Type();
+    // FIXME: Hack code here
+    auto& ctrl_flow_ops = CtrlFlowOps();
+    if (ctrl_flow_ops.find(op_desc->Type()) == ctrl_flow_ops.end()) {
+      // Only computational op need drop input's gradient.
      for (const std::string& name : inputs) {
        no_grad_vars->insert(GradVarName(name));
+        VLOG(10) << " Also drop " << GradVarName(name);
+      }
    }
    return grad_op_descs;  // empty vector
  }

--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/framework/block_desc.h"
+#include "paddle/framework/operator.h"
 #include "paddle/framework/program_desc.h"
 namespace paddle {
@@ -42,6 +43,8 @@ bool BlockDescBind::HasVar(const std::string &name) const {
 }
 VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const {
+  if (name == kEmptyVarName) return nullptr;
  auto it = vars_.find(name);
  if (it == vars_.end()) {
    return Parent() == kNoneBlockIndex ? nullptr

--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -97,6 +97,10 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
  if (create_local_scope) {
    local_scope = &scope->NewScope();
    for (auto& var : block.AllVars()) {
+      if (var->Name() == framework::kEmptyVarName) {
+        continue;
+      }
      if (var->Persistable()) {
        auto* ptr = scope->Var(var->Name());
        CreateTensor(ptr, var->GetType());

--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -65,7 +65,7 @@ class CompileTimeInferShapeContext : public InferShapeContext {
    PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR,
                      "The %d-th output of Output(%s) must be LoDTensor.", j,
                      out);
-    in_var->SetLoDLevel(out_var->GetLodLevel());
+    out_var->SetLoDLevel(in_var->GetLodLevel());
  }
  bool IsRuntime() const override;
@@ -466,7 +466,12 @@ DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
  auto var = block_.FindVarRecursive(name);
  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
  try {
+    auto shape = var->Shape();
+    if (shape.empty()) {
+      return framework::make_ddim({0UL});
+    } else {
      return framework::make_ddim(var->Shape());
+    }
  } catch (...) {
    VLOG(5) << "GetDim of variable " << name << " error";
    std::rethrow_exception(std::current_exception());

--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -36,12 +36,9 @@ Scope& Scope::NewScope() const {
 }
 Variable* Scope::Var(const std::string& name) {
-  auto iter = vars_.find(name);
+  auto* v = FindVarLocally(name);
-  if (iter != vars_.end()) {
+  if (v != nullptr) return v;
-    VLOG(3) << "Get existing variable " << name;
+  v = new Variable();
-    return iter->second;
-  }
-  Variable* v = new Variable();
  vars_[name] = v;
  VLOG(3) << "Create variable " << name;
  v->name_ = &(vars_.find(name)->first);
@@ -57,8 +54,10 @@ Variable* Scope::Var(std::string* name) {
 }
 Variable* Scope::FindVar(const std::string& name) const {
-  auto it = vars_.find(name);
+  auto var = FindVarLocally(name);
-  if (it != vars_.end()) return it->second;
+  if (var != nullptr) {
+    return var;
+  }
  return (parent_ == nullptr) ? nullptr : parent_->FindVar(name);
 }
@@ -116,6 +115,11 @@ std::string Scope::Rename(const std::string& origin_name) const {
  Rename(origin_name, var_name);
  return var_name;
 }
+Variable* Scope::FindVarLocally(const std::string& name) const {
+  auto it = vars_.find(name);
+  if (it != vars_.end()) return it->second;
+  return nullptr;
+}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -76,6 +76,8 @@ class Scope {
  std::string Rename(const std::string& origin_name) const;
 private:
+  Variable* FindVarLocally(const std::string& name) const;
  // Call Scope::NewScope for a sub-scope.
  explicit Scope(Scope const* parent) : parent_(parent) {}

--- a/paddle/framework/shape_inference.cc
+++ b/paddle/framework/shape_inference.cc
@@ -12,6 +12,8 @@
   See the License for the specific language governing permissions and
   limitations under the License. */
 #include "paddle/framework/shape_inference.h"
+#include "grad_op_desc_maker.h"
+#include "paddle/framework/operator.h"
 namespace paddle {
 namespace framework {
@@ -22,6 +24,12 @@ std::vector<framework::DDim> InferShapeContext::GetInputsDim(
  return GetDims(names);
 }
+DDim InferShapeContext::GetInputsElementDim(const std::string &name,
+                                            int idx) const {
+  const std::vector<std::string> &names = Inputs(name);
+  return this->GetDim(names[idx]);
+}
 void InferShapeContext::SetOutputsDim(
    const std::string &name, const std::vector<framework::DDim> &dims) {
  auto &names = Outputs(name);
@@ -43,6 +51,9 @@ void InferShapeContext::SetDims(const std::vector<std::string> &names,
  size_t length = names.size();
  PADDLE_ENFORCE_EQ(length, dims.size());
  for (size_t i = 0; i < length; ++i) {
+    if (names[i] == framework::kEmptyVarName) {
+      continue;
+    }
    SetDim(names[i], dims[i]);
  }
 }

--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
@@ -37,6 +37,7 @@ class InferShapeContext {
  virtual framework::DDim GetInputDim(const std::string &name) const = 0;
  std::vector<framework::DDim> GetInputsDim(const std::string &name) const;
+  DDim GetInputsElementDim(const std::string &name, int idx) const;
  virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0;
  void SetOutputsDim(const std::string &name,

--- a/paddle/function/EigenGemm.cpp
+++ b/paddle/function/EigenGemm.cpp
@@ -21,7 +21,7 @@ template <class T>
 struct EigenBlasGemm {
  typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, int>,
                           Eigen::Aligned>
-      Matrix;
+      EigenMatrix;
  static void compute(const bool transA,
                      const bool transB,
@@ -56,14 +56,13 @@ struct EigenBlasGemm {
      sizeB[1] = N;
      CHECK_EQ(N, ldb);
    }
-    Eigen::array<int, 2> sizeC;
+    Eigen::array<int, 2> sizeC = {{M, ldc}};
-    sizeC[0] = M;
+    Eigen::array<int, 2> offsetC = {{0, 0}};
-    sizeC[1] = N;
+    Eigen::array<int, 2> extentC = {{M, N}};
-    CHECK_EQ(N, ldc);
-    const Matrix a(const_cast<T*>(A), sizeA);
+    const EigenMatrix a(const_cast<T*>(A), sizeA);
-    const Matrix b(const_cast<T*>(B), sizeB);
+    const EigenMatrix b(const_cast<T*>(B), sizeB);
-    Matrix c(C, sizeC);
+    EigenMatrix c(C, sizeC);
    typedef typename Eigen::Tensor<T, 2>::DimensionPair DimPair;
    Eigen::array<DimPair, 1> dims;
@@ -72,6 +71,7 @@ struct EigenBlasGemm {
    dims[0].second = transB ? 1 : 0;
    Eigen::DefaultDevice device;
+    if (N == ldc) {
      if (alpha == T(1) && beta == T(0)) {
        c.device(device) = a.contract(b, dims);
      } else if (alpha == T(1) && beta == T(1)) {
@@ -79,6 +79,16 @@ struct EigenBlasGemm {
      } else {
        c.device(device) = alpha * a.contract(b, dims) + beta * c;
      }
+    } else {
+      if (alpha == T(1) && beta == T(0)) {
+        c.slice(offsetC, extentC).device(device) = a.contract(b, dims);
+      } else if (alpha == T(1) && beta == T(1)) {
+        c.slice(offsetC, extentC).device(device) += a.contract(b, dims);
+      } else {
+        c.slice(offsetC, extentC).device(device) =
+            alpha * a.contract(b, dims) + beta * c.slice(offsetC, extentC);
+      }
+    }
  }
 };

--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
@@ -64,49 +64,111 @@ void HierarchicalSigmoidLayer::forward(PassType passType) {
                         batchSize,
                         codeLength_,
                         /* trans */ false,
-                         useGpu(deviceId_));
+                         false);
  Matrix::resizeOrCreate(preOutput_.grad,
                         batchSize,
                         codeLength_,
                         /* trans */ false,
-                         useGpu(deviceId_));
+                         false);
  IVectorPtr label = getInput(*getLabelLayer()).ids;
  preOutput_.value->zeroMem();
+  if (useGpu_) {
+    Matrix::resizeOrCreate(cpuOutput_,
+                           output_.value->getHeight(),
+                           output_.value->getWidth(),
+                           /* trans */ false,
+                           false);
+    IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
+    cpuLabel_->copyFrom(*label);
+    cpuOutput_->copyFrom(*output_.value);
+  } else {
+    cpuOutput_ = output_.value;
+    cpuLabel_ = label;
+  }
  /* add the bias-vector */
  if (biases_.get() != NULL) {
-    preOutput_.value->addByBitCode(numClasses_, *label, *biases_->getW());
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuBias_,
+                             1,
+                             numClasses_ - 1,
+                             /* trans */ false,
+                             false);
+      cpuBias_->copyFrom(*biases_->getW());
+    } else {
+      cpuBias_ = biases_->getW();
+    }
+    preOutput_.value->addByBitCode(numClasses_, *cpuLabel_, *cpuBias_);
  }
  for (size_t i = 0; i < inputLayers_.size() - 1; ++i) {
    MatrixPtr input = getInputValue(i);
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuInput_,
+                             input->getHeight(),
+                             input->getWidth(),
+                             /* trans */ false,
+                             false);
+      Matrix::resizeOrCreate(cpuWeight_,
+                             weights_[i]->getW()->getHeight(),
+                             weights_[i]->getW()->getWidth(),
+                             /* trans */ false,
+                             false);
+      cpuInput_->copyFrom(*input);
+      cpuWeight_->copyFrom(*weights_[i]->getW());
+    } else {
+      cpuInput_ = input;
+      cpuWeight_ = weights_[i]->getW();
+    }
    preOutput_.value->mulByBitCode(
-        numClasses_, *label, *weights_[i]->getW(), *input);
+        numClasses_, *cpuLabel_, *cpuWeight_, *cpuInput_);
  }
  // keep consistent with the clipping in the following softrelu
  preOutput_.value->clip(-40.0, 40.0);
  preOutput_.value->sumByBitCode(numClasses_,
-                                 *label,
+                                 *cpuLabel_,
-                                 *output_.value,
+                                 *cpuOutput_,
                                 -1);  // scaleSum
  preOutput_.value->softrelu(*preOutput_.value);
-  MatrixPtr sum =
+  MatrixPtr sum = Matrix::create(batchSize, 1, /* trans= */ false, false);
-      Matrix::create(batchSize, 1, /* trans= */ false, useGpu(deviceId_));
  preOutput_.value->rowSum(*sum);
-  output_.value->add(*sum);
+  cpuOutput_->add(*sum);
+  if (useGpu_) {
+    output_.value->copyFrom(*cpuOutput_);
+  } else {
+    output_.value = cpuOutput_;
+  }
 }
 void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
  IVectorPtr label = getInput(*getLabelLayer()).ids;
+  if (useGpu_) {
+    IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
+    cpuLabel_->copyFrom(*label);
+  } else {
+    cpuLabel_ = label;
+  }
  preOutput_.grad->one();
  preOutput_.grad->softreluDerivative(*preOutput_.value);
-  preOutput_.grad->subByBitCode(numClasses_, *label);
+  preOutput_.grad->subByBitCode(numClasses_, *cpuLabel_);
  if (biases_ && biases_->getWGrad()) {
-    preOutput_.grad->addByBitCodeBackward(
+    MatrixPtr biases_grad = biases_->getWGrad();
-        numClasses_, *label, *biases_->getWGrad());
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuBias_,
+                             1,
+                             numClasses_ - 1,
+                             /* trans */ false,
+                             false);
+      cpuBias_->copyFrom(*biases_grad);
+    } else {
+      cpuBias_ = biases_grad;
+    }
+    preOutput_.grad->addByBitCodeBackward(numClasses_, *cpuLabel_, *cpuBias_);
+    if (useGpu_) {
+      biases_grad->copyFrom(*cpuBias_);
+    } else {
+      biases_grad = cpuBias_;
+    }
    /* Increasing the number of gradient */
    biases_->getParameterPtr()->incUpdate(callback);
  }
@@ -115,9 +177,31 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
    /* Calculate the W-gradient for the current layer */
    MatrixPtr input = getInputValue(i);
    if (weights_[i]->getWGrad()) {
+      MatrixPtr weights_grad = weights_[i]->getWGrad();
+      if (useGpu_) {
+        Matrix::resizeOrCreate(cpuInput_,
+                               input->getHeight(),
+                               input->getWidth(),
+                               /* trans */ false,
+                               false);
+        Matrix::resizeOrCreate(cpuWeightGrad_,
+                               weights_grad->getHeight(),
+                               weights_grad->getWidth(),
+                               /* trans */ false,
+                               false);
+        cpuInput_->copyFrom(*input);
+        cpuWeightGrad_->copyFrom(*weights_grad);
+      } else {
+        cpuInput_ = input;
+        cpuWeightGrad_ = weights_grad;
+      }
      preOutput_.grad->mulByBitCodeBackwardWeight(
-          numClasses_, *label, *weights_[i]->getWGrad(), *input);
+          numClasses_, *cpuLabel_, *cpuWeightGrad_, *cpuInput_);
+      if (useGpu_) {
+        weights_grad->copyFrom(*cpuWeightGrad_);
+      } else {
+        weights_grad = cpuWeightGrad_;
+      }
      /* Increasing the number of gradient */
      weights_[i]->getParameterPtr()->incUpdate(callback);
    }
@@ -125,8 +209,30 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
    /* Calculate the input layers error */
    MatrixPtr inputGrad = getInputGrad(i);
    if (inputGrad) {
+      if (useGpu_) {
+        Matrix::resizeOrCreate(cpuInputGrad_,
+                               inputGrad->getHeight(),
+                               inputGrad->getWidth(),
+                               /* trans */ false,
+                               false);
+        Matrix::resizeOrCreate(cpuWeight_,
+                               weights_[i]->getW()->getHeight(),
+                               weights_[i]->getW()->getWidth(),
+                               /* trans */ false,
+                               false);
+        cpuInputGrad_->copyFrom(*inputGrad);
+        cpuWeight_->copyFrom(*weights_[i]->getW());
+      } else {
+        cpuInputGrad_ = inputGrad;
+        cpuWeight_ = weights_[i]->getW();
+      }
      preOutput_.grad->mulByBitCodeBackwardError(
-          numClasses_, *label, *weights_[i]->getW(), *inputGrad);
+          numClasses_, *cpuLabel_, *cpuWeight_, *cpuInputGrad_);
+      if (useGpu_) {
+        inputGrad->copyFrom(*cpuInputGrad_);
+      } else {
+        inputGrad = cpuInputGrad_;
+      }
    }
  }
 }

--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.h
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
@@ -80,6 +80,15 @@ protected:
  int codeLength_;
  /// temporary result of output_
  Argument preOutput_;
+  /// The temporary variables in CPU memory.
+  MatrixPtr cpuWeight_;
+  MatrixPtr cpuWeightGrad_;
+  MatrixPtr cpuInput_;
+  MatrixPtr cpuInputGrad_;
+  MatrixPtr cpuBias_;
+  MatrixPtr cpuOutput_;
+  IVectorPtr cpuLabel_;
 };
 }  // namespace paddle
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
 # gserver pacakge unittests
 add_simple_unittest(test_LinearChainCRF)
 add_simple_unittest(test_RecurrentLayer)
@@ -29,6 +28,26 @@ gserver_test(test_KmaxSeqScore)
 gserver_test(test_Expand)
 gserver_test(test_MaxPoolingWithMaskOutput)
+set(PYTHON_PATH 
+   ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d 
+   ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/gserver/tests)
+function(gserver_test_with_python TARGET)
+  add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
+  add_test(NAME ${TARGET}
+    COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
+      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+endfunction()
+gserver_test_with_python(test_PyDataProvider2)
+if(WITH_PYTHON)
+    gserver_test_with_python(test_PyDataProvider)
+endif()
+if(NOT MOBILE_INFERENCE)
+    gserver_test_with_python(test_CompareTwoNets)
+    # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine, I will fix it.
+    gserver_test_with_python(test_RecurrentGradientMachine)
+endif()
 ########## test_MKLDNN layers and activations ##########
 if(WITH_MKLDNN)
    add_unittest_without_exec(test_MKLDNN
@@ -36,18 +55,7 @@ if(WITH_MKLDNN)
        MKLDNNTester.cpp
        LayerGradUtil.cpp)
    add_test(NAME test_MKLDNN
-        COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python
+        COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN
-            ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN
-            WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
-endif()
-############## test_PyDataProvider ########################
-if(WITH_PYTHON)
-    add_unittest_without_exec(test_PyDataProvider
-        test_PyDataProvider.cpp)
-    add_test(NAME test_PyDataProvider
-        COMMAND .set_python_path.sh -d ./gserver/tests:${PADDLE_SOURCE_DIR}/python/ ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider
            WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
 endif()
@@ -55,68 +63,35 @@ endif()
 if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
    add_unittest_without_exec(test_WarpCTCLayer
        test_WarpCTCLayer.cpp)
    add_test(NAME test_WarpCTCLayer
        COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR}
        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
 endif()
 if(NOT MOBILE_INFERENCE)
-################## test_Evaluator #######################
+    ################## test_Evaluator #############
    add_unittest(test_Evaluator
        test_Evaluator.cpp)
-############### test_RecurrentGradientMachine ###############
+    ########### test_NetworkCompare ###############
-    # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine
-    # I will fix it.
-    add_unittest_without_exec(test_RecurrentGradientMachine
-        test_RecurrentGradientMachine.cpp)
-    add_test(NAME test_RecurrentGradientMachine
-        COMMAND .set_python_path.sh -d
-                ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
-                ${CMAKE_CURRENT_BINARY_DIR}/test_RecurrentGradientMachine
-        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
-############### test_NetworkCompare ###############
    add_unittest_without_exec(test_NetworkCompare
        test_NetworkCompare.cpp)
    if(WITH_GPU)
-        add_test(NAME test_NetworkCompare
+        set(use_gpu true)
-            COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=true
-            WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
    else()
+        set(use_gpu false)
+    endif()
    add_test(NAME test_NetworkCompare
-            COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=false
+        COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=${use_gpu}
        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
-    endif()
-endif()
-add_unittest_without_exec(test_PyDataProvider2
-        test_PyDataProvider2.cpp)
-add_test(NAME test_PyDataProvider2
-   COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2
-        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
-)
-################# test_CompareSparse ##################
+    ############ test_CompareSparse ################
-add_unittest_without_exec(test_CompareSparse
+    add_unittest_without_exec(test_CompareSparse
        test_CompareSparse.cpp)
-if(NOT ON_TRAVIS)
+    if(NOT ON_TRAVIS)
      add_test(NAME test_CompareSparse
-    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
+        COMMAND ${PYTHON_PATH} ./.set_port.sh -p port -n 6
-          ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
-              ./.set_port.sh -p port -n 6
                ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+    endif()
 endif()
-################ test_CompareTwoNets ######################
-add_unittest_without_exec(test_CompareTwoNets
-    test_CompareTwoNets.cpp)
-add_test(NAME test_CompareTwoNets
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
-        ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
-        ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
--- a/paddle/gserver/tests/sequence_rnn_matched_inputs.py
+++ b/paddle/gserver/tests/sequence_rnn_matched_inputs.py
@@ -41,7 +41,7 @@ nonseq = embedding_layer(input=label, size=word_dim)
 # This hierarchical RNN is designed to be equivalent to the simple RNN in
-# sequence_rnn_multi_unequalength_inputs.conf
+# sequence_rnn_mixed_inputs.conf
 def outer_step(subseq, seq, nonseq, encoding):
    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)

--- a/paddle/gserver/tests/sequence_rnn_mixed_inputs.py
+++ b/paddle/gserver/tests/sequence_rnn_mixed_inputs.py
@@ -37,7 +37,7 @@ encoding = embedding_layer(input=data2, size=word_dim)
 # This hierarchical RNN is designed to be equivalent to the simple RNN in
-# sequence_rnn_multi_unequalength_inputs.conf
+# sequence_rnn_matched_inputs.conf
 def outer_step(subseq, seq, nonseq, encoding):
    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)

--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -681,12 +681,13 @@ TEST(Layer, hsigmoidLayer) {
  config.layerConfig.add_inputs();
  config.layerConfig.add_inputs();
-  // Not support GPU now
+  for (auto useGpu : {false, true}) {
    testLayerGrad(config,
                  "hsigmoid",
                  100,
-                /* trans */ false, /* useGpu */
+                  /* trans */ false,
-                false);
+                  /* useGpu */ useGpu);
+  }
 }
 TEST(Layer, multi_cross) {

--- a/paddle/math/CMakeLists.txt
+++ b/paddle/math/CMakeLists.txt
@@ -26,8 +26,6 @@ else()
 endif()
 if(MOBILE_INFERENCE)
-    list(REMOVE_ITEM MATH_SOURCES
-         ${CMAKE_CURRENT_SOURCE_DIR}/SIMDFunctions.cpp)
    # Remove sparse
    list(REMOVE_ITEM MATH_HEADERS
         ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.h

--- a/paddle/math/SIMDFunctions.h
+++ b/paddle/math/SIMDFunctions.h
@@ -116,9 +116,11 @@ inline bool vec_check(size_t len) {
 }
 namespace internal {
+#ifdef __SSE3__
 void addToImpl(float* a, const float* b, size_t len);
 void batchAddToImpl(float* a, const float* b[], int batch, size_t len);
 void colMaxImpl(float* result, const float* data, int dim, int numSamples);
+#endif
 #ifdef __AVX__
 void decayL1AvxImpl(float* dst, float* src, float lambda, size_t len);
 void decayL1AvxImpl(

--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -81,18 +81,33 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
 }
 template <>
-void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) {
+size_t Used<platform::GPUPlace>(platform::GPUPlace place) {
-  return GetGPUBuddyAllocator(place.device)->Alloc(size);
+  return GetGPUBuddyAllocator(place.device)->Used();
 }
 template <>
-void Free<platform::GPUPlace>(platform::GPUPlace place, void* p) {
+void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) {
-  GetGPUBuddyAllocator(place.device)->Free(p);
+  auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
+  auto* ptr = buddy_allocator->Alloc(size);
+  if (ptr == nullptr) {
+    int cur_dev = platform::GetCurrentDeviceId();
+    platform::SetDeviceId(place.device);
+    size_t avail, total;
+    platform::GpuMemoryUsage(avail, total);
+    LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU "
+                 << place.device << ", available " << avail << " bytes";
+    LOG(WARNING) << "total " << total;
+    LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize();
+    LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize();
+    LOG(WARNING) << "GPU memory used: " << Used<platform::GPUPlace>(place);
+    platform::SetDeviceId(cur_dev);
+  }
+  return ptr;
 }
 template <>
-size_t Used<platform::GPUPlace>(platform::GPUPlace place) {
+void Free<platform::GPUPlace>(platform::GPUPlace place, void* p) {
-  return GetGPUBuddyAllocator(place.device)->Used();
+  GetGPUBuddyAllocator(place.device)->Free(p);
 }
 #endif

--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -191,6 +191,7 @@ set(DEPS_OPS
    sum_op
    pool_op
    maxout_op
+    unpool_op
    pool_with_index_op
    conv_op
    conv_transpose_op
@@ -211,18 +212,22 @@ set(DEPS_OPS
    send_op
    recv_op)
+if(WITH_DISTRIBUTE)
 add_subdirectory(detail)
 op_library(send_op SRCS send_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
 set_source_files_properties(
    send_op.cc
    PROPERTIES
-    COMPILE_FLAGS  "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+    COMPILE_FLAGS  "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
 op_library(recv_op SRCS recv_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf)
 set_source_files_properties(
    recv_op.cc
    PROPERTIES
-    COMPILE_FLAGS  "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+    COMPILE_FLAGS  "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
+endif()
 op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
 op_library(cross_entropy_op DEPS cross_entropy)
@@ -235,6 +240,7 @@ op_library(adagrad_op DEPS selected_rows_functor)
 op_library(conv_op DEPS vol2col)
 op_library(pool_op DEPS pooling)
 op_library(maxout_op DEPS maxouting)
+op_library(unpool_op DEPS unpooling)
 op_library(pool_with_index_op DEPS pooling)
 op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
 op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op)
@@ -273,4 +279,3 @@ if(WITH_GPU)
  cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
 endif()
 cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
-cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor)
--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/operators/batch_norm_op.cc
@@ -62,13 +62,14 @@ class BatchNormOp : public framework::OperatorWithKernel {
    const auto x_dims = ctx->GetInputDim("X");
    const TensorFormat tensor_format =
        StringToTensorFormat(ctx->Attrs().Get<std::string>("tensor_format"));
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
+                   "Input X must have 2 to 5 dimensions.");
    const int C =
        (tensor_format == TensorFormat::NCHW ? x_dims[1]
                                             : x_dims[x_dims.size() - 1]);
-    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
-                   "Input X must have 3 to 5 dimensions.");
    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C);
    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
@@ -146,8 +147,8 @@ class BatchNormKernel<platform::CPUPlace, T> : public framework::OpKernel<T> {
    const auto *x = ctx.Input<Tensor>("X");
    const auto &x_dims = x->dims();
-    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
-                   "The Input dim size should be between 3 and 5");
+                   "The Input dim size should be between 2 and 5");
    const int N = x_dims[0];
    const int C =
        (tensor_format == TensorFormat::NCHW ? x_dims[1]
@@ -339,8 +340,8 @@ class BatchNormGradKernel<platform::CPUPlace, T>
    // Get the size for each dimension.
    // NCHW [batch_size, in_channels, in_height, in_width]
    const auto &x_dims = x->dims();
-    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
-                   "The Input dim size should be between 3 and 5");
+                   "The Input dim size should be between 2 and 5");
    const int N = x_dims[0];
    const int C =
        (tensor_format == TensorFormat::NCHW ? x_dims[1]

--- a/paddle/operators/batch_norm_op.cu.cc
+++ b/paddle/operators/batch_norm_op.cu.cc
@@ -29,6 +29,12 @@ void ExtractNCWHD(const framework::DDim &dims,
                  const TensorFormat &tensor_format, int *N, int *C, int *H,
                  int *W, int *D) {
  *N = dims[0];
+  if (dims.size() == 2) {
+    *C = dims[1];
+    *H = 1;
+    *W = 1;
+    *D = 1;
+  } else {
    *C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1];
    *H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1];
    *W = dims.size() > 3
@@ -37,6 +43,7 @@ void ExtractNCWHD(const framework::DDim &dims,
    *D = dims.size() > 4
             ? (tensor_format == TensorFormat::NCHW ? dims[4] : dims[3])
             : 1;
+  }
 }
 template <typename T>
@@ -56,8 +63,8 @@ class BatchNormKernel<platform::GPUPlace, T> : public framework::OpKernel<T> {
    // NCHW [batch_size, in_channels, in_height, in_width]
    const auto *x = ctx.Input<Tensor>("X");
    const auto &x_dims = x->dims();
-    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
-                   "The Input dim size should be between 3 and 5");
+                   "The Input dim size should be between 2 and 5");
    int N, C, H, W, D;
    ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D);
@@ -180,8 +187,8 @@ class BatchNormGradKernel<platform::GPUPlace, T>
    const auto &x_dims = x->dims();
-    PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5,
+    PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
-                   "The Input dim size should be between 3 and 5");
+                   "The Input dim size should be between 2 and 5");
    int N, C, H, W, D;
    ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D);

--- a/paddle/operators/concat_op.cc
+++ b/paddle/operators/concat_op.cc
@@ -25,7 +25,7 @@ class ConcatOp : public framework::OperatorWithKernel {
  void InferShape(framework::InferShapeContext *ctx) const override {
    PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL,
-                      "Inputs(X) of ConcatOp should be empty.")
+                      "Inputs(X) of ConcatOp should be empty.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of ConcatOp should not be null.");
@@ -45,7 +45,7 @@ class ConcatOp : public framework::OperatorWithKernel {
        }
        PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
                          "Input tensors should have the same "
-                          "elements except the specify axis.")
+                          "elements except the specify axis.");
      }
    }
    ctx->SetOutputDim("Out", out_dims);

--- a/paddle/operators/conv_cudnn_op.cu.cc
+++ b/paddle/operators/conv_cudnn_op.cu.cc
@@ -63,7 +63,7 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
    cudnnConvolutionDescriptor_t cudnn_conv_desc =
        conv_desc.descriptor<T>(paddings, strides, dilations);
-#if CUDNN_VERSION_MIN(7, 0, 0)
+#if CUDNN_VERSION_MIN(7, 0, 1)
    // cudnn 7 can support groups, no need to do it mannually
    // FIXME(typhoonzero): find a better way to disable groups
    // rather than setting it to 1.
@@ -180,7 +180,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
    cudnnConvolutionDescriptor_t cudnn_conv_desc =
        conv_desc.descriptor<T>(paddings, strides, dilations);
-#if CUDNN_VERSION_MIN(7, 0, 0)
+#if CUDNN_VERSION_MIN(7, 0, 1)
    // cudnn 7 can support groups, no need to do it mannually
    // FIXME(typhoonzero): find a better way to disable groups
    // rather than setting it to 1.

--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
@@ -97,7 +97,7 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
      .SetDefault({0, 0});
  AddAttr<int>(
      "groups",
-      "(int default:1), the group size of convolution operator. "
+      "(int default:1), the groups number of the convolution operator. "
      "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
      "when group=2, the first half of the filters is only connected to the "
      "first half of the input channels, while the second half of the filters "
@@ -112,23 +112,29 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto,
 Convolution Operator.
 The convolution operation calculates the output based on the input, filter
-and strides, paddings, groups, dilations parameters. The size of each dimension of the
+and strides, paddings, dilations, groups parameters. The size of each dimension of the
 parameters is checked in the infer-shape.
-Input(Input, Filter) and output(Output) are in NCHW format. Where N is batch
+Input(Input) and Output(Output) are in NCHW format. Where N is batch
 size, C is the number of channels, H is the height of the feature, and W is
-the width of the feature. Parameters(ksize, strides, paddings, dilations) are two elements.
+the width of the feature.
-These two elements represent height and width, respectively.
+Filters(Input) is MCHW format. Where M is the number of output image channels, C is
+the number of input image channels, H is the height of the filter, and W
+is the width of the filter.
+Parameters(strides, paddings, dilations) are two elements. These two elements represent
+height and width, respectively.
 The input(X) size and output(Out) size may be different.
 Example:
  Input:
-       Input shape: (N, C_in, H_in, W_in)
+       Input shape: $(N, C_{in}, H_{in}, W_{in})$
-       Filter shape: (C_out, C_in, H_f, W_f)
+       Filter shape: $(C_{out}, C_{in}, H_f, W_f)$
  Output:
-       Output shape: (N, C_out, H_out, W_out)
+       Output shape: $(N, C_{out}, H_{out}, W_{out})$
-  where
+  Where
-       H_out = (H_in + 2 * paddings[0] - (dilations[0]*(filter_size[0] - 1) + 1)) / strides[0] + 1;
+$$
-       W_out = (W_in + 2 * paddings[1] - (dilations[1]*(filter_size[1] - 1) + 1)) / strides[1] + 1;
+       H_{out}= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 \\
+       W_{out}= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1
+$$
 )DOC");
 }
@@ -165,7 +171,7 @@ Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto,
      .SetDefault({0, 0, 0});
  AddAttr<int>(
      "groups",
-      "(int default:1), the group size of convolution operator. "
+      "(int default:1), the groups number of the convolution operator. "
      "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: "
      "when group=2, the first half of the filters is only connected to the "
      "first half of the input channels, while the second half of the filters "
@@ -174,32 +180,37 @@ Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto,
  AddAttr<std::vector<int>>("dilations",
                            "(vector<int> default:{1, 1, 1}), the "
                            "dilations(d_dilation, h_dilation, w_dilation) of "
-                            "convolution operator. Currently, conv3d doesn't "
+                            "convolution operator.")
-                            "support dilation.")
      .SetDefault({1, 1, 1});
  AddComment(R"DOC(
 Convolution3D Operator.
 The convolution operation calculates the output based on the input, filter
-and strides, paddings, groups parameters. The size of each dimension of the
+and strides, paddings, dilations, groups parameters. The size of each dimension of the
 parameters is checked in the infer-shape.
-Input(Input, Filter) and output(Output) are in NCDHW format. Where N is batch
+Input(Input) and output(Output) are in NCDHW format, where N is batch
 size, C is the number of channels,D is the depth of the feature, H is the height of
-the feature, and W is the width of the feature. Parameters(ksize, strides, paddings)
+the feature, and W is the width of the feature.
-are three elements. These three elements represent depth, height and width, respectively.
+Filters(Input) is MCDHW format, where M is the number of output image channels,
+C is the number of input image channels, D is the depth of the filter,
+H is the height of the filter, and W is the width of the filter.
+Parameters(strides, paddings, dilations) are three elements. These three elements
+represent depth, height and width, respectively.
 The input(X) size and output(Out) size may be different.
 Example:
  Input:
-       Input shape: (N, C_in, D_in, H_in, W_in)
+       Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$
-       Filter shape: (C_out, C_in, D_f, H_f, W_f)
+       Filter shape: $(C_{out}, C_{in}, D_f, H_f, W_f)$
  Output:
-       Output shape: (N, C_out, D_out, H_out, W_out)
+       Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$
-  where
+  Where
-       D_out = (D_in - filter_size[0] + 2 * paddings[0]) / strides[0] + 1;
+  $$
-       H_out = (H_in - filter_size[1] + 2 * paddings[1]) / strides[1] + 1;
+       D_{out}= \frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{ strides[0]}+ 1 \\
-       W_out = (W_in - filter_size[2] + 2 * paddings[2]) / strides[2] + 1;
+       H_{out}= \frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{ strides[1]}+ 1 \\
+       W_{out}= \frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{ strides[2]}+ 1
+  $$
 )DOC");
 }

--- a/paddle/operators/conv_transpose_op.cc
+++ b/paddle/operators/conv_transpose_op.cc
@@ -39,7 +39,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
                 "ConvTransposeOp input dimension and strides dimension should "
                 "be consistent.");
  PADDLE_ENFORCE_EQ(paddings.size(), strides.size(),
-                    "ConvTransposeOp paddings dimension and Conv strides "
+                    "ConvTransposeOp paddings dimension and strides "
                    "dimension should be the same.");
  PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0],
                    "In ConvTransposeOp, The input channel should be the same "
@@ -62,24 +62,25 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker(
      "The format of input tensor is NCHW. Where N is batch size, C is the "
      "number of input channels, H is the height of the feature, and "
      "W is the width of the feature.");
-  AddInput("Filter",
+  AddInput(
+      "Filter",
      "(Tensor) The filter tensor of convolution transpose operator. "
-           "The format of the filter tensor is CMHW, where C is the number of "
+      "The format of the filter tensor is MCHW, where M is the number of "
-           "output image channels, M is the number of input image channels, "
+      "input feature channels, C is the number of "
+      "output feature channels,"
      "H is the height of the filter, and W is the width of the filter. "
-           "We enforce groups number == 1 and padding == 0 in "
+      "We enforce groups number == 1 in the convolution transpose scenario.");
-           "the convolution transpose scenario.");
  AddOutput("Output",
            "(Tensor) The output tensor of convolution transpose operator. "
            "The format of output tensor is also NCHW.");
  AddAttr<std::vector<int>>(
      "strides",
-      "(vector<int> defalut:{1, 1}), the strides(h_stride, w_stride) of "
+      "(vector<int> default:{1, 1}), the strides(h_stride, w_stride) of "
      "convolution transpose operator.")
      .SetDefault({1, 1});
  AddAttr<std::vector<int>>(
      "paddings",
-      "(vector<int> defalut:{0, 0}), the paddings(h_pad, w_pad) of convolution "
+      "(vector<int> default:{0, 0}), the paddings(h_pad, w_pad) of convolution "
      "transpose operator.")
      .SetDefault({0, 0});
  AddComment(R"DOC(
@@ -88,21 +89,26 @@ Convolution2D Transpose Operator.
 The convolution transpose operation calculates the output based on the input, filter
 and strides, paddings, groups parameters. The size of each dimension of the
 parameters is checked in the infer-shape.
+Input(Input) and output(Output) are in NCHW format. Where N is batchsize, C is the
-Input(Input, Filter) and output(Output) are in NCHW format. Where N is batch
+number of channels, H is the height of the feature, and W is the width of the feature.
-size, C is the number of channels, H is the height of the feature, and 
+Filter(Input) is in MCHW format. Where M is the number of input feature channels,
-W is the width of the feature. Parameters(ksize, strides, paddings) are two elements.
+C is the number of output feature channels, H is the height of the filter,
-These two elements represent height and width, respectively.
+and W is the width of the filter.
+Parameters(strides, paddings) are two elements. These two elements represent height
+and width, respectively.
 The input(X) size and output(Out) size may be different.
 Example:
  Input:
-       Input shape: (N, C_in, H_in, W_in)
+       Input shape: $(N, C_{in}, H_{in}, W_{in})$
-       Filter shape: (C_in, C_out, H_f, W_f)
+       Filter shape: $(C_{in}, C_{out}, H_f, W_f)$
  Output:
-       Output shape: (N, C_out, H_out, W_out)
+       Output shape: $(N, C_{out}, H_{out}, W_{out})$
-  where
+  Where
-       H_out = (H_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0];
+  $$
-       W_out = (W_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1];
+       H_{out} = (H_{in} - 1) * strides[0] - 2 * paddings[0] + H_f \\
+       W_{out} = (W_{in} - 1) * strides[1] - 2 * paddings[1] + W_f
+  $$
 )DOC");
 }
@@ -117,8 +123,9 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
           "W is the width of the feature.");
  AddInput("Filter",
           "(Tensor) The filter tensor of convolution transpose operator."
-           "The format of the filter tensor is CMDHW, where C is the number of "
+           "The format of the filter tensor is MCDHW, where M is the number of "
-           "output image channels, M is the number of input image channels, D "
+           "input feature channels, C is the number of "
+           "output feature channels, D "
           "is the depth of the filter, H is the height of the filter, and "
           "W is the width of the filter."
           "We enforce groups number == 1 and padding == 0 in "
@@ -130,12 +137,12 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker(
            "the number of channels, D is the depth of the feature, H is the "
            "height of the feature, and W is the width of the feature.");
  AddAttr<std::vector<int>>("strides",
-                            "(vector<int> defalut:{1, 1, 1}), the "
+                            "(vector<int> default:{1, 1, 1}), the "
                            "strides{d_stride, h_stride, w_stride} of "
                            "convolution transpose operator.")
      .SetDefault({1, 1, 1});
  AddAttr<std::vector<int>>("paddings",
-                            "(vector<int> defalut:{0, 0, 0}), paddings(d_pad, "
+                            "(vector<int> default:{0, 0, 0}), paddings(d_pad, "
                            "h_pad, w_pad) of convolution transpose operator.")
      .SetDefault({0, 0, 0});
  AddComment(R"DOC(
@@ -144,23 +151,28 @@ Convolution3D Transpose Operator.
 The convolution transpose operation calculates the output based on the input, filter
 and strides, paddings, groups parameters. The size of each dimension of the
 parameters is checked in the infer-shape.
+Input(Input) and output(Output) are in NCDHW format. Where N is batch size, C is the
-Input(Input, Filter) and output(Output) are in NCDHW format. Where N is batch
+number of channels, D is the depth of the feature, H is the height of the feature,
-size, C is the number of channels, D is the depth of the feature, 
+and W is the width of the feature.
-H is the height of the feature, and W is the width of the feature. 
+Filter(Input) is in MCDHW format. Where M is the number of input feature channels,
-Parameters(ksize, strides, paddings) are three elements.
+C is the number of output feature channels, D is the depth of the filter,H is the
-These three elements represent depth, height and width, respectively.
+height of the filter, and W is the width of the filter.
+Parameters(strides, paddings) are three elements. These three elements represent
+depth, height and width, respectively.
 The input(X) size and output(Out) size may be different.
 Example:   
  Input:
-       Input shape: (N, C_in, D_in, H_in, W_in)
+       Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$
-       Filter shape: (C_in, C_out, D_f, H_f, W_f)
+       Filter shape: $(C_{in}, C_{out}, D_f, H_f, W_f)$
  Output:
-       Output shape: (N, C_out, D_out, H_out, W_out)
+       Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$
-  where
+  Where
-       D_out = (D_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0];
+  $$
-       H_out = (H_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1];
+       D_{out} = (D_{in} - 1) * strides[0] - 2 * paddings[0] + D_f \\
-       W_out = (W_in - 1) * strides[2] - 2 * paddings[2] + filter_size[2];
+       H_{out} = (H_{in} - 1) * strides[1] - 2 * paddings[1] + H_f \\
+       W_{out} = (W_{in} - 1) * strides[2] - 2 * paddings[2] + W_f
+  $$
 )DOC");
 }

--- a/paddle/operators/conv_transpose_op.h
+++ b/paddle/operators/conv_transpose_op.h
@@ -63,7 +63,6 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
-    // TODO(Zhuoyuan): Paddings can be added in future.
    // groups will alway be disabled in conv2dtranspose.
    const int batch_size = static_cast<int>(input->dims()[0]);

--- a/paddle/operators/detail/send_recv.proto
+++ b/paddle/operators/detail/send_recv.proto
@@ -32,6 +32,4 @@ message VariableMessage {
  bytes serialized = 2;
 }
-message VoidMessage {
+message VoidMessage {}
-}
\ No newline at end of file
--- a/paddle/operators/elementwise_add_op.h
+++ b/paddle/operators/elementwise_add_op.h
@@ -19,11 +19,48 @@
 namespace paddle {
 namespace operators {
+template <typename T>
+struct AddFunctor {
+  HOSTDEVICE T operator()(T a, T b) const { return a + b; }
+};
 template <typename Place, typename T>
 class ElementwiseAddKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
-    ElementwiseCompute<EigenAddFunctor, Place, T>(ctx);
+    using Tensor = framework::Tensor;
+    auto* x = ctx.Input<Tensor>("X");
+    auto* y = ctx.Input<Tensor>("Y");
+    auto* z = ctx.Output<Tensor>("Out");
+    z->mutable_data<T>(ctx.GetPlace());
+    TransformFunctor<AddFunctor<T>, T, Place> functor(
+        x, y, z, ctx.device_context(), AddFunctor<T>());
+    auto x_dims = x->dims();
+    auto y_dims = y->dims();
+    PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
+                      "Rank of first input must >= rank of second input.");
+    if (x_dims == y_dims) {
+      functor.Run();
+      return;
+    }
+    int axis = ctx.Attr<int>("axis");
+    axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
+    PADDLE_ENFORCE(axis >= 0 && axis < x_dims.size(),
+                   "Axis should be in range [0, x_dims)");
+    int pre, n, post;
+    get_mid_dims(x_dims, y_dims, axis, pre, n, post);
+    if (post == 1) {
+      functor.RunRowWise(n, pre);
+      return;
+    } else {
+      functor.RunMidWise(n, pre, post);
+      return;
+    }
  }
 };

--- a/paddle/operators/elementwise_op.h
+++ b/paddle/operators/elementwise_op.h
@@ -35,7 +35,7 @@ class ElementwiseOp : public framework::OperatorWithKernel {
    auto x_dim = ctx->GetInputDim("X");
    auto y_dim = ctx->GetInputDim("Y");
    PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
-                      "Rank of first input must >= rank of second input.")
+                      "Rank of first input must >= rank of second input.");
    ctx->SetOutputDim("Out", x_dim);
    ctx->ShareLoD("X", /*->*/ "Out");
  }
@@ -120,7 +120,7 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
    auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
    PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
-                      "Rank of first input must >= rank of second input.")
+                      "Rank of first input must >= rank of second input.");
    auto x_grad_name = framework::GradVarName("X");
    auto y_grad_name = framework::GradVarName("Y");

--- a/paddle/operators/elementwise_op_function.h
+++ b/paddle/operators/elementwise_op_function.h
@@ -16,6 +16,11 @@
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
+#include "paddle/platform/transform.h"
+#ifdef __NVCC__
+#include <thrust/iterator/iterator_adaptor.h>
+#endif
 #include "paddle/operators/math/math_function.h"
@@ -54,6 +59,153 @@ inline void get_mid_dims(const framework::DDim& x_dims,
  }
 }
+template <typename T, typename Place>
+class RowwiseTransformIterator;
+template <typename T, typename Place>
+class MidWiseTransformIterator;
+template <typename T>
+class RowwiseTransformIterator<T, platform::CPUPlace> {
+ public:
+  RowwiseTransformIterator(const T* ptr, int n) : ptr_(ptr), i_(0), n_(n) {}
+  RowwiseTransformIterator<T, platform::CPUPlace>& operator++() {
+    ++i_;
+    i_ %= n_;
+    return *this;
+  }
+  bool operator==(
+      const RowwiseTransformIterator<T, platform::CPUPlace>& rhs) const {
+    return (ptr_ + i_) == &(*rhs);
+  }
+  bool operator!=(
+      const RowwiseTransformIterator<T, platform::CPUPlace>& rhs) const {
+    return (ptr_ + i_) != &(*rhs);
+  }
+  const T& operator*() { return ptr_[i_]; }
+ private:
+  const T* ptr_;
+  int i_;
+  int64_t n_;
+};
+template <typename T>
+class MidWiseTransformIterator<T, platform::CPUPlace> {
+ public:
+  MidWiseTransformIterator(const T* ptr, int n, int post)
+      : ptr_(ptr), i_(0), j_(0), n_(n), post_(post) {}
+  MidWiseTransformIterator<T, platform::CPUPlace>& operator++() {
+    i_ = (++j_ / post_) % n_;
+    return *this;
+  }
+  bool operator==(
+      const MidWiseTransformIterator<T, platform::CPUPlace>& rhs) const {
+    return (ptr_ + i_) == &(*rhs);
+  }
+  bool operator!=(
+      const MidWiseTransformIterator<T, platform::CPUPlace>& rhs) const {
+    return (ptr_ + i_) != &(*rhs);
+  }
+  const T& operator*() { return ptr_[i_]; }
+ private:
+  const T* ptr_;
+  int i_;
+  int64_t j_;
+  int64_t n_;
+  int post_;
+};
+#ifdef __NVCC__
+template <typename T>
+class RowwiseTransformIterator<T, platform::GPUPlace>
+    : public thrust::iterator_adaptor<
+          RowwiseTransformIterator<T, platform::GPUPlace>, const T*> {
+ public:
+  typedef thrust::iterator_adaptor<
+      RowwiseTransformIterator<T, platform::GPUPlace>, const T*>
+      super_t;
+  HOSTDEVICE RowwiseTransformIterator(const T* x, int n)
+      : super_t(x), begin_(x), n_(n){};
+  friend class thrust::iterator_core_access;
+ private:
+  unsigned int n_;
+  const T* begin_;
+  HOSTDEVICE typename super_t::reference dereference() const {
+    return *(begin_ + (this->base() - begin_) % n_);
+  }
+};
+template <typename T>
+class MidWiseTransformIterator<T, platform::GPUPlace>
+    : public thrust::iterator_adaptor<
+          MidWiseTransformIterator<T, platform::GPUPlace>, const T*> {
+ public:
+  typedef thrust::iterator_adaptor<
+      MidWiseTransformIterator<T, platform::GPUPlace>, const T*>
+      super_t;
+  HOSTDEVICE MidWiseTransformIterator(const T* x, int n, int post)
+      : super_t(x), begin_(x), n_(n), post_(post){};
+  friend class thrust::iterator_core_access;
+ private:
+  unsigned int post_;
+  unsigned int n_;
+  const T* begin_;
+  HOSTDEVICE typename super_t::reference dereference() const {
+    return *(begin_ + (((this->base() - begin_) / post_) % n_));
+  }
+};
+#endif
+template <typename Functor, typename T, typename Place>
+class TransformFunctor {
+ public:
+  TransformFunctor(const framework::Tensor* x, const framework::Tensor* y,
+                   framework::Tensor* z, const platform::DeviceContext& ctx,
+                   Functor func)
+      : x_(x->data<T>()),
+        y_(y->data<T>()),
+        z_(z->mutable_data<T>(ctx.GetPlace())),
+        nx_(x->numel()),
+        ctx_(ctx),
+        func_(func) {}
+  inline void Run() const {
+    platform::Transform<Place> trans;
+    trans(ctx_, x_, x_ + nx_, y_, z_, func_);
+  }
+  inline void RunRowWise(int n, int pre) const {
+    platform::Transform<Place> trans;
+    trans(ctx_, x_, x_ + nx_, RowwiseTransformIterator<T, Place>(y_, n), z_,
+          func_);
+  }
+  inline void RunMidWise(int n, int pre, int post) const {
+    platform::Transform<Place> trans;
+    trans(ctx_, x_, x_ + nx_, MidWiseTransformIterator<T, Place>(y_, n, post),
+          z_, func_);
+  }
+ private:
+  const T* x_;
+  const T* y_;
+  T* z_;
+  int64_t nx_;
+  const platform::DeviceContext& ctx_;
+  Functor func_;
+};
 #define EIGEN_FUNCTOR(name, eigen_op)                                          \
  struct Eigen##name##Functor {                                                \
    template <typename Place, typename T>                                      \
@@ -106,7 +258,7 @@ void ElementwiseCompute(const framework::ExecutionContext& ctx) {
  auto x_dims = x->dims();
  auto y_dims = y->dims();
  PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(),
-                    "Rank of first input must >= rank of second input.")
+                    "Rank of first input must >= rank of second input.");
  if (x_dims == y_dims) {
    functor f;

--- a/paddle/operators/gru_op.h
+++ b/paddle/operators/gru_op.h
@@ -71,8 +71,8 @@ class GRUKernel : public framework::OpKernel<T> {
    int frame_size = hidden_dims[1];
    math::hl_gru_value<T> gru_value;
-    gru_value.gateWeight = const_cast<T*>(weight_data);
+    gru_value.gate_weight = const_cast<T*>(weight_data);
-    gru_value.stateWeight =
+    gru_value.state_weight =
        const_cast<T*>(weight_data + 2 * frame_size * frame_size);
    Tensor ordered_h0;
    const size_t* order = batch_gate->lod()[2].data();
@@ -82,9 +82,9 @@ class GRUKernel : public framework::OpKernel<T> {
      // to reorder.
      ReorderInitState<Place, T>(context.device_context(), *h0, order,
                                 &ordered_h0, true);
-      gru_value.prevOutValue = ordered_h0.data<T>();
+      gru_value.prev_out_value = ordered_h0.data<T>();
    } else {
-      gru_value.prevOutValue = nullptr;
+      gru_value.prev_out_value = nullptr;
    }
    auto batch_starts = batch_gate->lod()[0];
    size_t num_batch = batch_starts.size() - 1;
@@ -96,14 +96,14 @@ class GRUKernel : public framework::OpKernel<T> {
      Tensor gate_t = batch_gate->Slice(bstart, bend);
      Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
      Tensor hidden_t = batch_hidden->Slice(bstart, bend);
-      gru_value.outputValue = hidden_t.data<T>();
+      gru_value.output_value = hidden_t.data<T>();
-      gru_value.gateValue = gate_t.data<T>();
+      gru_value.gate_value = gate_t.data<T>();
-      gru_value.resetOutputValue = reset_hidden_prev_t.data<T>();
+      gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
      math::GRUUnitFunctor<Place, T>::compute(
          dev_ctx, gru_value, frame_size, cur_batch_size,
          math::ActiveType(context.Attr<std::string>("activation")),
          math::ActiveType(context.Attr<std::string>("gate_activation")));
-      gru_value.prevOutValue = gru_value.outputValue;
+      gru_value.prev_out_value = gru_value.output_value;
    }
    math::Batch2LoDTensorFunctor<Place, T> to_seq;
@@ -169,20 +169,20 @@ class GRUGradKernel : public framework::OpKernel<T> {
    to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse);
    math::hl_gru_value<T> gru_value;
-    gru_value.gateWeight = const_cast<T*>(weight_data);
+    gru_value.gate_weight = const_cast<T*>(weight_data);
-    gru_value.stateWeight =
+    gru_value.state_weight =
        const_cast<T*>(weight_data + 2 * frame_size * frame_size);
    math::hl_gru_grad<T> gru_grad;
    if (weight_grad) {
-      gru_grad.gateWeightGrad =
+      gru_grad.gate_weight_grad =
          weight_grad->mutable_data<T>(context.GetPlace());
      zero(dev_ctx, weight_grad, static_cast<T>(0.0));
-      gru_grad.stateWeightGrad =
+      gru_grad.state_weight_grad =
          weight_grad->data<T>() + 2 * frame_size * frame_size;
    } else {
-      gru_grad.gateWeightGrad = nullptr;
+      gru_grad.gate_weight_grad = nullptr;
-      gru_grad.stateWeightGrad = nullptr;
+      gru_grad.state_weight_grad = nullptr;
    }
    auto batch_starts = batch_hidden_grad.lod()[0];
@@ -193,27 +193,27 @@ class GRUGradKernel : public framework::OpKernel<T> {
      int cur_batch_size = bend - bstart;
      Tensor gate_t = batch_gate->Slice(bstart, bend);
-      gru_value.gateValue = gate_t.data<T>();
+      gru_value.gate_value = gate_t.data<T>();
      Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
-      gru_value.resetOutputValue = reset_hidden_prev_t.data<T>();
+      gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
      Tensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend);
-      gru_grad.outputGrad = hidden_grad_t.data<T>();
+      gru_grad.output_grad = hidden_grad_t.data<T>();
      Tensor gate_grad_t = batch_gate_grad.Slice(bstart, bend);
-      gru_grad.gateGrad = gate_grad_t.data<T>();
+      gru_grad.gate_grad = gate_grad_t.data<T>();
      Tensor reset_hidden_prev_grad_t =
          batch_reset_hidden_prev_grad.Slice(bstart, bend);
-      gru_grad.resetOutputGrad = reset_hidden_prev_grad_t.data<T>();
+      gru_grad.reset_output_grad = reset_hidden_prev_grad_t.data<T>();
      if (n == 0) {
-        gru_value.prevOutValue = h0 ? ordered_h0.data<T>() : nullptr;
+        gru_value.prev_out_value = h0 ? ordered_h0.data<T>() : nullptr;
-        gru_grad.prevOutGrad =
+        gru_grad.prev_out_grad =
            h0 && h0_grad ? ordered_h0_grad.data<T>() : nullptr;
      } else {
        int bstart_pre = static_cast<int>(batch_starts[n - 1]);
        Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart);
-        gru_value.prevOutValue = hidden_prev_t.data<T>();
+        gru_value.prev_out_value = hidden_prev_t.data<T>();
        Tensor hidden_prev_grad_t = batch_hidden_grad.Slice(bstart_pre, bstart);
-        gru_grad.prevOutGrad = hidden_prev_grad_t.data<T>();
+        gru_grad.prev_out_grad = hidden_prev_grad_t.data<T>();
      }
      math::GRUUnitGradFunctor<Place, T>::compute(

--- a/paddle/operators/hinge_loss_op.cc
+++ b/paddle/operators/hinge_loss_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/hinge_loss_op.h"
+namespace paddle {
+namespace operators {
+class HingeLossOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Logits"),
+                   "Input(Logits) must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) must be initialized.");
+    auto pred_dims = ctx->GetInputDim("Logits");
+    auto label_dims = ctx->GetInputDim("Labels");
+    PADDLE_ENFORCE_EQ(pred_dims, label_dims);
+    PADDLE_ENFORCE_EQ(pred_dims.size(), 2,
+                      "The rank of Input(Logits) must be 2 and the shape is "
+                      "[batch_size, 1].");
+    PADDLE_ENFORCE_EQ(pred_dims[1], 1,
+                      "Each row of Input(Logits) contains a real value, "
+                      "so the 2nd dimension of Input(Logits) must be 1.");
+    ctx->SetOutputDim("Loss", {pred_dims[0], 1});
+    ctx->ShareLoD("Logits", "Loss");
+  }
+};
+template <typename AttrType>
+class HingeLossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  HingeLossOpMaker(framework::OpProto* proto,
+                   framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Logits",
+             "The input value (Logits) of Hinge loss op."
+             "Logits is a 2-D tensor with shape [batch_size, 1].");
+    AddInput("Labels",
+             "The target value (Labels) of Hinge loss op."
+             "Labels is a 2-D tensor with shape [batch_size, 1].");
+    AddOutput("Loss",
+              "The output tensor with shape [batch_size, 1] "
+              "which represents the hinge loss.");
+    AddComment(R"DOC(
+HingeLoss Operator.
+Let x be a logit (prediction) and y be the actual label. The logit can
+take any values from (-inf, inf), but the labels should be either -1 or 1.
+Then, the hinge loss is computed as follows:
+$$
+L_(x, y) = max(1 - y.x, 0) 
+$$
+Note that the labels passed as input will have values as either 0 or 1.
+)DOC");
+  }
+};
+class HingeLossGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Logits"),
+                   "Input(Logits) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
+                   "Input(Loss@GRAD) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")),
+                   "Input(Logits@GRAD) should not be null.");
+    auto pred_dims = ctx->GetInputDim("Logits");
+    auto lab_dims = ctx->GetInputDim("Labels");
+    auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));
+    PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims);
+    auto pred_grad_name = framework::GradVarName("Logits");
+    ctx->SetOutputDim(pred_grad_name, pred_dims);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP(hinge_loss, ops::HingeLossOp, ops::HingeLossOpMaker<float>,
+            hinge_loss_grad, ops::HingeLossGradOp);
+REGISTER_OP_CPU_KERNEL(hinge_loss,
+                       ops::HingeLossKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    hinge_loss_grad,
+    ops::HingeLossGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/hinge_loss_op.cu
+++ b/paddle/operators/hinge_loss_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#define EIGEN_USE_GPU
+#include "paddle/operators/hinge_loss_op.h"
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(hinge_loss,
+                       ops::HingeLossKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    hinge_loss_grad,
+    ops::HingeLossGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/hinge_loss_op.h
+++ b/paddle/operators/hinge_loss_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+template <typename Place, typename T, typename AttrType = T>
+class HingeLossKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* pred = context.Input<framework::Tensor>("Logits");
+    auto* label = context.Input<framework::Tensor>("Labels");
+    auto* loss = context.Output<framework::Tensor>("Loss");
+    auto place = context.GetEigenDevice<Place>();
+    auto x = framework::EigenVector<T>::Flatten(*pred);
+    auto y = framework::EigenVector<T>::Flatten(*label);
+    loss->mutable_data<T>(context.GetPlace());
+    auto l = framework::EigenVector<T>::Flatten(*loss);
+    l.device(place) =
+        (static_cast<T>(1) - x * (static_cast<T>(2) * y - static_cast<T>(1)))
+            .cwiseMax(static_cast<T>(0));
+  }
+};
+template <typename Place, typename T, typename AttrType = T>
+class HingeLossGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* pred = context.Input<framework::Tensor>("Logits");
+    auto* label = context.Input<framework::Tensor>("Labels");
+    auto* dloss =
+        context.Input<framework::Tensor>(framework::GradVarName("Loss"));
+    auto* dpred =
+        context.Output<framework::Tensor>(framework::GradVarName("Logits"));
+    auto place = context.GetEigenDevice<Place>();
+    auto x = framework::EigenVector<T>::Flatten(*pred);
+    auto y = framework::EigenVector<T>::Flatten(*label);
+    auto dl = framework::EigenVector<T>::Flatten(*dloss);
+    if (dpred) {
+      dpred->mutable_data<T>(context.GetPlace());
+      auto dx = framework::EigenVector<T>::Flatten(*dpred);
+      auto alt_labels = static_cast<T>(2) * y - static_cast<T>(1);
+      dx.device(place) =
+          dl * ((x * alt_labels) < static_cast<T>(1)).template cast<T>() *
+          (-alt_labels);
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/increment_op.cc
+++ b/paddle/operators/increment_op.cc
@@ -61,6 +61,8 @@ class IncrementOp : public framework::OperatorBase {
    out.Resize(x.dims());
    out.mutable_data(x.place(), x.type());
    float value = Attr<float>("step");
+    VLOG(10) << Output("Out") << " increase " << Input("X") << " with "
+             << value;
    framework::VisitDataType(framework::ToDataType(out.type()),
                             IncrementFunctor(x, &out, value));
  }

--- a/paddle/operators/lod_tensor_to_array_op.cc
+++ b/paddle/operators/lod_tensor_to_array_op.cc
@@ -14,6 +14,7 @@
 #include "paddle/framework/lod_rank_table.h"
 #include "paddle/framework/lod_tensor_array.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/detail/safe_ref.h"
 namespace paddle {
 namespace operators {
@@ -32,15 +33,20 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
      : OperatorBase(type, inputs, outputs, attrs) {}
  void Run(const framework::Scope &scope,
           const platform::DeviceContext &dev_ctx) const override {
-    auto &x = scope.FindVar(Input("X"))->Get<framework::LoDTensor>();
+    auto &x = detail::Ref(scope.FindVar(Input("X")), "Cannot find input %s",
-    auto &rank_table =
+                          Input("X"))
-        scope.FindVar(Input("RankTable"))->Get<framework::LoDRankTable>();
+                  .Get<framework::LoDTensor>();
-    auto &out =
+    auto &rank_table = detail::Ref(scope.FindVar(Input("RankTable")))
-        *scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensorArray>();
+                           .Get<framework::LoDRankTable>();
+    auto &out = *detail::Ref(scope.FindVar(Output("Out")))
+                     .GetMutable<framework::LoDTensorArray>();
    auto &items = rank_table.items();
    auto max_seq_len = items[0].length;
    auto rank_level = rank_table.level();
+    PADDLE_ENFORCE_LT(rank_level, x.lod().size(),
+                      "Input should be a LOD tensor, and size is at least %d",
+                      rank_level + 1);
    out.resize(max_seq_len);
    std::vector<std::vector<CopyRange>> copy_ranges(max_seq_len);
@@ -55,16 +61,13 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
        size_t start_idx = x.lod()[rank_level][item.index] + t;
        auto lod_and_offset = framework::GetSubLoDAndAbsoluteOffset(
            x.lod(), start_idx, start_idx + 1, rank_level + 1);
        auto &lod_length = lod_and_offset.first;
        framework::AppendLoD(&lod, lod_length);
        size_t start_offset = lod_and_offset.second.first;
        size_t end_offset = lod_and_offset.second.second;
        copy_ranges[t].emplace_back(CopyRange{start_offset, end_offset});
      }
    }
    for (size_t i = 0; i < max_seq_len; ++i) {
      auto &ranges = copy_ranges[i];
      size_t height = std::accumulate(

--- a/paddle/operators/log_loss_op.cc
+++ b/paddle/operators/log_loss_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/log_loss_op.h"
+namespace paddle {
+namespace operators {
+class LogLossOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Predicted"),
+                   "Input(Predicted) must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) must be initialized.");
+    auto pred_dims = ctx->GetInputDim("Predicted");
+    auto label_dims = ctx->GetInputDim("Labels");
+    PADDLE_ENFORCE_EQ(pred_dims, label_dims);
+    PADDLE_ENFORCE_EQ(pred_dims.size(), 2,
+                      "The rank of Input(Predicted) must be 2 and the shape is "
+                      "[batch_size, 1].");
+    PADDLE_ENFORCE_EQ(pred_dims[1], 1,
+                      "Each row of Input(Predicted) contains a real value, "
+                      "so the 2nd dimension of Input(X) must be 1.");
+    ctx->SetOutputDim("Loss", {pred_dims[0], 1});
+    ctx->ShareLoD("Predicted", "Loss");
+  }
+};
+template <typename AttrType>
+class LogLossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LogLossOpMaker(framework::OpProto* proto,
+                 framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Predicted",
+             "The input value (Predicted) of Log loss op."
+             "Predicted is a 2-D tensor with shape [batch_size, 1].");
+    AddInput("Labels",
+             "The target value (Labels) of Log loss op."
+             "Labels is a 2-D tensor with shape [batch_size, 1].");
+    AddOutput("Loss",
+              "The output tensor with shape [batch_size, 1] "
+              "which represents the log loss.");
+    AddAttr<AttrType>("epsilon", "Epsilon in log loss.");
+    AddComment(R"DOC(
+LogLoss Operator.
+Log loss is a loss function used for binary classification. Log Loss quantifies
+the accuracy of a classifier by penalising false classifications. Minimising the
+Log Loss is equivalent to maximising the accuracy of the classifier. We define
+Predicted as the values predicted by our model and Labels as the target ground
+truth value. Log loss can evaluate how close the predicted values are to the
+target. The shapes of Predicted and Labels are both [batch_size, 1].
+The equation is:
+$$
+Loss = - Labels * log(Predicted + \epsilon) -
+        (1 - Labels) * log(1 - Predicted + \epsilon)
+$$
+)DOC");
+  }
+};
+class LogLossGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Predicted"),
+                   "Input(Predicted) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
+                   "Input(Loss@GRAD) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Predicted")),
+                   "Output(Predicted@GRAD) should not be null.");
+    auto pred_dims = ctx->GetInputDim("Predicted");
+    auto label_dims = ctx->GetInputDim("Labels");
+    auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));
+    PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims);
+    auto pred_grad_name = framework::GradVarName("Predicted");
+    ctx->SetOutputDim(pred_grad_name, pred_dims);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>, log_loss_grad,
+            ops::LogLossGradOp);
+REGISTER_OP_CPU_KERNEL(log_loss,
+                       ops::LogLossKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    log_loss_grad, ops::LogLossGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/log_loss_op.cu
+++ b/paddle/operators/log_loss_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#define EIGEN_USE_GPU
+#include "paddle/operators/log_loss_op.h"
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(log_loss,
+                       ops::LogLossKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    log_loss_grad, ops::LogLossGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/log_loss_op.h
+++ b/paddle/operators/log_loss_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename Place, typename T, typename AttrType = T>
+class LogLossKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* loss_out = ctx.Output<Tensor>("Loss");
+    loss_out->mutable_data<T>(ctx.GetPlace());
+    auto epsilon = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
+    auto prediction = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Predicted"));
+    auto label = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Labels"));
+    auto loss = EigenVector<T>::Flatten(*loss_out);
+    auto place = ctx.GetEigenDevice<Place>();
+    loss.device(place) = (-(label * (prediction + epsilon).log()) -
+                          ((static_cast<T>(1) - label) *
+                           (static_cast<T>(1) - prediction + epsilon).log()));
+  }
+};
+template <typename Place, typename T, typename AttrType = T>
+class LogLossGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto epsilon = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
+    auto prediction = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Predicted"));
+    auto label = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Labels"));
+    auto* dloss = ctx.Input<Tensor>(framework::GradVarName("Loss"));
+    auto* dpred = ctx.Output<Tensor>(framework::GradVarName("Predicted"));
+    auto dl = EigenVector<T>::Flatten(*dloss);
+    auto place = ctx.GetEigenDevice<Place>();
+    if (dpred) {
+      dpred->mutable_data<T>(ctx.GetPlace());
+      auto dx = framework::EigenVector<T>::Flatten(*dpred);
+      dx.device(place) = dl * (-(label / (prediction + epsilon)) +
+                               ((static_cast<T>(1) - label) /
+                                (static_cast<T>(1) - prediction + epsilon)));
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/lstm_op.cc
+++ b/paddle/operators/lstm_op.cc
@@ -198,27 +198,27 @@ c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c_t} \\
 h_t = o_t \odot act_h(c_t)
 $$
-where the W terms denote weight matrices (e.g. \f$W_{xi}\f$ is the matrix
+where the W terms denote weight matrices (e.g. $W_{xi}$ is the matrix
-of weights from the input gate to the input), \f$W_{ic}, W_{fc}, W_{oc}\f$
+of weights from the input gate to the input), $W_{ic}, W_{fc}, W_{oc}$
 are diagonal weight matrices for peephole connections. In our implementation,
 we use vectors to reprenset these diagonal weight matrices. The b terms
-denote bias vectors (\f$b_i\f$ is the input gate bias vector), \f$\sigma\f$
+denote bias vectors ($b_i$ is the input gate bias vector), $\sigma$
 is the non-line activations, such as logistic sigmoid function, and
-\f$i, f, o\f$ and \f$c\f$ are the input gate, forget gate, output gate,
+$i, f, o$ and $c$ are the input gate, forget gate, output gate,
 and cell activation vectors, respectively, all of which have the same size as
-the cell output activation vector \f$h\f$.
+the cell output activation vector $h$.
-The \f$\odot\f$ is the element-wise product of the vectors. \f$act_g\f$ and \f$act_h\f$
+The $\odot$ is the element-wise product of the vectors. $act_g$ and $act_h$
 are the cell input and cell output activation functions and `tanh` is usually
-used for them. \f$\tilde{c_t}\f$ is also called candidate hidden state,
+used for them. $\tilde{c_t}$ is also called candidate hidden state,
 which is computed based on the current input and the previous hidden state.
-Set `use_peepholes` False to disable peephole connection 
+Set `use_peepholes` False to disable peephole connection. The formula
-(http://www.bioinf.jku.at/publications/older/2604.pdf). The formula
+is omitted here, please refer to the paper
-is omitted here.
+http://www.bioinf.jku.at/publications/older/2604.pdf for details.
-Note that these \f$W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}\f$
+Note that these $W_{xi}x_{t}, W_{xf}x_{t}, W_{xc}x_{t}, W_{xo}x_{t}$
-operations on the input \f$x_{t}\f$ are NOT included in this operator.
+operations on the input $x_{t}$ are NOT included in this operator.
 Users can choose to use fully-connect operator before LSTM operator.
 )DOC");

--- a/paddle/operators/lstm_op.h
+++ b/paddle/operators/lstm_op.h
@@ -73,15 +73,15 @@ class LSTMKernel : public framework::OpKernel<T> {
      T* bias_data = const_cast<T*>(bias->data<T>());
      // the code style in LstmMetaValue will be updated later.
-      lstm_value.checkIg = bias_data + 4 * frame_size;
+      lstm_value.check_ig = bias_data + 4 * frame_size;
-      lstm_value.checkFg = lstm_value.checkIg + frame_size;
+      lstm_value.check_fg = lstm_value.check_ig + frame_size;
-      lstm_value.checkOg = lstm_value.checkFg + frame_size;
+      lstm_value.check_og = lstm_value.check_fg + frame_size;
    } else {
-      lstm_value.checkIg = nullptr;
+      lstm_value.check_ig = nullptr;
-      lstm_value.checkFg = nullptr;
+      lstm_value.check_fg = nullptr;
-      lstm_value.checkOg = nullptr;
+      lstm_value.check_og = nullptr;
    }
-    lstm_value.prevStateValue = nullptr;
+    lstm_value.prev_state_value = nullptr;
    Tensor ordered_c0;
    const size_t* order = batch_gate->lod()[2].data();
    if (cell_t0) {
@@ -90,7 +90,7 @@ class LSTMKernel : public framework::OpKernel<T> {
      // to reorder.
      ReorderInitState<Place, T>(device_ctx, *cell_t0, order, &ordered_c0,
                                 true);
-      lstm_value.prevStateValue = ordered_c0.data<T>();
+      lstm_value.prev_state_value = ordered_c0.data<T>();
    }
    // Use the local variable as here.
@@ -140,14 +140,14 @@ class LSTMKernel : public framework::OpKernel<T> {
                               static_cast<T>(1.0));
      }
-      lstm_value.gateValue = gate_t.data<T>();
+      lstm_value.gate_value = gate_t.data<T>();
-      lstm_value.outputValue = out_t.data<T>();
+      lstm_value.output_value = out_t.data<T>();
-      lstm_value.stateValue = cell_t.data<T>();
+      lstm_value.state_value = cell_t.data<T>();
-      lstm_value.stateActiveValue = cell_pre_act_t.data<T>();
+      lstm_value.state_active_value = cell_pre_act_t.data<T>();
      math::LstmUnitFunctor<Place, T>::compute(device_ctx, lstm_value,
                                               frame_size, cur_batch_size,
                                               gate_act, cell_act, cand_act);
-      lstm_value.prevStateValue = lstm_value.stateValue;
+      lstm_value.prev_state_value = lstm_value.state_value;
    }
    math::Batch2LoDTensorFunctor<Place, T> to_seq;
@@ -214,13 +214,13 @@ class LSTMGradKernel : public framework::OpKernel<T> {
    math::LstmMetaValue<T> lstm_value;
    if (bias && ctx.Attr<bool>("use_peepholes")) {
      T* bias_data = const_cast<T*>(bias->data<T>());
-      lstm_value.checkIg = bias_data + 4 * frame_size;
+      lstm_value.check_ig = bias_data + 4 * frame_size;
-      lstm_value.checkFg = lstm_value.checkIg + frame_size;
+      lstm_value.check_fg = lstm_value.check_ig + frame_size;
-      lstm_value.checkOg = lstm_value.checkFg + frame_size;
+      lstm_value.check_og = lstm_value.check_fg + frame_size;
    } else {
-      lstm_value.checkIg = nullptr;
+      lstm_value.check_ig = nullptr;
-      lstm_value.checkFg = nullptr;
+      lstm_value.check_fg = nullptr;
-      lstm_value.checkOg = nullptr;
+      lstm_value.check_og = nullptr;
    }
    math::LstmMetaGrad<T> lstm_grad;
@@ -231,13 +231,13 @@ class LSTMGradKernel : public framework::OpKernel<T> {
    }
    if (bias && bias_g && ctx.Attr<bool>("use_peepholes")) {
      T* bias_g_data = bias_g->data<T>();
-      lstm_grad.checkIgGrad = bias_g_data + 4 * frame_size;
+      lstm_grad.check_ig_grad = bias_g_data + 4 * frame_size;
-      lstm_grad.checkFgGrad = lstm_grad.checkIgGrad + frame_size;
+      lstm_grad.check_fg_grad = lstm_grad.check_ig_grad + frame_size;
-      lstm_grad.checkOgGrad = lstm_grad.checkFgGrad + frame_size;
+      lstm_grad.check_og_grad = lstm_grad.check_fg_grad + frame_size;
    } else {
-      lstm_grad.checkIgGrad = nullptr;
+      lstm_grad.check_ig_grad = nullptr;
-      lstm_grad.checkFgGrad = nullptr;
+      lstm_grad.check_fg_grad = nullptr;
-      lstm_grad.checkOgGrad = nullptr;
+      lstm_grad.check_og_grad = nullptr;
    }
    math::LoDTensor2BatchFunctor<Place, T> to_batch;
@@ -276,26 +276,26 @@ class LSTMGradKernel : public framework::OpKernel<T> {
      Tensor gate = batch_gate->Slice(bstart, bend);
      Tensor cell = batch_cell.Slice(bstart, bend);
      Tensor cell_pre_act = batch_cell_pre_act->Slice(bstart, bend);
-      lstm_value.gateValue = gate.data<T>();
+      lstm_value.gate_value = gate.data<T>();
-      lstm_value.stateValue = cell.data<T>();
+      lstm_value.state_value = cell.data<T>();
-      lstm_value.stateActiveValue = cell_pre_act.data<T>();
+      lstm_value.state_active_value = cell_pre_act.data<T>();
      Tensor out_g = batch_hidden_g.Slice(bstart, bend);
      Tensor gate_g = batch_gate_g.Slice(bstart, bend);
      Tensor cell_g = batch_cell_g.Slice(bstart, bend);
-      lstm_grad.stateGrad = cell_g.data<T>();
+      lstm_grad.state_grad = cell_g.data<T>();
-      lstm_grad.gateGrad = gate_g.data<T>();
+      lstm_grad.gate_grad = gate_g.data<T>();
-      lstm_grad.outputGrad = out_g.data<T>();
+      lstm_grad.output_grad = out_g.data<T>();
      if (n > 0) {
        int bstart_pre = static_cast<int>(batch_starts[n - 1]);
        Tensor cell_pre = batch_cell.Slice(bstart_pre, bstart);
        Tensor cell_pre_g = batch_cell_g.Slice(bstart_pre, bstart);
-        lstm_value.prevStateValue = cell_pre.data<T>();
+        lstm_value.prev_state_value = cell_pre.data<T>();
-        lstm_grad.prevStateGrad = cell_pre_g.data<T>();
+        lstm_grad.prev_state_grad = cell_pre_g.data<T>();
      } else {
-        lstm_value.prevStateValue = c0 ? ordered_c0.data<T>() : nullptr;
+        lstm_value.prev_state_value = c0 ? ordered_c0.data<T>() : nullptr;
-        lstm_grad.prevStateGrad = c0_g ? ordered_c0_g.data<T>() : nullptr;
+        lstm_grad.prev_state_grad = c0_g ? ordered_c0_g.data<T>() : nullptr;
      }
      int cur_batch_size = bend - bstart;

--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -13,8 +13,9 @@ if(WITH_GPU)
    nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)
    nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
    nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
-    nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
    nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context)
+    nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context)
+    nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
 else()
    cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto)
    cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
@@ -26,8 +27,9 @@ else()
    cc_library(context_project SRCS context_project.cc DEPS device_context math_function)
    cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context)
    cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
-    cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
    cc_library(maxouting SRCS maxouting.cc DEPS device_context)
+    cc_library(unpooling SRCS unpooling.cc DEPS device_context)
+    cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
 endif()
 cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)

--- a/paddle/operators/math/detail/gru_cpu_kernel.h
+++ b/paddle/operators/math/detail/gru_cpu_kernel.h
@@ -25,393 +25,397 @@ namespace detail {
 #ifndef __NVCC__
 template <class OpResetOutput, typename T>
-void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput,
+void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output,
-                                       T *gateValue, T *resetOutputValue,
+                                       T *gate_value, T *reset_output_value,
-                                       T *prevOutputValue, int frameSize,
+                                       T *prev_output_value, int frame_size,
                                       activation_mode_t active_gate) {
-  T rValueUpdateGate;
+  T r_value_update_gate;
-  T rValueResetGate;
+  T r_value_reset_gate;
-  T rValueResetOutput;
+  T r_value_reset_output;
-  T rPrevOut = 0;
+  T r_prev_out = 0;
-  T *updateGate = gateValue;
+  T *update_gate = gate_value;
-  T *resetGate = gateValue + frameSize;
+  T *reset_gate = gate_value + frame_size;
-  for (int i = 0; i < frameSize; i++) {
+  for (int i = 0; i < frame_size; i++) {
-    rValueUpdateGate = updateGate[i];
+    r_value_update_gate = update_gate[i];
-    rValueResetGate = resetGate[i];
+    r_value_reset_gate = reset_gate[i];
-    if (prevOutputValue) {
+    if (prev_output_value) {
-      rPrevOut = prevOutputValue[i];
+      r_prev_out = prev_output_value[i];
    }
-    opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut,
+    op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
-                  rValueResetOutput, active_gate);
+                    r_value_reset_output, active_gate);
-    updateGate[i] = rValueUpdateGate;
+    update_gate[i] = r_value_update_gate;
-    resetGate[i] = rValueResetGate;
+    reset_gate[i] = r_value_reset_gate;
-    resetOutputValue[i] = rValueResetOutput;
+    reset_output_value[i] = r_value_reset_output;
  }
 }
 template <class OpFinalOutput, typename T>
-void hl_naive_gru_forward_final_output(OpFinalOutput opFinalOutput,
+void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output,
-                                       T *gateValue, T *prevOutputValue,
+                                       T *gate_value, T *prev_output_value,
-                                       T *outputValue, int frameSize,
+                                       T *output_value, int frame_size,
                                       activation_mode_t active_node) {
-  T rValueUpdateGate;
+  T r_value_update_gate;
-  T rValueFrameState;
+  T r_value_frame_state;
-  T rPrevOut = 0;
+  T r_prev_out = 0;
-  T rOutput;
+  T r_output;
-  T *updateGate = gateValue;
+  T *update_gate = gate_value;
-  T *frameState = gateValue + frameSize * 2;
+  T *frame_state = gate_value + frame_size * 2;
-  for (int i = 0; i < frameSize; i++) {
+  for (int i = 0; i < frame_size; i++) {
-    rValueUpdateGate = updateGate[i];
+    r_value_update_gate = update_gate[i];
-    rValueFrameState = frameState[i];
+    r_value_frame_state = frame_state[i];
-    if (prevOutputValue) {
+    if (prev_output_value) {
-      rPrevOut = prevOutputValue[i];
+      r_prev_out = prev_output_value[i];
    }
-    opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
+    op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
-                  active_node);
+                    r_output, active_node);
-    frameState[i] = rValueFrameState;
+    frame_state[i] = r_value_frame_state;
-    outputValue[i] = rOutput;
+    output_value[i] = r_output;
  }
 }
 template <class OpResetOutput, typename T>
-void hl_avx_gru_forward_reset_output(OpResetOutput opResetOutput, T *gateValue,
+void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output,
-                                     T *resetOutputValue, T *prevOutputValue,
+                                     T *gate_value, T *reset_output_value,
-                                     int frameSize,
+                                     T *prev_output_value, int frame_size,
                                     activation_mode_t active_gate) {
 #ifdef __AVX__
-  __m256 rValueUpdateGate;
+  __m256 r_value_update_gate;
-  __m256 rValueResetGate;
+  __m256 r_value_reset_gate;
-  __m256 rValueResetOutput;
+  __m256 r_value_reset_output;
-  __m256 rPrevOut = _mm256_set1_ps(0.0f);
+  __m256 r_prev_out = _mm256_set1_ps(0.0f);
-  __m256 *updateGate = (__m256 *)gateValue;
+  __m256 *update_gate = (__m256 *)gate_value;
-  __m256 *resetGate = (__m256 *)(gateValue + frameSize);
+  __m256 *reset_gate = (__m256 *)(gate_value + frame_size);
-  for (int i = 0; i < frameSize / 8; i++) {
+  for (int i = 0; i < frame_size / 8; i++) {
-    rValueUpdateGate = updateGate[i];
+    r_value_update_gate = update_gate[i];
-    rValueResetGate = resetGate[i];
+    r_value_reset_gate = reset_gate[i];
-    if (prevOutputValue) {
+    if (prev_output_value) {
-      rPrevOut = ((__m256 *)prevOutputValue)[i];
+      r_prev_out = ((__m256 *)prev_output_value)[i];
    }
-    opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut,
+    op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
-                  rValueResetOutput, active_gate);
+                    r_value_reset_output, active_gate);
-    updateGate[i] = rValueUpdateGate;
+    update_gate[i] = r_value_update_gate;
-    resetGate[i] = rValueResetGate;
+    reset_gate[i] = r_value_reset_gate;
-    ((__m256 *)resetOutputValue)[i] = rValueResetOutput;
+    ((__m256 *)reset_output_value)[i] = r_value_reset_output;
  }
 #endif
 }
 template <class OpFinalOutput, typename T>
-void hl_avx_gru_forward_final_output(OpFinalOutput opFinalOutput, T *gateValue,
+void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output,
-                                     T *prevOutputValue, T *outputValue,
+                                     T *gate_value, T *prev_output_value,
-                                     int frameSize,
+                                     T *output_value, int frame_size,
                                     activation_mode_t active_node) {
 #ifdef __AVX__
-  __m256 rValueUpdateGate;
+  __m256 r_value_update_gate;
-  __m256 rValueFrameState;
+  __m256 r_value_frame_state;
-  __m256 rPrevOut = _mm256_set1_ps(0.0f);
+  __m256 r_prev_out = _mm256_set1_ps(0.0f);
-  __m256 rOutput;
+  __m256 r_output;
-  __m256 *updateGate = (__m256 *)gateValue;
+  __m256 *update_gate = (__m256 *)gate_value;
-  __m256 *frameState = (__m256 *)(gateValue + frameSize * 2);
+  __m256 *frame_state = (__m256 *)(gate_value + frame_size * 2);
-  for (int i = 0; i < frameSize / 8; i++) {
+  for (int i = 0; i < frame_size / 8; i++) {
-    rValueUpdateGate = updateGate[i];
+    r_value_update_gate = update_gate[i];
-    rValueFrameState = frameState[i];
+    r_value_frame_state = frame_state[i];
-    if (prevOutputValue) {
+    if (prev_output_value) {
-      rPrevOut = ((__m256 *)prevOutputValue)[i];
+      r_prev_out = ((__m256 *)prev_output_value)[i];
    }
-    opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
+    op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
-                  active_node);
+                    r_output, active_node);
-    frameState[i] = rValueFrameState;
+    frame_state[i] = r_value_frame_state;
-    ((__m256 *)outputValue)[i] = rOutput;
+    ((__m256 *)output_value)[i] = r_output;
  }
 #endif
 }
 template <class OpResetOutput, typename T>
-inline void forward_reset_output(OpResetOutput opResetOutput,
+inline void forward_reset_output(OpResetOutput op_reset_output,
-                                 hl_gru_value<T> value, int frameSize,
+                                 hl_gru_value<T> value, int frame_size,
-                                 int batchSize, activation_mode_t active_gate) {
+                                 int batch_size,
-  for (int b = 0; b < batchSize; b++) {
+                                 activation_mode_t active_gate) {
-    if (OpResetOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
+  for (int b = 0; b < batch_size; b++) {
+    if (OpResetOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
      hl_avx_gru_forward_reset_output(
-          opResetOutput, value.gateValue, value.resetOutputValue,
+          op_reset_output, value.gate_value, value.reset_output_value,
-          value.prevOutValue, frameSize, active_gate);
+          value.prev_out_value, frame_size, active_gate);
    } else {
      hl_naive_gru_forward_reset_output(
-          opResetOutput, value.gateValue, value.resetOutputValue,
+          op_reset_output, value.gate_value, value.reset_output_value,
-          value.prevOutValue, frameSize, active_gate);
+          value.prev_out_value, frame_size, active_gate);
    }
-    value.gateValue += frameSize * 3;
+    value.gate_value += frame_size * 3;
-    value.resetOutputValue += frameSize;
+    value.reset_output_value += frame_size;
-    if (value.prevOutValue) {
+    if (value.prev_out_value) {
-      value.prevOutValue += frameSize;
+      value.prev_out_value += frame_size;
    }
  }
 }
 template <class OpFinalOutput, typename T>
-inline void forward_final_output(OpFinalOutput opFinalOutput,
+inline void forward_final_output(OpFinalOutput op_final_output,
-                                 hl_gru_value<T> value, int frameSize,
+                                 hl_gru_value<T> value, int frame_size,
-                                 int batchSize, activation_mode_t active_node) {
+                                 int batch_size,
-  for (int b = 0; b < batchSize; b++) {
+                                 activation_mode_t active_node) {
-    if (OpFinalOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
+  for (int b = 0; b < batch_size; b++) {
-      hl_avx_gru_forward_final_output(opFinalOutput, value.gateValue,
+    if (OpFinalOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
-                                      value.prevOutValue, value.outputValue,
+      hl_avx_gru_forward_final_output(op_final_output, value.gate_value,
-                                      frameSize, active_node);
+                                      value.prev_out_value, value.output_value,
+                                      frame_size, active_node);
    } else {
-      hl_naive_gru_forward_final_output(opFinalOutput, value.gateValue,
+      hl_naive_gru_forward_final_output(
-                                        value.prevOutValue, value.outputValue,
+          op_final_output, value.gate_value, value.prev_out_value,
-                                        frameSize, active_node);
+          value.output_value, frame_size, active_node);
    }
-    value.gateValue += frameSize * 3;
+    value.gate_value += frame_size * 3;
-    value.outputValue += frameSize;
+    value.output_value += frame_size;
-    if (value.prevOutValue) {
+    if (value.prev_out_value) {
-      value.prevOutValue += frameSize;
+      value.prev_out_value += frame_size;
    }
  }
 }
 template <class OpStateGrad, typename T>
-void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue,
+void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
-                                      T *gateGrad, T *prevOutValue,
+                                      T *gate_grad, T *prev_out_value,
-                                      T *prevOutGrad, T *outputGrad,
+                                      T *prev_out_grad, T *output_grad,
-                                      int frameSize,
+                                      int frame_size,
                                      activation_mode_t active_node) {
-  T rUpdateGateValue;
+  T r_update_gate_value;
-  T rUpdateGateGrad;
+  T r_update_gate_grad;
-  T rFrameStateValue;
+  T r_frame_state_value;
-  T rFrameStateGrad;
+  T r_frame_state_grad;
-  T rOutGrad;
+  T r_out_grad;
-  T rPrevOutValue = 0;
+  T r_prev_out_value = 0;
-  T rPrevOutGrad = 0;
+  T r_prev_out_grad = 0;
-  T *updateGateValue = gateValue;
+  T *update_gate_value = gate_value;
-  T *updateGateGrad = gateGrad;
+  T *update_gate_grad = gate_grad;
-  T *frameStateValue = gateValue + frameSize * 2;
+  T *frame_state_value = gate_value + frame_size * 2;
-  T *frameStateGrad = gateGrad + frameSize * 2;
+  T *frame_state_grad = gate_grad + frame_size * 2;
-  for (int i = 0; i < frameSize; i++) {
+  for (int i = 0; i < frame_size; i++) {
-    rUpdateGateValue = updateGateValue[i];
+    r_update_gate_value = update_gate_value[i];
-    rFrameStateValue = frameStateValue[i];
+    r_frame_state_value = frame_state_value[i];
-    rOutGrad = outputGrad[i];
+    r_out_grad = output_grad[i];
-    if (prevOutValue) {
+    if (prev_out_value) {
-      rPrevOutValue = prevOutValue[i];
+      r_prev_out_value = prev_out_value[i];
    }
-    if (prevOutGrad) {
+    if (prev_out_grad) {
-      rPrevOutGrad = prevOutGrad[i];
+      r_prev_out_grad = prev_out_grad[i];
    }
-    opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
+    op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
-                rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
+                  r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
-                active_node);
+                  r_out_grad, active_node);
-    updateGateGrad[i] = rUpdateGateGrad;
+    update_gate_grad[i] = r_update_gate_grad;
-    frameStateGrad[i] = rFrameStateGrad;
+    frame_state_grad[i] = r_frame_state_grad;
-    if (prevOutGrad) {
+    if (prev_out_grad) {
-      prevOutGrad[i] = rPrevOutGrad;
+      prev_out_grad[i] = r_prev_out_grad;
    }
  }
 }
 template <class OpResetGrad, typename T>
-void hl_naive_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue,
+void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
-                                      T *gateGrad, T *prevOutValue,
+                                      T *gate_grad, T *prev_out_value,
-                                      T *prevOutGrad, T *resetOutputGrad,
+                                      T *prev_out_grad, T *reset_output_grad,
-                                      int frameSize,
+                                      int frame_size,
                                      activation_mode_t active_gate) {
-  T rUpdateGateValue;
+  T r_update_gate_value;
-  T rUpdateGateGrad;
+  T r_update_gate_grad;
-  T rResetGateValue;
+  T r_reset_gate_value;
-  T rResetGateGrad;
+  T r_reset_gate_grad;
-  T rResetOutputGrad = 0;
+  T r_reset_output_grad = 0;
-  T rPrevOutValue = 0;
+  T r_prev_out_value = 0;
-  T rPrevOutGrad = 0;
+  T r_prev_out_grad = 0;
-  T *updateGateValue = gateValue;
+  T *update_gate_value = gate_value;
-  T *updateGateGrad = gateGrad;
+  T *update_gate_grad = gate_grad;
-  T *resetGateValue = gateValue + frameSize;
+  T *reset_gate_value = gate_value + frame_size;
-  T *resetGateGrad = gateGrad + frameSize;
+  T *reset_gate_grad = gate_grad + frame_size;
-  for (int i = 0; i < frameSize; i++) {
+  for (int i = 0; i < frame_size; i++) {
-    rUpdateGateValue = updateGateValue[i];
+    r_update_gate_value = update_gate_value[i];
-    rUpdateGateGrad = updateGateGrad[i];
+    r_update_gate_grad = update_gate_grad[i];
-    rResetGateValue = resetGateValue[i];
+    r_reset_gate_value = reset_gate_value[i];
-    if (prevOutValue && prevOutGrad) {
+    if (prev_out_value && prev_out_grad) {
-      rResetOutputGrad = resetOutputGrad[i];
+      r_reset_output_grad = reset_output_grad[i];
    }
-    if (prevOutValue) {
+    if (prev_out_value) {
-      rPrevOutValue = prevOutValue[i];
+      r_prev_out_value = prev_out_value[i];
    }
-    if (prevOutGrad) {
+    if (prev_out_grad) {
-      rPrevOutGrad = prevOutGrad[i];
+      r_prev_out_grad = prev_out_grad[i];
    }
-    opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
+    op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
-                rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
+                  r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
-                active_gate);
+                  r_reset_output_grad, active_gate);
-    updateGateGrad[i] = rUpdateGateGrad;
+    update_gate_grad[i] = r_update_gate_grad;
-    resetGateGrad[i] = rResetGateGrad;
+    reset_gate_grad[i] = r_reset_gate_grad;
-    if (prevOutGrad) {
+    if (prev_out_grad) {
-      prevOutGrad[i] = rPrevOutGrad;
+      prev_out_grad[i] = r_prev_out_grad;
    }
  }
 }
 template <class OpStateGrad, typename T>
-void hl_avx_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue,
+void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value,
-                                    T *gateGrad, T *prevOutValue,
+                                    T *gate_grad, T *prev_out_value,
-                                    T *prevOutGrad, T *outputGrad,
+                                    T *prev_out_grad, T *output_grad,
-                                    int frameSize,
+                                    int frame_size,
                                    activation_mode_t active_node) {
 #ifdef __AVX__
-  __m256 rUpdateGateValue;
+  __m256 r_update_gate_value;
-  __m256 rUpdateGateGrad;
+  __m256 r_update_gate_grad;
-  __m256 rFrameStateValue;
+  __m256 r_frame_state_value;
-  __m256 rFrameStateGrad;
+  __m256 r_frame_state_grad;
-  __m256 rOutGrad;
+  __m256 r_out_grad;
-  __m256 rPrevOutValue = _mm256_set1_ps(0.0f);
+  __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
-  __m256 rPrevOutGrad = _mm256_set1_ps(0.0f);
+  __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
-  __m256 *updateGateValue = (__m256 *)gateValue;
+  __m256 *update_gate_value = (__m256 *)gate_value;
-  __m256 *updateGateGrad = (__m256 *)gateGrad;
+  __m256 *update_gate_grad = (__m256 *)gate_grad;
-  __m256 *frameStateValue = (__m256 *)(gateValue + frameSize * 2);
+  __m256 *frame_state_value = (__m256 *)(gate_value + frame_size * 2);
-  __m256 *frameStateGrad = (__m256 *)(gateGrad + frameSize * 2);
+  __m256 *frame_state_grad = (__m256 *)(gate_grad + frame_size * 2);
-  for (int i = 0; i < frameSize / 8; i++) {
+  for (int i = 0; i < frame_size / 8; i++) {
-    rUpdateGateValue = updateGateValue[i];
+    r_update_gate_value = update_gate_value[i];
-    rFrameStateValue = frameStateValue[i];
+    r_frame_state_value = frame_state_value[i];
-    rOutGrad = ((__m256 *)outputGrad)[i];
+    r_out_grad = ((__m256 *)output_grad)[i];
-    if (prevOutValue) {
+    if (prev_out_value) {
-      rPrevOutValue = ((__m256 *)prevOutValue)[i];
+      r_prev_out_value = ((__m256 *)prev_out_value)[i];
    }
-    if (prevOutGrad) {
+    if (prev_out_grad) {
-      rPrevOutGrad = ((__m256 *)prevOutGrad)[i];
+      r_prev_out_grad = ((__m256 *)prev_out_grad)[i];
    }
-    opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
+    op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
-                rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
+                  r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
-                active_node);
+                  r_out_grad, active_node);
-    updateGateGrad[i] = rUpdateGateGrad;
+    update_gate_grad[i] = r_update_gate_grad;
-    frameStateGrad[i] = rFrameStateGrad;
+    frame_state_grad[i] = r_frame_state_grad;
-    if (prevOutGrad) {
+    if (prev_out_grad) {
-      ((__m256 *)prevOutGrad)[i] = rPrevOutGrad;
+      ((__m256 *)prev_out_grad)[i] = r_prev_out_grad;
    }
  }
 #endif
 }
 template <class OpResetGrad, typename T>
-void hl_avx_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue,
+void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value,
-                                    T *gateGrad, T *prevOutValue,
+                                    T *gate_grad, T *prev_out_value,
-                                    T *prevOutGrad, T *resetOutputGrad,
+                                    T *prev_out_grad, T *reset_output_grad,
-                                    int frameSize,
+                                    int frame_size,
                                    activation_mode_t active_gate) {
 #ifdef __AVX__
-  __m256 rUpdateGateValue;
+  __m256 r_update_gate_value;
-  __m256 rUpdateGateGrad;
+  __m256 r_update_gate_grad;
-  __m256 rResetGateValue;
+  __m256 r_reset_gate_value;
-  __m256 rResetGateGrad;
+  __m256 r_reset_gate_grad;
-  __m256 rResetOutputGrad = _mm256_set1_ps(0.0f);
+  __m256 r_reset_output_grad = _mm256_set1_ps(0.0f);
-  __m256 rPrevOutValue = _mm256_set1_ps(0.0f);
+  __m256 r_prev_out_value = _mm256_set1_ps(0.0f);
-  __m256 rPrevOutGrad = _mm256_set1_ps(0.0f);
+  __m256 r_prev_out_grad = _mm256_set1_ps(0.0f);
-  __m256 *updateGateValue = (__m256 *)gateValue;
+  __m256 *update_gate_value = (__m256 *)gate_value;
-  __m256 *updateGateGrad = (__m256 *)gateGrad;
+  __m256 *update_gate_grad = (__m256 *)gate_grad;
-  __m256 *resetGateValue = (__m256 *)(gateValue + frameSize);
+  __m256 *reset_gate_value = (__m256 *)(gate_value + frame_size);
-  __m256 *resetGateGrad = (__m256 *)(gateGrad + frameSize);
+  __m256 *reset_gate_grad = (__m256 *)(gate_grad + frame_size);
-  for (int i = 0; i < frameSize / 8; i++) {
+  for (int i = 0; i < frame_size / 8; i++) {
-    rUpdateGateValue = updateGateValue[i];
+    r_update_gate_value = update_gate_value[i];
-    rUpdateGateGrad = updateGateGrad[i];
+    r_update_gate_grad = update_gate_grad[i];
-    rResetGateValue = resetGateValue[i];
+    r_reset_gate_value = reset_gate_value[i];
-    if (prevOutValue && prevOutGrad) {
+    if (prev_out_value && prev_out_grad) {
-      rResetOutputGrad = ((__m256 *)resetOutputGrad)[i];
+      r_reset_output_grad = ((__m256 *)reset_output_grad)[i];
    }
-    if (prevOutValue) {
+    if (prev_out_value) {
-      rPrevOutValue = ((__m256 *)prevOutValue)[i];
+      r_prev_out_value = ((__m256 *)prev_out_value)[i];
    }
-    if (prevOutGrad) {
+    if (prev_out_grad) {
-      rPrevOutGrad = ((__m256 *)prevOutGrad)[i];
+      r_prev_out_grad = ((__m256 *)prev_out_grad)[i];
    }
-    opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
+    op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
-                rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
+                  r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
-                active_gate);
+                  r_reset_output_grad, active_gate);
-    updateGateGrad[i] = rUpdateGateGrad;
+    update_gate_grad[i] = r_update_gate_grad;
-    resetGateGrad[i] = rResetGateGrad;
+    reset_gate_grad[i] = r_reset_gate_grad;
-    if (prevOutGrad) {
+    if (prev_out_grad) {
-      ((__m256 *)prevOutGrad)[i] = rPrevOutGrad;
+      ((__m256 *)prev_out_grad)[i] = r_prev_out_grad;
    }
  }
 #endif
 }
 template <class OpStateGrad, typename T>
-inline void backward_state_grad(OpStateGrad opStateGrad, hl_gru_value<T> value,
+inline void backward_state_grad(OpStateGrad op_state_grad,
-                                hl_gru_grad<T> grad, int frameSize,
+                                hl_gru_value<T> value, hl_gru_grad<T> grad,
-                                int batchSize, activation_mode_t active_node) {
+                                int frame_size, int batch_size,
-  for (int b = 0; b < batchSize; b++) {
+                                activation_mode_t active_node) {
-    if (OpStateGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
+  for (int b = 0; b < batch_size; b++) {
+    if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
      hl_avx_gru_backward_state_grad(
-          opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
+          op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
-          grad.prevOutGrad, grad.outputGrad, frameSize, active_node);
+          grad.prev_out_grad, grad.output_grad, frame_size, active_node);
    } else {
      hl_naive_gru_backward_state_grad(
-          opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
+          op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
-          grad.prevOutGrad, grad.outputGrad, frameSize, active_node);
+          grad.prev_out_grad, grad.output_grad, frame_size, active_node);
    }
-    value.gateValue += frameSize * 3;
+    value.gate_value += frame_size * 3;
-    if (value.prevOutValue) {
+    if (value.prev_out_value) {
-      value.prevOutValue += frameSize;
+      value.prev_out_value += frame_size;
    }
-    grad.gateGrad += frameSize * 3;
+    grad.gate_grad += frame_size * 3;
-    grad.outputGrad += frameSize;
+    grad.output_grad += frame_size;
-    if (grad.prevOutGrad) {
+    if (grad.prev_out_grad) {
-      grad.prevOutGrad += frameSize;
+      grad.prev_out_grad += frame_size;
    }
  }
 }
 template <class OpResetGrad, typename T>
-inline void backward_reset_grad(OpResetGrad opResetGrad, hl_gru_value<T> value,
+inline void backward_reset_grad(OpResetGrad op_reset_grad,
-                                hl_gru_grad<T> grad, int frameSize,
+                                hl_gru_value<T> value, hl_gru_grad<T> grad,
-                                int batchSize, activation_mode_t active_gate) {
+                                int frame_size, int batch_size,
-  for (int b = 0; b < batchSize; b++) {
+                                activation_mode_t active_gate) {
-    if (OpResetGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) {
+  for (int b = 0; b < batch_size; b++) {
+    if (OpResetGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) {
      hl_avx_gru_backward_reset_grad(
-          opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
+          op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
-          grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate);
+          grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate);
    } else {
      hl_naive_gru_backward_reset_grad(
-          opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue,
+          op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value,
-          grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate);
+          grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate);
    }
-    value.gateValue += frameSize * 3;
+    value.gate_value += frame_size * 3;
-    if (value.prevOutValue) {
+    if (value.prev_out_value) {
-      value.prevOutValue += frameSize;
+      value.prev_out_value += frame_size;
    }
-    grad.gateGrad += frameSize * 3;
+    grad.gate_grad += frame_size * 3;
-    grad.resetOutputGrad += frameSize;
+    grad.reset_output_grad += frame_size;
-    if (grad.prevOutGrad) {
+    if (grad.prev_out_grad) {
-      grad.prevOutGrad += frameSize;
+      grad.prev_out_grad += frame_size;
    }
  }
 }

--- a/paddle/operators/math/detail/gru_gpu_kernel.h
+++ b/paddle/operators/math/detail/gru_gpu_kernel.h
@@ -27,174 +27,174 @@ namespace math {
 namespace detail {
 /*
- * threads(framePerBlock, batchPerBlock)
+ * threads(frame_per_block, batch_per_block)
- * grid(frameBlocks, batchBlocks)
+ * grid(frame_blocks, batch_blocks)
 */
-template <class OpResetOutput, bool isBatch, typename T>
+template <class OpResetOutput, bool is_batch, typename T>
-__global__ void KeGruForwardResetOutput(OpResetOutput opResetOutput,
+__global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output,
-                                        T *gateValue, T *resetOutputValue,
+                                        T *gate_value, T *reset_output_value,
-                                        T *prevOutputValue, int frameSize,
+                                        T *prev_output_value, int frame_size,
-                                        int batchSize,
+                                        int batch_size,
                                        activation_mode_t active_gate) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
+  if (frame_idx >= frame_size) return;
-  int batchIdx = 0;
+  int batch_idx = 0;
-  if (isBatch) {
+  if (is_batch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
+    if (batch_idx >= batch_size) return;
-    gateValue += batchIdx * 3 * frameSize;
+    gate_value += batch_idx * 3 * frame_size;
-    resetOutputValue += batchIdx * frameSize;
+    reset_output_value += batch_idx * frame_size;
  }
-  T rPrevOut = 0;
+  T r_prev_out = 0;
-  T rValueResetOutput;
+  T r_value_reset_output;
-  T rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
+  T r_value_update_gate = gate_value[frame_idx + frame_size * 0];
-  T rValueResetGate = gateValue[frameIdx + frameSize * 1];
+  T r_value_reset_gate = gate_value[frame_idx + frame_size * 1];
-  if (prevOutputValue) {
+  if (prev_output_value) {
-    if (isBatch) prevOutputValue += batchIdx * frameSize;
+    if (is_batch) prev_output_value += batch_idx * frame_size;
-    rPrevOut = prevOutputValue[frameIdx];
+    r_prev_out = prev_output_value[frame_idx];
  }
-  opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut, rValueResetOutput,
+  op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out,
-                active_gate);
+                  r_value_reset_output, active_gate);
-  gateValue[frameIdx + frameSize * 0] = rValueUpdateGate;
+  gate_value[frame_idx + frame_size * 0] = r_value_update_gate;
-  gateValue[frameIdx + frameSize * 1] = rValueResetGate;
+  gate_value[frame_idx + frame_size * 1] = r_value_reset_gate;
-  resetOutputValue[frameIdx] = rValueResetOutput;
+  reset_output_value[frame_idx] = r_value_reset_output;
 }
 /*
- * threads(framePerBlock, batchPerBlock)
+ * threads(frame_per_block, batch_per_block)
- * grid(frameBlocks, batchBlocks)
+ * grid(frame_blocks, batch_blocks)
 */
-template <class OpFinalOutput, bool isBatch, typename T>
+template <class OpFinalOutput, bool is_batch, typename T>
-__global__ void KeGruForwardFinalOutput(OpFinalOutput opFinalOutput,
+__global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output,
-                                        T *gateValue, T *prevOutputValue,
+                                        T *gate_value, T *prev_output_value,
-                                        T *outputValue, int frameSize,
+                                        T *output_value, int frame_size,
-                                        int batchSize,
+                                        int batch_size,
                                        activation_mode_t active_node) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
+  if (frame_idx >= frame_size) return;
-  int batchIdx = 0;
+  int batch_idx = 0;
-  if (isBatch) {
+  if (is_batch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
+    if (batch_idx >= batch_size) return;
-    gateValue += batchIdx * 3 * frameSize;
+    gate_value += batch_idx * 3 * frame_size;
-    outputValue += batchIdx * frameSize;
+    output_value += batch_idx * frame_size;
  }
-  T rOutput;
+  T r_output;
-  T rPrevOut = 0;
+  T r_prev_out = 0;
-  T rValueUpdateGate = gateValue[frameIdx + frameSize * 0];
+  T r_value_update_gate = gate_value[frame_idx + frame_size * 0];
-  T rValueFrameState = gateValue[frameIdx + frameSize * 2];
+  T r_value_frame_state = gate_value[frame_idx + frame_size * 2];
-  if (prevOutputValue) {
+  if (prev_output_value) {
-    if (isBatch) prevOutputValue += batchIdx * frameSize;
+    if (is_batch) prev_output_value += batch_idx * frame_size;
-    rPrevOut = prevOutputValue[frameIdx];
+    r_prev_out = prev_output_value[frame_idx];
  }
-  opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput,
+  op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out,
-                active_node);
+                  r_output, active_node);
-  gateValue[frameIdx + frameSize * 2] = rValueFrameState;
+  gate_value[frame_idx + frame_size * 2] = r_value_frame_state;
-  outputValue[frameIdx] = rOutput;
+  output_value[frame_idx] = r_output;
 }
 /*
- * threads(framePerBlock, batchPerBlock)
+ * threads(frame_per_block, batch_per_block)
- * grid(frameBlocks, batchBlocks)
+ * grid(frame_blocks, batch_blocks)
 */
-template <class OpStateGrad, bool isBatch, typename T>
+template <class OpStateGrad, bool is_batch, typename T>
-__global__ void KeGruBackwardStateGrad(OpStateGrad opStateGrad, T *gateValue,
+__global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value,
-                                       T *gateGrad, T *prevOutValue,
+                                       T *gate_grad, T *prev_out_value,
-                                       T *prevOutGrad, T *outputGrad,
+                                       T *prev_out_grad, T *output_grad,
-                                       int frameSize, int batchSize,
+                                       int frame_size, int batch_size,
                                       activation_mode_t active_node) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
+  if (frame_idx >= frame_size) return;
-  int batchIdx = 0;
+  int batch_idx = 0;
-  if (isBatch) {
+  if (is_batch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
+    if (batch_idx >= batch_size) return;
-    gateValue += batchIdx * 3 * frameSize;
+    gate_value += batch_idx * 3 * frame_size;
-    gateGrad += batchIdx * 3 * frameSize;
+    gate_grad += batch_idx * 3 * frame_size;
-    outputGrad += batchIdx * frameSize;
+    output_grad += batch_idx * frame_size;
  }
-  T rUpdateGateGrad;
+  T r_update_gate_grad;
-  T rFrameStateGrad;
+  T r_frame_state_grad;
-  T rPrevOutValue = 0;
+  T r_prev_out_value = 0;
-  T rPrevOutGrad = 0;
+  T r_prev_out_grad = 0;
-  T rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
+  T r_update_gate_value = gate_value[frame_idx + frame_size * 0];
-  T rFrameStateValue = gateValue[frameIdx + frameSize * 2];
+  T r_frame_state_value = gate_value[frame_idx + frame_size * 2];
-  T rOutGrad = outputGrad[frameIdx];
+  T r_out_grad = output_grad[frame_idx];
-  if (prevOutValue && prevOutGrad) {
+  if (prev_out_value && prev_out_grad) {
-    if (isBatch) prevOutValue += batchIdx * frameSize;
+    if (is_batch) prev_out_value += batch_idx * frame_size;
-    rPrevOutValue = prevOutValue[frameIdx];
+    r_prev_out_value = prev_out_value[frame_idx];
-    if (isBatch) prevOutGrad += batchIdx * frameSize;
+    if (is_batch) prev_out_grad += batch_idx * frame_size;
-    rPrevOutGrad = prevOutGrad[frameIdx];
+    r_prev_out_grad = prev_out_grad[frame_idx];
  }
-  opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue,
+  op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value,
-              rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad,
+                r_frame_state_grad, r_prev_out_value, r_prev_out_grad,
-              active_node);
+                r_out_grad, active_node);
-  gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
+  gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad;
-  gateGrad[frameIdx + frameSize * 2] = rFrameStateGrad;
+  gate_grad[frame_idx + frame_size * 2] = r_frame_state_grad;
-  if (prevOutGrad) {
+  if (prev_out_grad) {
-    prevOutGrad[frameIdx] = rPrevOutGrad;
+    prev_out_grad[frame_idx] = r_prev_out_grad;
  }
 }
 /*
- * threads(framePerBlock, batchPerBlock)
+ * threads(frame_per_block, batch_per_block)
- * grid(frameBlocks, batchBlocks)
+ * grid(frame_blocks, batch_blocks)
 */
-template <class OpResetGrad, bool isBatch, typename T>
+template <class OpResetGrad, bool is_batch, typename T>
-__global__ void KeGruBackwardResetGrad(OpResetGrad opResetGrad, T *gateValue,
+__global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value,
-                                       T *gateGrad, T *prevOutValue,
+                                       T *gate_grad, T *prev_out_value,
-                                       T *prevOutGrad, T *resetOutputGrad,
+                                       T *prev_out_grad, T *reset_output_grad,
-                                       int frameSize, int batchSize,
+                                       int frame_size, int batch_size,
                                       activation_mode_t active_gate) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
+  if (frame_idx >= frame_size) return;
-  int batchIdx = 0;
+  int batch_idx = 0;
-  if (isBatch) {
+  if (is_batch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
+    if (batch_idx >= batch_size) return;
-    gateValue += batchIdx * 3 * frameSize;
+    gate_value += batch_idx * 3 * frame_size;
-    gateGrad += batchIdx * 3 * frameSize;
+    gate_grad += batch_idx * 3 * frame_size;
-    resetOutputGrad += batchIdx * frameSize;
+    reset_output_grad += batch_idx * frame_size;
  }
-  T rResetGateGrad;
+  T r_reset_gate_grad;
-  T rPrevOutValue = 0;
+  T r_prev_out_value = 0;
-  T rPrevOutGrad = 0;
+  T r_prev_out_grad = 0;
-  T rResetOutputGrad = 0;
+  T r_reset_output_grad = 0;
-  T rUpdateGateValue = gateValue[frameIdx + frameSize * 0];
+  T r_update_gate_value = gate_value[frame_idx + frame_size * 0];
-  T rUpdateGateGrad = gateGrad[frameIdx + frameSize * 0];
+  T r_update_gate_grad = gate_grad[frame_idx + frame_size * 0];
-  T rResetGateValue = gateValue[frameIdx + frameSize * 1];
+  T r_reset_gate_value = gate_value[frame_idx + frame_size * 1];
-  if (prevOutValue && prevOutGrad) {
+  if (prev_out_value && prev_out_grad) {
-    if (isBatch) prevOutValue += batchIdx * frameSize;
+    if (is_batch) prev_out_value += batch_idx * frame_size;
-    if (isBatch) prevOutGrad += batchIdx * frameSize;
+    if (is_batch) prev_out_grad += batch_idx * frame_size;
-    rPrevOutValue = prevOutValue[frameIdx];
+    r_prev_out_value = prev_out_value[frame_idx];
-    rPrevOutGrad = prevOutGrad[frameIdx];
+    r_prev_out_grad = prev_out_grad[frame_idx];
-    rResetOutputGrad = resetOutputGrad[frameIdx];
+    r_reset_output_grad = reset_output_grad[frame_idx];
  }
-  opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue,
+  op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value,
-              rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad,
+                r_reset_gate_grad, r_prev_out_value, r_prev_out_grad,
-              active_gate);
+                r_reset_output_grad, active_gate);
-  gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad;
+  gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad;
-  gateGrad[frameIdx + frameSize * 1] = rResetGateGrad;
+  gate_grad[frame_idx + frame_size * 1] = r_reset_gate_grad;
-  if (prevOutGrad) {
+  if (prev_out_grad) {
-    prevOutGrad[frameIdx] = rPrevOutGrad;
+    prev_out_grad[frame_idx] = r_prev_out_grad;
  }
 }
 }  // namespace detail

--- a/paddle/operators/math/detail/gru_kernel.h
+++ b/paddle/operators/math/detail/gru_kernel.h
@@ -28,23 +28,25 @@ namespace forward {
 template <typename T>
 class gru_resetOutput {
 public:
-  HOSTDEVICE void operator()(T &valueUpdateGate, T &valueResetGate, T &prevOut,
+  HOSTDEVICE void operator()(T &value_update_gate, T &value_reset_gate,
-                             T &valueResetOutput, activation_mode_t actGate) {
+                             T &prev_out, T &value_reset_output,
-    valueUpdateGate = activation(valueUpdateGate, actGate);
+                             activation_mode_t act_gate) {
-    valueResetGate = activation(valueResetGate, actGate);
+    value_update_gate = activation(value_update_gate, act_gate);
-    valueResetOutput = prevOut * valueResetGate;
+    value_reset_gate = activation(value_reset_gate, act_gate);
+    value_reset_output = prev_out * value_reset_gate;
  }
 #ifndef __NVCC__
 #ifndef __AVX__
  static const bool avx = false;
 #else
  static const bool avx = true;
-  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueResetGate,
+  HOSTDEVICE void operator()(__m256 &value_update_gate,
-                             __m256 &prevOut, __m256 &valueResetOutput,
+                             __m256 &value_reset_gate, __m256 &prev_out,
-                             activation_mode_t actGate) {
+                             __m256 &value_reset_output,
-    valueUpdateGate = activation(valueUpdateGate, actGate);
+                             activation_mode_t act_gate) {
-    valueResetGate = activation(valueResetGate, actGate);
+    value_update_gate = activation(value_update_gate, act_gate);
-    valueResetOutput = _mm256_mul_ps(prevOut, valueResetGate);
+    value_reset_gate = activation(value_reset_gate, act_gate);
+    value_reset_output = _mm256_mul_ps(prev_out, value_reset_gate);
  }
 #endif
 #endif
@@ -53,24 +55,26 @@ class gru_resetOutput {
 template <typename T>
 class gru_finalOutput {
 public:
-  HOSTDEVICE void operator()(T &valueUpdateGate, T &valueFrameState, T &prevOut,
+  HOSTDEVICE void operator()(T &value_update_gate, T &value_frame_state,
-                             T &valueOutput, activation_mode_t actInput) {
+                             T &prev_out, T &value_output,
-    valueFrameState = activation(valueFrameState, actInput);
+                             activation_mode_t act_input) {
-    valueOutput = prevOut - (valueUpdateGate * prevOut) +
+    value_frame_state = activation(value_frame_state, act_input);
-                  (valueUpdateGate * valueFrameState);
+    value_output = prev_out - (value_update_gate * prev_out) +
+                   (value_update_gate * value_frame_state);
  }
 #ifndef __NVCC__
 #ifndef __AVX__
  static const bool avx = false;
 #else
  static const bool avx = true;
-  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueFrameState,
+  HOSTDEVICE void operator()(__m256 &value_update_gate,
-                             __m256 &prevOut, __m256 &valueOutput,
+                             __m256 &value_frame_state, __m256 &prev_out,
-                             activation_mode_t actInput) {
+                             __m256 &value_output,
-    valueFrameState = activation(valueFrameState, actInput);
+                             activation_mode_t act_input) {
-    valueOutput = _mm256_add_ps(
+    value_frame_state = activation(value_frame_state, act_input);
-        _mm256_sub_ps(prevOut, _mm256_mul_ps(valueUpdateGate, prevOut)),
+    value_output = _mm256_add_ps(
-        _mm256_mul_ps(valueUpdateGate, valueFrameState));
+        _mm256_sub_ps(prev_out, _mm256_mul_ps(value_update_gate, prev_out)),
+        _mm256_mul_ps(value_update_gate, value_frame_state));
  }
 #endif
 #endif
@@ -82,34 +86,37 @@ namespace backward {
 template <typename T>
 class gru_stateGrad {
 public:
-  HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate,
+  HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate,
-                             T &valueFrameState, T &gradFrameState,
+                             T &value_frame_state, T &grad_frame_state,
-                             T &valuePrevOut, T &gradPrevOut, T &gradOutput,
+                             T &value_prev_out, T &grad_prev_out,
-                             activation_mode_t actInput) {
+                             T &grad_output, activation_mode_t act_input) {
-    gradUpdateGate = (gradOutput * valueFrameState);
+    grad_update_gate = (grad_output * value_frame_state);
-    gradUpdateGate -= (gradOutput * valuePrevOut);
+    grad_update_gate -= (grad_output * value_prev_out);
-    gradPrevOut -= (gradOutput * valueUpdateGate);
+    grad_prev_out -= (grad_output * value_update_gate);
-    gradPrevOut += gradOutput;
+    grad_prev_out += grad_output;
-    gradFrameState =
+    grad_frame_state = activation(grad_output * value_update_gate,
-        activation(gradOutput * valueUpdateGate, valueFrameState, actInput);
+                                  value_frame_state, act_input);
  }
 #ifndef __NVCC__
 #ifndef __AVX__
  static const bool avx = false;
 #else
  static const bool avx = true;
-  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate,
+  HOSTDEVICE void operator()(__m256 &value_update_gate,
-                             __m256 &valueFrameState, __m256 &gradFrameState,
+                             __m256 &grad_update_gate,
-                             __m256 &valuePrevOut, __m256 &gradPrevOut,
+                             __m256 &value_frame_state,
-                             __m256 &gradOutput, activation_mode_t actInput) {
+                             __m256 &grad_frame_state, __m256 &value_prev_out,
-    gradUpdateGate = _mm256_mul_ps(gradOutput, valueFrameState);
+                             __m256 &grad_prev_out, __m256 &grad_output,
-    gradUpdateGate =
+                             activation_mode_t act_input) {
-        _mm256_sub_ps(gradUpdateGate, _mm256_mul_ps(gradOutput, valuePrevOut));
+    grad_update_gate = _mm256_mul_ps(grad_output, value_frame_state);
-    gradPrevOut = _mm256_add_ps(
+    grad_update_gate = _mm256_sub_ps(
-        _mm256_sub_ps(gradPrevOut, _mm256_mul_ps(gradOutput, valueUpdateGate)),
+        grad_update_gate, _mm256_mul_ps(grad_output, value_prev_out));
-        gradOutput);
+    grad_prev_out = _mm256_add_ps(
-    gradFrameState = activation(_mm256_mul_ps(gradOutput, valueUpdateGate),
+        _mm256_sub_ps(grad_prev_out,
-                                valueFrameState, actInput);
+                      _mm256_mul_ps(grad_output, value_update_gate)),
+        grad_output);
+    grad_frame_state = activation(_mm256_mul_ps(grad_output, value_update_gate),
+                                  value_frame_state, act_input);
  }
 #endif
 #endif
@@ -118,30 +125,32 @@ class gru_stateGrad {
 template <typename T>
 class gru_resetGrad {
 public:
-  HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate,
+  HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate,
-                             T &valueResetGate, T &gradResetGate,
+                             T &value_reset_gate, T &grad_reset_gate,
-                             T &valuePrevOut, T &gradPrevOut,
+                             T &value_prev_out, T &grad_prev_out,
-                             T &gradResetOutput, activation_mode_t actGate) {
+                             T &grad_reset_output, activation_mode_t act_gate) {
-    gradResetGate = (gradResetOutput * valuePrevOut);
+    grad_reset_gate = (grad_reset_output * value_prev_out);
-    gradPrevOut += (gradResetOutput * valueResetGate);
+    grad_prev_out += (grad_reset_output * value_reset_gate);
-    gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate);
+    grad_update_gate =
-    gradResetGate = activation(gradResetGate, valueResetGate, actGate);
+        activation(grad_update_gate, value_update_gate, act_gate);
+    grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate);
  }
 #ifndef __NVCC__
 #ifndef __AVX__
  static const bool avx = false;
 #else
  static const bool avx = true;
-  HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate,
+  HOSTDEVICE void operator()(__m256 &value_update_gate,
-                             __m256 &valueResetGate, __m256 &gradResetGate,
+                             __m256 &grad_update_gate, __m256 &value_reset_gate,
-                             __m256 &valuePrevOut, __m256 &gradPrevOut,
+                             __m256 &grad_reset_gate, __m256 &value_prev_out,
-                             __m256 &gradResetOutput,
+                             __m256 &grad_prev_out, __m256 &grad_reset_output,
-                             activation_mode_t actGate) {
+                             activation_mode_t act_gate) {
-    gradResetGate = _mm256_mul_ps(gradResetOutput, valuePrevOut);
+    grad_reset_gate = _mm256_mul_ps(grad_reset_output, value_prev_out);
-    gradPrevOut = _mm256_add_ps(gradPrevOut,
+    grad_prev_out = _mm256_add_ps(
-                                _mm256_mul_ps(gradResetOutput, valueResetGate));
+        grad_prev_out, _mm256_mul_ps(grad_reset_output, value_reset_gate));
-    gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate);
+    grad_update_gate =
-    gradResetGate = activation(gradResetGate, valueResetGate, actGate);
+        activation(grad_update_gate, value_update_gate, act_gate);
+    grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate);
  }
 #endif
 #endif

--- a/paddle/operators/math/detail/lstm_cpu_kernel.h
+++ b/paddle/operators/math/detail/lstm_cpu_kernel.h
@@ -26,278 +26,284 @@ namespace detail {
 template <class T, class Op>
 void naive_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
-                                     int frameSize,
+                                     int frame_size,
                                     activation_mode_t active_node,
                                     activation_mode_t active_gate,
                                     activation_mode_t active_state) {
-  T rValueIn;
+  T r_value_in;
-  T rValueIg;
+  T r_value_ig;
-  T rValueFg;
+  T r_value_fg;
-  T rValueOg;
+  T r_value_og;
-  T rCheckI;
+  T r_checkI;
-  T rCheckF;
+  T r_checkF;
-  T rCheckO;
+  T r_checkO;
-  T rState;
+  T r_state;
-  T rPrevState = 0;
+  T r_prev_state = 0;
-  T rStateAtv;
+  T r_state_atv;
-  T rOut;
+  T r_out;
-  T *valueIn = value.gateValue;
+  T *value_in = value.gate_value;
-  T *valueIg = value.gateValue + frameSize;
+  T *value_ig = value.gate_value + frame_size;
-  T *valueFg = value.gateValue + frameSize * 2;
+  T *value_fg = value.gate_value + frame_size * 2;
-  T *valueOg = value.gateValue + frameSize * 3;
+  T *value_og = value.gate_value + frame_size * 3;
-  for (int i = 0; i < frameSize; i++) {
+  for (int i = 0; i < frame_size; i++) {
-    rValueIn = valueIn[i];
+    r_value_in = value_in[i];
-    rValueIg = valueIg[i];
+    r_value_ig = value_ig[i];
-    rValueFg = valueFg[i];
+    r_value_fg = value_fg[i];
-    rValueOg = valueOg[i];
+    r_value_og = value_og[i];
-    rCheckI = value.checkIg ? value.checkIg[i] : 0;
+    r_checkI = value.check_ig ? value.check_ig[i] : 0;
-    rCheckF = value.checkFg ? value.checkFg[i] : 0;
+    r_checkF = value.check_fg ? value.check_fg[i] : 0;
-    rCheckO = value.checkOg ? value.checkOg[i] : 0;
+    r_checkO = value.check_og ? value.check_og[i] : 0;
-    if (value.prevStateValue) {
+    if (value.prev_state_value) {
-      rPrevState = value.prevStateValue[i];
+      r_prev_state = value.prev_state_value[i];
    }
-    op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv,
+    op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state,
-       rOut, rCheckI, rCheckF, rCheckO, active_node, active_gate, active_state);
+       r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node,
+       active_gate, active_state);
-    valueIn[i] = rValueIn;
+    value_in[i] = r_value_in;
-    valueIg[i] = rValueIg;
+    value_ig[i] = r_value_ig;
-    valueFg[i] = rValueFg;
+    value_fg[i] = r_value_fg;
-    valueOg[i] = rValueOg;
+    value_og[i] = r_value_og;
-    value.stateValue[i] = rState;
+    value.state_value[i] = r_state;
-    value.stateActiveValue[i] = rStateAtv;
+    value.state_active_value[i] = r_state_atv;
-    value.outputValue[i] = rOut;
+    value.output_value[i] = r_out;
  }
 }
 template <class T, class Op>
 void naive_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
-                                      LstmMetaGrad<T> grad, int frameSize,
+                                      LstmMetaGrad<T> grad, int frame_size,
                                      activation_mode_t active_node,
                                      activation_mode_t active_gate,
                                      activation_mode_t active_state) {
-  T rValueIn;
+  T r_value_in;
-  T rValueIg;
+  T r_value_ig;
-  T rValueFg;
+  T r_value_fg;
-  T rValueOg;
+  T r_value_og;
-  T rGradIn;
+  T r_grad_in;
-  T rGradIg;
+  T r_grad_ig;
-  T rGradFg;
+  T r_grad_fg;
-  T rGradOg;
+  T r_grad_og;
-  T rPrevState = 0;
+  T r_prev_state = 0;
-  T rPrevStateGrad;
+  T r_prev_state_grad;
-  T rState;
+  T r_state;
-  T rStateGrad;
+  T r_state_grad;
-  T rStateAtv;
+  T r_state_atv;
-  T rOutputGrad;
+  T r_output_grad;
-  T rCheckI;
+  T r_checkI;
-  T rCheckF;
+  T r_checkF;
-  T rCheckO;
+  T r_checkO;
-  T rCheckIGrad;
+  T r_checkIGrad;
-  T rCheckFGrad;
+  T r_checkFGrad;
-  T rCheckOGrad;
+  T r_checkOGrad;
-  T *valueIn = value.gateValue;
+  T *value_in = value.gate_value;
-  T *valueIg = value.gateValue + frameSize;
+  T *value_ig = value.gate_value + frame_size;
-  T *valueFg = value.gateValue + frameSize * 2;
+  T *value_fg = value.gate_value + frame_size * 2;
-  T *valueOg = value.gateValue + frameSize * 3;
+  T *value_og = value.gate_value + frame_size * 3;
-  T *gradIn = grad.gateGrad;
+  T *grad_in = grad.gate_grad;
-  T *gradIg = grad.gateGrad + frameSize;
+  T *grad_ig = grad.gate_grad + frame_size;
-  T *gradFg = grad.gateGrad + frameSize * 2;
+  T *grad_fg = grad.gate_grad + frame_size * 2;
-  T *gradOg = grad.gateGrad + frameSize * 3;
+  T *grad_og = grad.gate_grad + frame_size * 3;
-  for (int i = 0; i < frameSize; i++) {
+  for (int i = 0; i < frame_size; i++) {
-    rValueIn = valueIn[i];
+    r_value_in = value_in[i];
-    rValueIg = valueIg[i];
+    r_value_ig = value_ig[i];
-    rValueFg = valueFg[i];
+    r_value_fg = value_fg[i];
-    rValueOg = valueOg[i];
+    r_value_og = value_og[i];
-    rCheckI = value.checkIg ? value.checkIg[i] : 0;
+    r_checkI = value.check_ig ? value.check_ig[i] : 0;
-    rCheckF = value.checkFg ? value.checkFg[i] : 0;
+    r_checkF = value.check_fg ? value.check_fg[i] : 0;
-    rCheckO = value.checkOg ? value.checkOg[i] : 0;
+    r_checkO = value.check_og ? value.check_og[i] : 0;
-    rState = value.stateValue[i];
+    r_state = value.state_value[i];
-    rStateAtv = value.stateActiveValue[i];
+    r_state_atv = value.state_active_value[i];
-    rOutputGrad = grad.outputGrad[i];
+    r_output_grad = grad.output_grad[i];
-    rStateGrad = grad.stateGrad[i];
+    r_state_grad = grad.state_grad[i];
-    if (value.prevStateValue) {
+    if (value.prev_state_value) {
-      rPrevState = value.prevStateValue[i];
+      r_prev_state = value.prev_state_value[i];
    }
-    op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg,
+    op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig,
-       rGradOg, rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv,
+       r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state,
-       rOutputGrad, rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad,
+       r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO,
-       rCheckOGrad, active_node, active_gate, active_state);
+       r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate,
+       active_state);
-    gradIn[i] = rGradIn;
-    gradIg[i] = rGradIg;
+    grad_in[i] = r_grad_in;
-    gradFg[i] = rGradFg;
+    grad_ig[i] = r_grad_ig;
-    gradOg[i] = rGradOg;
+    grad_fg[i] = r_grad_fg;
-    grad.stateGrad[i] = rStateGrad;
+    grad_og[i] = r_grad_og;
+    grad.state_grad[i] = r_state_grad;
-    if (grad.prevStateGrad) grad.prevStateGrad[i] = rPrevStateGrad;
-    if (value.prevStateValue) {
+    if (grad.prev_state_grad) grad.prev_state_grad[i] = r_prev_state_grad;
-      if (grad.checkIgGrad) grad.checkIgGrad[i] += rCheckIGrad;
+    if (value.prev_state_value) {
-      if (grad.checkFgGrad) grad.checkFgGrad[i] += rCheckFGrad;
+      if (grad.check_ig_grad) grad.check_ig_grad[i] += r_checkIGrad;
+      if (grad.check_fg_grad) grad.check_fg_grad[i] += r_checkFGrad;
    }
-    if (grad.checkOgGrad) grad.checkOgGrad[i] += rCheckOGrad;
+    if (grad.check_og_grad) grad.check_og_grad[i] += r_checkOGrad;
  }
 }
 template <class T, class Op>
-void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value, int frameSize,
+void avx_lstm_forward_one_sequence(Op op, LstmMetaValue<T> value,
+                                   int frame_size,
                                   activation_mode_t active_node,
                                   activation_mode_t active_gate,
                                   activation_mode_t active_state) {
 #ifdef __AVX__
-  __m256 rValueIn;
+  __m256 r_value_in;
-  __m256 rValueIg;
+  __m256 r_value_ig;
-  __m256 rValueFg;
+  __m256 r_value_fg;
-  __m256 rValueOg;
+  __m256 r_value_og;
-  __m256 rCheckI = _mm256_set1_ps(0.0f);
+  __m256 r_checkI = _mm256_set1_ps(0.0f);
-  __m256 rCheckF = _mm256_set1_ps(0.0f);
+  __m256 r_checkF = _mm256_set1_ps(0.0f);
-  __m256 rCheckO = _mm256_set1_ps(0.0f);
+  __m256 r_checkO = _mm256_set1_ps(0.0f);
-  __m256 rState;
+  __m256 r_state;
-  __m256 rPrevState = _mm256_set1_ps(0.0f);
+  __m256 r_prev_state = _mm256_set1_ps(0.0f);
-  __m256 rStateAtv;
+  __m256 r_state_atv;
-  __m256 rOut;
+  __m256 r_out;
-  __m256 *valueIn = (__m256 *)value.gateValue;
+  __m256 *value_in = (__m256 *)value.gate_value;
-  __m256 *valueIg = (__m256 *)(value.gateValue + frameSize);
+  __m256 *value_ig = (__m256 *)(value.gate_value + frame_size);
-  __m256 *valueFg = (__m256 *)(value.gateValue + frameSize * 2);
+  __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2);
-  __m256 *valueOg = (__m256 *)(value.gateValue + frameSize * 3);
+  __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3);
-  for (int i = 0; i < frameSize / 8; i++) {
+  for (int i = 0; i < frame_size / 8; i++) {
-    rValueIn = valueIn[i];
+    r_value_in = value_in[i];
-    rValueIg = valueIg[i];
+    r_value_ig = value_ig[i];
-    rValueFg = valueFg[i];
+    r_value_fg = value_fg[i];
-    rValueOg = valueOg[i];
+    r_value_og = value_og[i];
-    if (value.checkIg) {
+    if (value.check_ig) {
-      rCheckI = ((__m256 *)value.checkIg)[i];
+      r_checkI = ((__m256 *)value.check_ig)[i];
-      rCheckF = ((__m256 *)value.checkFg)[i];
+      r_checkF = ((__m256 *)value.check_fg)[i];
-      rCheckO = ((__m256 *)value.checkOg)[i];
+      r_checkO = ((__m256 *)value.check_og)[i];
    }
-    if (value.prevStateValue) {
+    if (value.prev_state_value) {
-      rPrevState = ((__m256 *)value.prevStateValue)[i];
+      r_prev_state = ((__m256 *)value.prev_state_value)[i];
    }
-    op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv,
+    op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state,
-       rOut, rCheckI, rCheckF, rCheckO, active_node, active_gate, active_state);
+       r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node,
+       active_gate, active_state);
-    valueIn[i] = rValueIn;
+    value_in[i] = r_value_in;
-    valueIg[i] = rValueIg;
+    value_ig[i] = r_value_ig;
-    valueFg[i] = rValueFg;
+    value_fg[i] = r_value_fg;
-    valueOg[i] = rValueOg;
+    value_og[i] = r_value_og;
-    ((__m256 *)value.stateValue)[i] = rState;
+    ((__m256 *)value.state_value)[i] = r_state;
-    ((__m256 *)value.stateActiveValue)[i] = rStateAtv;
+    ((__m256 *)value.state_active_value)[i] = r_state_atv;
-    ((__m256 *)value.outputValue)[i] = rOut;
+    ((__m256 *)value.output_value)[i] = r_out;
  }
 #endif
 }
 template <class T, class Op>
 void avx_lstm_backward_one_sequence(Op op, LstmMetaValue<T> value,
-                                    LstmMetaGrad<T> grad, int frameSize,
+                                    LstmMetaGrad<T> grad, int frame_size,
                                    activation_mode_t active_node,
                                    activation_mode_t active_gate,
                                    activation_mode_t active_state) {
 #ifdef __AVX__
-  __m256 rValueIn;
+  __m256 r_value_in;
-  __m256 rValueIg;
+  __m256 r_value_ig;
-  __m256 rValueFg;
+  __m256 r_value_fg;
-  __m256 rValueOg;
+  __m256 r_value_og;
-  __m256 rGradIn;
+  __m256 r_grad_in;
-  __m256 rGradIg;
+  __m256 r_grad_ig;
-  __m256 rGradFg;
+  __m256 r_grad_fg;
-  __m256 rGradOg;
+  __m256 r_grad_og;
-  __m256 rPrevState = _mm256_set1_ps(0.0f);
+  __m256 r_prev_state = _mm256_set1_ps(0.0f);
-  __m256 rPrevStateGrad;
+  __m256 r_prev_state_grad;
-  __m256 rStateGrad;
+  __m256 r_state_grad;
-  __m256 rState;
+  __m256 r_state;
-  __m256 rStateAtv;
+  __m256 r_state_atv;
-  __m256 rOutputGrad;
+  __m256 r_output_grad;
-  __m256 rCheckI = _mm256_set1_ps(0.0f);
+  __m256 r_checkI = _mm256_set1_ps(0.0f);
-  __m256 rCheckF = _mm256_set1_ps(0.0f);
+  __m256 r_checkF = _mm256_set1_ps(0.0f);
-  __m256 rCheckO = _mm256_set1_ps(0.0f);
+  __m256 r_checkO = _mm256_set1_ps(0.0f);
-  __m256 rCheckIGrad;
+  __m256 r_checkIGrad;
-  __m256 rCheckFGrad;
+  __m256 r_checkFGrad;
-  __m256 rCheckOGrad;
+  __m256 r_checkOGrad;
-  __m256 *valueIn = (__m256 *)value.gateValue;
+  __m256 *value_in = (__m256 *)value.gate_value;
-  __m256 *valueIg = (__m256 *)(value.gateValue + frameSize);
+  __m256 *value_ig = (__m256 *)(value.gate_value + frame_size);
-  __m256 *valueFg = (__m256 *)(value.gateValue + frameSize * 2);
+  __m256 *value_fg = (__m256 *)(value.gate_value + frame_size * 2);
-  __m256 *valueOg = (__m256 *)(value.gateValue + frameSize * 3);
+  __m256 *value_og = (__m256 *)(value.gate_value + frame_size * 3);
-  __m256 *gradIn = (__m256 *)grad.gateGrad;
+  __m256 *grad_in = (__m256 *)grad.gate_grad;
-  __m256 *gradIg = (__m256 *)(grad.gateGrad + frameSize);
+  __m256 *grad_ig = (__m256 *)(grad.gate_grad + frame_size);
-  __m256 *gradFg = (__m256 *)(grad.gateGrad + frameSize * 2);
+  __m256 *grad_fg = (__m256 *)(grad.gate_grad + frame_size * 2);
-  __m256 *gradOg = (__m256 *)(grad.gateGrad + frameSize * 3);
+  __m256 *grad_og = (__m256 *)(grad.gate_grad + frame_size * 3);
-  for (int i = 0; i < frameSize / 8; i++) {
+  for (int i = 0; i < frame_size / 8; i++) {
-    rValueIn = valueIn[i];
+    r_value_in = value_in[i];
-    rValueIg = valueIg[i];
+    r_value_ig = value_ig[i];
-    rValueFg = valueFg[i];
+    r_value_fg = value_fg[i];
-    rValueOg = valueOg[i];
+    r_value_og = value_og[i];
-    if (value.checkIg) {
+    if (value.check_ig) {
-      rCheckI = ((__m256 *)value.checkIg)[i];
+      r_checkI = ((__m256 *)value.check_ig)[i];
-      rCheckF = ((__m256 *)value.checkFg)[i];
+      r_checkF = ((__m256 *)value.check_fg)[i];
-      rCheckO = ((__m256 *)value.checkOg)[i];
+      r_checkO = ((__m256 *)value.check_og)[i];
    }
-    rState = ((__m256 *)value.stateValue)[i];
+    r_state = ((__m256 *)value.state_value)[i];
-    rStateAtv = ((__m256 *)value.stateActiveValue)[i];
+    r_state_atv = ((__m256 *)value.state_active_value)[i];
-    rOutputGrad = ((__m256 *)grad.outputGrad)[i];
+    r_output_grad = ((__m256 *)grad.output_grad)[i];
-    rStateGrad = ((__m256 *)grad.stateGrad)[i];
+    r_state_grad = ((__m256 *)grad.state_grad)[i];
-    if (value.prevStateValue) {
+    if (value.prev_state_value) {
-      rPrevState = ((__m256 *)value.prevStateValue)[i];
+      r_prev_state = ((__m256 *)value.prev_state_value)[i];
    }
-    op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg,
+    op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig,
-       rGradOg, rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv,
+       r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state,
-       rOutputGrad, rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad,
+       r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO,
-       rCheckOGrad, active_node, active_gate, active_state);
+       r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate,
+       active_state);
-    gradIn[i] = rGradIn;
-    gradIg[i] = rGradIg;
+    grad_in[i] = r_grad_in;
-    gradFg[i] = rGradFg;
+    grad_ig[i] = r_grad_ig;
-    gradOg[i] = rGradOg;
+    grad_fg[i] = r_grad_fg;
-    ((__m256 *)grad.stateGrad)[i] = rStateGrad;
+    grad_og[i] = r_grad_og;
+    ((__m256 *)grad.state_grad)[i] = r_state_grad;
-    if (grad.prevStateGrad) ((__m256 *)grad.prevStateGrad)[i] = rPrevStateGrad;
-    if (value.prevStateValue) {
+    if (grad.prev_state_grad)
-      if (grad.checkIgGrad) ((__m256 *)grad.checkIgGrad)[i] += rCheckIGrad;
+      ((__m256 *)grad.prev_state_grad)[i] = r_prev_state_grad;
-      if (grad.checkFgGrad) ((__m256 *)grad.checkFgGrad)[i] += rCheckFGrad;
+    if (value.prev_state_value) {
+      if (grad.check_ig_grad) ((__m256 *)grad.check_ig_grad)[i] += r_checkIGrad;
+      if (grad.check_fg_grad) ((__m256 *)grad.check_fg_grad)[i] += r_checkFGrad;
    }
-    if (grad.checkOgGrad) ((__m256 *)grad.checkOgGrad)[i] += rCheckOGrad;
+    if (grad.check_og_grad) ((__m256 *)grad.check_og_grad)[i] += r_checkOGrad;
  }
 #endif
 }
 template <class T, class Op>
-void cpu_lstm_forward(Op op, LstmMetaValue<T> value, int frameSize,
+void cpu_lstm_forward(Op op, LstmMetaValue<T> value, int frame_size,
                      activation_mode_t active_node,
                      activation_mode_t active_gate,
                      activation_mode_t active_state) {
-  if (Op::avx && !(frameSize & (8 - 1)) && (std::is_same<T, float>::value)) {
+  if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
-    avx_lstm_forward_one_sequence<T>(op, value, frameSize, active_node,
+    avx_lstm_forward_one_sequence<T>(op, value, frame_size, active_node,
                                     active_gate, active_state);
  } else {
-    naive_lstm_forward_one_sequence<T>(op, value, frameSize, active_node,
+    naive_lstm_forward_one_sequence<T>(op, value, frame_size, active_node,
                                       active_gate, active_state);
  }
 }
 template <class T, class Op>
 void cpu_lstm_backward(Op op, LstmMetaValue<T> value, LstmMetaGrad<T> grad,
-                       int frameSize, activation_mode_t active_node,
+                       int frame_size, activation_mode_t active_node,
                       activation_mode_t active_gate,
                       activation_mode_t active_state) {
-  if (Op::avx && !(frameSize & (8 - 1)) && (std::is_same<T, float>::value)) {
+  if (Op::avx && !(frame_size & (8 - 1)) && (std::is_same<T, float>::value)) {
-    avx_lstm_backward_one_sequence<T>(op, value, grad, frameSize, active_node,
+    avx_lstm_backward_one_sequence<T>(op, value, grad, frame_size, active_node,
                                      active_gate, active_state);
  } else {
-    naive_lstm_backward_one_sequence<T>(op, value, grad, frameSize, active_node,
+    naive_lstm_backward_one_sequence<T>(op, value, grad, frame_size,
-                                        active_gate, active_state);
+                                        active_node, active_gate, active_state);
  }
 }

--- a/paddle/operators/math/detail/lstm_gpu_kernel.h
+++ b/paddle/operators/math/detail/lstm_gpu_kernel.h
@@ -26,189 +26,192 @@ namespace math {
 namespace detail {
 /*
- * threads(framePerBlock, batchPerBlock)
+ * threads(frame_per_block, batch_per_block)
- * grid(frameBlocks, batchBlocks)
+ * grid(frame_blocks, batch_blocks)
 */
-template <class T, class Op, bool isBatch>
+template <class T, class Op, bool is_batch>
-__global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frameSize,
+__global__ void KeLstmForward(Op op, LstmMetaValue<T> value, int frame_size,
-                              int batchSize, activation_mode_t active_node,
+                              int batch_size, activation_mode_t active_node,
                              activation_mode_t active_gate,
                              activation_mode_t active_state) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
+  if (frame_idx >= frame_size) return;
-  int batchIdx = 0;
+  int batch_idx = 0;
-  if (isBatch) {
+  if (is_batch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
+    if (batch_idx >= batch_size) return;
-    value.gateValue += batchIdx * frameSize * 4;
+    value.gate_value += batch_idx * frame_size * 4;
-    value.outputValue += batchIdx * frameSize;
+    value.output_value += batch_idx * frame_size;
-    value.stateValue += batchIdx * frameSize;
+    value.state_value += batch_idx * frame_size;
-    value.stateActiveValue += batchIdx * frameSize;
+    value.state_active_value += batch_idx * frame_size;
  }
-  T rState;
+  T r_state;
-  T rPrevState = 0;
+  T r_prev_state = 0;
-  T rStateAtv;
+  T r_state_atv;
-  T rOut;
+  T r_out;
-  T rValueIn;
+  T r_value_in;
-  T rValueIg;
+  T r_value_ig;
-  T rValueFg;
+  T r_value_fg;
-  T rValueOg;
+  T r_value_og;
-  T rCheckI = value.checkIg ? value.checkIg[frameIdx] : 0;
+  T r_checkI = value.check_ig ? value.check_ig[frame_idx] : 0;
-  T rCheckF = value.checkFg ? value.checkFg[frameIdx] : 0;
+  T r_checkF = value.check_fg ? value.check_fg[frame_idx] : 0;
-  T rCheckO = value.checkOg ? value.checkOg[frameIdx] : 0;
+  T r_checkO = value.check_og ? value.check_og[frame_idx] : 0;
-  rValueIn = value.gateValue[frameIdx];
+  r_value_in = value.gate_value[frame_idx];
-  rValueIg = value.gateValue[frameIdx + frameSize];
+  r_value_ig = value.gate_value[frame_idx + frame_size];
-  rValueFg = value.gateValue[frameIdx + frameSize * 2];
+  r_value_fg = value.gate_value[frame_idx + frame_size * 2];
-  rValueOg = value.gateValue[frameIdx + frameSize * 3];
+  r_value_og = value.gate_value[frame_idx + frame_size * 3];
-  if (value.prevStateValue) {
+  if (value.prev_state_value) {
-    if (isBatch) value.prevStateValue += batchIdx * frameSize;
+    if (is_batch) value.prev_state_value += batch_idx * frame_size;
-    rPrevState = value.prevStateValue[frameIdx];
+    r_prev_state = value.prev_state_value[frame_idx];
  }
-  op(rValueIn, rValueIg, rValueFg, rValueOg, rPrevState, rState, rStateAtv,
+  op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_prev_state, r_state,
-     rOut, rCheckI, rCheckF, rCheckO, active_node, active_gate, active_state);
+     r_state_atv, r_out, r_checkI, r_checkF, r_checkO, active_node, active_gate,
+     active_state);
-  value.gateValue[frameIdx] = rValueIn;
+  value.gate_value[frame_idx] = r_value_in;
-  value.gateValue[frameIdx + frameSize] = rValueIg;
+  value.gate_value[frame_idx + frame_size] = r_value_ig;
-  value.gateValue[frameIdx + frameSize * 2] = rValueFg;
+  value.gate_value[frame_idx + frame_size * 2] = r_value_fg;
-  value.gateValue[frameIdx + frameSize * 3] = rValueOg;
+  value.gate_value[frame_idx + frame_size * 3] = r_value_og;
-  value.stateValue[frameIdx] = rState;
+  value.state_value[frame_idx] = r_state;
-  value.stateActiveValue[frameIdx] = rStateAtv;
+  value.state_active_value[frame_idx] = r_state_atv;
-  value.outputValue[frameIdx] = rOut;
+  value.output_value[frame_idx] = r_out;
 }
 /*
- * threads(framePerBlock, batchPerBlock)
+ * threads(frame_per_block, batch_per_block)
- * grid(frameBlocks, batchBlocks)
+ * grid(frame_blocks, batch_blocks)
 */
-template <class T, class Op, bool isBatch>
+template <class T, class Op, bool is_batch>
 __global__ void KeLstmBackward(Op op, LstmMetaValue<T> value,
-                               LstmMetaGrad<T> grad, int frameSize,
+                               LstmMetaGrad<T> grad, int frame_size,
-                               int batchSize, activation_mode_t active_node,
+                               int batch_size, activation_mode_t active_node,
                               activation_mode_t active_gate,
                               activation_mode_t active_state) {
-  const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x;
+  const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x;
-  if (frameIdx >= frameSize) return;
+  if (frame_idx >= frame_size) return;
-  int batchIdx = 0;
+  int batch_idx = 0;
-  if (isBatch) {
+  if (is_batch) {
-    batchIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    batch_idx = blockIdx.y * blockDim.y + threadIdx.y;
-    if (batchIdx >= batchSize) return;
+    if (batch_idx >= batch_size) return;
-    value.gateValue += batchIdx * frameSize * 4;
+    value.gate_value += batch_idx * frame_size * 4;
-    value.stateValue += batchIdx * frameSize;
+    value.state_value += batch_idx * frame_size;
-    value.stateActiveValue += batchIdx * frameSize;
+    value.state_active_value += batch_idx * frame_size;
-    grad.gateGrad += batchIdx * frameSize * 4;
+    grad.gate_grad += batch_idx * frame_size * 4;
-    grad.stateGrad += batchIdx * frameSize;
+    grad.state_grad += batch_idx * frame_size;
-    grad.outputGrad += batchIdx * frameSize;
+    grad.output_grad += batch_idx * frame_size;
  }
-  T rValueIn;
+  T r_value_in;
-  T rValueIg;
+  T r_value_ig;
-  T rValueFg;
+  T r_value_fg;
-  T rValueOg;
+  T r_value_og;
-  T rGradIn;
+  T r_grad_in;
-  T rGradIg;
+  T r_grad_ig;
-  T rGradFg;
+  T r_grad_fg;
-  T rGradOg;
+  T r_grad_og;
-  T rPrevState = 0;
+  T r_prev_state = 0;
-  T rPrevStateGrad;
+  T r_prev_state_grad;
-  T rState;
+  T r_state;
-  T rStateGrad;
+  T r_state_grad;
-  T rStateAtv;
+  T r_state_atv;
-  T rOutputGrad;
+  T r_output_grad;
-  T rCheckI = value.checkIg ? value.checkIg[frameIdx] : 0;
+  T r_checkI = value.check_ig ? value.check_ig[frame_idx] : 0;
-  T rCheckF = value.checkFg ? value.checkFg[frameIdx] : 0;
+  T r_checkF = value.check_fg ? value.check_fg[frame_idx] : 0;
-  T rCheckO = value.checkOg ? value.checkOg[frameIdx] : 0;
+  T r_checkO = value.check_og ? value.check_og[frame_idx] : 0;
-  T rCheckIGrad;
+  T r_checkIGrad;
-  T rCheckFGrad;
+  T r_checkFGrad;
-  T rCheckOGrad;
+  T r_checkOGrad;
-  rValueIn = value.gateValue[frameIdx];
+  r_value_in = value.gate_value[frame_idx];
-  rValueIg = value.gateValue[frameIdx + frameSize];
+  r_value_ig = value.gate_value[frame_idx + frame_size];
-  rValueFg = value.gateValue[frameIdx + frameSize * 2];
+  r_value_fg = value.gate_value[frame_idx + frame_size * 2];
-  rValueOg = value.gateValue[frameIdx + frameSize * 3];
+  r_value_og = value.gate_value[frame_idx + frame_size * 3];
-  rState = value.stateValue[frameIdx];
+  r_state = value.state_value[frame_idx];
-  rStateAtv = value.stateActiveValue[frameIdx];
+  r_state_atv = value.state_active_value[frame_idx];
-  rOutputGrad = grad.outputGrad[frameIdx];
+  r_output_grad = grad.output_grad[frame_idx];
-  rStateGrad = grad.stateGrad[frameIdx];
+  r_state_grad = grad.state_grad[frame_idx];
-  if (value.prevStateValue) {
+  if (value.prev_state_value) {
-    if (isBatch) value.prevStateValue += batchIdx * frameSize;
+    if (is_batch) value.prev_state_value += batch_idx * frame_size;
-    rPrevState = value.prevStateValue[frameIdx];
+    r_prev_state = value.prev_state_value[frame_idx];
  }
-  op(rValueIn, rValueIg, rValueFg, rValueOg, rGradIn, rGradIg, rGradFg, rGradOg,
+  op(r_value_in, r_value_ig, r_value_fg, r_value_og, r_grad_in, r_grad_ig,
-     rPrevState, rPrevStateGrad, rState, rStateGrad, rStateAtv, rOutputGrad,
+     r_grad_fg, r_grad_og, r_prev_state, r_prev_state_grad, r_state,
-     rCheckI, rCheckF, rCheckO, rCheckIGrad, rCheckFGrad, rCheckOGrad,
+     r_state_grad, r_state_atv, r_output_grad, r_checkI, r_checkF, r_checkO,
-     active_node, active_gate, active_state);
+     r_checkIGrad, r_checkFGrad, r_checkOGrad, active_node, active_gate,
+     active_state);
-  grad.gateGrad[frameIdx] = rGradIn;
-  grad.gateGrad[frameIdx + frameSize] = rGradIg;
+  grad.gate_grad[frame_idx] = r_grad_in;
-  grad.gateGrad[frameIdx + frameSize * 2] = rGradFg;
+  grad.gate_grad[frame_idx + frame_size] = r_grad_ig;
-  grad.gateGrad[frameIdx + frameSize * 3] = rGradOg;
+  grad.gate_grad[frame_idx + frame_size * 2] = r_grad_fg;
-  grad.stateGrad[frameIdx] = rStateGrad;
+  grad.gate_grad[frame_idx + frame_size * 3] = r_grad_og;
-  if (grad.prevStateGrad) {
+  grad.state_grad[frame_idx] = r_state_grad;
-    if (isBatch) grad.prevStateGrad += batchIdx * frameSize;
+  if (grad.prev_state_grad) {
-    grad.prevStateGrad[frameIdx] = rPrevStateGrad;
+    if (is_batch) grad.prev_state_grad += batch_idx * frame_size;
+    grad.prev_state_grad[frame_idx] = r_prev_state_grad;
  }
-  if (isBatch) {
+  if (is_batch) {
-    if (value.prevStateValue) {
+    if (value.prev_state_value) {
-      if (grad.checkIgGrad)
+      if (grad.check_ig_grad)
-        paddle::platform::CudaAtomicAdd(grad.checkIgGrad + frameIdx,
+        paddle::platform::CudaAtomicAdd(grad.check_ig_grad + frame_idx,
-                                        rCheckIGrad);
+                                        r_checkIGrad);
-      if (grad.checkFgGrad)
+      if (grad.check_fg_grad)
-        paddle::platform::CudaAtomicAdd(grad.checkFgGrad + frameIdx,
+        paddle::platform::CudaAtomicAdd(grad.check_fg_grad + frame_idx,
-                                        rCheckFGrad);
+                                        r_checkFGrad);
    }
-    if (grad.checkOgGrad)
+    if (grad.check_og_grad)
-      paddle::platform::CudaAtomicAdd(grad.checkOgGrad + frameIdx, rCheckOGrad);
+      paddle::platform::CudaAtomicAdd(grad.check_og_grad + frame_idx,
+                                      r_checkOGrad);
  } else {
-    if (value.prevStateValue) {
+    if (value.prev_state_value) {
-      if (grad.checkIgGrad) grad.checkIgGrad[frameIdx] += rCheckIGrad;
+      if (grad.check_ig_grad) grad.check_ig_grad[frame_idx] += r_checkIGrad;
-      if (grad.checkFgGrad) grad.checkFgGrad[frameIdx] += rCheckFGrad;
+      if (grad.check_fg_grad) grad.check_fg_grad[frame_idx] += r_checkFGrad;
    }
-    if (grad.checkOgGrad) grad.checkOgGrad[frameIdx] += rCheckOGrad;
+    if (grad.check_og_grad) grad.check_og_grad[frame_idx] += r_checkOGrad;
  }
 }
 template <class T, class Op>
 void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
-                      LstmMetaValue<T> value, int frameSize, int batchSize,
+                      LstmMetaValue<T> value, int frame_size, int batch_size,
                      activation_mode_t active_node,
                      activation_mode_t active_gate,
                      activation_mode_t active_state) {
  dim3 threads;
  dim3 grid;
-  if (batchSize == 1) {
+  if (batch_size == 1) {
-    int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
+    int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
-    int frameBlocks = (frameSize + 1024 - 1) / 1024;
+    int frame_blocks = (frame_size + 1024 - 1) / 1024;
-    threads = dim3(framePerBlock, 1);
+    threads = dim3(frame_per_block, 1);
-    grid = dim3(frameBlocks, 1);
+    grid = dim3(frame_blocks, 1);
  } else {
-    /* framePerBlock = 32 batchPerBlock = 32 */
+    /* frame_per_block = 32 batch_per_block = 32 */
    threads = dim3(32, 32);
-    grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
+    grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
  }
  auto stream =
      reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
-  if (batchSize == 1) {
+  if (batch_size == 1) {
    KeLstmForward<T, Op,
-                  /* isBatch= */ false><<<grid, threads, 0, stream>>>(
+                  /* is_batch= */ false><<<grid, threads, 0, stream>>>(
-        op, value, frameSize, batchSize, active_node, active_gate,
+        op, value, frame_size, batch_size, active_node, active_gate,
        active_state);
  } else {
    KeLstmForward<T, Op,
-                  /* isBatch= */ true><<<grid, threads, 0, stream>>>(
+                  /* is_batch= */ true><<<grid, threads, 0, stream>>>(
-        op, value, frameSize, batchSize, active_node, active_gate,
+        op, value, frame_size, batch_size, active_node, active_gate,
        active_state);
  }
 }
@@ -216,34 +219,34 @@ void gpu_lstm_forward(const platform::DeviceContext& context, Op op,
 template <class T, class Op>
 void gpu_lstm_backward(const platform::DeviceContext& context, Op op,
                       LstmMetaValue<T> value, LstmMetaGrad<T> grad,
-                       int frameSize, int batchSize,
+                       int frame_size, int batch_size,
                       activation_mode_t active_node,
                       activation_mode_t active_gate,
                       activation_mode_t active_state) {
  dim3 threads;
  dim3 grid;
-  if (batchSize == 1) {
+  if (batch_size == 1) {
-    int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
+    int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
-    int frameBlocks = (frameSize + 1024 - 1) / 1024;
+    int frame_blocks = (frame_size + 1024 - 1) / 1024;
-    threads = dim3(framePerBlock, 1);
+    threads = dim3(frame_per_block, 1);
-    grid = dim3(frameBlocks, 1);
+    grid = dim3(frame_blocks, 1);
  } else {
-    /* framePerBlock = 32 batchPerBlock = 16 */
+    /* frame_per_block = 32 batch_per_block = 16 */
    threads = dim3(32, 16);
-    grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 16 - 1) / 16);
+    grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 16 - 1) / 16);
  }
  auto stream =
      reinterpret_cast<const platform::CUDADeviceContext&>(context).stream();
-  if (batchSize == 1) {
+  if (batch_size == 1) {
    KeLstmBackward<T, Op,
-                   /* isBatch= */ false><<<grid, threads, 0, stream>>>(
+                   /* is_batch= */ false><<<grid, threads, 0, stream>>>(
-        op, value, grad, frameSize, batchSize, active_node, active_gate,
+        op, value, grad, frame_size, batch_size, active_node, active_gate,
        active_state);
  } else {
    KeLstmBackward<T, Op,
-                   /* isBatch= */ true><<<grid, threads, 0, stream>>>(
+                   /* is_batch= */ true><<<grid, threads, 0, stream>>>(
-        op, value, grad, frameSize, batchSize, active_node, active_gate,
+        op, value, grad, frame_size, batch_size, active_node, active_gate,
        active_state);
  }
 }

--- a/paddle/operators/math/detail/lstm_kernel.h
+++ b/paddle/operators/math/detail/lstm_kernel.h
@@ -27,19 +27,19 @@ namespace forward {
 template <class T>
 class lstm {
 public:
-  HOSTDEVICE void operator()(T &valueIn, T &valueIg, T &valueFg, T &valueOg,
+  HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og,
-                             T &prevState, T &state, T &stateAtv, T &output,
+                             T &prev_state, T &state, T &state_atv, T &output,
                             T &checkI, T &checkF, T &checkO,
                             activation_mode_t active_node,
                             activation_mode_t active_gate,
                             activation_mode_t active_state) {
-    valueIn = activation(valueIn, active_node);
+    value_in = activation(value_in, active_node);
-    valueIg = activation(valueIg + prevState * checkI, active_gate);
+    value_ig = activation(value_ig + prev_state * checkI, active_gate);
-    valueFg = activation(valueFg + prevState * checkF, active_gate);
+    value_fg = activation(value_fg + prev_state * checkF, active_gate);
-    state = valueIn * valueIg + prevState * valueFg;
+    state = value_in * value_ig + prev_state * value_fg;
-    valueOg = activation(valueOg + state * checkO, active_gate);
+    value_og = activation(value_og + state * checkO, active_gate);
-    stateAtv = activation(state, active_state);
+    state_atv = activation(state, active_state);
-    output = valueOg * stateAtv;
+    output = value_og * state_atv;
  }
 #ifndef __NVCC__
 #ifndef __AVX__  // If not compiled with AVX instructs. Disable AVX by default
@@ -48,24 +48,27 @@ class lstm {
  // Only float support AVX optimization
  static const bool avx = std::is_same<T, float>::value;
-  HOSTDEVICE void operator()(__m256 &valueIn, __m256 &valueIg, __m256 &valueFg,
+  HOSTDEVICE void operator()(__m256 &value_in, __m256 &value_ig,
-                             __m256 &valueOg, __m256 &prevState, __m256 &state,
+                             __m256 &value_fg, __m256 &value_og,
-                             __m256 &stateAtv, __m256 &output, __m256 &checkI,
+                             __m256 &prev_state, __m256 &state,
+                             __m256 &state_atv, __m256 &output, __m256 &checkI,
                             __m256 &checkF, __m256 &checkO,
                             activation_mode_t active_node,
                             activation_mode_t active_gate,
                             activation_mode_t active_state) {
-    valueIn = activation(valueIn, active_node);
+    value_in = activation(value_in, active_node);
-    valueIg = activation(
+    value_ig =
-        _mm256_add_ps(valueIg, _mm256_mul_ps(prevState, checkI)), active_gate);
+        activation(_mm256_add_ps(value_ig, _mm256_mul_ps(prev_state, checkI)),
-    valueFg = activation(
-        _mm256_add_ps(valueFg, _mm256_mul_ps(prevState, checkF)), active_gate);
-    state = _mm256_add_ps(_mm256_mul_ps(valueIn, valueIg),
-                          _mm256_mul_ps(prevState, valueFg));
-    valueOg = activation(_mm256_add_ps(valueOg, _mm256_mul_ps(state, checkO)),
                   active_gate);
-    stateAtv = activation(state, active_state);
+    value_fg =
-    output = _mm256_mul_ps(valueOg, stateAtv);
+        activation(_mm256_add_ps(value_fg, _mm256_mul_ps(prev_state, checkF)),
+                   active_gate);
+    state = _mm256_add_ps(_mm256_mul_ps(value_in, value_ig),
+                          _mm256_mul_ps(prev_state, value_fg));
+    value_og = activation(_mm256_add_ps(value_og, _mm256_mul_ps(state, checkO)),
+                          active_gate);
+    state_atv = activation(state, active_state);
+    output = _mm256_mul_ps(value_og, state_atv);
  }
 #endif
 #endif
@@ -78,25 +81,26 @@ namespace backward {
 template <class T>
 class lstm {
 public:
-  HOSTDEVICE void operator()(T &valueIn, T &valueIg, T &valueFg, T &valueOg,
+  HOSTDEVICE void operator()(T &value_in, T &value_ig, T &value_fg, T &value_og,
-                             T &gradIn, T &gradIg, T &gradFg, T &gradOg,
+                             T &grad_in, T &grad_ig, T &grad_fg, T &grad_og,
-                             T &prevState, T &prevStateGrad, T &state,
+                             T &prev_state, T &prev_state_grad, T &state,
-                             T &stateGrad, T &stateAtv, T &outputGrad,
+                             T &state_grad, T &state_atv, T &output_grad,
                             T &checkI, T &checkF, T &checkO, T &checkIGrad,
                             T &checkFGrad, T &checkOGrad,
                             activation_mode_t active_node,
                             activation_mode_t active_gate,
                             activation_mode_t active_state) {
-    gradOg = activation(outputGrad * stateAtv, valueOg, active_gate);
+    grad_og = activation(output_grad * state_atv, value_og, active_gate);
-    stateGrad += activation(outputGrad * valueOg, stateAtv, active_state) +
+    state_grad += activation(output_grad * value_og, state_atv, active_state) +
-                 gradOg * checkO;
+                  grad_og * checkO;
-    gradIn = activation(stateGrad * valueIg, valueIn, active_node);
+    grad_in = activation(state_grad * value_ig, value_in, active_node);
-    gradIg = activation(stateGrad * valueIn, valueIg, active_gate);
+    grad_ig = activation(state_grad * value_in, value_ig, active_gate);
-    gradFg = activation(stateGrad * prevState, valueFg, active_gate);
+    grad_fg = activation(state_grad * prev_state, value_fg, active_gate);
-    prevStateGrad = gradIg * checkI + gradFg * checkF + stateGrad * valueFg;
+    prev_state_grad =
-    checkIGrad = gradIg * prevState;
+        grad_ig * checkI + grad_fg * checkF + state_grad * value_fg;
-    checkFGrad = gradFg * prevState;
+    checkIGrad = grad_ig * prev_state;
-    checkOGrad = gradOg * state;
+    checkFGrad = grad_fg * prev_state;
+    checkOGrad = grad_og * state;
  }
 #ifndef __NVCC__
 #ifndef __AVX__  // If not compiled with AVX instructs. Disable AVX by default
@@ -105,32 +109,32 @@ class lstm {
  // Only float support AVX optimization
  static const bool avx = std::is_same<T, float>::value;
  HOSTDEVICE void operator()(
-      __m256 &valueIn, __m256 &valueIg, __m256 &valueFg, __m256 &valueOg,
+      __m256 &value_in, __m256 &value_ig, __m256 &value_fg, __m256 &value_og,
-      __m256 &gradIn, __m256 &gradIg, __m256 &gradFg, __m256 &gradOg,
+      __m256 &grad_in, __m256 &grad_ig, __m256 &grad_fg, __m256 &grad_og,
-      __m256 &prevState, __m256 &prevStateGrad, __m256 &state,
+      __m256 &prev_state, __m256 &prev_state_grad, __m256 &state,
-      __m256 &stateGrad, __m256 &stateAtv, __m256 &outputGrad, __m256 &checkI,
+      __m256 &state_grad, __m256 &state_atv, __m256 &output_grad,
-      __m256 &checkF, __m256 &checkO, __m256 &checkIGrad, __m256 &checkFGrad,
+      __m256 &checkI, __m256 &checkF, __m256 &checkO, __m256 &checkIGrad,
-      __m256 &checkOGrad, activation_mode_t active_node,
+      __m256 &checkFGrad, __m256 &checkOGrad, activation_mode_t active_node,
      activation_mode_t active_gate, activation_mode_t active_state) {
-    gradOg =
+    grad_og = activation(_mm256_mul_ps(output_grad, state_atv), value_og,
-        activation(_mm256_mul_ps(outputGrad, stateAtv), valueOg, active_gate);
+                         active_gate);
-    stateGrad = _mm256_add_ps(
+    state_grad = _mm256_add_ps(activation(_mm256_mul_ps(output_grad, value_og),
-        activation(_mm256_mul_ps(outputGrad, valueOg), stateAtv, active_state),
+                                          state_atv, active_state),
-        stateGrad);
+                               state_grad);
-    stateGrad = _mm256_add_ps(_mm256_mul_ps(gradOg, checkO), stateGrad);
+    state_grad = _mm256_add_ps(_mm256_mul_ps(grad_og, checkO), state_grad);
-    gradIn =
+    grad_in =
-        activation(_mm256_mul_ps(stateGrad, valueIg), valueIn, active_node);
+        activation(_mm256_mul_ps(state_grad, value_ig), value_in, active_node);
-    gradIg =
+    grad_ig =
-        activation(_mm256_mul_ps(stateGrad, valueIn), valueIg, active_gate);
+        activation(_mm256_mul_ps(state_grad, value_in), value_ig, active_gate);
-    gradFg =
+    grad_fg = activation(_mm256_mul_ps(state_grad, prev_state), value_fg,
-        activation(_mm256_mul_ps(stateGrad, prevState), valueFg, active_gate);
+                         active_gate);
-    prevStateGrad = _mm256_add_ps(_mm256_mul_ps(gradIg, checkI),
+    prev_state_grad = _mm256_add_ps(_mm256_mul_ps(grad_ig, checkI),
-                                  _mm256_mul_ps(gradFg, checkF));
+                                    _mm256_mul_ps(grad_fg, checkF));
-    prevStateGrad =
+    prev_state_grad =
-        _mm256_add_ps(_mm256_mul_ps(stateGrad, valueFg), prevStateGrad);
+        _mm256_add_ps(_mm256_mul_ps(state_grad, value_fg), prev_state_grad);
-    checkIGrad = _mm256_mul_ps(gradIg, prevState);
+    checkIGrad = _mm256_mul_ps(grad_ig, prev_state);
-    checkFGrad = _mm256_mul_ps(gradFg, prevState);
+    checkFGrad = _mm256_mul_ps(grad_fg, prev_state);
-    checkOGrad = _mm256_mul_ps(gradOg, state);
+    checkOGrad = _mm256_mul_ps(grad_og, state);
  }
 #endif
 #endif

--- a/paddle/operators/math/gru_compute.cc
+++ b/paddle/operators/math/gru_compute.cc
@@ -21,29 +21,29 @@ namespace math {
 template <typename T>
 struct GRUUnitFunctor<platform::CPUPlace, T> {
  static void compute(const platform::DeviceContext &context,
-                      hl_gru_value<T> value, int frameSize, int batchSize,
+                      hl_gru_value<T> value, int frame_size, int batch_size,
                      activation_mode_t active_node,
                      activation_mode_t active_gate) {
 #ifndef __NVCC__
-    if (value.prevOutValue) {
+    if (value.prev_out_value) {
      math::gemm<platform::CPUPlace, T>(
-          context, false, false, batchSize, frameSize * 2, frameSize, 1,
+          context, false, false, batch_size, frame_size * 2, frame_size, 1,
-          value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1,
+          value.prev_out_value, frame_size, value.gate_weight, frame_size * 2,
-          value.gateValue, frameSize * 3);
+          1, value.gate_value, frame_size * 3);
    }
    detail::forward_reset_output(detail::forward::gru_resetOutput<T>(), value,
-                                 frameSize, batchSize, active_gate);
+                                 frame_size, batch_size, active_gate);
-    if (value.prevOutValue) {
+    if (value.prev_out_value) {
      math::gemm<platform::CPUPlace, T>(
-          context, false, false, batchSize, frameSize, frameSize, 1,
+          context, false, false, batch_size, frame_size, frame_size, 1,
-          value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1,
+          value.reset_output_value, frame_size, value.state_weight, frame_size,
-          value.gateValue + frameSize * 2, frameSize * 3);
+          1, value.gate_value + frame_size * 2, frame_size * 3);
    }
    detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value,
-                                 frameSize, batchSize, active_node);
+                                 frame_size, batch_size, active_node);
 #endif
  }
 };
@@ -51,41 +51,43 @@ struct GRUUnitFunctor<platform::CPUPlace, T> {
 template <typename T>
 struct GRUUnitGradFunctor<platform::CPUPlace, T> {
  static void compute(const platform::DeviceContext &context,
-                      hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
+                      hl_gru_value<T> value, hl_gru_grad<T> grad,
-                      int batchSize, activation_mode_t active_node,
+                      int frame_size, int batch_size,
+                      activation_mode_t active_node,
                      activation_mode_t active_gate) {
 #ifndef __NVCC__
    detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value,
-                                grad, frameSize, batchSize, active_node);
+                                grad, frame_size, batch_size, active_node);
-    if (value.prevOutValue && grad.prevOutGrad) {
+    if (value.prev_out_value && grad.prev_out_grad) {
      math::gemm<platform::CPUPlace, T>(
-          context, false, true, batchSize, frameSize, frameSize, 1,
+          context, false, true, batch_size, frame_size, frame_size, 1,
-          grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight,
+          grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight,
-          frameSize, 0, grad.resetOutputGrad, frameSize);
+          frame_size, 0, grad.reset_output_grad, frame_size);
-      if (grad.stateWeightGrad) {
+      if (grad.state_weight_grad) {
        math::gemm<platform::CPUPlace, T>(
-            context, true, false, frameSize, frameSize, batchSize, 1,
+            context, true, false, frame_size, frame_size, batch_size, 1,
-            value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2,
+            value.reset_output_value, frame_size,
-            frameSize * 3, 1, grad.stateWeightGrad, frameSize);
+            grad.gate_grad + frame_size * 2, frame_size * 3, 1,
+            grad.state_weight_grad, frame_size);
      }
    }
    detail::backward_reset_grad(detail::backward::gru_resetGrad<T>(), value,
-                                grad, frameSize, batchSize, active_gate);
+                                grad, frame_size, batch_size, active_gate);
-    if (grad.prevOutGrad && value.prevOutValue) {
+    if (grad.prev_out_grad && value.prev_out_value) {
      math::gemm<platform::CPUPlace, T>(
-          context, false, true, batchSize, frameSize, frameSize * 2, 1,
+          context, false, true, batch_size, frame_size, frame_size * 2, 1,
-          grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1,
+          grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1,
-          grad.prevOutGrad, frameSize);
+          grad.prev_out_grad, frame_size);
-      if (grad.gateWeightGrad) {
+      if (grad.gate_weight_grad) {
        math::gemm<platform::CPUPlace, T>(
-            context, true, false, frameSize, frameSize * 2, batchSize, 1,
+            context, true, false, frame_size, frame_size * 2, batch_size, 1,
-            value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1,
+            value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1,
-            grad.gateWeightGrad, frameSize * 2);
+            grad.gate_weight_grad, frame_size * 2);
      }
    }
 #endif

--- a/paddle/operators/math/gru_compute.cu
+++ b/paddle/operators/math/gru_compute.cu
@@ -21,66 +21,66 @@ namespace math {
 template <typename T>
 struct GRUUnitFunctor<platform::GPUPlace, T> {
  static void compute(const platform::DeviceContext &context,
-                      hl_gru_value<T> value, int frameSize, int batchSize,
+                      hl_gru_value<T> value, int frame_size, int batch_size,
                      activation_mode_t active_node,
                      activation_mode_t active_gate) {
    auto stream =
        reinterpret_cast<const platform::CUDADeviceContext &>(context).stream();
    dim3 threads;
    dim3 grid;
-    if (batchSize == 1) {
+    if (batch_size == 1) {
-      int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
+      int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
-      int frameBlocks = (frameSize + 1024 - 1) / 1024;
+      int frame_blocks = (frame_size + 1024 - 1) / 1024;
-      threads = dim3(framePerBlock, 1);
+      threads = dim3(frame_per_block, 1);
-      grid = dim3(frameBlocks, 1);
+      grid = dim3(frame_blocks, 1);
    } else {
      threads = dim3(32, 32);
-      grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
+      grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
    }
-    if (value.prevOutValue) {
+    if (value.prev_out_value) {
      math::gemm<platform::GPUPlace, T>(
-          context, false, false, batchSize, frameSize * 2, frameSize, 1,
+          context, false, false, batch_size, frame_size * 2, frame_size, 1,
-          value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1,
+          value.prev_out_value, frame_size, value.gate_weight, frame_size * 2,
-          value.gateValue, frameSize * 3);
+          1, value.gate_value, frame_size * 3);
    }
-    if (batchSize == 1) {
+    if (batch_size == 1) {
      detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
-                                      /* isBatch= */ false,
+                                      /* is_batch= */ false,
                                      T><<<grid, threads, 0, stream>>>(
-          detail::forward::gru_resetOutput<T>(), value.gateValue,
+          detail::forward::gru_resetOutput<T>(), value.gate_value,
-          value.resetOutputValue, value.prevOutValue, frameSize, batchSize,
+          value.reset_output_value, value.prev_out_value, frame_size,
-          active_gate);
+          batch_size, active_gate);
    } else {
      detail::KeGruForwardResetOutput<detail::forward::gru_resetOutput<T>,
-                                      /* isBatch= */ true,
+                                      /* is_batch= */ true,
                                      T><<<grid, threads, 0, stream>>>(
-          detail::forward::gru_resetOutput<T>(), value.gateValue,
+          detail::forward::gru_resetOutput<T>(), value.gate_value,
-          value.resetOutputValue, value.prevOutValue, frameSize, batchSize,
+          value.reset_output_value, value.prev_out_value, frame_size,
-          active_gate);
+          batch_size, active_gate);
    }
-    if (value.prevOutValue) {
+    if (value.prev_out_value) {
      math::gemm<platform::GPUPlace, T>(
-          context, false, false, batchSize, frameSize, frameSize, 1,
+          context, false, false, batch_size, frame_size, frame_size, 1,
-          value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1,
+          value.reset_output_value, frame_size, value.state_weight, frame_size,
-          value.gateValue + frameSize * 2, frameSize * 3);
+          1, value.gate_value + frame_size * 2, frame_size * 3);
    }
-    if (batchSize == 1) {
+    if (batch_size == 1) {
      detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
-                                      /* isBatch= */ false,
+                                      /* is_batch= */ false,
                                      T><<<grid, threads, 0, stream>>>(
-          detail::forward::gru_finalOutput<T>(), value.gateValue,
+          detail::forward::gru_finalOutput<T>(), value.gate_value,
-          value.prevOutValue, value.outputValue, frameSize, batchSize,
+          value.prev_out_value, value.output_value, frame_size, batch_size,
          active_node);
    } else {
      detail::KeGruForwardFinalOutput<detail::forward::gru_finalOutput<T>,
-                                      /* isBatch= */ true,
+                                      /* is_batch= */ true,
                                      T><<<grid, threads, 0, stream>>>(
-          detail::forward::gru_finalOutput<T>(), value.gateValue,
+          detail::forward::gru_finalOutput<T>(), value.gate_value,
-          value.prevOutValue, value.outputValue, frameSize, batchSize,
+          value.prev_out_value, value.output_value, frame_size, batch_size,
          active_node);
    }
  }
@@ -89,80 +89,82 @@ struct GRUUnitFunctor<platform::GPUPlace, T> {
 template <typename T>
 struct GRUUnitGradFunctor<platform::GPUPlace, T> {
  static void compute(const platform::DeviceContext &context,
-                      hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
+                      hl_gru_value<T> value, hl_gru_grad<T> grad,
-                      int batchSize, activation_mode_t active_node,
+                      int frame_size, int batch_size,
+                      activation_mode_t active_node,
                      activation_mode_t active_gate) {
    auto stream =
        reinterpret_cast<const platform::CUDADeviceContext &>(context).stream();
    dim3 threads;
    dim3 grid;
-    if (batchSize == 1) {
+    if (batch_size == 1) {
-      int framePerBlock = frameSize <= 1024 ? frameSize : 1024;
+      int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
-      int frameBlocks = (frameSize + 1024 - 1) / 1024;
+      int frame_blocks = (frame_size + 1024 - 1) / 1024;
-      threads = dim3(framePerBlock, 1);
+      threads = dim3(frame_per_block, 1);
-      grid = dim3(frameBlocks, 1);
+      grid = dim3(frame_blocks, 1);
    } else {
      threads = dim3(32, 32);
-      grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32);
+      grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
    }
-    if (batchSize == 1) {
+    if (batch_size == 1) {
      detail::KeGruBackwardStateGrad<
          detail::backward::gru_stateGrad<T>,
-          /* isBatch= */ false><<<grid, threads, 0, stream>>>(
+          /* is_batch= */ false><<<grid, threads, 0, stream>>>(
-          detail::backward::gru_stateGrad<T>(), value.gateValue, grad.gateGrad,
+          detail::backward::gru_stateGrad<T>(), value.gate_value,
-          value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize,
+          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
-          batchSize, active_node);
+          grad.output_grad, frame_size, batch_size, active_node);
    } else {
      detail::KeGruBackwardStateGrad<
          detail::backward::gru_stateGrad<T>,
-          /* isBatch= */ true><<<grid, threads, 0, stream>>>(
+          /* is_batch= */ true><<<grid, threads, 0, stream>>>(
-          detail::backward::gru_stateGrad<T>(), value.gateValue, grad.gateGrad,
+          detail::backward::gru_stateGrad<T>(), value.gate_value,
-          value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize,
+          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
-          batchSize, active_node);
+          grad.output_grad, frame_size, batch_size, active_node);
    }
-    if (value.prevOutValue && grad.prevOutGrad) {
+    if (value.prev_out_value && grad.prev_out_grad) {
      math::gemm<platform::GPUPlace, T>(
-          context, false, true, batchSize, frameSize, frameSize, 1,
+          context, false, true, batch_size, frame_size, frame_size, 1,
-          grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight,
+          grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight,
-          frameSize, 0, grad.resetOutputGrad, frameSize);
+          frame_size, 0, grad.reset_output_grad, frame_size);
-      if (grad.stateWeightGrad) {
+      if (grad.state_weight_grad) {
        math::gemm<platform::GPUPlace, T>(
-            context, true, false, frameSize, frameSize, batchSize, 1,
+            context, true, false, frame_size, frame_size, batch_size, 1,
-            value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2,
+            value.reset_output_value, frame_size,
-            frameSize * 3, 1, grad.stateWeightGrad, frameSize);
+            grad.gate_grad + frame_size * 2, frame_size * 3, 1,
+            grad.state_weight_grad, frame_size);
      }
    }
-    if (batchSize == 1) {
+    if (batch_size == 1) {
      detail::KeGruBackwardResetGrad<
          detail::backward::gru_resetGrad<T>,
-          /* isBatch= */ false><<<grid, threads, 0, stream>>>(
+          /* is_batch= */ false><<<grid, threads, 0, stream>>>(
-          detail::backward::gru_resetGrad<T>(), value.gateValue, grad.gateGrad,
+          detail::backward::gru_resetGrad<T>(), value.gate_value,
-          value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize,
+          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
-          batchSize, active_gate);
+          grad.reset_output_grad, frame_size, batch_size, active_gate);
    } else {
      detail::KeGruBackwardResetGrad<
          detail::backward::gru_resetGrad<T>,
-          /* isBatch= */ true><<<grid, threads, 0, stream>>>(
+          /* is_batch= */ true><<<grid, threads, 0, stream>>>(
-          detail::backward::gru_resetGrad<T>(), value.gateValue, grad.gateGrad,
+          detail::backward::gru_resetGrad<T>(), value.gate_value,
-          value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize,
+          grad.gate_grad, value.prev_out_value, grad.prev_out_grad,
-          batchSize, active_gate);
+          grad.reset_output_grad, frame_size, batch_size, active_gate);
    }
-    if (grad.prevOutGrad && value.prevOutValue) {
+    if (grad.prev_out_grad && value.prev_out_value) {
      math::gemm<platform::GPUPlace, T>(
-          context, false, true, batchSize, frameSize, frameSize * 2, 1,
+          context, false, true, batch_size, frame_size, frame_size * 2, 1,
-          grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1,
+          grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1,
-          grad.prevOutGrad, frameSize);
+          grad.prev_out_grad, frame_size);
-      if (grad.gateWeightGrad) {
+      if (grad.gate_weight_grad) {
        math::gemm<platform::GPUPlace, T>(
-            context, true, false, frameSize, frameSize * 2, batchSize, 1,
+            context, true, false, frame_size, frame_size * 2, batch_size, 1,
-            value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1,
+            value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1,
-            grad.gateWeightGrad, frameSize * 2);
+            grad.gate_weight_grad, frame_size * 2);
      }
    }
  }

--- a/paddle/operators/math/gru_compute.h
+++ b/paddle/operators/math/gru_compute.h
@@ -22,28 +22,28 @@ namespace math {
 // TODO(guosheng): refine code style in gru_compute
 template <typename T>
 struct hl_gru_value {
-  T *gateWeight;
+  T *gate_weight;
-  T *stateWeight;
+  T *state_weight;
-  T *gateValue;
+  T *gate_value;
-  T *resetOutputValue;
+  T *reset_output_value;
-  T *outputValue;
+  T *output_value;
-  T *prevOutValue;
+  T *prev_out_value;
 };
 template <typename T>
 struct hl_gru_grad {
-  T *gateWeightGrad;
+  T *gate_weight_grad;
-  T *stateWeightGrad;
+  T *state_weight_grad;
-  T *gateGrad;
+  T *gate_grad;
-  T *resetOutputGrad;
+  T *reset_output_grad;
-  T *outputGrad;
+  T *output_grad;
-  T *prevOutGrad;
+  T *prev_out_grad;
 };
 template <typename Place, typename T>
 struct GRUUnitFunctor {
  static void compute(const platform::DeviceContext &context,
-                      hl_gru_value<T> value, int frameSize, int batchSize,
+                      hl_gru_value<T> value, int frame_size, int batch_size,
                      activation_mode_t active_node,
                      activation_mode_t active_gate);
 };
@@ -51,8 +51,9 @@ struct GRUUnitFunctor {
 template <typename Place, typename T>
 struct GRUUnitGradFunctor {
  static void compute(const platform::DeviceContext &context,
-                      hl_gru_value<T> value, hl_gru_grad<T> grad, int frameSize,
+                      hl_gru_value<T> value, hl_gru_grad<T> grad,
-                      int batchSize, activation_mode_t active_node,
+                      int frame_size, int batch_size,
+                      activation_mode_t active_node,
                      activation_mode_t active_gate);
 };

--- a/paddle/operators/math/lstm_compute.cc
+++ b/paddle/operators/math/lstm_compute.cc
@@ -30,12 +30,12 @@ struct LstmUnitFunctor<platform::CPUPlace, T> {
      detail::cpu_lstm_forward(detail::forward::lstm<T>(), value, frame_size,
                               ActiveType(cand_act), ActiveType(gate_act),
                               ActiveType(cell_act));
-      value.gateValue += frame_size * 4;
+      value.gate_value += frame_size * 4;
-      value.stateValue += frame_size;
+      value.state_value += frame_size;
-      value.stateActiveValue += frame_size;
+      value.state_active_value += frame_size;
-      value.outputValue += frame_size;
+      value.output_value += frame_size;
-      if (value.prevStateValue) {
+      if (value.prev_state_value) {
-        value.prevStateValue += frame_size;
+        value.prev_state_value += frame_size;
      }
    }
  }
@@ -53,20 +53,20 @@ struct LstmUnitGradFunctor<platform::CPUPlace, T> {
                                frame_size, ActiveType(cand_act),
                                ActiveType(gate_act), ActiveType(cell_act));
-      value.gateValue += frame_size * 4;
+      value.gate_value += frame_size * 4;
-      value.stateValue += frame_size;
+      value.state_value += frame_size;
-      value.stateActiveValue += frame_size;
+      value.state_active_value += frame_size;
-      value.outputValue += frame_size;
+      value.output_value += frame_size;
-      if (value.prevStateValue) {
+      if (value.prev_state_value) {
-        value.prevStateValue += frame_size;
+        value.prev_state_value += frame_size;
      }
-      grad.gateGrad += frame_size * 4;
+      grad.gate_grad += frame_size * 4;
-      grad.stateGrad += frame_size;
+      grad.state_grad += frame_size;
-      grad.stateActiveGrad += frame_size;
+      grad.state_active_grad += frame_size;
-      grad.outputGrad += frame_size;
+      grad.output_grad += frame_size;
-      if (grad.prevStateGrad) {
+      if (grad.prev_state_grad) {
-        grad.prevStateGrad += frame_size;
+        grad.prev_state_grad += frame_size;
      }
    }
  }

--- a/paddle/operators/math/lstm_compute.h
+++ b/paddle/operators/math/lstm_compute.h
@@ -31,26 +31,26 @@ typedef enum {
 template <class T>
 struct LstmMetaValue {
-  T *gateValue;
+  T *gate_value;
-  T *prevStateValue;
+  T *prev_state_value;
-  T *stateValue;
+  T *state_value;
-  T *stateActiveValue;
+  T *state_active_value;
-  T *outputValue;
+  T *output_value;
-  T *checkIg;
+  T *check_ig;
-  T *checkFg;
+  T *check_fg;
-  T *checkOg;
+  T *check_og;
 };
 template <class T>
 struct LstmMetaGrad {
-  T *gateGrad;
+  T *gate_grad;
-  T *prevStateGrad;
+  T *prev_state_grad;
-  T *stateGrad;
+  T *state_grad;
-  T *stateActiveGrad;
+  T *state_active_grad;
-  T *outputGrad;
+  T *output_grad;
-  T *checkIgGrad;
+  T *check_ig_grad;
-  T *checkFgGrad;
+  T *check_fg_grad;
-  T *checkOgGrad;
+  T *check_og_grad;
 };
 inline activation_mode_t ActiveType(const std::string &type) {

--- a/paddle/operators/math/unpooling.cc
+++ b/paddle/operators/math/unpooling.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/math/unpooling.h"
+namespace paddle {
+namespace operators {
+namespace math {
+template <typename T>
+class Unpool2dMaxFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices, framework::Tensor* output) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
+    int input_feasize = input_height * input_width;
+    int output_feasize = output_height * output_width;
+    const T* input_data = input.data<T>();
+    const int* indices_data = indices.data<int>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    for (int b = 0; b < batch_size; ++b) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int i = 0; i < input_feasize; ++i) {
+          int index = indices_data[i];
+          PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!");
+          output_data[index] = input_data[i];
+        }
+        input_data += input_feasize;
+        indices_data += input_feasize;
+        output_data += output_feasize;
+      }
+    }
+  }
+};
+template <class T>
+class Unpool2dMaxGradFunctor<platform::CPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    int input_feasize = input_height * input_width;
+    int output_feasize = output_height * output_width;
+    const int* indices_data = indices.data<int>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    for (int b = 0; b < batch_size; ++b) {
+      for (int c = 0; c < output_channels; ++c) {
+        for (int i = 0; i < input_feasize; ++i) {
+          int index = indices_data[i];
+          PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!");
+          input_grad_data[i] = output_grad_data[index];
+        }
+        input_grad_data += input_feasize;
+        indices_data += input_feasize;
+        output_grad_data += output_feasize;
+      }
+    }
+  }
+};
+template class Unpool2dMaxGradFunctor<platform::CPUPlace, float>;
+template class Unpool2dMaxGradFunctor<platform::CPUPlace, double>;
+template class Unpool2dMaxFunctor<platform::CPUPlace, float>;
+template class Unpool2dMaxFunctor<platform::CPUPlace, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/unpooling.cu
+++ b/paddle/operators/math/unpooling.cu
+/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/math/unpooling.h"
+#include "paddle/platform/cuda_helper.h"
+namespace paddle {
+namespace operators {
+namespace math {
+template <typename T>
+__global__ void KernelUnpool2dMax(const int nthreads, const T* input_data,
+                                  const int* indices_data,
+                                  const int input_height, const int input_width,
+                                  const int channels, T* output_data,
+                                  const int output_height,
+                                  const int output_width) {
+  int in_n_stride = input_height * input_width * channels;
+  int in_c_stride = input_height * input_width;
+  int out_n_stride = output_height * output_width * channels;
+  int out_c_stride = output_height * output_width;
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (int i = index; i < nthreads; i += offset) {
+    int bidx = i / in_n_stride;
+    int boffset = i % in_n_stride;
+    int cidx = boffset / in_c_stride;
+    int out_offset = bidx * out_n_stride + cidx * out_c_stride;
+    int out_index = indices_data[i];
+    PADDLE_ASSERT(out_index < out_c_stride);
+    output_data[out_offset + out_index] = input_data[i];
+  }
+}
+template <typename T>
+__global__ void KernelUnpool2dMaxGrad(
+    const int nthreads, const T* input_data, const int* indices_data,
+    const int input_height, const int input_width, const int channels,
+    const T* output_data, const T* output_grad, const int output_height,
+    const int output_width, T* input_grad) {
+  int in_n_stride = input_height * input_width * channels;
+  int in_c_stride = input_height * input_width;
+  int out_n_stride = output_height * output_width * channels;
+  int out_c_stride = output_height * output_width;
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  int offset = blockDim.x * gridDim.x;
+  for (int i = index; i < nthreads; i += offset) {
+    int bidx = i / in_n_stride;
+    int boffset = i % in_n_stride;
+    int cidx = boffset / in_c_stride;
+    int out_offset = bidx * out_n_stride + cidx * out_c_stride;
+    int out_index = indices_data[i];
+    PADDLE_ASSERT(out_index < out_c_stride);
+    input_grad[i] = output_grad[out_offset + out_index];
+  }
+}
+/*
+ * All tensors are in NCHW format.
+ */
+template <typename T>
+class Unpool2dMaxFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices, framework::Tensor* output) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output->dims()[1];
+    const int output_height = output->dims()[2];
+    const int output_width = output->dims()[3];
+    const T* input_data = input.data<T>();
+    const int* indices_data = indices.data<int>();
+    T* output_data = output->mutable_data<T>(context.GetPlace());
+    int threads = 1024;
+    int grid = (input.numel() + threads - 1) / threads;
+    KernelUnpool2dMax<
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(input.numel(), input_data, indices_data,
+                              input_height, input_width, output_channels,
+                              output_data, output_height, output_width);
+  }
+};
+/*
+ * All tensors are in NCHW format.
+ */
+template <typename T>
+class Unpool2dMaxGradFunctor<platform::GPUPlace, T> {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad,
+                  framework::Tensor* input_grad) {
+    const int batch_size = input.dims()[0];
+    const int input_height = input.dims()[2];
+    const int input_width = input.dims()[3];
+    const int output_channels = output.dims()[1];
+    const int output_height = output.dims()[2];
+    const int output_width = output.dims()[3];
+    const T* input_data = input.data<T>();
+    const int* indices_data = indices.data<int>();
+    const T* output_data = output.data<T>();
+    const T* output_grad_data = output_grad.data<T>();
+    T* input_grad_data = input_grad->mutable_data<T>(context.GetPlace());
+    int threads = 1024;
+    int grid = (input.numel() + threads - 1) / threads;
+    KernelUnpool2dMaxGrad<
+        T><<<grid, threads, 0,
+             reinterpret_cast<const platform::CUDADeviceContext&>(context)
+                 .stream()>>>(input.numel(), input_data, indices_data,
+                              input_height, input_width, output_channels,
+                              output_data, output_grad_data, output_height,
+                              output_width, input_grad_data);
+  }
+};
+template class Unpool2dMaxGradFunctor<platform::GPUPlace, float>;
+template class Unpool2dMaxGradFunctor<platform::GPUPlace, double>;
+template class Unpool2dMaxFunctor<platform::GPUPlace, float>;
+template class Unpool2dMaxFunctor<platform::GPUPlace, double>;
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/math/unpooling.h
+++ b/paddle/operators/math/unpooling.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/framework/tensor.h"
+namespace paddle {
+namespace operators {
+namespace math {
+template <typename Place, typename T>
+class Unpool2dMaxFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices, framework::Tensor* output);
+};
+template <typename Place, class T>
+class Unpool2dMaxGradFunctor {
+ public:
+  void operator()(const platform::DeviceContext& context,
+                  const framework::Tensor& input,
+                  const framework::Tensor& indices,
+                  const framework::Tensor& output,
+                  const framework::Tensor& output_grad,
+                  framework::Tensor* input_grad);
+};
+}  // namespace math
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
@@ -99,13 +99,7 @@ class MultiplexGradOp : public framework::OperatorWithKernel {
                   "Output(X@Grad) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) should not be null.");
-    std::vector<framework::DDim> d_ins;
+    ctx->SetOutputsDim(framework::GradVarName("X"), ctx->GetInputsDim("X"));
-    auto ins = ctx->GetInputsDim("X");
-    // No need to compute gradient for Input(Ids)
-    for (size_t i = 0; i < ins.size(); i++) {
-      d_ins.push_back(ins[i]);
-    }
-    ctx->SetOutputsDim(framework::GradVarName("X"), d_ins);
  }
 protected:

--- a/paddle/operators/nce_op.cc
+++ b/paddle/operators/nce_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/nce_op.h"
+namespace paddle {
+namespace operators {
+using framework::Tensor;
+class NCEOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"));
+    PADDLE_ENFORCE(ctx->HasInput("Label"));
+    PADDLE_ENFORCE(ctx->HasInput("Weight"));
+    PADDLE_ENFORCE(ctx->HasOutput("Cost"));
+    PADDLE_ENFORCE(ctx->HasOutput("SampleLogits"));
+    PADDLE_ENFORCE(ctx->HasOutput("SampleLabels"));
+    auto x_dims = ctx->GetInputDim("Input");
+    auto label_dims = ctx->GetInputDim("Label");
+    PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0]);
+    int num_true_classes = label_dims.size() == 2 ? label_dims[1] : 1;
+    if (ctx->HasInput("Bias")) {
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Weight")[0],
+                        ctx->GetInputDim("Bias")[0]);
+    }
+    auto num_neg_samples = ctx->Attrs().Get<int>("num_neg_samples");
+    auto num_total_classes = ctx->Attrs().Get<int>("num_total_classes");
+    std::vector<int> custom_neg_classes =
+        ctx->Attrs().Get<std::vector<int>>("custom_neg_classes");
+    PADDLE_ENFORCE_EQ(num_total_classes, ctx->GetInputDim("Weight")[0]);
+    if (custom_neg_classes.size() > 0) {
+      PADDLE_ENFORCE_EQ(custom_neg_classes.size(),
+                        static_cast<size_t>(num_neg_samples));
+    }
+    // set dims of output(Out)
+    std::vector<int64_t> out_dims;
+    out_dims.push_back(x_dims[0]);
+    out_dims.push_back(1);
+    ctx->SetOutputDim("Cost", framework::make_ddim(out_dims));
+    // set dims of output(SampleOut)
+    std::vector<int64_t> sample_out_dims;
+    sample_out_dims.push_back(x_dims[0]);
+    sample_out_dims.push_back(num_neg_samples + num_true_classes);
+    ctx->SetOutputDim("SampleLogits", framework::make_ddim(sample_out_dims));
+    ctx->SetOutputDim("SampleLabels", framework::make_ddim(sample_out_dims));
+  }
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
+        ctx.device_context());
+  }
+};
+class NCEOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  NCEOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Input", "(Tensor) A tensor of shape [batch_size, dim].");
+    AddInput(
+        "Label",
+        "(Tensor) A tensor of shape [batch_size, num_true_class]. "
+        "'num_true_class' is the number of target classes in each sample."
+        "The number of target classes per sample should be same. "
+        "If you have a variable number of target classes, "
+        "you can pad them out to a constant number by either repeating them"
+        " or by padding with an otherwise unused class.)");
+    AddInput("Weight",
+             "(Tensor) A tensor of shape [num_class, dim]. 'num_class' is the "
+             "total number of class.");
+    AddInput(
+        "Bias",
+        "(Tensor) A tensor of shape [num_class, 1]. 'num_class' is the total "
+        "number of class. It is a dispensable input.")
+        .AsDispensable();
+    AddInput("SampleWeight",
+             "(Tensor) A tensor of shape [batch_size, 1] storing a weight for "
+             "each sample. And it is a dispensable input. The default value of "
+             "sample is 1.")
+        .AsDispensable();
+    AddOutput("Cost",
+              "(Tensor) A tensor of shape [batch_size, 1]. Cost of samples.");
+    AddOutput("SampleLogits",
+              "An intermediate tensor of shape[batch_size, num_neg_samples + "
+              "num_pos_samples]."
+              "This tensor is output of forward kernel and used in backward "
+              "kernel to compute grads."
+              "Given X is  the dot product of input tensor and sampled labels' "
+              "weights."
+              "Then 'SampleLogits' is sigmoid(X).")
+        .AsIntermediate();
+    AddOutput("SampleLabels",
+              "An intermediate tensor of shape[batch_size, num_neg_samples + "
+              "num_pos_samples]."
+              "This tensor is output of forward kernel and used in backward "
+              "kernel to compute grads."
+              "")
+        .AsIntermediate();
+    AddAttr<int>("num_total_classes",
+                 "Total number of classes in all samples.");
+    AddAttr<int>("num_neg_samples",
+                 "The number of negative classes. The default value is 10.")
+        .SetDefault(10);
+    AddAttr<std::vector<int>>("custom_neg_classes",
+                              "This attribute only be used in unitest. Classes "
+                              "in this list wiil be used as negative classes "
+                              "for every samples. Under normal conditions, "
+                              "user should avoid setting this attribute.");
+    AddComment(R"DOC(
+Compute and return the noise-contrastive estimation training loss.
+See [Noise-contrastive estimation: A new estimation principle for unnormalized statistical models](http://www.jmlr.org/proceedings/papers/v9/gutmann10a/gutmann10a.pdf).
+By default this operator uses a uniform distribution for sampling.
+)DOC");
+  }
+};
+class NCEOpGrad : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Input"));
+    PADDLE_ENFORCE(ctx->HasInput("Weight"));
+    PADDLE_ENFORCE(ctx->HasInput("Cost"));
+    PADDLE_ENFORCE(ctx->HasInput("SampleLogits"));
+    PADDLE_ENFORCE(ctx->HasInput("SampleLabels"));
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Cost")),
+                   "The input(Out@GRAD) should not be null.");
+    auto x_dims = ctx->GetInputDim("Input");
+    auto x_grad_name = framework::GradVarName("Input");
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, x_dims);
+    }
+    auto w_dims = ctx->GetInputDim("Weight");
+    auto w_grad_name = framework::GradVarName("Weight");
+    if (ctx->HasOutput(w_grad_name)) {
+      ctx->SetOutputDim(w_grad_name, w_dims);
+    }
+    auto bias_grad_name = framework::GradVarName("Bias");
+    if (ctx->HasOutput(bias_grad_name)) {
+      auto bias_dims = ctx->GetInputDim("Bias");
+      ctx->SetOutputDim(bias_grad_name, bias_dims);
+    }
+  }
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
+        ctx.device_context());
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP(nce, ops::NCEOp, ops::NCEOpMaker, nce_grad, ops::NCEOpGrad);
+REGISTER_OP_CPU_KERNEL(nce, ops::NCEKernel<paddle::platform::CPUPlace, float>,
+                       ops::NCEKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(nce_grad,
+                       ops::NCEGradKernel<paddle::platform::CPUPlace, float>,
+                       ops::NCEGradKernel<paddle::platform::CPUPlace, double>);
--- a/paddle/operators/nce_op.h
+++ b/paddle/operators/nce_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+   http://www.apache.org/licenses/LICENSE-2.0
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+#pragma once
+#include <math.h>
+#include <random>
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "unsupported/Eigen/CXX11/Tensor"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+template <typename Place, typename T>
+void PrepareSamples(const framework::ExecutionContext& context) {
+  auto label = context.Input<Tensor>("Label");
+  const int64_t* label_data = label->data<int64_t>();
+  auto label_dims = label->dims();
+  int num_total_classes = context.Attr<int>("num_total_classes");
+  // for unitest
+  std::vector<int> custom_neg_classes =
+      context.Attr<std::vector<int>>("custom_neg_classes");
+  // random machine
+  std::random_device rd;
+  std::mt19937 rng(rd());
+  std::uniform_int_distribution<int> rand(0, num_total_classes - 1);
+  auto sample_labels = context.Output<Tensor>("SampleLabels");
+  auto sample_labels_dims = sample_labels->dims();
+  int64_t* sample_labels_data =
+      sample_labels->mutable_data<int64_t>(context.GetPlace());
+  int num_label = label_dims.size() == 2 ? label_dims[1] : 1;
+  int index = 0;
+  for (size_t i = 0; i < label_dims[0]; ++i) {
+    int j = 0;
+    for (; j < num_label; ++j) {
+      sample_labels_data[index++] = label_data[i * num_label + j];
+    }
+    if (custom_neg_classes.size() > 0) {
+      for (auto label : custom_neg_classes) {
+        sample_labels_data[index++] = label;
+      }
+    } else {
+      for (; j < sample_labels_dims[1]; ++j) {
+        // TODO(wanghaoshuang): support more distribution sampling
+        sample_labels_data[index++] = rand(rng);
+      }
+    }
+  }
+}
+template <typename Place, typename T>
+class NCEKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    PrepareSamples<Place, T>(context);
+    auto sample_labels = context.Output<Tensor>("SampleLabels");
+    const int64_t* sample_labels_data = sample_labels->data<int64_t>();
+    auto sample_out = context.Output<Tensor>("SampleLogits");
+    T* sample_out_data = sample_out->mutable_data<T>(context.GetPlace());
+    auto label = context.Input<Tensor>("Label");
+    auto sample_weight = context.Input<Tensor>("SampleWeight");
+    const T* sample_weight_data = nullptr;
+    if (sample_weight != nullptr) {
+      sample_weight_data = sample_weight->data<T>();
+    }
+    auto out = context.Output<Tensor>("Cost");
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    int num_neg_samples = context.Attr<int>("num_neg_samples");
+    int num_total_classes = context.Attr<int>("num_total_classes");
+    int num_true_class = 1;
+    if (label != nullptr) {
+      num_true_class = label->dims()[1];
+    }
+    T b = 1. / num_total_classes * num_neg_samples;
+    // forward bias
+    auto bias = context.Input<Tensor>("Bias");
+    if (bias != nullptr) {
+      const T* bias_data = bias->data<T>();
+      for (size_t i = 0; i < sample_labels->numel(); ++i) {
+        sample_out_data[i] = bias_data[sample_labels_data[i]];
+      }
+    } else {
+      for (size_t i = 0; i < sample_labels->numel(); ++i) {
+        sample_out_data[i] = 0;
+      }
+    }
+    // forward mul
+    auto input_mat = EigenMatrix<T>::From(*(context.Input<Tensor>("Input")));
+    auto weight_mat = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
+    for (size_t i = 0; i < sample_labels->numel(); ++i) {
+      Eigen::Tensor<T, 0, Eigen::RowMajor, Eigen::DenseIndex> result =
+          (input_mat.chip((int)(i / sample_labels->dims()[1]), 0) *
+           weight_mat.chip(sample_labels_data[i], 0))
+              .sum();
+      sample_out_data[i] += result(0);
+      sample_out_data[i] = (1. / (1. + exp(-sample_out_data[i])));
+    }
+    // forward cost
+    for (size_t i = 0; i < sample_labels->dims()[0]; ++i) {
+      size_t j = 0;
+      out_data[i] = 0;
+      T w = sample_weight == nullptr ? 1. : sample_weight_data[i];
+      // for true classes
+      for (; j < num_true_class; ++j) {
+        T o = sample_out_data[i * sample_out->dims()[1] + j];
+        T cost = -log(o / (o + b));
+        out_data[i] += w * cost;
+      }
+      // for sampled neg classes
+      for (; j < sample_labels->dims()[1]; ++j) {
+        T o = sample_out_data[i * sample_out->dims()[1] + j];
+        T cost = -log(b / (o + b));
+        out_data[i] += w * cost;
+      }
+    }
+  }
+};
+template <typename Place, typename T>
+class NCEGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto d_out = context.Input<Tensor>(framework::GradVarName("Cost"));
+    const T* d_out_data = d_out->data<T>();
+    auto label = context.Input<Tensor>("Label");
+    auto sample_out = context.Input<Tensor>("SampleLogits");
+    const T* sample_out_data = sample_out->data<T>();
+    auto sample_labels = context.Input<Tensor>("SampleLabels");
+    const int64_t* sample_labels_data = sample_labels->data<int64_t>();
+    auto sample_weight = context.Input<Tensor>("SampleWeight");
+    const T* sample_weight_data = nullptr;
+    if (sample_weight != nullptr) {
+      sample_weight_data = sample_weight->data<T>();
+    }
+    int num_neg_samples = context.Attr<int>("num_neg_samples");
+    int num_total_classes = context.Attr<int>("num_total_classes");
+    int num_true_class = 1;
+    if (label != nullptr) {
+      num_true_class = label->dims()[1];
+    }
+    T b = 1. / num_total_classes * num_neg_samples;
+    Tensor sample_grad;  // tmp tensor
+    T* sample_grad_data =
+        sample_grad.mutable_data<T>(sample_labels->dims(), context.GetPlace());
+    // backward cost
+    for (size_t i = 0; i < sample_labels->numel(); ++i) {
+      T o = sample_out_data[i];
+      T w = sample_weight == nullptr
+                ? 1
+                : sample_weight_data[i / sample_labels->dims()[1]];
+      sample_grad_data[i] = (i % sample_labels->dims()[1]) < num_true_class
+                                ? w * (b / (o + b)) * (o - 1)
+                                : w * (o * (1 - o) / (o + b));
+      sample_grad_data[i] *= d_out_data[i / sample_labels->dims()[1]];
+    }
+    // get d_bias
+    auto d_bias = context.Output<Tensor>(framework::GradVarName("Bias"));
+    if (d_bias != nullptr) {
+      T* d_bias_data = d_bias->mutable_data<T>(context.GetPlace());
+      std::fill(d_bias_data, d_bias_data + d_bias->numel(), 0.0);
+      for (size_t i = 0; i < sample_labels->numel(); ++i) {
+        d_bias_data[sample_labels_data[i]] += sample_grad_data[i];
+      }
+    }
+    // get d_w
+    auto d_w = context.Output<Tensor>(framework::GradVarName("Weight"));
+    if (d_w != nullptr) {
+      auto d_w_data = d_w->mutable_data<T>(context.GetPlace());
+      std::fill(d_w_data, d_w_data + d_w->numel(), 0.0);
+      auto d_w_matrix = EigenMatrix<T>::From(*d_w);
+      auto x_matrix = EigenMatrix<T>::From(*(context.Input<Tensor>("Input")));
+      for (size_t i = 0; i < sample_labels->numel(); ++i) {
+        d_w_matrix.chip(sample_labels_data[i], 0) +=
+            x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) *
+            sample_grad_data[i];
+      }
+    }
+    // get d_x
+    auto d_x = context.Output<Tensor>(framework::GradVarName("Input"));
+    if (d_x != nullptr) {
+      d_x->mutable_data<T>(context.GetPlace());
+      auto d_x_matrix = EigenMatrix<T>::From(*d_x);
+      auto w_matrix = EigenMatrix<T>::From(*(context.Input<Tensor>("Weight")));
+      for (size_t i = 0; i < sample_labels->numel(); ++i) {
+        d_x_matrix.chip((int)(i / sample_labels->dims()[1]), 0) +=
+            w_matrix.chip(sample_labels_data[i], 0) * sample_grad_data[i];
+      }
+    }
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/pool_op.cc
+++ b/paddle/operators/pool_op.cc
@@ -105,7 +105,7 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto,
  // TypedAttrChecker don't support vector type.)
  AddAttr<std::vector<int>>(
      "paddings",
-      "(vector<int>, defalut {0,0}), paddings(height, width) of pooling "
+      "(vector<int>, default {0,0}), paddings(height, width) of pooling "
      "operator."
      "If global_pooling = true, paddings and ksize will be ignored.")
      .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
@@ -127,10 +127,10 @@ Example:
       X shape: $(N, C, H_{in}, W_{in})$
  Output:
       Out shape: $(N, C, H_{out}, W_{out})$
-  where 
+  Where
       $$ 
-       H_{out} = (H_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\
+       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
-       W_{out} = (W_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+       W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
       $$
 )DOC");
@@ -177,7 +177,7 @@ Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto,
                               // TypedAttrChecker don't support vector type.)
  AddAttr<std::vector<int>>(
      "paddings",
-      "(vector<int>, defalut {0,0,0}), paddings(depth, height, "
+      "(vector<int>, default {0,0,0}), paddings(depth, height, "
      "width) of pooling operator. "
      "If global_pooling = true, ksize and paddings will be ignored.")
      .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
@@ -199,11 +199,11 @@ Example:
       X shape: $(N, C, D_{in}, H_{in}, W_{in})$
  Output:
       Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
-  where
+  Where
  $$
-       D_{out} = (D_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\
+       D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
-       H_{out} = (H_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 \\
+       H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
-       W_{out} = (W_{in} - ksize[2] + 2 * paddings[2]) / strides[2] + 1
+       W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
  $$
 )DOC");

--- a/paddle/operators/pool_with_index_op.cc
+++ b/paddle/operators/pool_with_index_op.cc
@@ -142,7 +142,7 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
    // TypedAttrChecker don't support vector type.)
    AddAttr<std::vector<int>>(
        "paddings",
-        "(vector<int>, defalut:{0, 0}), paddings(height, width) of pooling "
+        "(vector<int>, default:{0, 0}), paddings(height, width) of pooling "
        "operator. "
        "If global_pooling = true, paddings and will be ignored.")
        .SetDefault({0, 0});  // TODO(Chengduo): Add checker. (Currently,
@@ -166,10 +166,10 @@ Example:
  Output:
       Out shape: $(N, C, H_{out}, W_{out})$
       Mask shape: $(N, C, H_{out}, W_{out})$
-  where
+  Where
       $$
-       H_{out} = (H_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\
+       H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
-       W_{out} = (W_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1
+       W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
       $$
 )DOC");
@@ -220,7 +220,7 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker {
    // TypedAttrChecker don't support vector type.)
    AddAttr<std::vector<int>>(
        "paddings",
-        "(vector, defalut {0,0,0}), paddings(depth, "
+        "(vector, default {0,0,0}), paddings(depth, "
        "height, width) of pooling operator. "
        "If global_pooling = true, paddings and ksize will be ignored.")
        .SetDefault({0, 0, 0});  // TODO(Chengduo): Add checker. (Currently,
@@ -244,11 +244,11 @@ Example:
  Output:
       Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
       Mask shape: $(N, C, D_{out}, H_{out}, W_{out})$
-  where
+  Where
       $$
-       D_{out} = (D_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\
+       D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
-       H_{out} = (H_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 \\
+       H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
-       W_{out} = (W_{in} - ksize[2] + 2 * paddings[2]) / strides[2] + 1
+       W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
       $$
 )DOC");

--- a/paddle/operators/rank_loss_op.cc
+++ b/paddle/operators/rank_loss_op.cc
@@ -35,9 +35,10 @@ class RankLossOp : public framework::OperatorWithKernel {
    auto right_dims = ctx->GetInputDim("Right");
    PADDLE_ENFORCE((label_dims == left_dims) && (left_dims == right_dims),
-                   "All inputs must have the same size");
+                   "All inputs must have the same size.");
-    PADDLE_ENFORCE((label_dims.size() == 2) && (label_dims[1] == 1),
+    PADDLE_ENFORCE(
-                   "All inputs must be row vector with size batch_size x 1.");
+        (label_dims.size() == 2) && (label_dims[1] == 1),
+        "All inputs must be 2-D tensors with shape [batch_size x 1].");
    ctx->SetOutputDim("Out", label_dims);
  }
 };
@@ -48,10 +49,17 @@ class RankLossOpMaker : public framework::OpProtoAndCheckerMaker {
                  framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("Label",
-             "The label indicating A ranked higher than B or not, row vector.");
+             "(2-D Tensor with shape [batch_size x 1]) "
-    AddInput("Left", "The output of RankNet for doc A, vector.");
+             "The label indicating A ranked higher than B or not.");
-    AddInput("Right", "The output of RankNet for doc B, vetor.");
+    AddInput("Left",
-    AddOutput("Out", "The output loss of RankLoss operator, vector.");
+             "(2-D Tensor with shape [batch_size x 1]) "
+             "The output of RankNet for doc A.");
+    AddInput("Right",
+             "(2-D Tensor with shape [batch_size x 1]) "
+             "The output of RankNet for doc B.");
+    AddOutput("Out",
+              "(2-D Tensor with shape [batch_size x 1]) "
+              "The output loss of RankLoss operator.");
    AddComment(R"DOC(
 RankLoss Operator.
@@ -65,16 +73,17 @@ P = {0, 1} or {0, 0.5, 1}, where 0.5 means no information about the rank of
 the input pair.
 The RankLoss operator takes three inputs: Left (o_i), Right (o_j) and Label
-(P_{i,j}), which represent the output of RankNet for the two docs and the label, 
+(P_{i,j}), which represent the output score of RankNet for the two docs and 
-respectively, and yields the rank loss C_{i,j} using the following equation:
+the label respectively, and yields the rank loss C_{i,j} using the following 
+equation:
-\f$$
+$$
-  C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}}) \\
+  C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\
  o_{i,j} =  o_i - o_j  \\
  \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \}
-\f$$
+$$
-The operator can take inputs of one sample or in batch.
+The operator can take batch inputs with size batch_size (batch_size >= 1).
 )DOC");
  }

--- a/paddle/operators/rank_loss_op.cu
+++ b/paddle/operators/rank_loss_op.cu
--- a/paddle/operators/rank_loss_op.h
+++ b/paddle/operators/rank_loss_op.h
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -599,7 +599,9 @@ class RecurrentGradOpShapeInference : public framework::InferShapeBase {
    std::vector<std::string> output{kOutputs};
    for (auto &s : input) {
      PADDLE_ENFORCE(ctx->HasInputs(s));
-      PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(s)));
+      PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(s)),
+                     "Cannot find the gradient variable %s",
+                     framework::GradVarName(s));
    }
    for (auto &s : output) {
      PADDLE_ENFORCE(ctx->HasInputs(s));

--- a/paddle/operators/reshape_op.cc
+++ b/paddle/operators/reshape_op.cc
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
   Licensed under the Apache License, Version 2.0 (the "License");
@@ -38,8 +37,8 @@ class ReshapeOp : public framework::OperatorWithKernel {
    // TODO(qiao) change batch_size
    for (size_t i = 1; i < shape.size(); ++i) {
      PADDLE_ENFORCE(shape[i] > 0,
-                     "Each dimension of shape "
+                     "Each dimension of Attr(shape) "
-                     "must be positiv except the first.");
+                     "must be positive except the first one.");
    }
    if (shape[0] < 0) {
      shape[0] = x_dims[0];

--- a/paddle/operators/reshape_op.cu.cc
+++ b/paddle/operators/reshape_op.cu.cc
--- a/paddle/operators/reshape_op.h
+++ b/paddle/operators/reshape_op.h
--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
@@ -77,4 +77,6 @@ REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker<float>,
                  ops::ScaleGradMaker);
 REGISTER_OP_CPU_KERNEL(scale,
                       ops::ScaleKernel<paddle::platform::CPUPlace, float>,
-                       ops::ScaleKernel<paddle::platform::CPUPlace, double>);
+                       ops::ScaleKernel<paddle::platform::CPUPlace, double>,
+                       ops::ScaleKernel<paddle::platform::CPUPlace, int>,
+                       ops::ScaleKernel<paddle::platform::CPUPlace, int64_t>);
--- a/paddle/operators/scale_op.cu
+++ b/paddle/operators/scale_op.cu
@@ -16,4 +16,6 @@
 REGISTER_OP_GPU_KERNEL(
    scale, paddle::operators::ScaleKernel<paddle::platform::GPUPlace, float>,
-    paddle::operators::ScaleKernel<paddle::platform::GPUPlace, double>);
+    paddle::operators::ScaleKernel<paddle::platform::GPUPlace, double>,
+    paddle::operators::ScaleKernel<paddle::platform::GPUPlace, int>,
+    paddle::operators::ScaleKernel<paddle::platform::GPUPlace, int64_t>);
--- a/paddle/operators/sequence_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
@@ -104,6 +104,7 @@ class SequencePoolGradOp : public framework::OperatorWithKernel {
      PADDLE_ENFORCE_EQ(og_dims[i], x_dims[i], "The dimension mismatch.");
    }
    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    ctx->ShareLoD("X", framework::GradVarName("X"));
  }
 protected:

--- a/paddle/operators/sequence_slice_op.h
+++ b/paddle/operators/sequence_slice_op.h
@@ -54,10 +54,10 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {
    PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
    PADDLE_ENFORCE_EQ(
        n, static_cast<size_t>(length->dims()[0]),
-        "The size of input-sequence and length-array should be the same")
+        "The size of input-sequence and length-array should be the same");
    PADDLE_ENFORCE_EQ(
        n, static_cast<size_t>(offset->dims()[0]),
-        "The size of input-sequence and offset-array should be the same")
+        "The size of input-sequence and offset-array should be the same");
    const int64_t* offset_data = offset->data<int64_t>();
    const int64_t* length_data = length->data<int64_t>();
@@ -78,11 +78,11 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {
    for (size_t i = 0; i < n; ++i) {
      PADDLE_ENFORCE_LT(0, offset_data[i],
-                        "The offset[%d] must greater than zero.", i)
+                        "The offset[%d] must greater than zero.", i);
      PADDLE_ENFORCE_LT(0, length_data[i],
-                        "The length[%d] must greater than zero.", i)
+                        "The length[%d] must greater than zero.", i);
      PADDLE_ENFORCE_LT(lod[0][i] + offset_data[i] + length_data[i],
-                        lod[0][i + 1], "The target tensor's length overflow.")
+                        lod[0][i + 1], "The target tensor's length overflow.");
    }
    out->mutable_data<T>(ctx.GetPlace());

--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -25,20 +25,19 @@ class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel {
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
-                   "Input(Labels) should be not null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null.");
    auto x_dims = ctx->GetInputDim("X");
-    auto labels_dims = ctx->GetInputDim("Labels");
+    auto labels_dims = ctx->GetInputDim("Label");
    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
    PADDLE_ENFORCE_EQ(labels_dims.size(), 2,
-                      "Input(Labels)'s rank should be 2.");
+                      "Input(Label)'s rank should be 2.");
    PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0],
-                      "The 1st dimension of Input(X) and Input(Labels) should "
+                      "The 1st dimension of Input(X) and Input(Label) should "
                      "be equal.");
    PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1],
-                      "The 2nd dimension of Input(X) and Input(Labels) should "
+                      "The 2nd dimension of Input(X) and Input(Label) should "
                      "be equal.");
    ctx->SetOutputDim("Out", x_dims);
@@ -53,26 +52,25 @@ class SigmoidCrossEntropyWithLogitsGradOp
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null.");
-    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null.");
-                   "Input(Labels) should be not null.");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) shoudl be not null.");
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                   "Output(X@GRAD) should be not null.");
    auto x_dims = ctx->GetInputDim("X");
-    auto labels_dims = ctx->GetInputDim("Labels");
+    auto labels_dims = ctx->GetInputDim("Label");
    auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
    PADDLE_ENFORCE_EQ(labels_dims.size(), 2,
-                      "Input(Labels)'s rank should be 2.");
+                      "Input(Label)'s rank should be 2.");
    PADDLE_ENFORCE_EQ(dout_dims.size(), 2,
                      "Input(Out@Grad)'s rank should be 2.");
    PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0],
-                      "The 1st dimension of Input(X) and Input(Labels) should "
+                      "The 1st dimension of Input(X) and Input(Label) should "
                      "be equal.");
    PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1],
-                      "The 2nd dimension of Input(X) and Input(Labels) should "
+                      "The 2nd dimension of Input(X) and Input(Label) should "
                      "be equal.");
    PADDLE_ENFORCE_EQ(x_dims[0], dout_dims[0],
                      "The 1st dimension of Input(X) and Input(Out@Grad) "
@@ -97,7 +95,7 @@ class SigmoidCrossEntropyWithLogitsOpMaker
             "This input is a tensor of logits computed by the previous "
             " operator. Logits are unscaled log probabilities given as "
             "log(p/(1-p)).");
-    AddInput("Labels",
+    AddInput("Label",
             "(Tensor, default Tensor<float>), a 2-D tensor of the same type "
             "and shape as X. This input is a tensor of probabalistic labels "
             "for each logit");

--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
@@ -25,8 +25,7 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    const framework::Tensor *X = context.Input<framework::Tensor>("X");
-    const framework::Tensor *Labels =
+    const framework::Tensor *Labels = context.Input<framework::Tensor>("Label");
-        context.Input<framework::Tensor>("Labels");
    framework::Tensor *Out = context.Output<framework::Tensor>("Out");
    Out->mutable_data<T>(context.GetPlace());
@@ -52,8 +51,7 @@ class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    const framework::Tensor *X = context.Input<framework::Tensor>("X");
-    const framework::Tensor *Labels =
+    const framework::Tensor *Labels = context.Input<framework::Tensor>("Label");
-        context.Input<framework::Tensor>("Labels");
    const framework::Tensor *dOut =
        context.Input<framework::Tensor>(framework::GradVarName("Out"));
    framework::Tensor *dX =

--- a/paddle/operators/smooth_l1_loss_op.cc
+++ b/paddle/operators/smooth_l1_loss_op.cc
@@ -22,22 +22,20 @@ class SmoothL1LossOp : public framework::OperatorWithKernel {
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null.");
    auto x_dims = ctx->GetInputDim("X");
    auto y_dims = ctx->GetInputDim("Y");
-    PADDLE_ENFORCE_EQ(x_dims, y_dims, "The shape of X and Y must be the same.");
+    PADDLE_ENFORCE_EQ(x_dims, y_dims);
    PADDLE_ENFORCE_GE(x_dims.size(), 2,
-                      "The tensor rank of X must be at least 2.");
+                      "The tensor rank of Input(X) should not be less than 2.");
    if (ctx->HasInput("InsideWeight")) {
      PADDLE_ENFORCE(ctx->HasInput("OutsideWeight"),
                     "If weights are provided, must specify both "
                     "inside and outside weights.");
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims,
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims);
-                        "The shape of InsideWeight must be same as X.");
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims);
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims,
-                        "The shape of OutsideWeight must be same as X.");
    }
    ctx->SetOutputDim("Diff", x_dims);
@@ -53,25 +51,29 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
                      framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X",
-             "The input tensor of smooth l1 loss op."
+             "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
-             "The rank should be greater or equal to 2 with shape "
+             "The input value of smooth l1 loss op with shape "
-             "[batch_size, value_dim1, value_dim2, ..., value_dimN]");
+             "[batch_size, dim1, ..., dimN].");
    AddInput("Y",
-             "The target tensor of smooth l1 loss op "
+             "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
-             "with the same shape as X.");
+             "The target value of smooth l1 loss op with same shape as X.");
    AddInput("InsideWeight",
-             "Optional input tensor of smooth l1 loss op with the same shape "
+             "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
-             "as X. If provided, the result of (X - Y) will be multiplied "
+             "This input is optional and should have same shape with X. "
+             "If provided, the result of (X - Y) will be multiplied "
             "by this tensor element by element.")
        .AsDispensable();
    AddInput("OutsideWeight",
-             "Optinal input of smooth l1 loss op with the same shape as X."
+             "(Tensor, default Tensor<float>) A tensor with rank at least 2. "
-             "If provided, the output smooth l1 loss will be multiplied by "
+             "This input is optional and should have same shape with X. "
-             "this tensor element by element.")
+             "If provided, the out smooth l1 loss will be multiplied by this "
+             "tensor element by element.")
        .AsDispensable();
-    AddOutput("Diff", "Intermediate variable to cache InsideWeight*(X-Y).")
+    AddOutput("Diff", "Intermediate variable to cache InsideWeight * (X - Y).")
        .AsIntermediate();
-    AddOutput("Out", "Smooth l1 loss.");
+    AddOutput("Out",
+              "(Tensor, default Tensor<float>) A tensor with rank be 2. "
+              "The output smooth l1 loss with shape [batch_size, 1].");
    AddAttr<AttrType>("sigma",
                      "Hyper parameter of smooth l1 loss op."
                      "A float scalar with default value 3.0.")
@@ -79,15 +81,23 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker {
    AddComment(R"DOC(
 Smooth L1 Loss Operator.
-This operator computes the smooth l1 loss for input and target.
+This operator computes the smooth l1 loss for X and Y.
-The operator takes the first dimension of input as the batch size.
+The operator takes the first dimension of X and Y as batch size.
 For each instance, it computes the smooth l1 loss element by element first
-and then sums all the losses. So the resulting output shape
+and then sums all the losses. So the shape of Out is [batch_size, 1].
-is [batch_size, 1].
 The equation is:
-loss = $$0.5 * (\sigma * (x-y))^2$$   if $$|x - y| < 1 /({\sigma}^2)$$
+$$
-       $$\frac{|x - y| - 0.5}{{\sigma}^2}$$ otherwise
+Out_{\sigma}(X, Y)_i = \begin{cases}
+0.5 * (\sigma * (X_i - Y_i)) ^ 2
+\quad |X_i - Y_i| \lt \frac{1} {{\sigma} ^ 2} \\
+\frac{|X_i - Y_i| - 0.5}{{\sigma}^2},
+\quad otherwise
+\end{cases}
+$$
+In the above equation, $Out_{\sigma}(X, Y)_i$, $X_i$ and $Y_i$ represent the ith
+element of Out, X and Y.
 )DOC");
  }

--- a/paddle/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/operators/softmax_with_cross_entropy_op.cc
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/operators/softmax_with_cross_entropy_op.h"
-#include <paddle/function/TensorType.h>
 namespace paddle {
 namespace operators {

--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
@@ -37,10 +37,16 @@ class SumOp : public framework::OperatorWithKernel {
    size_t N = x_dims.size();
    PADDLE_ENFORCE_GT(N, 1, "Input tensors count should > 1.");
-    auto in_dim = x_dims[0];
+    framework::DDim in_dim({0});
-    for (size_t i = 1; i < N; i++) {
+    for (auto& x_dim : x_dims) {
-      auto dim = x_dims[i];
+      if (framework::product(x_dim) == 0) {
-      PADDLE_ENFORCE_EQ(in_dim, dim, "Input tensors must have same shape");
+        continue;
+      }
+      if (framework::product(in_dim) == 0) {
+        in_dim = x_dim;
+      } else {
+        PADDLE_ENFORCE_EQ(in_dim, x_dim, "Input tensors must have same shape");
+      }
    }
    ctx->SetOutputDim("Out", in_dim);
    ctx->ShareLoD("X", /*->*/ "Out");
@@ -51,8 +57,22 @@ class SumOp : public framework::OperatorWithKernel {
      const framework::ExecutionContext& ctx) const override {
    auto x_vars = ctx.MultiInputVar("X");
    if (x_vars[0]->IsType<framework::LoDTensor>()) {
-      return framework::OpKernelType(
+      int dtype = -1;
-          framework::ToDataType(x_vars[0]->Get<framework::LoDTensor>().type()),
+      for (auto& x_var : x_vars) {
+        auto& lod_tensor = x_var->Get<framework::LoDTensor>();
+        if (lod_tensor.numel() == 0) {
+          continue;
+        }
+        if (dtype == -1) {
+          dtype = framework::ToDataType(lod_tensor.type());
+        } else {
+          PADDLE_ENFORCE_EQ(dtype, framework::ToDataType(lod_tensor.type()));
+        }
+      }
+      PADDLE_ENFORCE_NE(dtype, -1,
+                        "Sum operator should have at least one tensor");
+      return framework::OpKernelType(static_cast<framework::DataType>(dtype),
                                     ctx.device_context());
    } else if (x_vars[0]->IsType<framework::SelectedRows>()) {
      return framework::OpKernelType(

--- a/paddle/operators/sum_op.h
+++ b/paddle/operators/sum_op.h
@@ -53,6 +53,9 @@ class SumKernel : public framework::OpKernel<T> {
      for (int i = in_place ? 1 : 0; i < N; i++) {
        if (in_vars[i]->IsType<framework::LoDTensor>()) {
          auto &in_t = in_vars[i]->Get<framework::LoDTensor>();
+          if (in_t.numel() == 0) {
+            continue;
+          }
          auto in = EigenVector<T>::Flatten(in_t);
          result.device(place) = result + in;
        } else if (in_vars[i]->IsType<framework::SelectedRows>()) {
@@ -84,7 +87,7 @@ class SumKernel : public framework::OpKernel<T> {
      int64_t offset = 0;
      for (int i = 0; i < N; i++) {
        PADDLE_ENFORCE_EQ(out->height(),
-                          in_vars[i]->Get<SelectedRows>().height())
+                          in_vars[i]->Get<SelectedRows>().height());
        functor(context.device_context(), in_vars[i]->Get<SelectedRows>(),
                offset, out);
        offset += in_vars[i]->Get<SelectedRows>().value().numel();

--- a/paddle/operators/tensor_array_read_write_op.cc
+++ b/paddle/operators/tensor_array_read_write_op.cc
@@ -27,7 +27,7 @@ class WriteToArrayOp : public ArrayOp {
  void Run(const framework::Scope &scope,
           const platform::DeviceContext &dev_ctx) const override {
    auto *x = scope.FindVar(Input("X"));
-    PADDLE_ENFORCE(x != nullptr, "X must be set");
+    if (x == nullptr) return;
    auto &x_tensor = x->Get<framework::LoDTensor>();
    size_t offset = GetOffset(scope, dev_ctx);
    auto *out =
@@ -37,9 +37,15 @@ class WriteToArrayOp : public ArrayOp {
               << " to " << offset + 1;
      out->resize(offset + 1);
    }
+    if (x_tensor.memory_size() > 0) {
      auto *out_tensor = &out->at(offset);
      CopyFrom(x_tensor, dev_ctx.GetPlace(), dev_ctx, out_tensor);
      out_tensor->set_lod(x_tensor.lod());
+    } else {
+      VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so "
+                  "nothing has been written to output array["
+               << offset << "].";
+    }
  }
 };
@@ -70,7 +76,9 @@ class WriteToArrayInferShape : public framework::InferShapeBase {
    PADDLE_ENFORCE(context->HasInput("I"), "Must set the subscript index");
    PADDLE_ENFORCE_EQ(framework::product(context->GetInputDim("I")), 1,
                      "The number of element of subscript index must be 1");
-    PADDLE_ENFORCE(context->HasInput("X"), NotHasXError());
+    if (!context->HasInput("X")) {
+      return;
+    }
    PADDLE_ENFORCE(context->HasOutput("Out"), NotHasOutError());
    context->SetOutputDim("Out", context->GetInputDim("X"));
  }
@@ -93,9 +101,10 @@ class WriteToArrayInferVarType : public framework::VarTypeInference {
    auto &out = detail::Ref(block->FindRecursiveOrCreateVar(out_name),
                            "Cannot found %s", out_name);
    out.SetType(framework::VarDesc::LOD_TENSOR_ARRAY);
-    auto &x =
+    auto *x = block->FindVarRecursive(x_name);
-        detail::Ref(block->FindVarRecursive(x_name), "Cannot found %s", x_name);
+    if (x != nullptr) {
-    out.SetDataType(x.GetDataType());
+      out.SetDataType(x->GetDataType());
+    }
  }
 };
@@ -115,10 +124,13 @@ class ReadFromArrayOp : public ArrayOp {
    PADDLE_ENFORCE(out != nullptr, "Out must be set");
    auto *out_tensor = out->GetMutable<framework::LoDTensor>();
    size_t offset = GetOffset(scope, dev_ctx);
-    PADDLE_ENFORCE_LT(offset, x_array.size());
+    if (offset < x_array.size()) {
      framework::CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx,
                          out_tensor);
      out_tensor->set_lod(x_array[offset].lod());
+    } else {
+      VLOG(10) << "offset " << offset << " >= " << x_array.size();
+    }
  }
 };

--- a/paddle/operators/unpool_op.cc
+++ b/paddle/operators/unpool_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/unpool_op.h"
+namespace paddle {
+namespace operators {
+class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  Unpool2dOpMaker(framework::OpProto* proto,
+                  framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput(
+        "X",
+        "(Tensor) The input tensor of unpool operator. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    AddInput(
+        "Indices",
+        "(Tensor) The input tensor of the indices given out by MaxPool2d. "
+        "The format of input tensor is NCHW. Where N is batch size, C is the "
+        "number of channels, H and W is the height and width of feature.");
+    AddOutput("Out",
+              "(Tensor) The output tensor of unpool operator."
+              "The format of output tensor is also NCHW."
+              "Where N is batch size, C is "
+              "the number of channels, H and W is the height and "
+              "width of feature.");
+    AddAttr<std::vector<int>>(
+        "ksize",
+        "(vector), the unpooling window size(height, width) "
+        "of unpooling operator.");
+    AddAttr<std::vector<int>>("strides",
+                              "(vector, default:{1, 1}), "
+                              "strides (height, width) of unpooling operator.")
+        .SetDefault({1, 1});
+    AddAttr<std::vector<int>>("paddings",
+                              "(vector defalut:{0,0}), "
+                              "paddings (height, width) of unpooling operator.")
+        .SetDefault({0, 0});
+    AddAttr<std::string>(
+        "unpooling_type",
+        "(string), unpooling type, can be \"max\" for max-unpooling ")
+        .InEnum({"max"});
+    AddComment(R"DOC(
+        "Input shape: $(N, C_{in}, H_{in}, W_{in})$
+        Output shape: $(N, C_{out}, H_{out}, W_{out})$
+        Where
+          $$
+            H_{out} = (H_{in}−1) * strides[0] − 2 * paddings[0] + ksize[0] \\
+            W_{out} = (W_{in}−1) * strides[1] − 2 * paddings[1] + ksize[1]
+          $$
+        Paper: http://www.matthewzeiler.com/wp-content/uploads/2017
+        /07/iccv2011.pdf
+        )DOC");
+  }
+};
+int OutputSize(int input_size, int ksize, int padding, int stride) {
+  int output_size = (input_size - 1) * stride - 2 * padding + ksize;
+  return output_size;
+}
+class UnpoolOp : public framework::OperatorWithKernel {
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of UnpoolOp"
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Indices"),
+                   "Input(Indices) of UnpoolOp"
+                   "should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of UnpoolOp should not be null.");
+    auto in_x_dims = ctx->GetInputDim("X");
+    auto in_y_dims = ctx->GetInputDim("Indices");
+    std::string unpooling_type =
+        ctx->Attrs().Get<std::string>("unpooling_type");
+    std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
+    std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
+    std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+    PADDLE_ENFORCE(in_x_dims.size() == 4,
+                   "Unpooling intput must be of 4-dimensional.");
+    PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims);
+    std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
+    for (size_t i = 0; i < ksize.size(); ++i) {
+      output_shape.push_back(
+          OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i]));
+    }
+    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+  }
+};
+class UnpoolOpGrad : public framework::OperatorWithKernel {
+ protected:
+  framework::OpKernelType GetKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
+        ctx.device_context());
+  }
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
+                   "Input(X@GRAD) should not be null.");
+    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad,
+            ops::UnpoolOpGrad);
+REGISTER_OP_CPU_KERNEL(unpool,
+                       ops::UnpoolKernel<paddle::platform::CPUPlace, float>,
+                       ops::UnpoolKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    unpool_grad, ops::UnpoolGradKernel<paddle::platform::CPUPlace, float>,
+    ops::UnpoolGradKernel<paddle::platform::CPUPlace, double>);
--- a/paddle/operators/unpool_op.cu.cc
+++ b/paddle/operators/unpool_op.cu.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/operators/unpool_op.h"
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(unpool,
+                       ops::UnpoolKernel<paddle::platform::GPUPlace, float>,
+                       ops::UnpoolKernel<paddle::platform::GPUPlace, double>);
+REGISTER_OP_GPU_KERNEL(
+    unpool_grad, ops::UnpoolGradKernel<paddle::platform::GPUPlace, float>,
+    ops::UnpoolGradKernel<paddle::platform::GPUPlace, double>);
--- a/paddle/operators/unpool_op.h
+++ b/paddle/operators/unpool_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+Indicesou may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/unpooling.h"
+namespace paddle {
+namespace operators {
+template <typename Place, typename T>
+class UnpoolKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* in_y = context.Input<framework::Tensor>("Indices");
+    auto* out = context.Output<framework::Tensor>("Out");
+    std::string unpooling_type = context.Attr<std::string>("unpooling_type");
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    T* output_data = out->mutable_data<T>(context.GetPlace());
+    if (output_data) {
+      math::SetConstant<Place, T> set_zero;
+      set_zero(context.device_context(), out, static_cast<T>(0));
+    }
+    math::Unpool2dMaxFunctor<Place, T> unpool2d_max_forward;
+    unpool2d_max_forward(context.device_context(), *in_x, *in_y, out);
+  }
+};
+template <typename Place, typename T>
+class UnpoolGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const framework::Tensor* in_x = context.Input<framework::Tensor>("X");
+    const framework::Tensor* in_y = context.Input<framework::Tensor>("Indices");
+    const framework::Tensor* out = context.Input<framework::Tensor>("Out");
+    const framework::Tensor* out_grad =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    framework::Tensor* in_x_grad =
+        context.Output<framework::Tensor>(framework::GradVarName("X"));
+    std::string unpooling_type = context.Attr<std::string>("unpooling_type");
+    std::vector<int> ksize = context.Attr<std::vector<int>>("ksize");
+    std::vector<int> strides = context.Attr<std::vector<int>>("strides");
+    std::vector<int> paddings = context.Attr<std::vector<int>>("paddings");
+    auto& device_ctx = context.device_context();
+    math::SetConstant<Place, T> zero;
+    if (in_x_grad) {
+      in_x_grad->mutable_data<T>(context.GetPlace());
+      zero(device_ctx, in_x_grad, static_cast<T>(0));
+    }
+    math::Unpool2dMaxGradFunctor<Place, T> unpool2d_max_backward;
+    unpool2d_max_backward(context.device_context(), *in_x, *in_y, *out,
+                          *out_grad, in_x_grad);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/while_op.cc
+++ b/paddle/operators/while_op.cc
@@ -98,8 +98,6 @@ class WhileGradOp : public framework::OperatorBase {
  void Run(const framework::Scope &scope,
           const platform::DeviceContext &dev_ctx) const override {
-    //    PADDLE_ENFORCE(...)
    framework::Executor executor(dev_ctx);
    auto *block = Attr<framework::BlockDescBind *>(kStepBlock);
    auto *program = block->Program();
@@ -124,8 +122,12 @@ class WhileGradOp : public framework::OperatorBase {
        auto inside_og_name = inside_og_names[i];
        VLOG(10) << "Linking outside " << outside_og_name << " --> inside "
                 << inside_og_name;
-        auto &og_outside = detail::Ref(scope.FindVar(outside_og_name));
+        auto &og_outside =
-        auto &og_inside = detail::Ref(cur_scope.Var(inside_og_name));
+            detail::Ref(scope.FindVar(outside_og_name),
+                        "Cannot find Outside Gradient %s", outside_og_name);
+        auto &og_inside =
+            detail::Ref(cur_scope.Var(inside_og_name),
+                        "Cannot find inside gradient %s", inside_og_name);
        if (og_outside.Type().hash_code() ==
            typeid(framework::LoDTensor).hash_code()) {
          auto &outside_tensor = og_outside.Get<framework::LoDTensor>();
@@ -160,7 +162,7 @@ class WhileGradOp : public framework::OperatorBase {
      PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size());
      for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) {
        if (pg_names[param_id] == framework::kEmptyVarName) {
-          continue;  // iterator doesn't have gradient
+          continue;  // parameter doesn't have gradient
        }
        auto inside_grad_name = framework::GradVarName(p_names[param_id]);
@@ -190,7 +192,6 @@ class WhileGradOp : public framework::OperatorBase {
          }
        }
-        // sum gradient
        auto new_inside_name = cur_scope.Rename(inside_grad_name);
        auto sum_op = framework::OpRegistry::CreateOp(
            "sum", {{"X", {pg_names[param_id], new_inside_name}}},
@@ -207,18 +208,35 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
  using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
 protected:
-  virtual std::unique_ptr<framework::OpDescBind> Apply() const {
+  std::unique_ptr<framework::OpDescBind> Apply() const override {
    auto *grad = new framework::OpDescBind();
    grad->SetType("while_grad");
    grad->SetInput(kParameters, Input(kParameters));
-    grad->SetOutput(
-        framework::GradVarName(kParameters),
+    // Not all of IGs will be generated by inner gradient operators of while op.
-        InputGrad(kParameters, /*do not drop empty gradient*/ false));
+    // Ignore IGs that is not generated by the inside block.
+    auto igs = InputGrad(kParameters, /*do not drop empty gradient*/ false);
+    std::unordered_set<std::string> all_outs;
+    for (size_t i = 0; i < grad_block_[0]->OpSize(); ++i) {
+      for (auto &oname : grad_block_[0]->Op(i)->OutputArgumentNames()) {
+        all_outs.insert(oname);
+      }
+    }
+    for (auto &each_ig : igs) {
+      if (all_outs.find(each_ig) == all_outs.end()) {
+        VLOG(10) << "Ignore " << each_ig;
+        each_ig = framework::kEmptyVarName;
+      }
+    }
+    grad->SetOutput(framework::GradVarName(kParameters), igs);
    grad->SetInput(kOutputs, Output(kOutputs));
    // OG should be re-calculated by step blocks, since many outputs of while op
    // do not need to calculate gradients.
    std::unordered_set<std::string> block_ins;
+    auto *fwd_block = this->grad_block_[0]->ParentBlock();
    {
      for (auto &p : Input(kParameters)) {
        block_ins.insert(p);
@@ -233,6 +251,13 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
        if (block_ins.find(input_name) != block_ins.end()) {
          continue;
        }
+        // If the input of Op is generated by the forward block, do not make it
+        // as input again.
+        if (fwd_block->FindVar(input_name) != nullptr) {
+          continue;
+        }
        extra_inputs.insert(input_name);
      }
@@ -287,7 +312,6 @@ class WhileGradOpShapeInference : public framework::InferShapeBase {
    auto p_names = ctx->Inputs(kParameters);
    auto pg_names = ctx->Outputs(kParamGrads);
-    auto dims = ctx->GetInputsDim(kParameters);
    auto var_types = ctx->GetInputsVarType(kParameters);
    std::vector<std::string> names_to_set;
    std::vector<framework::DDim> dims_to_set;
@@ -295,13 +319,14 @@ class WhileGradOpShapeInference : public framework::InferShapeBase {
      if (pg_names[i] == framework::kEmptyVarName) {
        continue;
      }
+      auto dims = ctx->GetInputsElementDim(kParameters, i);
      if (var_types[i] == framework::VarDesc::LOD_TENSOR) {
        names_to_set.push_back(pg_names[i]);
-        dims_to_set.push_back(dims[i]);
+        dims_to_set.push_back(dims);
      } else if (var_types[i] == framework::VarDesc::LOD_TENSOR_ARRAY) {
        // not sure how to set the dim of LOD_TENSOR_ARRAY
        names_to_set.push_back(pg_names[i]);
-        dims_to_set.push_back(dims[i]);
+        dims_to_set.push_back(dims);
      }
    }
    ctx->SetDims(names_to_set, dims_to_set);

--- a/paddle/optimizer/parameter_optimizer_test.cc
+++ b/paddle/optimizer/parameter_optimizer_test.cc
@@ -127,8 +127,3 @@ TEST_F(OptimizerTest, TestGetWeight) { TestGetWeight(); }
 TEST_F(OptimizerTest, TestUpdate) { TestUpdate(); }
 TEST_F(OptimizerTest, TestCheckPoint) { TestCheckPoint(); }
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
--- a/paddle/optimizer/serialization_test.cc
+++ b/paddle/optimizer/serialization_test.cc
@@ -46,8 +46,3 @@ TEST(TensorToProto, Case2) {
    EXPECT_EQ(t1[i], t[i]);
  }
 }
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
--- a/paddle/platform/cuda_profiler.h
+++ b/paddle/platform/cuda_profiler.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include <cuda_profiler_api.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+namespace paddle {
+namespace platform {
+void CudaProfilerInit(std::string output_file, std::string output_mode,
+                      std::vector<std::string> config_flags) {
+  std::array<char, 128> buf;
+  std::string tmpl = "/tmp/cuda_profile_config.XXXXXX";
+  PADDLE_ENFORCE_LT(tmpl.size(), buf.size());
+  memcpy(buf.data(), tmpl.data(), tmpl.size());
+  auto result = mktemp(buf.data());
+  PADDLE_ENFORCE(strlen(result) != 0);
+  std::string config_file = result;
+  {
+    std::ofstream ofs(config_file, std::ios::out | std::ios::trunc);
+    PADDLE_ENFORCE(ofs.is_open(), "ofstream: ", ofs.rdstate());
+    for (const auto& line : config_flags) {
+      ofs << line << std::endl;
+    }
+  }
+  PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv");
+  cudaOutputMode_t mode = output_mode == "csv" ? cudaCSV : cudaKeyValuePair;
+  PADDLE_ENFORCE(
+      cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode));
+}
+void CudaProfilerStart() { PADDLE_ENFORCE(cudaProfilerStart()); }
+void CudaProfilerStop() { PADDLE_ENFORCE(cudaProfilerStop()); }
+}  // namespace platform
+}  // namespace paddle
--- a/paddle/platform/dynload/cudnn.cc
+++ b/paddle/platform/dynload/cudnn.cc
@@ -37,6 +37,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP);
 CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP);
 #endif
+#ifdef CUDNN_DNN_ROUTINE_EACH_R7
+CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP);
+#endif
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/platform/dynload/cudnn.h
+++ b/paddle/platform/dynload/cudnn.h
@@ -135,6 +135,12 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
 #endif
+#if CUDNN_VERSION >= 7001
+#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \
+  __macro(cudnnSetConvolutionGroupCount);
+CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP)
+#endif
 }  // namespace dynload
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -235,15 +235,23 @@ inline void throw_on_error(T e) {
 #define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \
  __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__)
 #define PADDLE_ENFORCE_NOT_NULL(__VAL, ...)                  \
-  PADDLE_ENFORCE(nullptr != (__VAL), #__VAL " should not be null\n%s", \
+  do {                                                       \
-                 paddle::string::Sprintf("" __VA_ARGS__));
+    if (UNLIKELY(nullptr == (__VAL))) {                      \
+      PADDLE_THROW(#__VAL " should not be null\n%s",         \
+                   paddle::string::Sprintf("" __VA_ARGS__)); \
+    }                                                        \
+  } while (0)
 #define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...)  \
-  PADDLE_ENFORCE(__VAL0 __CMP __VAL1,                                         \
+  do {                                                                  \
-                 "enforce %s " #__CMP " %s failed, %s " #__INV_CMP " %s\n%s", \
+    if (UNLIKELY(!((__VAL0)__CMP(__VAL1)))) {                           \
+      PADDLE_THROW("enforce %s " #__CMP " %s failed, %s " #__INV_CMP    \
+                   " %s\n%s",                                           \
                   #__VAL0, #__VAL1, paddle::string::to_string(__VAL0), \
                   paddle::string::to_string(__VAL1),                   \
-                 paddle::string::Sprintf("" __VA_ARGS__));
+                   paddle::string::Sprintf("" __VA_ARGS__));            \
+    }                                                                   \
+  } while (0)
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -18,8 +18,8 @@ limitations under the License. */
 #include "paddle/platform/enforce.h"
-DEFINE_double(fraction_of_gpu_memory_to_use, 0.95,
+DEFINE_double(fraction_of_gpu_memory_to_use, 0.92,
-              "Default use 95% of GPU memory for PaddlePaddle,"
+              "Default use 92% of GPU memory for PaddlePaddle,"
              "reserve the rest for page tables, etc");
 namespace paddle {
@@ -75,15 +75,19 @@ size_t GpuMaxChunkSize() {
  GpuMemoryUsage(available, total);
  // Reserving the rest memory for page tables, etc.
-  size_t reserving = (1 - FLAGS_fraction_of_gpu_memory_to_use) * total;
+  size_t reserving = 0.05 * total;
  // If available less than minimum chunk size, no usable memory exists.
-  available = std::max(available, GpuMinChunkSize()) - GpuMinChunkSize();
+  available =
+      std::max(std::max(available, GpuMinChunkSize()) - GpuMinChunkSize(),
+               reserving) -
+      reserving;
-  // If available less than reserving, no usable memory exists.
+  size_t allocating = FLAGS_fraction_of_gpu_memory_to_use * total;
-  size_t usable = std::max(available, reserving) - reserving;
-  return usable;
+  PADDLE_ENFORCE_LT(allocating, available);
+  return allocating;
 }
 void GpuMemcpyAsync(void *dst, const void *src, size_t count,

--- a/paddle/pserver/CMakeLists.txt
+++ b/paddle/pserver/CMakeLists.txt
@@ -49,7 +49,7 @@ if(WITH_TESTING)
  add_subdirectory(test)
 endif()
-if(NOT WITH_C_API)
+if(NOT MOBILE_INFERENCE)
  add_executable(paddle_pserver_main ${PSERVER_MAIN_SOURCES})
  link_paddle_exe(paddle_pserver_main)

--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
@@ -5,4 +5,6 @@ if(WITH_PYTHON)
    ${GLOB_OP_LIB})
 endif(WITH_PYTHON)
-cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB})
+if(WITH_DOC)
+  cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB})
+endif(WITH_DOC)
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -37,6 +37,7 @@ limitations under the License. */
 #ifdef PADDLE_WITH_CUDA
 #include "paddle/operators/nccl/nccl_gpu_common.h"
+#include "paddle/platform/cuda_profiler.h"
 #include "paddle/platform/gpu_info.h"
 #endif
@@ -460,6 +461,10 @@ All parameter, weight, gradient are variables in Paddle.
  m.def("op_support_gpu", OpSupportGPU);
 #ifdef PADDLE_WITH_CUDA
  m.def("get_cuda_device_count", platform::GetCUDADeviceCount);
+  m.def("nvprof_init", platform::CudaProfilerInit);
+  m.def("nvprof_start", platform::CudaProfilerStart);
+  m.def("nvprof_stop", platform::CudaProfilerStop);
 #endif
  return m.ptr();

--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -36,6 +36,7 @@ function cmake_gen() {
        ${PYTHON_FLAGS}
        -DWITH_DOC=OFF
        -DWITH_GPU=${WITH_GPU:-OFF}
+        -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
        -DWITH_MKL=${WITH_MKL:-ON}
        -DWITH_AVX=${WITH_AVX:-OFF}
        -DWITH_GOLANG=${WITH_GOLANG:-ON}
@@ -57,6 +58,7 @@ EOF
        ${PYTHON_FLAGS} \
        -DWITH_DOC=OFF \
        -DWITH_GPU=${WITH_GPU:-OFF} \
+        -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
        -DWITH_MKL=${WITH_MKL:-ON} \
        -DWITH_AVX=${WITH_AVX:-OFF} \
        -DWITH_GOLANG=${WITH_GOLANG:-ON} \
@@ -183,6 +185,14 @@ EOF
    ${DOCKERFILE_GPU_ENV}
    ADD go/cmd/pserver/pserver /usr/bin/
    ADD go/cmd/master/master /usr/bin/
+EOF
+    if [[ ${WITH_DOC:-OFF} == 'ON' ]]; then
+        cat >> /paddle/build/Dockerfile <<EOF
+        ADD paddle/pybind/print_operators_doc /usr/bin/
+EOF
+    fi
+    cat >> /paddle/build/Dockerfile <<EOF
    # default command shows the paddle version and exit
    CMD ["paddle", "version"]
 EOF

--- a/paddle/testing/CMakeLists.txt
+++ b/paddle/testing/CMakeLists.txt
@@ -5,4 +5,8 @@ if(WITH_TESTING)
  add_dependencies(paddle_test_main paddle_proto ${external_project_dependencies})
  add_library(paddle_test_util STATIC TestUtil.cpp)
  add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies})
+  if(NOT MOBILE_INFERENCE)
+    add_library(paddle_gtest_main STATIC paddle_gtest_main.cc)
+    add_dependencies(paddle_gtest_main paddle_memory gtest gflags)
+  endif()
 endif()
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <cstring>
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+#include "paddle/memory/memory.h"
+int main(int argc, char** argv) {
+  std::vector<char*> new_argv;
+  std::string gflags_env;
+  new_argv.push_back(argv[0]);
+#ifdef PADDLE_WITH_CUDA
+  new_argv.push_back(
+      strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"));
+#else
+  new_argv.push_back(strdup("--tryfromenv=use_pinned_memory"));
+#endif
+  int new_argc = static_cast<int>(new_argv.size());
+  char** new_argv_address = new_argv.data();
+  google::ParseCommandLineFlags(&new_argc, &new_argv_address, false);
+  testing::InitGoogleTest(&argc, argv);
+  paddle::memory::Used(paddle::platform::CPUPlace());
+#ifdef PADDLE_WITH_CUDA
+  paddle::memory::Used(paddle::platform::GPUPlace(0));
+#endif
+  return RUN_ALL_TESTS();
+}
--- a/paddle/trainer/CMakeLists.txt
+++ b/paddle/trainer/CMakeLists.txt
@@ -54,7 +54,7 @@ if(WITH_TESTING)
  add_subdirectory(tests)
 endif()
-if(NOT WITH_C_API)
+if(NOT MOBILE_INFERENCE)
  add_paddle_exe(paddle_trainer TrainerMain.cpp)
  add_paddle_exe(paddle_merge_model MergeModel.cpp)
@@ -74,7 +74,5 @@ endif()
 if(WITH_GOLANG)
  add_dependencies(paddle_trainer_lib paddle_pserver_cclient)
  target_link_libraries(paddle_trainer_lib paddle_pserver_cclient)
-  if(NOT WITH_C_API)
  target_link_libraries(paddle_trainer paddle_pserver_cclient)
-  endif()
 endif(WITH_GOLANG)
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
-################# test_Compare ############################
+set(PYTHON_PATH 
-add_unittest_without_exec(test_Compare
+   ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d 
-    test_Compare.cpp)
+   ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests)
-add_test(NAME test_Compare
+function(trainer_test TARGET)
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python
+  add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
-        ${CMAKE_CURRENT_BINARY_DIR}/test_Compare
+  add_test(NAME ${TARGET}
+    COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+endfunction()
-################# test_Trainer ###########################
+trainer_test(test_Compare)
-add_unittest_without_exec(test_Trainer
+trainer_test(test_PyDataProviderWrapper)
-    test_Trainer.cpp)
+trainer_test(test_recurrent_machine_generation)
-add_test(NAME test_Trainer
+trainer_test(test_Trainer)
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-        ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-        ${CMAKE_CURRENT_BINARY_DIR}/test_Trainer
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 ############### test_TrainerOnePass ##########################
 if(WITH_PYTHON)
@@ -22,32 +20,13 @@ if(WITH_PYTHON)
  add_unittest_without_exec(test_TrainerOnePass
      test_TrainerOnePass.cpp)
  add_test(NAME test_TrainerOnePass
-    COMMAND  ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
+    COMMAND ${PYTHON_PATH} ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port 
-          ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests
+          ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
-          ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 endif()
-################# test_recurrent_machine_generation ###############
-add_unittest_without_exec(test_recurrent_machine_generation
-    test_recurrent_machine_generation.cpp)
-add_test(NAME test_recurrent_machine_generation
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
-        ${CMAKE_CURRENT_BINARY_DIR}/test_recurrent_machine_generation
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
-#################### test_PyDataProviderWrapper #########################
-add_unittest_without_exec(test_PyDataProviderWrapper
-    test_PyDataProviderWrapper.cpp)
-add_test(NAME test_PyDataProviderWrapper
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
-        ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests
-        ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProviderWrapper
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 #################### test_config_parser #########################
 add_test(NAME test_config_parser
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
+  COMMAND ${PYTHON_PATH} ${PYTHON_EXECUTABLE} 
-        ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/config_parser_test.py
+        ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/config_parser_test.py
    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2400,6 +2400,14 @@ class CropLayer(LayerBase):
        image_conf.img_size_y = input_layer.height
        image_conf.channels = input_layer.size / (input_layer.width *
                                                  input_layer.height)
+        # only support for 4-dims inputs and NCHW order
+        if (len(self.config.inputs) == 2):
+            self.set_layer_height_width(
+                self.get_input_layer(1).height, self.get_input_layer(1).width)
+            self.set_layer_size(self.get_input_layer(1).size)
+        else:
+            self.set_layer_height_width(shape[-2], shape[-1])
+            self.set_layer_size(reduce(lambda x, y: x * y, shape[1:]))
 @config_layer('batch_norm')
@@ -3849,6 +3857,26 @@ class SwitchOrderLayer(LayerBase):
            name, 'switch_order', 0, inputs=inputs, **xargs)
        self.config.reshape_conf.height_axis.extend(reshape['height'])
        self.config.reshape_conf.width_axis.extend(reshape['width'])
+        input_layer = self.get_input_layer(0)
+        if reshape is None:
+            self.set_layer_size(input_layer.size)
+        else:
+            in_h = input_layer.height
+            in_w = input_layer.width
+            out_dims = None
+            if input_layer.has_depth():
+                in_d = input_layer.depth
+                in_c = input_layer.size / in_h / in_w / in_d
+                # batch_size, depth, height, width, channel
+                out_dims = [0, in_d, in_h, in_w, in_c]
+            else:
+                in_c = input_layer.size / in_h / in_w
+                # batch_size, height, width, channel
+                out_dims = [0, in_h, in_w, in_c]
+            # Because (reshape['width'][0] > 0) always be true.
+            # So out_dims[0] won't be used.
+            size = reduce(lambda x, y: x * y, out_dims[reshape['width'][0]:])
+            self.set_layer_size(size)
 @config_layer('scale_sub_region')

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -6873,6 +6873,7 @@ def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None):
    :param input: The input of this layer. If two inputs are given, the second one
                  will be regarded as the reference.
+                  And the input must be 4-dims and in NCHW order.
    :type input: LayerOutput | Sequence
    :param offset: The crop offset.
    :type offset: Sequence

--- a/python/paddle/v2/fluid/__init__.py
+++ b/python/paddle/v2/fluid/__init__.py
@@ -13,13 +13,15 @@ import nets
 import optimizer
 import backward
 import regularizer
+from param_attr import ParamAttr
+from data_feeder import DataFeeder
 from core import LoDTensor, CPUPlace, GPUPlace
 Tensor = LoDTensor
 __all__ = framework.__all__ + executor.__all__ + [
    'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward',
-    'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor'
+    'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor', 'ParamAttr'
+    'DataFeeder'
 ]
@@ -35,7 +37,8 @@ def __read_gflags_from_env__():
    read_env_flags = ['use_pinned_memory']
    if core.is_compile_gpu():
        read_env_flags.append('fraction_of_gpu_memory_to_use')
-    core.init_gflags(sys.argv + ["--tryfromenv=" + ",".join(read_env_flags)])
+    core.init_gflags([sys.argv[0]] +
+                     ["--tryfromenv=" + ",".join(read_env_flags)])
 __read_gflags_from_env__()
--- a/python/paddle/v2/fluid/data_feeder.py
+++ b/python/paddle/v2/fluid/data_feeder.py
+from __future__ import print_function
+import core
+import numpy
+import six.moves as six
+from framework import Variable
+__all__ = ['DataFeeder']
+class DataToLoDTensorConverter(object):
+    def __init__(self, place, lod_level, shape, dtype):
+        self.place = place
+        self.lod_level = lod_level
+        self.shape = shape
+        if dtype == core.DataType.FP32:
+            self.dtype = 'float32'
+        elif dtype == core.DataType.INT64:
+            self.dtype = 'int64'
+        elif dtype == core.DataType.FP64:
+            self.dtype = 'float64'
+        elif dtype == core.DataType.INT32:
+            self.dtype = 'int32'
+        else:
+            raise ValueError("dtype must be any of [int32, float32, int64, "
+                             "float64]")
+        self.data = []
+        self.lod = []
+        for i in six.range(lod_level):
+            self.lod.append([0])
+    def feed(self, data):
+        self._feed_impl_(data, self.lod, self.lod_level)
+    def _feed_impl_(self, data, lod, lod_level):
+        if lod_level == 0:
+            self.data.append(data)
+        else:
+            cur_lod_len = len(data)
+            lod[-1].append(lod[-1][-1] + cur_lod_len)
+            for each_data in data:
+                self._feed_impl_(each_data, lod[:-1], lod_level - 1)
+    def done(self):
+        arr = numpy.array(self.data, dtype=self.dtype).reshape(self.shape)
+        t = core.LoDTensor()
+        t.set(arr, self.place)
+        if self.lod_level > 0:
+            t.set_lod(self.lod)
+        return t
+class DataFeeder(object):
+    def __init__(self, feed_list, place):
+        self.feed_dtypes = []
+        self.feed_names = []
+        self.feed_shapes = []
+        self.feed_lod_level = []
+        for each_var in feed_list:
+            if not isinstance(each_var, Variable):
+                raise TypeError("Feed list should contain a list of variable")
+            self.feed_dtypes.append(each_var.dtype)
+            self.feed_names.append(each_var.name)
+            shape = each_var.shape
+            batch_size_dim = -1
+            for i, s in enumerate(shape):
+                if s < 0:
+                    batch_size_dim = i
+                    break
+            if batch_size_dim == -1:
+                raise ValueError("Variable {0} must has a batch size dimension",
+                                 each_var.name)
+            self.feed_lod_level.append(each_var.lod_level)
+            self.feed_shapes.append(shape)
+        self.place = place
+    def feed(self, iterable):
+        converter = []
+        for lod_level, shape, dtype in six.zip(
+                self.feed_lod_level, self.feed_shapes, self.feed_dtypes):
+            converter.append(
+                DataToLoDTensorConverter(
+                    place=self.place,
+                    lod_level=lod_level,
+                    shape=shape,
+                    dtype=dtype))
+        for each_sample in iterable:
+            for each_converter, each_slot in six.zip(converter, each_sample):
+                each_converter.feed(each_slot)
+        ret_dict = {}
+        for each_name, each_converter in six.zip(self.feed_names, converter):
+            ret_dict[each_name] = each_converter.done()
+        return ret_dict
--- a/python/paddle/v2/fluid/evaluator.py
+++ b/python/paddle/v2/fluid/evaluator.py
@@ -26,9 +26,9 @@ class Evaluator(object):
        name(str): The name of evaluator. such as, "accuracy". Used for generate 
            temporary variable name.
        main_program(Program, optional): The evaluator should be added to this 
-            main_program. Default g_main_program 
+            main_program. Default default_main_program()
        startup_program(Program, optional):The parameter should be added to this 
-            startup_program. Default g_startup_program
+            startup_program. Default default_startup_program()
    Attributes:
        states(list): The list of state variables. states will be reset to zero 

--- a/python/paddle/v2/fluid/executor.py
+++ b/python/paddle/v2/fluid/executor.py
 import numpy as np
 from . import core
-from framework import Program, g_main_program
+from framework import Program, default_main_program
 __all__ = ['Executor', 'g_scope']
@@ -103,7 +103,7 @@ class Executor(object):
            fetch_list = []
        if program is None:
-            program = g_main_program
+            program = default_main_program()
        if not isinstance(program, Program):
            raise TypeError()

--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -3,10 +3,12 @@ import collections
 import numpy as np
 from . import core
 import proto.framework_pb2 as framework_pb2
+import contextlib
 __all__ = [
    'Block', 'Variable', 'Program', 'Operator', 'default_startup_program',
-    'default_main_program', 'g_startup_program', 'g_main_program'
+    'default_main_program', 'program_guard', 'switch_startup_program',
+    'switch_main_program'
 ]
@@ -654,13 +656,88 @@ class Parameter(Variable):
 # program is a global instance.
-g_main_program = Program()
+_main_program_ = Program()
-g_startup_program = Program()
+_startup_program_ = Program()
 def default_startup_program():
-    return g_startup_program
+    """
+    Get default startup program. In startup program, Paddle will initialize
+    parameters, initialize nccl handle, etc.
+    Returns:
+        Program: startup program
+    """
+    return _startup_program_
 def default_main_program():
-    return g_main_program
+    """
+    Get default main program. The main program is used for training or testing.
+    Returns:
+        Program: main program
+    """
+    return _main_program_
+def switch_main_program(program):
+    """
+    Switch the main program to a new program.
+    Args:
+        program(Program): The new main program
+    Returns:
+        Program: The previous main program
+    """
+    global _main_program_
+    prev_program = _main_program_
+    _main_program_ = program
+    return prev_program
+def switch_startup_program(program):
+    """
+    Switch the startup program to a new program 
+    Args:
+        program(Program): The new startup program
+    Returns:
+        Program: The previous startup program
+    """
+    global _startup_program_
+    prev_program = _startup_program_
+    _startup_program_ = program
+    return prev_program
+@contextlib.contextmanager
+def program_guard(main_program, startup_program=None):
+    """
+    Switch program with `with` statement
+    Examples:
+        >>> with program_guard(Program()):
+        >>>   data = fluid.layers.data(...)
+        >>>   hidden = fluid.layers.fc(...)
+    Args:
+        main_program(Program): New main program inside `with` statement
+        startup_program(Program): New startup program inside `with` statement. 
+            None means do not change startup program.
+    Returns:
+        None
+    """
+    if not isinstance(main_program, Program):
+        raise TypeError("main_program should be Program")
+    main_program = switch_main_program(main_program)
+    if startup_program is not None:
+        if not isinstance(startup_program, Program):
+            raise TypeError("startup_program should be Program")
+        startup_program = switch_startup_program(startup_program)
+    yield
+    switch_main_program(main_program)
+    if startup_program is not None:
+        switch_startup_program(startup_program)
--- a/python/paddle/v2/fluid/io.py
+++ b/python/paddle/v2/fluid/io.py
 import os
 import cPickle as pickle
-from paddle.v2.fluid.framework import Program, Parameter, g_main_program, \
+from paddle.v2.fluid.framework import Program, Parameter, default_main_program, Variable
-    Variable
 __all__ = [
    'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
@@ -46,7 +45,7 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
    """
    if vars is None:
        if main_program is None:
-            main_program = g_main_program
+            main_program = default_main_program()
        if not isinstance(main_program, Program):
            raise TypeError("program should be as Program type or None")
@@ -98,7 +97,7 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
    :param executor: executor that save variable
    :param dirname: directory path
    :param main_program: program. If vars is None, then filter all variables in this
-    program which fit `predicate`. Default g_program.
+    program which fit `predicate`. Default default_main_program().
    :param predicate: The Predicate describes a callable that returns a variable
    as a bool. If it returns true, the variables will be loaded.
    :param vars: variables need to be loaded. If specify vars, program &
@@ -107,7 +106,7 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
    """
    if vars is None:
        if main_program is None:
-            main_program = g_main_program
+            main_program = default_main_program()
        if not isinstance(main_program, Program):
            raise TypeError("program's type should be Program")
@@ -154,7 +153,7 @@ def load_persistables(executor, dirname, main_program=None):
 def get_inference_program(target_vars, main_program=None):
    if main_program is None:
-        main_program = g_main_program
+        main_program = default_main_program()
    if not isinstance(target_vars, list):
        target_vars = [target_vars]
@@ -177,12 +176,12 @@ def save_inference_model(dirname,
    :param target_vars: Variables from which we can get inference results.
    :param executor: executor that save inference model
    :param main_program: original program, which will be pruned to build the inference model.
-    Default g_main_program.
+            Default default_main_program().
    :return: None
    """
    if main_program is None:
-        main_program = g_main_program
+        main_program = default_main_program()
    if not isinstance(target_vars, list):
        target_vars = [target_vars]
@@ -272,10 +271,10 @@ def get_parameter_value_by_name(name, executor, program=None):
    :param executor: executor for retrieving the value
    :param name: the name of the parameter
    :param program: the program where the variable is found
-    Default g_main_program.
+            Default default_main_program().
    :return: the LoDTensor for the variable
    """
    if program is None:
-        program = g_main_program
+        program = default_main_program()
    var = program.global_block().var(name)
    return get_parameter_value(var, executor)
--- a/python/paddle/v2/fluid/layer_helper.py
+++ b/python/paddle/v2/fluid/layer_helper.py
 import copy
 import itertools
-from framework import Variable, g_main_program, \
+from framework import Variable, default_main_program, default_startup_program, \
-    g_startup_program, unique_name, dtype_is_floating
+    unique_name, dtype_is_floating
 from paddle.v2.fluid.initializer import Constant, Xavier
+from param_attr import ParamAttr
 class LayerHelper(object):
@@ -22,7 +23,7 @@ class LayerHelper(object):
    def main_program(self):
        prog = self.kwargs.get('main_program', None)
        if prog is None:
-            return g_main_program
+            return default_main_program()
        else:
            return prog
@@ -30,7 +31,7 @@ class LayerHelper(object):
    def startup_program(self):
        prog = self.kwargs.get('startup_program', None)
        if prog is None:
-            return g_startup_program
+            return default_startup_program()
        else:
            return prog
@@ -60,31 +61,15 @@ class LayerHelper(object):
    @property
    def param_attr(self):
-        default = {'name': None}
+        return ParamAttr.to_attr(self.kwargs.get('param_attr', None))
-        actual = self.kwargs.get('param_attr', None)
-        if actual is None:
-            actual = default
-        for default_field in default.keys():
-            if default_field not in actual:
-                actual[default_field] = default[default_field]
-        return actual
    @property
    def bias_attr(self):
-        default = {'name': None}
+        return ParamAttr.to_attr(self.kwargs.get('bias_attr', None))
-        bias_attr = self.kwargs.get('bias_attr', None)
-        if bias_attr is None:
-            bias_attr = default
-        if isinstance(bias_attr, dict):
-            for default_field in default.keys():
-                if default_field not in bias_attr:
-                    bias_attr[default_field] = default[default_field]
-        return bias_attr
    def multiple_param_attr(self, length):
        param_attr = self.param_attr
-        if isinstance(param_attr, dict):
+        if isinstance(param_attr, ParamAttr):
            param_attr = [param_attr]
        if len(param_attr) != 1 and len(param_attr) != length:
@@ -112,23 +97,30 @@ class LayerHelper(object):
                raise ValueError("Data Type mismatch")
        return dtype
-    def create_parameter(self, attr, shape, dtype, suffix='w',
+    def create_parameter(self,
-                         initializer=None):
+                         attr,
+                         shape,
+                         dtype,
+                         is_bias=False,
+                         default_initializer=None):
        # Deepcopy the attr so that parameters can be shared in program
-        attr_copy = copy.deepcopy(attr)
+        assert isinstance(attr, ParamAttr)
-        if initializer is not None:
+        suffix = 'b' if is_bias else 'w'
-            attr_copy['initializer'] = initializer
+        if default_initializer is None:
+            if is_bias:
+                attr.set_default_bias_initializer()
+            else:
+                attr.set_default_param_initializer()
        else:
-            attr_copy['initializer'] = self._get_default_initializer(dtype)
+            attr.set_default_initializer(default_initializer)
-        if attr_copy['name'] is None:
+        if attr.name is None:
-            attr_copy['name'] = unique_name(".".join([self.name, suffix]))
+            attr.name = unique_name(".".join([self.name, suffix]))
        self.startup_program.global_block().create_parameter(
-            dtype=dtype, shape=shape, **attr_copy)
+            dtype=dtype, shape=shape, **attr.to_kwargs(with_initializer=True))
        return self.main_program.global_block().create_parameter(
-            name=attr_copy['name'],
+            dtype=dtype, shape=shape, **attr.to_kwargs())
-            dtype=dtype,
-            shape=shape,
-            trainable=attr_copy.get('trainable', True))
    def create_tmp_variable(self, dtype):
        return self.main_program.current_block().create_var(
@@ -153,11 +145,7 @@ class LayerHelper(object):
            persistable=True,
            initializer=initializer)
-    def append_bias_op(self,
+    def append_bias_op(self, input_var, dim_start=1, dim_end=None):
-                       input_var,
-                       bias_initializer,
-                       dim_start=1,
-                       dim_end=None):
        """
        Append bias operator and return its output. If the user does not set
        bias_attr, append_bias_op will return input_var
@@ -177,11 +165,7 @@ class LayerHelper(object):
            return input_var
        b = self.create_parameter(
-            attr=bias_attr,
+            attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True)
-            shape=size,
-            dtype=input_var.dtype,
-            suffix='b',
-            initializer=bias_initializer)
        tmp = self.create_tmp_variable(dtype=input_var.dtype)
        self.append_op(
            type='elementwise_add',

--- a/python/paddle/v2/fluid/layers.py
+++ b/python/paddle/v2/fluid/layers.py
-from . import core
+import core
 import proto.framework_pb2 as framework_pb2
 from framework import OpProtoHolder, Variable, Program, Operator
-from initializer import Constant, Normal, Xavier
+from initializer import Constant, Normal, Xavier, Initializer
 from paddle.v2.fluid.layer_helper import LayerHelper, unique_name
 import re
 import cStringIO
+from param_attr import ParamAttr
 __all__ = [
    'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat',
    'StaticRNN', 'cast', 'sequence_conv', 'sequence_pool', 'sums', 'cos_sim',
-    'batch_norm', 'accuracy', 'split_lod_tensor'
+    'batch_norm', 'accuracy', 'split_lod_tensor', 'While'
 ]
@@ -17,9 +18,7 @@ def fc(input,
       size,
       num_flatten_dims=1,
       param_attr=None,
-       param_initializer=None,
       bias_attr=None,
-       bias_initializer=None,
       act=None,
       name=None,
       main_program=None,
@@ -32,11 +31,9 @@ def fc(input,
       size: The size of the layer
       num_flatten_dims: Number of columns in input
       param_attr: The parameters/weights to the FC Layer
-       param_initializer: Initializer used for the weight/parameter.
+       param_initializer: Initializer used for the weight/parameter. If None, XavierInitializer() is used
-       If None, XavierInitializer() is used
       bias_attr: The bias parameter for the FC layer
-       bias_initializer: Initializer used for the bias.
+       bias_initializer: Initializer used for the bias. If None, then ConstantInitializer() is used
-       If None, then ConstantInitializer() is used
       act: Activation to be applied to the output of FC layer
       name: Name/alias of the function
       main_program: Name of the main program that calls this
@@ -54,23 +51,10 @@ def fc(input,
    to the LayerHelper constructor.
    """
-    def _get_default_param_initializer():
-        return Xavier()
-    def _get_default_bias_initializer():
-        return Constant()
    helper = LayerHelper('fc', **locals())
    dtype = helper.input_dtype()
-    if param_initializer is None:
-        param_initializer = _get_default_param_initializer()
-    if bias_initializer is None:
-        bias_initializer = _get_default_bias_initializer()
    mul_results = []
    for input_var, param_attr in helper.iter_inputs_and_params():
        input_shape = input_var.shape
@@ -78,10 +62,7 @@ def fc(input,
            reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1)
        ] + [size]
        w = helper.create_parameter(
-            attr=param_attr,
+            attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False)
-            initializer=param_initializer,
-            shape=param_shape,
-            dtype=dtype)
        tmp = helper.create_tmp_variable(dtype)
        helper.append_op(
            type="mul",
@@ -102,7 +83,7 @@ def fc(input,
        helper.append_op(
            type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias})
    # add bias
-    pre_activation = helper.append_bias_op(pre_bias, bias_initializer)
+    pre_activation = helper.append_bias_op(pre_bias)
    # add activation
    return helper.append_activation(pre_activation)
@@ -110,7 +91,6 @@ def fc(input,
 def embedding(input,
              size,
              is_sparse=False,
-              param_initializer=None,
              param_attr=None,
              dtype='float32',
              main_program=None,
@@ -119,6 +99,7 @@ def embedding(input,
    Embedding Layer.
    Args:
+       param_initializer:
       input: The input to the function
       size: The size of the layer
       is_sparse: A flag that decleares whether the input is sparse
@@ -136,15 +117,9 @@ def embedding(input,
    """
-    def _get_default_param_initializer():
-        return Xavier()
    helper = LayerHelper('embedding', **locals())
    w = helper.create_parameter(
-        attr=helper.param_attr,
+        attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False)
-        shape=size,
-        dtype=dtype,
-        initializer=param_initializer or _get_default_param_initializer())
    tmp = helper.create_tmp_variable(dtype)
    helper.append_op(
        type='lookup_table',
@@ -176,7 +151,7 @@ def dynamic_lstm(input,
    if not use_peepholes:
        bias_size[1] = 4 * size
    bias = helper.create_parameter(
-        attr=helper.bias_attr, shape=bias_size, dtype=dtype, suffix='b')
+        attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
    hidden = helper.create_tmp_variable(dtype)
    cell = helper.create_tmp_variable(dtype)
@@ -208,6 +183,7 @@ def data(name,
         shape,
         append_batch_size=True,
         dtype='float32',
+         lod_level=0,
         type=core.VarDesc.VarType.LOD_TENSOR,
         main_program=None,
         startup_program=None,
@@ -221,6 +197,7 @@ def data(name,
       append_batch_size: Whether or not to append the data as a batch.
       dtype: The type of data : float32, float_16, int etc
       type: The output type. By default it is LOD_TENSOR.
+       lod_level(int): The LoD Level. 0 means the input data is not a sequence.
       main_program: Name of the main program that calls this
       startup_program: Name of the startup program
       stop_gradient: A boolean that mentions whether gradient should flow.
@@ -251,7 +228,8 @@ def data(name,
        shape=shape,
        dtype=dtype,
        type=type,
-        stop_gradient=stop_gradient)
+        stop_gradient=stop_gradient,
+        lod_level=lod_level)
 def create_tensor(dtype, name=None, main_program=None, startup_program=None):
@@ -423,6 +401,7 @@ _create_op_func_('sigmoid')
 _create_op_func_('scale')
 _create_op_func_('reshape')
 _create_op_func_('transpose')
+_create_op_func_('sigmoid_cross_entropy_with_logits')
 def cast(x, dtype, main_program=None):
@@ -471,19 +450,14 @@ def sums(input, out=None, main_program=None, startup_program=None):
 def linear_chain_crf(input,
                     label,
                     param_attr=None,
-                     param_initializer=None,
                     main_program=None,
                     startup_program=None):
-    def _get_default_param_initializer():
-        return Xavier()
    helper = LayerHelper('linear_chain_crf', **locals())
    size = input.shape[1]
    transition = helper.create_parameter(
        attr=helper.param_attr,
        shape=[size + 2, size],
-        dtype=helper.input_dtype(),
+        dtype=helper.input_dtype())
-        initializer=param_initializer or _get_default_param_initializer())
    alpha = helper.create_tmp_variable(dtype=helper.input_dtype())
    emission_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
    transition_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
@@ -646,9 +620,7 @@ def sequence_conv(input,
                  filter_stride=1,
                  padding=None,
                  bias_attr=None,
-                  bias_initializer=None,
                  param_attr=None,
-                  param_initializer=None,
                  act=None,
                  main_program=None,
                  startup_program=None):
@@ -658,30 +630,15 @@ def sequence_conv(input,
    in the input parameters to the function.
    """
-    def _get_default_bias_initializer():
-        return Constant()
-    def _get_default_param_initializer():
-        return Xavier()
    # FIXME(dzh) : want to unify the argument of python layer
    # function. So we ignore some unecessary attributes.
    # such as, padding_trainable, context_start.
    helper = LayerHelper('sequence_conv', **locals())
    dtype = helper.input_dtype()
-    if param_initializer is None:
-        param_initializer = _get_default_param_initializer()
-    if bias_initializer is None:
-        bias_initializer = _get_default_bias_initializer()
    filter_shape = [filter_size * input.shape[1], num_filters]
    filter = helper.create_parameter(
-        attr=helper.param_attr,
+        attr=helper.param_attr, shape=filter_shape, dtype=dtype)
-        shape=filter_shape,
-        dtype=dtype,
-        initializer=param_initializer)
    pre_bias = helper.create_tmp_variable(dtype)
    helper.append_op(
@@ -696,7 +653,7 @@ def sequence_conv(input,
            'contextStart': -int(filter_size / 2),
            'contextLength': filter_size
        })
-    pre_act = helper.append_bias_op(pre_bias, bias_initializer)
+    pre_act = helper.append_bias_op(pre_bias)
    return helper.append_activation(pre_act)
@@ -707,9 +664,7 @@ def conv2d(input,
           padding=None,
           groups=None,
           param_attr=None,
-           param_initializer=None,
           bias_attr=None,
-           bias_initializer=None,
           act=None,
           name=None,
           main_program=None,
@@ -722,13 +677,6 @@ def conv2d(input,
    conv-2d output, if mentioned in the input parameters.
    """
-    def _get_default_bias_initializer():
-        return Constant()
-    def _get_default_param_initializer(filter_size, num_channels):
-        std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
-        return Normal(0.0, std, 0)
    helper = LayerHelper('conv2d', **locals())
    dtype = helper.input_dtype()
@@ -750,17 +698,16 @@ def conv2d(input,
    input_shape = input.shape
    filter_shape = [num_filters, num_filter_channels] + filter_size
-    if param_initializer is None:
+    def _get_default_param_initializer():
-        param_initializer = _get_default_param_initializer(filter_size,
+        std = (2.0 / (filter_size[0]**2 * num_channels))**0.5
-                                                           num_channels)
+        return Normal(0.0, std, 0)
-    if bias_initializer is None:
-        bias_initializer = _get_default_bias_initializer()
    filter = helper.create_parameter(
        attr=helper.param_attr,
        shape=filter_shape,
        dtype=dtype,
-        initializer=param_initializer)
+        default_initializer=_get_default_param_initializer())
    pre_bias = helper.create_tmp_variable(dtype)
    helper.append_op(
@@ -774,8 +721,7 @@ def conv2d(input,
               'paddings': padding,
               'groups': groups})
-    pre_act = helper.append_bias_op(
+    pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
-        pre_bias, bias_initializer, dim_start=1, dim_end=2)
    return helper.append_activation(pre_act)
@@ -876,12 +822,10 @@ def batch_norm(input,
        attr=helper.param_attr,
        shape=param_shape,
        dtype=dtype,
-        initializer=Constant(1.0))
+        default_initializer=Constant(1.0))
    bias = helper.create_parameter(
-        attr=helper.param_attr,
+        attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=True)
-        shape=param_shape,
-        dtype=dtype,
-        initializer=Constant(0.0))
    mean = helper.create_global_variable(
        dtype=input.dtype, shape=param_shape, persistable=True)
@@ -1495,7 +1439,7 @@ def increment(x, value=1.0, in_place=True, main_program=None):
        type='increment',
        inputs={'X': [x]},
        outputs={'Out': [out]},
-        attrs={'step': value})
+        attrs={'step': float(value)})
    return out
@@ -1587,6 +1531,93 @@ def array_length(array, main_program=None):
    return tmp
+def conv2d_transpose(input,
+                     num_filters,
+                     output_size=None,
+                     filter_size=None,
+                     padding=None,
+                     stride=None,
+                     param_attr=None,
+                     main_program=None,
+                     startup_program=None):
+    """
+    The transpose of conv2d layer.
+    This layer is also known as deconvolution layer.
+    Args:
+        input(Variable): The input image with [N, C, H, W] format.
+        num_filters(int): The number of filter. It is as same as the output
+            image channel.
+        output_size(int|tuple|None): The output image size. If output size is a
+            tuple, it must contain two integers, (image_H, image_W). This
+            parameter only works when filter_size is None.
+        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain two integers, (filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square.  None if use output size to
+            calculate filter_size
+        padding(int|tuple): The padding size. If padding is a tuple, it must
+            contain two integers, (padding_H, padding_W). Otherwise, the
+            padding_H = padding_W = padding.
+        stride(int|tuple): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride.
+        param_attr: Parameter Attribute.
+        main_program(Program): the main program
+        startup_program(Program): the startup program
+    Returns:
+        Variable: Output image.
+    """
+    helper = LayerHelper("conv2d_transpose", **locals())
+    if not isinstance(input, Variable):
+        raise TypeError("Input of conv2d_transpose must be Variable")
+    input_channel = input.shape[1]
+    op_attr = dict()
+    if isinstance(padding, int):
+        op_attr['paddings'] = [padding, padding]
+    elif padding is not None:
+        op_attr['paddings'] = padding
+    if isinstance(stride, int):
+        op_attr['strides'] = stride
+    elif stride is not None:
+        op_attr['strides'] = stride
+    if filter_size is None:
+        if output_size is None:
+            raise ValueError("output_size must be set when filter_size is None")
+        if isinstance(output_size, int):
+            output_size = [output_size, output_size]
+        padding = op_attr.get('paddings', [0, 0])
+        stride = op_attr.get('strides', [1, 1])
+        h_in = input.shape[2]
+        w_in = input.shape[3]
+        filter_size_h = output_size[0] - (h_in - 1) * stride[0] + 2 * padding[0]
+        filter_size_w = output_size[1] - (w_in - 1) * stride[1] + 2 * padding[1]
+        filter_size = [filter_size_h, filter_size_w]
+    elif isinstance(filter_size, int):
+        filter_size = [filter_size, filter_size]
+    filter_shape = [input_channel, num_filters] + filter_size
+    img_filter = helper.create_parameter(
+        dtype=input.dtype, shape=filter_shape, attr=helper.param_attr)
+    out = helper.create_tmp_variable(dtype=input.dtype)
+    helper.append_op(
+        type='conv2d_transpose',
+        inputs={'Input': [input],
+                'Filter': [img_filter]},
+        outputs={'Output': out},
+        attrs=op_attr)
+    return out
 class ConditionalBlockGuard(BlockGuard):
    def __init__(self, block):
        if not isinstance(block, ConditionalBlock):

--- a/python/paddle/v2/fluid/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
@@ -197,8 +197,7 @@ class Optimizer(object):
        This method combines interface `append_backward_ops()` and
        `create_optimization_pass()` into one.
        """
-        params_grads = append_backward_ops(loss, parameter_list, no_grad_set or
+        params_grads = append_backward_ops(loss, parameter_list, no_grad_set)
-                                           set())
        # Add regularization if any
        params_grads = append_regularization_ops(params_grads)
        optimize_ops = self.create_optimization_pass(params_grads, loss,

--- a/python/paddle/v2/fluid/param_attr.py
+++ b/python/paddle/v2/fluid/param_attr.py
+from initializer import Initializer, Xavier, Constant
+from regularizer import WeightDecayRegularizer
+class ParamAttr(object):
+    def __init__(self,
+                 name=None,
+                 initializer=None,
+                 learning_rate=1.0,
+                 regularizer=None,
+                 trainable=True):
+        self.name = name
+        self.initializer = initializer
+        self.learning_rate = learning_rate
+        self.regularizer = regularizer
+        self.trainable = trainable
+    def set_default_initializer(self, initializer):
+        if initializer is None:
+            if self.initializer is None:
+                raise ValueError("ParamAttr.initializer is not set")
+            return
+        if self.initializer is not None:
+            return
+        self.initializer = initializer
+    def set_default_param_initializer(self):
+        self.set_default_initializer(Xavier())
+    def set_default_bias_initializer(self):
+        self.set_default_initializer(Constant(0.0))
+    @staticmethod
+    def to_attr(arg):
+        if arg is None:
+            return ParamAttr()
+        elif isinstance(arg, ParamAttr):
+            return arg
+        elif isinstance(arg, str) or isinstance(arg, unicode):
+            return ParamAttr(name=arg)
+        elif isinstance(arg, Initializer):
+            return ParamAttr(initializer=arg)
+        elif isinstance(arg, WeightDecayRegularizer):
+            return ParamAttr(regularizer=arg)
+        elif isinstance(arg, bool):
+            return ParamAttr.to_attr(None) if arg else False
+        else:
+            raise TypeError("{0} cast to ParamAttr".format(type(arg)))
+    def to_kwargs(self, with_initializer=False):
+        kwargs = {
+            'name': self.name,
+            'learning_rate': self.learning_rate,
+            'regularizer': self.regularizer,
+            'trainable': self.trainable
+        }
+        if with_initializer:
+            kwargs['initializer'] = self.initializer
+        return kwargs
--- a/python/paddle/v2/fluid/profiler.py
+++ b/python/paddle/v2/fluid/profiler.py
+import paddle.v2.fluid.core as core
+from contextlib import contextmanager
+__all__ = ['CudaProfiler']
+NVPROF_CONFIG = [
+    "gpustarttimestamp",
+    "gpuendtimestamp",
+    "gridsize3d",
+    "threadblocksize",
+    "streamid",
+    "enableonstart 0",
+    "conckerneltrace",
+]
+@contextmanager
+def cuda_profiler(output_file, output_mode=None, config=None):
+    """The CUDA profiler.
+    This fuctions is used to profile CUDA program by CUDA runtime application
+    programming interface. The profiling result will be written into
+    `output_file` with Key-Value pair format or Comma separated values format.
+    The user can set the output mode by `output_mode` argument and set the
+    counters/options for profiling by `config` argument. The default config
+    is ['gpustarttimestamp', 'gpustarttimestamp', 'gridsize3d',
+    'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace'].
+    Args:
+        output_file (string) : The output file name, the result will be
+            written into this file.
+        output_mode (string) : The output mode has Key-Value pair format and
+            Comma separated values format. It should be 'kvp' or 'csv'.
+        config (string) : The profiler options and counters can refer to
+            "Compute Command Line Profiler User Guide".
+    """
+    if output_mode is None:
+        output_mode = 'csv'
+    if output_mode not in ['kvp', 'csv']:
+        raise ValueError("The output mode must be 'kvp' or 'csv'.")
+    config = NVPROF_CONFIG if config is None else config
+    core.nvprof_init(output_file, output_mode, config)
+    # Enables profiler collection by the active CUDA profiling tool.
+    core.nvprof_start()
+    yield
+    # Disables profiler collection.
+    core.nvprof_stop()
--- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
@@ -22,6 +22,7 @@ train_reader = paddle.batch(
    batch_size=BATCH_SIZE)
 place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
 exe = fluid.Executor(place)
 exe.run(fluid.default_startup_program())
@@ -31,12 +32,8 @@ for pass_id in range(PASS_NUM):
    fluid.io.save_persistables(exe, "./fit_a_line.model/")
    fluid.io.load_persistables(exe, "./fit_a_line.model/")
    for data in train_reader():
-        x_data = np.array(map(lambda _: _[0], data)).astype("float32")
-        y_data = np.array(map(lambda _: _[1], data)).astype("float32")
        avg_loss_value, = exe.run(fluid.default_main_program(),
-                                  feed={'x': x_data,
+                                  feed=feeder.feed(data),
-                                        'y': y_data},
                                  fetch_list=[avg_cost])
        if avg_loss_value[0] < 10.0:

--- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
@@ -69,8 +69,7 @@ def vgg16_bn_drop(input):
    drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5)
    fc1 = fluid.layers.fc(input=drop, size=512, act=None)
-    reshape1 = fluid.layers.reshape(x=fc1, shape=list(fc1.shape + (1, 1)))
+    bn = fluid.layers.batch_norm(input=fc1, act='relu')
-    bn = fluid.layers.batch_norm(input=reshape1, act='relu')
    drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5)
    fc2 = fluid.layers.fc(input=drop2, size=512, act=None)
    return fc2
@@ -114,23 +113,14 @@ train_reader = paddle.batch(
 place = fluid.CPUPlace()
 exe = fluid.Executor(place)
+feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
 exe.run(fluid.default_startup_program())
 for pass_id in range(PASS_NUM):
    accuracy.reset(exe)
    for data in train_reader():
-        img_data = np.array(map(lambda x: x[0].reshape(data_shape),
-                                data)).astype("float32")
-        y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-        batch_size = 1
-        for i in y_data.shape:
-            batch_size = batch_size * i
-        y_data = y_data.reshape([batch_size, 1])
        loss, acc = exe.run(fluid.default_main_program(),
-                            feed={"pixel": img_data,
+                            feed=feeder.feed(data),
-                                  "label": y_data},
                            fetch_list=[avg_cost] + accuracy.metrics)
        pass_acc = accuracy.eval(exe)
        print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(

--- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
@@ -28,23 +28,15 @@ def load_parameter(file_name, h, w):
        return np.fromfile(f, dtype=np.float32).reshape(h, w)
-def db_lstm():
+def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
+            **ignored):
    # 8 features
-    word = fluid.layers.data(name='word_data', shape=[1], dtype='int64')
-    predicate = fluid.layers.data(name='verb_data', shape=[1], dtype='int64')
-    ctx_n2 = fluid.layers.data(name='ctx_n2_data', shape=[1], dtype='int64')
-    ctx_n1 = fluid.layers.data(name='ctx_n1_data', shape=[1], dtype='int64')
-    ctx_0 = fluid.layers.data(name='ctx_0_data', shape=[1], dtype='int64')
-    ctx_p1 = fluid.layers.data(name='ctx_p1_data', shape=[1], dtype='int64')
-    ctx_p2 = fluid.layers.data(name='ctx_p2_data', shape=[1], dtype='int64')
-    mark = fluid.layers.data(name='mark_data', shape=[1], dtype='int64')
    predicate_embedding = fluid.layers.embedding(
        input=predicate,
        size=[pred_len, word_dim],
        dtype='float32',
        is_sparse=IS_SPARSE,
-        param_attr={'name': 'vemb'})
+        param_attr='vemb')
    mark_embedding = fluid.layers.embedding(
        input=mark,
@@ -57,8 +49,8 @@ def db_lstm():
        fluid.layers.embedding(
            size=[word_dict_len, word_dim],
            input=x,
-            param_attr={'name': embedding_name,
+            param_attr=fluid.ParamAttr(
-                        'trainable': False}) for x in word_input
+                name=embedding_name, trainable=False)) for x in word_input
    ]
    emb_layers.append(predicate_embedding)
    emb_layers.append(mark_embedding)
@@ -120,13 +112,30 @@ def to_lodtensor(data, place):
 def main():
    # define network topology
-    feature_out = db_lstm()
+    word = fluid.layers.data(
-    target = fluid.layers.data(name='target', shape=[1], dtype='int64')
+        name='word_data', shape=[1], dtype='int64', lod_level=1)
+    predicate = fluid.layers.data(
+        name='verb_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_n2 = fluid.layers.data(
+        name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_n1 = fluid.layers.data(
+        name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_0 = fluid.layers.data(
+        name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_p1 = fluid.layers.data(
+        name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_p2 = fluid.layers.data(
+        name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
+    mark = fluid.layers.data(
+        name='mark_data', shape=[1], dtype='int64', lod_level=1)
+    feature_out = db_lstm(**locals())
+    target = fluid.layers.data(
+        name='target', shape=[1], dtype='int64', lod_level=1)
    crf_cost = fluid.layers.linear_chain_crf(
        input=feature_out,
        label=target,
-        param_attr={"name": 'crfw',
+        param_attr=fluid.ParamAttr(
-                    "learning_rate": mix_hidden_lr})
+            name='crfw', learning_rate=mix_hidden_lr))
    avg_cost = fluid.layers.mean(x=crf_cost)
    # TODO(qiao)
    #   1. add crf_decode_layer and evaluator
@@ -139,6 +148,11 @@ def main():
            paddle.dataset.conll05.test(), buf_size=8192),
        batch_size=BATCH_SIZE)
    place = fluid.CPUPlace()
+    feeder = fluid.DataFeeder(
+        feed_list=[
+            word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target
+        ],
+        place=place)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
@@ -150,28 +164,8 @@ def main():
    batch_id = 0
    for pass_id in xrange(PASS_NUM):
        for data in train_data():
-            word_data = to_lodtensor(map(lambda x: x[0], data), place)
-            ctx_n2_data = to_lodtensor(map(lambda x: x[1], data), place)
-            ctx_n1_data = to_lodtensor(map(lambda x: x[2], data), place)
-            ctx_0_data = to_lodtensor(map(lambda x: x[3], data), place)
-            ctx_p1_data = to_lodtensor(map(lambda x: x[4], data), place)
-            ctx_p2_data = to_lodtensor(map(lambda x: x[5], data), place)
-            verb_data = to_lodtensor(map(lambda x: x[6], data), place)
-            mark_data = to_lodtensor(map(lambda x: x[7], data), place)
-            target = to_lodtensor(map(lambda x: x[8], data), place)
            outs = exe.run(fluid.default_main_program(),
-                           feed={
+                           feed=feeder.feed(data),
-                               'word_data': word_data,
-                               'ctx_n2_data': ctx_n2_data,
-                               'ctx_n1_data': ctx_n1_data,
-                               'ctx_0_data': ctx_0_data,
-                               'ctx_p1_data': ctx_p1_data,
-                               'ctx_p2_data': ctx_p2_data,
-                               'verb_data': verb_data,
-                               'mark_data': mark_data,
-                               'target': target
-                           },
                           fetch_list=[avg_cost])
            avg_cost_val = np.array(outs[0])

--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
@@ -37,20 +37,14 @@ train_reader = paddle.batch(
 place = fluid.CPUPlace()
 exe = fluid.Executor(place)
+feeder = fluid.DataFeeder(feed_list=[images, label], place=place)
 exe.run(fluid.default_startup_program())
 for pass_id in range(PASS_NUM):
    accuracy.reset(exe)
    for data in train_reader():
-        img_data = np.array(map(lambda x: x[0].reshape([1, 28, 28]),
-                                data)).astype("float32")
-        y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-        y_data = y_data.reshape([BATCH_SIZE, 1])
        loss, acc = exe.run(fluid.default_main_program(),
-                            feed={"pixel": img_data,
+                            feed=feeder.feed(data),
-                                  "label": y_data},
                            fetch_list=[avg_cost] + accuracy.metrics)
        pass_acc = accuracy.eval(exe)
        print("pass_id=" + str(pass_id) + " acc=" + str(acc) + " pass_acc=" +

--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
@@ -6,24 +6,21 @@ import paddle.v2.fluid as fluid
 BATCH_SIZE = 128
 image = fluid.layers.data(name='x', shape=[784], dtype='float32')
-param_attr = {
+regularizer = fluid.regularizer.L2Decay(0.0005 * BATCH_SIZE)
-    'name': None,
-    'regularization': fluid.regularizer.L2Decay(0.0005 * BATCH_SIZE)
-}
 hidden1 = fluid.layers.fc(input=image,
                          size=128,
                          act='relu',
-                          param_attr=param_attr)
+                          param_attr=regularizer)
 hidden2 = fluid.layers.fc(input=hidden1,
                          size=64,
                          act='relu',
-                          param_attr=param_attr)
+                          param_attr=regularizer)
 predict = fluid.layers.fc(input=hidden2,
                          size=10,
                          act='softmax',
-                          param_attr=param_attr)
+                          param_attr=regularizer)
 label = fluid.layers.data(name='y', shape=[1], dtype='int64')
@@ -51,40 +48,22 @@ test_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128)
 place = fluid.CPUPlace()
 exe = fluid.Executor(place)
+feeder = fluid.DataFeeder(feed_list=[image, label], place=place)
 exe.run(fluid.default_startup_program())
 PASS_NUM = 100
 for pass_id in range(PASS_NUM):
    accuracy.reset(exe)
    for data in train_reader():
-        x_data = np.array(map(lambda x: x[0], data)).astype("float32")
+        out, acc = exe.run(fluid.default_main_program(),
-        y_data = np.array(map(lambda x: x[1], data)).astype("int64")
+                           feed=feeder.feed(data),
-        y_data = np.expand_dims(y_data, axis=1)
-        tensor_x = fluid.LoDTensor()
-        tensor_x.set(x_data, place)
-        tensor_y = fluid.LoDTensor()
-        tensor_y.set(y_data, place)
-        outs = exe.run(fluid.default_main_program(),
-                       feed={'x': tensor_x,
-                             'y': tensor_y},
                           fetch_list=[avg_cost] + accuracy.metrics)
-        out = np.array(outs[0])
-        acc = np.array(outs[1])
        pass_acc = accuracy.eval(exe)
        test_accuracy.reset(exe)
        for data in test_reader():
-            x_data = np.array(map(lambda x: x[0], data)).astype("float32")
-            y_data = np.array(map(lambda x: x[1], data)).astype("int64")
-            y_data = np.expand_dims(y_data, axis=1)
            out, acc = exe.run(inference_program,
-                               feed={'x': x_data,
+                               feed=feeder.feed(data),
-                                     'y': y_data},
                               fetch_list=[avg_cost] + test_accuracy.metrics)
        test_pass_acc = test_accuracy.eval(exe)

--- a/python/paddle/v2/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
@@ -24,7 +24,7 @@ def get_usr_combined_features():
        input=uid,
        dtype='float32',
        size=[USR_DICT_SIZE, 32],
-        param_attr={'name': 'user_table'},
+        param_attr='user_table',
        is_sparse=IS_SPARSE)
    usr_fc = layers.fc(input=usr_emb, size=32)
@@ -36,7 +36,7 @@ def get_usr_combined_features():
    usr_gender_emb = layers.embedding(
        input=usr_gender_id,
        size=[USR_GENDER_DICT_SIZE, 16],
-        param_attr={'name': 'gender_table'},
+        param_attr='gender_table',
        is_sparse=IS_SPARSE)
    usr_gender_fc = layers.fc(input=usr_gender_emb, size=16)
@@ -48,7 +48,7 @@ def get_usr_combined_features():
        input=usr_age_id,
        size=[USR_AGE_DICT_SIZE, 16],
        is_sparse=IS_SPARSE,
-        param_attr={'name': 'age_table'})
+        param_attr='age_table')
    usr_age_fc = layers.fc(input=usr_age_emb, size=16)
@@ -58,7 +58,7 @@ def get_usr_combined_features():
    usr_job_emb = layers.embedding(
        input=usr_job_id,
        size=[USR_JOB_DICT_SIZE, 16],
-        param_attr={'name': 'job_table'},
+        param_attr='job_table',
        is_sparse=IS_SPARSE)
    usr_job_fc = layers.fc(input=usr_job_emb, size=16)
@@ -81,7 +81,7 @@ def get_mov_combined_features():
        input=mov_id,
        dtype='float32',
        size=[MOV_DICT_SIZE, 32],
-        param_attr={'name': 'movie_table'},
+        param_attr='movie_table',
        is_sparse=IS_SPARSE)
    mov_fc = layers.fc(input=mov_emb, size=32)

--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
@@ -4,10 +4,8 @@ import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
-def convolution_net(input_dim, class_dim=2, emb_dim=32, hid_dim=32):
+def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
-    data = fluid.layers.data(name="words", shape=[1], dtype="int64")
+                    hid_dim=32):
-    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
    conv_3 = fluid.nets.sequence_conv_pool(
        input=emb,
@@ -55,8 +53,11 @@ def main():
    dict_dim = len(word_dict)
    class_dim = 2
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    cost, accuracy, acc_out = convolution_net(
-        input_dim=dict_dim, class_dim=class_dim)
+        data, label, input_dim=dict_dim, class_dim=class_dim)
    train_data = paddle.batch(
        paddle.reader.shuffle(
@@ -64,24 +65,15 @@ def main():
        batch_size=BATCH_SIZE)
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
    exe.run(fluid.default_startup_program())
    for pass_id in xrange(PASS_NUM):
        accuracy.reset(exe)
        for data in train_data():
-            tensor_words = to_lodtensor(map(lambda x: x[0], data), place)
+            cost_val, acc_val = exe.run(fluid.default_main_program(),
+                                        feed=feeder.feed(data),
-            label = np.array(map(lambda x: x[1], data)).astype("int64")
-            label = label.reshape([BATCH_SIZE, 1])
-            tensor_label = fluid.LoDTensor()
-            tensor_label.set(label, place)
-            cost_val, acc_val = exe.run(
-                fluid.default_main_program(),
-                feed={"words": tensor_words,
-                      "label": tensor_label},
                                        fetch_list=[cost, acc_out])
            pass_acc = accuracy.eval(exe)
            print("cost=" + str(cost_val) + " acc=" + str(acc_val) +

--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
@@ -3,14 +3,14 @@ import paddle.v2 as paddle
 import paddle.v2.fluid as fluid
-def stacked_lstm_net(input_dim,
+def stacked_lstm_net(data,
+                     label,
+                     input_dim,
                     class_dim=2,
                     emb_dim=128,
                     hid_dim=512,
                     stacked_num=3):
    assert stacked_num % 2 == 1
-    data = fluid.layers.data(name="words", shape=[1], dtype="int64")
-    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
    # add bias attr
@@ -65,8 +65,11 @@ def main():
    dict_dim = len(word_dict)
    class_dim = 2
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
    cost, accuracy, acc_out = stacked_lstm_net(
-        input_dim=dict_dim, class_dim=class_dim)
+        data, label, input_dim=dict_dim, class_dim=class_dim)
    train_data = paddle.batch(
        paddle.reader.shuffle(
@@ -74,24 +77,15 @@ def main():
        batch_size=BATCH_SIZE)
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
    exe.run(fluid.default_startup_program())
    for pass_id in xrange(PASS_NUM):
        accuracy.reset(exe)
        for data in train_data():
-            tensor_words = to_lodtensor(map(lambda x: x[0], data), place)
+            cost_val, acc_val = exe.run(fluid.default_main_program(),
+                                        feed=feeder.feed(data),
-            label = np.array(map(lambda x: x[1], data)).astype("int64")
-            label = label.reshape([BATCH_SIZE, 1])
-            tensor_label = fluid.LoDTensor()
-            tensor_label.set(label, place)
-            cost_val, acc_val = exe.run(
-                fluid.default_main_program(),
-                feed={"words": tensor_words,
-                      "label": tensor_label},
                                        fetch_list=[cost, acc_out])
            pass_acc = accuracy.eval(exe)
            print("cost=" + str(cost_val) + " acc=" + str(acc_val) +

--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
@@ -8,7 +8,8 @@ def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50):
        name="words",
        shape=[seq_len * batch_size, 1],
        append_batch_size=False,
-        dtype="int64")
+        dtype="int64",
+        lod_level=1)
    label = fluid.layers.data(
        name="label",
        shape=[batch_size, 1],
@@ -21,6 +22,7 @@ def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50):
    c_pre_init = fluid.layers.fill_constant(
        dtype=emb.dtype, shape=[batch_size, emb_dim], value=0.0)
+    c_pre_init.stop_gradient = False
    layer_1_out = fluid.layers.lstm(
        emb, c_pre_init=c_pre_init, hidden_dim=emb_dim)
    layer_1_out = fluid.layers.transpose(x=layer_1_out, axis=[1, 0, 2])

--- a/python/paddle/v2/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py
@@ -23,25 +23,25 @@ embed_first = fluid.layers.embedding(
    size=[dict_size, EMBED_SIZE],
    dtype='float32',
    is_sparse=IS_SPARSE,
-    param_attr={'name': 'shared_w'})
+    param_attr='shared_w')
 embed_second = fluid.layers.embedding(
    input=second_word,
    size=[dict_size, EMBED_SIZE],
    dtype='float32',
    is_sparse=IS_SPARSE,
-    param_attr={'name': 'shared_w'})
+    param_attr='shared_w')
 embed_third = fluid.layers.embedding(
    input=third_word,
    size=[dict_size, EMBED_SIZE],
    dtype='float32',
    is_sparse=IS_SPARSE,
-    param_attr={'name': 'shared_w'})
+    param_attr='shared_w')
 embed_forth = fluid.layers.embedding(
    input=forth_word,
    size=[dict_size, EMBED_SIZE],
    dtype='float32',
    is_sparse=IS_SPARSE,
-    param_attr={'name': 'shared_w'})
+    param_attr='shared_w')
 concat_embed = fluid.layers.concat(
    input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
@@ -57,28 +57,17 @@ train_reader = paddle.batch(
 place = fluid.CPUPlace()
 exe = fluid.Executor(place)
+feeder = fluid.DataFeeder(
-# fix https://github.com/PaddlePaddle/Paddle/issues/5434 then remove
+    feed_list=[first_word, second_word, third_word, forth_word, next_word],
-# below exit line.
+    place=place)
-exit(0)
 exe.run(fluid.default_startup_program())
 for pass_id in range(PASS_NUM):
    for data in train_reader():
-        input_data = [[data_idx[idx] for data_idx in data] for idx in xrange(5)]
-        input_data = map(lambda x: np.array(x).astype("int64"), input_data)
-        input_data = map(lambda x: np.expand_dims(x, axis=1), input_data)
        avg_cost_np = exe.run(fluid.default_main_program(),
-                              feed={
+                              feed=feeder.feed(data),
-                                  'firstw': input_data[0],
-                                  'secondw': input_data[1],
-                                  'thirdw': input_data[2],
-                                  'forthw': input_data[3],
-                                  'nextw': input_data[4]
-                              },
                              fetch_list=[avg_cost])
-        if avg_cost_np[0] < 10.0:
+        if avg_cost_np[0] < 5.0:
            exit(0)  # if avg cost less than 10.0, we think our code is good.
 exit(1)
--- a/python/paddle/v2/fluid/tests/demo/fc_gan.py
+++ b/python/paddle/v2/fluid/tests/demo/fc_gan.py
+import errno
+import math
+import os
+import matplotlib
+import numpy
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+import matplotlib.gridspec as gridspec
+NOISE_SIZE = 100
+NUM_PASS = 1000
+NUM_REAL_IMGS_IN_BATCH = 121
+NUM_TRAIN_TIMES_OF_DG = 3
+LEARNING_RATE = 2e-5
+def D(x):
+    hidden = fluid.layers.fc(input=x,
+                             size=200,
+                             act='relu',
+                             param_attr='D.w1',
+                             bias_attr='D.b1')
+    logits = fluid.layers.fc(input=hidden,
+                             size=1,
+                             act=None,
+                             param_attr='D.w2',
+                             bias_attr='D.b2')
+    return logits
+def G(x):
+    hidden = fluid.layers.fc(input=x,
+                             size=200,
+                             act='relu',
+                             param_attr='G.w1',
+                             bias_attr='G.b1')
+    img = fluid.layers.fc(input=hidden,
+                          size=28 * 28,
+                          act='tanh',
+                          param_attr='G.w2',
+                          bias_attr='G.b2')
+    return img
+def plot(gen_data):
+    gen_data.resize(gen_data.shape[0], 28, 28)
+    n = int(math.ceil(math.sqrt(gen_data.shape[0])))
+    fig = plt.figure(figsize=(n, n))
+    gs = gridspec.GridSpec(n, n)
+    gs.update(wspace=0.05, hspace=0.05)
+    for i, sample in enumerate(gen_data):
+        ax = plt.subplot(gs[i])
+        plt.axis('off')
+        ax.set_xticklabels([])
+        ax.set_yticklabels([])
+        ax.set_aspect('equal')
+        plt.imshow(sample.reshape(28, 28), cmap='Greys_r')
+    return fig
+def main():
+    try:
+        os.makedirs("./out")
+    except OSError as e:
+        if e.errno != errno.EEXIST:
+            raise
+    startup_program = fluid.Program()
+    d_program = fluid.Program()
+    dg_program = fluid.Program()
+    with fluid.program_guard(d_program, startup_program):
+        img = fluid.layers.data(name='img', shape=[784], dtype='float32')
+        d_loss = fluid.layers.sigmoid_cross_entropy_with_logits(
+            x=D(img),
+            label=fluid.layers.data(
+                name='label', shape=[1], dtype='float32'))
+        d_loss = fluid.layers.mean(x=d_loss)
+    with fluid.program_guard(dg_program, startup_program):
+        noise = fluid.layers.data(
+            name='noise', shape=[NOISE_SIZE], dtype='float32')
+        g_img = G(x=noise)
+        g_program = dg_program.clone()
+        dg_loss = fluid.layers.sigmoid_cross_entropy_with_logits(
+            x=D(g_img),
+            label=fluid.layers.fill_constant_batch_size_like(
+                input=noise, dtype='float32', shape=[-1, 1], value=1.0))
+        dg_loss = fluid.layers.mean(x=dg_loss)
+    opt = fluid.optimizer.Adam(learning_rate=LEARNING_RATE)
+    opt.minimize(loss=d_loss, startup_program=startup_program)
+    opt.minimize(
+        loss=dg_loss,
+        startup_program=startup_program,
+        parameter_list=[
+            p.name for p in g_program.global_block().all_parameters()
+        ])
+    exe = fluid.Executor(fluid.CPUPlace())
+    exe.run(startup_program)
+    num_true = NUM_REAL_IMGS_IN_BATCH
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.mnist.train(), buf_size=60000),
+        batch_size=num_true)
+    for pass_id in range(NUM_PASS):
+        for batch_id, data in enumerate(train_reader()):
+            num_true = len(data)
+            n = numpy.random.uniform(
+                low=-1.0, high=1.0,
+                size=[num_true * NOISE_SIZE]).astype('float32').reshape(
+                    [num_true, NOISE_SIZE])
+            generated_img = exe.run(g_program,
+                                    feed={'noise': n},
+                                    fetch_list={g_img})[0]
+            real_data = numpy.array(map(lambda x: x[0], data)).astype('float32')
+            real_data = real_data.reshape(num_true, 784)
+            total_data = numpy.concatenate([real_data, generated_img])
+            total_label = numpy.concatenate([
+                numpy.ones(
+                    shape=[real_data.shape[0], 1], dtype='float32'),
+                numpy.zeros(
+                    shape=[real_data.shape[0], 1], dtype='float32')
+            ])
+            d_loss_np = exe.run(d_program,
+                                feed={'img': total_data,
+                                      'label': total_label},
+                                fetch_list={d_loss})[0]
+            for _ in xrange(NUM_TRAIN_TIMES_OF_DG):
+                n = numpy.random.uniform(
+                    low=-1.0, high=1.0,
+                    size=[2 * num_true * NOISE_SIZE]).astype('float32').reshape(
+                        [2 * num_true, NOISE_SIZE, 1, 1])
+                dg_loss_np = exe.run(dg_program,
+                                     feed={'noise': n},
+                                     fetch_list={dg_loss})[0]
+            print("Pass ID={0}, Batch ID={1}, D-Loss={2}, DG-Loss={3}".format(
+                pass_id, batch_id, d_loss_np, dg_loss_np))
+        # generate image each batch
+        fig = plot(generated_img)
+        plt.savefig(
+            'out/{0}.png'.format(str(pass_id).zfill(3)), bbox_inches='tight')
+        plt.close(fig)
+if __name__ == '__main__':
+    main()
--- a/python/paddle/v2/fluid/tests/test_array_read_write_op.py
+++ b/python/paddle/v2/fluid/tests/test_array_read_write_op.py
@@ -3,7 +3,7 @@ import paddle.v2.fluid.core as core
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.executor import Executor
 from paddle.v2.fluid.backward import append_backward_ops
-from paddle.v2.fluid.framework import g_main_program
+from paddle.v2.fluid.framework import default_main_program
 import numpy
@@ -66,7 +66,7 @@ class TestArrayReadWrite(unittest.TestCase):
        append_backward_ops(total_sum_scaled)
-        g_vars = map(g_main_program.global_block().var,
+        g_vars = map(default_main_program().global_block().var,
                     [each_x.name + "@GRAD" for each_x in x])
        g_out = [
            item.sum()

--- a/python/paddle/v2/fluid/tests/test_batch_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
@@ -21,6 +21,13 @@ def get_backward_op(scope, op, no_grad_set):
 def _reference_training(x, scale, offset, epsilon, data_format):
+    x_shape = x.shape
+    if len(x_shape) == 2:
+        if data_format == "NCHW":
+            x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
+        else:
+            x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
    if data_format == "NCHW":
        n, c, h, w = x.shape
        x_square = x * x
@@ -39,6 +46,8 @@ def _reference_training(x, scale, offset, epsilon, data_format):
        offset_tile = np.reshape(offset, (1, c, 1, 1))
        offset_tile = np.reshape(offset_tile, (1, c, 1, 1))
        y = normalized * scale_tile + offset_tile
+        if len(x_shape) == 2:
+            y = np.reshape(y, (y.shape[0], y.shape[1]))
        return y, mean, var
    elif data_format == "NHWC":
        x_square = x * x
@@ -48,7 +57,10 @@ def _reference_training(x, scale, offset, epsilon, data_format):
        mean = x_sum / element_count
        var = x_square_sum / element_count - mean * mean
        normalized = (x - mean) / np.sqrt(var + epsilon)
-        return (normalized * scale + offset), mean, var
+        y = normalized * scale + offset
+        if len(x_shape) == 2:
+            y = np.reshape(y, x_shape)
+        return y, mean, var
    else:
        raise ValueError("Unknown data order.")
@@ -65,6 +77,18 @@ def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format):
    #   (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon))
    # transfer from (N, C, H, W) to (N, H, W, C) to simplify computation
+    x_shape = x.shape
+    if len(x_shape) == 2:
+        if data_format == "NCHW":
+            x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1))
+            grad_y = np.reshape(grad_y,
+                                (grad_y.shape[0], grad_y.shape[1], 1, 1))
+        else:
+            x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1]))
+            grad_y = np.reshape(grad_y,
+                                (grad_y.shape[0], 1, 1, grad_y.shape[1]))
    if data_format == "NCHW":
        x = np.transpose(x, (0, 2, 3, 1))
        grad_y = np.transpose(grad_y, (0, 2, 3, 1))
@@ -83,6 +107,9 @@ def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format):
        grad_x = np.transpose(grad_x, (0, 3, 1, 2))
        x = np.transpose(x, (0, 3, 1, 2))
        grad_y = np.transpose(grad_y, (0, 3, 1, 2))
+    if len(x_shape) == 2:
+        grad_x = np.reshape(grad_x, x_shape)
    return grad_x, grad_scale, grad_offset
@@ -127,7 +154,7 @@ class TestBatchNormOp(OpTest):
        momentum = 0.9
        # N, H, W, C: 2, 3, 4, 2
-        n, h, w, c = 2, 3, 4, 2
+        n, h, w, c = 2, 3, 4, 5
        x_shape = [n, h, w, c]
        scale_shape = [c]
@@ -184,14 +211,17 @@ class TestBatchNormOp(OpTest):
        print 'python: NHWC, NCHW, backward checking passed'
    def test_forward_backward(self):
-        def test_with_place(place, tensor_format):
+        def test_with_place(place, tensor_format, shape):
            # attr
            epsilon = 0.00001
            momentum = 0.9
-            # N, H, W, C: 12, 3, 4, 2
+            if len(shape) == 2:
-            n, h, w, c = 2, 3, 4, 2
+                x_shape = shape
+                c = shape[1]
+            else:
+                # n, h, w, c = 2, 3, 4, 2
+                n, h, w, c = shape[0], shape[1], shape[2], shape[3]
                if data_format == "NHWC":
                    x_shape = [n, h, w, c]
                elif data_format == "NCHW":
@@ -219,6 +249,9 @@ class TestBatchNormOp(OpTest):
            #  for gradient test
            # y_grad = np.ones(x_shape).astype(np.float32)
            y_grad = np.zeros(x_shape).astype(np.float32)
+            if len(y_grad.shape) == 2:
+                y_grad[0, 0] = 1.
+            else:
                y_grad[0, 0, 0, 0] = 1.
            # y_grad = np.random.random_sample(x_shape).astype(np.float32)
            x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad(
@@ -313,7 +346,8 @@ class TestBatchNormOp(OpTest):
            places.append(core.GPUPlace(0))
        for place in places:
            for data_format in ["NCHW", "NHWC"]:
-                test_with_place(place, data_format)
+                test_with_place(place, data_format, [2, 3, 4, 5])
+                test_with_place(place, data_format, [2, 3])
 if __name__ == '__main__':

--- a/python/paddle/v2/fluid/tests/test_conditional_block.py
+++ b/python/paddle/v2/fluid/tests/test_conditional_block.py
 import unittest
 import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.core as core
-from paddle.v2.fluid.framework import g_startup_program, g_main_program
+from paddle.v2.fluid.framework import default_startup_program, default_main_program
 from paddle.v2.fluid.executor import Executor
 from paddle.v2.fluid.backward import append_backward_ops
 import numpy
@@ -19,7 +19,7 @@ class ConditionalBlock(unittest.TestCase):
        cpu = core.CPUPlace()
        exe = Executor(cpu)
-        exe.run(g_startup_program)
+        exe.run(default_startup_program())
        x = numpy.random.random(size=(10, 1)).astype('float32')
@@ -29,7 +29,9 @@ class ConditionalBlock(unittest.TestCase):
        append_backward_ops(loss=loss)
        outs = exe.run(
            feed={'X': x},
-            fetch_list=[g_main_program.block(0).var(data.name + "@GRAD")])[0]
+            fetch_list=[
+                default_main_program().block(0).var(data.name + "@GRAD")
+            ])[0]
        print outs

--- a/python/paddle/v2/fluid/tests/test_data_feeder.py
+++ b/python/paddle/v2/fluid/tests/test_data_feeder.py
+import paddle.v2.fluid as fluid
+def test_converter():
+    img = fluid.layers.data(name='image', shape=[1, 28, 28])
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    feeder = fluid.DataFeeder([img, label], fluid.CPUPlace())
+    result = feeder.feed([[[0] * 784, [9]], [[1] * 784, [1]]])
+    print(result)
+if __name__ == '__main__':
+    test_converter()
--- a/python/paddle/v2/fluid/tests/test_dyn_rnn.py
+++ b/python/paddle/v2/fluid/tests/test_dyn_rnn.py
+import paddle.v2.fluid as fluid
+import paddle.v2 as paddle
+import unittest
+import numpy
+class TestDynRNN(unittest.TestCase):
+    def setUp(self):
+        self.word_dict = paddle.dataset.imdb.word_dict()
+        self.BATCH_SIZE = 100
+        self.train_data = paddle.batch(
+            paddle.dataset.imdb.train(self.word_dict),
+            batch_size=self.BATCH_SIZE)
+    def test_plain_while_op(self):
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        with fluid.program_guard(main_program, startup_program):
+            sentence = fluid.layers.data(
+                name='word', shape=[1], dtype='int64', lod_level=1)
+            sent_emb = fluid.layers.embedding(
+                input=sentence, size=[len(self.word_dict), 32], dtype='float32')
+            label = fluid.layers.data(name='label', shape=[1], dtype='float32')
+            rank_table = fluid.layers.lod_rank_table(x=sent_emb)
+            sent_emb_array = fluid.layers.lod_tensor_to_array(
+                x=sent_emb, table=rank_table)
+            seq_len = fluid.layers.max_sequence_len(rank_table=rank_table)
+            i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0)
+            i.stop_gradient = False
+            boot_mem = fluid.layers.fill_constant_batch_size_like(
+                input=fluid.layers.array_read(
+                    array=sent_emb_array, i=i),
+                value=0,
+                shape=[-1, 100],
+                dtype='float32')
+            boot_mem.stop_gradient = False
+            mem_array = fluid.layers.array_write(x=boot_mem, i=i)
+            cond = fluid.layers.less_than(x=i, y=seq_len)
+            cond.stop_gradient = False
+            while_op = fluid.layers.While(cond=cond)
+            out = fluid.layers.create_array(dtype='float32')
+            with while_op.block():
+                mem = fluid.layers.array_read(array=mem_array, i=i)
+                ipt = fluid.layers.array_read(array=sent_emb_array, i=i)
+                mem = fluid.layers.shrink_memory(x=mem, i=i, table=rank_table)
+                hidden = fluid.layers.fc(input=[mem, ipt], size=100, act='tanh')
+                fluid.layers.array_write(x=hidden, i=i, array=out)
+                fluid.layers.increment(x=i, in_place=True)
+                fluid.layers.array_write(x=hidden, i=i, array=mem_array)
+                fluid.layers.less_than(x=i, y=seq_len, cond=cond)
+            all_timesteps = fluid.layers.array_to_lod_tensor(
+                x=out, table=rank_table)
+            last = fluid.layers.sequence_pool(
+                input=all_timesteps, pool_type='last')
+            logits = fluid.layers.fc(input=last, size=1, act=None)
+            loss = fluid.layers.sigmoid_cross_entropy_with_logits(
+                x=logits, label=label)
+            loss = fluid.layers.mean(x=loss)
+            sgd = fluid.optimizer.SGD(1e-4)
+            sgd.minimize(loss=loss)
+        cpu = fluid.CPUPlace()
+        exe = fluid.Executor(cpu)
+        exe.run(startup_program)
+        feeder = fluid.DataFeeder(feed_list=[sentence, label], place=cpu)
+        data = next(self.train_data())
+        val = exe.run(main_program, feed=feeder.feed(data),
+                      fetch_list=[loss])[0]
+        self.assertEqual((1, ), val.shape)
+        print(val)
+        self.assertFalse(numpy.isnan(val))
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_executor_and_mul.py
+++ b/python/paddle/v2/fluid/tests/test_executor_and_mul.py
 import unittest
-from paddle.v2.fluid.layers import mul, data, sequence_pool
+import numpy
 import paddle.v2.fluid.core as core
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.framework import g_main_program
+from paddle.v2.fluid.layers import mul, data
-import numpy
 class TestExecutor(unittest.TestCase):
@@ -19,10 +20,7 @@ class TestExecutor(unittest.TestCase):
        a_np = numpy.random.random((100, 784)).astype('float32')
        b_np = numpy.random.random((784, 100)).astype('float32')
        exe = Executor(place)
-        outs = exe.run(g_main_program,
+        outs = exe.run(feed={'a': a_np, 'b': b_np}, fetch_list=[out])
-                       feed={'a': a_np,
-                             'b': b_np},
-                       fetch_list=[out])
        out = outs[0]
        self.assertEqual((100, 100), out.shape)
        self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np)))

--- a/python/paddle/v2/fluid/tests/test_hinge_loss_op.py
+++ b/python/paddle/v2/fluid/tests/test_hinge_loss_op.py
+import unittest
+import numpy as np
+from op_test import OpTest
+class TestHingeLossOp(OpTest):
+    def setUp(self):
+        self.op_type = 'hinge_loss'
+        samples_num = 64
+        logits = np.random.uniform(-10, 10, (samples_num, 1)).astype('float32')
+        labels = np.random.randint(0, 2, (samples_num, 1)).astype('float32')
+        self.inputs = {
+            'Logits': logits,
+            'Labels': labels,
+        }
+        loss = np.maximum(1.0 - (2 * labels - 1) * logits, 0)
+        self.outputs = {'Loss': loss}
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        self.check_grad(['Logits'], 'Loss', max_relative_error=0.008)
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_image_classification_layer.py
+++ b/python/paddle/v2/fluid/tests/test_image_classification_layer.py
 import unittest
-import paddle.v2.fluid.layers as layers
+import paddle.v2.fluid as fluid
 import paddle.v2.fluid.nets as nets
 from paddle.v2.fluid.framework import Program
@@ -29,27 +29,35 @@ class TestLayer(unittest.TestCase):
    def test_batch_norm_layer(self):
        main_program = Program()
        startup_program = Program()
-        images = layers.data(
+        images = fluid.layers.data(
            name='pixel',
            shape=[3, 48, 48],
            dtype='float32',
            main_program=main_program)
-        layers.batch_norm(
+        hidden1 = fluid.layers.batch_norm(
            input=images,
            main_program=main_program,
            startup_program=startup_program)
+        hidden2 = fluid.layers.fc(input=hidden1,
+                                  size=128,
+                                  act='relu',
+                                  main_program=main_program)
+        hidden3 = fluid.layers.batch_norm(
+            input=hidden2,
+            main_program=main_program,
+            startup_program=startup_program)
-        # print str(main_program)
+        print str(main_program)
    def test_dropout_layer(self):
        main_program = Program()
        startup_program = Program()
-        images = layers.data(
+        images = fluid.layers.data(
            name='pixel',
            shape=[3, 48, 48],
            dtype='float32',
            main_program=main_program)
-        layers.dropout(
+        fluid.layers.dropout(
            x=images,
            dropout_prob=0.5,
            main_program=main_program,
@@ -61,7 +69,7 @@ class TestLayer(unittest.TestCase):
        main_program = Program()
        startup_program = Program()
-        images = layers.data(
+        images = fluid.layers.data(
            name='pixel',
            shape=[3, 48, 48],
            dtype='float32',
@@ -77,19 +85,19 @@ class TestLayer(unittest.TestCase):
    def test_elementwise_add_with_act(self):
        main_program = Program()
        startup_program = Program()
-        image1 = layers.data(
+        image1 = fluid.layers.data(
            name='pixel1',
            shape=[3, 48, 48],
            dtype='float32',
            main_program=main_program,
            startup_program=startup_program)
-        image2 = layers.data(
+        image2 = fluid.layers.data(
            name='pixel2',
            shape=[3, 48, 48],
            dtype='float32',
            main_program=main_program,
            startup_program=startup_program)
-        out = layers.elementwise_add(
+        out = fluid.layers.elementwise_add(
            x=image1,
            y=image2,
            act='relu',

--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
+from __future__ import print_function
 import unittest
 import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.nets as nets
-from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.framework import Program, program_guard
 class TestBook(unittest.TestCase):
    def test_fit_a_line(self):
        program = Program()
-        x = layers.data(
+        with program_guard(program, startup_program=Program()):
-            name='x', shape=[13], dtype='float32', main_program=program)
+            x = layers.data(name='x', shape=[13], dtype='float32')
-        y_predict = layers.fc(input=x, size=1, act=None, main_program=program)
+            y_predict = layers.fc(input=x, size=1, act=None)
+            y = layers.data(name='y', shape=[1], dtype='float32')
-        y = layers.data(
+            cost = layers.square_error_cost(input=y_predict, label=y)
-            name='y', shape=[1], dtype='float32', main_program=program)
+            avg_cost = layers.mean(x=cost)
-        cost = layers.square_error_cost(
-            input=y_predict, label=y, main_program=program)
-        avg_cost = layers.mean(x=cost, main_program=program)
            self.assertIsNotNone(avg_cost)
            program.append_backward(avg_cost)
-        print str(program)
+        print(str(program))
    def test_recognize_digits_mlp(self):
        program = Program()
+        with program_guard(program, startup_program=Program()):
            # Change g_program, so the rest layers use `g_program`
-        images = layers.data(
+            images = layers.data(name='pixel', shape=[784], dtype='float32')
-            name='pixel', shape=[784], dtype='float32', main_program=program)
+            label = layers.data(name='label', shape=[1], dtype='int32')
-        label = layers.data(
+            hidden1 = layers.fc(input=images, size=128, act='relu')
-            name='label', shape=[1], dtype='int32', main_program=program)
+            hidden2 = layers.fc(input=hidden1, size=64, act='relu')
-        hidden1 = layers.fc(input=images,
+            predict = layers.fc(input=hidden2, size=10, act='softmax')
-                            size=128,
+            cost = layers.cross_entropy(input=predict, label=label)
-                            act='relu',
+            avg_cost = layers.mean(x=cost)
-                            main_program=program)
-        hidden2 = layers.fc(input=hidden1,
-                            size=64,
-                            act='relu',
-                            main_program=program)
-        predict = layers.fc(input=hidden2,
-                            size=10,
-                            act='softmax',
-                            main_program=program)
-        cost = layers.cross_entropy(
-            input=predict, label=label, main_program=program)
-        avg_cost = layers.mean(x=cost, main_program=program)
            self.assertIsNotNone(avg_cost)
-        print str(program)
+        print(str(program))
    def test_simple_conv2d(self):
        program = Program()
-        images = layers.data(
+        with program_guard(program, startup_program=Program()):
-            name='pixel',
+            images = layers.data(name='pixel', shape=[3, 48, 48], dtype='int32')
-            shape=[3, 48, 48],
+            layers.conv2d(input=images, num_filters=3, filter_size=[4, 4])
-            dtype='int32',
-            main_program=program)
-        layers.conv2d(
-            input=images,
-            num_filters=3,
-            filter_size=[4, 4],
-            main_program=program)
-        print str(program)
+        print(str(program))
-    def test_recognize_digits_conv(self):
+    def test_conv2d_transpose(self):
        program = Program()
+        with program_guard(program):
+            img = layers.data(name='pixel', shape=[3, 2, 2], dtype='float32')
+            layers.conv2d_transpose(input=img, num_filters=10, output_size=28)
+        print(str(program))
+    def test_recognize_digits_conv(self):
+        program = Program()
+        with program_guard(program, startup_program=Program()):
            images = layers.data(
-            name='pixel',
+                name='pixel', shape=[1, 28, 28], dtype='float32')
-            shape=[1, 28, 28],
+            label = layers.data(name='label', shape=[1], dtype='int32')
-            dtype='float32',
-            main_program=program)
-        label = layers.data(
-            name='label', shape=[1], dtype='int32', main_program=program)
            conv_pool_1 = nets.simple_img_conv_pool(
                input=images,
                filter_size=5,
                num_filters=2,
                pool_size=2,
                pool_stride=2,
-            act="relu",
+                act="relu")
-            main_program=program)
            conv_pool_2 = nets.simple_img_conv_pool(
                input=conv_pool_1,
                filter_size=5,
                num_filters=4,
                pool_size=2,
                pool_stride=2,
-            act="relu",
+                act="relu")
-            main_program=program)
-        predict = layers.fc(input=conv_pool_2,
+            predict = layers.fc(input=conv_pool_2, size=10, act="softmax")
-                            size=10,
+            cost = layers.cross_entropy(input=predict, label=label)
-                            act="softmax",
+            avg_cost = layers.mean(x=cost)
-                            main_program=program)
-        cost = layers.cross_entropy(
-            input=predict, label=label, main_program=program)
-        avg_cost = layers.mean(x=cost, main_program=program)
            program.append_backward(avg_cost)
-        print str(program)
+        print(str(program))
    def test_word_embedding(self):
        program = Program()
+        with program_guard(program, startup_program=Program()):
            dict_size = 10000
            embed_size = 32
-        first_word = layers.data(
+            first_word = layers.data(name='firstw', shape=[1], dtype='int64')
-            name='firstw', shape=[1], dtype='int64', main_program=program)
+            second_word = layers.data(name='secondw', shape=[1], dtype='int64')
-        second_word = layers.data(
+            third_word = layers.data(name='thirdw', shape=[1], dtype='int64')
-            name='secondw', shape=[1], dtype='int64', main_program=program)
+            forth_word = layers.data(name='forthw', shape=[1], dtype='int64')
-        third_word = layers.data(
+            next_word = layers.data(name='nextw', shape=[1], dtype='int64')
-            name='thirdw', shape=[1], dtype='int64', main_program=program)
-        forth_word = layers.data(
-            name='forthw', shape=[1], dtype='int64', main_program=program)
-        next_word = layers.data(
-            name='nextw', shape=[1], dtype='int64', main_program=program)
            embed_first = layers.embedding(
                input=first_word,
                size=[dict_size, embed_size],
                dtype='float32',
-            param_attr={'name': 'shared_w'},
+                param_attr='shared_w')
-            main_program=program)
            embed_second = layers.embedding(
                input=second_word,
                size=[dict_size, embed_size],
                dtype='float32',
-            param_attr={'name': 'shared_w'},
+                param_attr='shared_w')
-            main_program=program)
            embed_third = layers.embedding(
                input=third_word,
                size=[dict_size, embed_size],
                dtype='float32',
-            param_attr={'name': 'shared_w'},
+                param_attr='shared_w')
-            main_program=program)
            embed_forth = layers.embedding(
                input=forth_word,
                size=[dict_size, embed_size],
                dtype='float32',
-            param_attr={'name': 'shared_w'},
+                param_attr='shared_w')
-            main_program=program)
            concat_embed = layers.concat(
                input=[embed_first, embed_second, embed_third, embed_forth],
-            axis=1,
+                axis=1)
-            main_program=program)
-        hidden1 = layers.fc(input=concat_embed,
+            hidden1 = layers.fc(input=concat_embed, size=256, act='sigmoid')
-                            size=256,
-                            act='sigmoid',
-                            main_program=program)
            predict_word = layers.fc(input=hidden1,
                                     size=dict_size,
-                                 act='softmax',
+                                     act='softmax')
-                                 main_program=program)
+            cost = layers.cross_entropy(input=predict_word, label=next_word)
-        cost = layers.cross_entropy(
+            avg_cost = layers.mean(x=cost)
-            input=predict_word, label=next_word, main_program=program)
-        avg_cost = layers.mean(x=cost, main_program=program)
            self.assertIsNotNone(avg_cost)
-        print str(program)
+        print(str(program))
    def test_linear_chain_crf(self):
        program = Program()
+        with program_guard(program, startup_program=Program()):
+            images = layers.data(name='pixel', shape=[784], dtype='float32')
+            label = layers.data(name='label', shape=[1], dtype='int32')
+            hidden = layers.fc(input=images, size=128)
+            crf = layers.linear_chain_crf(input=hidden, label=label)
+            self.assertNotEqual(crf, None)
-        # Change g_program, so the rest layers use `g_program`
+        print(str(program))
-        images = layers.data(
-            name='pixel', shape=[784], dtype='float32', main_program=program)
+    def test_sigmoid_cross_entropy(self):
-        label = layers.data(
+        program = Program()
-            name='label', shape=[1], dtype='int32', main_program=program)
+        with program_guard(program):
-        hidden = layers.fc(input=images, size=128, main_program=program)
+            dat = layers.data(name='data', shape=[10], dtype='float32')
-        crf = layers.linear_chain_crf(
+            lbl = layers.data(name='label', shape=[10], dtype='float32')
-            input=hidden, label=label, main_program=program)
+            self.assertIsNotNone(
+                layers.sigmoid_cross_entropy_with_logits(
-        print str(program)
+                    x=dat, label=lbl))
+        print(str(program))
 if __name__ == '__main__':

--- a/python/paddle/v2/fluid/tests/test_lod_rank_table.py
+++ b/python/paddle/v2/fluid/tests/test_lod_rank_table.py
 from paddle.v2.fluid.layers import lod_rank_table, data
 from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.framework import g_main_program
 import paddle.v2.fluid.core as core
 import numpy
 import unittest
@@ -18,7 +17,7 @@ class TestLoDRankTable(unittest.TestCase):
        tensor = core.LoDTensor()
        tensor.set(numpy.random.random(size=(17, 100)), cpu)
        tensor.set_lod([[0, 1, 3], [0, 5, 6, 7], [0, 3, 4, 9, 10, 13, 16, 17]])
-        exe.run(g_main_program, scope=scope, feed={'x': tensor})
+        exe.run(scope=scope, feed={'x': tensor})
        var = scope.find_var(rank_table.name)
        table = var.get_lod_rank_table()
        self.assertEqual([(0, 5), (1, 1), (2, 1)], table.items())

--- a/python/paddle/v2/fluid/tests/test_log_loss_op.py
+++ b/python/paddle/v2/fluid/tests/test_log_loss_op.py
+import unittest
+import numpy as np
+from op_test import OpTest
+class TestLogLossOp(OpTest):
+    def setUp(self):
+        self.op_type = 'log_loss'
+        samples_num = 32
+        predicted = np.random.uniform(0.1, 1.0,
+                                      (samples_num, 1)).astype("float32")
+        labels = np.random.randint(0, 2, (samples_num, 1)).astype("float32")
+        epsilon = 1e-4
+        self.inputs = {
+            'Predicted': predicted,
+            'Labels': labels,
+        }
+        self.attrs = {'epsilon': epsilon}
+        loss = -labels * np.log(predicted + epsilon) - (
+            1 - labels) * np.log(1 - predicted + epsilon)
+        self.outputs = {'Loss': loss}
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        self.check_grad(['Predicted'], 'Loss', max_relative_error=0.03)
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_nce.py
+++ b/python/paddle/v2/fluid/tests/test_nce.py
+import unittest
+import numpy as np
+from op_test import OpTest
+def nce(input, weight, bias, sample_weight, labels, num_classes,
+        num_sample_class):
+    samples = []
+    sample_labels = []
+    batch_size = input.shape[0]
+    num_true_class = labels.shape[1]
+    for i in range(batch_size):
+        w = 1 if sample_weight is None else sample_weight[i]
+        for label in labels[i]:
+            samples.append((i, label, True, w))
+            sample_labels.append(label)
+        for num in range(num_sample_class):
+            samples.append((i, num, False, w))
+            sample_labels.append(num)
+    # forward bias
+    sample_out = np.zeros(len(samples)).astype(np.float32)
+    if bias is not None:
+        for i in range(len(samples)):
+            sample_out[i] = bias[samples[i][1]]
+    # forward weight
+    for i in range(len(samples)):
+        sample_out[i] += np.dot(input[samples[i][0]], weight[samples[i][1]])
+    # forward activation
+    sample_out = 1.0 / (1.0 + np.exp(-sample_out))
+    # forward cost
+    out = np.zeros(batch_size).astype(np.float32)
+    b = 1.0 / num_classes * num_sample_class
+    for i in range(len(samples)):
+        o = sample_out[i]
+        cost = -np.log(o / (o + b)) if samples[i][2] else -np.log(b / (o + b))
+        out[samples[i][0]] += cost * samples[i][3]
+    return (out[:, np.newaxis], np.array(sample_out).reshape(
+        batch_size, num_sample_class + num_true_class),
+            np.array(sample_labels).reshape(batch_size,
+                                            num_sample_class + num_true_class))
+class TestNCE(OpTest):
+    def generate_data(self, dim, batch_size, num_classes, num_true_class,
+                      num_neg_samples):
+        input = np.random.randn(batch_size, dim).astype(np.float32)
+        weight = np.random.randn(num_classes, dim).astype(np.float32)
+        bias = np.random.randn(num_classes).astype(np.float32)
+        sample_weight = np.random.randn(batch_size).astype(np.float32)
+        labels = np.random.randint(0, num_classes, (batch_size, num_true_class))
+        self.attrs = {
+            'num_total_classes': num_classes,
+            'num_neg_samples': num_neg_samples,
+            'custom_neg_classes': range(num_neg_samples)
+        }
+        self.inputs = {
+            'Input': input,
+            'Label': labels,
+            'Weight': weight,
+            'Bias': bias,
+            'SampleWeight': sample_weight
+        }
+    def set_data(self):
+        self.generate_data(5, 5, 4, 1, 2)
+    def compute(self):
+        out = nce(self.inputs['Input'], self.inputs['Weight'],
+                  self.inputs['Bias'], self.inputs['SampleWeight'],
+                  self.inputs['Label'], self.attrs['num_total_classes'],
+                  self.attrs['num_neg_samples'])
+        self.outputs = {
+            'Cost': out[0],
+            'SampleLogits': out[1],
+            'SampleLabels': out[2]
+        }
+    def setUp(self):
+        self.op_type = 'nce'
+        self.set_data()
+        self.compute()
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        self.check_grad(
+            ["Input", "Weight", "Bias"], "Cost", max_relative_error=0.02)
+class TestNCECase1(TestNCE):
+    def set_data(self):
+        self.generate_data(10, 20, 10, 2, 5)
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_operator_desc.py
+++ b/python/paddle/v2/fluid/tests/test_operator_desc.py
 import unittest
-from paddle.v2.fluid.framework import Variable, Program, g_main_program
 import paddle.v2.fluid.core as core
+from paddle.v2.fluid.framework import Program, default_startup_program
+main_program = default_startup_program()
 class TestOperator(unittest.TestCase):
    def test_error_type(self):
-        block = g_main_program.create_block()
+        block = main_program.create_block()
        try:
            block.append_op()
            self.assertFail()

--- a/python/paddle/v2/fluid/tests/test_parameter.py
+++ b/python/paddle/v2/fluid/tests/test_parameter.py
 import unittest
-from paddle.v2.fluid.framework import g_main_program
+from paddle.v2.fluid.framework import default_main_program
 import paddle.v2.fluid.core as core
 from paddle.v2.fluid.executor import Executor
 import paddle.v2.fluid.io as io
 from paddle.v2.fluid.initializer import ConstantInitializer
 import numpy as np
+main_program = default_main_program()
 class TestParameter(unittest.TestCase):
    def test_param(self):
        shape = [784, 100]
        val = 1.0625
-        b = g_main_program.global_block()
+        b = main_program.global_block()
        param = b.create_parameter(
            name='fc.w',
            shape=shape,
@@ -23,9 +25,9 @@ class TestParameter(unittest.TestCase):
        self.assertEqual(core.DataType.FP32, param.dtype)
        self.assertEqual(0, param.block.idx)
        exe = Executor(core.CPUPlace())
-        p = exe.run(g_main_program, fetch_list=[param])[0]
+        p = exe.run(main_program, fetch_list=[param])[0]
        self.assertTrue(np.allclose(p, np.ones(shape) * val))
-        p = io.get_parameter_value_by_name('fc.w', exe, g_main_program)
+        p = io.get_parameter_value_by_name('fc.w', exe, main_program)
        self.assertTrue(np.allclose(np.array(p), np.ones(shape) * val))

--- a/python/paddle/v2/fluid/tests/test_profiler.py
+++ b/python/paddle/v2/fluid/tests/test_profiler.py
+import unittest
+import numpy as np
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.profiler as profiler
+import paddle.v2.fluid.layers as layers
+class TestProfiler(unittest.TestCase):
+    def test_nvprof(self):
+        if not fluid.core.is_compile_gpu():
+            return
+        epoc = 8
+        dshape = [4, 3, 28, 28]
+        data = layers.data(name='data', shape=[3, 28, 28], dtype='float32')
+        conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1])
+        place = fluid.GPUPlace(0)
+        exe = fluid.Executor(place)
+        exe.run(fluid.default_startup_program())
+        with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof:
+            for i in range(epoc):
+                input = np.random.random(dshape).astype('float32')
+                exe.run(fluid.default_main_program(), feed={'data': input})
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_program.py
+++ b/python/paddle/v2/fluid/tests/test_program.py
 from __future__ import print_function
 import unittest
-from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.framework import Program, default_main_program
-from paddle.v2.fluid.framework import g_main_program
 import paddle.v2.fluid.layers as layers
+main_program = default_main_program()
 class TestProgram(unittest.TestCase):
    def test_program(self):
-        b = g_main_program.current_block()
+        b = main_program.current_block()
        self.assertEqual(-1, b.parent_idx)
        self.assertEqual(0, b.idx)
-        b = g_main_program.create_block()
+        b = main_program.create_block()
        self.assertEqual(1, b.idx)
        self.assertEqual(0, b.parent_idx)
-        b = g_main_program.create_block()
+        b = main_program.create_block()
        self.assertEqual(2, b.idx)
        self.assertEqual(1, b.parent_idx)
-        g_main_program.rollback()
+        main_program.rollback()
-        b = g_main_program.current_block()
+        b = main_program.current_block()
        self.assertEqual(1, b.idx)
        self.assertEqual(0, b.parent_idx)
-        b = g_main_program.create_block()
+        b = main_program.create_block()
        self.assertEqual(3, b.idx)
        self.assertEqual(1, b.parent_idx)
-        g_main_program.rollback()
+        main_program.rollback()
-        b = g_main_program.current_block()
+        b = main_program.current_block()
        self.assertEqual(1, b.idx)
        self.assertEqual(0, b.parent_idx)

--- a/python/paddle/v2/fluid/tests/test_recurrent_op.py
+++ b/python/paddle/v2/fluid/tests/test_recurrent_op.py
@@ -271,12 +271,12 @@ class RecurrentOpTest2(RecurrentOpTest1):
            temp_l = layers.fc(input=x_t,
                               size=self.input_dim,
-                               param_attr={'name': 'W'},
+                               param_attr='W',
                               bias_attr=False,
                               **self.p_info)
            temp_r = layers.fc(input=h_pre,
                               size=self.input_dim,
-                               param_attr={'name': 'U'},
+                               param_attr='U',
                               bias_attr=False,
                               **self.p_info)
@@ -454,4 +454,6 @@ class RecurrentOpNoMemBootTest(RecurrentOpTest1):
 if __name__ == '__main__':
+    # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152
+    exit(0)
    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
+++ b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
@@ -3,9 +3,11 @@ import paddle.v2.fluid.core as core
 from paddle.v2.fluid.executor import Executor
 import paddle.v2.fluid.layers as layers
 from paddle.v2.fluid.backward import append_backward_ops
-from paddle.v2.fluid.framework import g_main_program
+from paddle.v2.fluid.framework import default_main_program
 import numpy
+main_program = default_main_program()
 class TestShrinkRNNMemory(unittest.TestCase):
    def test_shrink_rnn_memory(self):
@@ -36,7 +38,7 @@ class TestShrinkRNNMemory(unittest.TestCase):
        append_backward_ops(loss=mem3_mean)
        x_grad = exe.run(
            feed={'x': tensor},
-            fetch_list=[g_main_program.global_block().var('x@GRAD')])[0]
+            fetch_list=[main_program.global_block().var('x@GRAD')])[0]
        self.assertAlmostEqual(1.0, x_grad.sum(), delta=0.1)

--- a/python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py
+++ b/python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py
@@ -2,11 +2,12 @@ import numpy as np
 from op_test import OpTest
 from scipy.special import logit
 from scipy.special import expit
+import unittest
 class TestSigmoidCrossEntropyWithLogitsOp1(OpTest):
-    '''Test sigmoid_cross_entropy_with_logit_op with binary labels
+    """Test sigmoid_cross_entropy_with_logit_op with binary label
-    '''
+    """
    def setUp(self):
        self.op_type = "sigmoid_cross_entropy_with_logits"
@@ -16,16 +17,16 @@ class TestSigmoidCrossEntropyWithLogitsOp1(OpTest):
            'X': logit(
                np.random.uniform(0, 1, (batch_size, num_classes))
                .astype("float32")),
-            'Labels': np.random.randint(0, 2, (batch_size, num_classes))
+            'Label': np.random.randint(0, 2, (batch_size, num_classes))
            .astype("float32")
        }
        # Fw Pass is implemented as elementwise sigmoid followed by
        # elementwise logistic loss
-        # Labels * -log(sigmoid(X)) + (1 - labels) * -log(1 - sigmoid(X))
+        # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X))
        sigmoid_X = expit(self.inputs['X'])
-        term1 = self.inputs['Labels'] * np.log(sigmoid_X)
+        term1 = self.inputs['Label'] * np.log(sigmoid_X)
-        term2 = (1 - self.inputs['Labels']) * np.log(1 - sigmoid_X)
+        term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
        self.outputs = {'Out': -term1 - term2}
    def test_check_output(self):
@@ -36,8 +37,8 @@ class TestSigmoidCrossEntropyWithLogitsOp1(OpTest):
 class TestSigmoidCrossEntropyWithLogitsOp2(OpTest):
-    '''Test sigmoid_cross_entropy_with_logit_op with probabalistic labels
+    """Test sigmoid_cross_entropy_with_logit_op with probabalistic label
-    '''
+    """
    def setUp(self):
        self.op_type = "sigmoid_cross_entropy_with_logits"
@@ -47,16 +48,16 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest):
            'X': logit(
                np.random.uniform(0, 1, (batch_size, num_classes))
                .astype("float32")),
-            'Labels': np.random.uniform(0, 1, (batch_size, num_classes))
+            'Label': np.random.uniform(0, 1, (batch_size, num_classes))
            .astype("float32")
        }
        # Fw Pass is implemented as elementwise sigmoid followed by
        # elementwise logistic loss
-        # Labels * -log(sigmoid(X)) + (1 - labels) * -log(1 - sigmoid(X))
+        # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X))
        sigmoid_X = expit(self.inputs['X'])
-        term1 = self.inputs['Labels'] * np.log(sigmoid_X)
+        term1 = self.inputs['Label'] * np.log(sigmoid_X)
-        term2 = (1 - self.inputs['Labels']) * np.log(1 - sigmoid_X)
+        term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X)
        self.outputs = {'Out': -term1 - term2}
    def test_check_output(self):
@@ -64,3 +65,7 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest):
    def test_check_grad(self):
        self.check_grad(['X'], 'Out')
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_unpool_op.py
+++ b/python/paddle/v2/fluid/tests/test_unpool_op.py
+import unittest
+import numpy as np
+from op_test import OpTest
+def unpool2dmax_forward_naive(input, indices, ksize, strides, paddings):
+    s0, s1, s2, s3 = input.shape
+    out_hsize = (s2 - 1) * strides[0] - 2 * paddings[0] + ksize[0]
+    out_wsize = (s2 - 1) * strides[1] - 2 * paddings[1] + ksize[1]
+    out = np.zeros((s0, s1, out_hsize, out_wsize))
+    for nidx in xrange(s0):
+        for cidx in xrange(s1):
+            for h in xrange(s2):
+                for w in xrange(s3):
+                    index = indices[nidx, cidx, h, w]
+                    hidx = (index - index % out_wsize) / out_wsize
+                    widx = index % out_wsize
+                    out[nidx, cidx, int(hidx), int(widx)] = \
+                            input[nidx, cidx, h, w]
+    return out
+class TestUnpoolOp(OpTest):
+    def setUp(self):
+        self.op_type = "unpool"
+        self.init_test_case()
+        pre_input = np.random.random(self.shape).astype("float32")
+        nsize, csize, hsize, wsize = pre_input.shape
+        hsize_out = (hsize - self.ksize[0] + 2 * self.paddings[0]) / \
+                self.strides[0] + 1
+        wsize_out = (wsize - self.ksize[1] + 2 * self.paddings[1]) / \
+                self.strides[1] + 1
+        input = np.zeros((nsize, csize, hsize_out, wsize_out))
+        indices = np.zeros((nsize, csize, hsize_out, wsize_out))
+        for i in xrange(hsize_out):
+            for j in xrange(wsize_out):
+                r_start = np.max((i * self.strides[0] - self.paddings[0], 0))
+                r_end = np.min((i * self.strides[0] + self.ksize[0] - \
+                        self.paddings[0], hsize))
+                c_start = np.max((j * self.strides[1] - self.paddings[1], 0))
+                c_end = np.min((j * self.strides[1] + self.ksize[1] - \
+                        self.paddings[1], wsize))
+                for nidx in xrange(nsize):
+                    for cidx in xrange(csize):
+                        x_masked = pre_input[nidx, cidx, r_start:r_end, \
+                                c_start:c_end]
+                        input[nidx, cidx, i, j] = x_masked.max()
+                        arg = x_masked.argmax()
+                        indices[nidx, cidx, i, j] = \
+                                (r_start + arg / self.ksize[1]) * wsize + \
+                                c_start + arg % self.ksize[1]
+        output = self.unpool2d_forward_naive(input, indices, self.ksize, \
+                self.strides, self.paddings).astype("float32")
+        self.inputs = {
+            'X': input.astype('float32'),
+            'Indices': indices.astype('int32')
+        }
+        self.attrs = {
+            'strides': self.strides,
+            'paddings': self.paddings,
+            'ksize': self.ksize,
+            'unpooling_type': self.unpooling_type,
+        }
+        self.outputs = {'Out': output.astype('float32')}
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+    def init_test_case(self):
+        self.unpool2d_forward_naive = unpool2dmax_forward_naive
+        self.unpooling_type = "max"
+        self.shape = [6, 4, 5, 5]
+        self.ksize = [3, 3]
+        self.strides = [2, 2]
+        self.paddings = [0, 0]
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/fluid/tests/test_variable.py
+++ b/python/paddle/v2/fluid/tests/test_variable.py
 import unittest
-from paddle.v2.fluid.framework import g_main_program, Program, convert_np_dtype_to_dtype_
+from paddle.v2.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_
 import paddle.v2.fluid.core as core
 import numpy as np
@@ -18,7 +18,7 @@ class TestVariable(unittest.TestCase):
        self.assertRaises(ValueError, lambda: convert("int8"))
    def test_var(self):
-        b = g_main_program.current_block()
+        b = default_main_program().current_block()
        w = b.create_var(
            dtype="float64", shape=[784, 100], lod_level=0, name="fc.w")
        self.assertNotEqual(str(w), "")

--- a/python/paddle/v2/reader/decorator.py
+++ b/python/paddle/v2/reader/decorator.py
@@ -14,13 +14,16 @@
 __all__ = [
    'map_readers', 'buffered', 'compose', 'chain', 'shuffle',
-    'ComposeNotAligned', 'firstn', 'xmap_readers'
+    'ComposeNotAligned', 'firstn', 'xmap_readers', 'pipe_reader'
 ]
+from threading import Thread
+import subprocess
+from Queue import Queue
 import itertools
 import random
-from Queue import Queue
+import zlib
-from threading import Thread
 def map_readers(func, *readers):
@@ -323,3 +326,101 @@ def xmap_readers(mapper, reader, process_num, buffer_size, order=False):
                yield sample
    return xreader
+def _buf2lines(buf, line_break="\n"):
+    # FIXME: line_break should be automatically configured.
+    lines = buf.split(line_break)
+    return lines[:-1], lines[-1]
+def pipe_reader(left_cmd,
+                parser,
+                bufsize=8192,
+                file_type="plain",
+                cut_lines=True,
+                line_break="\n"):
+    """
+    pipe_reader read data by stream from a command, take it's 
+    stdout into a pipe buffer and redirect it to the parser to
+    parse, then yield data as your desired format.
+    You can using standard linux command or call another program
+    to read data, from HDFS, Ceph, URL, AWS S3 etc:
+    cmd = "hadoop fs -cat /path/to/some/file"
+    cmd = "cat sample_file.tar.gz"
+    cmd = "curl http://someurl"
+    cmd = "python print_s3_bucket.py"
+    A sample parser:
+    def sample_parser(lines):
+        # parse each line as one sample data,
+        # return a list of samples as batches.
+        ret = []
+        for l in lines:
+            ret.append(l.split(" ")[1:5])
+        return ret
+    :param left_cmd: command to excute to get stdout from.
+    :type left_cmd: string
+    :param parser: parser function to parse lines of data.
+                   if cut_lines is True, parser will receive list
+                   of lines.
+                   if cut_lines is False, parser will receive a
+                   raw buffer each time.
+                   parser should return a list of parsed values.
+    :type parser: callable
+    :param bufsize: the buffer size used for the stdout pipe.
+    :type bufsize: int
+    :param file_type: can be plain/gzip, stream buffer data type.
+    :type file_type: string
+    :param cut_lines: whether to pass lines instead of raw buffer
+                      to the parser
+    :type cut_lines: bool
+    :param line_break: line break of the file, like \n or \r
+    :type line_break: string
+    :return: the reader generator.
+    :rtype: callable
+    """
+    if not isinstance(left_cmd, str):
+        raise TypeError("left_cmd must be a string")
+    if not callable(parser):
+        raise TypeError("parser must be a callable object")
+    process = subprocess.Popen(
+        left_cmd.split(" "), bufsize=bufsize, stdout=subprocess.PIPE)
+    # TODO(typhoonzero): add a thread to read stderr
+    # Always init a decompress object is better than
+    # create in the loop.
+    dec = zlib.decompressobj(
+        32 + zlib.MAX_WBITS)  # offset 32 to skip the header
+    def reader():
+        remained = ""
+        while True:
+            buff = process.stdout.read(bufsize)
+            if buff:
+                if file_type == "gzip":
+                    decomp_buff = dec.decompress(buff)
+                elif file_type == "plain":
+                    decomp_buff = buff
+                else:
+                    raise TypeError("file_type %s is not allowed" % file_type)
+                if cut_lines:
+                    lines, remained = _buf2lines(''.join(
+                        [remained, decomp_buff]), line_break)
+                    parsed_list = parser(lines)
+                    for ret in parsed_list:
+                        yield ret
+                else:
+                    for ret in parser(decomp_buff):
+                        yield ret
+            else:
+                break
+    return reader