Merge branch 'develop' of https://github.com/paddlepaddle/Paddle into sampler

32b7711f · wanghaoshuang · 62efc896 · 8193cc55 · 32b7711f · 32b7711f
292 changed file
--- a/.clang-format
+++ b/.clang-format
@@ -25,4 +25,3 @@ AllowAllParametersOfDeclarationOnNextLine: true
 BinPackParameters: false
 BinPackArguments: false
 ...
--- a/.gitignore
+++ b/.gitignore
@@ -28,3 +28,4 @@ cmake_install.cmake
 paddle/.timestamp
 python/paddlepaddle.egg-info/
 paddle/pybind/pybind.h
+python/paddle/version.py
--- a/.travis.yml
+++ b/.travis.yml
@@ -42,7 +42,7 @@ before_install:
 script:
  - |
    timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout
-    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi;
+    RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true ;else exit 1; fi;
  - |
    if [[ "$JOB" != "build_doc" ]]; then exit 0; fi;
    if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi;

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -16,6 +16,8 @@ cmake_minimum_required(VERSION 3.0)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
 set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
 set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
+SET(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
+SET(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 include(system)
@@ -54,7 +56,9 @@ option(WITH_C_API       "Compile PaddlePaddle with C-API(Prediction)"   OFF)
 option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
 option(GLIDE_INSTALL    "Download and install go dependencies "         ON)
 option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
+option(WITH_DISTRIBUTE  "Compile with grpc distributed support"         OFF)
 option(USE_EIGEN_FOR_BLAS   "Use matrix multiplication in Eigen"        OFF)
+option(WITH_ARM_FP16    "Use half precision support on armv8.2-a cpu"   OFF)
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@@ -67,9 +71,6 @@ if(ANDROID OR IOS)
    if(ANDROID)
        if(${CMAKE_SYSTEM_VERSION} VERSION_LESS "16")
            message(FATAL_ERROR "Unsupport standalone toolchains with Android API level lower than 16")
-        elseif(${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
-            # TODO: support glog for Android api 16 ~ 19 in the future
-            message(WARNING "Using the unofficial git repository <https://github.com/Xreki/glog.git> instead")
        endif()
    endif()
@@ -83,6 +84,8 @@ if(ANDROID OR IOS)
        "Disable RDMA when cross-compiling for Android and iOS" FORCE)
    set(WITH_MKL OFF CACHE STRING
        "Disable MKL when cross-compiling for Android and iOS" FORCE)
+    set(WITH_GOLANG OFF CACHE STRING
+        "Disable golang when cross-compiling for Android and iOS" FORCE)
    # Compile PaddlePaddle mobile inference library
    if (NOT WITH_C_API)
@@ -133,6 +136,8 @@ include(external/any)       # download libn::any
 include(external/eigen)     # download eigen3
 include(external/pybind11)  # download pybind11
 include(external/nccl)
+include(external/cares)
+include(external/grpc)
 include(cudnn)              # set cudnn libraries, must before configure
 include(configure)          # add paddle env configuration

--- a/Dockerfile
+++ b/Dockerfile
@@ -29,7 +29,7 @@ RUN apt-get update && \
    automake locales clang-format swig doxygen cmake  \
    liblapack-dev liblapacke-dev libboost-dev \
    clang-3.8 llvm-3.8 libclang-3.8-dev \
-    net-tools && \
+    net-tools libtool && \
    apt-get clean -y
 # Install Go and glide

--- a/benchmark/paddle/image/googlenet.py
+++ b/benchmark/paddle/image/googlenet.py
@@ -6,10 +6,21 @@ width = 224
 num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
 use_gpu = get_config_arg('use_gpu', bool, True)
+is_infer = get_config_arg("is_infer", bool, False)
-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+args = {
+    'height': height,
+    'width': width,
+    'color': True,
+    'num_class': num_class,
+    'is_infer': is_infer
+}
 define_py_data_sources2(
-    "train.list", None, module="provider", obj="process", args=args)
+    "train.list" if not is_infer else None,
+    "test.list" if is_infer else None,
+    module="provider",
+    obj="process",
+    args=args)
 settings(
    batch_size=batch_size,
@@ -146,7 +157,6 @@ def inception(name, input, channels, \
    return cat
-lab = data_layer(name="label", size=1000)
 data = data_layer(name="input", size=3 * height * width)
 # stage 1
@@ -224,6 +234,10 @@ pool5 = img_pool_layer(
 dropout = dropout_layer(name="dropout", input=pool5, dropout_rate=0.4)
 out3 = fc_layer(
    name="output3", input=dropout, size=1000, act=SoftmaxActivation())
-loss3 = cross_entropy(name='loss3', input=out3, label=lab)
-outputs(loss3)
+if is_infer:
+    outputs(out3)
+else:
+    lab = data_layer(name="label", size=num_class)
+    loss3 = cross_entropy(name='loss3', input=out3, label=lab)
+    outputs(loss3)
--- a/benchmark/paddle/image/provider.py
+++ b/benchmark/paddle/image/provider.py
@@ -13,14 +13,20 @@ def initHook(settings, height, width, color, num_class, **kwargs):
        settings.data_size = settings.height * settings.width * 3
    else:
        settings.data_size = settings.height * settings.width
+    settings.is_infer = kwargs.get('is_infer', False)
-    settings.slots = [dense_vector(settings.data_size), integer_value(1)]
+    if settings.is_infer:
+        settings.slots = [dense_vector(settings.data_size)]
+    else:
+        settings.slots = [dense_vector(settings.data_size), integer_value(1)]
 @provider(
    init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM)
 def process(settings, file_list):
-    for i in xrange(1024):
+    for i in xrange(2560 if settings.is_infer else 1024):
        img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten()
-        lab = random.randint(0, settings.num_class - 1)
+        if settings.is_infer:
-        yield img.astype('float32'), int(lab)
+            yield img.astype('float32')
+        else:
+            lab = random.randint(0, settings.num_class - 1)
+            yield img.astype('float32'), int(lab)
--- a/benchmark/paddle/image/resnet.py
+++ b/benchmark/paddle/image/resnet.py
@@ -6,11 +6,21 @@ width = 224
 num_class = 1000
 batch_size = get_config_arg('batch_size', int, 64)
 layer_num = get_config_arg("layer_num", int, 50)
-is_test = get_config_arg("is_test", bool, False)
+is_infer = get_config_arg("is_infer", bool, False)
-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+args = {
+    'height': height,
+    'width': width,
+    'color': True,
+    'num_class': num_class,
+    'is_infer': is_infer
+}
 define_py_data_sources2(
-    "train.list", None, module="provider", obj="process", args=args)
+    "train.list" if not is_infer else None,
+    "test.list" if is_infer else None,
+    module="provider",
+    obj="process",
+    args=args)
 settings(
    batch_size=batch_size,
@@ -45,7 +55,10 @@ def conv_bn_layer(name,
        act=LinearActivation(),
        bias_attr=False)
    return batch_norm_layer(
-        name=name + "_bn", input=tmp, act=active_type, use_global_stats=is_test)
+        name=name + "_bn",
+        input=tmp,
+        act=active_type,
+        use_global_stats=is_infer)
 def bottleneck_block(name, input, num_filters1, num_filters2):
@@ -207,7 +220,9 @@ elif layer_num == 152:
 else:
    print("Wrong layer number.")
-lbl = data_layer(name="label", size=num_class)
+if is_infer:
-loss = cross_entropy(name='loss', input=resnet, label=lbl)
+    outputs(resnet)
-inputs(img, lbl)
+else:
-outputs(loss)
+    lbl = data_layer(name="label", size=num_class)
+    loss = cross_entropy(name='loss', input=resnet, label=lbl)
+    outputs(loss)
--- a/benchmark/paddle/image/run_mkldnn_infer.sh
+++ b/benchmark/paddle/image/run_mkldnn_infer.sh
+set -e
+function clock_to_seconds() {
+  hours=`echo $1 | awk -F ':' '{print $1}'`
+  mins=`echo $1 | awk -F ':' '{print $2}'`
+  secs=`echo $1 | awk -F ':' '{print $3}'`
+  echo `awk 'BEGIN{printf "%.2f",('$secs' + '$mins' * 60 + '$hours' * 3600)}'`
+}
+function infer() {
+  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
+  topology=$1
+  layer_num=$2
+  bs=$3
+  use_mkldnn=$4
+  if [ $4 == "True" ]; then
+    thread=1
+    log="logs/infer-${topology}-${layer_num}-mkldnn-${bs}.log"
+  elif [ $4 == "False" ]; then
+    thread=`nproc`
+    if [ $thread -gt $bs ]; then
+      thread=$bs
+    fi
+    log="logs/infer-${topology}-${layer_num}-${thread}mklml-${bs}.log"
+  else
+    echo "Wrong input $4, use True or False."
+    exit 0
+  fi
+  models_in="models/${topology}-${layer_num}/pass-00000/"
+  if [ ! -d $models_in ]; then
+    echo "Training model ${topology}_${layer_num}"
+    paddle train --job=train \
+      --config="${topology}.py" \
+      --use_mkldnn=True \
+      --use_gpu=False \
+      --trainer_count=1 \
+      --num_passes=1 \
+      --save_dir="models/${topology}-${layer_num}" \
+      --config_args="batch_size=128,layer_num=${layer_num}" \
+      > /dev/null 2>&1
+    echo "Done"
+  fi
+  log_period=$((256 / bs))
+  paddle train --job=test \
+    --config="${topology}.py" \
+    --use_mkldnn=$use_mkldnn \
+    --use_gpu=False \
+    --trainer_count=$thread \
+    --log_period=$log_period \
+    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \
+    --init_model_path=$models_in \
+    2>&1 | tee ${log}
+  # calculate the last 5 logs period time of 1280 samples,
+  # the time before are burning time.
+  start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
+  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
+  start_sec=`clock_to_seconds $start`
+  end_sec=`clock_to_seconds $end`
+  fps=`awk 'BEGIN{printf "%.2f",(1280 / ('$end_sec' - '$start_sec'))}'`
+  echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec;" >> ${log}
+  echo "FPS: $fps images/sec" 2>&1 | tee -a ${log}
+}
+if [ ! -f "train.list" ]; then
+  echo " " > train.list
+fi
+if [ ! -f "test.list" ]; then
+  echo " " > test.list
+fi
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+if [ ! -d "models" ]; then
+  mkdir -p models
+fi
+# inference benchmark
+for use_mkldnn in True False; do
+  for batchsize in 1 2 4 8 16; do
+    infer googlenet v1 $batchsize $use_mkldnn
+    infer resnet 50 $batchsize $use_mkldnn
+    infer vgg 19 $batchsize $use_mkldnn
+  done
+done
--- a/benchmark/paddle/image/run_mkldnn.sh
+++ b/benchmark/paddle/image/run_mkldnn.sh
@@ -8,13 +8,13 @@ function train() {
  use_mkldnn=$4
  if [ $4 == "True" ]; then
    thread=1
-    log="logs/${topology}-${layer_num}-mkldnn-${bs}.log"
+    log="logs/train-${topology}-${layer_num}-mkldnn-${bs}.log"
  elif [ $4 == "False" ]; then
    thread=`nproc`
    # each trainer_count use only 1 core to avoid conflict
-    log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log"
+    log="logs/train-${topology}-${layer_num}-${thread}mklml-${bs}.log"
  else
-    echo "Wrong input $3, use True or False."
+    echo "Wrong input $4, use True or False."
    exit 0
  fi
  args="batch_size=${bs},layer_num=${layer_num}"
@@ -30,13 +30,14 @@ function train() {
    2>&1 | tee ${log} 
 }
-if [ ! -d "train.list" ]; then
+if [ ! -f "train.list" ]; then
  echo " " > train.list
 fi
 if [ ! -d "logs" ]; then
  mkdir logs
 fi
+# training benchmark
 for use_mkldnn in True False; do
  for batchsize in 64 128 256; do
    train vgg 19 $batchsize $use_mkldnn

--- a/benchmark/paddle/image/vgg.py
+++ b/benchmark/paddle/image/vgg.py
@@ -6,10 +6,21 @@ width = 224
 num_class = 1000
 batch_size = get_config_arg('batch_size', int, 64)
 layer_num = get_config_arg('layer_num', int, 19)
+is_infer = get_config_arg("is_infer", bool, False)
-args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
+args = {
+    'height': height,
+    'width': width,
+    'color': True,
+    'num_class': num_class,
+    'is_infer': is_infer
+}
 define_py_data_sources2(
-    "train.list", None, module="provider", obj="process", args=args)
+    "train.list" if not is_infer else None,
+    "test.list" if is_infer else None,
+    module="provider",
+    obj="process",
+    args=args)
 settings(
    batch_size=batch_size,
@@ -98,6 +109,9 @@ elif layer_num == 19:
 else:
    print("Wrong layer number.")
-lab = data_layer('label', num_class)
+if is_infer:
-loss = cross_entropy(input=vgg, label=lab)
+    outputs(vgg)
-outputs(loss)
+else:
+    lab = data_layer('label', num_class)
+    loss = cross_entropy(input=vgg, label=lab)
+    outputs(loss)
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -24,6 +24,11 @@ if(WITH_DOUBLE)
    add_definitions(-DPADDLE_TYPE_DOUBLE)
 endif(WITH_DOUBLE)
+if(WITH_ARM_FP16)
+    add_definitions(-DPADDLE_ARM_FP16)
+    add_definitions("-march=armv8.2-a+fp16+simd")
+endif(WITH_ARM_FP16)
 if(WITH_TESTING)
    add_definitions(-DPADDLE_WITH_TESTING)
 endif(WITH_TESTING)

--- a/cmake/external/cares.cmake
+++ b/cmake/external/cares.cmake
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE)
+    return()
+ENDIF()
+include (ExternalProject)
+# NOTE: c-ares is needed when linking with grpc.
+SET(CARES_SOURCES_DIR ${THIRD_PARTY_PATH}/cares)
+SET(CARES_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cares)
+SET(CARES_INCLUDE_DIR "${CARES_INSTALL_DIR}/include/" CACHE PATH "cares include directory." FORCE)
+ExternalProject_Add(
+    extern_cares
+    GIT_REPOSITORY "https://github.com/c-ares/c-ares.git"
+    GIT_TAG "cares-1_13_0"
+    PREFIX          ${CARES_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ./buildconf && ./configure --disable-shared --prefix=${CARES_INSTALL_DIR}
+    BUILD_IN_SOURCE 1
+    BUILD_COMMAND   make -j8
+    INSTALL_COMMAND make install
+)
+ADD_LIBRARY(cares STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET cares PROPERTY IMPORTED_LOCATION
+             "${CARES_INSTALL_DIR}/lib/libcares.a")
+include_directories(${CARES_INCLUDE_DIR})
+ADD_DEPENDENCIES(cares extern_cares)
--- a/cmake/external/glog.cmake
+++ b/cmake/external/glog.cmake
@@ -26,12 +26,21 @@ ENDIF(WIN32)
 INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
+IF(ANDROID AND ${CMAKE_SYSTEM_VERSION} VERSION_LESS "21")
+  # Using the unofficial glog for Android API < 21
+  SET(GLOG_REPOSITORY "https://github.com/Xreki/glog.git")
+  SET(GLOG_TAG "8a547150548b284382ccb6582408e9140ff2bea8")
+ELSE()
+  SET(GLOG_REPOSITORY "https://github.com/google/glog.git")
+  SET(GLOG_TAG "v0.3.5")
+ENDIF()
 ExternalProject_Add(
    extern_glog
    ${EXTERNAL_PROJECT_LOG_ARGS}
    DEPENDS gflags
-    GIT_REPOSITORY  "https://github.com/google/glog.git"
+    GIT_REPOSITORY  ${GLOG_REPOSITORY}
-    GIT_TAG         v0.3.5
+    GIT_TAG         ${GLOG_TAG}
    PREFIX          ${GLOG_SOURCES_DIR}
    UPDATE_COMMAND  ""
    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}

--- a/cmake/external/grpc.cmake
+++ b/cmake/external/grpc.cmake
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+IF(MOBILE_INFERENCE OR NOT WITH_DISTRIBUTE)
+    return()
+ENDIF()
+include (ExternalProject)
+SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc)
+SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc)
+SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE)
+SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE)
+IF(APPLE)
+  SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh)
+ELSE()
+  SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin)
+ENDIF()
+ExternalProject_Add(
+    extern_grpc
+    DEPENDS protobuf zlib
+    GIT_REPOSITORY "https://github.com/grpc/grpc.git"
+    GIT_TAG "v1.7.x"
+    PREFIX          ${GRPC_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CONFIGURE_COMMAND ""
+    BUILD_IN_SOURCE 1
+    # NOTE(yuyang18):
+    # Disable -Werror, otherwise the compile will fail in MacOS.
+    # It seems that we cannot configure that by make command.
+    # Just dry run make command and remove `-Werror`, then use a shell to run make commands
+    BUILD_COMMAND  ${BUILD_CMD}
+    INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install
+)
+# FIXME(typhoonzero): hack to get static lib path, try a better way like merge them.
+ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION
+             "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a")
+ADD_LIBRARY(grpc++ STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET grpc++ PROPERTY IMPORTED_LOCATION
+            "${GRPC_INSTALL_DIR}/lib/libgrpc++.a")
+ADD_LIBRARY(gpr STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET gpr PROPERTY IMPORTED_LOCATION
+            "${GRPC_INSTALL_DIR}/lib/libgpr.a")
+ADD_LIBRARY(grpc_unsecure STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION
+            "${GRPC_INSTALL_DIR}/lib/libgrpc_unsecure.a")
+include_directories(${GRPC_INCLUDE_DIR})
+ADD_DEPENDENCIES(grpc++_unsecure extern_grpc)
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -114,11 +114,7 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
 # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
 SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
 FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
-IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
+ADD_LIBRARY(cblas STATIC ${dummyfile})
-    ADD_LIBRARY(cblas SHARED ${dummyfile})
-ELSE()
-    ADD_LIBRARY(cblas STATIC ${dummyfile})
-ENDIF()
 TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
 IF(NOT ${CBLAS_FOUND})

--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -15,7 +15,18 @@
 INCLUDE(ExternalProject)
 # Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp
 FIND_PACKAGE(Protobuf QUIET)
-SET(PROTOBUF_FOUND "OFF")
+macro(UNSET_VAR VAR_NAME)
+    UNSET(${VAR_NAME} CACHE)
+    UNSET(${VAR_NAME})
+endmacro()
+UNSET_VAR(PROTOBUF_INCLUDE_DIR)
+UNSET_VAR(PROTOBUF_FOUND)
+UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE)
+UNSET_VAR(PROTOBUF_PROTOC_LIBRARY)
+UNSET_VAR(PROTOBUF_LITE_LIBRARY)
+UNSET_VAR(PROTOBUF_LIBRARY)
+UNSET_VAR(PROTOBUF_INCLUDE_DIR)
+UNSET_VAR(Protobuf_PROTOC_EXECUTABLE)
 if(NOT COMMAND protobuf_generate_python)  # before cmake 3.4, protobuf_genrerate_python is not defined.
    function(protobuf_generate_python SRCS)
@@ -110,7 +121,6 @@ macro(PROMPT_PROTOBUF_LIB)
    # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`.
    # make `protobuf_generate_cpp` happy.
    SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE})
    FOREACH(dep ${protobuf_DEPS})
        ADD_DEPENDENCIES(protobuf ${dep})
        ADD_DEPENDENCIES(protobuf_lite ${dep})
@@ -128,11 +138,11 @@ endmacro()
 set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf")
 if (NOT "${PROTOBUF_ROOT}" STREQUAL "")
-    find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include)
+    find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH)
-    find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib)
+    find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
-    find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib)
+    find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
-    find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib)
+    find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH)
-    find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin)
+    find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH)
    if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE)
        message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.")
        SET_PROTOBUF_VERSION()
@@ -178,14 +188,26 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
        SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}")
    ENDIF()
+    SET(PROTOBUF_REPO "https://github.com/google/protobuf.git")
+    SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546")
+    IF(MOBILE_INFERENCE)
+        # The reason why the official version is not used is described in
+        # https://github.com/PaddlePaddle/Paddle/issues/6114
+        SET(PROTOBUF_REPO "https://github.com/qingqing01/protobuf.git")
+        SET(PROTOBUF_TAG "v3.2.0")
+        IF(NOT BUILD_FOR_HOST)
+            SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-Dprotobuf_BUILD_PROTOC_BINARIES=OFF")
+        ENDIF()
+    ENDIF()
    ExternalProject_Add(
        ${TARGET_NAME}
        ${EXTERNAL_PROJECT_LOG_ARGS}
        PREFIX          ${PROTOBUF_SOURCES_DIR}
        UPDATE_COMMAND  ""
        DEPENDS         zlib
-        GIT_REPOSITORY  "https://github.com/google/protobuf.git"
+        GIT_REPOSITORY  ${PROTOBUF_REPO}
-        GIT_TAG         "9f75c5aa851cd877fb0d93ccc31b8567a6706546"
+        GIT_TAG         ${PROTOBUF_TAG}
        CONFIGURE_COMMAND
        ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake
            ${OPTIONAL_ARGS}
@@ -203,7 +225,11 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
    )
 ENDFUNCTION()
-SET(PROTOBUF_VERSION 3.1)
+IF(NOT MOBILE_INFERENCE)
+    SET(PROTOBUF_VERSION 3.1)
+ELSE()
+    SET(PROTOBUF_VERSION 3.2)
+ENDIF()
 IF(CMAKE_CROSSCOMPILING)
    build_protobuf(protobuf_host TRUE)
    LIST(APPEND external_project_dependencies protobuf_host)

--- a/cmake/external/zlib.cmake
+++ b/cmake/external/zlib.cmake
@@ -50,6 +50,8 @@ ExternalProject_Add(
 )
 LIST(APPEND external_project_dependencies zlib)
+ADD_LIBRARY(zlib_target STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET zlib_target PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
 IF(WITH_C_API)
  INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib)

--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -111,6 +111,8 @@ set(COMMON_FLAGS
    -Wno-error=sign-compare
    -Wno-error=unused-local-typedefs
    -Wno-error=parentheses-equality # Warnings in pybind11
+    -Wno-error=ignored-attributes  # Warnings in Eigen, gcc 6.3
+    -Wno-error=terminate  # Warning in PADDLE_ENFORCE
 )
 set(GPU_COMMON_FLAGS

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -227,8 +227,8 @@ function(cc_test TARGET_NAME)
    set(multiValueArgs SRCS DEPS)
    cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    add_executable(${TARGET_NAME} ${cc_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
+    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
-    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
+    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
    add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
  endif()
 endfunction(cc_test)
@@ -288,8 +288,8 @@ function(nv_test TARGET_NAME)
    set(multiValueArgs SRCS DEPS)
    cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
    cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main)
+    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
-    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main)
+    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
    add_test(${TARGET_NAME} ${TARGET_NAME})
  endif()
 endfunction(nv_test)
@@ -467,3 +467,50 @@ function(py_test TARGET_NAME)
             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
  endif()
 endfunction()
+# grpc_library generate grpc code using grpc_cpp_plugin and protoc
+# then build the generated protobuf code and grpc code with your
+# implementation source codes together. Use SRCS argument for your
+# implementation source files and PROTO argument for your .proto
+# files.
+#
+# Usage: grpc_library(my_target SRCS my_client.cc PROTO my_target.proto DEPS my_dep)
+function(grpc_library TARGET_NAME)
+  set(oneValueArgs PROTO)
+  set(multiValueArgs SRCS DEPS)
+  set(options "")
+  cmake_parse_arguments(grpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+  message(STATUS "generating grpc ${grpc_library_PROTO}")
+  get_filename_component(ABS_PROTO ${grpc_library_PROTO} ABSOLUTE)
+  get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE)
+  get_filename_component(PROTO_PATH ${ABS_PROTO} PATH)
+  protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}")
+  set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc")
+  set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h")
+  cc_library("${TARGET_NAME}_proto" SRCS "${grpc_proto_srcs}")
+  add_custom_command(
+          OUTPUT "${grpc_grpc_srcs}" "${grpc_grpc_hdrs}"
+          COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
+          ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}"
+          --plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}"
+          DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc)
+  # FIXME(typhoonzero): grpc generated code do not generate virtual-dtor, mark it
+  # as compiler warnings instead of error. Should try remove the warnings also.
+  set_source_files_properties(
+    ${grpc_grpc_srcs}
+    PROPERTIES
+    COMPILE_FLAGS  "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}")
+  set_source_files_properties(
+    ${grpc_library_SRCS}
+    PROPERTIES
+    COMPILE_FLAGS  "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}")
+endfunction()
--- a/doc/api/index_en.rst
+++ b/doc/api/index_en.rst
@@ -7,3 +7,4 @@ API
    v2/model_configs.rst
    v2/data.rst
    v2/run_logic.rst
+    v2/fluid.rst
--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -54,7 +54,7 @@ img_conv
 ..  _api_v2.layer_context_projection:
-context_projection 
+context_projection
 ------------------
 ..  autoclass:: paddle.v2.layer.context_projection
    :noindex:
@@ -70,7 +70,7 @@ Image Pooling Layer
 img_pool
 --------
 ..  autoclass:: paddle.v2.layer.img_pool
-    :noindex:   
+    :noindex:
 spp
 ---
@@ -104,7 +104,7 @@ sum_to_one_norm
 ---------------
 ..  autoclass:: paddle.v2.layer.sum_to_one_norm
    :noindex:
 cross_channel_norm
 ------------------
 ..  autoclass:: paddle.v2.layer.cross_channel_norm
@@ -114,7 +114,7 @@ row_l2_norm
 -----------
 ..  autoclass:: paddle.v2.layer.row_l2_norm
    :noindex:
 Recurrent Layers
 ================
@@ -415,6 +415,13 @@ multiplex
 ..  autoclass:: paddle.v2.layer.multiplex
    :noindex:
+Factorization Machine Layer
+============================
+factorization_machine
+---------------------
+..  autoclass:: paddle.v2.layer.factorization_machine
+    :noindex:
 Slicing and Joining Layers
 ==========================

--- a/doc/api/v2/fluid.rst
+++ b/doc/api/v2/fluid.rst
+======================
+Fluid
+======================
+..  toctree::
+    :maxdepth: 1
+    fluid/layers.rst
+    fluid/data_feeder.rst
+    fluid/executor.rst
+    fluid/initializer.rst
+    fluid/evaluator.rst
+    fluid/nets.rst
+    fluid/optimizer.rst
+    fluid/param_attr.rst
+    fluid/profiler.rst
+    fluid/regularizer.rst
--- a/doc/api/v2/fluid/data_feeder.rst
+++ b/doc/api/v2/fluid/data_feeder.rst
+===========
+DataFeeder
+===========
+DataFeeder
+-----------
+..  automodule:: paddle.v2.fluid.data_feeder
+    :members: DataFeeder
+    :noindex:
--- a/doc/api/v2/fluid/evaluator.rst
+++ b/doc/api/v2/fluid/evaluator.rst
+===========
+Evaluator
+===========
+Evaluator
+-----------
+..  automodule:: paddle.v2.fluid.evaluator
+    :members: Evaluator
+    :noindex:
--- a/doc/api/v2/fluid/executor.rst
+++ b/doc/api/v2/fluid/executor.rst
+===========
+Executor
+===========
+Executor
+-----------
+..  automodule:: paddle.v2.fluid.executor
+    :members: Executor
+    :noindex:
--- a/doc/api/v2/fluid/initializer.rst
+++ b/doc/api/v2/fluid/initializer.rst
+===========
+Initializer
+===========
+Initializer
+-----------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: Initializer
+    :noindex:
+ConstantInitializer
+-------------------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: ConstantInitializer
+    :noindex:
+UniformInitializer
+------------------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: UniformInitializer
+    :noindex:
+NormalInitializer
+-----------------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: NormalInitializer
+    :noindex:
+XavierInitializer
+-----------------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: XavierInitializer
+    :noindex:
+MSRAInitializer
+---------------
+..  automodule:: paddle.v2.fluid.initializer
+    :members: MSRAInitializer
+    :noindex:
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
+==========
+Layers
+==========
+fc
+---
+..  autofunction:: paddle.v2.fluid.layers.fc
+    :noindex:
+embedding
+---------
+..  autofunction:: paddle.v2.fluid.layers.embedding
+    :noindex:
+dynamic_lstm
+------------
+..  autofunction:: paddle.v2.fluid.layers.dynamic_lstm
+    :noindex:
+data
+---------
+..  autofunction:: paddle.v2.fluid.layers.data
+    :noindex:
+mean
+---------
+..  autofunction:: paddle.v2.fluid.layers.mean
+    :noindex:
+mul
+---------
+..  autofunction:: paddle.v2.fluid.layers.mul
+    :noindex:
+elementwise_add
+---------------
+..  autofunction:: paddle.v2.fluid.layers.elementwise_add
+    :noindex:
+elementwise_div
+---------------
+..  autofunction:: paddle.v2.fluid.layers.elementwise_div
+    :noindex:
+dropout
+---------
+..  autofunction:: paddle.v2.fluid.layers.dropout
+    :noindex:
+reshape
+---------
+..  autofunction:: paddle.v2.fluid.layers.reshape
+    :noindex:
+sigmoid
+---------
+..  autofunction:: paddle.v2.fluid.layers.sigmoid
+    :noindex:
+scale
+---------
+..  autofunction:: paddle.v2.fluid.layers.scale
+    :noindex:
+reshape
+---------
+..  autofunction:: paddle.v2.fluid.layers.reshape
+    :noindex:
+transpose
+---------
+..  autofunction:: paddle.v2.fluid.layers.transpose
+    :noindex:
+sigmoid_cross_entropy_with_logits
+---------
+..  autofunction:: paddle.v2.fluid.layers.esigmoid_cross_entropy_with_logits
+    :noindex:
+cast
+---------
+..  autofunction:: paddle.v2.fluid.layers.cast
+    :noindex:
+concat
+---------
+..  autofunction:: paddle.v2.fluid.layers.concat
+    :noindex:
+sums
+---------
+..  autofunction:: paddle.v2.fluid.layers.sums
+    :noindex:
+linear_chain_crf
+---------
+..  autofunction:: paddle.v2.fluid.layers.linear_chain_crf
+    :noindex:
+assign
+---------
+..  autofunction:: paddle.v2.fluid.layers.embedding
+    :noindex:
+split_lod_tensor
+---------
+..  autofunction:: paddle.v2.fluid.layers.split_lod_tensor
+    :noindex:
+merge_lod_tensor
+---------
+..  autofunction:: paddle.v2.fluid.layers.merge_lod_tensor
+    :noindex:
+cos_sim
+---------
+..  autofunction:: paddle.v2.fluid.layers.cos_sim
+    :noindex:
+cross_entropy
+---------
+..  autofunction:: paddle.v2.fluid.layers.cross_entropy
+    :noindex:
+square_error_cost
+---------
+..  autofunction:: paddle.v2.fluid.layers.square_error_cost
+    :noindex:
+accuracy
+---------
+..  autofunction:: paddle.v2.fluid.layers.accuracy
+    :noindex:
+sequence_conv
+---------
+..  autofunction:: paddle.v2.fluid.layers.sequence_conv
+    :noindex:
+conv2d
+---------
+..  autofunction:: paddle.v2.fluid.layers.conv2d
+    :noindex:
+sequence_pool
+---------
+..  autofunction:: paddle.v2.fluid.layers.sequence_pool
+    :noindex:
+pool2d
+---------
+..  autofunction:: paddle.v2.fluid.layers.pool2d
+    :noindex:
+batch_norm
+---------
+..  autofunction:: paddle.v2.fluid.layers.batch_norm
+    :noindex:
+beam_search_decode
+---------
+..  autofunction:: paddle.v2.fluid.layers.beam_search_decode
+    :noindex:
+lstm
+---------
+..  autofunction:: paddle.v2.fluid.layers.lstm
+    :noindex:
+lod_rank_table
+---------
+..  autofunction:: paddle.v2.fluid.layers.lod_rank_table
+    :noindex:
+max_sequence_len
+---------
+..  autofunction:: paddle.v2.fluid.layers.max_sequence_len
+    :noindex:
+topk
+---------
+..  autofunction:: paddle.v2.fluid.layers.topk
+    :noindex:
+lod_tensor_to_array
+---------
+..  autofunction:: paddle.v2.fluid.layers.lod_tensor_to_array
+    :noindex:
+array_to_lod_tensor
+---------
+..  autofunction:: paddle.v2.fluid.layers.array_to_lod_tensor
+    :noindex:
+fill_constant
+---------
+..  autofunction:: paddle.v2.fluid.layers.fill_constant
+    :noindex:
+fill_constant_batch_size_like
+---------
+..  autofunction:: paddle.v2.fluid.layers.fill_constant_batch_size_like
+    :noindex:
+ones
+---------
+..  autofunction:: paddle.v2.fluid.layers.ones
+    :noindex:
+zeros
+---------
+..  autofunction:: paddle.v2.fluid.layers.zeros
+    :noindex:
+increment
+---------
+..  autofunction:: paddle.v2.fluid.layers.increment
+    :noindex:
+array_write
+---------
+..  autofunction:: paddle.v2.fluid.layers.array_write
+    :noindex:
+create_array
+---------
+..  autofunction:: paddle.v2.fluid.layers.create_array
+    :noindex:
+less_than
+---------
+..  autofunction:: paddle.v2.fluid.layers.less_than
+    :noindex:
+array_read
+---------
+..  autofunction:: paddle.v2.fluid.layers.array_read
+    :noindex:
+shrink_memory
+---------
+..  autofunction:: paddle.v2.fluid.layers.shrink_memory
+    :noindex:
+array_length
+---------
+..  autofunction:: paddle.v2.fluid.layers.array_length
+    :noindex:
+conv2d_transpose
+---------
+..  autofunction:: paddle.v2.fluid.layers.conv2d_transpose
+    :noindex:
--- a/doc/api/v2/fluid/nets.rst
+++ b/doc/api/v2/fluid/nets.rst
+===========
+Nets
+===========
+simple_img_conv_pool
+-----------
+..  autofunction:: paddle.v2.fluid.nets.simple_img_conv_pool
+    :noindex:
+img_conv_group
+-----------
+..  autofunction:: paddle.v2.fluid.nets.img_conv_group
+    :noindex:
+sequence_conv_pool
+-----------
+..  autofunction:: paddle.v2.fluid.nets.sequence_conv_pool
+    :noindex:
--- a/doc/api/v2/fluid/optimizer.rst
+++ b/doc/api/v2/fluid/optimizer.rst
+===========
+Optimizer
+===========
+Optimizer
+-----------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: Optimizer
+    :noindex:
+SGDOptimizer
+-----------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: SGDOptimizer
+    :noindex:
+MomentumOptimizer
+-----------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: MomentumOptimizer
+    :noindex:
+AdagradOptimizer
+-----------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: AdagradOptimizer
+    :noindex:
+AdamOptimizer
+-----------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: AdamOptimizer
+    :noindex:
+AdamaxOptimizer
+-----------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: AdamaxOptimizer
+    :noindex:
+DecayedAdagradOptimizer
+-----------
+..  automodule:: paddle.v2.fluid.optimizer
+    :members: DecayedAdagradOptimizer
+    :noindex:
--- a/doc/api/v2/fluid/param_attr.rst
+++ b/doc/api/v2/fluid/param_attr.rst
+===========
+ParamAttr
+===========
+ParamAttr
+-----------
+..  automodule:: paddle.v2.fluid.param_attr
+    :members: ParamAttr
+    :noindex:
--- a/doc/api/v2/fluid/profiler.rst
+++ b/doc/api/v2/fluid/profiler.rst
+===========
+Profiler
+===========
+Profiler
+-----------
+..  autofunction:: paddle.v2.fluid.profiler.cuda_profiler
+    :noindex:
--- a/doc/api/v2/fluid/regularizer.rst
+++ b/doc/api/v2/fluid/regularizer.rst
+===========
+Regularizer
+===========
+WeightDecayRegularizer
+-----------
+..  automodule:: paddle.v2.fluid.regularizer
+    :members: WeightDecayRegularizer
+    :noindex:
+L2DecayRegularizer
+-----------
+..  automodule:: paddle.v2.fluid.regularizer
+    :members: L2DecayRegularizer
+    :noindex:
+L1DecayRegularizer
+-----------
+..  automodule:: paddle.v2.fluid.regularizer
+    :members: L1DecayRegularizer
--- a/doc/design/evaluator.md
+++ b/doc/design/evaluator.md
 ## Evaluator Design
-### The Problem
+### Problem Statement
-During training or serving, we provide the evaluation function to measure the model performance, e.g., accuracy, precision. In the operator based framework design, the data go through the network pipeline batch by batch. As a result, inside the operator, we only can calculate one minibatch metrics. We need to provide a mechanism to calculate the metrics for each N pass/batch the user wanted.
+During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy, precision, etc. In the operator based framework design, the data passes through the network pipeline batch by batch. As a result, inside the operator, we only calculate the metrics for one minibatch. Thus, we need to provide a mechanism to calculate the metrics for each N pass/batch the user wants.
 ### Evaluator Design
-Currently, every operation is expressed in the graph. we divide the evaluator process into three steps.
+Currently, every operation is expressed in the graph. We divide the evaluator process into three steps.
 1. Initialize the metric state and add it into the block.
-2. Calculate the statistic of the metric state in every mini-batch. The single operator is only responsible for calculating necessary statistics for one mini-batch. For example, accuracy operator only calculate a minibatch data if run once.
+2. Calculate the concerned metrics for every mini-batch. The single evaluator operator is only responsible for calculating the necessary statistics for one mini-batch. For example, the accuracy operator only calculates the accuracy for a minibatch data if run once.
 3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the value from different devices.
 ### Implementation
-This design is shown in python API. 
+This design is shown in the Python API. 
-Each metric operator need to caculate the metric statistic and return the batch aware states, Python side responsible for accumulate the states for each pass. 
+Each metric operator needs to caculate the metric statistic and return the batch-aware states. Python side is responsible for accumulating the states for each pass. 
 ```python

--- a/doc/design/float16.md
+++ b/doc/design/float16.md
@@ -28,6 +28,51 @@ The goal of float16 is to serve as a key for the executor to find and run the co
 - [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors.
 - [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU).
+### CUDA version issue
+There are currently three versions of CUDA that supports `__half` data type, namely, CUDA 7.5, 8.0, and 9.0. 
+CUDA 7.5 and 8.0 define `__half` as a simple struct that has a `uint16_t` data (see [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/9212ab5a3ddbe48f30ef373f9c1fb546804c7a8c/include/isaac/external/CUDA/cuda_fp16.h)) as follows:
+```
+typedef struct __align__(2) {
+   unsigned short x;
+} __half;
+typedef __half half;
+```
+This struct does not define any overloaded arithmetic operators. So you have to directly use `__hadd` instead of `+` to correctly add two half types:
+```
+__global__ void Add() {
+  half a, b, c;
+  c = __hadd(a, b); // correct
+  c = a + b; // compiler error: no operator "+" matches these operands
+}
+```
+CUDA 9.0 provides a major update to the half data type. The related code can be found in the updated [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.h) and the newly added [`cuda_fp16.hpp`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.hpp).
+Essentially, CUDA 9.0 renames the original `__half` type in 7.5 and 8.0 as `__half_raw`, and defines a new `__half` class type that has constructors, conversion operators, and also provides overloaded arithmetic operators such as follows:
+```
+typedef struct __CUDA_ALIGN__(2) {
+    unsigned short x;
+} __half_raw;
+struct __CUDA_ALIGN__(2) __half {
+protected:
+    unsigned short __x;
+public:
+    // constructors and conversion operators from/to 
+    // __half_raw and other built-in data types
+}
+typedef __half half;
+__device__ __forceinline__ 
+__half operator+(const __half &lh, const __half &rh) { 
+    return __hadd(lh, rh); 
+}
+// Other overloaded operators
+``` 
+This new design makes `c = a + b` work correctly for CUDA half data type. 
 ## Implementation
 The float16 class holds a 16-bit `uint16_t` data internally.

--- a/doc/design/mkldnn/README.MD
+++ b/doc/design/mkldnn/README.MD
 # Intel® MKL-DNN on PaddlePaddle: Design Doc
-我们计划将Intel深度神经网络数学库(**MKL-DNN**\[[1](#references)\])集成到PaddlePaddle，充分展现英特尔平台的优势，有效提升PaddlePaddle在英特尔架构上的性能。
+我们计划将英特尔深度神经网络数学库[Intel MKL-DNN](https://github.com/01org/mkl-dnn)
+(Intel Math Kernel Library for Deep Neural Networks)集成到PaddlePaddle，
+充分展现英特尔平台的优势，有效提升PaddlePaddle在英特尔架构上的性能。
-我们短期内的基本目标是：
+<div align="center">
+<img src="image/overview.png"><br/>
+Figure 1. PaddlePaddle on IA
+</div>
+近期目标
- 完成常用layer的MKL-DNN实现。
+- 完成常用Layer的MKL-DNN实现。
 - 完成常见深度神经网络VGG，GoogLeNet 和 ResNet的MKL-DNN实现。
+目前的优化，主要针对PaddlePaddle在重构之前的代码框架以及V1的API。
+具体的完成状态可以参见[这里](https://github.com/PaddlePaddle/Paddle/projects/21)。
 ## Contents
 - [Overview](#overview)
 - [Actions](#actions)
 	- [CMake](#cmake)
+ 	- [Matrix](#matrix)
 	- [Layers](#layers)
 	- [Activations](#activations)
-	- [Weights](#weights)
+	- [Parameters](#parameters)
+	- [Gradients](#gradients)
 	- [Unit Tests](#unit-tests)
-	- [Protobuf Messages](#protobuf-messages)
 	- [Python API](#python-api)
-	- [Demos](#demos)
 	- [Benchmarking](#benchmarking)
 	- [Others](#others)
 - [Design Concerns](#design-concerns)
 ## Overview
-我们会把MKL-DNN作为第三方库集成进PaddlePaddle，整体框架图
+我们会把MKL-DNN会作为第三方库集成进PaddlePaddle，与其他第三方库一样，会在编译PaddlePaddle的时候下载并编译MKL-DNN。
+同时，为了进一步提升PaddlePaddle在基本数学运算的计算速度，我们也将MKLML即(MKL small library\[[1](#references)\])
+作为另一个第三方库集成进PaddlePaddle，它只会包括生成好的动态库和头文件。
+MKL，MKLML以及MKL-DNN三者关系如下表：
+| Name        |  Open Source     | License     | Descriptions  |
+| :---------- | :--------------- | :---------- | :------------ |
+|   MKL       |     No           | Proprietary | Accelerate math processing routines | 
+|   MKLML     |     No           | Proprietary | Small package of MKL, especially for Machine Learning |
+|   MKL-DNN   |     Yes          | Apache 2.0  | Accelerate primitives processing routines especially for Deep Neural Networks  |
+MKLML可以与MKL-DNN共同使用，以此达到最好的性能。
 <div align="center">
-<img src="image/overview.png" width=350><br/>
+<img src="image/engine.png"><br/>
-Figure 1. PaddlePaddle on IA.
+Figure 2. PaddlePaddle with MKL Engines
 </div>
 ## Actions
-我们把集成方案大致分为了如下几个方面。
+添加的相关文件和目录结构如下：
+```txt
+PaddlePaddle/Paddle
+├── ...
+├── cmake/
+│   ├── external/
+│   │   ├── ...
+│   │   ├── mkldnn.cmake
+│   │   └── mklml.cmake
+└── paddle/
+    ├── ...
+    ├── math/
+    │   ├── ...
+    │   └── MKLDNNMatrix.*
+    └── gserver/
+        ├── ...
+        ├── layers/
+        │   ├── ...
+        │   └── MKLDNN*Layer.*
+        ├── activations/
+        │   ├── ...
+        │   └── MKLDNNActivations.*
+        └── tests/
+            ├── ...
+            ├── MKLDNNTester.*
+            └── test_MKLDNN.cpp
+```
 ### CMake
-我们会在`CMakeLists.txt`中会给用户添加一个`WITH_MKL`的开关，他是负责`WITH_MKLML`和`WITH_MKLDNN`的总开关。
+在`CMakeLists.txt`中提供一个与MKL有关的总开关：`WITH_MKL`，它负责决定编译时是否使用MKLML和MKL-DNN
-当打开`WITH_MKL`时，会开启MKLML的功能，作为PaddlePaddle的CBLAS和LAPACK库，同时会开启Intel OpenMP用于提高MKLML的性能。 如果系统支持AVX2指令集及以上，同时会开启MKL-DNN功能。
+- `WITH_MKLML` 控制是否使用MKLML库。 
+当打开`WITH_MKL`时，会自动使用MKLML库作为PaddlePaddle的CBLAS和LAPACK库，同时会开启Intel OpenMP用于提高MKLML的性能。
+编译时会把对应的头文件和库放在`build/third_party/install/mklml/*`目录下对应的地方。
+MKLML的库目前都是动态库，主要包括`libiomp5.so`和`libmklml_intel.so`。
+- `WITH_MKLDNN` 控制是否使用MKL-DNN。
+当开启`WITH_MKL`时，会自动根据硬件配置[[2](#references)]选择是否编译MKL-DNN。
+编译时会把对应的头文件和库放在`build/third_party/install/mkldnn/*`目录下对应的地方。
+MKL-DNN的库目前只有动态库`libmkldnn.so`。
-当关闭`WITH_MKL`时，MKLML和MKL-DNN功能会同时关闭。
+### Matrix
+目前在PaddlePaddle中数据都是以`NCHW`的格式存储，但是在MKL-DNN中的排列方式不止这一种。
+所以我们定义了一个`MKLDNNMatrix`用于管理MKL-DNN数据的不同格式以及相互之间的转换。
-所以，我们会在`cmake/external`目录新建`mkldnn.cmake`和`mklml.cmake`文件，它们会在编译PaddlePaddle的时候下载对应的软件包，并放到PaddlePaddle的third party目录中。
+<div align="center">
+<img src="image/matrix.png"><br/>
+Figure 3. MKLDNNMatrix
+</div>
 ### Layers
-所有MKL-DNN相关的C++ layers，都会按照PaddlePaddle的目录结构存放在
+所有MKL-DNN的Layers都会继承于`MKLDNNLayer`，该类继承于PaddlePaddle的基类`Layer`。
-`paddle/gserver/layers`中，并且文件名都会一以*MKLDNN*开头。
+在`MKLDNNLayer`中会提供一些必要的接口和函数，并且会写好`forward`和`backward`的基本逻辑，
+子类只需要使用定义好的接口，实现具体的函数功能即可。
+<div align="center">
+<img src="image/layers.png"><br/>
+Figure 4. MKLDNNLayer
+</div>
+每个MKLDNNLayer都包含用于内部存储和外部存储的一系列MKLDNNMatrix：
-所有MKL-DNN的layers都会继承于一个叫做`MKLDNNLayer`的父类，该父类继承于PaddlePaddle的基类`Layer`。
+- 内部存储（internel memory）：`inVal_`,`inGrad_`,`outVal_`和`outGrad_`，分别代表输入数据，输入梯度，输出数据和输出梯度。
+- 外部存储（external memory）：都是以ext开头，比如`extInVal_`和`extInGrad_`，它们主要是用于，
+当数据格式与PaddlePaddle默认的`NCHW`格式不匹配时，转换内存的工作。
+需要注意的是，PaddlePaddle的activation会直接使用`output_.value`和`output_.grad`，
+所以`extOutVal_`和`extOutGrad_`必须分别与`output_.value`和`output_.grad`共享内存，
+如果不需要外部存储用于转换，那么对应的内部存储也会与它们共享内存。
+- 转换函数（resetXXX）： 包括`resetInValue`，`resetInGrad`，`resetOutValue`和`resetOutGrad`，
+表示对输入数据，输入梯度，输出数据和输出梯度的转换。
+这些函数会根据输入参数重新设置内部和外部存储，当然这两者也可以相等，即表示不需要转换。
-在`MKLDNNLayer`中会提供一些必要的接口和函数，并且会写好`forward`和`backward`的基本逻辑。部分函数定义为纯虚函数，子类只需要实现这些函数即可。
+注意：每个`MKLDNNlayer`的子类只需要使用内部存储就可以了，所有外部的转换工作都会在reset系列函数中都准备好。
 ### Activations
-由于在PaddlePaddle中，激活函数是独立于layer概念的，所以会在`paddle/gserver/activations`目录下添加`MKLDNNActivation.h`和`MKLDNNActivation.cpp`文件用于定义和使用MKL-DNN的接口。
+在重构前的PaddlePaddle中，激活函数是独立于`Layer`的概念，并且输入输出都是共用一块内存，
+所以添加了对应的`MKLDNNActivation`来实现，方式类似于`MKLDNNLayer`。
+### Parameters
+对于有参数的层，我们会保证`MKLDNNLayer`使用的参数与PaddlePaddle申请的buffer共用一块内存。
+如果存在数据排列格式不一样的情况时，我们会在网络训练之前把格式转换为MKL-DNN希望的格式，
+在训练结束的时候再保存为PaddlePaddle的格式，但是整个训练过程中不需要任何转换。
+这样既使得最终保存的参数格式与PaddlePaddle一致，又可以避免不必要的转换。
+### Gradients
+由于MKL-DNN的操作都是直接覆盖的形式，也就是说输出的结果不会在原来的数据上累加，
+这样带来的好处就是不需要一直清空memory，节省了不必要的操作。
+但是注意的是，当网络出现分支且在`backward`的时候，需要累加不同Layer传过来的梯度。
+所以在`MKLDNNlayer`中实现了一个merge的方法，此时每个小分支的`Input Gradient`
+会先临时保存在`MKLDNNMatrix`中，由分支处的Layer负责求和，并把结果放到当前层的`output_.grad`中。
+所以整体上，在实现每个子类的时候就不需要关心分支的事情了。
-### Weights
+<div align="center">
-由于有些layer是含有参数的，我们会尽量让MKL-DNN的参数与PaddlePaddle中`parameter`共享一块内存。
+<img src="image/gradients.png"><br/>
-同时，由于MKL-DNN在训练时使用的参数layout可能与PaddlePaddle默认的`nchw`不一致，我们会在网络训练的开始和结束时分别转换这个layout，使得最终保存的参数格式与PaddlePaddle一致。
+Figure 5. Merge Gradients
+</div>
 ### Unit Tests
-会在`paddle/gserver/test`目录下添加`test_MKLDNN.cpp`和`MKLDNNTester.*`用于MKL-DNN的测试。
+我们会添加`test_MKLDNN.cpp`和`MKLDNNTester.*`用于MKL-DNN的测试。
-测试分为每个layer(或activation)的单元测试和简单网络的整体测试。
+测试分为每个Layer（或Activation）的单元测试和简单网络的整体测试。
 每个测试会对比PaddlePaddle中CPU算出的结果与MKL-DNN的结果，小于某个比较小的阈值认为通过。
-### Protobuf Messages
-根据具体layer的需求可能会在`proto/ModelConfig.proto`里面添加必要的选项。
 ### Python API
 目前只考虑**v1 API**。
@@ -80,41 +172,40 @@ if use_mkldnn
    self.layer_type = mkldnn_*
 ```
-所有MKL-DNN的layer type会以*mkldnn_*开头，以示区分。 
+所有MKL-DNN的`layer_type`会以*mkldnn_*开头，这些会在`MKLDNN*Layer`注册layer的时候保证，以示区分。 
-并且可能在`python/paddle/trainer_config_helper`目录下的`activations.py `和`layers.py`里面添加必要的MKL-DNN的接口。
-### Demos
+同时,会在`paddle/utils.Flags`中添加一个`use_mkldnn`的flag，用于选择是否使用MKL-DNN的相关功能。
-会在`v1_api_demo`目录下添加一个`mkldnn`的文件夹，里面放入一些用于MKL-DNN测试的demo脚本。
 ### Benchmarking
-会添加`benchmark/paddle/image/run_mkldnn.sh`，用于测试使用MKL-DNN之后的性能。
+会添加相应的脚本在[这里](https://github.com/PaddlePaddle/Paddle/tree/develop/benchmark/paddle/image)，用于测试和对比在使用MKL-DNN前后的CNN网络性能。
+测试的性能对比结果会在[IntelOptimizedPaddle.md](https://github.com/PaddlePaddle/Paddle/blob/develop/benchmark/IntelOptimizedPaddle.md)
 ### Others
-1. 如果在使用MKL-DNN的情况下，会把CPU的Buffer对齐为64。
+1. 如果在使用MKL-DNN的情况下，会把CPU的Buffer对齐为4096，具体可以参考MKL-DNN中的[memory](https://github.com/01org/mkl-dnn/blob/master/include/mkldnn.hpp#L673)。
 2. 深入PaddlePaddle，寻找有没有其他可以优化的可能，进一步优化。比如可能会用OpenMP改进SGD的更新性能。
 ## Design Concerns
-为了更好的符合PaddlePaddle的代码风格\[[2](#references)\]，同时又尽可能少的牺牲MKL-DNN的性能\[[3](#references)\]。
+为了更好的符合PaddlePaddle的代码风格\[[3](#references)\]，同时又尽可能少的牺牲MKL-DNN的性能\[[4](#references)\]。
 我们总结出一些特别需要注意的点：
-1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数，我们决定使用已有的`deviceId_`变量来区分layer的属性，定义`-2`为`MKLDNNLayer`特有的设备ID。
+1. 使用**deviceId_**。为了尽可能少的在父类Layer中添加变量或者函数，
-2. 重写父类Layer的**init**函数，修改`deviceId_`为`-2`，代表这个layer是用于跑在MKL-DNN的环境下。
+我们决定使用已有的`deviceId_`变量来区分layer的属性，定义`-2`为`MKLDNNLayer`特有的设备ID。
-3. 创建`MKLDNNMatrix`，同时继承`CpuMatrix`和`mkldnn::memory`。用于管理MKL-DNN会用到的相关memory函数、接口以及会用的到格式信息。
+2. 重写父类Layer的**init**函数，修改`deviceId_`为`-2`，代表这个layer是用于跑在MKL-DNN的环境下。
-4. 创建`MKLDNNBase`，定义一些除了layer和memory相关的类和函数。包括MKL-DNN会用到`MKLDNNStream`和`CPUEngine`，和未来可能还会用到`FPGAEngine`等。
+3. 创建`MKLDNNBase`，定义一些除了layer和memory相关的类和函数。
-5. 每个`MKLDNNlayer`都会有`inVal_`,`inGrad_`,`outVal_`和`outGrad_`，分别代表input value， input gradient，output value和output gradient。他们会存放MKL-DNN用到的internal memory。同时还会定义以*ext*开头的`MKLDNNMatrix`(表示external的memory)，主要是在格式与PaddlePaddle默认的`nchw`格式不匹配时，用于转换内存的工作。必要的转换函数也会在`MKLDNNLayer`中提前定义好，每个子类只需要调用定义好的reset buffer函数即可。
+包括MKL-DNN会用到`MKLDNNStream`和`CPUEngine`，和未来可能还会用到`FPGAEngine`等。
-6. 每个`MKLDNNlayer`的resetbuffer相关的函数（包括reset input、output的Value和grad），他们会根据输入参数reset internal和external的memory，当然这两者也可以相等，即表示不需要转换。只需要把握一个原则，每个`MKLDNNlayer`的子类，只需要使用internal的memory就可以了，所有external的转换工作在父类的reset函数中都提前准备好了。
+4. 如果MKL-DNN layer的后面接有cpu device，那么就会使`output_.value`与`extOutVal_`共享内存，
-7. 一般来说，external的memory会尽量与PaddlePaddle中的`value`和`grad`共享内存。同时每个`MKLDNNLayer`中的external output value和gradient(也就是`extOutVal_`和`extOutGrad_`)必须分别与`output_.value`和`output_.grad`共享内存，因为PaddlePaddle的activation会直接使用`output_.value`和`output_.grad`。如果不需要external的buffer用于转换，那么internal的buffer也会与他们共享内存。
+同时数据格式就是`NCHW`，这样下一个cpu device就能拿到正确的数据。
-8. 如果MKL-DNN layer的后面接有cpu device，那么就会使`output_.value`与`extOutVal_`共享内存，同时数据格式就是`nchw`，这样下一个cpu device就能拿到正确的数据。在有cpu device的时候，external的memory的格式始终是`nchw`或者`nc`。
+在有普通的CPU layer时， `extOutVal_`和`extOutGrad_`的格式始终是`NCHW`或者`NC`。
-9. 由于MKL-DNN的输出操作都是覆盖data的，不是在原来的数据上累加，所以当网络出现分支时，在`backward`时会需要merge不同layer的梯度。`MKLDNNlayer`中会实现merge的方法，此时每个小分支的input gradient会先临时保存在一个`MKLDNNMatrix`中，由分支处的layer负责求和，并把结果放到这个layer的`output_.grad`中。所以整体上，每个子类并不会需要关心分支的事情，也是在父类都实现好了。
-10. 在原来的`FLAGS`中添加一个`use_mkldnn`的flag，用于选择是否使用MKL-DNN的相关功能。
 ## References
+1. [MKL small library](https://github.com/01org/mkl-dnn#linking-your-application)是[Intel MKL](https://software.intel.com/en-us/mkl)的一个子集。
-1. [Intel Math Kernel Library for Deep Neural Networks (Intel MKL-DNN)](https://github.com/01org/mkl-dnn "Intel MKL-DNN")
+主要包括了深度学习相关的数学原语与操作，一般由MKL-DNN在发布[新版本](https://github.com/01org/mkl-dnn/releases)时一起更新。
-2. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。但是在PaddlePaddle中，无论是重构前的layer还是重构后的op，都不会想要知道next layer/op的信息。
+2. [MKL-DNN System Requirements](https://github.com/01org/mkl-dnn#system-requirements)。
-3. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的CUDNN部分使用的也是`NCHW`，所以不存在这个问题)，所以需要引入一个转换方法，并且只需要在必要的时候转换这种格式，才能更好的发挥MKL-DNN的性能。
+目前在PaddlePaddle中，仅会在支持AVX2指令集及以上的机器才使用MKL-DNN。
+3. [原来的方案](https://github.com/PaddlePaddle/Paddle/pull/3096)会引入**nextLayer**的信息。
+但是在PaddlePaddle中，无论是重构前的layer还是重构后的op，都不会想要知道next layer/op的信息。
+4. MKL-DNN的高性能格式与PaddlePaddle原有的`NCHW`不同(PaddlePaddle中的cuDNN部分使用的也是`NCHW`，所以不存在这个问题)。
+所以需要引入一个转换方法，并且只需要在必要的时候转换这种格式，才能更好的发挥MKL-DNN的性能。
--- a/doc/design/mkldnn/image/engine.png
+++ b/doc/design/mkldnn/image/engine.png
--- a/doc/design/mkldnn/image/gradients.png
+++ b/doc/design/mkldnn/image/gradients.png
--- a/doc/design/mkldnn/image/layers.png
+++ b/doc/design/mkldnn/image/layers.png
--- a/doc/design/mkldnn/image/matrix.png
+++ b/doc/design/mkldnn/image/matrix.png
--- a/doc/design/mkldnn/image/overview.png
+++ b/doc/design/mkldnn/image/overview.png
--- a/doc/design/refactor/distributed_architecture.md
+++ b/doc/design/refactor/distributed_architecture.md
@@ -2,106 +2,70 @@
 ## Abstract
-PaddlePaddle v0.10.0 uses the "trainer-parameter server"
+PaddlePaddle version 0.10.0 uses the "trainer-parameter server" architecture. We run multiple instances of trainers (where each trainer runs the same model) and parameter servers for distributed training. This architecture serves well, but has few limitations:
-architecture. We run multiple replicated instances of trainers (runs
-the same code written by the user) and parameter servers for
-distributed training. This architecture served us well, but has some
-limitations:
-1. Need to write special code to handle tasks which should only be run
+1. There is a need to write special code that handles tasks which should only be run on a single trainer. E.g., initializing the model, saving the model etc.
-  by a single trainer. E.g., initializing model and saving model.
-2. Model parallelism is hard: need to write if-else branches conditioned
+2. Model parallelism is hard: It would need all the if-else branches conditioned on the trainer ID to partition the model onto the trainers, and eventually manually writing out the inter-model-shard communication code to communicate between different trainers.
-  on the trainer ID to partition model onto each trainer, and manually
-  write the inter-model-shard communication code.
-3. The user can not directly specify the parameter update rule: need
+3. The user can not directly specify the parameter update rule: This would need to modify the parameter server code and compile a new binary. This makes things more complicated for researchers: A lot of extra effort is required to make this work. Besides, the training job submission program may not allow running arbitrary binaries.
-   to modify the parameter server C++ code and compile a new
-   binary. This adds complication for researchers: A lot of extra
-   effort is required. Besides, the training job submission program
-   may not allow running arbitrary binaries.
-This design doc discusses PaddlePaddle's new distributed training
+This design doc discusses PaddlePaddle's new distributed training architecture that addresses the above mentioned limitations.
-architecture that addresses the above limitations.
 ## Analysis
-We will assume the user writes the trainer program by Python, the same
+The assumption is that the user writes the trainer program in either Python or C++.
-analysis holds if the trainer program is written in C++.
 ### Limitation 1
-If we look at the Python code that the user writes, there are two
+There are two basic functionalities in the trainer program:
-kinds of functionalities:
- The training logic such as load / save model and print log.
+1. The training logic such as loading / saving the model and printing out the logs.
- The neural network definition such as the definition of the data
+2. The neural network definition such as the definition of the data layer, the fully connected layer, the cost function and the
-  layer, the fully connected layer, the cost function and the
  optimizer.
-When we training with PaddlePaddle v0.10.0 distributedly, multiple
+When we train using PaddlePaddle v0.10.0 in a distributed fashion, multiple instances of the same Python code are run on different nodes, hence both: the
-replicated Python instances are running on different nodes: both the
+training logic as well as the neural network computation logic, is replicated.
-training logic and the neural network computation is replicated.
-The tasks that should only run once all belong to the training logic,
+The tasks that only need to be run once belong to the training logic. Hence if we only replicate the neural network computation part, and do **not**
-if we only replicate the neural network computation, but do **not**
+replicate the training logic, the limitation mentioned above can be avoided.
-replicate the training logic, the limitation could be solved.
 ### Limitation 2
-Model parallelism means running a single model on multiple nodes by
+Model parallelism means that a single model is partitioned into different components and each node runs one of the component separately. This comes at the extra cost of managing the
-partitioning the model onto different nodes and managing the
+inter-model-shard communication between nodes.
-inter-model-shard communications.
-PaddlePaddle should be able to modify the nerual network computation
+PaddlePaddle should ideally be able to modify the neural network computation and figure out the support for model parallelism automatically. However, the
-definition to support model parallelism automatically. However, the
+computation is only specified in Python code which sits outside of PaddlePaddle, hence PaddlePaddle can not support the feature in this setup.
-computation is only specified in Python code, and PaddlePaddle can not
-modify Python code.
-Just like compiler uses a intermediate representation (IR) so that
+Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows:
-programmer does not need to manually optimize their code in most of
-the cases - the compiler will optimize the IR:
 <img src="src/compiler.png"/>
-We can have our own IR too: PaddlePaddle can support model parallel by
+PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component:
-converting the IR so the user no longer need to manually do it in
-Python:
 <img src="src/paddle-compile.png"/>
-The IR for PaddlePaddle after refactor is called `Block`, it specifies
+The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the computation dependency graph and the variables used in the computation.
-the computation dependency graph and the variables used in the
-computation.
 ### Limitation 3
-The user can not directly specify the parameter update rule for the
+The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly.
-parameter server because the parameter server does not use the same
-computation definition as the trainer. Instead, the update rule is
-baked in the parameter server. The user can not specify the update
-rule in the same way of specifying the trainer computation.
-This could be fixed by making the parameter server run the same
+This could be fixed by making the parameter server run the same computation definition as the trainer (the user's Python module). For a detailed explanation, refer to this document -
-computation definition as the trainer. For a detailed explanation,
+[Design Doc: Operation Graph Based Parameter Server](./parameter_server.md)
-please
-see
-[Design Doc: Operation Graph Based Parameter Server](./dist_train.md)
 ## Distributed Training Architecture
-The new distributed training architecture can address the above
+The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so:
-limitations. Below is the illustration:
 <img src="src/distributed_architecture.png"/>
-The architecture includes major components: *PaddlePaddle Python*,
+The major components in the architecture are: *PaddlePaddle Python*, *PaddlePaddle converter* and *PaddlePaddle runtime*.
-*PaddlePaddle converter* and *PaddlePaddle runtime*:
 ### PaddlePaddle Python
-PaddlePaddle Python is the Python library that user's Python trainer
+PaddlePaddle Python is the Python library that user's Python code invokes, to read the data. build the neural network topology, start training, etc.
-invoke to build the neural network topology, start training, etc.
 ```Python
 paddle.init()
@@ -117,102 +81,60 @@ for i in range(1000):
 	print cost_val
 ```
-The code above is a typical Python trainer code, the neural network
+The above code is what a typical Python trainer code is, the neural network topology is built using the helper functions such as `paddle.layer.fc`. Training is done by calling `session.eval` iteratively.
-topology is built using helper functions such as
-`paddle.layer.fc`. The training is done by calling `session.eval`
-iteratively.
 #### session.eval
-As shown in the graph, `session.eval` sends the IR and the evaluation
+As shown in the graph, `session.eval` sends the IR and the evaluation inputs or targets to the PaddlePaddle cluster for evaluation.
-inputs/targets to the PaddlePaddle cluster for evaluation. The
+The targets can be any variable in the computation graph. When the target is say, the `optimizer` variable, the neural network will be optimized once. When the target is the `cost` variable, `session.eval` returns the cost value. Based on what the target is, an appropriate action is taken.
-targets can be any variable in the computation graph. When the target
-is the `optimizer` variable, the neural network will be optimized
-once. When the target is the `cost` variable, `session.eval` returns
-the cost value.
-The Python `session` is a wrapper of the C++ `Session` class. For more
+The Python `session` is a wrapper of the C++ `Session` class. For more information about `Session`, refer to this document - [Design Doc: Session](./session.md).
-information about `Session`, please
-see [Design Doc: Session](./session.md).
 ### PaddlePaddle Converter
-PaddlePaddle converter automatically converts the IR in the request
+The PaddlePaddle converter automatically converts the IR in the request (IR and evaluation inputs/targets) from PaddlePaddle Python to partitioned IRs and dispatches the new IRs and evaluation inputs/targets to different PaddlePaddle runtimes. Below are the steps that are followed :
-(IR and evaluation inputs/targets) from PaddlePaddle Python to new
-partitioned IRs and dispatch the new IRs and evaluation inputs/targets
-to different PaddlePaddle runtimes. Below are the steps:
-1. Add `feed` OP that feeds the eval inputs, and `fetch` OP that
+1. Add a `feed` OP that feeds the eval inputs, and a `fetch` OP that fetches the eval targets to the IR.
-   fetches the eval targets to the IR.
-1. Extract a new computation (sub)graph with `feed` and `fetch` OP as
+2. Extract a new computation (sub)graph with the `feed` and `fetch` OPs as the boundary. The runtime does not need to run the OP that is not dependent on the `fetch` OP.
-   the boundary. The runtime does not need to run the OP that is not
-   dependent by the `fetch` OP.
-1. Optimizes the computation graph.
+3. Optimize the computation graph.
-1. Place the OPs in the graph onto different devices on different
+4. Place the OPs in the graph onto different devices on different PaddlePaddle runtime according to a placement algorithm and the device constraints specified by the user.
-   PaddlePaddle runtime according to a placement algorithm and device
-   constraint specified by the user.
-1. Partition the graph according to runtime boundaries and add `send` /
+5. Partition the graph according to runtime boundaries and add `send` / `recv` OP pair on the runtime boundaries.
-   `recv` OP pair on the runtime boundaries.
-1. Dispatch the partitioned graph to different PaddlePaddle runtimes.
+6. Dispatch the partitioned graph to different PaddlePaddle runtimes.
+7. PaddlePaddle runtimes with the `fetch` OP reports evaluation results back to the converter, the converter reports the evaluation results back to the PaddlePaddle Python.
-1. PaddlePaddle runtimes with the `fetch` OP reports evaluation
-   results back to the converter, the convert reports the evaluation
-   results back to the PaddlePaddle Python.
 The output IRs will be cached to optimize the conversion latency.
 #### Placement Algorithm
-Our first implementation will only support "trainer-parameter server"
+Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible.
-placement: the parameters, initializers, and optimizers are placed on
-the PaddlePaddle runtimes with the parameter server role. And
-everything else will be placed on the PaddlePaddle runtimes with the
-trainer role. This has the same functionality of our
-"trainer-parameter server" architecture of PaddlePaddle v0.10.0, but
-is more general and flexible.
-In the future, we will implement the general placement algorithm,
+In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm.
-which makes placements according to the input IR, and a model of
-device computation time and device communication time. Model
-parallelism requires the general placement algorithm.
 ### PaddlePaddle Runtime
-The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and
+The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and runs the IR. The runtime does not need to do OP placement since it is already done by the converter.
-runs the IR. The runtime does not need to do OP placement since it's
-already done by the converter.
 ### Local Training Architecture
-The local training architecture will be the same as the distributed
+The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
-training architecture, the differences are everything runs locally,
-and there is just one PaddlePaddle runtime:
 <img src="src/local_architecture.png"/>
 ### Training Data
-In PaddlePaddle v0.10.0, training data is typically read
+In PaddlePaddle v0.10.0, training data is typically read with a [data reader](../reader/README.md) from Python. This approach is no longer efficient when training in a distributed fashion since the Python process no longer runs on the same node with the trainer processes. The Python reader will need to read from the distributed filesystem (assuming it has the required access) and send to the trainers, doubling the network traffic.
-with [data reader](../reader/README.md) from Python. This approach is
-no longer efficient when training distributedly since the Python
+When doing distributed training, the user can still use Python data reader: the training data are sent with `session.eval`. However this should be used for debugging purpose only. The users are encouraged to use the read data OPs.
-process no longer runs on the same node with the trainer processes,
-the Python reader will need to read from the distributed filesystem
-(assuming it has the access) and send to the trainers, doubling the
-network traffic.
-When doing distributed training, the user can still use Python data
-reader: the training data are sent with `session.eval`. However should
-be used for debugging purpose only. The users are encouraged to use
-the read data OPs.
 ## References:

--- a/doc/design/releasing_process.md
+++ b/doc/design/releasing_process.md
@@ -5,8 +5,9 @@ PaddlePaddle使用git-flow branching model做分支管理，使用[Semantic Vers
 PaddlePaddle每次发新的版本，遵循以下流程:
 1. 从`develop`分支派生出新的分支，分支名为`release/版本号`。例如，`release/0.10.0`
-2. 将新分支的版本打上tag，tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`，第二个为`0.10.0rc2`，依次类推。
+1. 将新分支的版本打上tag，tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`，第二个为`0.10.0rc2`，依次类推。
-3. 对这个版本的提交，做如下几个操作:
+1. 对这个版本的提交，做如下几个操作:
+	* 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。
 	* 编译这个版本的Docker发行镜像，发布到dockerhub。如果失败，修复Docker编译镜像问题，Patch号加一，返回第二步
 	* 编译这个版本的Ubuntu Deb包。如果失败，修复Ubuntu Deb包编译问题，Patch号加一，返回第二步。
 	* 使用Regression Test List作为检查列表，测试Docker镜像/ubuntu安装包的功能正确性
@@ -20,9 +21,9 @@ PaddlePaddle每次发新的版本，遵循以下流程:
 			pip install twine
 			twine upload dist/[package to upload]
 			```
-4. 第三步完成后，将`release/版本号`分支合入master分支，并删除`release/版本号`分支。将master分支的合入commit打上tag，tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。
+1. 第三步完成后，将`release/版本号`分支合入master分支，并删除`release/版本号`分支。将master分支的合入commit打上tag，tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。
-5. 编译master分支的Docker发行镜像，发布到dockerhub。编译ubuntu的deb包，发布到github release页面
+1. 编译master分支的Docker发行镜像，发布到dockerhub。编译ubuntu的deb包，发布到github release页面
-6. 协同完成Release Note的书写
+1. 协同完成Release Note的书写
 需要注意的是:
@@ -30,7 +31,7 @@ PaddlePaddle每次发新的版本，遵循以下流程:
 * `release/版本号`分支一旦建立，一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭，方便测试人员测试PaddlePaddle的行为。
 * 在`release/版本号`分支存在的时候，如果有bugfix的行为，需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。
-# PaddlePaddle 分支规范
+## PaddlePaddle 分支规范
 PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范，并适应github的特性做了一些区别。
@@ -47,11 +48,11 @@ PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-
 * BugFix分支也是在开发者自己的fork版本库维护，与功能分支不同的是，BugFix分支需要分别给主版本库的`master`、`develop`与可能有的`release/版本号`分支，同时提起`Pull Request`。
-# PaddlePaddle回归测试列表
+## PaddlePaddle回归测试列表
 本列表说明PaddlePaddle发版之前需要测试的功能点。
-## PaddlePaddle Book中所有章节
+### PaddlePaddle Book中所有章节
 PaddlePaddle每次发版本首先要保证PaddlePaddle Book中所有章节功能的正确性。功能的正确性包括验证PaddlePaddle目前的`paddle_trainer`训练和纯使用`Python`训练模型正确性。

--- a/doc/getstarted/build_and_install/build_from_source_cn.rst
+++ b/doc/getstarted/build_and_install/build_from_source_cn.rst
-从源码编译PaddlePaddle
+从源码编译
 ======================
 .. _build_step:
@@ -7,8 +7,11 @@
 ----------------
 PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译工具。
-我们推荐您使用PaddlePaddle编译环境镜像完成编译，这样可以免去单独安装编译依赖的步骤，可选的不同编译环境
+我们推荐您使用PaddlePaddle Docker编译环境镜像完成编译，这样可以免去单独安装编译依赖的步骤，可选的不同编译环境Docker镜像
 可以在 `这里 <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ 找到。
+如果您选择不使用Docker镜像，则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。
 编译PaddlePaddle，需要执行：
 .. code-block:: bash
@@ -16,22 +19,47 @@ PaddlePaddle主要使用 `CMake <https://cmake.org>`_ 以及GCC, G++作为编译
   git clone https://github.com/PaddlePaddle/Paddle.git
   cd Paddle
   # 如果使用Docker编译环境，执行下面的命令编译CPU-Only的二进制
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
   # 如果不使用Docker编译环境，执行下面的命令
   mkdir build
   cd build
   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
   make
 编译完成后会在build/python/dist目录下生成输出的whl包，可以选在在当前机器安装也可以拷贝到目标机器安装：
 .. code-block:: bash
-   pip install python/dist/*.whl
+   pip install build/python/dist/*.whl
-.. _build_step:
+.. _run_test:
+执行单元测试
+----------------
+如果您期望在编译完成后立即执行所有的单元测试，可以按照下面的方法：
+使用Docker的情况下，设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后，立即执行单元测试。
+开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。
+.. code-block:: bash
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+如果不使用Docker，可以执行ctest命令即可：
+.. code-block:: bash
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
+   make
+   ctest
+   # 指定执行其中一个单元测试 test_mul_op
+   ctest -R test_mul_op
+.. _compile_deps:
 编译依赖
 ----------------

--- a/doc/getstarted/build_and_install/build_from_source_en.rst
+++ b/doc/getstarted/build_and_install/build_from_source_en.rst
-Build PaddlePaddle from Sources
+Build from Sources
 ==========================
 .. _build_step:
@@ -9,15 +9,19 @@ How To Build
 PaddlePaddle mainly uses `CMake <https://cmake.org>`_ and GCC, G++ as compile
 tools. We recommend you to use our pre-built Docker image to run the build
 to avoid installing dependencies by yourself. We have several build environment
-Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_.
+Docker images `here <https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/>`_ .
+If you choose not to use Docker image for your build, you need to install the
+below `Compile Dependencies`_ before run the build.
 Then run:
 .. code-block:: bash
   git clone https://github.com/PaddlePaddle/Paddle.git
   cd Paddle
-   # run the following command to build CPU-Only binaries if you are using docker
+   # run the following command to build a CPU-Only binaries if you are using docker
-   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
   # else run these commands
   mkdir build
   cd build
@@ -30,9 +34,37 @@ machine or copy it to the target machine.
 .. code-block:: bash
-   pip install python/dist/*.whl
+   pip install build/python/dist/*.whl
-.. _build_step:
+.. _run_test:
+Run Tests
+----------------
+If you wish to run the tests, you may follow the below steps:
+When using Docker, set :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` will run test immediately after the build.
+Set :code:`WITH_GPU=ON` Can also run tests on GPU.
+.. code-block:: bash
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/paddle/scripts/docker/build.sh
+If you don't use Docker, just run ctest will start the tests:
+.. code-block:: bash
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=ON ..
+   make
+   ctest
+   # run a single test like test_mul_op
+   ctest -R test_mul_op
+.. _compile_deps:
 Compile Dependencies
 ----------------
@@ -85,7 +117,7 @@ You can add :code:`-D` argument to pass such options, like:
    "WITH_PYTHON", "Build with integrated Python interpreter", "ON"
    "WITH_STYLE_CHECK", "Check code style when building", "ON"
    "WITH_TESTING", "Build unit tests", "ON"
-    "WITH_DOC", "Build documentaions", "OFF"
+    "WITH_DOC", "Build documentations", "OFF"
    "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
    "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
    "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON"

--- a/doc/getstarted/build_and_install/docker_install_cn.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
-使用Docker安装运行PaddlePaddle
+使用Docker安装运行
 ================================
 使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境即可运行。并且也可以在Windows的docker中运行。

--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
-PaddlePaddle in Docker Containers
+Run in Docker Containers
 =================================
 Run PaddlePaddle in Docker container so that you don't need to care about

--- a/doc/getstarted/build_and_install/index_cn.rst
+++ b/doc/getstarted/build_and_install/index_cn.rst
@@ -13,7 +13,7 @@ PaddlePaddle提供pip和Docker的安装方式：
   pip_install_cn.rst
   docker_install_cn.rst
+   ../../howto/dev/build_cn.md
 编译流程
 ++++++++

--- a/doc/getstarted/build_and_install/index_en.rst
+++ b/doc/getstarted/build_and_install/index_en.rst
@@ -13,6 +13,7 @@ You can choose either pip or Docker to complete your install:
   pip_install_en.rst
   docker_install_en.rst
+   ../../howto/dev/build_en.md
 Build from Source

--- a/doc/getstarted/build_and_install/pip_install_cn.rst
+++ b/doc/getstarted/build_and_install/pip_install_cn.rst
-使用pip安装PaddlePaddle
+使用pip安装
 ================================
 PaddlePaddle可以使用常用的Python包管理工具
@@ -34,7 +34,7 @@ PaddlePaddle可以使用常用的Python包管理工具
   :align: center
 ..  csv-table:: 各个版本最新的whl包
-    :header: "版本说明", "cp27-cp27mu", "cp27-cp27mu", "C-API"
+    :header: "版本说明", "cp27-cp27mu", "cp27-cp27m", "C-API"
    :widths: 1, 3, 3, 3
    "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
@@ -83,4 +83,4 @@ PaddlePaddle发布的安装包会尽量对齐 `manylinux1 <https://www.python.or
  获取当前系统支持的安装包格式，并检查和需安装的包是否匹配。pypi安装包可以在 `这个 <https://pypi.python.org/pypi/paddlepaddle/0.10.5>`_ 链接中找到。
  如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ，需要升级pip版本到最新； 如果系统支持 manylinux1_x86_64 而安装包（本地）是 linux_x86_64 ，可以重命名这个whl包为 manylinux1_x86_64 再安装。
\ No newline at end of file
--- a/doc/getstarted/build_and_install/pip_install_en.rst
+++ b/doc/getstarted/build_and_install/pip_install_en.rst
-Install PaddlePaddle Using pip
+Install Using pip
 ================================
 You can use current widely used Python package management
@@ -37,7 +37,7 @@ If the links below shows up the login form, just click "Log in as guest" to star
   :align: center
 ..  csv-table:: whl package of each version
-    :header: "version", "cp27-cp27mu", "cp27-cp27mu", "C-API"
+    :header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API"
    :widths: 1, 3, 3, 3
    "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <http://guest@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"

--- a/doc/howto/dev/build_cn.md
+++ b/doc/howto/dev/build_cn.md
-# 编译PaddlePaddle和运行单元测试
+# 用Docker编译和测试PaddlePaddle
 ## 需要的软硬件

--- a/doc/howto/dev/build_en.md
+++ b/doc/howto/dev/build_en.md
-# Build PaddlePaddle from Source Code and Run Unit Test
+# Build using Docker
 ## What Developers Need

--- a/doc/howto/dev/write_docs_cn.rst
+++ b/doc/howto/dev/write_docs_cn.rst
@@ -3,12 +3,64 @@
 ##################
 PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成，生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。
+也可以利用PaddlePaddle 工具来编译文档，这个情况下所有的文件会存在整理过的的文件目录 .ppo_workspace/content 下
 如何构建文档
 ============
-PaddlePaddle的文档构建有两种方式。
+PaddlePaddle的文档构建有三种方式。
+使用PaddlePaddle.org工具
+--------------
+这个是目前推荐的使用方法。除了可以自动编译文档，也可以直接在网页预览文档。
+文件工具是使用Docker，需要在系统里先安装好Docker工具包。Docker安装请参考Docker的官网。安装好Docker之后及可用以下命令启动工具
+..  code-block:: bash
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+    # Clone the content repositories
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+    # Please specify the working directory through -v
+    docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
+注意: PaddlePaddle.org 会在 -v (volume) 指定的内容存储库运行命令
+之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档
+编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
+如果不想使用 Docker，你还可以通过运行Django框架直接激活工具的服务器。使用下面的命令来运行它。
+..  code-block:: bash
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+    # Clone the content repositories and PaddlePaddle.org
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+    git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git
+    # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd
+    export CONTENT_DIR=<path_to_paddlepaddle_working_directory>
+    export ENV=''
+    cd PaddlePaddle.org/portal/
+    pip install -r requirements.txt
+    python manage.py runserver
+工具服务器将读取环境变量 CONTENT_DIR 搜索代码库。请指定的PaddlePaddle工作目录给环境变量 CONTENT_DIR。
+之后再用网页连到http://localhost:8000就可以在网页上生成需要的文档。
+编译后的文件将被存储在工作目录 <paddlepaddle working directory>/.ppo_workspace/content。
+想了解更多PaddlePaddle.org工具的详细信息，可以 `点击这里 <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.cn.md>`_ 。
 使用Docker构建
 --------------
@@ -47,17 +99,12 @@ PaddlePaddle的文档构建有两种方式。
 PaddlePaddle文档使用 `sphinx`_ 自动生成，用户可以参考sphinx教程进行书写。
-如何更新文档主题
+如何更新www.paddlepaddle.org
-================
-PaddlePaddle文档主题在 `TO_YOUR_PADDLE_CLONE_PATH/doc_theme` 文件夹下，包含所有和前端网页设计相关的文件。
-如何更新doc.paddlepaddle.org
 ============================
-更新的文档以PR的形式提交到github中，提交方式参见 `贡献文档 <http://doc.paddlepaddle.org/develop/doc_cn/howto/dev/contribute_to_paddle_cn.html>`_ 。
+更新的文档以PR的形式提交到github中，提交方式参见 `贡献文档 <http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html>`_ 。
-目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 <http://doc.paddlepaddle.org/develop/doc_cn/>`_ 和
+目前PaddlePaddle的develop分支的文档是自动触发更新的，用户可以分别查看最新的 `中文文档 <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ 和
-`英文文档 <http://doc.paddlepaddle.org/develop/doc/>`_ 。
+`英文文档 <http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html>`_ 。
 ..  _cmake: https://cmake.org/

--- a/doc/howto/dev/write_docs_en.rst
+++ b/doc/howto/dev/write_docs_en.rst
+##################
+Contribute Documentation
+##################
+PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``.
+Both are compiled by `cmake`_ and `sphinx`_ , the compiled documentations will be stored under ``doc`` and ``doc_cn`` directories.
+When using the PaddlePaddle.org to compile documentations, the compiled documentations will be stored under a consolidated directory: .ppo_workspace/content
+How to Build Documentations
+============
+We recommend using PaddlePaddle.org tool to build documentation
+Use PaddlePaddle.org tool
+--------------
+This is the recommended method to build documentation. It can compile documentation and preview the documentation in a web browser.
+The tool uses Docker, please install it on your system. Please check Docker official website on how to install Docker. You may use the following commands to activate the tool
+..  code-block:: bash
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+    # Clone the content repositories. You may only clone the contents you need
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+    # Please specify the working directory through -v
+    docker run -it -p 8000:8000 -v `pwd`:/var/content paddlepaddle/paddlepaddle.org:latest
+Note: PaddlePaddle.org will read the content repos specified in the -v (volume) flag of the docker run command
+Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation
+The compiled documentations will be stored in <paddlepaddle working directory>/.ppo_workspace/content
+If you don't wish to use Docker, you can also activate the tool through Django. Use the following the commands to set up
+..  code-block:: bash
+    mkdir paddlepaddle # Create paddlepaddle working directory
+    cd paddlepaddle
+    # Clone the content repositories and PaddlePaddle.org
+    git clone https://github.com/PaddlePaddle/Paddle.git
+    git clone https://github.com/PaddlePaddle/book.git
+    git clone https://github.com/PaddlePaddle/models.git
+    git clone https://github.com/PaddlePaddle/Mobile.git
+    git clone https://github.com/PaddlePaddle/PaddlePaddle.org.git
+    # Please specify the PaddlePaddle working directory. In the current setting, it should be pwd
+    export CONTENT_DIR=<path_to_paddlepaddle_working_directory>
+    export ENV=''
+    cd PaddlePaddle.org/portal/
+    pip install -r requirements.txt
+    python manage.py runserver
+Use a web browser and navigate to http://localhost:8000, click the buttons to compile the documentation
+The compiled documentations will be stored in <paddlepaddle working directory>/.ppo_workspace/content
+If you want to learn more on the PaddlePaddle.org, please `click here <https://github.com/PaddlePaddle/PaddlePaddle.org/blob/develop/README.md>`_ 。
+How to write Documentations
+============
+PaddlePaddle uses `sphinx`_ to compile documentations，Please check sphinx official website for more detail.
+How to update www.paddlepaddle.org
+============================
+Please create PRs and submit them to github, please check `Contribute Code <http://www.paddlepaddle.org/docs/develop/documentation/en/howto/dev/contribute_to_paddle_en.html>`_ 。
+PaddlePaddle develop branch will update the documentation once the PR is merged. User may check latest `Chinese Docs <http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/index_cn.html>`_ and
+`English Docs <http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/index_en.html>`_ 。
+..  _cmake: https://cmake.org/
+..  _sphinx: http://www.sphinx-doc.org/en/1.4.8/
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -19,7 +19,7 @@
 ..  toctree::
  :maxdepth: 1
-  dev/build_cn.rst
+  dev/contribute_to_paddle_cn.md
  dev/write_docs_cn.rst
 模型配置

--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -18,9 +18,9 @@ Development
 ..  toctree::
  :maxdepth: 1
-  dev/build_en.rst
  dev/new_layer_en.rst
  dev/contribute_to_paddle_en.md
+  dev/write_docs_en.rst
 Configuration
 -------------

--- a/doc/howto/optimization/cpu_profiling.md
+++ b/doc/howto/optimization/cpu_profiling.md
-此教程会介绍如何使用Python的cProfile包，与Python库yep，google perftools来运行性能分析(Profiling)与调优。
+This tutorial introduces techniques we use to profile and tune the
+CPU performance of PaddlePaddle.  We will use Python packages
+`cProfile` and `yep`, and Google's `perftools`.
-运行性能分析可以让开发人员科学的，有条不紊的对程序进行性能优化。性能分析是性能调优的基础。因为在程序实际运行中，真正的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。
+Profiling is the process that reveals performance bottlenecks,
+which could be very different from what's in the developers' mind.
+Performance tuning is done to fix these bottlenecks. Performance optimization
+repeats the steps of profiling and tuning alternatively.
-性能优化的步骤，通常是循环重复若干次『性能分析 --> 寻找瓶颈 ---> 调优瓶颈 --> 性能分析确认调优效果』。其中性能分析是性能调优的至关重要的量化指标。
+PaddlePaddle users program AI applications by calling the Python API, which calls
+into `libpaddle.so.` written in C++.  In this tutorial, we focus on
+the profiling and tuning of
-Paddle提供了Python语言绑定。用户使用Python进行神经网络编程，训练，测试。Python解释器通过`pybind`和`swig`调用Paddle的动态链接库，进而调用Paddle C++部分的代码。所以Paddle的性能分析与调优分为两个部分:
+1. the Python code and
+1. the mixture of Python and C++ code.
-* Python代码的性能分析
+## Profiling the Python Code
-* Python与C++混合代码的性能分析
+### Generate the Performance Profiling File
-## Python代码的性能分析
+We can use Python standard
+package, [`cProfile`](https://docs.python.org/2/library/profile.html),
-### 生成性能分析文件
+to generate Python profiling file.  For example:
-Python标准库中提供了性能分析的工具包，[cProfile](https://docs.python.org/2/library/profile.html)。生成Python性能分析的命令如下:
 ```bash
 python -m cProfile -o profile.out main.py
 ```
-其中`-o`标识了一个输出的文件名，用来存储本次性能分析的结果。如果不指定这个文件，`cProfile`会打印一些统计信息到`stdout`。这不方便我们进行后期处理(进行`sort`, `split`, `cut`等等)。
+where `main.py` is the program we are going to profile, `-o` specifies
+the output file.  Without `-o`, `cProfile` would outputs to standard
-### 查看性能分析文件
+output.
-当main.py运行完毕后，性能分析结果文件`profile.out`就生成出来了。我们可以使用[cprofilev](https://github.com/ymichael/cprofilev)来查看性能分析结果。`cprofilev`是一个Python的第三方库。使用它会开启一个HTTP服务，将性能分析结果以网页的形式展示出来。
+### Look into the Profiling File
-使用`pip install cprofilev`安装`cprofilev`工具。安装完成后，使用如下命令开启HTTP服务
+`cProfile` generates `profile.out` after `main.py` completes. We can
+use [`cprofilev`](https://github.com/ymichael/cprofilev) to look into
+the details:
 ```bash
 cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
 ```
-其中`-a`标识HTTP服务绑定的IP。使用`0.0.0.0`允许外网访问这个HTTP服务。`-p`标识HTTP服务的端口。`-f`标识性能分析的结果文件。`main.py`标识被性能分析的源文件。
+where `-a` specifies the HTTP IP, `-p` specifies the port, `-f`
+specifies the profiling file, and `main.py` is the source file.
-访问对应网址，即可显示性能分析的结果。性能分析结果格式如下:
+Open the Web browser and points to the local IP and the specifies
+port, we will see the output like the following:
-```text
+```
   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.284    0.284   29.514   29.514 main.py:1(<module>)
     4696    0.128    0.000   15.748    0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run)
@@ -44,23 +54,23 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
        1    0.144    0.144    6.534    6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14(<module>)
 ```
-每一列的含义是:
+where each line corresponds to Python function, and the meaning of
+each column is as follows:
-| 列名 | 含义 |
+| column | meaning |
 | --- | --- |
-| ncalls | 函数的调用次数 |
+| ncalls | the number of calls into a function |
-| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 |
+| tottime | the total execution time of the function, not including the
-| percall | tottime的每次调用平均时间 |
+ execution time of other functions called by the function |
-| cumtime | 函数总时间。包含这个函数调用其他函数的时间 |
+| percall | tottime divided by ncalls |
-| percall | cumtime的每次调用平均时间 |
+| cumtime | the total execution time of the function, including the execution time of other functions being called |
-| filename:lineno(function) | 文件名, 行号，函数名 |
+| percall | cumtime divided by ncalls |
+| filename:lineno(function) | where the function is defined |
+### Identify Performance Bottlenecks
-### 寻找性能瓶颈
+Usually, `tottime` and the related `percall` time is what we want to
+focus on. We can sort above profiling file by tottime:
-通常`tottime`和`cumtime`是寻找瓶颈的关键指标。这两个指标代表了某一个函数真实的运行时间。
-将性能分析结果按照tottime排序，效果如下:
 ```text
     4696   12.040    0.003   12.040    0.003 {built-in method run}
@@ -68,12 +78,15 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
   107991    0.676    0.000    1.519    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__)
     4697    0.626    0.000    2.291    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)
        1    0.618    0.618    0.618    0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1(<module>)
 ```
-可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python与C++混合代码的性能分析`来进行调优。而`sync_with_cpp`函数的总共耗时很长，每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息，了解其调用关系。
+We can see that the most time-consuming function is the `built-in
+method run`, which is a C++ function in `libpaddle.so`.  We will
+explain how to profile C++ code in the next section.  At this 
+moment, let's look into the third function `sync_with_cpp`, which is a
+Python function.  We can click it to understand more about it:
-```text
+```
 Called By:
   Ordered by: internal time
@@ -92,72 +105,93 @@ Called:
   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
 ```
-通常观察热点函数间的调用关系，和对应行的代码，就可以了解到问题代码在哪里。当我们做出性能修正后，再次进行性能分析(profiling)即可检查我们调优后的修正是否能够改善程序的性能。
+The lists of the callers of `sync_with_cpp` might help us understand
+how to improve the function definition.
+## Profiling Python and C++ Code
+### Generate the Profiling File
-## Python与C++混合代码的性能分析
+To profile a mixture of Python and C++ code, we can use a Python
+package, `yep`, that can work with Google's `perftools`, which is a
+commonly-used profiler for C/C++ code.
-### 生成性能分析文件
+In Ubuntu systems, we can install `yep` and `perftools` by running the
+following commands:
-C++的性能分析工具非常多。常见的包括`gprof`, `valgrind`, `google-perftools`。但是调试Python中使用的动态链接库与直接调试原始二进制相比增加了很多复杂度。幸而Python的一个第三方库`yep`提供了方便的和`google-perftools`交互的方法。于是这里使用`yep`进行Python与C++混合代码的性能分析
-使用`yep`前需要安装`google-perftools`与`yep`包。ubuntu下安装命令为
 ```bash
+apt update
 apt install libgoogle-perftools-dev
 pip install yep
 ```
-安装完毕后，我们可以通过
+Then we can run the following command
 ```bash
 python -m yep -v main.py
 ```
-生成性能分析文件。生成的性能分析文件为`main.py.prof`。
+to generate the profiling file.  The default filename is
+`main.py.prof`.
+Please be aware of the `-v` command line option, which prints the
+analysis results after generating the profiling file.  By examining the
+ the print result, we'd know that if we stripped debug
+information from `libpaddle.so` at build time.  The following hints
+help make sure that the analysis results are readable:
-命令行中的`-v`指定在生成性能分析文件之后，在命令行显示分析结果。我们可以在命令行中简单的看一下生成效果。因为C++与Python不同，编译时可能会去掉调试信息，运行时也可能因为多线程产生混乱不可读的性能分析结果。为了生成更可读的性能分析结果，可以采取下面几点措施:
+1. Use GCC command line option `-g` when building `libpaddle.so` so to
+   include the debug information.  The standard building system of
+   PaddlePaddle is CMake, so you might want to set
+   `CMAKE_BUILD_TYPE=RelWithDebInfo`.
-1. 编译时指定`-g`生成调试信息。使用cmake的话，可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。
+1. Use GCC command line option `-O2` or `-O3` to generate optimized
-2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。
+   binary code. It doesn't make sense to profile `libpaddle.so`
-3. 运行性能分析的时候，先从单线程开始，再开启多线程，进而多机。毕竟如果单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。
+   without optimization, because it would anyway run slowly.
-### 查看性能分析文件
+1. Profiling the single-threaded binary file before the
+   multi-threading version, because the latter often generates tangled
+   profiling analysis result.  You might want to set environment
+   variable `OMP_NUM_THREADS=1` to prevents OpenMP from automatically
+   starting multiple threads.
-在运行完性能分析后，会生成性能分析结果文件。我们可以使用[pprof](https://github.com/google/pprof)来显示性能分析结果。注意，这里使用了用`Go`语言重构后的`pprof`，因为这个工具具有web服务界面，且展示效果更好。
+### Examining the Profiling File
-安装`pprof`的命令和一般的`Go`程序是一样的，其命令如下:
+The tool we used to examine the profiling file generated by
+`perftools` is [`pprof`](https://github.com/google/pprof), which
+provides a Web-based GUI like `cprofilev`.
+We can rely on the standard Go toolchain to retrieve the source code
+of `pprof` and build it:
 ```bash
 go get github.com/google/pprof
 ```
-进而我们可以使用如下命令开启一个HTTP服务:
+Then we can use it to profile `main.py.prof` generated in the previous
+section:
 ```bash
 pprof -http=0.0.0.0:3213 `which python`  ./main.py.prof
 ```
-这行命令中，`-http`指开启HTTP服务。`which python`会产生当前Python二进制的完整路径，进而指定了Python可执行文件的路径。`./main.py.prof`输入了性能分析结果。
+Where `-http` specifies the IP and port of the HTTP service.
+Directing our Web browser to the service, we would see something like
-访问对应的网址，我们可以查看性能分析的结果。结果如下图所示:
+the following:
 ![result](./pprof_1.png)
+### Identifying the Performance Bottlenecks
-### 寻找性能瓶颈
+Similar to how we work with `cprofilev`, we'd focus on `tottime` and
+`cumtime`.
-与寻找Python代码的性能瓶颈类似，寻找Python与C++混合代码的性能瓶颈也是要看`tottime`和`cumtime`。而`pprof`展示的调用图也可以帮助我们发现性能中的问题。
-例如下图中，
 ![kernel_perf](./pprof_2.png)
-在一次训练中，乘法和乘法梯度的计算占用2%-4%左右的计算时间。而`MomentumOp`占用了17%左右的计算时间。显然，`MomentumOp`的性能有问题。
+We can see that the execution time of multiplication and the computing
+of the gradient of multiplication takes 2% to 4% of the total running
-在`pprof`中，对于性能的关键路径都做出了红色标记。先检查关键路径的性能问题，再检查其他部分的性能问题，可以更有次序的完成性能的优化。
+time, and `MomentumOp` takes about 17%. Obviously, we'd want to
+optimize `MomentumOp`.
-## 总结
-至此，两种性能分析的方式都介绍完毕了。希望通过这两种性能分析的方式，Paddle的开发人员和使用人员可以有次序的，科学的发现和解决性能问题。
+`pprof` would mark performance critical parts of the program in
+red. It's a good idea to follow the hints.
--- a/doc/howto/optimization/cpu_profiling_cn.md
+++ b/doc/howto/optimization/cpu_profiling_cn.md
+此教程会介绍如何使用Python的cProfile包、Python库yep、Google perftools来进行性能分析 (profiling) 与调优（performance tuning）。
+Profling 指发现性能瓶颈。系统中的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。Tuning 指消除瓶颈。性能优化的过程通常是不断重复地 profiling 和 tuning。
+PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大部分 Python API 调用用 C++ 写的 libpaddle.so。所以 PaddlePaddle 的性能分析与调优分为两个部分:
+* Python 代码的性能分析
+* Python 与 C++ 混合代码的性能分析
+## Python代码的性能分析
+### 生成性能分析文件
+Python标准库中提供了性能分析的工具包，[cProfile](https://docs.python.org/2/library/profile.html)。生成Python性能分析的命令如下:
+```bash
+python -m cProfile -o profile.out main.py
+```
+其中 `main.py` 是我们要分析的程序，`-o`标识了一个输出的文件名，用来存储本次性能分析的结果。如果不指定这个文件，`cProfile`会打印到标准输出。
+### 查看性能分析文件
+`cProfile` 在main.py 运行完毕后输出`profile.out`。我们可以使用[`cprofilev`](https://github.com/ymichael/cprofilev)来查看性能分析结果。`cprofilev`是一个Python的第三方库。使用它会开启一个HTTP服务，将性能分析结果以网页的形式展示出来：
+```bash
+cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
+```
+其中`-a`标识HTTP服务绑定的IP。使用`0.0.0.0`允许外网访问这个HTTP服务。`-p`标识HTTP服务的端口。`-f`标识性能分析的结果文件。`main.py`标识被性能分析的源文件。
+用Web浏览器访问对应网址，即可显示性能分析的结果：
+```
+   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+        1    0.284    0.284   29.514   29.514 main.py:1(<module>)
+     4696    0.128    0.000   15.748    0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run)
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+        1    0.144    0.144    6.534    6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14(<module>)
+```
+每一列的含义是:
+| 列名 | 含义 |
+| --- | --- |
+| ncalls | 函数的调用次数 |
+| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 |
+| percall | tottime的每次调用平均时间 |
+| cumtime | 函数总时间。包含这个函数调用其他函数的时间 |
+| percall | cumtime的每次调用平均时间 |
+| filename:lineno(function) | 文件名, 行号，函数名 |
+### 寻找性能瓶颈
+通常`tottime`和`cumtime`是寻找瓶颈的关键指标。这两个指标代表了某一个函数真实的运行时间。
+将性能分析结果按照tottime排序，效果如下:
+```text
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+   300005    0.874    0.000    1.681    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader)
+   107991    0.676    0.000    1.519    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__)
+     4697    0.626    0.000    2.291    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)
+        1    0.618    0.618    0.618    0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1(<module>)
+```
+可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长，每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息，了解其调用关系。
+```text
+Called By:
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+Function                                                                                                 was called by...
+                                                                                                             ncalls  tottime  cumtime
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)  <-    4697    0.626    2.291  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)  <-    4696    0.019    2.316  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone)
+                                                                                                                  1    0.000    0.001  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward)
+Called:
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+```
+通常观察热点函数间的调用关系，和对应行的代码，就可以了解到问题代码在哪里。当我们做出性能修正后，再次进行性能分析(profiling)即可检查我们调优后的修正是否能够改善程序的性能。
+## Python与C++混合代码的性能分析
+### 生成性能分析文件
+C++的性能分析工具非常多。常见的包括`gprof`, `valgrind`, `google-perftools`。但是调试Python中使用的动态链接库与直接调试原始二进制相比增加了很多复杂度。幸而Python的一个第三方库`yep`提供了方便的和`google-perftools`交互的方法。于是这里使用`yep`进行Python与C++混合代码的性能分析
+使用`yep`前需要安装`google-perftools`与`yep`包。ubuntu下安装命令为
+```bash
+apt update
+apt install libgoogle-perftools-dev
+pip install yep
+```
+安装完毕后，我们可以通过
+```bash
+python -m yep -v main.py
+```
+生成性能分析文件。生成的性能分析文件为`main.py.prof`。
+命令行中的`-v`指定在生成性能分析文件之后，在命令行显示分析结果。我们可以在命令行中简单的看一下生成效果。因为C++与Python不同，编译时可能会去掉调试信息，运行时也可能因为多线程产生混乱不可读的性能分析结果。为了生成更可读的性能分析结果，可以采取下面几点措施:
+1. 编译时指定`-g`生成调试信息。使用cmake的话，可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。
+2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。
+3. 运行性能分析的时候，先从单线程开始，再开启多线程，进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。
+### 查看性能分析文件
+在运行完性能分析后，会生成性能分析结果文件。我们可以使用[`pprof`](https://github.com/google/pprof)来显示性能分析结果。注意，这里使用了用`Go`语言重构后的`pprof`，因为这个工具具有web服务界面，且展示效果更好。
+安装`pprof`的命令和一般的`Go`程序是一样的，其命令如下:
+```bash
+go get github.com/google/pprof
+```
+进而我们可以使用如下命令开启一个HTTP服务:
+```bash
+pprof -http=0.0.0.0:3213 `which python`  ./main.py.prof
+```
+这行命令中，`-http`指开启HTTP服务。`which python`会产生当前Python二进制的完整路径，进而指定了Python可执行文件的路径。`./main.py.prof`输入了性能分析结果。
+访问对应的网址，我们可以查看性能分析的结果。结果如下图所示:
+![result](./pprof_1.png)
+### 寻找性能瓶颈
+与寻找Python代码的性能瓶颈类似，寻找Python与C++混合代码的性能瓶颈也是要看`tottime`和`cumtime`。而`pprof`展示的调用图也可以帮助我们发现性能中的问题。
+例如下图中，
+![kernel_perf](./pprof_2.png)
+在一次训练中，乘法和乘法梯度的计算占用2%-4%左右的计算时间。而`MomentumOp`占用了17%左右的计算时间。显然，`MomentumOp`的性能有问题。
+在`pprof`中，对于性能的关键路径都做出了红色标记。先检查关键路径的性能问题，再检查其他部分的性能问题，可以更有次序的完成性能的优化。
--- a/paddle/capi/CMakeLists.txt
+++ b/paddle/capi/CMakeLists.txt
@@ -4,6 +4,16 @@ else ()
  set(PADDLE_FLOAT_TYPE float)
 endif()
+execute_process(
+  COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
+  WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
+  OUTPUT_VARIABLE PADDLE_GIT_COMMIT
+  RESULT_VARIABLE PADDLE_GIT_COMMIT_RESULT
+  ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
+if(NOT PADDLE_GIT_COMMIT)
+  set(PADDLE_GIT_COMMIT "no commit information")
+endif()
 # config.h used for C-API. It will store Paddle building configuration as a
 # header. Make user just include PaddleCAPI.h then can get building
 # configuration without explicitly set -DPADDLE_WITH_DOUBLE when building their

--- a/paddle/capi/Matrix.cpp
+++ b/paddle/capi/Matrix.cpp
@@ -55,7 +55,7 @@ paddle_error paddle_matrix_set_row(paddle_matrix mat,
 }
 PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
-                                          paddle_real* value) {
+                                            paddle_real* value) {
  if (mat == nullptr || value == nullptr) return kPD_NULLPTR;
  auto ptr = cast(mat);
  if (ptr->mat == nullptr) return kPD_NULLPTR;
@@ -75,7 +75,7 @@ PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
 }
 PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
-                                          paddle_real* result) {
+                                            paddle_real* result) {
  if (mat == nullptr || result == nullptr) return kPD_NULLPTR;
  auto ptr = cast(mat);
  if (ptr->mat == nullptr) return kPD_NULLPTR;

--- a/paddle/capi/config.h.in
+++ b/paddle/capi/config.h.in
@@ -3,6 +3,9 @@
 typedef @PADDLE_FLOAT_TYPE@ paddle_real;
+#define __PADDLE_VERSION__  "@PADDLE_VERSION@"
+#define __PADDLE_COMMIT__   "@PADDLE_GIT_COMMIT@"
 // Since we only support linux and macos in compile, always use clang or
 // gcc 4.8+. DLL_IMPORT/DLL_EXPORT is as simple as below.
 #define PD_API __attribute__((visibility("default")))

--- a/paddle/capi/matrix.h
+++ b/paddle/capi/matrix.h
@@ -79,7 +79,7 @@ PD_API paddle_error paddle_matrix_set_row(paddle_matrix mat,
 * @note  value should contain enough element of data to init the mat
 */
 PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat,
-                                          paddle_real* value);
+                                            paddle_real* value);
 /**
 * @brief PDMatGetRow Get raw row buffer from matrix
@@ -93,14 +93,14 @@ PD_API paddle_error paddle_matrix_get_row(paddle_matrix mat,
                                          paddle_real** rawRowBuffer);
 /**
- * @brief copy data from the matrix 
+ * @brief copy data from the matrix
 * @param [in] mat Target matrix
- * @param [out] result pointer to store the matrix data 
+ * @param [out] result pointer to store the matrix data
 * @return paddle_error
 * @note the space of the result should allocated before invoke this API
 */
 PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat,
-                                          paddle_real* result);
+                                            paddle_real* result);
 /**
 * @brief PDMatCreateNone Create None Matrix
 * @return

--- a/paddle/cuda/include/hl_cnn.h
+++ b/paddle/cuda/include/hl_cnn.h
@@ -116,6 +116,7 @@ extern void hl_maxpool_backward(const int frameCnt,
 * @param[in]   paddingW    padding width.
 * @param[out]  tgtData     output data.
 * @param[in]   tgtStride   stride between output data samples.
+ * @param[in]   excludeMode whether to consider paddings for size.
 *
 */
 extern void hl_avgpool_forward(const int frameCnt,
@@ -132,7 +133,8 @@ extern void hl_avgpool_forward(const int frameCnt,
                               const int paddingH,
                               const int paddingW,
                               real* tgtData,
-                               const int tgtStride);
+                               const int tgtStride,
+                               bool excludeMode);
 /**
 * @brief   Maximum pool backward.
@@ -154,6 +156,7 @@ extern void hl_avgpool_forward(const int frameCnt,
 * @param[in]   scaleB      scale.
 * @param[out]  backGrad    output grad.
 * @param[in]   outStride   stride between output data samples.
+ * @param[in]   excludeMode whether to consider paddings for size.
 *
 */
 extern void hl_avgpool_backward(const int frameCnt,
@@ -172,7 +175,8 @@ extern void hl_avgpool_backward(const int frameCnt,
                                real scaleA,
                                real scaleB,
                                real* backGrad,
-                                const int outStride);
+                                const int outStride,
+                                bool excludeMode);
 extern void hl_maxpool3D_forward(const int frameCnt,
                                 const real* inputData,

--- a/paddle/cuda/include/stub/hl_cnn_stub.h
+++ b/paddle/cuda/include/stub/hl_cnn_stub.h
@@ -68,7 +68,8 @@ inline void hl_avgpool_forward(const int frameCnt,
                               const int paddingH,
                               const int paddingW,
                               real* tgtData,
-                               const int tgtStride) {}
+                               const int tgtStride,
+                               const bool excludeMode) {}
 inline void hl_avgpool_backward(const int frameCnt,
                                const real* outGrad,
@@ -86,7 +87,8 @@ inline void hl_avgpool_backward(const int frameCnt,
                                real scaleA,
                                real scaleB,
                                real* backGrad,
-                                const int outStride) {}
+                                const int outStride,
+                                const bool excludeMode) {}
 inline void hl_maxpool3D_forward(const int frameCnt,
                                 const real* inputData,

--- a/paddle/cuda/src/hl_cuda_cnn.cu
+++ b/paddle/cuda/src/hl_cuda_cnn.cu
@@ -210,7 +210,8 @@ __global__ void KeAvgPoolForward(const int nthreads,
                                 const int padH,
                                 const int padW,
                                 real* tgtData,
-                                 const int tgtStride) {
+                                 const int tgtStride,
+                                 const bool excludeMode) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index < nthreads) {
    int pw = index % pooledW;
@@ -224,7 +225,8 @@ __global__ void KeAvgPoolForward(const int nthreads,
    int wend = min(wstart + sizeX, width);
    hstart = max(hstart, 0);
    wstart = max(wstart, 0);
-    int pool_size = (hend - hstart) * (wend - wstart);
+    int poolSize =
+        excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
    real aveval = 0;
    inputData += (frameNum * channels + c) * height * width;
@@ -235,7 +237,7 @@ __global__ void KeAvgPoolForward(const int nthreads,
    }
    int tgtIndex =
        index % (pooledW * pooledH * channels) + frameNum * tgtStride;
-    tgtData[tgtIndex] = aveval / pool_size;
+    tgtData[tgtIndex] = aveval / poolSize;
  }
 }
@@ -253,7 +255,8 @@ void hl_avgpool_forward(const int frameCnt,
                        const int paddingH,
                        const int paddingW,
                        real* tgtData,
-                        const int tgtStride) {
+                        const int tgtStride,
+                        const bool excludeMode) {
  int num_kernels = pooledH * pooledW * channels * frameCnt;
  int blocks = (num_kernels + 1024 - 1) / 1024;
  KeAvgPoolForward<<<blocks, 1024, 0, STREAM_DEFAULT>>>(num_kernels,
@@ -270,7 +273,8 @@ void hl_avgpool_forward(const int frameCnt,
                                                        paddingH,
                                                        paddingW,
                                                        tgtData,
-                                                        tgtStride);
+                                                        tgtStride,
+                                                        excludeMode);
  CHECK_SYNC("hl_avgpool_forward failed");
 }
@@ -290,7 +294,8 @@ __global__ void KeAvgPoolBackward(const int nthreads,
                                  real scaleA,
                                  real scaleB,
                                  real* tgtGrad,
-                                  const int outStride) {
+                                  const int outStride,
+                                  const bool excludeMode) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  if (index < nthreads) {
    int offsetW = index % width + padW;
@@ -314,8 +319,9 @@ __global__ void KeAvgPoolBackward(const int nthreads,
        int wstart = pw * strideW - padW;
        int wend = min(wstart + sizeX, width);
        wstart = max(wstart, 0);
-        int poolsize = (hend - hstart) * (wend - wstart);
+        int poolSize =
-        gradient += outGrad[ph * pooledW + pw] / poolsize;
+            excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
+        gradient += outGrad[ph * pooledW + pw] / poolSize;
      }
    }
    tgtGrad[index] = scaleB * tgtGrad[index] + scaleA * gradient;
@@ -338,7 +344,8 @@ void hl_avgpool_backward(const int frameCnt,
                         real scaleA,
                         real scaleB,
                         real* backGrad,
-                         const int outStride) {
+                         const int outStride,
+                         const bool excludeMode) {
  int num_kernels = height * width * channels * frameCnt;
  int blocks = (num_kernels + 1024 - 1) / 1024;
@@ -358,7 +365,8 @@ void hl_avgpool_backward(const int frameCnt,
                                                         scaleA,
                                                         scaleB,
                                                         backGrad,
-                                                         outStride);
+                                                         outStride,
+                                                         excludeMode);
  CHECK_SYNC("hl_avgpool_backward failed");
 }

--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -27,6 +27,18 @@
 namespace paddle {
 namespace framework {
+static std::unordered_set<std::string>* g_ctrl_flow_ops_ = nullptr;
+// Control Flow operators's backward is significantly different from
+// computational operators. Hack Code here.
+// We should design a better way to backward CtrlFlowOps.
+static std::unordered_set<std::string>& CtrlFlowOps() {
+  if (g_ctrl_flow_ops_ == nullptr) {
+    g_ctrl_flow_ops_ = new std::unordered_set<std::string>{
+        "increment", "lod_rank_table", "less_than"};
+  }
+  return *g_ctrl_flow_ops_;
+}
 static inline std::unique_ptr<OperatorBase> CreateGradOp(
    const OperatorBase& op, const std::unordered_set<std::string>& no_grad_set,
    std::unordered_map<std::string, std::string>* grad_to_var) {
@@ -288,12 +300,24 @@ static void CreateGradVarInBlock(
  for (size_t op_index = grad_op_start_index; op_index < ops.size();
       ++op_index) {
    std::unordered_set<std::string> new_vars;
+    auto& ctrl_flow_ops = CtrlFlowOps();
    ForEachVarName(ops[op_index]->Outputs(),
                   [&](const std::string& grad_var_name) {
-                     if (block_desc->HasVar(grad_var_name)) {
+                     if (ctrl_flow_ops.find(ops[op_index]->Type()) !=
+                         ctrl_flow_ops.end()) {
+                       if (block_desc->HasVarRecursive(grad_var_name)) {
+                         return false;
+                       }
+                     } else {
+                       if (block_desc->HasVar(grad_var_name)) {
+                         return false;
+                       }
+                     }
+                     if (grad_var_name == framework::kEmptyVarName) {
                       return false;
                     }
                     auto var = block_desc->Var(grad_var_name);
+                     VLOG(10) << "Creating Variable " << grad_var_name;
                     new_vars.insert(var->Name());
                     auto it = param_name_map.find(grad_var_name);
                     if (it == param_name_map.end()) {
@@ -333,14 +357,25 @@ std::vector<std::unique_ptr<OpDescBind>> MakeOpGrad(
  // All input gradients of forwarding operator do not need to calculate.
  const std::vector<std::string>& inputs = op_desc->InputArgumentNames();
  if (AllGradInSet(inputs, *no_grad_vars)) {
+    VLOG(10) << "Drop operator  " << op_desc->Type();
    return grad_op_descs;  // empty vector
  }
  // All output gradients of forwarding operator do not need to calculate.
  const std::vector<std::string>& outputs = op_desc->OutputArgumentNames();
  if (AllGradInSet(outputs, *no_grad_vars)) {
-    for (const std::string& name : inputs) {
+    VLOG(10) << "Drop operator " << op_desc->Type();
-      no_grad_vars->insert(GradVarName(name));
+    // FIXME: Hack code here
+    auto& ctrl_flow_ops = CtrlFlowOps();
+    if (ctrl_flow_ops.find(op_desc->Type()) == ctrl_flow_ops.end()) {
+      // Only computational op need drop input's gradient.
+      for (const std::string& name : inputs) {
+        no_grad_vars->insert(GradVarName(name));
+        VLOG(10) << " Also drop " << GradVarName(name);
+      }
    }
    return grad_op_descs;  // empty vector
  }

--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/framework/block_desc.h"
+#include "paddle/framework/operator.h"
 #include "paddle/framework/program_desc.h"
 namespace paddle {
@@ -42,6 +43,8 @@ bool BlockDescBind::HasVar(const std::string &name) const {
 }
 VarDescBind *BlockDescBind::FindVarRecursive(const std::string &name) const {
+  if (name == kEmptyVarName) return nullptr;
  auto it = vars_.find(name);
  if (it == vars_.end()) {
    return Parent() == kNoneBlockIndex ? nullptr

--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -97,6 +97,10 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
  if (create_local_scope) {
    local_scope = &scope->NewScope();
    for (auto& var : block.AllVars()) {
+      if (var->Name() == framework::kEmptyVarName) {
+        continue;
+      }
      if (var->Persistable()) {
        auto* ptr = scope->Var(var->Name());
        CreateTensor(ptr, var->GetType());

--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -13,6 +13,8 @@
   limitations under the License. */
 #include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/framework.pb.h"
 #include "paddle/memory/memcpy.h"
 #include "paddle/memory/memory.h"
@@ -27,11 +29,11 @@
 namespace paddle {
 namespace framework {
-std::ostream& operator<<(std::ostream& os, const LoD& lod) {
+std::ostream &operator<<(std::ostream &os, const LoD &lod) {
  os << "{";
-  for (auto& v : lod) {
+  for (auto &v : lod) {
    os << "{";
-    for (auto& i : v) {
+    for (auto &i : v) {
      os << i << ",";
    }
    os << "}";
@@ -41,7 +43,7 @@ std::ostream& operator<<(std::ostream& os, const LoD& lod) {
  return os;
 }
-LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
+LoD SliceLevels(const LoD &in, size_t level_begin, size_t level_end) {
  LoD new_lod;
  new_lod.reserve(level_end - level_begin);
  for (size_t i = level_begin; i < level_end; i++) {
@@ -53,7 +55,7 @@ LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) {
  return new_lod;
 }
-LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
+LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
                 size_t elem_end) {
  PADDLE_ENFORCE_LT(level, in.size());
  PADDLE_ENFORCE_LT(elem_end, in[level].size());
@@ -64,9 +66,9 @@ LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
  res[0].assign(in[level].begin() + elem_begin,
                in[level].begin() + elem_end + 1);
  for (size_t lvl = 1; lvl < res.size(); lvl++) {
-    const auto& in_level = in[level + lvl];
+    const auto &in_level = in[level + lvl];
-    const auto& above_level = res[lvl - 1];
+    const auto &above_level = res[lvl - 1];
-    auto& out_level = res[lvl];
+    auto &out_level = res[lvl];
    out_level.assign(in_level.begin() + above_level.front(),
                     in_level.begin() + above_level.back() + 1);
  }
@@ -74,33 +76,33 @@ LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
    // to make the first offset equals 0, all the elements minus the first
    // element
    size_t front = res[lvl].front();
-    for (auto& ele : res[lvl]) {
+    for (auto &ele : res[lvl]) {
      ele -= front;
    }
  }
  return res;
 }
-LoD ToAbsOffset(const LoD& in) {
+LoD ToAbsOffset(const LoD &in) {
  // the lowest level stores relative offsets
  if (in.empty() || in.size() == 1) return in;
  LoD result = in;
  for (int level = result.size() - 2; level >= 0; level--) {
-    for (auto& ele : result[level]) {
+    for (auto &ele : result[level]) {
      ele = result[level + 1][ele];
    }
  }
  return result;
 }
-bool operator==(const LoD& a, const LoD& b) {
+bool operator==(const LoD &a, const LoD &b) {
  if (a.size() != b.size()) {
    return false;
  }
  for (size_t i = 0; i < a.size(); i++) {
-    const auto& a_level = a[i];
+    const auto &a_level = a[i];
-    const auto& b_level = b[i];
+    const auto &b_level = b[i];
    if (a_level.size() != b_level.size()) {
      return false;
    }
@@ -151,7 +153,7 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
 }
 using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
-LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx,
+LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx,
                                        size_t end_idx, size_t start_level) {
  LoD sub_lod;
@@ -170,7 +172,7 @@ LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx,
  return LoDAndOffset{sub_lod, {start_idx, end_idx}};
 }
-void AppendLoD(LoD* lod, const LoD& lod_length) {
+void AppendLoD(LoD *lod, const LoD &lod_length) {
  PADDLE_ENFORCE(
      lod->empty() || lod->size() == lod_length.size(),
      "The lod_length should has the same size with the appended lod.");
@@ -178,12 +180,139 @@ void AppendLoD(LoD* lod, const LoD& lod_length) {
    *lod = LoD(lod_length.size(), std::vector<size_t>({0}));
  }
  for (size_t i = 0; i < lod->size(); ++i) {
-    auto& level = (*lod)[i];
+    auto &level = (*lod)[i];
    for (size_t len : lod_length[i]) {
      level.push_back(level.back() + len);
    }
  }
 }
+void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
+                       const platform::DeviceContext &dev_ctx) {
+  // TODO(typhoonzero): serialize to ostream
+  {  // the 1st field, uint32_t version
+    constexpr uint32_t version = 0;
+    os.write(reinterpret_cast<const char *>(&version), sizeof(version));
+  }
+  {  // the 2nd field, tensor description
+     // int32_t  size
+     // void*    protobuf message
+    framework::TensorDesc desc;
+    desc.set_data_type(framework::ToDataType(tensor.type()));
+    auto dims = framework::vectorize(tensor.dims());
+    auto *pb_dims = desc.mutable_dims();
+    pb_dims->Resize(static_cast<int>(dims.size()), 0);
+    std::copy(dims.begin(), dims.end(), pb_dims->begin());
+    int32_t size = desc.ByteSize();
+    os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+    auto out = desc.SerializeAsString();
+    os.write(out.data(), size);
+  }
+  {  // the 3rd field, tensor data
+    uint64_t size = tensor.memory_size();
+    auto *data_ptr = tensor.data<void>();
+    PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
+                   "Index overflow when writing tensor");
+    if (platform::is_gpu_place(tensor.place())) {
+#ifdef PADDLE_WITH_CUDA
+      constexpr size_t kBufSize = 1024 * 1024 * 64;  // 64MB
+      std::unique_ptr<char[]> buf(new char[kBufSize]);
+      auto &gpu_dev_ctx =
+          static_cast<const platform::CUDADeviceContext &>(dev_ctx);
+      platform::CPUPlace cpu;
+      uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
+      while (size != 0) {
+        size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
+        memory::Copy(cpu, buf.get(),
+                     boost::get<platform::GPUPlace>(tensor.place()),
+                     reinterpret_cast<const void *>(data), size_to_write,
+                     gpu_dev_ctx.stream());
+        gpu_dev_ctx.Wait();
+        os.write(buf.get(), size_to_write);
+        data += size_to_write;
+        size -= size_to_write;
+      }
+#else
+      PADDLE_THROW("Unexpected branch");
+#endif
+    } else {
+      os.write(static_cast<const char *>(data_ptr),
+               static_cast<std::streamsize>(size));
+    }
+  }
+  {  // the 4th field, lod information
+     // uint64_t lod_level
+     // uint64_t lod_level_1 size in byte.
+     // int*     lod_level_1 data
+     // ...
+    auto lod = tensor.lod();
+    uint64_t size = lod.size();
+    os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+    for (auto &each : lod) {
+      size = each.size() * sizeof(framework::LoD::value_type::value_type);
+      os.write(reinterpret_cast<const char *>(&size), sizeof(size));
+      os.write(reinterpret_cast<const char *>(each.data()),
+               static_cast<std::streamsize>(size));
+    }
+  }
+}
+void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
+  uint32_t version;
+  is.read(reinterpret_cast<char *>(&version), sizeof(version));
+  PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
+  framework::TensorDesc desc;
+  {  // int32_t size
+     // proto buffer
+    int32_t size;
+    is.read(reinterpret_cast<char *>(&size), sizeof(size));
+    std::unique_ptr<char[]> buf(new char[size]);
+    is.read(reinterpret_cast<char *>(buf.get()), size);
+    PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
+                   "Cannot parse tensor desc");
+  }
+  {  // read tensor
+    std::vector<int64_t> dims;
+    dims.reserve(static_cast<size_t>(desc.dims().size()));
+    std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
+    tensor->Resize(framework::make_ddim(dims));
+    void *buf;
+    platform::Place cpu = platform::CPUPlace();
+    switch (desc.data_type()) {
+      case framework::FP32:
+        buf = tensor->mutable_data<float>(cpu);
+        break;
+      case framework::FP64:
+        buf = tensor->mutable_data<double>(cpu);
+        break;
+      case framework::INT32:
+        buf = tensor->mutable_data<int>(cpu);
+        break;
+      case framework::INT64:
+        buf = tensor->mutable_data<int64_t>(cpu);
+        break;
+      default:
+        PADDLE_THROW("DataType %d not supported", desc.data_type());
+    }
+    is.read(static_cast<char *>(buf), tensor->memory_size());
+  }
+  {  // read lod
+    uint64_t lod_level;
+    is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
+    auto &lod = *tensor->mutable_lod();
+    lod.resize(lod_level);
+    for (uint64_t i = 0; i < lod_level; ++i) {
+      uint64_t size;
+      is.read(reinterpret_cast<char *>(&size), sizeof(size));
+      std::vector<size_t> tmp(size / sizeof(size_t));
+      is.read(reinterpret_cast<char *>(tmp.data()),
+              static_cast<std::streamsize>(size));
+      lod[i] = tmp;
+    }
+  }
+}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -189,5 +189,14 @@ std::pair<LoD, std::pair<size_t, size_t>> GetSubLoDAndAbsoluteOffset(
 void AppendLoD(LoD* lod, const LoD& lod_length);
+/*
+ * Serialize/Desiralize LoDTensor to std::ostream
+ * You can pass ofstream or ostringstream to serilize to file
+ * or to a in memory string. GPU tensor will be copied to CPU.
+ */
+void SerializeToStream(std::ostream& os, const LoDTensor& tensor,
+                       const platform::DeviceContext& dev_ctx);
+void DeserializeFromStream(std::istream& is, LoDTensor* tensor);
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -65,7 +65,7 @@ class CompileTimeInferShapeContext : public InferShapeContext {
    PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR,
                      "The %d-th output of Output(%s) must be LoDTensor.", j,
                      out);
-    in_var->SetLoDLevel(out_var->GetLodLevel());
+    out_var->SetLoDLevel(in_var->GetLodLevel());
  }
  bool IsRuntime() const override;
@@ -466,7 +466,12 @@ DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
  auto var = block_.FindVarRecursive(name);
  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
  try {
-    return framework::make_ddim(var->Shape());
+    auto shape = var->Shape();
+    if (shape.empty()) {
+      return framework::make_ddim({0UL});
+    } else {
+      return framework::make_ddim(var->Shape());
+    }
  } catch (...) {
    VLOG(5) << "GetDim of variable " << name << " error";
    std::rethrow_exception(std::current_exception());

--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -426,9 +426,6 @@ void OperatorWithKernel::Run(const Scope& scope,
  }
  kernel_iter->second->Compute(ctx);
-  // throws errors if have.
-  dev_ctx.Finish();
 }
 OpKernelType OperatorWithKernel::GetKernelType(
    const ExecutionContext& ctx) const {

--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -36,12 +36,9 @@ Scope& Scope::NewScope() const {
 }
 Variable* Scope::Var(const std::string& name) {
-  auto iter = vars_.find(name);
+  auto* v = FindVarLocally(name);
-  if (iter != vars_.end()) {
+  if (v != nullptr) return v;
-    VLOG(3) << "Get existing variable " << name;
+  v = new Variable();
-    return iter->second;
-  }
-  Variable* v = new Variable();
  vars_[name] = v;
  VLOG(3) << "Create variable " << name;
  v->name_ = &(vars_.find(name)->first);
@@ -57,8 +54,10 @@ Variable* Scope::Var(std::string* name) {
 }
 Variable* Scope::FindVar(const std::string& name) const {
-  auto it = vars_.find(name);
+  auto var = FindVarLocally(name);
-  if (it != vars_.end()) return it->second;
+  if (var != nullptr) {
+    return var;
+  }
  return (parent_ == nullptr) ? nullptr : parent_->FindVar(name);
 }
@@ -116,6 +115,11 @@ std::string Scope::Rename(const std::string& origin_name) const {
  Rename(origin_name, var_name);
  return var_name;
 }
+Variable* Scope::FindVarLocally(const std::string& name) const {
+  auto it = vars_.find(name);
+  if (it != vars_.end()) return it->second;
+  return nullptr;
+}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -76,6 +76,8 @@ class Scope {
  std::string Rename(const std::string& origin_name) const;
 private:
+  Variable* FindVarLocally(const std::string& name) const;
  // Call Scope::NewScope for a sub-scope.
  explicit Scope(Scope const* parent) : parent_(parent) {}

--- a/paddle/framework/shape_inference.cc
+++ b/paddle/framework/shape_inference.cc
@@ -12,6 +12,8 @@
   See the License for the specific language governing permissions and
   limitations under the License. */
 #include "paddle/framework/shape_inference.h"
+#include "grad_op_desc_maker.h"
+#include "paddle/framework/operator.h"
 namespace paddle {
 namespace framework {
@@ -22,6 +24,12 @@ std::vector<framework::DDim> InferShapeContext::GetInputsDim(
  return GetDims(names);
 }
+DDim InferShapeContext::GetInputsElementDim(const std::string &name,
+                                            int idx) const {
+  const std::vector<std::string> &names = Inputs(name);
+  return this->GetDim(names[idx]);
+}
 void InferShapeContext::SetOutputsDim(
    const std::string &name, const std::vector<framework::DDim> &dims) {
  auto &names = Outputs(name);
@@ -43,6 +51,9 @@ void InferShapeContext::SetDims(const std::vector<std::string> &names,
  size_t length = names.size();
  PADDLE_ENFORCE_EQ(length, dims.size());
  for (size_t i = 0; i < length; ++i) {
+    if (names[i] == framework::kEmptyVarName) {
+      continue;
+    }
    SetDim(names[i], dims[i]);
  }
 }

--- a/paddle/framework/shape_inference.h
+++ b/paddle/framework/shape_inference.h
@@ -37,6 +37,7 @@ class InferShapeContext {
  virtual framework::DDim GetInputDim(const std::string &name) const = 0;
  std::vector<framework::DDim> GetInputsDim(const std::string &name) const;
+  DDim GetInputsElementDim(const std::string &name, int idx) const;
  virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0;
  void SetOutputsDim(const std::string &name,

--- a/paddle/framework/tensor_util.h
+++ b/paddle/framework/tensor_util.h
@@ -135,18 +135,17 @@ inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx,
  auto dst_ptr = static_cast<void*>(dst->data());
  if (platform::is_cpu_place(src.place())) {
-    memory::Copy(dst_place, dst_ptr, boost::get<platform::CPUPlace>(src.place()), 
+    memory::Copy(dst_place, dst_ptr,
-		    src_ptr, size);
+                 boost::get<platform::CPUPlace>(src.place()), src_ptr, size);
  }
 #ifdef PADDLE_WITH_CUDA
  else if (platform::is_gpu_place(src.place())) {  // NOLINT
    memory::Copy(
-        dst_place, dst_ptr, boost::get<platform::GPUPlace>(src.place()), src_ptr,
+        dst_place, dst_ptr, boost::get<platform::GPUPlace>(src.place()),
-        size,
+        src_ptr, size,
        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
  }
 #endif
 }
 }  // namespace framework

--- a/paddle/function/EigenGemm.cpp
+++ b/paddle/function/EigenGemm.cpp
@@ -21,7 +21,7 @@ template <class T>
 struct EigenBlasGemm {
  typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, int>,
                           Eigen::Aligned>
-      Matrix;
+      EigenMatrix;
  static void compute(const bool transA,
                      const bool transB,
@@ -56,14 +56,13 @@ struct EigenBlasGemm {
      sizeB[1] = N;
      CHECK_EQ(N, ldb);
    }
-    Eigen::array<int, 2> sizeC;
+    Eigen::array<int, 2> sizeC = {{M, ldc}};
-    sizeC[0] = M;
+    Eigen::array<int, 2> offsetC = {{0, 0}};
-    sizeC[1] = N;
+    Eigen::array<int, 2> extentC = {{M, N}};
-    CHECK_EQ(N, ldc);
-    const Matrix a(const_cast<T*>(A), sizeA);
+    const EigenMatrix a(const_cast<T*>(A), sizeA);
-    const Matrix b(const_cast<T*>(B), sizeB);
+    const EigenMatrix b(const_cast<T*>(B), sizeB);
-    Matrix c(C, sizeC);
+    EigenMatrix c(C, sizeC);
    typedef typename Eigen::Tensor<T, 2>::DimensionPair DimPair;
    Eigen::array<DimPair, 1> dims;
@@ -72,12 +71,23 @@ struct EigenBlasGemm {
    dims[0].second = transB ? 1 : 0;
    Eigen::DefaultDevice device;
-    if (alpha == T(1) && beta == T(0)) {
+    if (N == ldc) {
-      c.device(device) = a.contract(b, dims);
+      if (alpha == T(1) && beta == T(0)) {
-    } else if (alpha == T(1) && beta == T(1)) {
+        c.device(device) = a.contract(b, dims);
-      c.device(device) += a.contract(b, dims);
+      } else if (alpha == T(1) && beta == T(1)) {
+        c.device(device) += a.contract(b, dims);
+      } else {
+        c.device(device) = alpha * a.contract(b, dims) + beta * c;
+      }
    } else {
-      c.device(device) = alpha * a.contract(b, dims) + beta * c;
+      if (alpha == T(1) && beta == T(0)) {
+        c.slice(offsetC, extentC).device(device) = a.contract(b, dims);
+      } else if (alpha == T(1) && beta == T(1)) {
+        c.slice(offsetC, extentC).device(device) += a.contract(b, dims);
+      } else {
+        c.slice(offsetC, extentC).device(device) =
+            alpha * a.contract(b, dims) + beta * c.slice(offsetC, extentC);
+      }
    }
  }
 };

--- a/paddle/gserver/layers/ConvTransProjection.cpp
+++ b/paddle/gserver/layers/ConvTransProjection.cpp
@@ -24,13 +24,13 @@ size_t ConvTransProjection::calOutputSize() {
  if (outputH_ == 0) outputH_ = configOutH_;
  if (outputW_ == 0) outputW_ = configOutW_;
  imageH_ = imageSize(outputH_,
-                      filterH_,
+                      (filterH_ - 1) * dilationH_ + 1,
                      paddingH_,
                      strideH_,
                      /* caffeMode */ true);
  imageW_ = imageSize(outputW_,
-                      filterW_,
+                      (filterW_ - 1) * dilationW_ + 1,
                      paddingW_,
                      strideW_,
                      /* caffeMode */ true);

--- a/paddle/gserver/layers/FactorizationMachineLayer.cpp
+++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "FactorizationMachineLayer.h"
+#include <algorithm>
+#include <vector>
+#include "paddle/math/SparseMatrix.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+namespace paddle {
+REGISTER_LAYER(factorization_machine, FactorizationMachineLayer);
+bool FactorizationMachineLayer::init(const LayerMap& layerMap,
+                                     const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+  factorSize_ = config_.factor_size();
+  /* initialize the latentVectors_ */
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  size_t inputSize = inputLayers_[0]->getSize();
+  CHECK_EQ(parameters_[0]->getSize(), inputSize * factorSize_);
+  latentVectors_ = std::unique_ptr<Weight>(
+      new Weight(inputSize, factorSize_, parameters_[0]));
+  return true;
+}
+void FactorizationMachineLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  const MatrixPtr& inputV = getInputValue(0);
+  size_t batchSize = inputV->getHeight();
+  size_t outputSize = getSize();
+  size_t inputSize = inputLayers_[0]->getSize();
+  reserveOutput(batchSize, outputSize);
+  MatrixPtr outV = getOutputValue();
+  Matrix::resizeOrCreate(
+      latentVectorsSquare_, inputSize, factorSize_, false, useGpu_);
+  Matrix::resizeOrCreate(
+      inputMulFactor_, batchSize, factorSize_, false, useGpu_);
+  Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_);
+  REGISTER_TIMER_INFO("FmInputMulFactorTimer", getName().c_str());
+  inputMulFactor_->mul(*inputV, *latentVectors_->getW());
+  inputMulFactor_->square2(*tmpOut_);
+  outV->sumRows(*tmpOut_, 0.5, 0);
+  if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
+    Matrix::resizeOrCreateSparseMatrix(inputSquare_,
+                                       inputV->getHeight(),
+                                       inputV->getWidth(),
+                                       inputV->getElementCnt(),
+                                       inputV->getValueType());
+    inputSquare_->copyFrom(*inputV);
+    (dynamic_cast<CpuSparseMatrix*>(inputSquare_.get()))->square2();
+  } else {
+    Matrix::resizeOrCreate(
+        inputSquare_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
+    inputV->square2(*inputSquare_);
+  }
+  latentVectors_->getW()->square2(*latentVectorsSquare_);
+  tmpOut_->mul(*inputSquare_, *latentVectorsSquare_);
+  outV->sumRows(*tmpOut_, -0.5, 1.0);
+  /* activation */ {
+    REGISTER_TIMER_INFO("FmFwAtvTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
+  /* Do derivation */ { backwardActivation(); }
+  const MatrixPtr& inputV = getInputValue(0);
+  const MatrixPtr& oGrad = getOutputGrad();
+  Matrix::resizeOrCreate(
+      tmpSum_, 1, latentVectors_->getW()->getHeight(), false, useGpu_);
+  MatrixPtr tmpSumTrans = Matrix::create(tmpSum_->getRowBuf(0),
+                                         latentVectors_->getW()->getHeight(),
+                                         1,
+                                         false,
+                                         useGpu_);
+  /* Calculate the gradients of the latentVectors_ matrix */
+  if (latentVectors_->getWGrad()) {
+    if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
+      Matrix::resizeOrCreateSparseMatrix(tmpInput_,
+                                         inputV->getHeight(),
+                                         inputV->getWidth(),
+                                         inputV->getElementCnt());
+      CpuSparseMatrix* sparseInputV =
+          dynamic_cast<CpuSparseMatrix*>(inputV.get());
+      CpuSparseMatrix* sparseInputSquare =
+          dynamic_cast<CpuSparseMatrix*>(inputSquare_.get());
+      CpuSparseMatrix* sparseTmpInput =
+          dynamic_cast<CpuSparseMatrix*>(tmpInput_.get());
+      sparseTmpInput->copyFrom(*sparseInputV);
+      sparseTmpInput->rowScale(0, *sparseInputV, *oGrad);
+      latentVectors_->getWGrad()->mul(
+          *sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1);
+      sparseTmpInput->rowScale(0, *sparseInputSquare, *oGrad);
+      Matrix::resizeOrCreate(negOnes_, 1, inputV->getHeight(), false, useGpu_);
+      negOnes_->zeroMem();
+      negOnes_->add(-1);
+      tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0);
+    } else {
+      Matrix::resizeOrCreate(
+          tmpInput_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
+      tmpInput_->rowScale(0, *inputV, *oGrad);
+      latentVectors_->getWGrad()->mul(
+          *tmpInput_->getTranspose(), *inputMulFactor_, 1, 1);
+      tmpInput_->rowScale(0, *inputSquare_, *oGrad);
+      tmpSum_->sumCols(*tmpInput_, -1, 0);
+    }
+    latentVectors_->getWGrad()->addRowScale(
+        0, *latentVectors_->getW(), *tmpSumTrans);
+    /* Increasing the number of gradient */
+    latentVectors_->getParameterPtr()->incUpdate(callback);
+  }
+  /* Calculate the input layers gradient */
+  MatrixPtr inGrad = getInputGrad(0);
+  if (inGrad != NULL) {
+    inGrad->mul(
+        *inputMulFactor_, *latentVectors_->getW()->getTranspose(), 1, 1);
+    tmpSumTrans->sumRows(*latentVectorsSquare_, -1, 0);
+    inGrad->addColScale(0, *inputV, *tmpSum_);
+    inGrad->rowScale(0, *inGrad, *oGrad);
+  }
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/FactorizationMachineLayer.h
+++ b/paddle/gserver/layers/FactorizationMachineLayer.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/utils/ThreadLocal.h"
+namespace paddle {
+/**
+ * @brief The Factorization Machine models pairwise (order-2) feature
+ * interactions as inner product of the learned latent vectors corresponding
+ * to each input feature.
+ *
+ * The Factorization Machine can effectively capture feature interactions
+ * especially when the input is sparse. While in principle FM can model higher
+ * order feature interaction, in practice usually only order-2 feature
+ * interactions are considered. The Factorization Machine Layer here only
+ * computes the order-2 interations with the formula:
+ *
+ * \f[
+ *     y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
+ * \f]
+ *
+ * The detailed calculation for forward and backward can be found at this paper:
+ *
+ *     Factorization machines.
+ *
+ * The config file api is factorization_machine.
+ */
+class FactorizationMachineLayer : public Layer {
+protected:
+  // The latent vectors, shape: (size, factorSize_)
+  // Each row of the latentVectors_ matrix is the latent vector
+  // corresponding to one input feature dimension
+  std::unique_ptr<Weight> latentVectors_;
+  // The hyperparameter that defines the dimensionality of the factorization
+  size_t factorSize_;
+private:
+  // Store the square values of the letent vectors matrix
+  MatrixPtr latentVectorsSquare_;
+  // Store the square values of input matrix
+  MatrixPtr inputSquare_;
+  // The result of input matrix * latent vector matrix that will be used in
+  // both forward and backward step
+  MatrixPtr inputMulFactor_;
+  // Store temporary calculation result
+  MatrixPtr tmpOut_;
+  MatrixPtr tmpSum_;
+  MatrixPtr tmpInput_;
+  // Negative identity matrix
+  MatrixPtr negOnes_;
+public:
+  explicit FactorizationMachineLayer(const LayerConfig& config)
+      : Layer(config) {}
+  ~FactorizationMachineLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+}  // namespace paddle
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
@@ -64,49 +64,111 @@ void HierarchicalSigmoidLayer::forward(PassType passType) {
                         batchSize,
                         codeLength_,
                         /* trans */ false,
-                         useGpu(deviceId_));
+                         false);
  Matrix::resizeOrCreate(preOutput_.grad,
                         batchSize,
                         codeLength_,
                         /* trans */ false,
-                         useGpu(deviceId_));
+                         false);
  IVectorPtr label = getInput(*getLabelLayer()).ids;
  preOutput_.value->zeroMem();
+  if (useGpu_) {
+    Matrix::resizeOrCreate(cpuOutput_,
+                           output_.value->getHeight(),
+                           output_.value->getWidth(),
+                           /* trans */ false,
+                           false);
+    IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
+    cpuLabel_->copyFrom(*label);
+    cpuOutput_->copyFrom(*output_.value);
+  } else {
+    cpuOutput_ = output_.value;
+    cpuLabel_ = label;
+  }
  /* add the bias-vector */
  if (biases_.get() != NULL) {
-    preOutput_.value->addByBitCode(numClasses_, *label, *biases_->getW());
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuBias_,
+                             1,
+                             numClasses_ - 1,
+                             /* trans */ false,
+                             false);
+      cpuBias_->copyFrom(*biases_->getW());
+    } else {
+      cpuBias_ = biases_->getW();
+    }
+    preOutput_.value->addByBitCode(numClasses_, *cpuLabel_, *cpuBias_);
  }
  for (size_t i = 0; i < inputLayers_.size() - 1; ++i) {
    MatrixPtr input = getInputValue(i);
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuInput_,
+                             input->getHeight(),
+                             input->getWidth(),
+                             /* trans */ false,
+                             false);
+      Matrix::resizeOrCreate(cpuWeight_,
+                             weights_[i]->getW()->getHeight(),
+                             weights_[i]->getW()->getWidth(),
+                             /* trans */ false,
+                             false);
+      cpuInput_->copyFrom(*input);
+      cpuWeight_->copyFrom(*weights_[i]->getW());
+    } else {
+      cpuInput_ = input;
+      cpuWeight_ = weights_[i]->getW();
+    }
    preOutput_.value->mulByBitCode(
-        numClasses_, *label, *weights_[i]->getW(), *input);
+        numClasses_, *cpuLabel_, *cpuWeight_, *cpuInput_);
  }
  // keep consistent with the clipping in the following softrelu
  preOutput_.value->clip(-40.0, 40.0);
  preOutput_.value->sumByBitCode(numClasses_,
-                                 *label,
+                                 *cpuLabel_,
-                                 *output_.value,
+                                 *cpuOutput_,
                                 -1);  // scaleSum
  preOutput_.value->softrelu(*preOutput_.value);
-  MatrixPtr sum =
+  MatrixPtr sum = Matrix::create(batchSize, 1, /* trans= */ false, false);
-      Matrix::create(batchSize, 1, /* trans= */ false, useGpu(deviceId_));
  preOutput_.value->rowSum(*sum);
-  output_.value->add(*sum);
+  cpuOutput_->add(*sum);
+  if (useGpu_) {
+    output_.value->copyFrom(*cpuOutput_);
+  } else {
+    output_.value = cpuOutput_;
+  }
 }
 void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
  IVectorPtr label = getInput(*getLabelLayer()).ids;
+  if (useGpu_) {
+    IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
+    cpuLabel_->copyFrom(*label);
+  } else {
+    cpuLabel_ = label;
+  }
  preOutput_.grad->one();
  preOutput_.grad->softreluDerivative(*preOutput_.value);
-  preOutput_.grad->subByBitCode(numClasses_, *label);
+  preOutput_.grad->subByBitCode(numClasses_, *cpuLabel_);
  if (biases_ && biases_->getWGrad()) {
-    preOutput_.grad->addByBitCodeBackward(
+    MatrixPtr biases_grad = biases_->getWGrad();
-        numClasses_, *label, *biases_->getWGrad());
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuBias_,
+                             1,
+                             numClasses_ - 1,
+                             /* trans */ false,
+                             false);
+      cpuBias_->copyFrom(*biases_grad);
+    } else {
+      cpuBias_ = biases_grad;
+    }
+    preOutput_.grad->addByBitCodeBackward(numClasses_, *cpuLabel_, *cpuBias_);
+    if (useGpu_) {
+      biases_grad->copyFrom(*cpuBias_);
+    } else {
+      biases_grad = cpuBias_;
+    }
    /* Increasing the number of gradient */
    biases_->getParameterPtr()->incUpdate(callback);
  }
@@ -115,9 +177,31 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
    /* Calculate the W-gradient for the current layer */
    MatrixPtr input = getInputValue(i);
    if (weights_[i]->getWGrad()) {
+      MatrixPtr weights_grad = weights_[i]->getWGrad();
+      if (useGpu_) {
+        Matrix::resizeOrCreate(cpuInput_,
+                               input->getHeight(),
+                               input->getWidth(),
+                               /* trans */ false,
+                               false);
+        Matrix::resizeOrCreate(cpuWeightGrad_,
+                               weights_grad->getHeight(),
+                               weights_grad->getWidth(),
+                               /* trans */ false,
+                               false);
+        cpuInput_->copyFrom(*input);
+        cpuWeightGrad_->copyFrom(*weights_grad);
+      } else {
+        cpuInput_ = input;
+        cpuWeightGrad_ = weights_grad;
+      }
      preOutput_.grad->mulByBitCodeBackwardWeight(
-          numClasses_, *label, *weights_[i]->getWGrad(), *input);
+          numClasses_, *cpuLabel_, *cpuWeightGrad_, *cpuInput_);
+      if (useGpu_) {
+        weights_grad->copyFrom(*cpuWeightGrad_);
+      } else {
+        weights_grad = cpuWeightGrad_;
+      }
      /* Increasing the number of gradient */
      weights_[i]->getParameterPtr()->incUpdate(callback);
    }
@@ -125,8 +209,30 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
    /* Calculate the input layers error */
    MatrixPtr inputGrad = getInputGrad(i);
    if (inputGrad) {
+      if (useGpu_) {
+        Matrix::resizeOrCreate(cpuInputGrad_,
+                               inputGrad->getHeight(),
+                               inputGrad->getWidth(),
+                               /* trans */ false,
+                               false);
+        Matrix::resizeOrCreate(cpuWeight_,
+                               weights_[i]->getW()->getHeight(),
+                               weights_[i]->getW()->getWidth(),
+                               /* trans */ false,
+                               false);
+        cpuInputGrad_->copyFrom(*inputGrad);
+        cpuWeight_->copyFrom(*weights_[i]->getW());
+      } else {
+        cpuInputGrad_ = inputGrad;
+        cpuWeight_ = weights_[i]->getW();
+      }
      preOutput_.grad->mulByBitCodeBackwardError(
-          numClasses_, *label, *weights_[i]->getW(), *inputGrad);
+          numClasses_, *cpuLabel_, *cpuWeight_, *cpuInputGrad_);
+      if (useGpu_) {
+        inputGrad->copyFrom(*cpuInputGrad_);
+      } else {
+        inputGrad = cpuInputGrad_;
+      }
    }
  }
 }

--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.h
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
@@ -80,6 +80,15 @@ protected:
  int codeLength_;
  /// temporary result of output_
  Argument preOutput_;
+  /// The temporary variables in CPU memory.
+  MatrixPtr cpuWeight_;
+  MatrixPtr cpuWeightGrad_;
+  MatrixPtr cpuInput_;
+  MatrixPtr cpuInputGrad_;
+  MatrixPtr cpuBias_;
+  MatrixPtr cpuOutput_;
+  IVectorPtr cpuLabel_;
 };
 }  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNLRNLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNLRNLayer.cpp
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "MKLDNNLRNLayer.h"
+#include "paddle/utils/Logging.h"
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+namespace paddle {
+REGISTER_LAYER(mkldnn_lrn, MKLDNNLRNLayer);
+bool MKLDNNLRNLayer::init(const LayerMap& layerMap,
+                          const ParameterMap& parameterMap) {
+  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+  /* the size of inputs for norm-layer is 1 */
+  CHECK_EQ(config_.inputs_size(), 1UL);
+  const NormConfig& conf = config_.inputs(0).norm_conf();
+  localSize_ = conf.size();
+  alpha_ = conf.scale();
+  beta_ = conf.pow();
+  ic_ = conf.channels();
+  oc_ = ic_;
+  iw_ = conf.img_size();
+  ow_ = conf.output_x();
+  ih_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
+  oh_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
+  CHECK_EQ(iw_, ow_);
+  CHECK_EQ(ih_, oh_);
+  return true;
+}
+void MKLDNNLRNLayer::reshape(
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  reshapeInput(bs, ih, iw);
+  // ic_ and oc can not be changed
+  CHECK_EQ((size_t)ic,
+           inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
+      << "Input channel can not be changed";
+  oh = ih;
+  ow = iw;
+  reshapeOutput(oh, ow);
+  resizeOutput(bs, oc * oh * ow);
+}
+void MKLDNNLRNLayer::resetFwd(std::vector<primitive>& pipeline,
+                              std::vector<MKLDNNMatrixPtr>& inputs,
+                              MKLDNNMatrixPtr& out) {
+  resetFwdBuffers(inputs[0], out);
+  resetFwdPD(fwdPD_, inputs[0], out);
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], out);
+}
+void MKLDNNLRNLayer::resetBwd(std::vector<primitive>& pipeline,
+                              std::vector<MKLDNNMatrixPtr>& inputs,
+                              MKLDNNMatrixPtr& out) {
+  std::shared_ptr<lrn_bwd::primitive_desc> pd;
+  resetBwdBuffers(inputs[0], out);
+  resetBwdPD(pd, inputs[0], out);
+  resetBwdPipeline(pipeline, pd, inputs[0], out);
+}
+void MKLDNNLRNLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
+                                     MKLDNNMatrixPtr& out) {
+  resetInValue(in);
+  CHECK(in);
+  resetOutValue(out, in->getPrimitiveDesc());
+}
+void MKLDNNLRNLayer::resetFwdPD(std::shared_ptr<lrn_fwd::primitive_desc>& pd,
+                                MKLDNNMatrixPtr in,
+                                MKLDNNMatrixPtr out) {
+  prop_kind pk = passType_ == PASS_TEST ? prop_kind::forward_scoring
+                                        : prop_kind::forward_training;
+  auto fwdDesc = lrn_fwd::desc(pk,
+                               algorithm::lrn_across_channels,
+                               in->getMemoryDesc(),
+                               localSize_,
+                               alpha_,
+                               beta_,
+                               1.0f);
+  pd.reset(new lrn_fwd::primitive_desc(fwdDesc, engine_));
+  // prepare workspace if necessary
+  workspace_ =
+      passType_ != PASS_TEST
+          ? std::make_shared<memory>(memory(pd->workspace_primitive_desc()))
+          : nullptr;
+}
+void MKLDNNLRNLayer::resetFwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<lrn_fwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& out) {
+  fwd_ = workspace_
+             ? std::make_shared<lrn_fwd>(lrn_fwd(*pd, *in, *workspace_, *out))
+             : std::make_shared<lrn_fwd>(lrn_fwd(*pd, *in, *out));
+  pipeline.push_back(*fwd_);
+}
+void MKLDNNLRNLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
+                                     MKLDNNMatrixPtr& out) {
+  CHECK(inVals_[0] && outVal_);
+  resetOutGrad(out, outVal_->getPrimitiveDesc());
+  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
+}
+void MKLDNNLRNLayer::resetBwdPD(std::shared_ptr<lrn_bwd::primitive_desc>& pd,
+                                MKLDNNMatrixPtr& in,
+                                MKLDNNMatrixPtr& out) {
+  pd = nullptr;
+  if (in == nullptr) {
+    return;
+  }
+  CHECK(out);
+  auto bwdDesc = lrn_bwd::desc(algorithm::lrn_across_channels,
+                               in->getMemoryDesc(),
+                               out->getMemoryDesc(),
+                               localSize_,
+                               alpha_,
+                               beta_,
+                               1.0f);
+  pd.reset(new lrn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
+}
+void MKLDNNLRNLayer::resetBwdPipeline(
+    std::vector<primitive>& pipeline,
+    std::shared_ptr<lrn_bwd::primitive_desc>& pd,
+    MKLDNNMatrixPtr& in,
+    MKLDNNMatrixPtr& out) {
+  if (pd == nullptr) {
+    return;
+  }
+  CHECK(inVals_[0]);
+  CHECK(workspace_);
+  bwdData_ = std::make_shared<lrn_bwd>(
+      lrn_bwd(*pd, *inVals_[0], *out, *workspace_, *in));
+  pipeline.push_back(*bwdData_);
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNLRNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLRNLayer.h
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+namespace paddle {
+typedef mkldnn::lrn_forward lrn_fwd;
+typedef mkldnn::lrn_backward lrn_bwd;
+/**
+ * @brief A subclass of MKLDNNLayer LRN(Local Response Norm) layer.
+ *
+ * The config file api is mkldnn_lrn
+ */
+class MKLDNNLRNLayer : public MKLDNNLayer {
+protected:
+  // save forward primitive_desc, which can be used in backward
+  std::shared_ptr<lrn_fwd::primitive_desc> fwdPD_;
+  // according to https://github.com/01org/mkl-dnn/blob/master/tests/gtests/
+  // test_lrn_backward.cpp, lrn need workspace for backward
+  std::shared_ptr<mkldnn::memory> workspace_;
+  int localSize_;
+  float alpha_, beta_;  // scale and pow in paddle
+public:
+  explicit MKLDNNLRNLayer(const LayerConfig& config) : MKLDNNLayer(config) {}
+  ~MKLDNNLRNLayer() {}
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  void reshape(
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
+  void resetFwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+  void resetBwd(std::vector<mkldnn::primitive>& pipeline,
+                std::vector<MKLDNNMatrixPtr>& inputs,
+                MKLDNNMatrixPtr& out) override;
+protected:
+  void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
+  void resetFwdPD(std::shared_ptr<lrn_fwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr in,
+                  MKLDNNMatrixPtr out);
+  void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<lrn_fwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& out);
+  void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
+  void resetBwdPD(std::shared_ptr<lrn_bwd::primitive_desc>& pd,
+                  MKLDNNMatrixPtr& in,
+                  MKLDNNMatrixPtr& out);
+  void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
+                        std::shared_ptr<lrn_bwd::primitive_desc>& pd,
+                        MKLDNNMatrixPtr& in,
+                        MKLDNNMatrixPtr& out);
+};
+}  // namespace paddle
--- a/paddle/gserver/layers/PoolLayer.cpp
+++ b/paddle/gserver/layers/PoolLayer.cpp
@@ -45,6 +45,8 @@ bool PoolLayer::init(const LayerMap& layerMap,
  strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride();
  confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding();
  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
+  excludeMode_ = conf.has_exclude_mode() ? conf.exclude_mode() : true;
  return true;
 }

--- a/paddle/gserver/layers/PoolLayer.h
+++ b/paddle/gserver/layers/PoolLayer.h
@@ -38,6 +38,8 @@ protected:
  std::string poolType_;
+  bool excludeMode_;
 public:
  explicit PoolLayer(const LayerConfig& config) : Layer(config) {}

--- a/paddle/gserver/layers/PoolProjection.cpp
+++ b/paddle/gserver/layers/PoolProjection.cpp
@@ -36,6 +36,8 @@ PoolProjection::PoolProjection(const ProjectionConfig& config,
  strideY_ = conf.has_stride_y() ? conf.stride_y() : conf.stride();
  confPaddingY_ = conf.has_padding_y() ? conf.padding_y() : conf.padding();
  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
+  excludeMode_ = conf.has_exclude_mode() ? conf.exclude_mode() : true;
 }
 size_t PoolProjection::getSize() {
@@ -141,7 +143,8 @@ void AvgPoolProjection::forward() {
                       outputY_,
                       outputX_,
                       confPaddingY_,
-                       confPadding_);
+                       confPadding_,
+                       excludeMode_);
 }
 void AvgPoolProjection::backward(const UpdateCallback& callback) {
@@ -166,6 +169,7 @@ void AvgPoolProjection::backward(const UpdateCallback& callback) {
                             1,
                             1,
                             confPaddingY_,
-                             confPadding_);
+                             confPadding_,
+                             excludeMode_);
 }
 }  // namespace paddle
--- a/paddle/gserver/layers/PoolProjection.h
+++ b/paddle/gserver/layers/PoolProjection.h
@@ -28,6 +28,7 @@ protected:
  int confPaddingY_, confPadding_;
  size_t channels_;
  std::string poolType_;
+  bool excludeMode_;
 public:
  PoolProjection(const ProjectionConfig& config,

--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
 # gserver pacakge unittests
 add_simple_unittest(test_LinearChainCRF)
 add_simple_unittest(test_RecurrentLayer)
@@ -29,6 +28,26 @@ gserver_test(test_KmaxSeqScore)
 gserver_test(test_Expand)
 gserver_test(test_MaxPoolingWithMaskOutput)
+set(PYTHON_PATH 
+   ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d 
+   ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/gserver/tests)
+function(gserver_test_with_python TARGET)
+  add_unittest_without_exec(${TARGET} ${TARGET}.cpp)
+  add_test(NAME ${TARGET}
+    COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/${TARGET}
+      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
+endfunction()
+gserver_test_with_python(test_PyDataProvider2)
+if(WITH_PYTHON)
+    gserver_test_with_python(test_PyDataProvider)
+endif()
+if(NOT MOBILE_INFERENCE)
+    gserver_test_with_python(test_CompareTwoNets)
+    # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine, I will fix it.
+    gserver_test_with_python(test_RecurrentGradientMachine)
+endif()
 ########## test_MKLDNN layers and activations ##########
 if(WITH_MKLDNN)
    add_unittest_without_exec(test_MKLDNN
@@ -36,87 +55,43 @@ if(WITH_MKLDNN)
        MKLDNNTester.cpp
        LayerGradUtil.cpp)
    add_test(NAME test_MKLDNN
-        COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python
+        COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN
-            ${CMAKE_CURRENT_BINARY_DIR}/test_MKLDNN
            WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
 endif()
-############## test_PyDataProvider ########################
-if(WITH_PYTHON)
-    add_unittest_without_exec(test_PyDataProvider
-        test_PyDataProvider.cpp)
-    add_test(NAME test_PyDataProvider
-        COMMAND .set_python_path.sh -d ./gserver/tests:${PADDLE_SOURCE_DIR}/python/ ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider
-        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
-endif()
 ############### test_WarpCTCLayer #######################
 if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
    add_unittest_without_exec(test_WarpCTCLayer
        test_WarpCTCLayer.cpp)
    add_test(NAME test_WarpCTCLayer
        COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR}
        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
 endif()
 if(NOT MOBILE_INFERENCE)
-################## test_Evaluator #######################
+    ################## test_Evaluator #############
    add_unittest(test_Evaluator
        test_Evaluator.cpp)
-############### test_RecurrentGradientMachine ###############
+    ########### test_NetworkCompare ###############
-    # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine
-    # I will fix it.
-    add_unittest_without_exec(test_RecurrentGradientMachine
-        test_RecurrentGradientMachine.cpp)
-    add_test(NAME test_RecurrentGradientMachine
-        COMMAND .set_python_path.sh -d
-                ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
-                ${CMAKE_CURRENT_BINARY_DIR}/test_RecurrentGradientMachine
-        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
-############### test_NetworkCompare ###############
    add_unittest_without_exec(test_NetworkCompare
        test_NetworkCompare.cpp)
    if(WITH_GPU)
-        add_test(NAME test_NetworkCompare
+        set(use_gpu true)
-            COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=true
-            WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
    else()
-        add_test(NAME test_NetworkCompare
+        set(use_gpu false)
-            COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=false
-            WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
    endif()
-endif()
+    add_test(NAME test_NetworkCompare
+        COMMAND ${PYTHON_PATH} ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=${use_gpu}
+        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
-add_unittest_without_exec(test_PyDataProvider2
-        test_PyDataProvider2.cpp)
-add_test(NAME test_PyDataProvider2
-   COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2
-        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
-)
-################# test_CompareSparse ##################
+    ############ test_CompareSparse ################
-add_unittest_without_exec(test_CompareSparse
+    add_unittest_without_exec(test_CompareSparse
-    test_CompareSparse.cpp)
+        test_CompareSparse.cpp)
-if(NOT ON_TRAVIS)
+    if(NOT ON_TRAVIS)
-  add_test(NAME test_CompareSparse
+      add_test(NAME test_CompareSparse
-    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
+        COMMAND ${PYTHON_PATH} ./.set_port.sh -p port -n 6
-          ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
+                ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
-              ./.set_port.sh -p port -n 6
+        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
-                  ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
+    endif()
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 endif()
-################ test_CompareTwoNets ######################
-add_unittest_without_exec(test_CompareTwoNets
-    test_CompareTwoNets.cpp)
-add_test(NAME test_CompareTwoNets
-  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
-        ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
-        ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
-    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
--- a/paddle/gserver/tests/mkldnn_simple_net.conf
+++ b/paddle/gserver/tests/mkldnn_simple_net.conf
@@ -51,6 +51,8 @@ tmp = img_pool_layer(input=tmp,
            padding=1,
            pool_type=MaxPooling())
+tmp = img_cmrnorm_layer(input=tmp, size=5, scale=0.0001, power=0.75)
 tmp = fc_layer(input=tmp,
            size=channels,
            bias_attr=False,

--- a/paddle/gserver/tests/sequence_rnn_matched_inputs.py
+++ b/paddle/gserver/tests/sequence_rnn_matched_inputs.py
@@ -41,7 +41,7 @@ nonseq = embedding_layer(input=label, size=word_dim)
 # This hierarchical RNN is designed to be equivalent to the simple RNN in
-# sequence_rnn_multi_unequalength_inputs.conf
+# sequence_rnn_mixed_inputs.conf
 def outer_step(subseq, seq, nonseq, encoding):
    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)

--- a/paddle/gserver/tests/sequence_rnn_mixed_inputs.py
+++ b/paddle/gserver/tests/sequence_rnn_mixed_inputs.py
@@ -37,7 +37,7 @@ encoding = embedding_layer(input=data2, size=word_dim)
 # This hierarchical RNN is designed to be equivalent to the simple RNN in
-# sequence_rnn_multi_unequalength_inputs.conf
+# sequence_rnn_matched_inputs.conf
 def outer_step(subseq, seq, nonseq, encoding):
    outer_mem = memory(name="outer_rnn_state", size=hidden_dim)

--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -238,9 +238,24 @@ void testProjectionConv(size_t groups, bool isDeconv) {
                            /* caffeMode */ true);
  conv->set_output_x(output_x);
  conv->set_output_y(output_y);
+  LOG(INFO) << "DILATION:" << DILATION << "; output_x: " << output_x
+            << "; output_y: " << output_y;
  if (isDeconv) {
+    int deconv_image_x = imageSize(output_x,
+                                   (conv->filter_size() - 1) * DILATION + 1,
+                                   conv->padding(),
+                                   conv->stride(),
+                                   /* caffeMode */ true);
+    int deconv_image_y = imageSize(output_y,
+                                   (conv->filter_size_y() - 1) * DILATION + 1,
+                                   conv->padding_y(),
+                                   conv->stride_y(),
+                                   /* caffeMode */ true);
+    LOG(INFO) << " deconv_image_x: " << deconv_image_x
+              << "; deconv_image_y: " << deconv_image_y;
    conf.set_input_size(output_x * output_y * CHANNELS);
-    conf.set_output_size(IMAGE_SIZE * IMAGE_SIZE * NUM_FILTERS);
+    conf.set_output_size(deconv_image_x * deconv_image_y * NUM_FILTERS);
  } else {
    conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS);
    conf.set_output_size(output_x * output_y * NUM_FILTERS);
@@ -681,12 +696,13 @@ TEST(Layer, hsigmoidLayer) {
  config.layerConfig.add_inputs();
  config.layerConfig.add_inputs();
-  // Not support GPU now
+  for (auto useGpu : {false, true}) {
-  testLayerGrad(config,
+    testLayerGrad(config,
-                "hsigmoid",
+                  "hsigmoid",
-                100,
+                  100,
-                /* trans */ false, /* useGpu */
+                  /* trans */ false,
-                false);
+                  /* useGpu */ useGpu);
+  }
 }
 TEST(Layer, multi_cross) {
@@ -1210,7 +1226,10 @@ void setPoolConfig(TestConfig* config,
  pool->set_output_y(oh);
 }
-void testPoolLayer(const string& poolType, bool trans, bool useGpu) {
+void testPoolLayer(const string& poolType,
+                   bool trans,
+                   bool useGpu,
+                   bool excludeMode = true) {
  TestConfig config;
  config.inputDefs.push_back({INPUT_DATA, "layer_0", 3136, 0});
  LayerInputConfig* input = config.layerConfig.add_inputs();
@@ -1218,6 +1237,7 @@ void testPoolLayer(const string& poolType, bool trans, bool useGpu) {
  pool->set_img_size(14);
  pool->set_img_size_y(14);
+  pool->set_exclude_mode(excludeMode);
  setPoolConfig(&config, pool, poolType);
  config.layerConfig.set_size(pool->output_x() * pool->output_y() *
                              pool->channels());
@@ -1249,16 +1269,26 @@ void testPoolLayer2(const string& poolType, bool trans, bool useGpu) {
 TEST(Layer, PoolLayer) {
  testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ false);
+  testPoolLayer("avg-projection",
+                /* trans= */ false,
+                /* useGpu= */ false,
+                /* excludeMode= */ false);
  testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ false);
  testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ false);
 #ifdef PADDLE_WITH_CUDA
  testPoolLayer("avg-projection", /* trans= */ false, /* useGpu= */ true);
+  testPoolLayer("avg-projection",
+                /* trans= */ false,
+                /* useGpu= */ true,
+                /* excludeMode= */ false);
  testPoolLayer("max-projection", /* trans= */ false, /* useGpu= */ true);
  testPoolLayer("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
  testPoolLayer("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
  testPoolLayer2("cudnn-max-pool", /* trans= */ false, /* useGpu= */ true);
  testPoolLayer2("cudnn-avg-pool", /* trans= */ false, /* useGpu= */ true);
+  testPoolLayer2(
+      "cudnn-avg-incl-pad-pool", /* trans= */ false, /* useGpu= */ true);
  testPoolLayer("max-pool-with-mask", /* trans= */ false, /* useGpu= */ true);
 #endif
 }
@@ -2464,6 +2494,25 @@ TEST(Layer, L2DistanceLayer) {
  }
 }
+void testFactorizationMachineLayer(InputType type, bool useGpu) {
+  const int FACTOR_SIZE = 10;
+  TestConfig config;
+  config.layerConfig.set_type("factorization_machine");
+  config.layerConfig.set_factor_size(FACTOR_SIZE);
+  config.layerConfig.set_size(1);
+  config.biasSize = 0;
+  config.inputDefs.push_back({type, "layer_0", 128, 1280});
+  config.layerConfig.add_inputs();
+  testLayerGrad(config, "factorization_machine", 16, false, useGpu, false);
+}
+TEST(Layer, FactorizationMachineLayer) {
+  for (auto useGpu : {false, true}) {
+    testFactorizationMachineLayer(INPUT_DATA, useGpu);
+  }
+  testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false);
+}
 int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  initMain(argc, argv);

--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -272,6 +272,51 @@ TEST(MKLDNNLayer, BatchNormLayer) {
  testBatchNormLayer({4, 16, 8, 10});
 }
+struct testLRNDesc {
+  int bs, ic, ih, iw;
+  float scale, pow;
+  int localSize;
+};
+void getMKLDNNLRNConfig(TestConfig& cfg, const testLRNDesc& pm) {
+  cfg.layerConfig.set_type("mkldnn_lrn");
+  cfg.layerConfig.set_active_type("relu");
+  size_t layerSize = pm.ic * pm.ih * pm.iw;
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0});
+  LayerInputConfig* input = cfg.layerConfig.add_inputs();
+  NormConfig* norm = input->mutable_norm_conf();
+  norm->set_channels(pm.ic);
+  norm->set_size(pm.localSize);
+  norm->set_scale(pm.scale);
+  norm->set_pow(pm.pow);
+  norm->set_blocked(0);
+  norm->set_img_size(pm.iw);
+  norm->set_img_size_y(pm.ih);
+  norm->set_output_x(norm->img_size());
+  norm->set_output_y(norm->img_size_y());
+  cfg.layerConfig.set_size(layerSize);
+  cfg.biasSize = 0;
+}
+void testLRNLayer(const testLRNDesc& pm) {
+  TestConfig dnnConfig;
+  getMKLDNNLRNConfig(dnnConfig, pm);
+  // mkldnn_lrn <==> norm with cmrnorm-projection type
+  TestConfig refConfig = dnnConfig;
+  refConfig.layerConfig.set_type("norm");
+  LayerInputConfig* input = refConfig.layerConfig.mutable_inputs(0);
+  NormConfig* norm = input->mutable_norm_conf();
+  norm->set_norm_type("cmrnorm-projection");
+  norm->set_scale(norm->scale() / norm->size());
+  RUN_MKLDNN_TEST(dnnConfig, refConfig, pm)
+}
+TEST(MKLDNNLayer, LRNLayer) {
+  testLRNLayer({4, 10, 12, 12, 0.001f, 0.75f, 5});
+  testLRNLayer({2, 32, 6, 6, 0.001f, 0.75f, 5});
+  testLRNLayer({4, 16, 8, 10, 0.01f, 0.5f, 5});
+}
 struct testImageDesc {
  int bs, ic, ih, iw;
 };

--- a/paddle/math/CMakeLists.txt
+++ b/paddle/math/CMakeLists.txt
@@ -26,8 +26,6 @@ else()
 endif()
 if(MOBILE_INFERENCE)
-    list(REMOVE_ITEM MATH_SOURCES
-         ${CMAKE_CURRENT_SOURCE_DIR}/SIMDFunctions.cpp)
    # Remove sparse
    list(REMOVE_ITEM MATH_HEADERS
         ${CMAKE_CURRENT_SOURCE_DIR}/CpuSparseMatrix.h

--- a/paddle/math/CpuSparseMatrix.cpp
+++ b/paddle/math/CpuSparseMatrix.cpp
@@ -260,6 +260,35 @@ void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const {
  os << ";";
 }
+void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) {
+  CHECK(getFormat() != SPARSE_CSC) << "Not supported";
+  CHECK_EQ(height_, b.getHeight());
+  CHECK_EQ(width_, b.getWidth());
+  real* A = getValue();
+  real* B = b.getValue();
+  if (b.getValueType() == FLOAT_VALUE) {
+    for (size_t i = 0; i < height_; i++) {
+      size_t start = getRowStartIdx(i);
+      size_t end = getRowStartIdx(i + 1);
+      CHECK_EQ(start, b.getRowStartIdx(i));
+      CHECK_EQ(end, b.getRowStartIdx(i + 1));
+      for (size_t j = start; j < end; j++) {
+        A[j] = B[j] * c.getElement(i, cCol);
+      }
+    }
+  } else if (b.getValueType() == NO_VALUE) {
+    for (size_t i = 0; i < height_; i++) {
+      size_t start = getRowStartIdx(i);
+      size_t end = getRowStartIdx(i + 1);
+      CHECK_EQ(start, b.getRowStartIdx(i));
+      CHECK_EQ(end, b.getRowStartIdx(i + 1));
+      for (size_t j = start; j < end; j++) {
+        A[j] = c.getElement(i, cCol);
+      }
+    }
+  }
+}
 void CpuSparseMatrix::randomizeUniform() {
  CHECK_LE(elementCnt_, height_ * width_);
  if (valueType_ == FLOAT_VALUE) {

--- a/paddle/math/CpuSparseMatrix.h
+++ b/paddle/math/CpuSparseMatrix.h
@@ -239,6 +239,15 @@ public:
              const unsigned int* cols,
              const real* values);
+  /**
+   * @brief this_row = b_row * c_row[cCol]
+   *
+   * @param[in]  cCol   the column of matrix c used to scale each row of b
+   * @param[in]  b      CpuSparseMatrix
+   * @param[in]  c      Matrix
+   */
+  void rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c);
  void randomizeUniform();
  void copyFrom(const GpuSparseMatrix& src, hl_stream_t stream);

--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -1130,7 +1130,8 @@ void GpuMatrix::avgPoolForward(Matrix& inputMat,
                               size_t outputH,
                               size_t outputW,
                               size_t paddingH,
-                               size_t paddingW) {
+                               size_t paddingW,
+                               bool excludeMode) {
  CHECK(inputMat.useGpu_ == true) << "Matrix type are not equal";
  real* inputData = inputMat.getData();
@@ -1153,7 +1154,8 @@ void GpuMatrix::avgPoolForward(Matrix& inputMat,
                     paddingH,
                     paddingW,
                     data_,
-                     getStride());
+                     getStride(),
+                     excludeMode);
 }
 void GpuMatrix::avgPoolBackward(Matrix& outGrad,
@@ -1168,7 +1170,8 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad,
                                real scaleTargets,
                                real scaleOutput,
                                size_t paddingH,
-                                size_t paddingW) {
+                                size_t paddingW,
+                                bool excludeMode) {
  CHECK(outGrad.useGpu_ == true) << "Matrix type are not equal";
  real* outDiff = outGrad.getData();
@@ -1194,7 +1197,8 @@ void GpuMatrix::avgPoolBackward(Matrix& outGrad,
                      scaleTargets,
                      scaleOutput,
                      data_,
-                      outGrad.getStride());
+                      outGrad.getStride(),
+                      excludeMode);
 }
 void GpuMatrix::maxPool3DForward(Matrix& inputMat,
@@ -2136,7 +2140,8 @@ void CpuMatrix::avgPoolForward(Matrix& input,
                               size_t outputH,
                               size_t outputW,
                               size_t paddingH,
-                               size_t paddingW) {
+                               size_t paddingW,
+                               bool excludeMode) {
  // The main loop
  size_t num = input.getHeight();
  size_t inLength = imgSizeH * imgSizeW;
@@ -2165,7 +2170,8 @@ void CpuMatrix::avgPoolForward(Matrix& input,
              tgtData[ph * outputW + pw] += inData[h * imgSizeW + w];
            }
          }
-          int poolSize = (hend - hstart) * (wend - wstart);
+          int poolSize =
+              excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
          CHECK(poolSize);
          tgtData[ph * outputW + pw] /= poolSize;
        }
@@ -2189,7 +2195,8 @@ void CpuMatrix::avgPoolBackward(Matrix& input,
                                real scaleTargets,
                                real scaleOutput,
                                size_t paddingH,
-                                size_t paddingW) {
+                                size_t paddingW,
+                                bool excludeMode) {
  size_t num = input.getHeight();
  size_t channels = input.getWidth() / outputH / outputW;
  size_t inLength = imgSizeH * imgSizeW;
@@ -2211,7 +2218,8 @@ void CpuMatrix::avgPoolBackward(Matrix& input,
          int wstart = pw * strideW - paddingW;
          int wend = std::min(wstart + sizeX, imgSizeW);
          wstart = std::max(wstart, 0);
-          int poolSize = (hend - hstart) * (wend - wstart);
+          int poolSize =
+              excludeMode ? (hend - hstart) * (wend - wstart) : sizeY * sizeX;
          CHECK(poolSize);
          for (int h = hstart; h < hend; ++h) {

--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
--- a/paddle/math/SIMDFunctions.h
+++ b/paddle/math/SIMDFunctions.h
--- a/paddle/math/float16.h
+++ b/paddle/math/float16.h
--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
--- a/paddle/math/tests/test_float16.cpp
+++ b/paddle/math/tests/test_float16.cpp
--- a/paddle/math/tests/test_float16.cu
+++ b/paddle/math/tests/test_float16.cu
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
--- a/paddle/operators/activation_op.cc
+++ b/paddle/operators/activation_op.cc
--- a/paddle/operators/adamax_op.cc
+++ b/paddle/operators/adamax_op.cc
--- a/paddle/operators/batch_norm_op.cc
+++ b/paddle/operators/batch_norm_op.cc
--- a/paddle/operators/batch_norm_op.cu.cc
+++ b/paddle/operators/batch_norm_op.cu.cc
--- a/paddle/operators/chunk_eval_op.cc
+++ b/paddle/operators/chunk_eval_op.cc
--- a/paddle/operators/chunk_eval_op.h
+++ b/paddle/operators/chunk_eval_op.h
--- a/paddle/operators/clip_by_norm_op.cc
+++ b/paddle/operators/clip_by_norm_op.cc
--- a/paddle/operators/clip_op.cc
+++ b/paddle/operators/clip_op.cc
--- a/paddle/operators/concat_op.cc
+++ b/paddle/operators/concat_op.cc
--- a/paddle/operators/conv_cudnn_op.cu.cc
+++ b/paddle/operators/conv_cudnn_op.cu.cc
--- a/paddle/operators/conv_op.cc
+++ b/paddle/operators/conv_op.cc
--- a/paddle/operators/conv_transpose_op.cc
+++ b/paddle/operators/conv_transpose_op.cc
--- a/paddle/operators/conv_transpose_op.h
+++ b/paddle/operators/conv_transpose_op.h
--- a/paddle/operators/crf_decoding_op.cc
+++ b/paddle/operators/crf_decoding_op.cc
--- a/paddle/operators/crf_decoding_op.h
+++ b/paddle/operators/crf_decoding_op.h
--- a/paddle/operators/detail/CMakeLists.txt
+++ b/paddle/operators/detail/CMakeLists.txt
--- a/paddle/operators/detail/recv_impl.cc
+++ b/paddle/operators/detail/recv_impl.cc
--- a/paddle/operators/detail/send_impl.cc
+++ b/paddle/operators/detail/send_impl.cc
--- a/paddle/operators/detail/send_recv.proto
+++ b/paddle/operators/detail/send_recv.proto
--- a/paddle/operators/detail/send_recv_impl.h
+++ b/paddle/operators/detail/send_recv_impl.h
--- a/paddle/operators/detail/simple_block_queue.h
+++ b/paddle/operators/detail/simple_block_queue.h
--- a/paddle/operators/elementwise_add_op.h
+++ b/paddle/operators/elementwise_add_op.h
--- a/paddle/operators/elementwise_op.h
+++ b/paddle/operators/elementwise_op.h
--- a/paddle/operators/elementwise_op_function.h
+++ b/paddle/operators/elementwise_op_function.h
--- a/paddle/operators/gru_op.h
+++ b/paddle/operators/gru_op.h
--- a/paddle/operators/hinge_loss_op.cc
+++ b/paddle/operators/hinge_loss_op.cc
--- a/paddle/operators/hinge_loss_op.cu
+++ b/paddle/operators/hinge_loss_op.cu
--- a/paddle/operators/hinge_loss_op.h
+++ b/paddle/operators/hinge_loss_op.h
--- a/paddle/operators/increment_op.cc
+++ b/paddle/operators/increment_op.cc
--- a/paddle/operators/load_op.cc
+++ b/paddle/operators/load_op.cc
--- a/paddle/operators/lod_array_length_op.cc
+++ b/paddle/operators/lod_array_length_op.cc
--- a/paddle/operators/lod_tensor_to_array_op.cc
+++ b/paddle/operators/lod_tensor_to_array_op.cc
--- a/paddle/operators/log_loss_op.cc
+++ b/paddle/operators/log_loss_op.cc
--- a/paddle/operators/log_loss_op.cu
+++ b/paddle/operators/log_loss_op.cu
--- a/paddle/operators/log_loss_op.h
+++ b/paddle/operators/log_loss_op.h
--- a/paddle/operators/logical_op.cc
+++ b/paddle/operators/logical_op.cc
--- a/paddle/operators/lrn_op.cc
+++ b/paddle/operators/lrn_op.cc
--- a/paddle/operators/lrn_op.cu
+++ b/paddle/operators/lrn_op.cu
--- a/paddle/operators/lrn_op.h
+++ b/paddle/operators/lrn_op.h
--- a/paddle/operators/lstm_op.cc
+++ b/paddle/operators/lstm_op.cc
--- a/paddle/operators/lstm_op.h
+++ b/paddle/operators/lstm_op.h
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
--- a/paddle/operators/math/detail/gru_cpu_kernel.h
+++ b/paddle/operators/math/detail/gru_cpu_kernel.h
--- a/paddle/operators/math/detail/gru_gpu_kernel.h
+++ b/paddle/operators/math/detail/gru_gpu_kernel.h
--- a/paddle/operators/math/detail/gru_kernel.h
+++ b/paddle/operators/math/detail/gru_kernel.h
--- a/paddle/operators/math/detail/lstm_cpu_kernel.h
+++ b/paddle/operators/math/detail/lstm_cpu_kernel.h
--- a/paddle/operators/math/detail/lstm_gpu_kernel.h
+++ b/paddle/operators/math/detail/lstm_gpu_kernel.h
--- a/paddle/operators/math/detail/lstm_kernel.h
+++ b/paddle/operators/math/detail/lstm_kernel.h
--- a/paddle/operators/math/gru_compute.cc
+++ b/paddle/operators/math/gru_compute.cc
--- a/paddle/operators/math/gru_compute.cu
+++ b/paddle/operators/math/gru_compute.cu
--- a/paddle/operators/math/gru_compute.h
+++ b/paddle/operators/math/gru_compute.h
--- a/paddle/operators/math/lstm_compute.cc
+++ b/paddle/operators/math/lstm_compute.cc
--- a/paddle/operators/math/lstm_compute.h
+++ b/paddle/operators/math/lstm_compute.h
--- a/paddle/operators/math/maxouting.cc
+++ b/paddle/operators/math/maxouting.cc
--- a/paddle/operators/math/maxouting.cu
+++ b/paddle/operators/math/maxouting.cu
--- a/paddle/operators/math/maxouting.h
+++ b/paddle/operators/math/maxouting.h
--- a/paddle/operators/math/unpooling.cc
+++ b/paddle/operators/math/unpooling.cc
--- a/paddle/operators/math/unpooling.cu
+++ b/paddle/operators/math/unpooling.cu
--- a/paddle/operators/math/unpooling.h
+++ b/paddle/operators/math/unpooling.h
--- a/paddle/operators/maxout_op.cc
+++ b/paddle/operators/maxout_op.cc
--- a/paddle/operators/maxout_op.cu.cc
+++ b/paddle/operators/maxout_op.cu.cc
--- a/paddle/operators/maxout_op.h
+++ b/paddle/operators/maxout_op.h
--- a/paddle/operators/momentum_op.cc
+++ b/paddle/operators/momentum_op.cc
--- a/paddle/operators/momentum_op.cu
+++ b/paddle/operators/momentum_op.cu
--- a/paddle/operators/momentum_op.h
+++ b/paddle/operators/momentum_op.h
--- a/paddle/operators/multiplex_op.cc
+++ b/paddle/operators/multiplex_op.cc
--- a/paddle/operators/nce_op.cc
+++ b/paddle/operators/nce_op.cc
--- a/paddle/operators/nce_op.h
+++ b/paddle/operators/nce_op.h
--- a/paddle/operators/pool_op.cc
+++ b/paddle/operators/pool_op.cc
--- a/paddle/operators/pool_with_index_op.cc
+++ b/paddle/operators/pool_with_index_op.cc
--- a/paddle/operators/rank_loss_op.cc
+++ b/paddle/operators/rank_loss_op.cc
--- a/paddle/operators/rank_loss_op.cu
+++ b/paddle/operators/rank_loss_op.cu
--- a/paddle/operators/rank_loss_op.h
+++ b/paddle/operators/rank_loss_op.h
--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
--- a/paddle/operators/recv_op.cc
+++ b/paddle/operators/recv_op.cc
--- a/paddle/operators/reshape_op.cc
+++ b/paddle/operators/reshape_op.cc
--- a/paddle/operators/reshape_op.cu.cc
+++ b/paddle/operators/reshape_op.cu.cc
--- a/paddle/operators/reshape_op.h
+++ b/paddle/operators/reshape_op.h
--- a/paddle/operators/roi_pool_op.cc
+++ b/paddle/operators/roi_pool_op.cc
--- a/paddle/operators/roi_pool_op.cu
+++ b/paddle/operators/roi_pool_op.cu
--- a/paddle/operators/roi_pool_op.h
+++ b/paddle/operators/roi_pool_op.h
--- a/paddle/operators/save_op.cc
+++ b/paddle/operators/save_op.cc
--- a/paddle/operators/scale_op.cc
+++ b/paddle/operators/scale_op.cc
--- a/paddle/operators/scale_op.cu
+++ b/paddle/operators/scale_op.cu
--- a/paddle/operators/scatter_op.cc
+++ b/paddle/operators/scatter_op.cc
--- a/paddle/operators/send_op.cc
+++ b/paddle/operators/send_op.cc
--- a/paddle/operators/send_recv_op_test.cc
+++ b/paddle/operators/send_recv_op_test.cc
--- a/paddle/operators/sequence_pool_op.cc
+++ b/paddle/operators/sequence_pool_op.cc
--- a/paddle/operators/sequence_slice_op.cc
+++ b/paddle/operators/sequence_slice_op.cc
--- a/paddle/operators/sequence_slice_op.h
+++ b/paddle/operators/sequence_slice_op.h
--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
--- a/paddle/operators/shrink_rnn_memory_op.cc
+++ b/paddle/operators/shrink_rnn_memory_op.cc
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc
--- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
+++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h
--- a/paddle/operators/smooth_l1_loss_op.cc
+++ b/paddle/operators/smooth_l1_loss_op.cc
--- a/paddle/operators/softmax_with_cross_entropy_op.cc
+++ b/paddle/operators/softmax_with_cross_entropy_op.cc
--- a/paddle/operators/sum_op.cc
+++ b/paddle/operators/sum_op.cc
--- a/paddle/operators/sum_op.h
+++ b/paddle/operators/sum_op.h
--- a/paddle/operators/tensor_array_read_write_op.cc
+++ b/paddle/operators/tensor_array_read_write_op.cc
--- a/paddle/operators/unpool_op.cc
+++ b/paddle/operators/unpool_op.cc
--- a/paddle/operators/unpool_op.cu.cc
+++ b/paddle/operators/unpool_op.cu.cc
--- a/paddle/operators/unpool_op.h
+++ b/paddle/operators/unpool_op.h
--- a/paddle/operators/while_op.cc
+++ b/paddle/operators/while_op.cc
--- a/paddle/optimizer/parameter_optimizer_test.cc
+++ b/paddle/optimizer/parameter_optimizer_test.cc
--- a/paddle/optimizer/serialization_test.cc
+++ b/paddle/optimizer/serialization_test.cc
--- a/paddle/platform/cuda_profiler.h
+++ b/paddle/platform/cuda_profiler.h
--- a/paddle/platform/dynload/cudnn.cc
+++ b/paddle/platform/dynload/cudnn.cc
--- a/paddle/platform/dynload/cudnn.h
+++ b/paddle/platform/dynload/cudnn.h
--- a/paddle/platform/dynload/dynamic_loader.cc
+++ b/paddle/platform/dynload/dynamic_loader.cc
--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
--- a/paddle/pserver/CMakeLists.txt
+++ b/paddle/pserver/CMakeLists.txt
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
--- a/paddle/scripts/travis/build_doc.sh
+++ b/paddle/scripts/travis/build_doc.sh
--- a/paddle/testing/CMakeLists.txt
+++ b/paddle/testing/CMakeLists.txt
--- a/paddle/testing/paddle_gtest_main.cc
+++ b/paddle/testing/paddle_gtest_main.cc
--- a/paddle/trainer/CMakeLists.txt
+++ b/paddle/trainer/CMakeLists.txt
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
--- a/python/paddle/trainer_config_helpers/poolings.py
+++ b/python/paddle/trainer_config_helpers/poolings.py
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr
--- a/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py
--- a/python/paddle/v2/__init__.py
+++ b/python/paddle/v2/__init__.py
--- a/python/paddle/v2/dataset/uci_housing.py
+++ b/python/paddle/v2/dataset/uci_housing.py
--- a/python/paddle/v2/fluid/__init__.py
+++ b/python/paddle/v2/fluid/__init__.py
--- a/python/paddle/v2/fluid/data_feeder.py
+++ b/python/paddle/v2/fluid/data_feeder.py
--- a/python/paddle/v2/fluid/evaluator.py
+++ b/python/paddle/v2/fluid/evaluator.py
--- a/python/paddle/v2/fluid/executor.py
+++ b/python/paddle/v2/fluid/executor.py
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
--- a/python/paddle/v2/fluid/initializer.py
+++ b/python/paddle/v2/fluid/initializer.py
--- a/python/paddle/v2/fluid/io.py
+++ b/python/paddle/v2/fluid/io.py
--- a/python/paddle/v2/fluid/layer_helper.py
+++ b/python/paddle/v2/fluid/layer_helper.py
--- a/python/paddle/v2/fluid/layers.py
+++ b/python/paddle/v2/fluid/layers.py
--- a/python/paddle/v2/fluid/optimizer.py
+++ b/python/paddle/v2/fluid/optimizer.py
--- a/python/paddle/v2/fluid/param_attr.py
+++ b/python/paddle/v2/fluid/param_attr.py
--- a/python/paddle/v2/fluid/profiler.py
+++ b/python/paddle/v2/fluid/profiler.py
--- a/python/paddle/v2/fluid/regularizer.py
+++ b/python/paddle/v2/fluid/regularizer.py
--- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py
--- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
+++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py
--- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
--- a/python/paddle/v2/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/v2/fluid/tests/book/test_machine_translation.py
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py
--- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
+++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py
--- a/python/paddle/v2/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py
--- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
+++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py
--- a/python/paddle/v2/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py
--- a/python/paddle/v2/fluid/tests/demo/fc_gan.py
+++ b/python/paddle/v2/fluid/tests/demo/fc_gan.py
--- a/python/paddle/v2/fluid/tests/test_array_read_write_op.py
+++ b/python/paddle/v2/fluid/tests/test_array_read_write_op.py
--- a/python/paddle/v2/fluid/tests/test_batch_norm_op.py
+++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py
--- a/python/paddle/v2/fluid/tests/test_chunk_eval_op.py
+++ b/python/paddle/v2/fluid/tests/test_chunk_eval_op.py
--- a/python/paddle/v2/fluid/tests/test_conditional_block.py
+++ b/python/paddle/v2/fluid/tests/test_conditional_block.py
--- a/python/paddle/v2/fluid/tests/test_crf_decoding_op.py
+++ b/python/paddle/v2/fluid/tests/test_crf_decoding_op.py
--- a/python/paddle/v2/fluid/tests/test_data_feeder.py
+++ b/python/paddle/v2/fluid/tests/test_data_feeder.py
--- a/python/paddle/v2/fluid/tests/test_dyn_rnn.py
+++ b/python/paddle/v2/fluid/tests/test_dyn_rnn.py
--- a/python/paddle/v2/fluid/tests/test_executor_and_mul.py
+++ b/python/paddle/v2/fluid/tests/test_executor_and_mul.py
--- a/python/paddle/v2/fluid/tests/test_hinge_loss_op.py
+++ b/python/paddle/v2/fluid/tests/test_hinge_loss_op.py
--- a/python/paddle/v2/fluid/tests/test_image_classification_layer.py
+++ b/python/paddle/v2/fluid/tests/test_image_classification_layer.py
--- a/python/paddle/v2/fluid/tests/test_initializer.py
+++ b/python/paddle/v2/fluid/tests/test_initializer.py
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
--- a/python/paddle/v2/fluid/tests/test_lod_rank_table.py
+++ b/python/paddle/v2/fluid/tests/test_lod_rank_table.py
--- a/python/paddle/v2/fluid/tests/test_log_loss_op.py
+++ b/python/paddle/v2/fluid/tests/test_log_loss_op.py
--- a/python/paddle/v2/fluid/tests/test_lrn_op.py
+++ b/python/paddle/v2/fluid/tests/test_lrn_op.py
--- a/python/paddle/v2/fluid/tests/test_maxout_op.py
+++ b/python/paddle/v2/fluid/tests/test_maxout_op.py
--- a/python/paddle/v2/fluid/tests/test_nce.py
+++ b/python/paddle/v2/fluid/tests/test_nce.py
--- a/python/paddle/v2/fluid/tests/test_operator_desc.py
+++ b/python/paddle/v2/fluid/tests/test_operator_desc.py
--- a/python/paddle/v2/fluid/tests/test_parameter.py
+++ b/python/paddle/v2/fluid/tests/test_parameter.py
--- a/python/paddle/v2/fluid/tests/test_profiler.py
+++ b/python/paddle/v2/fluid/tests/test_profiler.py
--- a/python/paddle/v2/fluid/tests/test_program.py
+++ b/python/paddle/v2/fluid/tests/test_program.py
--- a/python/paddle/v2/fluid/tests/test_recurrent_op.py
+++ b/python/paddle/v2/fluid/tests/test_recurrent_op.py
--- a/python/paddle/v2/fluid/tests/test_roi_pool_op.py
+++ b/python/paddle/v2/fluid/tests/test_roi_pool_op.py
--- a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
+++ b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py
--- a/python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py
+++ b/python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py
--- a/python/paddle/v2/fluid/tests/test_unpool_op.py
+++ b/python/paddle/v2/fluid/tests/test_unpool_op.py
--- a/python/paddle/v2/fluid/tests/test_variable.py
+++ b/python/paddle/v2/fluid/tests/test_variable.py
--- a/python/paddle/v2/reader/decorator.py
+++ b/python/paddle/v2/reader/decorator.py
--- a/python/setup.py.in
+++ b/python/setup.py.in