diff --git a/.clang-format b/.clang-format index 9ba433b17362424973626470d930356c2173dd84..aff93435f58c522f5ed1090aef2005f76e91cf31 100644 --- a/.clang-format +++ b/.clang-format @@ -25,4 +25,3 @@ AllowAllParametersOfDeclarationOnNextLine: true BinPackParameters: false BinPackArguments: false ... - diff --git a/.travis.yml b/.travis.yml index c51e02eb79a9e53a2b8d1d663e8f0c3e0d8c3a61..e2d49daa1981396628efa5d16459eb70e9e76884 100644 --- a/.travis.yml +++ b/.travis.yml @@ -42,7 +42,7 @@ before_install: script: - | timeout 2580 paddle/scripts/travis/${JOB}.sh # 43min timeout - RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true; else false; fi; + RESULT=$?; if [ $RESULT -eq 0 ] || [ $RESULT -eq 142 ]; then true ;else exit 1; fi; - | if [[ "$JOB" != "build_doc" ]]; then exit 0; fi; if [[ "$TRAVIS_PULL_REQUEST" != "false" ]]; then exit 0; fi; diff --git a/CMakeLists.txt b/CMakeLists.txt index 65164b8472b902be8b0b9d5fb99807d012b8a666..e76512166fcaea5daf2a67d1259331b680f15b7c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -133,6 +133,8 @@ include(external/any) # download libn::any include(external/eigen) # download eigen3 include(external/pybind11) # download pybind11 include(external/nccl) +include(external/cares) +include(external/grpc) include(cudnn) # set cudnn libraries, must before configure include(configure) # add paddle env configuration diff --git a/Dockerfile b/Dockerfile index 150344a8116e2be9b5bab8e5fdcc9c37f4025020..857d3f3e5f64791146741ffb29feabfcb2ecbb84 100644 --- a/Dockerfile +++ b/Dockerfile @@ -29,7 +29,7 @@ RUN apt-get update && \ automake locales clang-format swig doxygen cmake \ liblapack-dev liblapacke-dev libboost-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \ - net-tools && \ + net-tools libtool && \ apt-get clean -y # Install Go and glide diff --git a/benchmark/paddle/image/googlenet.py b/benchmark/paddle/image/googlenet.py index a88ecac67d9e677f14f6dc24ba9a337b1245243f..7059c13bd2c2b98eb3fbcf633a6f7064e54d5402 100644 --- a/benchmark/paddle/image/googlenet.py +++ b/benchmark/paddle/image/googlenet.py @@ -6,10 +6,21 @@ width = 224 num_class = 1000 batch_size = get_config_arg('batch_size', int, 128) use_gpu = get_config_arg('use_gpu', bool, True) - -args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} +is_infer = get_config_arg("is_infer", bool, False) + +args = { + 'height': height, + 'width': width, + 'color': True, + 'num_class': num_class, + 'is_infer': is_infer +} define_py_data_sources2( - "train.list", None, module="provider", obj="process", args=args) + "train.list" if not is_infer else None, + "test.list" if is_infer else None, + module="provider", + obj="process", + args=args) settings( batch_size=batch_size, @@ -146,7 +157,6 @@ def inception(name, input, channels, \ return cat -lab = data_layer(name="label", size=1000) data = data_layer(name="input", size=3 * height * width) # stage 1 @@ -224,6 +234,10 @@ pool5 = img_pool_layer( dropout = dropout_layer(name="dropout", input=pool5, dropout_rate=0.4) out3 = fc_layer( name="output3", input=dropout, size=1000, act=SoftmaxActivation()) -loss3 = cross_entropy(name='loss3', input=out3, label=lab) -outputs(loss3) +if is_infer: + outputs(out3) +else: + lab = data_layer(name="label", size=num_class) + loss3 = cross_entropy(name='loss3', input=out3, label=lab) + outputs(loss3) diff --git a/benchmark/paddle/image/provider.py b/benchmark/paddle/image/provider.py index 4703944c8722552d56ba80a8e0663de5fb4df53d..927b1759941f362ef4b5ffe84dd01332986d9306 100644 --- 
a/benchmark/paddle/image/provider.py +++ b/benchmark/paddle/image/provider.py @@ -13,14 +13,20 @@ def initHook(settings, height, width, color, num_class, **kwargs): settings.data_size = settings.height * settings.width * 3 else: settings.data_size = settings.height * settings.width - - settings.slots = [dense_vector(settings.data_size), integer_value(1)] + settings.is_infer = kwargs.get('is_infer', False) + if settings.is_infer: + settings.slots = [dense_vector(settings.data_size)] + else: + settings.slots = [dense_vector(settings.data_size), integer_value(1)] @provider( init_hook=initHook, min_pool_size=-1, cache=CacheType.CACHE_PASS_IN_MEM) def process(settings, file_list): - for i in xrange(1024): + for i in xrange(2560 if settings.is_infer else 1024): img = np.random.rand(1, settings.data_size).reshape(-1, 1).flatten() - lab = random.randint(0, settings.num_class - 1) - yield img.astype('float32'), int(lab) + if settings.is_infer: + yield img.astype('float32') + else: + lab = random.randint(0, settings.num_class - 1) + yield img.astype('float32'), int(lab) diff --git a/benchmark/paddle/image/resnet.py b/benchmark/paddle/image/resnet.py index 6ae1857642e8df4b3859eec68a3a5227d1c4fcb3..4a14363ff1db48a5072cbb5f5eb3bc9241ffca8f 100644 --- a/benchmark/paddle/image/resnet.py +++ b/benchmark/paddle/image/resnet.py @@ -6,11 +6,21 @@ width = 224 num_class = 1000 batch_size = get_config_arg('batch_size', int, 64) layer_num = get_config_arg("layer_num", int, 50) -is_test = get_config_arg("is_test", bool, False) - -args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} +is_infer = get_config_arg("is_infer", bool, False) + +args = { + 'height': height, + 'width': width, + 'color': True, + 'num_class': num_class, + 'is_infer': is_infer +} define_py_data_sources2( - "train.list", None, module="provider", obj="process", args=args) + "train.list" if not is_infer else None, + "test.list" if is_infer else None, + module="provider", + obj="process", + args=args) settings( batch_size=batch_size, @@ -45,7 +55,10 @@ def conv_bn_layer(name, act=LinearActivation(), bias_attr=False) return batch_norm_layer( - name=name + "_bn", input=tmp, act=active_type, use_global_stats=is_test) + name=name + "_bn", + input=tmp, + act=active_type, + use_global_stats=is_infer) def bottleneck_block(name, input, num_filters1, num_filters2): @@ -207,7 +220,9 @@ elif layer_num == 152: else: print("Wrong layer number.") -lbl = data_layer(name="label", size=num_class) -loss = cross_entropy(name='loss', input=resnet, label=lbl) -inputs(img, lbl) -outputs(loss) +if is_infer: + outputs(resnet) +else: + lbl = data_layer(name="label", size=num_class) + loss = cross_entropy(name='loss', input=resnet, label=lbl) + outputs(loss) diff --git a/benchmark/paddle/image/run_mkldnn_infer.sh b/benchmark/paddle/image/run_mkldnn_infer.sh new file mode 100755 index 0000000000000000000000000000000000000000..03a76c0540092501b33e1fdd430ae4e754744fd0 --- /dev/null +++ b/benchmark/paddle/image/run_mkldnn_infer.sh @@ -0,0 +1,86 @@ +set -e + +function clock_to_seconds() { + hours=`echo $1 | awk -F ':' '{print $1}'` + mins=`echo $1 | awk -F ':' '{print $2}'` + secs=`echo $1 | awk -F ':' '{print $3}'` + echo `bc -l <<< "$secs + $mins * 60 + $hours * 3600"` +} + +function infer() { + unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY + topology=$1 + layer_num=$2 + bs=$3 + use_mkldnn=$4 + if [ $4 == "True" ]; then + thread=1 + log="logs/infer-${topology}-${layer_num}-mkldnn-${bs}.log" + elif [ $4 == "False" ]; then + 
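# Without MKL-DNN, run one trainer thread per core, but no more threads than the batch size.
+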
thread=`nproc`
+    if [ $thread -gt $bs ]; then
+      thread=$bs
+    fi
+    log="logs/infer-${topology}-${layer_num}-${thread}mklml-${bs}.log"
+  else
+    echo "Wrong input $4, use True or False."
+    exit 0
+  fi
+
+  models_in="models/${topology}-${layer_num}/pass-00000/"
+  if [ ! -d $models_in ]; then
+    echo "Training model ${topology}_${layer_num}"
+    paddle train --job=train \
+      --config="${topology}.py" \
+      --use_mkldnn=True \
+      --use_gpu=False \
+      --trainer_count=1 \
+      --num_passes=1 \
+      --save_dir="models/${topology}-${layer_num}" \
+      --config_args="batch_size=128,layer_num=${layer_num}" \
+      > /dev/null 2>&1
+    echo "Done"
+  fi
+  log_period=$((256 / bs))
+  paddle train --job=test \
+    --config="${topology}.py" \
+    --use_mkldnn=$use_mkldnn \
+    --use_gpu=False \
+    --trainer_count=$thread \
+    --log_period=$log_period \
+    --config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True" \
+    --init_model_path=$models_in \
+    2>&1 | tee ${log}
+
+  # Calculate FPS over the last 5 log periods (1280 samples);
+  # the periods before that are treated as warm-up time.
+  start=`tail ${log} -n 7 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
+  end=`tail ${log} -n 2 | head -n 1 | awk -F ' ' '{print $2}' | xargs`
+  start_sec=`clock_to_seconds $start`
+  end_sec=`clock_to_seconds $end`
+  fps=`bc <<< "scale = 2; 1280 / ($end_sec - $start_sec)"`
+  echo "Last 1280 samples start: ${start}(${start_sec} sec), end: ${end}(${end_sec} sec)" >> ${log}
+  echo "FPS: $fps images/sec" >> ${log}
+}
+
+if [ ! -f "train.list" ]; then
+  echo " " > train.list
+fi
+if [ ! -f "test.list" ]; then
+  echo " " > test.list
+fi
+if [ ! -d "logs" ]; then
+  mkdir logs
+fi
+if [ ! -d "models" ]; then
+  mkdir -p models
+fi
+
+# inference benchmark
+for use_mkldnn in True False; do
+  for batchsize in 1 2 4 8 16; do
+    infer googlenet v1 $batchsize $use_mkldnn
+    infer resnet 50 $batchsize $use_mkldnn
+    infer vgg 19 $batchsize $use_mkldnn
+  done
+done
diff --git a/benchmark/paddle/image/run_mkldnn.sh b/benchmark/paddle/image/run_mkldnn_train.sh
similarity index 79%
rename from benchmark/paddle/image/run_mkldnn.sh
rename to benchmark/paddle/image/run_mkldnn_train.sh
index f768f6c29a84b40f917e0ccfde4d8c15f65c818b..320206239ae960bd088b05d3b10934a98da741b1 100755
--- a/benchmark/paddle/image/run_mkldnn.sh
+++ b/benchmark/paddle/image/run_mkldnn_train.sh
@@ -8,13 +8,13 @@ function train() {
     use_mkldnn=$4
     if [ $4 == "True" ]; then
         thread=1
-        log="logs/${topology}-${layer_num}-mkldnn-${bs}.log"
+        log="logs/train-${topology}-${layer_num}-mkldnn-${bs}.log"
     elif [ $4 == "False" ]; then
         thread=`nproc`
         # each trainer_count use only 1 core to avoid conflict
-        log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log"
+        log="logs/train-${topology}-${layer_num}-${thread}mklml-${bs}.log"
     else
-        echo "Wrong input $3, use True or False."
+        echo "Wrong input $4, use True or False."
         exit 0
     fi
     args="batch_size=${bs},layer_num=${layer_num}"
@@ -30,13 +30,14 @@ function train() {
       2>&1 | tee ${log}
 }
-if [ ! -d "train.list" ]; then
+if [ ! -f "train.list" ]; then
   echo " " > train.list
 fi
if [ !
-d "logs" ]; then mkdir logs fi +# training benchmark for use_mkldnn in True False; do for batchsize in 64 128 256; do train vgg 19 $batchsize $use_mkldnn diff --git a/benchmark/paddle/image/vgg.py b/benchmark/paddle/image/vgg.py index 420884ed8e1ae36a3f1772bfbe8323f3d0ea71e6..8d0a1e97a451cd52ef17e4e326673cc90059ef3c 100644 --- a/benchmark/paddle/image/vgg.py +++ b/benchmark/paddle/image/vgg.py @@ -6,10 +6,21 @@ width = 224 num_class = 1000 batch_size = get_config_arg('batch_size', int, 64) layer_num = get_config_arg('layer_num', int, 19) +is_infer = get_config_arg("is_infer", bool, False) -args = {'height': height, 'width': width, 'color': True, 'num_class': num_class} +args = { + 'height': height, + 'width': width, + 'color': True, + 'num_class': num_class, + 'is_infer': is_infer +} define_py_data_sources2( - "train.list", None, module="provider", obj="process", args=args) + "train.list" if not is_infer else None, + "test.list" if is_infer else None, + module="provider", + obj="process", + args=args) settings( batch_size=batch_size, @@ -98,6 +109,9 @@ elif layer_num == 19: else: print("Wrong layer number.") -lab = data_layer('label', num_class) -loss = cross_entropy(input=vgg, label=lab) -outputs(loss) +if is_infer: + outputs(vgg) +else: + lab = data_layer('label', num_class) + loss = cross_entropy(input=vgg, label=lab) + outputs(loss) diff --git a/cmake/external/cares.cmake b/cmake/external/cares.cmake new file mode 100644 index 0000000000000000000000000000000000000000..e05111ee18efc906e39bcb56fb1be3b3c3dff5d6 --- /dev/null +++ b/cmake/external/cares.cmake @@ -0,0 +1,45 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +IF(MOBILE_INFERENCE) + return() +ENDIF() + +include (ExternalProject) + +# NOTE: c-ares is needed when linking with grpc. + +SET(CARES_SOURCES_DIR ${THIRD_PARTY_PATH}/cares) +SET(CARES_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cares) +SET(CARES_INCLUDE_DIR "${CARES_INSTALL_DIR}/include/" CACHE PATH "cares include directory." 
FORCE) + +ExternalProject_Add( + extern_cares + GIT_REPOSITORY "https://github.com/c-ares/c-ares.git" + GIT_TAG "cares-1_13_0" + PREFIX ${CARES_SOURCES_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND ./buildconf && ./configure --disable-shared --prefix=${CARES_INSTALL_DIR} + BUILD_IN_SOURCE 1 + BUILD_COMMAND make + INSTALL_COMMAND make install +) + +ADD_LIBRARY(cares STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET cares PROPERTY IMPORTED_LOCATION + "${CARES_INSTALL_DIR}/lib/libcares.a") + +include_directories(${CARES_INCLUDE_DIR}) +ADD_DEPENDENCIES(cares extern_cares) diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index c819eb4d70898e48eab499c666168d78262d4240..d4f252bb9f64c8db82b841fedf0817f5d8596501 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -28,15 +28,8 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) ExternalProject_Add( extern_gflags ${EXTERNAL_PROJECT_LOG_ARGS} - # TODO(yiwang): The annoying warnings mentioned in - # https://github.com/PaddlePaddle/Paddle/issues/3277 are caused by - # gflags. I fired a PR https://github.com/gflags/gflags/pull/230 - # to fix it. Before it gets accepted by the gflags team, we use - # my personal fork, which contains above fix, temporarily. Let's - # change this back to the official Github repo once my PR is - # merged. - GIT_REPOSITORY "https://github.com/wangkuiyi/gflags.git" - GIT_TAG 986964c07427ecb9cdb5bd73f73ebbd40e54dadb + GIT_REPOSITORY "https://github.com/gflags/gflags.git" + GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a PREFIX ${GFLAGS_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/grpc.cmake b/cmake/external/grpc.cmake new file mode 100644 index 0000000000000000000000000000000000000000..86122aec8c77f34756a37121582b92489d749d7f --- /dev/null +++ b/cmake/external/grpc.cmake @@ -0,0 +1,66 @@ +# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +IF(MOBILE_INFERENCE) + return() +ENDIF() + +include (ExternalProject) + +SET(GRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/grpc) +SET(GRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/grpc) +SET(GRPC_INCLUDE_DIR "${GRPC_INSTALL_DIR}/include/" CACHE PATH "grpc include directory." FORCE) +SET(GRPC_CPP_PLUGIN "${GRPC_INSTALL_DIR}/bin/grpc_cpp_plugin" CACHE FILEPATH "GRPC_CPP_PLUGIN" FORCE) +IF(APPLE) + SET(BUILD_CMD make -n HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin | sed "s/-Werror//g" | sh) +ELSE() + SET(BUILD_CMD make HAS_SYSTEM_PROTOBUF=false -s -j8 static grpc_cpp_plugin) +ENDIF() + +ExternalProject_Add( + extern_grpc + DEPENDS protobuf zlib + GIT_REPOSITORY "https://github.com/grpc/grpc.git" + GIT_TAG "v1.7.x" + PREFIX ${GRPC_SOURCES_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_IN_SOURCE 1 + # NOTE(yuyang18): + # Disable -Werror, otherwise the compile will fail in MacOS. + # It seems that we cannot configure that by make command. 
+ # Just dry run make command and remove `-Werror`, then use a shell to run make commands + BUILD_COMMAND ${BUILD_CMD} + INSTALL_COMMAND make prefix=${GRPC_INSTALL_DIR} install +) + +# FIXME(typhoonzero): hack to get static lib path, try a better way like merge them. +ADD_LIBRARY(grpc++_unsecure STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET grpc++_unsecure PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgrpc++_unsecure.a") + +ADD_LIBRARY(grpc++ STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET grpc++ PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgrpc++.a") +ADD_LIBRARY(gpr STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET gpr PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgpr.a") + +ADD_LIBRARY(grpc_unsecure STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET grpc_unsecure PROPERTY IMPORTED_LOCATION + "${GRPC_INSTALL_DIR}/lib/libgrpc_unsecure.a") + +include_directories(${GRPC_INCLUDE_DIR}) +ADD_DEPENDENCIES(grpc++_unsecure extern_grpc) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index be7f6a9465970711170bd15dcecaadeaa8a55f86..fab2af362bb070a54987b6499748056f3d12a56b 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -15,7 +15,18 @@ INCLUDE(ExternalProject) # Always invoke `FIND_PACKAGE(Protobuf)` for importing function protobuf_generate_cpp FIND_PACKAGE(Protobuf QUIET) -SET(PROTOBUF_FOUND "OFF") +macro(UNSET_VAR VAR_NAME) + UNSET(${VAR_NAME} CACHE) + UNSET(${VAR_NAME}) +endmacro() +UNSET_VAR(PROTOBUF_INCLUDE_DIR) +UNSET_VAR(PROTOBUF_FOUND) +UNSET_VAR(PROTOBUF_PROTOC_EXECUTABLE) +UNSET_VAR(PROTOBUF_PROTOC_LIBRARY) +UNSET_VAR(PROTOBUF_LITE_LIBRARY) +UNSET_VAR(PROTOBUF_LIBRARY) +UNSET_VAR(PROTOBUF_INCLUDE_DIR) +UNSET_VAR(Protobuf_PROTOC_EXECUTABLE) if(NOT COMMAND protobuf_generate_python) # before cmake 3.4, protobuf_genrerate_python is not defined. function(protobuf_generate_python SRCS) @@ -110,7 +121,6 @@ macro(PROMPT_PROTOBUF_LIB) # FIND_Protobuf.cmake uses `Protobuf_PROTOC_EXECUTABLE`. # make `protobuf_generate_cpp` happy. 
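  # (CMake 3.6+ renamed the FindProtobuf result variables to the `Protobuf_` prefix,
  # so mirror the detected value into the name that protobuf_generate_cpp reads.)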
SET(Protobuf_PROTOC_EXECUTABLE ${PROTOBUF_PROTOC_EXECUTABLE}) - FOREACH(dep ${protobuf_DEPS}) ADD_DEPENDENCIES(protobuf ${dep}) ADD_DEPENDENCIES(protobuf_lite ${dep}) @@ -128,11 +138,11 @@ endmacro() set(PROTOBUF_ROOT "" CACHE PATH "Folder contains protobuf") if (NOT "${PROTOBUF_ROOT}" STREQUAL "") - find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include) - find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib) - find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib) - find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib) - find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin) + find_path(PROTOBUF_INCLUDE_DIR google/protobuf/message.h PATHS ${PROTOBUF_ROOT}/include NO_DEFAULT_PATH) + find_library(PROTOBUF_LIBRARY protobuf PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_library(PROTOBUF_LITE_LIBRARY protobuf-lite PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_library(PROTOBUF_PROTOC_LIBRARY protoc PATHS ${PROTOBUF_ROOT}/lib NO_DEFAULT_PATH) + find_program(PROTOBUF_PROTOC_EXECUTABLE protoc PATHS ${PROTOBUF_ROOT}/bin NO_DEFAULT_PATH) if (PROTOBUF_INCLUDE_DIR AND PROTOBUF_LIBRARY AND PROTOBUF_LITE_LIBRARY AND PROTOBUF_PROTOC_LIBRARY AND PROTOBUF_PROTOC_EXECUTABLE) message(STATUS "Using custom protobuf library in ${PROTOBUF_ROOT}.") SET_PROTOBUF_VERSION() @@ -178,14 +188,26 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) SET(OPTIONAL_CACHE_ARGS "-DZLIB_ROOT:STRING=${ZLIB_ROOT}") ENDIF() + SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") + SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") + IF(MOBILE_INFERENCE) + # The reason why the official version is not used is described in + # https://github.com/PaddlePaddle/Paddle/issues/6114 + SET(PROTOBUF_REPO "https://github.com/qingqing01/protobuf.git") + SET(PROTOBUF_TAG "v3.2.0") + IF(NOT BUILD_FOR_HOST) + SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-Dprotobuf_BUILD_PROTOC_BINARIES=OFF") + ENDIF() + ENDIF() + ExternalProject_Add( ${TARGET_NAME} ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${PROTOBUF_SOURCES_DIR} UPDATE_COMMAND "" DEPENDS zlib - GIT_REPOSITORY "https://github.com/google/protobuf.git" - GIT_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546" + GIT_REPOSITORY ${PROTOBUF_REPO} + GIT_TAG ${PROTOBUF_TAG} CONFIGURE_COMMAND ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/${TARGET_NAME}/cmake ${OPTIONAL_ARGS} @@ -203,7 +225,11 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ) ENDFUNCTION() -SET(PROTOBUF_VERSION 3.1) +IF(NOT MOBILE_INFERENCE) + SET(PROTOBUF_VERSION 3.1) +ELSE() + SET(PROTOBUF_VERSION 3.2) +ENDIF() IF(CMAKE_CROSSCOMPILING) build_protobuf(protobuf_host TRUE) LIST(APPEND external_project_dependencies protobuf_host) diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index a98e069b7cd1654ddd5868560d0905eab6d9c692..1638cd8fdfc34575132462859e056a1907f0b2f1 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -50,6 +50,8 @@ ExternalProject_Add( ) LIST(APPEND external_project_dependencies zlib) +ADD_LIBRARY(zlib_target STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET zlib_target PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES}) IF(WITH_C_API) INSTALL(DIRECTORY ${ZLIB_INCLUDE_DIR} DESTINATION third_party/zlib) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 2b125cef6aa8d1021afe8a7a0d232d84d36be4bc..1120677a37e0d44163816b66600121c8f0d545af 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -111,6 +111,8 @@ set(COMMON_FLAGS -Wno-error=sign-compare 
-Wno-error=unused-local-typedefs -Wno-error=parentheses-equality # Warnings in pybind11 + -Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3 + -Wno-error=terminate # Warning in PADDLE_ENFORCE ) set(GPU_COMMON_FLAGS diff --git a/cmake/generic.cmake b/cmake/generic.cmake index b9c1dde97bc444d793d67ff622fd6b13c6435a9a..9cf256fb6d5336b92f94198517f700e77a9330b3 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -227,8 +227,8 @@ function(cc_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_test_SRCS}) - target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main) - add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main) + target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() endfunction(cc_test) @@ -288,8 +288,8 @@ function(nv_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) - target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main) - add_dependencies(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main) + target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) add_test(${TARGET_NAME} ${TARGET_NAME}) endif() endfunction(nv_test) @@ -459,11 +459,58 @@ function(py_test TARGET_NAME) if(WITH_TESTING) set(options STATIC static SHARED shared) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) - cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(multiValueArgs SRCS DEPS ARGS) + cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_test(NAME ${TARGET_NAME} COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python - python2 ${py_test_SRCS} + ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() endfunction() + +# grpc_library generate grpc code using grpc_cpp_plugin and protoc +# then build the generated protobuf code and grpc code with your +# implementation source codes together. Use SRCS argument for your +# implementation source files and PROTO argument for your .proto +# files. 
+# +# Usage: grpc_library(my_target SRCS my_client.cc PROTO my_target.proto DEPS my_dep) + +function(grpc_library TARGET_NAME) + set(oneValueArgs PROTO) + set(multiValueArgs SRCS DEPS) + set(options "") + cmake_parse_arguments(grpc_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + + message(STATUS "generating grpc ${grpc_library_PROTO}") + + get_filename_component(ABS_PROTO ${grpc_library_PROTO} ABSOLUTE) + get_filename_component(PROTO_WE ${grpc_library_PROTO} NAME_WE) + get_filename_component(PROTO_PATH ${ABS_PROTO} PATH) + + protobuf_generate_cpp(grpc_proto_srcs grpc_proto_hdrs "${ABS_PROTO}") + set(grpc_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.cc") + set(grpc_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/${PROTO_WE}.grpc.pb.h") + cc_library("${TARGET_NAME}_proto" SRCS "${grpc_proto_srcs}") + + add_custom_command( + OUTPUT "${grpc_grpc_srcs}" "${grpc_grpc_hdrs}" + COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} + ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}" -I "${PROTO_PATH}" + --plugin=protoc-gen-grpc="${GRPC_CPP_PLUGIN}" "${ABS_PROTO}" + DEPENDS "${ABS_PROTO}" ${PROTOBUF_PROTOC_EXECUTABLE} extern_grpc) + + # FIXME(typhoonzero): grpc generated code do not generate virtual-dtor, mark it + # as compiler warnings instead of error. Should try remove the warnings also. + set_source_files_properties( + ${grpc_grpc_srcs} + PROPERTIES + COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + cc_library("${TARGET_NAME}_grpc" SRCS "${grpc_grpc_srcs}") + + set_source_files_properties( + ${grpc_library_SRCS} + PROPERTIES + COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + cc_library("${TARGET_NAME}" SRCS "${grpc_library_SRCS}" DEPS "${TARGET_NAME}_grpc" "${TARGET_NAME}_proto" "${grpc_library_DEPS}") +endfunction() diff --git a/cmake/util.cmake b/cmake/util.cmake index ad905ab55ba3537054fa5b30b5fca4d83c406702..0dc33ce385175d1e2dc454d41db467d4b9d9cf9a 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -168,17 +168,3 @@ function(create_resources res_file output_file) COMMAND python ARGS ${PADDLE_SOURCE_DIR}/cmake/make_resource.py ${res_file} ${output_file} DEPENDS ${res_file} ${PADDLE_SOURCE_DIR}/cmake/make_resource.py) endfunction() - - -# Create a python unittest using run_python_tests.sh, -# which takes care of making correct running environment -function(add_python_test TEST_NAME) - foreach(arg ${ARGN}) - get_filename_component(py_fn ${arg} NAME_WE) - set(TRG_NAME ${TEST_NAME}_${py_fn}) - add_test(NAME ${TRG_NAME} - COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR} - python2 ${arg} - WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) - endforeach() -endfunction() diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index d4d182f6692e09b3e40f3620b77d9a0f20ec5af3..c3f9c18d0663a7a24880b441981875c1e4f015aa 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -54,7 +54,7 @@ img_conv .. _api_v2.layer_context_projection: -context_projection +context_projection ------------------ .. autoclass:: paddle.v2.layer.context_projection :noindex: @@ -70,7 +70,7 @@ Image Pooling Layer img_pool -------- .. autoclass:: paddle.v2.layer.img_pool - :noindex: + :noindex: spp --- @@ -104,7 +104,7 @@ sum_to_one_norm --------------- .. autoclass:: paddle.v2.layer.sum_to_one_norm :noindex: - + cross_channel_norm ------------------ .. autoclass:: paddle.v2.layer.cross_channel_norm @@ -114,7 +114,7 @@ row_l2_norm ----------- .. 
autoclass:: paddle.v2.layer.row_l2_norm :noindex: - + Recurrent Layers ================ @@ -415,6 +415,13 @@ multiplex .. autoclass:: paddle.v2.layer.multiplex :noindex: +Factorization Machine Layer +============================ + +factorization_machine +--------------------- +.. autoclass:: paddle.v2.layer.factorization_machine + :noindex: Slicing and Joining Layers ========================== diff --git a/doc/design/float16.md b/doc/design/float16.md index 078801ba2ed969d26dd31d5ec4ed268686cf7016..1ea95ed6b5d6792171569b6ff76d09be92fcb13e 100644 --- a/doc/design/float16.md +++ b/doc/design/float16.md @@ -28,6 +28,51 @@ The goal of float16 is to serve as a key for the executor to find and run the co - [Eigen](https://github.com/RLovelett/eigen) >= 3.3 supports float16 calculation on both GPU and CPU using the `Eigen::half` class. It is mostly useful for Nvidia GPUs because of the overloaded arithmetic operators using cuda intrinsics. It falls back to using software emulation on CPU for calculation and there is no special treatment to ARM processors. - [ARM compute library](https://github.com/ARM-software/ComputeLibrary) >= 17.02.01 supports NEON FP16 kernels (requires ARMv8.2-A CPU). +### CUDA version issue +There are currently three versions of CUDA that supports `__half` data type, namely, CUDA 7.5, 8.0, and 9.0. +CUDA 7.5 and 8.0 define `__half` as a simple struct that has a `uint16_t` data (see [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/9212ab5a3ddbe48f30ef373f9c1fb546804c7a8c/include/isaac/external/CUDA/cuda_fp16.h)) as follows: +``` +typedef struct __align__(2) { + unsigned short x; +} __half; + +typedef __half half; +``` +This struct does not define any overloaded arithmetic operators. So you have to directly use `__hadd` instead of `+` to correctly add two half types: +``` +__global__ void Add() { + half a, b, c; + c = __hadd(a, b); // correct + c = a + b; // compiler error: no operator "+" matches these operands +} +``` +CUDA 9.0 provides a major update to the half data type. The related code can be found in the updated [`cuda_fp16.h`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.h) and the newly added [`cuda_fp16.hpp`](https://github.com/ptillet/isaac/blob/master/include/isaac/external/CUDA/cuda_fp16.hpp). + +Essentially, CUDA 9.0 renames the original `__half` type in 7.5 and 8.0 as `__half_raw`, and defines a new `__half` class type that has constructors, conversion operators, and also provides overloaded arithmetic operators such as follows: +``` +typedef struct __CUDA_ALIGN__(2) { + unsigned short x; +} __half_raw; + + +struct __CUDA_ALIGN__(2) __half { +protected: + unsigned short __x; +public: + // constructors and conversion operators from/to + // __half_raw and other built-in data types +} + +typedef __half half; + +__device__ __forceinline__ +__half operator+(const __half &lh, const __half &rh) { + return __hadd(lh, rh); +} + +// Other overloaded operators +``` +This new design makes `c = a + b` work correctly for CUDA half data type. ## Implementation The float16 class holds a 16-bit `uint16_t` data internally. diff --git a/doc/design/refactor/distributed_architecture.md b/doc/design/refactor/distributed_architecture.md index ac7e98ccf1aadbb973a4801fde842375cf63448c..2b4f921ae93c3b443ed62a28b1fa9fbda14f73ab 100644 --- a/doc/design/refactor/distributed_architecture.md +++ b/doc/design/refactor/distributed_architecture.md @@ -2,106 +2,70 @@ ## Abstract -PaddlePaddle v0.10.0 uses the "trainer-parameter server" -architecture. 
We run multiple replicated instances of trainers (runs
-the same code written by the user) and parameter servers for
-distributed training. This architecture served us well, but has some
-limitations:
+PaddlePaddle version 0.10.0 uses the "trainer-parameter server" architecture. We run multiple instances of trainers (where each trainer runs the same model) and parameter servers for distributed training. This architecture serves well, but has a few limitations:

-1. Need to write special code to handle tasks which should only be run
-   by a single trainer. E.g., initializing model and saving model.
+1. There is a need to write special code that handles tasks which should only be run on a single trainer. E.g., initializing the model, saving the model, etc.

-2. Model parallelism is hard: need to write if-else branches conditioned
-   on the trainer ID to partition model onto each trainer, and manually
-   write the inter-model-shard communication code.
+2. Model parallelism is hard: it would need if-else branches conditioned on the trainer ID to partition the model onto the trainers, and manually written inter-model-shard communication code to pass data between different trainers.

-3. The user can not directly specify the parameter update rule: need
-   to modify the parameter server C++ code and compile a new
-   binary. This adds complication for researchers: A lot of extra
-   effort is required. Besides, the training job submission program
-   may not allow running arbitrary binaries.
+3. The user can not directly specify the parameter update rule: this would require modifying the parameter server code and compiling a new binary. This makes things more complicated for researchers: a lot of extra effort is required to make this work. Besides, the training job submission program may not allow running arbitrary binaries.

-This design doc discusses PaddlePaddle's new distributed training
-architecture that addresses the above limitations.
+This design doc discusses PaddlePaddle's new distributed training architecture that addresses the above mentioned limitations.

## Analysis

-We will assume the user writes the trainer program by Python, the same
-analysis holds if the trainer program is written in C++.
+The assumption is that the user writes the trainer program in either Python or C++.

### Limitation 1

-If we look at the Python code that the user writes, there are two
-kinds of functionalities:
+There are two basic functionalities in the trainer program:

-- The training logic such as load / save model and print log.
-- The neural network definition such as the definition of the data
-  layer, the fully connected layer, the cost function and the
+1. The training logic such as loading / saving the model and printing out the logs.
+2. The neural network definition such as the definition of the data layer, the fully connected layer, the cost function and the
 optimizer.

-When we training with PaddlePaddle v0.10.0 distributedly, multiple
-replicated Python instances are running on different nodes: both the
-training logic and the neural network computation is replicated.
+When we train using PaddlePaddle v0.10.0 in a distributed fashion, multiple instances of the same Python code are run on different nodes, hence both the
+training logic and the neural network computation logic are replicated.
-The tasks that should only run once all belong to the training logic,
-if we only replicate the neural network computation, but do **not**
-replicate the training logic, the limitation could be solved.
+The tasks that only need to be run once belong to the training logic. Hence if we only replicate the neural network computation part, and do **not**
+replicate the training logic, the limitation mentioned above can be avoided.

### Limitation 2

-Model parallelism means running a single model on multiple nodes by
-partitioning the model onto different nodes and managing the
-inter-model-shard communications.
+Model parallelism means that a single model is partitioned into different components and each node runs one of the components separately. This comes at the extra cost of managing the
+inter-model-shard communication between nodes.

-PaddlePaddle should be able to modify the nerual network computation
-definition to support model parallelism automatically. However, the
-computation is only specified in Python code, and PaddlePaddle can not
-modify Python code.
+PaddlePaddle should ideally be able to modify the neural network computation and figure out the support for model parallelism automatically. However, the
+computation is only specified in Python code which sits outside of PaddlePaddle, hence PaddlePaddle can not support the feature in this setup.

-Just like compiler uses a intermediate representation (IR) so that
-programmer does not need to manually optimize their code in most of
-the cases - the compiler will optimize the IR:
+Similar to how a compiler uses an intermediate representation (IR) so that the programmer does not need to manually optimize their code for most of the cases, we can have an intermediate representation in PaddlePaddle as well. The compiler optimizes the IR as follows:

-We can have our own IR too: PaddlePaddle can support model parallel by
-converting the IR so the user no longer need to manually do it in
-Python:
+PaddlePaddle can support model parallelism by converting the IR so that the user no longer needs to manually perform the computation and operations in the Python component:

-The IR for PaddlePaddle after refactor is called `Block`, it specifies
-the computation dependency graph and the variables used in the
-computation.
+The IR for PaddlePaddle after refactoring is called a `Block`; it specifies the computation dependency graph and the variables used in the computation.

### Limitation 3

-The user can not directly specify the parameter update rule for the
-parameter server because the parameter server does not use the same
-computation definition as the trainer. Instead, the update rule is
-baked in the parameter server. The user can not specify the update
-rule in the same way of specifying the trainer computation.
+The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly.

-This could be fixed by making the parameter server run the same
-computation definition as the trainer. For a detailed explanation,
-please
-see
+This could be fixed by making the parameter server run the same computation definition as the trainer (the user's Python module).
For a detailed explanation, refer to this document -
[Design Doc: Operation Graph Based Parameter Server](./dist_train.md)

## Distributed Training Architecture

-The new distributed training architecture can address the above
-limitations. Below is the illustration:
+The revamped distributed training architecture can address the above discussed limitations. Below is the illustration of how it does so:

-The architecture includes major components: *PaddlePaddle Python*,
-*PaddlePaddle converter* and *PaddlePaddle runtime*:
+The major components in the architecture are: *PaddlePaddle Python*, *PaddlePaddle converter* and *PaddlePaddle runtime*.

### PaddlePaddle Python

-PaddlePaddle Python is the Python library that user's Python trainer
-invoke to build the neural network topology, start training, etc.
+PaddlePaddle Python is the Python library that the user's Python code invokes to read the data, build the neural network topology, start training, etc.

```Python
paddle.init()
@@ -117,102 +81,60 @@ for i in range(1000):
    print cost_val
```

-The code above is a typical Python trainer code, the neural network
-topology is built using helper functions such as
-`paddle.layer.fc`. The training is done by calling `session.eval`
-iteratively.
+The above code is a typical Python trainer program; the neural network topology is built using helper functions such as `paddle.layer.fc`. Training is done by calling `session.eval` iteratively.

#### session.eval

-As shown in the graph, `session.eval` sends the IR and the evaluation
-inputs/targets to the PaddlePaddle cluster for evaluation. The
-targets can be any variable in the computation graph. When the target
-is the `optimizer` variable, the neural network will be optimized
-once. When the target is the `cost` variable, `session.eval` returns
-the cost value.
+As shown in the graph, `session.eval` sends the IR and the evaluation inputs or targets to the PaddlePaddle cluster for evaluation.
+The targets can be any variable in the computation graph. When the target is, say, the `optimizer` variable, the neural network will be optimized once. When the target is the `cost` variable, `session.eval` returns the cost value. Based on what the target is, an appropriate action is taken.

-The Python `session` is a wrapper of the C++ `Session` class. For more
-information about `Session`, please
-see [Design Doc: Session](./session.md).
+The Python `session` is a wrapper of the C++ `Session` class. For more information about `Session`, refer to this document - [Design Doc: Session](./session.md).

### PaddlePaddle Converter

-PaddlePaddle converter automatically converts the IR in the request
-(IR and evaluation inputs/targets) from PaddlePaddle Python to new
-partitioned IRs and dispatch the new IRs and evaluation inputs/targets
-to different PaddlePaddle runtimes. Below are the steps:
+The PaddlePaddle converter automatically converts the IR in the request (IR and evaluation inputs/targets) from PaddlePaddle Python to partitioned IRs and dispatches the new IRs and evaluation inputs/targets to different PaddlePaddle runtimes. Below are the steps that are followed:

-1. Add `feed` OP that feeds the eval inputs, and `fetch` OP that
-   fetches the eval targets to the IR.
+1. Add a `feed` OP that feeds the eval inputs, and a `fetch` OP that fetches the eval targets to the IR.

-1. Extract a new computation (sub)graph with `feed` and `fetch` OP as
-   the boundary. The runtime does not need to run the OP that is not
-   dependent by the `fetch` OP.
+2. Extract a new computation (sub)graph with the `feed` and `fetch` OPs as the boundary. The runtime does not need to run the OPs that the `fetch` OP does not depend on.

-1. Optimizes the computation graph.
+3. Optimize the computation graph.

-1. Place the OPs in the graph onto different devices on different
-   PaddlePaddle runtime according to a placement algorithm and device
-   constraint specified by the user.
+4. Place the OPs in the graph onto different devices on different PaddlePaddle runtimes according to a placement algorithm and the device constraints specified by the user.

-1. Partition the graph according to runtime boundaries and add `send` /
-   `recv` OP pair on the runtime boundaries.
+5. Partition the graph according to runtime boundaries and add `send` / `recv` OP pairs on the runtime boundaries.

-1. Dispatch the partitioned graph to different PaddlePaddle runtimes.
+6. Dispatch the partitioned graph to different PaddlePaddle runtimes.
+
+7. PaddlePaddle runtimes with the `fetch` OP report evaluation results back to the converter, and the converter reports the evaluation results back to the PaddlePaddle Python.

-1. PaddlePaddle runtimes with the `fetch` OP reports evaluation
-   results back to the converter, the convert reports the evaluation
-   results back to the PaddlePaddle Python.
-
The output IRs will be cached to optimize the conversion latency.

#### Placement Algorithm

-Our first implementation will only support "trainer-parameter server"
-placement: the parameters, initializers, and optimizers are placed on
-the PaddlePaddle runtimes with the parameter server role. And
-everything else will be placed on the PaddlePaddle runtimes with the
-trainer role. This has the same functionality of our
-"trainer-parameter server" architecture of PaddlePaddle v0.10.0, but
-is more general and flexible.
+Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible.

-In the future, we will implement the general placement algorithm,
-which makes placements according to the input IR, and a model of
-device computation time and device communication time. Model
-parallelism requires the general placement algorithm.
+In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm.

### PaddlePaddle Runtime

-The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and
-runs the IR. The runtime does not need to do OP placement since it's
-already done by the converter.
+The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and runs the IR. The runtime does not need to do OP placement since it is already done by the converter.
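To make the converter pipeline concrete, the seven steps above can be sketched in Python. Every helper name in this sketch (`add_feed_fetch_ops`, `extract_subgraph`, and so on) is a hypothetical placeholder used only for illustration; none of them is an actual PaddlePaddle API.

```python
# Hypothetical sketch of the converter pipeline; all helpers are
# placeholder names, not real PaddlePaddle functions.
def convert(ir, inputs, targets, runtimes, placement_algorithm):
    ir = add_feed_fetch_ops(ir, inputs, targets)  # 1. add feed/fetch OPs
    ir = extract_subgraph(ir)                     # 2. keep only OPs the fetch OP depends on
    ir = optimize(ir)                             # 3. optimize the graph
    placed = placement_algorithm(ir, runtimes)    # 4. place OPs onto devices/runtimes
    parts = split_at_boundaries(placed)           # 5. partition; insert send/recv OP pairs
    for part, runtime in zip(parts, runtimes):
        runtime.dispatch(part)                    # 6. dispatch partitioned IRs
    return fetch_results(runtimes)                # 7. results flow back through the converter
```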
### Local Training Architecture -The local training architecture will be the same as the distributed -training architecture, the differences are everything runs locally, -and there is just one PaddlePaddle runtime: +The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime: ### Training Data -In PaddlePaddle v0.10.0, training data is typically read -with [data reader](../reader/README.md) from Python. This approach is -no longer efficient when training distributedly since the Python -process no longer runs on the same node with the trainer processes, -the Python reader will need to read from the distributed filesystem -(assuming it has the access) and send to the trainers, doubling the -network traffic. - -When doing distributed training, the user can still use Python data -reader: the training data are sent with `session.eval`. However should -be used for debugging purpose only. The users are encouraged to use -the read data OPs. +In PaddlePaddle v0.10.0, training data is typically read with a [data reader](../reader/README.md) from Python. This approach is no longer efficient when training in a distributed fashion since the Python process no longer runs on the same node with the trainer processes. The Python reader will need to read from the distributed filesystem (assuming it has the required access) and send to the trainers, doubling the network traffic. + +When doing distributed training, the user can still use Python data reader: the training data are sent with `session.eval`. However this should be used for debugging purpose only. The users are encouraged to use the read data OPs. ## References: diff --git a/doc/getstarted/basic_usage/index_cn.rst b/doc/getstarted/basic_usage/index_cn.rst deleted file mode 100644 index b473944fc7fb89d3e0a0b330933f2226734bb5bd..0000000000000000000000000000000000000000 --- a/doc/getstarted/basic_usage/index_cn.rst +++ /dev/null @@ -1,108 +0,0 @@ -经典的线性回归任务 -================== - -PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍将向你展示如何利用PaddlePaddle来解决一个经典的线性回归问题。 - -任务简介 --------- - -我们展示如何用PaddlePaddle解决 `单变量的线性回归 `_ 问题。线性回归的输入是一批点 `(x, y)` ,其中 `y = wx + b + ε`, 而 ε 是一个符合高斯分布的随机变量。线性回归的输出是从这批点估计出来的参数 `w` 和 `b` 。 - -一个例子是房产估值。我们假设房产的价格(y)是其大小(x)的一个线性函数,那么我们可以通过收集市场上房子的大小和价格,用来估计线性函数的参数w 和 b。 - -准备数据 ------------ - -假设变量 `x` 和 `y` 的真实关系为: `y = 2x + 0.3 + ε`,这里展示如何使用观测数据来拟合这一线性关系。首先,Python代码将随机产生2000个观测点,作为线性回归的输入。下面脚本符合PaddlePaddle期待的读取数据的Python程序的模式。 - -.. code-block:: python - - # dataprovider.py - from paddle.trainer.PyDataProvider2 import * - import random - - # 定义输入数据的类型: 2个浮点数 - @provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False) - def process(settings, input_file): - for i in xrange(2000): - x = random.random() - yield [x], [2*x+0.3] - -训练模型 ------------ - -为了还原 `y = 2x + 0.3`,我们先从一条随机的直线 `y' = wx + b` 开始,然后利用观测数据调整 `w` 和 `b` 使得 `y'` 和 `y` 的差距不断减小,最终趋于接近。这个过程就是模型的训练过程,而 `w` 和 `b` 就是模型的参数,即我们的训练目标。 - -在PaddlePaddle里,该模型的网络配置如下。 - -.. code-block:: python - - # trainer_config.py - from paddle.trainer_config_helpers import * - - # 1. 定义数据来源,调用上面的process函数获得观测数据 - data_file = 'empty.list' - with open(data_file, 'w') as f: f.writelines(' ') - define_py_data_sources2(train_list=data_file, test_list=None, - module='dataprovider', obj='process',args={}) - - # 2. 学习算法。控制如何改变模型参数 w 和 b - settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer()) - - # 3. 
神经网络配置 - x = data_layer(name='x', size=1) - y = data_layer(name='y', size=1) - # 线性计算网络层: ȳ = wx + b - ȳ = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b')) - # 计算误差函数,即 ȳ 和真实 y 之间的距离 - cost = square_error_cost(input= ȳ, label=y) - outputs(cost) - - -这段简短的配置展示了PaddlePaddle的基本用法: - -- 第一部分定义了数据输入。一般情况下,PaddlePaddle先从一个文件列表里获得数据文件地址,然后交给用户自定义的函数(例如上面的 `process`函数)进行读入和预处理从而得到真实输入。本文中由于输入数据是随机生成的不需要读输入文件,所以放一个空列表(`empty.list`)即可。 - -- 第二部分主要是选择学习算法,它定义了模型参数改变的规则。PaddlePaddle提供了很多优秀的学习算法,这里使用一个基于momentum的随机梯度下降(SGD)算法,该算法每批量(batch)读取12个采样数据进行随机梯度计算来更新更新。 - -- 最后一部分是神经网络的配置。由于PaddlePaddle已经实现了丰富的网络层,所以很多时候你需要做的只是定义正确的网络层并把它们连接起来。这里使用了三种网络单元: - - - **数据层**:数据层 `data_layer` 是神经网络的入口,它读入数据并将它们传输到接下来的网络层。这里数据层有两个,分别对应于变量 `x` 和 `y`。 - - **全连接层**:全连接层 `fc_layer` 是基础的计算单元,这里利用它建模变量之间的线性关系。计算单元是神经网络的核心,PaddlePaddle支持大量的计算单元和任意深度的网络连接,从而可以拟合任意的函数来学习复杂的数据关系。 - - **回归误差代价层**:回归误差代价层 `square_error_cost` 是众多误差代价函数层的一种,它们在训练过程作为网络的出口,用来计算模型的误差,是模型参数优化的目标函数。 - -定义了网络结构并保存为 `trainer_config.py` 之后,运行以下训练命令: - -.. code-block:: bash - - paddle train --config=trainer_config.py --save_dir=./output --num_passes=30 - -PaddlePaddle将在观测数据集上迭代训练30轮,并将每轮的模型结果存放在 `./output` 路径下。从输出日志可以看到,随着轮数增加误差代价函数的输出在不断的减小,这意味着模型在训练数据上不断的改进,直到逼近真实解:` y = 2x + 0.3 ` - -模型检验 ------------ - -训练完成后,我们希望能够检验模型的好坏。一种常用的做法是用学习的模型对另外一组测试数据进行预测,评价预测的效果。在这个例子中,由于已经知道了真实答案,我们可以直接观察模型的参数是否符合预期来进行检验。 - -PaddlePaddle将每个模型参数作为一个numpy数组单独存为一个文件,所以可以利用如下方法读取模型的参数。 - -.. code-block:: python - - import numpy as np - import os - - def load(file_name): - with open(file_name, 'rb') as f: - f.read(16) # skip header for float type. - return np.fromfile(f, dtype=np.float32) - - print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b')) - # w=1.999743, b=0.300137 - -.. image:: ./parameters.png - :align: center - :scale: 80 % - -从图中可以看到,虽然 `w` 和 `b` 都使用随机值初始化,但在起初的几轮训练中它们都在快速逼近真实值,并且后续仍在不断改进,使得最终得到的模型几乎与真实模型一致。 - -这样,我们用PaddlePaddle解决了单变量线性回归问题, 包括数据输入、模型训练和最后的结果验证。 diff --git a/doc/getstarted/basic_usage/index_en.rst b/doc/getstarted/basic_usage/index_en.rst deleted file mode 100644 index 2cc438ebbe0f97345d25354b93b4ebbd43502415..0000000000000000000000000000000000000000 --- a/doc/getstarted/basic_usage/index_en.rst +++ /dev/null @@ -1,101 +0,0 @@ -Simple Linear Regression -======================== - -PaddlePaddle is a deep learning platform open-sourced by Baidu. With PaddlePaddle, you can easily train a classic neural network within a couple lines of configuration, or you can build sophisticated models that provide state-of-the-art performance on difficult learning tasks like sentiment analysis, machine translation, image caption and so on. - -Problem Background ------------------- - -Now, to give you a hint of what using PaddlePaddle looks like, let's start with a fundamental learning problem - `simple linear regression `_: you have observed a set of two-dimensional data points of ``X`` and ``Y``, where ``X`` is an explanatory variable and ``Y`` is corresponding dependent variable, and you want to recover the underlying correlation between ``X`` and ``Y``. Linear regression can be used in many practical scenarios. For example, ``X`` can be a variable about house size, and ``Y`` a variable about house price. You can build a model that captures relationship between them by observing real estate markets. 
- -Prepare the Data ------------------ - -Suppose the true relationship can be characterized as ``Y = 2X + 0.3``, let's see how to recover this pattern only from observed data. Here is a piece of python code that feeds synthetic data to PaddlePaddle. The code is pretty self-explanatory, the only extra thing you need to add for PaddlePaddle is a definition of input data types. - - .. code-block:: python - - # dataprovider.py - from paddle.trainer.PyDataProvider2 import * - import random - - # define data types of input: 2 real numbers - @provider(input_types=[dense_vector(1), dense_vector(1)],use_seq=False) - def process(settings, input_file): - for i in xrange(2000): - x = random.random() - yield [x], [2*x+0.3] - -Train a NeuralNetwork ----------------------- - -To recover this relationship between ``X`` and ``Y``, we use a neural network with one layer of linear activation units and a square error cost layer. Don't worry if you are not familiar with these terminologies, it's just saying that we are starting from a random line ``Y' = wX + b`` , then we gradually adapt ``w`` and ``b`` to minimize the difference between ``Y'`` and ``Y``. Here is what it looks like in PaddlePaddle: - - .. code-block:: python - - # trainer_config.py - from paddle.trainer_config_helpers import * - - # 1. read data. Suppose you saved above python code as dataprovider.py - data_file = 'empty.list' - with open(data_file, 'w') as f: f.writelines(' ') - define_py_data_sources2(train_list=data_file, test_list=None, - module='dataprovider', obj='process',args={}) - - # 2. learning algorithm - settings(batch_size=12, learning_rate=1e-3, learning_method=MomentumOptimizer()) - - # 3. Network configuration - x = data_layer(name='x', size=1) - y = data_layer(name='y', size=1) - y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b')) - cost = square_error_cost(input=y_predict, label=y) - outputs(cost) - -Some of the most fundamental usages of PaddlePaddle are demonstrated: - -- The first part shows how to feed data into PaddlePaddle. In general cases, PaddlePaddle reads raw data from a list of files, and then do some user-defined process to get real input. In this case, we only need to create a placeholder file since we are generating synthetic data on the fly. - -- The second part describes learning algorithm. It defines in what ways adjustments are made to model parameters. PaddlePaddle provides a rich set of optimizers, but a simple momentum based optimizer will suffice here, and it processes 12 data points each time. - -- Finally, the network configuration. It usually is as simple as "stacking" layers. Three kinds of layers are used in this configuration: - - **Data Layer**: a network always starts with one or more data layers. They provide input data to the rest of the network. In this problem, two data layers are used respectively for ``X`` and ``Y``. - - **FC Layer**: FC layer is short for Fully Connected Layer, which connects all the input units to current layer and does the actual computation specified as activation function. Computation layers like this are the fundamental building blocks of a deeper model. - - **Cost Layer**: in training phase, cost layers are usually the last layers of the network. They measure the performance of current model, and provide guidence to adjust parameters. - -Now that everything is ready, you can train the network with a simple command line call: - - .. 
code-block:: bash - - paddle train --config=trainer_config.py --save_dir=./output --num_passes=30 - - -This means that PaddlePaddle will train this network on the synthectic dataset for 30 passes, and save all the models under path ``./output``. You will see from the messages printed out during training phase that the model cost is decreasing as time goes by, which indicates we are getting a closer guess. - - -Evaluate the Model -------------------- - -Usually, a different dataset that left out during training phase should be used to evalute the models. However, we are lucky enough to know the real answer: ``w=2, b=0.3``, thus a better option is to check out model parameters directly. - -In PaddlePaddle, training is just to get a collection of model parameters, which are ``w`` and ``b`` in this case. Each parameter is saved in an individual file in the popular ``numpy`` array format. Here is the code that reads parameters from last pass. - - .. code-block:: python - - import numpy as np - import os - - def load(file_name): - with open(file_name, 'rb') as f: - f.read(16) # skip header for float type. - return np.fromfile(f, dtype=np.float32) - - print 'w=%.6f, b=%.6f' % (load('output/pass-00029/w'), load('output/pass-00029/b')) - # w=1.999743, b=0.300137 - - .. image:: parameters.png - :align: center - -Although starts from a random guess, you can see that value of ``w`` changes quickly towards 2 and ``b`` changes quickly towards 0.3. In the end, the predicted line is almost identical with real answer. - -There, you have recovered the underlying pattern between ``X`` and ``Y`` only from observed data. diff --git a/doc/getstarted/basic_usage/parameters.png b/doc/getstarted/basic_usage/parameters.png deleted file mode 100644 index 2ec67480951e21f0400bce1c34b3108dcd65c18c..0000000000000000000000000000000000000000 Binary files a/doc/getstarted/basic_usage/parameters.png and /dev/null differ diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/getstarted/build_and_install/build_from_source_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..3c525bdad6f6118dcd560e2cb7bfaf89737c1362 --- /dev/null +++ b/doc/getstarted/build_and_install/build_from_source_cn.rst @@ -0,0 +1,141 @@ +从源码编译 +====================== + +.. _build_step: + +编译方法 +---------------- + +PaddlePaddle主要使用 `CMake `_ 以及GCC, G++作为编译工具。 +我们推荐您使用PaddlePaddle Docker编译环境镜像完成编译,这样可以免去单独安装编译依赖的步骤,可选的不同编译环境Docker镜像 +可以在 `这里 `_ 找到。 + +如果您选择不使用Docker镜像,则需要在本机安装下面章节列出的 `编译依赖`_ 之后才能开始编译的步骤。 + +编译PaddlePaddle,需要执行: + +.. code-block:: bash + + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + # 如果使用Docker编译环境,执行下面的命令编译CPU-Only的二进制 + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh + # 如果不使用Docker编译环境,执行下面的命令 + mkdir build + cd build + cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF .. + make + +编译完成后会在build/python/dist目录下生成输出的whl包,可以选在在当前机器安装也可以拷贝到目标机器安装: + +.. code-block:: bash + + pip install python/dist/*.whl + + +.. _run_test: + +执行单元测试 +---------------- + +如果您期望在编译完成后立即执行所有的单元测试,可以按照下面的方法: + +使用Docker的情况下,设置 :code:`RUN_TEST=ON` 和 :code:`WITH_TESTING=ON` 就会在完成编译之后,立即执行单元测试。 +开启 :code:`WITH_GPU=ON` 可以指定同时执行GPU上的单元测试。 + +.. code-block:: bash + + docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh + +如果不使用Docker,可以执行ctest命令即可: + +.. 
code-block:: bash + + mkdir build + cd build + cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF .. + make + ctest + # 指定执行其中一个单元测试 test_mul_op + ctest -R test_mul_op + +.. _compile_deps: + +编译依赖 +---------------- + +PaddlePaddle编译需要使用到下面的依赖(包含但不限于),其他的依赖软件,会自动在编译时下载。 + +.. csv-table:: PaddlePaddle编译依赖 + :header: "依赖", "版本", "说明" + :widths: 10, 15, 30 + + "CMake", ">=3.5", "" + "GCC", "4.8.2", "推荐使用CentOS的devtools2" + "Python", "2.7.x", "依赖libpython2.7.so" + "pip", ">=9.0", "" + "numpy", "", "" + "SWIG", ">=2.0", "" + "Go", ">=1.8", "可选" + + +.. _build_options: + +编译选项 +---------------- + +PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。 +用户可在调用cmake的时候设置它们,详细的cmake使用方法可以参考 +`官方文档 `_ 。 + +在cmake的命令行中,通过使用 ``-D`` 命令设置该类编译选项,例如: + +.. code-block:: bash + + cmake .. -DWITH_GPU=OFF + +.. csv-table:: 编译选项说明 + :header: "选项", "说明", "默认值" + :widths: 1, 7, 2 + + "WITH_GPU", "是否支持GPU", "ON" + "WITH_C_API", "是否仅编译CAPI", "OFF" + "WITH_DOUBLE", "是否使用双精度浮点数", "OFF" + "WITH_DSO", "是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库。", "ON" + "WITH_AVX", "是否编译含有AVX指令集的PaddlePaddle二进制文件", "ON" + "WITH_PYTHON", "是否内嵌PYTHON解释器", "ON" + "WITH_STYLE_CHECK", "是否编译时进行代码风格检查", "ON" + "WITH_TESTING", "是否开启单元测试", "ON" + "WITH_DOC", "是否编译中英文文档", "OFF" + "WITH_SWIG_PY", "是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练", "Auto" + "WITH_GOLANG", "是否编译go语言的可容错parameter server", "ON" + "WITH_MKL", "是否使用MKL数学库,如果为否则是用OpenBLAS", "ON" + +BLAS ++++++ + +PaddlePaddle支持 `MKL `_ 和 +`OpenBlAS `_ 两种BLAS库。默认使用MKL。如果使用MKL并且机器含有AVX2指令集, +还会下载MKL-DNN数学库,详细参考 `这里 `_ 。 + +如果关闭MKL,则会使用OpenBLAS作为BLAS库。 + +CUDA/cuDNN ++++++++++++ + +PaddlePaddle在编译时/运行时会自动找到系统中安装的CUDA和cuDNN库进行编译和执行。 +使用参数 :code:`-DCUDA_ARCH_NAME=Auto` 可以指定开启自动检测SM架构,加速编译。 + +PaddlePaddle可以使用cuDNN v5.1之后的任何一个版本来编译运行,但尽量请保持编译和运行使用的cuDNN是同一个版本。 +我们推荐使用最新版本的cuDNN。 + +编译选项的设置 +++++++++++++++ + +PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/cuDNN库。cmake编译时,首先在系统路径( :code:`/usr/lib:/usr/local/lib` )中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如 + +.. code-block:: bash + + cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5 + +**注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录(** :code:`rm -rf` )**后,再指定。** diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md deleted file mode 100644 index 2f1461489495618718d5abaeab9cbeda9b93700f..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/build_from_source_en.md +++ /dev/null @@ -1,236 +0,0 @@ -Installing from Sources -========================== - -* [1. Download and Setup](#download) -* [2. Requirements](#requirements) -* [3. Build on Ubuntu](#ubuntu) -* [4. Build on Centos](#centos) - - -## Download and Setup -You can download PaddlePaddle from the [github source](https://github.com/PaddlePaddle/Paddle). - -```bash -git clone https://github.com/PaddlePaddle/Paddle paddle -cd paddle -``` -## Requirements - -To compile the source code, your computer must be equipped with the following dependencies. - -- **Compiler**: GCC >= 4.8 or Clang >= 3.3 (AppleClang >= 5.1) and gfortran compiler -- **CMake**: CMake >= 3.0 (at least CMake 3.4 on Mac OS X) -- **BLAS**: MKL, OpenBlas or ATLAS -- **Python**: only support Python 2.7 -- **Go** - -**Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported! -For CUDA 8.0, GCC versions later than 5.3 are not supported! - -### Options - -PaddlePaddle supports some build options. - - - - - - - - - - - - - - - - - - - - - - - - - - -
-| Option | Description |
-| --- | --- |
-| WITH_GPU | Compile PaddlePaddle with NVIDIA GPU |
-| WITH_AVX | Compile PaddlePaddle with AVX intrinsics |
-| WITH_DSO | Compile PaddlePaddle with dynamically linked CUDA |
-| WITH_TESTING | Compile PaddlePaddle with unit testing |
-| WITH_SWIG_PY | Compile PaddlePaddle with inference api |
-| WITH_STYLE_CHECK | Compile PaddlePaddle with style check |
-| WITH_PYTHON | Compile PaddlePaddle with python interpreter |
-| WITH_DOUBLE | Compile PaddlePaddle with double precision |
-| WITH_RDMA | Compile PaddlePaddle with RDMA support |
-| WITH_TIMER | Compile PaddlePaddle with stats timer |
-| WITH_PROFILER | Compile PaddlePaddle with GPU profiler |
-| WITH_DOC | Compile PaddlePaddle with documentation |
-| WITH_COVERAGE | Compile PaddlePaddle with code coverage |
-| COVERALLS_UPLOAD | Package code coverage data to coveralls |
-| ON_TRAVIS | Exclude special unit test on Travis CI |
- - -**Note:** - - The GPU version works best with Cuda Toolkit 8.0 and cuDNN v5. - - Other versions like Cuda Toolkit 7.0, 7.5 and cuDNN v3, v4 are also supported. - - **To utilize cuDNN v5, Cuda Toolkit 7.5 is prerequisite and vice versa.** - -As a simple example, consider the following: - -1. **BLAS Dependencies(optional)** - - CMake will search BLAS libraries from the system. If not found, OpenBLAS will be downloaded, built and installed automatically. - To utilize preinstalled BLAS, you can simply specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`. - - ```bash - # specify MKL - cmake .. -DMKL_ROOT= - # or specify OpenBLAS - cmake .. -DOPENBLAS_ROOT= - ``` - -2. **Doc Dependencies(optional)** - - To generate PaddlePaddle's documentation, install dependencies and set `-DWITH_DOC=ON` as follows: - - ```bash - pip install 'sphinx>=1.4.0' - pip install sphinx_rtd_theme recommonmark - - # install doxygen on Ubuntu - sudo apt-get install doxygen - # install doxygen on Mac OS X - brew install doxygen - - # active docs in cmake - cmake .. -DWITH_DOC=ON` - ``` - -## Build on Ubuntu 14.04 - -### Install Dependencies - -- **Paddle Dependencies** - - ```bash - # necessary - sudo apt-get update - sudo apt-get install -y git curl gcc g++ gfortran make build-essential automake - sudo apt-get install -y python python-pip python-numpy libpython-dev bison - sudo pip install 'protobuf==3.1.0.post1' - - # Install Go - # You can follow https://golang.org/doc/install for a detailed explanation. - wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \ - tar -C $HOME -xzf go.tgz && \ - mkdir $HOME/gopath && \ - rm go.tgz - - # Setup environment variables - export GOROOT=$HOME/go - export GOPATH=$HOME/gopath - export PATH=$PATH:$GOROOT/bin - - # install cmake 3.4 - curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \ - cd cmake-3.4.1 && ./bootstrap && make -j4 && sudo make install && \ - cd .. && rm -rf cmake-3.4.1 - ``` - -- **GPU Dependencies (optional)** - - To build GPU version, you will need the following installed: - - 1. a CUDA-capable GPU - 2. A supported version of Linux with a GCC compiler and toolchain - 3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads) - 4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn) - - The CUDA development environment relies on tight integration with the host development environment, - including the host compiler and C runtime libraries, and is therefore only supported on - distribution versions that have been qualified for this CUDA Toolkit release. - - After downloading cuDNN library, issue the following commands: - - ```bash - sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local - sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn* - ``` - Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc. - - ```bash - export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH - export PATH=/usr/local/cuda/bin:$PATH - ``` - -### Build and Install - -As usual, the best option is to create build folder under paddle project directory. - -```bash -mkdir build && cd build -``` - -Finally, you can build and install PaddlePaddle: - -```bash -# you can add build option here, such as: -cmake .. 
-DCMAKE_INSTALL_PREFIX=
-# please use sudo make install, if you want to install PaddlePaddle into the system
-make -j `nproc` && make install
-# set PaddlePaddle installation path in ~/.bashrc
-export PATH=/bin:$PATH
-# install PaddlePaddle Python modules.
-sudo pip install /opt/paddle/share/wheels/*.whl
-```
-
-## Build on Centos 7
-
-### Install Dependencies
-
-- **CPU Dependencies**
-
-    ```bash
-    # necessary
-    sudo yum update
-    sudo yum install -y epel-release
-    sudo yum install -y make cmake3 python-devel python-pip gcc-gfortran swig git
-    sudo pip install wheel numpy
-    sudo pip install 'protobuf>=3.0.0'
-    ```
-
-- **GPU Dependencies (optional)**
-
-    To build the GPU version, you will need the following installed:
-
-    1. a CUDA-capable GPU
-    2. A supported version of Linux with a GCC compiler and toolchain
-    3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
-    4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)
-
-    The CUDA development environment relies on tight integration with the host development environment,
-    including the host compiler and C runtime libraries, and is therefore only supported on
-    distribution versions that have been qualified for this CUDA Toolkit release.
-
-    After downloading the cuDNN library, issue the following commands:
-
-    ```bash
-    sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local
-    sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
-    ```
-    Then you need to set the LD\_LIBRARY\_PATH and PATH environment variables in ~/.bashrc.
-
-    ```bash
-    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
-    export PATH=/usr/local/cuda/bin:$PATH
-    ```
-
-### Build and Install
-
-As usual, the best option is to create a build folder under the paddle project directory.
-
-```bash
-mkdir build && cd build
-```
-
-Finally, you can build and install PaddlePaddle:
-
-```bash
-# you can add build options here, such as:
-cmake3 .. -DCMAKE_INSTALL_PREFIX=
-# please use sudo make install, if you want to install PaddlePaddle into the system
-make -j `nproc` && make install
-# set PaddlePaddle installation path in ~/.bashrc
-export PATH=/bin:$PATH
-# install PaddlePaddle Python modules.
-sudo pip install /opt/paddle/share/wheels/*.whl
-```
diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/getstarted/build_and_install/build_from_source_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..76fbc43de2e83580dd79b874507c103533022436
--- /dev/null
+++ b/doc/getstarted/build_and_install/build_from_source_en.rst
@@ -0,0 +1,159 @@
+Build from Sources
+==========================
+
+.. _build_step:
+
+How To Build
+----------------
+
+PaddlePaddle mainly uses `CMake `_ and GCC, G++ as compile
+tools. We recommend using our pre-built Docker image to run the build,
+to avoid installing the dependencies yourself. We have several build environment
+Docker images `here `_ .
+
+If you choose not to use the Docker image for your build, you need to install the
+`Compile Dependencies`_ listed below before running the build.
+
+Then run:
+
+.. code-block:: bash
+
+   git clone https://github.com/PaddlePaddle/Paddle.git
+   cd Paddle
+   # run the following command to build CPU-only binaries if you are using docker
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+   # otherwise, run these commands
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=OFF ..
+   make
+
+When the build finishes, you can find the generated whl package under
+build/python/dist; you can then either install it on the local
+machine or copy it to the target machine:
+
+.. code-block:: bash
+
+   pip install python/dist/*.whl
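+A quick way to check the installed package is to import it and build a
+trivial layer. The following is only a minimal sketch; it assumes nothing
+beyond the :code:`paddle.v2` API used in the Quick Start guide:
+
+.. code-block:: python
+
+   import paddle.v2 as paddle
+
+   # Initialize PaddlePaddle in CPU mode.
+   paddle.init(use_gpu=False, trainer_count=1)
+
+   # Declare a small dense input layer; if this runs without errors,
+   # the wheel and its native libraries were installed correctly.
+   x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+   print 'PaddlePaddle is installed and working.'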
+
+.. _run_test:
+
+Run Tests
+----------------
+
+If you wish to run the tests, follow the steps below:
+
+When using Docker, setting :code:`RUN_TEST=ON` and :code:`WITH_TESTING=ON` will run the tests immediately after the build.
+Setting :code:`WITH_GPU=ON` will run the GPU unit tests as well.
+
+.. code-block:: bash
+
+   docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=ON" -e "RUN_TEST=ON" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x paddle/scripts/docker/build.sh
+
+If you don't use Docker, simply running ctest will start the tests:
+
+.. code-block:: bash
+
+   mkdir build
+   cd build
+   cmake -DWITH_GPU=OFF -DWITH_TESTING=ON ..
+   make
+   ctest
+   # run a single test like test_mul_op
+   ctest -R test_mul_op
+
+
+.. _compile_deps:
+
+Compile Dependencies
+--------------------
+
+PaddlePaddle needs the following dependencies when compiling; other dependencies
+will be downloaded automatically.
+
+.. csv-table:: PaddlePaddle Compile Dependencies
+   :header: "Dependency", "Version", "Description"
+   :widths: 10, 15, 30
+
+   "CMake", ">=3.5", ""
+   "GCC", "4.8.2", "Recommend devtools2 for CentOS"
+   "Python", "2.7.x", "Need libpython2.7.so"
+   "pip", ">=9.0", ""
+   "numpy", "", ""
+   "SWIG", ">=2.0", ""
+   "Go", ">=1.8", "Optional"
+
+
+.. _build_options:
+
+Build Options
+----------------
+
+Build options include whether to build binaries for CPU or GPU, which BLAS
+library to use, etc. You may pass these settings when running cmake.
+For a detailed cmake tutorial please refer to `here `_ .
+
+.. _build_options_bool:
+
+Bool Type Options
+-----------------
+
+You can add the :code:`-D` argument to pass such options, like:
+
+.. code-block:: bash
+
+   cmake .. -DWITH_GPU=OFF
+
+.. csv-table:: Bool Type Options
+   :header: "Option", "Description", "Default"
+   :widths: 1, 7, 2
+
+   "WITH_GPU", "Build with GPU support", "ON"
+   "WITH_C_API", "Build only CAPI", "OFF"
+   "WITH_DOUBLE", "Build with double precision", "OFF"
+   "WITH_DSO", "Dynamically load CUDA libraries", "ON"
+   "WITH_AVX", "Build with AVX support", "ON"
+   "WITH_PYTHON", "Build with integrated Python interpreter", "ON"
+   "WITH_STYLE_CHECK", "Check code style when building", "ON"
+   "WITH_TESTING", "Build unit tests", "ON"
+   "WITH_DOC", "Build documentation", "OFF"
+   "WITH_SWIG_PY", "Build Python SWIG interface for V2 API", "Auto"
+   "WITH_GOLANG", "Build fault-tolerant parameter server written in go", "ON"
+   "WITH_MKL", "Use MKL as BLAS library, else use OpenBLAS", "ON"
+
+
+BLAS
++++++
+
+PaddlePaddle supports `MKL `_ and
+`OpenBLAS `_ as BLAS libraries. By default it uses MKL.
+If you are using MKL and your machine supports AVX2, MKL-DNN will also be downloaded
+and used; for more details see `here `_ .
+
+If you choose not to use MKL, then OpenBLAS will be used.
+
+CUDA/cuDNN
++++++++++++
+
+PaddlePaddle will automatically find CUDA and cuDNN when compiling and running.
+The parameter :code:`-DCUDA_ARCH_NAME=Auto` can be used to detect the SM architecture
+automatically in order to speed up the build.
+
+PaddlePaddle can be built with any cuDNN version from v5.1 onward, and we intend to
+keep up with the latest cuDNN versions. Be sure to run with the same version of
+cuDNN you built against.
+
+Pass Compile Options
+++++++++++++++++++++
+
+You can pass compile options to use the intended BLAS/CUDA/cuDNN libraries.
+When running the cmake command, it will first search system paths like
+:code:`/usr/lib:/usr/local/lib` and then the paths that you
+pass to cmake, e.g.:
+
+.. code-block:: bash
+
+   cmake .. -DWITH_GPU=ON -DWITH_TESTING=OFF -DCUDNN_ROOT=/opt/cudnnv5
+
+**NOTE: These options only take effect when running cmake for the first time; if you want to change them later, clean the cmake cache or the whole build directory (** :code:`rm -rf` **) and run cmake again.**
diff --git a/doc/getstarted/build_and_install/cmake.png b/doc/getstarted/build_and_install/cmake.png
deleted file mode 100644
index a58cd09ad99cf27cc1ca5785fe54d726b83a82f6..0000000000000000000000000000000000000000
Binary files a/doc/getstarted/build_and_install/cmake.png and /dev/null differ
diff --git a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
deleted file mode 100644
index be0c1ffa451b2901ec06621dd4d886f800b4562e..0000000000000000000000000000000000000000
--- a/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
+++ /dev/null
@@ -1,43 +0,0 @@
-PaddlePaddle的编译选项
-======================
-
-PaddlePaddle的编译选项,包括生成CPU/GPU二进制文件、链接何种BLAS库等。用户可在调用cmake的时候设置它们,详细的cmake使用方法可以参考 `官方文档 `_ 。
-
-Bool型的编译选项
-----------------
-用户可在cmake的命令行中,通过使用 ``-D`` 命令设置该类编译选项,例如
-
-.. code-block:: bash
-
-    cmake .. -DWITH_GPU=OFF
-
-.. csv-table:: Bool型的编译选项
-   :widths: 1, 7, 2
-   :file: compile_options.csv
-
-BLAS/CUDA/Cudnn的编译选项
---------------------------
-BLAS
-+++++
-
-PaddlePaddle支持以下任意一种BLAS库:`MKL `_ ,`ATLAS `_ ,`OpenBlAS `_ 和 `REFERENCE BLAS `_ 。
-
-.. csv-table:: BLAS路径相关的编译选项
-   :widths: 1, 2, 7
-   :file: cblas_settings.csv
-
-CUDA/Cudnn
-+++++++++++
-
-PaddlePaddle可以使用cudnn v2之后的任何一个版本来编译运行,但尽量请保持编译和运行使用的cudnn是同一个版本。 我们推荐使用最新版本的cudnn v5.1。
-
-编译选项的设置
-++++++++++++++
-
-PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。cmake编译时,首先在系统路径(/usr/lib\:/usr/local/lib)中搜索这几个库,同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置,例如
-
-.. code-block:: bash
-
-    cmake .. 
-DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5 - -注意:这几个编译选项的设置,只在第一次cmake的时候有效。如果之后想要重新设置,推荐清理整个编译目录(``rm -rf``)后,再指定。 diff --git a/doc/getstarted/build_and_install/cmake/cblas_settings.csv b/doc/getstarted/build_and_install/cmake/cblas_settings.csv deleted file mode 100644 index a6356baf16a0d3d2499e39d2055d8ee878dcaef2..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/cmake/cblas_settings.csv +++ /dev/null @@ -1,5 +0,0 @@ -编译选项,描述,注意 -MKL_ROOT,MKL的路径,${MKL_ROOT}/include下需要包含mkl.h,${MKL_ROOT}/lib目录下需要包含mkl_core,mkl_sequential和mkl_intel_lp64三个库。 -ATLAS_ROOT,ATLAS的路径,${ATLAS_ROOT}/include下需要包含cblas.h,${ATLAS_ROOT}/lib下需要包含cblas和atlas两个库。 -OPENBLAS_ROOT,OpenBLAS的路径,${OPENBLAS_ROOT}/include下需要包含cblas.h,${OPENBLAS_ROOT}/lib下需要包含openblas库。 -REFERENCE_CBLAS_ROOT,REFERENCE BLAS的路径,${REFERENCE_CBLAS_ROOT}/include下需要包含cblas.h,${REFERENCE_CBLAS_ROOT}/lib下需要包含cblas库。 \ No newline at end of file diff --git a/doc/getstarted/build_and_install/cmake/compile_options.csv b/doc/getstarted/build_and_install/cmake/compile_options.csv deleted file mode 100644 index 463b825470579d0c3736a408b1e82dd33e6f8d42..0000000000000000000000000000000000000000 --- a/doc/getstarted/build_and_install/cmake/compile_options.csv +++ /dev/null @@ -1,12 +0,0 @@ -选项,说明,默认值 -WITH_GPU,是否支持GPU。,取决于是否寻找到CUDA工具链 -WITH_DOUBLE,是否使用双精度浮点数。,否 -WITH_DSO,是否运行时动态加载CUDA动态库,而非静态加载CUDA动态库。,是 -WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制文件,是 -WITH_PYTHON,是否内嵌PYTHON解释器。方便今后的嵌入式移植工作。,是 -WITH_STYLE_CHECK,是否编译时进行代码风格检查,是 -WITH_RDMA,是否开启RDMA,否 -WITH_TIMER,是否开启计时功能。如果开启会导致运行略慢,打印的日志变多,但是方便调试和测Benchmark,否 -WITH_TESTING,是否开启单元测试,取决于是否寻找到GTEST -WITH_DOC,是否编译中英文文档,否 -WITH_SWIG_PY,是否编译PYTHON的SWIG接口,该接口可用于预测和定制化训练,取决于是否寻找到SWIG \ No newline at end of file diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst index 0d34dec8e908c5e61001500725187a2233797f46..f78b1fb0e11aa028a4b7abb5270740b97f8039e9 100644 --- a/doc/getstarted/build_and_install/docker_install_cn.rst +++ b/doc/getstarted/build_and_install/docker_install_cn.rst @@ -1,222 +1,139 @@ -PaddlePaddle的Docker容器使用方式 +使用Docker安装运行 ================================ -PaddlePaddle目前唯一官方支持的运行的方式是Docker容器。因为Docker能在所有主要操作系统(包括Linux,Mac OS X和Windows)上运行。 请注意,您需要更改 `Dockers设置 `_ 才能充分利用Mac OS X和Windows上的硬件资源。 +使用Docker安装和运行PaddlePaddle可以无需考虑依赖环境即可运行。并且也可以在Windows的docker中运行。 +您可以在 `Docker官网 `_ 获得基本的Docker安装和使用方法。 -Docker使用入门 ------------------------------- - -几个基础的概念帮助理解和使用Docker: +如果您在使用Windows,可以参考 +`这篇 `_ +教程,完成在Windows上安装和使用Docker。 -- *镜像*:一个Docker镜像是一个打包好的软件。它包含了这个软件本身和它所依赖的运行环境。PaddlePaddle的Docker镜像就包含了PaddlePaddle的Python库以及其依赖的多个Python库。这样我们可以直接在Docker中运行需要的程序而不需要安装后在执行。可以执行: +在了解Docker的基本使用方法之后,即可开始下面的步骤: - .. code-block:: bash +.. _docker_pull: - docker images +获取PaddlePaddle的Docker镜像 +------------------------------ - 来列出当前系统中的所有镜像,同样可以执行: +执行下面的命令获取最新的PaddlePaddle Docker镜像 .. code-block:: bash - - docker pull paddlepaddle/paddle:0.10.0 - 来下载Docker镜像,paddlepaddle/paddle是从官方镜像源Dockerhub.com下载的,推荐国内用户使用docker.paddlepaddle.org/paddle下载。 + docker pull paddlepaddle/paddle -- *容器*: 如果说一个Docker镜像就是一个程序,那容器就是这个程序运行时产生的“进程”。 - 实际上,一个容器就是一个操作系统的进程,但是是运行在独立的进程空间,文件系统以及网络之上。 - 可以执行: +对于国内用户,我们提供了加速访问的镜像源: .. code-block:: bash - docker run paddlepaddle/paddle:0.10.0 + docker pull docker.paddlepaddle.org/paddle - 来使用一个镜像启动一个容器。 - -- 默认情况下,Docker容器会运行在独立的文件系统空间之上,我们无法在Docker容器中 - 访问到主机上的文件。可以通过*挂载Volume*的方式,将主机上的文件或目录挂载到 - Docker容器中。下面的命令把当前目录挂载到了容器中的 /data 目录下,容器使用 - debian镜像,并且启动后执行 :code:`ls /data`。 +下载GPU版本的Docker镜像: .. 
code-block:: bash - docker run --rm -v $(pwd):/data debian ls /data - -PaddlePaddle发布的Docker镜像使用说明 ------------------------------- - -我们把PaddlePaddle的编译环境打包成一个镜像,称为开发镜像,里面涵盖了 -PaddlePaddle需要的所有编译工具。把编译出来的PaddlePaddle也打包成一个镜 -像,称为生产镜像,里面涵盖了PaddlePaddle运行所需的所有环境。每次 -PaddlePaddle发布新版本的时候都会发布对应版本的生产镜像以及开发镜像。运 -行镜像包括纯CPU版本和GPU版本以及其对应的非AVX版本。我们会在 -`dockerhub.com `_ -和国内镜像`docker.paddlepaddle.org` 提供最新 -的Docker镜像,可以在"tags"标签下找到最新的Paddle镜像版本。 - -**注意:为了方便在国内的开发者下载Docker镜像,我们提供了国内的镜像服务器供大家使用。如果您在国内,请把文档里命令中的paddlepaddle/paddle替换成docker.paddlepaddle.org/paddle。** - -1. 开发镜像::code:`paddlepaddle/paddle:0.10.0-dev` - - 这个镜像包含了Paddle相关的开发工具以及编译和运行环境。用户可以使用开发镜像代替配置本地环境,完成开发,编译,发布, - 文档编写等工作。由于不同的Paddle的版本可能需要不同的依赖和工具,所以如果需要自行配置开发环境需要考虑版本的因素。 - 开发镜像包含了以下工具: - - - gcc/clang - - nvcc - - Python - - sphinx - - woboq - - sshd - 很多开发者会使用远程的安装有GPU的服务器工作,用户可以使用ssh登录到这台服务器上并执行 :code:`docker exec`进入开发镜像并开始工作, - 也可以在开发镜像中启动一个SSHD服务,方便开发者直接登录到镜像中进行开发: - - 以交互容器方式运行开发镜像: - - .. code-block:: bash - - docker run -it --rm -v $(pwd):/paddle paddlepaddle/paddle:0.10.0-dev /bin/bash - - 或者,可以以后台进程方式运行容器: - - .. code-block:: bash - - docker run -d -p 2202:22 -p 8888:8888 -v $(pwd):/paddle paddlepaddle/paddle:0.10.0-dev /usr/sbin/sshd -D - - 然后用密码 :code:`root` SSH进入容器: - - .. code-block:: bash - - ssh -p 2202 root@localhost - - SSH方式的一个优点是我们可以从多个终端进入容器。比如,一个终端运行vi,另一个终端运行Python。另一个好处是我们可以把PaddlePaddle容器运行在远程服务器上,并在笔记本上通过SSH与其连接。 - -2. 生产镜像:根据CPU、GPU和非AVX区分了如下4个镜像: - - - GPU/AVX::code:`paddlepaddle/paddle:-gpu` - - GPU/no-AVX::code:`paddlepaddle/paddle:-gpu-noavx` - - CPU/AVX::code:`paddlepaddle/paddle:` - - CPU/no-AVX::code:`paddlepaddle/paddle:-noavx` - - 纯CPU镜像以及GPU镜像都会用到AVX指令集,但是2008年之前生产的旧电脑不支持AVX。以下指令能检查Linux电脑是否支持AVX: - - .. code-block:: bash - - if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi - - 如果输出是No,就需要选择使用no-AVX的镜像 - - **注:在0.10.0之后的版本,PaddlePaddle都可以自动判断硬件是否支持AVX,所以无需判断AVX即可使用** + docker pull paddlepaddle/paddle:latest-gpu + docker pull docker.paddlepaddle.org/paddle:latest-gpu - 以上方法在GPU镜像里也能用,只是请不要忘记提前在物理机上安装GPU最新驱动。 - 为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用[nvidia-docker](https://github.com/NVIDIA/nvidia-docker)来运行镜像。 +选择下载使用不同的BLAS库的Docker镜像: - .. code-block:: bash - - nvidia-docker run -it --rm paddledev/paddle:0.10.0-gpu /bin/bash + .. code-block:: bash - 注意: 如果使用nvidia-docker存在问题,你也许可以尝试更老的方法,具体如下,但是我们并不推荐这种方法。: + # 默认是使用MKL的镜像 + docker pull paddlepaddle/paddle + # 使用OpenBLAS的镜像 + docker pull paddlepaddle/paddle:latest-openblas - .. code-block:: bash +下载指定版本的Docker镜像,可以从 `DockerHub网站 `_ 获取可选的tag,并执行下面的命令: - export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" - export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') - docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0-gpu + .. code-block:: bash -3. 运行以及发布您的AI程序 + docker pull paddlepaddle/paddle:[tag] + # 比如: + docker pull docker.paddlepaddle.org/paddle:0.10.0-gpu - 假设您已经完成了一个AI训练的python程序 :code:`a.py`,这个程序是您在开发机上使用开发镜像完成开发。此时您可以运行这个命令在开发机上进行测试运行: +.. _docker_run: - .. code-block:: bash +在Docker中执行PaddlePaddle训练程序 +------------------------------ - docker run -it -v $PWD:/work paddle /work/a.py +假设您已经在当前目录(比如在/home/work)编写了一个PaddlePaddle的程序 :code:`train.py` (可以参考 +`PaddlePaddleBook `_ +编写),就可以使用下面的命令开始执行训练: - 如果要使用GPU,请运行: + .. code-block:: bash - .. 
code-block:: bash + cd /home/work + docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py + +上述命令中, :code:`-it` 参数说明容器已交互式运行; :code:`-v $PWD:/work` +指定将当前路径(Linux中$PWD变量会展开为当前路径的绝对路径)挂载到容器内部的 :code:`/work` +目录; :code:`paddlepaddle/paddle` 指定需要使用的容器; 最后 :code:`/work/train.py` +为容器内执行的命令,即运行训练程序。 - nvidia-docker run -it -v $PWD:/work paddle /work/a.py +当然,您也可以进入到Docker容器中,以交互式的方式执行或调试您的代码: + .. code-block:: bash + docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash + cd /work + python train.py - 这里`a.py`包含的所有依赖假设都可以在Paddle的运行容器中。如果需要包含更多的依赖、或者需要发布您的应用的镜像,可以编写`Dockerfile`使用`FROM paddledev/paddle:0.10.0` - 创建和发布自己的AI程序镜像。 +**注:PaddlePaddle Docker镜像为了减小体积,默认没有安装vim,您可以在容器中执行** :code:`apt-get install -y vim` **安装后,在容器中编辑代码。** -运行PaddlePaddle Book ---------------------- +.. _docker_run_book: -Jupyter Notebook是一个开源的web程序,大家可以通过它制作和分享带有代码、公式、图表、文字的交互式文档。用户可以通过网页浏览文档。 +使用Docker启动PaddlePaddle Book教程 +------------------------------ +使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook,可以通过网页浏览。 PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。 如果您想要更深入了解deep learning,PaddlePaddle Book一定是您最好的选择。 +大家可以通过它阅读教程,或者制作和分享带有代码、公式、图表、文字的交互式文档。 我们提供可以直接运行PaddlePaddle Book的Docker镜像,直接运行: -.. code-block:: bash + .. code-block:: bash - docker run -p 8888:8888 paddlepaddle/book + docker run -p 8888:8888 paddlepaddle/book 然后在浏览器中输入以下网址: -.. code-block:: text + .. code-block:: text - http://localhost:8888/ + http://localhost:8888/ 就这么简单,享受您的旅程! -通过Docker容器开发PaddlePaddle ------------------------------- - -开发人员可以在Docker开发镜像中开发PaddlePaddle。这样开发人员可以以一致的方式在不同的平台上工作 - Linux,Mac OS X和Windows。 +.. _docker_run_gpu: -1. 制作PaddlePaddle开发镜像 - - PaddlePaddle每次发布新版本都会发布对应的开发镜像供开发者直接使用。这里介绍如生成造这个开发镜像。 - 生成Docker镜像的方式有两个,一个是直接把一个容器转换成镜像,另一个是创建Dockerfile并运行docker build指令按照Dockerfile生成镜像。第一个方法的好处是简单快捷,适合自己实验,可以快速迭代。第二个方法的好处是Dockerfile可以把整个生成流程描述很清楚,其他人很容易看懂镜像生成过程,持续集成系统也可以简单地复现这个过程。我们采用第二个方法。Dockerfile位于PaddlePaddle repo的根目录。生成生产镜像只需要运行: - - .. code-block:: bash - - git clone https://github.com/PaddlePaddle/Paddle.git - cd Paddle - docker build -t paddle:dev . - - docker build这个命令的-t指定了生成的镜像的名字,这里我们用paddle:dev。到此,PaddlePaddle开发镜像就被构建完毕了。 +使用Docker执行GPU训练 +------------------------------ -2. 制作PaddlePaddle生产镜像 +为了保证GPU驱动能够在镜像里面正常运行,我们推荐使用 +`nvidia-docker `_ 来运行镜像。 +请不要忘记提前在物理机上安装GPU最新驱动。 - 生产镜像的生成分为两步,第一步是运行: + .. code-block:: bash - .. code-block:: bash - - docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev + nvidia-docker run -it -v $PWD:/work paddledev/paddle:latest-gpu /bin/bash - 以上命令会编译PaddlePaddle,生成运行程序,以及生成创建生产镜像的Dockerfile。所有生成的的文件都在build目录下。“WITH_GPU”控制生成的生产镜像是否支持GPU,“WITH_AVX”控制生成的生产镜像是否支持AVX,”WITH_TEST“控制是否生成单元测试。 +**注: 如果没有安装nvidia-docker,可以尝试以下的方法,将CUDA库和Linux设备挂载到Docker容器内:** - 第二步是运行: + .. code-block:: bash - .. code-block:: bash - - docker build -t paddle:prod -f build/Dockerfile ./build + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:latest-gpu - 以上命令会按照生成的Dockerfile把生成的程序拷贝到生产镜像中并做相应的配置,最终生成名为paddle:prod的生产镜像。 +**关于AVX:** -3. 运行单元测试 +AVX是一种CPU指令集,可以加速PaddlePaddle的计算。最新的PaddlePaddle Docker镜像默认 +是开启AVX编译的,所以,如果您的电脑不支持AVX,需要单独 +`编译 <./build_from_source_cn.rst>`_ PaddlePaddle为no-avx版本。 - 运行以下指令: +以下指令能检查Linux电脑是否支持AVX: .. 
code-block:: bash - - docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest" - -文档 ----- - -Paddle的Docker开发镜像带有一个通过 `woboq code browser -`_ 生成的HTML版本的C++源代码,便于用户浏览C++源码。 -只要在Docker里启动PaddlePaddle的时候给它一个名字,就可以再运行另一个Nginx Docker镜像来服务HTML代码: - -.. code-block:: bash - - docker run -d --name paddle-cpu-doc paddle:0.10.0-dev - docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx + if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi -接着我们就能够打开浏览器在 http://localhost:8088/paddle/ 浏览代码。 +如果输出是No,就需要选择使用no-AVX的镜像 diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst index 94860240f6a4a9bed8a865684a8a79960489280e..d7acc7aeb744b19d83acb520d07c8551168dd096 100644 --- a/doc/getstarted/build_and_install/docker_install_en.rst +++ b/doc/getstarted/build_and_install/docker_install_en.rst @@ -1,270 +1,146 @@ -PaddlePaddle in Docker Containers +Run in Docker Containers ================================= -Docker container is currently the only officially-supported way to -running PaddlePaddle. This is reasonable as Docker now runs on all -major operating systems including Linux, Mac OS X, and Windows. -Please be aware that you will need to change `Dockers settings -`_ to make full use -of your hardware resource on Mac OS X and Windows. +Run PaddlePaddle in Docker container so that you don't need to care about +runtime dependencies, also you can run under Windows system. You can get +tutorials at `here `_ . -Working With Docker -------------------- +If you are using Windows, please refer to +`this `_ +tutorial to start running docker under windows. -Docker is simple as long as we understand a few basic concepts: +After you've read above tutorials you may proceed the following steps. -- *image*: A Docker image is a pack of software. It could contain one or more programs and all their dependencies. For example, the PaddlePaddle's Docker image includes pre-built PaddlePaddle and Python and many Python packages. We can run a Docker image directly, other than installing all these software. We can type +.. _docker_pull: - .. code-block:: bash - - docker images +Pull PaddlePaddle Docker Image +------------------------------ - to list all images in the system. We can also run +Run the following command to download the latest Docker images: .. code-block:: bash - - docker pull paddlepaddle/paddle:0.10.0 - to download a Docker image, paddlepaddle/paddle in this example, - from Dockerhub.com. + docker pull paddlepaddle/paddle -- *container*: considering a Docker image a program, a container is a - "process" that runs the image. Indeed, a container is exactly an - operating system process, but with a virtualized filesystem, network - port space, and other virtualized environment. We can type +For users in China, we provide a faster mirror: .. code-block:: bash - docker run paddlepaddle/paddle:0.10.0 + docker pull docker.paddlepaddle.org/paddle - to start a container to run a Docker image, paddlepaddle/paddle in this example. - -- By default docker container have an isolated file system namespace, - we can not see the files in the host file system. By using *volume*, - mounted files in host will be visible inside docker container. - Following command will mount current dirctory into /data inside - docker container, run docker container from debian image with - command :code:`ls /data`. +Download GPU version images: .. 
code-block:: bash - docker run --rm -v $(pwd):/data debian ls /data - -Usage of CPU-only and GPU Images ----------------------------------- - -We package PaddlePaddle's compile environment into a Docker image, -called the develop image, it contains all compiling tools that -PaddlePaddle needs. We package compiled PaddlePaddle program into a -Docker image as well, called the production image, it contains all -runtime environment that running PaddlePaddle needs. For each version -of PaddlePaddle, we release both of them. Production image includes -CPU-only version and a CUDA GPU version and their no-AVX versions. - -We put the docker images on `dockerhub.com -`_. You can find the -latest versions under "tags" tab at dockerhub.com. - -** NOTE: If you are in China, you can use our Docker image registry mirror to speed up the download process. To use it, please replace all paddlepaddle/paddle in the commands to docker.paddlepaddle.org/paddle.** - - -1. development image :code:`paddlepaddle/paddle:-dev` - - This image has packed related develop tools and runtime - environment. Users and developers can use this image instead of - their own local computer to accomplish development, build, - releasing, document writing etc. While different version of paddle - may depends on different version of libraries and tools, if you - want to setup a local environment, you must pay attention to the - versions. The development image contains: - - - gcc/clang - - nvcc - - Python - - sphinx - - woboq - - sshd - - Many developers use servers with GPUs, they can use ssh to login to - the server and run :code:`docker exec` to enter the docker - container and start their work. Also they can start a development - docker image with SSHD service, so they can login to the container - and start work. - -2. Production images, this image might have multiple variants: - - - GPU/AVX::code:`paddlepaddle/paddle:-gpu` - - GPU/no-AVX::code:`paddlepaddle/paddle:-gpu-noavx` - - CPU/AVX::code:`paddlepaddle/paddle:` - - CPU/no-AVX::code:`paddlepaddle/paddle:-noavx` - - Please be aware that the CPU-only and the GPU images both use the - AVX instruction set, but old computers produced before 2008 do not - support AVX. The following command checks if your Linux computer - supports AVX: - - .. code-block:: bash - - if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi - - **NOTE:versions after 0.10.0 will automatically detect system AVX support, so manual detect is not needed in this case.** - To run the CPU-only image as an interactive container: - - .. code-block:: bash - - docker run -it --rm paddlepaddle/paddle:0.10.0 /bin/bash - - Above method work with the GPU image too -- the recommended way is - using `nvidia-docker `_. - - Please install nvidia-docker first following this `tutorial - `_. - - Now you can run a GPU image: - - .. code-block:: bash - - nvidia-docker run -it --rm paddlepaddle/paddle:0.10.0-gpu /bin/bash - - -Train Model Using Python API ----------------------------- - -Our official docker image provides a runtime for PaddlePaddle -programs. The typical workflow will be as follows: - -Create a directory as workspace: - -.. code-block:: bash - - mkdir ~/workspace - -Edit a PaddlePaddle python program using your favourite editor - -.. code-block:: bash - - emacs ~/workspace/example.py - -Run the program using docker: - -.. 
code-block:: bash
-
-   docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0 python /workspace/example.py
-
-Or if you are using GPU for training:
+      docker pull paddlepaddle/paddle:latest-gpu
+      docker pull docker.paddlepaddle.org/paddle:latest-gpu
-.. code-block:: bash
+Choose between different BLAS versions:
-   nvidia-docker run --rm -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0-gpu python /workspace/example.py
-
-Above commands will start a docker container by running :code:`python
-/workspace/example.py`. It will stop once :code:`python
-/workspace/example.py` finishes.
-
-Another way is to tell docker to start a :code:`/bin/bash` session and
-run PaddlePaddle program interactively:
-
-.. code-block:: bash
-
-   docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0 /bin/bash
-   # now we are inside docker container
-   cd /workspace
-   python example.py
-
-Running with GPU is identical:
-
-.. code-block:: bash
-
-   nvidia-docker run -it -v ~/workspace:/workspace paddlepaddle/paddle:0.10.0-gpu /bin/bash
-   # now we are inside docker container
-   cd /workspace
-   python example.py
-
-
-Develop PaddlePaddle or Train Model Using C++ API
----------------------------------------------------
-
-We will be using PaddlePaddle development image since it contains all
-compiling tools and dependencies.
+    .. code-block:: bash
-1. Build PaddlePaddle develop image
+      # image using MKL by default
+      docker pull paddlepaddle/paddle
+      # image using OpenBLAS
+      docker pull paddlepaddle/paddle:latest-openblas
-   Use following command to build PaddlePaddle develop image:
-   .. code-block:: bash
+If you want to use legacy versions, choose a tag from
+`DockerHub `_
+and run:
-      git clone https://github.com/PaddlePaddle/Paddle.git && cd Paddle
-      docker build -t paddle:dev .
-
-2. Build PaddlePaddle production image
+    .. code-block:: bash
-   There are two steps for building production image, the first step is to run:
+      docker pull paddlepaddle/paddle:[tag]
+      # e.g.
+      docker pull docker.paddlepaddle.org/paddle:0.10.0-gpu
-   .. code-block:: bash
+.. _docker_run:
-      docker run -v $(pwd):/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=OFF" -e "WITH_TEST=ON" paddle:dev
+Launch your training program in Docker
+--------------------------------------
-   The above command will compile PaddlePaddle and create a Dockerfile for building production image. All the generated files are in the build directory. "WITH_GPU" controls if the generated production image supports GPU. "WITH_AVX" controls if the generated production image supports AVX. "WITH_TEST" controls if the unit test will be generated.
+Assume that you have already written a PaddlePaddle program
+named :code:`train.py` under directory :code:`/home/work` (refer to
+`PaddlePaddleBook `_
+for more samples), then run the following command:
-   The second step is to run:
+    .. code-block:: bash
-   .. code-block:: bash
+      cd /home/work
+      docker run -it -v $PWD:/work paddlepaddle/paddle /work/train.py
-      docker build -t paddle:prod -f build/Dockerfile ./build
+In the above command, :code:`-it` means to run the container interactively;
+:code:`-v $PWD:/work` means mount the current directory ($PWD will expand
+to current absolute path in Linux) under :code:`/work` in the container.
+:code:`paddlepaddle/paddle` specifies the image to use; finally
+:code:`/work/train.py` is the command to run inside docker.
-   The above command will generate the production image by copying the compiled PaddlePaddle program into the image. 
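+For reference, here is a minimal :code:`train.py` sketch that the command
+above can execute. It simply reuses the uci_housing example from the Quick
+Start guide; nothing in it is specific to Docker:
+
+    .. code-block:: python
+
+        # /home/work/train.py -- minimal sketch reusing the Quick Start example
+        import paddle.v2 as paddle
+
+        # Initialize PaddlePaddle in CPU mode inside the container.
+        paddle.init(use_gpu=False, trainer_count=1)
+
+        # A trivial network: one fully connected layer over a dense input.
+        x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+        y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+        # Run inference with the pre-trained uci_housing parameters.
+        probs = paddle.infer(
+            output_layer=y_predict,
+            parameters=paddle.dataset.uci_housing.model(),
+            input=[item for item in paddle.dataset.uci_housing.test()()])
+
+        for i in xrange(len(probs)):
+            print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)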
+Also, you can go into the container shell, run or debug your code
+interactively:
-
-3. Run unit test
+    .. code-block:: bash
+      docker run -it -v $PWD:/work paddlepaddle/paddle /bin/bash
+      cd /work
+      python train.py
-   Following command will run unit test:
+**NOTE: To reduce the image size, vim is not installed in the default docker image. You can run** :code:`apt-get install -y vim` **to install it if you need to edit python files.**
-   .. code-block:: bash
-
-      docker run -it -v $(pwd):/paddle paddle:dev bash -c "cd /paddle/build && ctest"
+.. _docker_run_book:
 PaddlePaddle Book
 ------------------
-The Jupyter Notebook is an open-source web application that allows
-you to create and share documents that contain live code, equations,
-visualizations and explanatory text in a single browser.
-
-PaddlePaddle Book is an interactive Jupyter Notebook for users and developers.
-We already exposed port 8888 for this book. If you want to
+Using Docker, you can create a container serving the PaddlePaddle Book as a
+Jupyter Notebook in one minute. PaddlePaddle Book is an interactive Jupyter
+Notebook for users and developers. If you want to
 dig deeper into deep learning, PaddlePaddle Book definitely is your best choice.
 We provide a packaged book image, simply issue the command:
+    .. code-block:: bash
-.. code-block:: bash
-    docker run -p 8888:8888 paddlepaddle/book
+      docker run -p 8888:8888 paddlepaddle/book
 Then, copy and paste the address into your local browser:
+    .. code-block:: text
-.. code-block:: text
-    http://localhost:8888/
+      http://localhost:8888/
 That's all. Enjoy your journey!
+.. _docker_run_gpu:
-Documentation
--------------
+Train with GPU using Docker
+------------------------------
-Paddle Docker images include an HTML version of C++ source code
-generated using `woboq code browser
-`_. This makes it easy
-for users to browse and understand the C++ source code.
+We recommend using
+`nvidia-docker `_
+to run GPU training jobs. Please ensure you have the latest
+GPU driver installed before moving on.
-As long as we give the Paddle Docker container a name, we can run an
-additional Nginx Docker container to serve the volume from the Paddle
-container:
+    .. code-block:: bash
-.. code-block:: bash
+      nvidia-docker run -it -v $PWD:/work paddlepaddle/paddle:latest-gpu /bin/bash
-   docker run -d --name paddle-cpu-doc paddle:
-   docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
+**NOTE: If you don't have nvidia-docker installed, try the following method to mount CUDA libs and devices into the container.**
+    .. code-block:: bash
-Then we can direct our Web browser to the HTML version of source code
-at http://localhost:8088/paddle/
+      export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')"
+      export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}')
+      docker run ${CUDA_SO} ${DEVICES} -it paddlepaddle/paddle:latest-gpu
+
+**About AVX:**
+
+AVX is a CPU instruction set that can accelerate PaddlePaddle's calculations.
+The latest PaddlePaddle Docker image turns AVX on by default, so if your
+computer doesn't support AVX, you'll probably need to
+`build <./build_from_source_en.rst>`_ with :code:`WITH_AVX=OFF`.
+
+The following command will tell you whether your computer supports AVX.
+
+ .. 
code-block:: bash + + if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi diff --git a/doc/getstarted/build_and_install/index_cn.rst b/doc/getstarted/build_and_install/index_cn.rst index dd9923697ab85825557aa89a08870bece7c76673..88c5142ddee994ed0c0dc520195311e97f5a549e 100644 --- a/doc/getstarted/build_and_install/index_cn.rst +++ b/doc/getstarted/build_and_install/index_cn.rst @@ -6,12 +6,13 @@ 安装流程 ++++++++ -PaddlePaddle提供Docker镜像来部署环境。 +PaddlePaddle提供pip和Docker的安装方式: .. toctree:: :maxdepth: 1 - - docker_install_cn.rst + + pip_install_cn.rst + docker_install_cn.rst 编译流程 @@ -19,9 +20,14 @@ PaddlePaddle提供Docker镜像来部署环境。 .. warning:: - 编译流程主要推荐高级用户查看,普通用户请走安装流程。 + 建议直接使用上述安装流程,方便快速安装。只有在遇到需要独立定制的二进制时才需要编译。 .. toctree:: :maxdepth: 1 - cmake/build_from_source_cn.rst + build_from_source_cn.rst + +常见问题解答 +++++++++++ + +`常见问题解答 `_ diff --git a/doc/getstarted/build_and_install/index_en.rst b/doc/getstarted/build_and_install/index_en.rst index 8a53588e0439df8f4d5fd529b7a20262c67d4e58..c8b60d03578ba6a9b73134ec53b440d057e36079 100644 --- a/doc/getstarted/build_and_install/index_en.rst +++ b/doc/getstarted/build_and_install/index_en.rst @@ -1,22 +1,33 @@ Install and Build ================= -Install PaddlePaddle ----------------------- +.. _install_steps: -.. toctree:: - :maxdepth: 1 +Install Steps +++++++++ + +You can choose either pip or Docker to complete your install: + +.. toctree:: + :maxdepth: 1 + + pip_install_en.rst + docker_install_en.rst - docker_install_en.rst Build from Source ----------------- .. warning:: - Please use :code:`docker` image to install paddle. The building guide is used for hacking or contributing PaddlePaddle source code. + We recommend to directly install via above installation steps, you'll only need to build PaddlePaddle from source when you need a modifed binary. .. toctree:: :maxdepth: 1 build_from_source_en.md + +FAQ +++++++++++ + +`FAQ `_ diff --git a/doc/getstarted/build_and_install/paddleci.png b/doc/getstarted/build_and_install/paddleci.png new file mode 100644 index 0000000000000000000000000000000000000000..16087ce059aa3c07ce8c927d983eb86351915825 Binary files /dev/null and b/doc/getstarted/build_and_install/paddleci.png differ diff --git a/doc/getstarted/build_and_install/pip_install_cn.rst b/doc/getstarted/build_and_install/pip_install_cn.rst new file mode 100644 index 0000000000000000000000000000000000000000..b26bf4c95cb18f36408eb75894e8b9b674efc67b --- /dev/null +++ b/doc/getstarted/build_and_install/pip_install_cn.rst @@ -0,0 +1,86 @@ +使用pip安装 +================================ + +PaddlePaddle可以使用常用的Python包管理工具 +`pip `_ +完成安装,并可以在大多数主流的Linux操作系统以及MacOS上执行。 + +.. _pip_install: + +使用pip安装 +------------------------------ + + +执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境,并自动下载安装依赖软件。 + + .. code-block:: bash + + pip install paddlepaddle + + +如果需要安装支持GPU的版本,需要执行: + + .. code-block:: bash + + pip install paddlepaddle-gpu + +如果需要获取并安装最新的(开发分支)PaddlePaddle,可以从我们的CI系统中下载最新的whl安装包和c-api开发包并安装, +您可以从下面的表格中找到需要的版本: + +如果在点击下面链接时出现如下登陆界面,点击“Log in as guest”即可开始下载: + +.. image:: paddleci.png + :scale: 50 % + :align: center + +.. 
csv-table:: 各个版本最新的whl包
+   :header: "版本说明", "cp27-cp27mu", "cp27-cp27m", "C-API"
+   :widths: 1, 3, 3, 3
+
+   "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_"
+   "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "暂无"
+   "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_"
+   "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_"
+   "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_"
+
+.. _pip_dependency:
+
+运行环境依赖
+------------------------------
+
+PaddlePaddle安装包不仅包含.py程序,还包含用C++编写的部分,所以我们确保发布的二进制包可以支持主流的Linux操作系统,比如CentOS 6以上,Ubuntu 14.04以上,MacOS 10.12以上。
+
+PaddlePaddle发布的安装包会尽量对齐 `manylinux1 `_ 标准,通常使用CentOS 5作为编译环境。但由于CUDA库通常需要CentOS 6以上,而且CentOS 5即将停止维护,所以我们默认使用CentOS 6作为标准编译环境。
+
+.. csv-table:: PaddlePaddle环境依赖
+   :header: "依赖", "版本", "说明"
+   :widths: 10, 15, 30
+
+   "操作系统", "Linux, MacOS", "CentOS 6以上,Ubuntu 14.04以上,MacOS 10.12以上"
+   "Python", "2.7.x", "暂时不支持Python3"
+   "libc.so", "GLIBC_2.7", "glibc至少包含GLIBC_2.7以上的符号"
+   "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "至少包含GLIBCXX_3.4.11, CXXABI_1.3.3以上的符号"
+   "libgcc_s.so", "GCC_3.3", "至少包含GCC_3.3以上的符号"
+
+.. _pip_faq:
+
+安装常见问题和解决方法
+------------------------------
+
+- paddlepaddle*.whl is not a supported wheel on this platform.
+
+  出现这个问题的主要原因是,没有找到和当前系统匹配的paddlepaddle安装包。请检查Python版本是否为2.7系列。另外最新的pip官方源中的安装包默认是manylinux1标准,需要使用最新的pip (>9.0.0) 才可以安装。可以使用下面的命令更新您的pip:
+
+  .. code-block:: bash
+
+      pip install --upgrade pip
+
+  如果仍然存在问题,可以执行:
+
+  .. code-block:: bash
+
+      python -c "import pip; print(pip.pep425tags.get_supported())"
+
+  获取当前系统支持的安装包格式,并检查和需安装的包是否匹配。pypi安装包可以在 `这个 `_ 链接中找到。
+
+  如果系统支持的是 linux_x86_64 而安装包是 manylinux1_x86_64 ,需要升级pip版本到最新;如果系统支持 manylinux1_x86_64 而安装包(本地)是 linux_x86_64 ,可以重命名这个whl包为 manylinux1_x86_64 再安装。
\ No newline at end of file
diff --git a/doc/getstarted/build_and_install/pip_install_en.rst b/doc/getstarted/build_and_install/pip_install_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..113790e4e4ca116e91f11f8a233eae874d9d1b7a
--- /dev/null
+++ b/doc/getstarted/build_and_install/pip_install_en.rst
@@ -0,0 +1,104 @@
+Install Using pip
+================================
+
+You can use the widely used Python package management
+tool `pip `_
+to install PaddlePaddle. This method works on
+most current Linux systems and MacOS.
+
+.. _pip_install:
+
+Install Using pip
+------------------------------
+
+Run the following command to install PaddlePaddle on the current
+machine; it will also download the requirements.
+
+  .. code-block:: bash
+
+      pip install paddlepaddle
+
+
+If you wish to install the GPU version, just run:
+
+  .. code-block:: bash
+
+      pip install paddlepaddle-gpu
+
+If you wish to install the latest develop branch PaddlePaddle,
+you can download the latest whl package from our CI system. Access
+the links below, log in as guest, then click the "Artifact"
+tab; you'll find the download links of the whl packages.
+
+If the links below show a login form, just click "Log in as guest" to start the download:
+
+.. image:: paddleci.png
+   :scale: 50 %
+   :align: center
+
+.. csv-table:: whl package of each version
+   :header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API"
+   :widths: 1, 3, 3, 3
+
+   "cpu_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_"
+   "cpu_avx_openblas", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "Not Available"
+   "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_"
+   "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_"
+   "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle-0.10.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.10.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_"
+
+.. _pip_dependency:
+
+Runtime Dependency
+------------------------------
+
+PaddlePaddle installation packages (whl) contain not only .py files,
+but also binaries built from C++ code. We ensure that PaddlePaddle can
+run on current mainline Linux distributions, like CentOS 6, Ubuntu 14.04
+and MacOS 10.12.
+
+PaddlePaddle whl packages try to satisfy the
+`manylinux1 `_
+standard, which uses CentOS 5 as the default build environment. But CUDA libraries
+require at least CentOS 6 to run, and CentOS 5 is about to reach end of life,
+so we use CentOS 6 as the default build environment.
+
+.. csv-table:: PaddlePaddle Runtime Deps
+   :header: "Dependency", "version", "description"
+   :widths: 10, 15, 30
+
+   "OS", "Linux, MacOS", "CentOS 6 or later, Ubuntu 14.04 or later, MacOS 10.12 or later"
+   "Python", "2.7.x", "Currently Python3 is not supported"
+   "libc.so", "GLIBC_2.7", "glibc must at least include GLIBC_2.7 symbols"
+   "libstdc++.so", "GLIBCXX_3.4.11, CXXABI_1.3.3", "At least include GLIBCXX_3.4.11, CXXABI_1.3.3 symbols"
+   "libgcc_s.so", "GCC_3.3", "At least include GCC_3.3 symbols"
+
+.. _pip_faq:
+
+FAQ
+------------------------------
+
+- paddlepaddle*.whl is not a supported wheel on this platform.
+
+  The main cause of this issue is that there is no paddlepaddle package
+  matching your current platform. Please check that you are using the
+  Python 2.7 series. Besides, pypi only provides manylinux1 packages, so
+  you'll need to upgrade your pip to >9.0.0. To do so, run the below command:
+
+  .. code-block:: bash
+
+      pip install --upgrade pip
+
+  If the problem still exists, run the following command:
+
+  .. code-block:: bash
+
+      python -c "import pip; print(pip.pep425tags.get_supported())"
+
+  This prints the package tags your system supports; check whether they match
+  the file name of the whl package. You can find the default whl packages
+  `here `_ .
+
+  If your system supports linux_x86_64 but the whl package is manylinux1_x86_64,
+  you'll need to update pip to the latest version; if your system supports
+  manylinux1_x86_64 but the whl package (local) is linux_x86_64, you can rename
+  the file to the manylinux1_x86_64 suffix and then install it.
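+- A quick self-check for the interpreter and pip requirements above. This is
+  a minimal sketch that only uses the standard library and pip itself:
+
+  .. code-block:: python
+
+      import sys
+      import pip
+
+      # The released wheels only target Python 2.7 (see the table above).
+      print 'python:', sys.version_info[:2], '(need (2, 7))'
+
+      # manylinux1 wheels require pip newer than 9.0.0.
+      print 'pip:', pip.__version__, '(need >9.0.0)'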
diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst
index aa418c657a4ba16cce61c030066f4d3e14e891cc..a9087be6f350c5656cabb0c64ba0f200d1c666cc 100644
--- a/doc/getstarted/index_cn.rst
+++ b/doc/getstarted/index_cn.rst
@@ -1,10 +1,61 @@
 新手入门
 ============
 
+.. _quick_install:
+
+快速安装
+++++++++
+
+PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上、Ubuntu 14.04以上以及MacOS 10.12,并要求预先安装有Python2.7。
+执行下面的命令完成快速安装:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+如果需要安装支持GPU的版本,需要执行:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+更详细的安装和编译方法参考:
+
 .. toctree::
   :maxdepth: 1
 
   build_and_install/index_cn.rst
-  concepts/use_concepts_cn.rst
 
-- `深度学习入门课程 `_
+.. _quick_start:
+
+快速开始
+++++++++
+
+创建一个 housing.py 并粘贴此Python代码:
+
+  .. code-block:: python
+
+     import paddle.v2 as paddle
+
+     # Initialize PaddlePaddle.
+     paddle.init(use_gpu=False, trainer_count=1)
+
+     # Configure the neural network.
+     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+     # Infer using provided test data.
+     probs = paddle.infer(
+         output_layer=y_predict,
+         parameters=paddle.dataset.uci_housing.model(),
+         input=[item for item in paddle.dataset.uci_housing.test()()])
+
+     for i in xrange(len(probs)):
+         print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
+
+执行 :code:`python housing.py` ,它会打印出一组针对测试房价数据的预测结果。
+
+.. toctree::
+  :maxdepth: 1
+
+  concepts/use_concepts_cn.rst
diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst
index be3253e3d41b99a2b696e2c5ef6463ed49680d69..d14e3f5c0cc90792fce9cb82e65da482c44dc433 100644
--- a/doc/getstarted/index_en.rst
+++ b/doc/getstarted/index_en.rst
@@ -1,9 +1,61 @@
 GET STARTED
 ============
 
+.. _quick_install:
+
+Quick Install
+----------------------
+
+You can use pip to install PaddlePaddle with a single command. This supports
+CentOS 6 and above, Ubuntu 14.04 and above, and MacOS 10.12, with Python 2.7
+installed. Simply run the following command to install:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle
+
+If you need to install the GPU version, run:
+
+  .. code-block:: bash
+
+     pip install paddlepaddle-gpu
+
+For more details about installation and build:
+
 .. toctree::
   :maxdepth: 1
 
   build_and_install/index_en.rst
 
-- `Deep Learning 101 `_
+
+.. _quick_start:
+
+Quick Start
++++++++++++
+
+Create a new file called housing.py, and paste this Python
+code:
+
+  .. code-block:: python
+
+     import paddle.v2 as paddle
+
+     # Initialize PaddlePaddle.
+     paddle.init(use_gpu=False, trainer_count=1)
+
+     # Configure the neural network.
+     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+
+     # Infer using provided test data.
+     probs = paddle.infer(
+         output_layer=y_predict,
+         parameters=paddle.dataset.uci_housing.model(),
+         input=[item for item in paddle.dataset.uci_housing.test()()])
+
+     for i in xrange(len(probs)):
+         print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
+
+Run :code:`python housing.py` and voila! It should print out a list of predictions
+for the test housing data.
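+If you would rather train the parameters yourself instead of downloading the
+pre-trained ones, only a cost layer and a trainer need to be added on top of
+the same network. The sketch below follows the fit-a-line chapter of the
+PaddlePaddle Book; the optimizer settings and the :code:`square_error_cost`
+layer name are assumptions that may differ slightly across versions:
+
+  .. code-block:: python
+
+     import paddle.v2 as paddle
+
+     paddle.init(use_gpu=False, trainer_count=1)
+
+     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
+     y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
+     y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
+     cost = paddle.layer.square_error_cost(input=y_predict, label=y)
+
+     # Create the trainable parameters and a momentum-based optimizer.
+     parameters = paddle.parameters.create(cost)
+     optimizer = paddle.optimizer.Momentum(momentum=0)
+     trainer = paddle.trainer.SGD(
+         cost=cost, parameters=parameters, update_equation=optimizer)
+
+     # Train for a few passes over the UCI housing training set.
+     reader = paddle.batch(
+         paddle.reader.shuffle(paddle.dataset.uci_housing.train(), buf_size=500),
+         batch_size=2)
+     trainer.train(reader=reader, num_passes=30)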
diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst
index 76d3e0a0092f89005605a23e14e712530112a5ac..eb95356c67c5df22e4f543f958eb31d79f2c6195 100644
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -19,7 +19,6 @@
 .. toctree::
   :maxdepth: 1
 
-  dev/build_cn.rst
   dev/write_docs_cn.rst
 
 模型配置
diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst
index 1b6034be4edffd2cbc822018b733b9a3836ea84a..1fbfcd260b912078f00ed5b720ed607db725c4e2 100644
--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -18,7 +18,6 @@ Development
 .. toctree::
   :maxdepth: 1
 
-  dev/build_en.rst
   dev/new_layer_en.rst
   dev/contribute_to_paddle_en.md
diff --git a/doc/howto/optimization/cpu_profiling.md b/doc/howto/optimization/cpu_profiling.md
new file mode 100644
index 0000000000000000000000000000000000000000..1775374cf6e518586c28bbd8e04946c74df7e4c5
--- /dev/null
+++ b/doc/howto/optimization/cpu_profiling.md
@@ -0,0 +1,197 @@
+This tutorial introduces techniques we use to profile and tune the
+CPU performance of PaddlePaddle. We will use the Python packages
+`cProfile` and `yep`, and Google's `perftools`.
+
+Profiling is the process that reveals performance bottlenecks,
+which could be very different from what the developers have in mind.
+Performance tuning is done to fix these bottlenecks. Performance optimization
+alternates between the steps of profiling and tuning.
+
+PaddlePaddle users program AI applications by calling the Python API, which calls
+into `libpaddle.so`, written in C++. In this tutorial, we focus on
+the profiling and tuning of
+
+1. the Python code and
+1. the mixture of Python and C++ code.
+
+## Profiling the Python Code
+
+### Generate the Performance Profiling File
+
+We can use the standard Python
+package, [`cProfile`](https://docs.python.org/2/library/profile.html),
+to generate the Python profiling file. For example:
+
+```bash
+python -m cProfile -o profile.out main.py
+```
+
+where `main.py` is the program we are going to profile, and `-o` specifies
+the output file. Without `-o`, `cProfile` would output to standard
+output.
+
+### Look into the Profiling File
+
+`cProfile` generates `profile.out` after `main.py` completes. We can
+use [`cprofilev`](https://github.com/ymichael/cprofilev) to look into
+the details:
+
+```bash
+cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
+```
+
+where `-a` specifies the HTTP IP, `-p` specifies the port, `-f`
+specifies the profiling file, and `main.py` is the source file.
+
+Open a Web browser and point it to the specified IP and
+port; we will see output like the following:
+
+```
+   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+        1    0.284    0.284   29.514   29.514 main.py:1()
+     4696    0.128    0.000   15.748    0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run)
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+        1    0.144    0.144    6.534    6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14()
+```
+
+where each line corresponds to a Python function, and the meaning of
+each column is as follows:
+
+| column | meaning |
+| --- | --- |
+| ncalls | the number of calls into a function |
+| tottime | the total execution time of the function, not including the execution time of other functions called by the function |
+| percall | tottime divided by ncalls |
+| cumtime | the total execution time of the function, including the execution time of other functions being called |
+| percall | cumtime divided by ncalls |
+| filename:lineno(function) | where the function is defined |
+
+### Identify Performance Bottlenecks
+
+Usually, `tottime` and the related `percall` time are what we want to
+focus on. 
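+One convenient way to produce such a sorted view, besides the `cprofilev`
+web UI, is the standard `pstats` module. This is a minimal sketch; it only
+assumes the `profile.out` file generated earlier:
+
+```python
+import pstats
+
+# Load the profile written by `python -m cProfile -o profile.out main.py`
+# and print the 10 most expensive functions by internal time (tottime).
+stats = pstats.Stats('profile.out')
+stats.sort_stats('tottime').print_stats(10)
+```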
+We can sort the above profiling result by `tottime`: + +```text + 4696 12.040 0.003 12.040 0.003 {built-in method run} + 300005 0.874 0.000 1.681 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader) + 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__) + 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) + 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1() +``` + +We can see that the most time-consuming function is the `built-in +method run`, which is a C++ function in `libpaddle.so`. We will +explain how to profile C++ code in the next section. At this +moment, let's look into the fourth function, `sync_with_cpp`, which is a +Python function. We can click it to understand more about it: + +``` +Called By: + + Ordered by: internal time + List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> + +Function was called by... + ncalls tottime cumtime +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone) + 1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward) + + +Called: + + Ordered by: internal time + List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> +``` + +The list of the callers of `sync_with_cpp` might help us understand +how to improve the function definition. + +## Profiling Python and C++ Code + +### Generate the Profiling File + +To profile a mixture of Python and C++ code, we can use a Python +package, `yep`, that can work with Google's `perftools`, which is a +commonly-used profiler for C/C++ code. + +On Ubuntu systems, we can install `yep` and `perftools` by running the +following commands: + +```bash +apt update +apt install libgoogle-perftools-dev +pip install yep +``` + +Then we can run the following command + +```bash +python -m yep -v main.py +``` + +to generate the profiling file. The default filename is +`main.py.prof`. + +Note the `-v` command line option, which prints the analysis +results after generating the profiling file. By examining the +printed result, we can tell whether debug information was stripped +from `libpaddle.so` at build time. The following hints help make sure +that the analysis results are readable: + +1. Use the GCC command line option `-g` when building `libpaddle.so` so as to + include debug information. The standard build system of + PaddlePaddle is CMake, so you might want to set + `CMAKE_BUILD_TYPE=RelWithDebInfo`. + +1. Use the GCC command line option `-O2` or `-O3` to generate optimized + binary code. It doesn't make sense to profile `libpaddle.so` + without optimization, because it would run slowly anyway. + +1. Profile the single-threaded binary before the + multi-threaded version, because the latter often generates tangled + profiling results. You might want to set the environment + variable `OMP_NUM_THREADS=1` to prevent OpenMP from automatically + starting multiple threads.
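+Besides the `python -m yep` entry point shown above, the `yep` package
+also exposes a small library API, which is convenient when only one
+region of the program, e.g. the training loop, should be profiled. The
+following is a minimal sketch, assuming `yep`'s `start`/`stop`
+functions (they wrap perftools' `ProfilerStart`/`ProfilerStop`; please
+verify against the installed version) and a hypothetical
+`train_one_pass` function standing in for the code to be measured:
+
+```python
+import yep
+
+def train_one_pass():
+    # Hypothetical placeholder for the region to profile.
+    pass
+
+# Only the region between start() and stop() is recorded; the resulting
+# file can be examined with pprof just like a whole-program profile.
+yep.start('train_loop.prof')
+train_one_pass()
+yep.stop()
+```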
+ +### Examining the Profiling File + +The tool we use to examine the profiling file generated by +`perftools` is [`pprof`](https://github.com/google/pprof), which +provides a Web-based GUI like `cprofilev`. + +We can rely on the standard Go toolchain to retrieve the source code +of `pprof` and build it: + +```bash +go get github.com/google/pprof +``` + +Then we can use it to examine `main.py.prof` generated in the previous +section: + +```bash +pprof -http=0.0.0.0:3213 `which python` ./main.py.prof +``` + +where `-http` specifies the IP and port of the HTTP service. +Directing our Web browser to the service, we would see something like +the following: + +![result](./pprof_1.png) + +### Identifying the Performance Bottlenecks + +Similar to how we work with `cprofilev`, we'd focus on `tottime` and +`cumtime`. + +![kernel_perf](./pprof_2.png) + +We can see that the execution time of multiplication and the computation +of the gradient of multiplication take 2% to 4% of the total running +time, and `MomentumOp` takes about 17%. Obviously, we'd want to +optimize `MomentumOp`. + +`pprof` marks performance-critical parts of the program in +red. It's a good idea to follow the hints. diff --git a/doc/howto/optimization/cpu_profiling_cn.md b/doc/howto/optimization/cpu_profiling_cn.md new file mode 100644 index 0000000000000000000000000000000000000000..14eba0e2f34b115f5cd24920b5b1af07ec953d00 --- /dev/null +++ b/doc/howto/optimization/cpu_profiling_cn.md @@ -0,0 +1,155 @@ +此教程会介绍如何使用Python的cProfile包、Python库yep、Google perftools来进行性能分析 (profiling) 与调优(performance tuning)。 + +Profiling 指发现性能瓶颈。系统中的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。Tuning 指消除瓶颈。性能优化的过程通常是不断重复地 profiling 和 tuning。 + +PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大部分 Python API 调用用 C++ 写的 libpaddle.so。所以 PaddlePaddle 的性能分析与调优分为两个部分: + +* Python 代码的性能分析 +* Python 与 C++ 混合代码的性能分析 + + +## Python代码的性能分析 + +### 生成性能分析文件 + +Python标准库中提供了性能分析的工具包,[cProfile](https://docs.python.org/2/library/profile.html)。生成Python性能分析的命令如下: + +```bash +python -m cProfile -o profile.out main.py +``` + +其中 `main.py` 是我们要分析的程序,`-o`标识了一个输出的文件名,用来存储本次性能分析的结果。如果不指定这个文件,`cProfile`会打印到标准输出。 + +### 查看性能分析文件 + +`cProfile` 在main.py 运行完毕后输出`profile.out`。我们可以使用[`cprofilev`](https://github.com/ymichael/cprofilev)来查看性能分析结果。`cprofilev`是一个Python的第三方库。使用它会开启一个HTTP服务,将性能分析结果以网页的形式展示出来: + +```bash +cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py +``` + +其中`-a`标识HTTP服务绑定的IP。使用`0.0.0.0`允许外网访问这个HTTP服务。`-p`标识HTTP服务的端口。`-f`标识性能分析的结果文件。`main.py`标识被性能分析的源文件。 + +用Web浏览器访问对应网址,即可显示性能分析的结果: + +``` + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.284 0.284 29.514 29.514 main.py:1() + 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run) + 4696 12.040 0.003 12.040 0.003 {built-in method run} + 1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14() +``` + +每一列的含义是: + +| 列名 | 含义 | +| --- | --- | +| ncalls | 函数的调用次数 | +| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 | +| percall | tottime的每次调用平均时间 | +| cumtime | 函数总时间。包含这个函数调用其他函数的时间 | +| percall | cumtime的每次调用平均时间 | +| filename:lineno(function) | 文件名, 行号,函数名 | + + +### 寻找性能瓶颈 + +通常`tottime`和`cumtime`是寻找瓶颈的关键指标。这两个指标代表了某一个函数真实的运行时间。
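+除了网页形式,也可以用Python标准库中的`pstats`模块直接在终端查看排序后的结果。下面是一个最小示例(仅假设存在上文生成的`profile.out`文件):
+
+```python
+import pstats
+
+# 读取cProfile的输出,按tottime打印最耗时的10个函数。
+stats = pstats.Stats('profile.out')
+stats.sort_stats('tottime').print_stats(10)
+```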
+将性能分析结果按照tottime排序,效果如下: + +```text + 4696 12.040 0.003 12.040 0.003 {built-in method run} + 300005 0.874 0.000 1.681 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader) + 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__) + 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) + 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1() +``` + +可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长,每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息,了解其调用关系。 + +```text +Called By: + + Ordered by: internal time + List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> + +Function was called by... + ncalls tottime cumtime +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone) + 1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward) + + +Called: + + Ordered by: internal time + List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> +``` + +通常观察热点函数间的调用关系,和对应行的代码,就可以了解到问题代码在哪里。当我们做出性能修正后,再次进行性能分析(profiling)即可检查我们调优后的修正是否能够改善程序的性能。 + + + +## Python与C++混合代码的性能分析 + +### 生成性能分析文件 + +C++的性能分析工具非常多。常见的包括`gprof`, `valgrind`, `google-perftools`。但是调试Python中使用的动态链接库与直接调试原始二进制相比增加了很多复杂度。幸而Python的一个第三方库`yep`提供了方便的和`google-perftools`交互的方法。于是这里使用`yep`进行Python与C++混合代码的性能分析。 + +使用`yep`前需要安装`google-perftools`与`yep`包。Ubuntu下安装命令为 + +```bash +apt update +apt install libgoogle-perftools-dev +pip install yep +``` + +安装完毕后,我们可以通过 + +```bash +python -m yep -v main.py +``` + +生成性能分析文件。生成的性能分析文件为`main.py.prof`。 + +命令行中的`-v`指定在生成性能分析文件之后,在命令行显示分析结果。我们可以在命令行中简单地看一下生成效果。因为C++与Python不同,编译时可能会去掉调试信息,运行时也可能因为多线程产生混乱不可读的性能分析结果。为了生成更可读的性能分析结果,可以采取下面几点措施: + +1. 编译时指定`-g`生成调试信息。使用cmake的话,可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。 +2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。 +3. 
运行性能分析的时候,先从单线程开始,再开启多线程,进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。 + +### 查看性能分析文件 + +在运行完性能分析后,会生成性能分析结果文件。我们可以使用[`pprof`](https://github.com/google/pprof)来显示性能分析结果。注意,这里使用了用`Go`语言重构后的`pprof`,因为这个工具具有web服务界面,且展示效果更好。 + +安装`pprof`的命令和一般的`Go`程序是一样的,其命令如下: + +```bash +go get github.com/google/pprof +``` + +进而我们可以使用如下命令开启一个HTTP服务: + +```bash +pprof -http=0.0.0.0:3213 `which python` ./main.py.prof +``` + +这行命令中,`-http`指开启HTTP服务。`which python`会产生当前Python二进制的完整路径,进而指定了Python可执行文件的路径。`./main.py.prof`输入了性能分析结果。 + +访问对应的网址,我们可以查看性能分析的结果。结果如下图所示: + +![result](./pprof_1.png) + + +### 寻找性能瓶颈 + +与寻找Python代码的性能瓶颈类似,寻找Python与C++混合代码的性能瓶颈也是要看`tottime`和`cumtime`。而`pprof`展示的调用图也可以帮助我们发现性能中的问题。 + +例如下图中, + +![kernel_perf](./pprof_2.png) + +在一次训练中,乘法和乘法梯度的计算占用2%-4%左右的计算时间。而`MomentumOp`占用了17%左右的计算时间。显然,`MomentumOp`的性能有问题。 + +在`pprof`中,对于性能的关键路径都做出了红色标记。先检查关键路径的性能问题,再检查其他部分的性能问题,可以更有次序的完成性能的优化。 diff --git a/doc/howto/optimization/pprof_1.png b/doc/howto/optimization/pprof_1.png new file mode 100644 index 0000000000000000000000000000000000000000..8e9edbf377672d0ef40f2fc7bd39e746923550cb Binary files /dev/null and b/doc/howto/optimization/pprof_1.png differ diff --git a/doc/howto/optimization/pprof_2.png b/doc/howto/optimization/pprof_2.png new file mode 100644 index 0000000000000000000000000000000000000000..172ba20399ba974d27f4c072425277b69b02520b Binary files /dev/null and b/doc/howto/optimization/pprof_2.png differ diff --git a/paddle/capi/Matrix.cpp b/paddle/capi/Matrix.cpp index d5b55e1c95f248f551e6a0a3b39123169dd7784f..30f3a766f0c65187c8f2dd4603e3d26c9b9a6a3d 100644 --- a/paddle/capi/Matrix.cpp +++ b/paddle/capi/Matrix.cpp @@ -55,7 +55,7 @@ paddle_error paddle_matrix_set_row(paddle_matrix mat, } PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat, - paddle_real* value) { + paddle_real* value) { if (mat == nullptr || value == nullptr) return kPD_NULLPTR; auto ptr = cast(mat); if (ptr->mat == nullptr) return kPD_NULLPTR; @@ -75,7 +75,7 @@ PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat, } PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat, - paddle_real* result) { + paddle_real* result) { if (mat == nullptr || result == nullptr) return kPD_NULLPTR; auto ptr = cast(mat); if (ptr->mat == nullptr) return kPD_NULLPTR; diff --git a/paddle/capi/matrix.h b/paddle/capi/matrix.h index 01b8bad2ee9f528f8622346f43b9ff82225a7e73..8cc3e0034e058daefc63c69efe0b1f575c586897 100644 --- a/paddle/capi/matrix.h +++ b/paddle/capi/matrix.h @@ -79,7 +79,7 @@ PD_API paddle_error paddle_matrix_set_row(paddle_matrix mat, * @note value should contain enough element of data to init the mat */ PD_API paddle_error paddle_matrix_set_value(paddle_matrix mat, - paddle_real* value); + paddle_real* value); /** * @brief PDMatGetRow Get raw row buffer from matrix @@ -93,14 +93,14 @@ PD_API paddle_error paddle_matrix_get_row(paddle_matrix mat, paddle_real** rawRowBuffer); /** - * @brief copy data from the matrix + * @brief copy data from the matrix * @param [in] mat Target matrix - * @param [out] result pointer to store the matrix data + * @param [out] result pointer to store the matrix data * @return paddle_error * @note the space of the result should allocated before invoke this API */ PD_API paddle_error paddle_matrix_get_value(paddle_matrix mat, - paddle_real* result); + paddle_real* result); /** * @brief PDMatCreateNone Create None Matrix * @return diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt index 
c08e844847737b1172f6453767cc7f5e7b1a2bda..4b0eff3adb6fff0c9599b8613c5f19daea840674 100644 --- a/paddle/framework/CMakeLists.txt +++ b/paddle/framework/CMakeLists.txt @@ -6,7 +6,10 @@ cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context) + cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) +cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor) + cc_test(eigen_test SRCS eigen_test.cc DEPS tensor) cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto) @@ -51,10 +54,6 @@ cc_library(executor SRCS executor.cc DEPS op_registry device_context scope frame cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) - -cc_library(tensor_array SRCS tensor_array.cc DEPS lod_tensor) -cc_test(tensor_array_test SRCS tensor_array_test.cc DEPS tensor_array place) - cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry proto_desc) cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc index bc0da55cdae3575414e0d9b36471056c20946489..8fd2906107c490eee129fc10262df28bfa67800b 100644 --- a/paddle/framework/backward.cc +++ b/paddle/framework/backward.cc @@ -22,7 +22,6 @@ #include "paddle/framework/block_desc.h" #include "paddle/framework/op_registry.h" -#include "paddle/operators/dynamic_recurrent_op.h" #include "paddle/operators/net_op.h" namespace paddle { @@ -218,21 +217,6 @@ static std::unique_ptr BackwardRecursive( return false; }); - // process recurrent gradient op as a special operator. - if (forwardOp.Type() == "dynamic_recurrent") { - // NOTE clean up cycle call somewhere (RNN's stepnet constains itself), - // or this will result in infinite loop. - const auto& rnnop = - *static_cast(&forwardOp); - auto rnn_grad_op = - static_cast(grad_op.get()); - const auto& stepnet_op = - *static_cast(&rnnop.rnn.GetStepUnit()); - // create stepnet's gradient op - rnn_grad_op->rnn.SetStepUnit( - BackwardRecursive(stepnet_op, no_grad_names, grad_to_var, uniq_id)); - } - if (net->ops_.empty()) { // Current no aux op is added to network return grad_op; } diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc index a0f2906c749054c1ff9f624e47df432ec2bd6ac8..fdf6de4babff3bb3c253aaf516636882237e6faf 100644 --- a/paddle/framework/lod_tensor.cc +++ b/paddle/framework/lod_tensor.cc @@ -13,6 +13,8 @@ limitations under the License. 
*/ #include "paddle/framework/lod_tensor.h" +#include "paddle/framework/data_type.h" +#include "paddle/framework/framework.pb.h" #include "paddle/memory/memcpy.h" #include "paddle/memory/memory.h" @@ -27,11 +29,11 @@ namespace paddle { namespace framework { -std::ostream& operator<<(std::ostream& os, const LoD& lod) { +std::ostream &operator<<(std::ostream &os, const LoD &lod) { os << "{"; - for (auto& v : lod) { + for (auto &v : lod) { os << "{"; - for (auto& i : v) { + for (auto &i : v) { os << i << ","; } os << "}"; @@ -41,7 +43,7 @@ std::ostream& operator<<(std::ostream& os, const LoD& lod) { return os; } -LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) { +LoD SliceLevels(const LoD &in, size_t level_begin, size_t level_end) { LoD new_lod; new_lod.reserve(level_end - level_begin); for (size_t i = level_begin; i < level_end; i++) { @@ -53,7 +55,7 @@ LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end) { return new_lod; } -LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin, +LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin, size_t elem_end) { PADDLE_ENFORCE_LT(level, in.size()); PADDLE_ENFORCE_LT(elem_end, in[level].size()); @@ -64,9 +66,9 @@ LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin, res[0].assign(in[level].begin() + elem_begin, in[level].begin() + elem_end + 1); for (size_t lvl = 1; lvl < res.size(); lvl++) { - const auto& in_level = in[level + lvl]; - const auto& above_level = res[lvl - 1]; - auto& out_level = res[lvl]; + const auto &in_level = in[level + lvl]; + const auto &above_level = res[lvl - 1]; + auto &out_level = res[lvl]; out_level.assign(in_level.begin() + above_level.front(), in_level.begin() + above_level.back() + 1); } @@ -74,33 +76,33 @@ LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin, // to make the first offset equals 0, all the elements minus the first // element size_t front = res[lvl].front(); - for (auto& ele : res[lvl]) { + for (auto &ele : res[lvl]) { ele -= front; } } return res; } -LoD ToAbsOffset(const LoD& in) { +LoD ToAbsOffset(const LoD &in) { // the lowest level stores relative offsets if (in.empty() || in.size() == 1) return in; LoD result = in; for (int level = result.size() - 2; level >= 0; level--) { - for (auto& ele : result[level]) { + for (auto &ele : result[level]) { ele = result[level + 1][ele]; } } return result; } -bool operator==(const LoD& a, const LoD& b) { +bool operator==(const LoD &a, const LoD &b) { if (a.size() != b.size()) { return false; } for (size_t i = 0; i < a.size(); i++) { - const auto& a_level = a[i]; - const auto& b_level = b[i]; + const auto &a_level = a[i]; + const auto &b_level = b[i]; if (a_level.size() != b_level.size()) { return false; } @@ -151,7 +153,7 @@ void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin, } using LoDAndOffset = std::pair>; -LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx, +LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx, size_t end_idx, size_t start_level) { LoD sub_lod; @@ -170,7 +172,7 @@ LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD& lod, size_t start_idx, return LoDAndOffset{sub_lod, {start_idx, end_idx}}; } -void AppendLoD(LoD* lod, const LoD& lod_length) { +void AppendLoD(LoD *lod, const LoD &lod_length) { PADDLE_ENFORCE( lod->empty() || lod->size() == lod_length.size(), "The lod_length should has the same size with the appended lod."); @@ -178,12 +180,139 @@ void AppendLoD(LoD* lod, const LoD& lod_length) { *lod = 
LoD(lod_length.size(), std::vector({0})); } for (size_t i = 0; i < lod->size(); ++i) { - auto& level = (*lod)[i]; + auto &level = (*lod)[i]; for (size_t len : lod_length[i]) { level.push_back(level.back() + len); } } } +void SerializeToStream(std::ostream &os, const LoDTensor &tensor, + const platform::DeviceContext &dev_ctx) { + // TODO(typhoonzero): serialize to ostream + { // the 1st field, uint32_t version + constexpr uint32_t version = 0; + os.write(reinterpret_cast(&version), sizeof(version)); + } + { // the 2nd field, tensor description + // int32_t size + // void* protobuf message + framework::TensorDesc desc; + desc.set_data_type(framework::ToDataType(tensor.type())); + auto dims = framework::vectorize(tensor.dims()); + auto *pb_dims = desc.mutable_dims(); + pb_dims->Resize(static_cast(dims.size()), 0); + std::copy(dims.begin(), dims.end(), pb_dims->begin()); + int32_t size = desc.ByteSize(); + os.write(reinterpret_cast(&size), sizeof(size)); + auto out = desc.SerializeAsString(); + os.write(out.data(), size); + } + { // the 3rd field, tensor data + uint64_t size = tensor.memory_size(); + auto *data_ptr = tensor.data(); + PADDLE_ENFORCE(size < std::numeric_limits::max(), + "Index overflow when writing tensor"); + if (platform::is_gpu_place(tensor.place())) { +#ifdef PADDLE_WITH_CUDA + constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB + std::unique_ptr buf(new char[kBufSize]); + auto &gpu_dev_ctx = + static_cast(dev_ctx); + platform::CPUPlace cpu; + uintptr_t data = reinterpret_cast(data_ptr); + while (size != 0) { + size_t size_to_write = std::min(kBufSize, static_cast(size)); + memory::Copy(cpu, buf.get(), + boost::get(tensor.place()), + reinterpret_cast(data), size_to_write, + gpu_dev_ctx.stream()); + gpu_dev_ctx.Wait(); + os.write(buf.get(), size_to_write); + data += size_to_write; + size -= size_to_write; + } +#else + PADDLE_THROW("Unexpected branch"); +#endif + } else { + os.write(static_cast(data_ptr), + static_cast(size)); + } + } + { // the 4th field, lod information + // uint64_t lod_level + // uint64_t lod_level_1 size in byte. + // int* lod_level_1 data + // ... 
+ auto lod = tensor.lod(); + uint64_t size = lod.size(); + os.write(reinterpret_cast(&size), sizeof(size)); + + for (auto &each : lod) { + size = each.size() * sizeof(framework::LoD::value_type::value_type); + os.write(reinterpret_cast(&size), sizeof(size)); + os.write(reinterpret_cast(each.data()), + static_cast(size)); + } + } +} + +void DeserializeFromStream(std::istream &is, LoDTensor *tensor) { + uint32_t version; + is.read(reinterpret_cast(&version), sizeof(version)); + PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); + framework::TensorDesc desc; + { // int32_t size + // proto buffer + int32_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::unique_ptr buf(new char[size]); + is.read(reinterpret_cast(buf.get()), size); + PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size), + "Cannot parse tensor desc"); + } + { // read tensor + std::vector dims; + dims.reserve(static_cast(desc.dims().size())); + std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims)); + tensor->Resize(framework::make_ddim(dims)); + + void *buf; + platform::Place cpu = platform::CPUPlace(); + switch (desc.data_type()) { + case framework::FP32: + buf = tensor->mutable_data(cpu); + break; + case framework::FP64: + buf = tensor->mutable_data(cpu); + break; + case framework::INT32: + buf = tensor->mutable_data(cpu); + break; + case framework::INT64: + buf = tensor->mutable_data(cpu); + break; + default: + PADDLE_THROW("DataType %d not supported", desc.data_type()); + } + is.read(static_cast(buf), tensor->memory_size()); + } + { // read lod + uint64_t lod_level; + is.read(reinterpret_cast(&lod_level), sizeof(lod_level)); + auto &lod = *tensor->mutable_lod(); + lod.resize(lod_level); + for (uint64_t i = 0; i < lod_level; ++i) { + uint64_t size; + is.read(reinterpret_cast(&size), sizeof(size)); + std::vector tmp(size / sizeof(size_t)); + is.read(reinterpret_cast(tmp.data()), + static_cast(size)); + lod[i] = tmp; + } + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h index 7f8a51cc581e759bc707e506ac7cdeb3680f40ac..9411c96aea4c10ebf921cc3e3b442769c8acbefa 100644 --- a/paddle/framework/lod_tensor.h +++ b/paddle/framework/lod_tensor.h @@ -24,6 +24,7 @@ #include #include "paddle/framework/ddim.h" #include "paddle/framework/tensor.h" +#include "paddle/framework/tensor_util.h" #include "paddle/platform/enforce.h" #include "paddle/platform/place.h" @@ -175,9 +176,9 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level, PADDLE_ENFORCE_EQ(num_instances, lod_level.size() - 1); for (size_t ins = 0; ins < num_instances; ins++) { for (size_t elem = lod_level[ins]; elem < lod_level[ins + 1]; elem++) { - tensor.Slice(elem, elem + 1) - .CopyFrom(source.Slice(ins, ins + 1), platform::CPUPlace(), - platform::CPUDeviceContext()); + auto slice = tensor.Slice(elem, elem + 1); + CopyFrom(source.Slice(ins, ins + 1), platform::CPUPlace(), + platform::CPUDeviceContext(), &slice); } } return tensor; @@ -188,5 +189,14 @@ std::pair> GetSubLoDAndAbsoluteOffset( void AppendLoD(LoD* lod, const LoD& lod_length); +/* + * Serialize/Deserialize a LoDTensor to/from std::ostream. + * You can pass an ofstream or ostringstream to serialize to a file + * or to an in-memory string. A GPU tensor will be copied to CPU. 
+ */ +void SerializeToStream(std::ostream& os, const LoDTensor& tensor, + const platform::DeviceContext& dev_ctx); +void DeserializeFromStream(std::istream& is, LoDTensor* tensor); + } // namespace framework } // namespace paddle diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index 48cd131550dea5ad3f368b25c31d753efbe0dff9..02a825324328fa5cfd3a4d23a8c64488cc88aeec 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -65,7 +65,7 @@ class CompileTimeInferShapeContext : public InferShapeContext { PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR, "The %d-th output of Output(%s) must be LoDTensor.", j, out); - in_var->SetLoDLevel(out_var->GetLodLevel()); + out_var->SetLoDLevel(in_var->GetLodLevel()); } bool IsRuntime() const override; diff --git a/paddle/framework/shape_inference.cc b/paddle/framework/shape_inference.cc index 0af41b164f5894db17b2f86d4eba371cf05e3b41..2298507471c54c5b7751beff900466737eea36d4 100644 --- a/paddle/framework/shape_inference.cc +++ b/paddle/framework/shape_inference.cc @@ -22,6 +22,12 @@ std::vector InferShapeContext::GetInputsDim( return GetDims(names); } +DDim InferShapeContext::GetInputsElementDim(const std::string &name, + int idx) const { + const std::vector &names = Inputs(name); + return this->GetDim(names[idx]); +} + void InferShapeContext::SetOutputsDim( const std::string &name, const std::vector &dims) { auto &names = Outputs(name); diff --git a/paddle/framework/shape_inference.h b/paddle/framework/shape_inference.h index 05dc47f06ac81f0acb6d0317cbecb3009c7dd7f0..46f2ea84b4b64292cc9026ef9864621efba79c7a 100644 --- a/paddle/framework/shape_inference.h +++ b/paddle/framework/shape_inference.h @@ -37,6 +37,7 @@ class InferShapeContext { virtual framework::DDim GetInputDim(const std::string &name) const = 0; std::vector GetInputsDim(const std::string &name) const; + DDim GetInputsElementDim(const std::string &name, int idx) const; virtual void SetOutputDim(const std::string &name, const DDim &dim) = 0; void SetOutputsDim(const std::string &name, diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h index 28d0fcf94ec31c82476e093f93ccee222a0c9d9a..6a0c5133c9a6bb326ca51755242e75b6eb9e5474 100644 --- a/paddle/framework/tensor.h +++ b/paddle/framework/tensor.h @@ -89,34 +89,6 @@ class Tensor { /*! The internal of two tensors share the same memory block. */ inline Tensor& ShareDataWith(const Tensor& src); - /** - * @brief Copy the content of external tensor to a new place. - * - * @param[in] src The external tensor. - * @param[in] dst_place The dst place. - * @param[in] ctx The device context contains device resources. - * - * @note CopyFrom supports CPU <-> GPU, GPU <-> GPU. - */ - // TODO(qijun): https://github.com/PaddlePaddle/Paddle/issues/4647 - // Remove `CopyFrom` and `CopyFromVector` from Tensor interface - // and make them global functions - inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, - const platform::DeviceContext& ctx); - - /** - * @brief Copy the content of an external vector to a tensor. - * - * @param[in] src The external tensor. - * @param[in] ctx The device context contains device resources. - * - * * @note CopyFromVector assumes that the tensor has been resized - * before invoking. - */ - template - inline void CopyFromVector(const std::vector& src, - const platform::DeviceContext& ctx); - /** * @brief Return a sub-tensor of the given tensor. 
* @@ -141,7 +113,6 @@ class Tensor { size_t memory_size() const; - private: inline void check_memory_size() const; private: diff --git a/paddle/framework/tensor_array.cc b/paddle/framework/tensor_array.cc deleted file mode 100644 index 6058f1b8b1f78181bc4a47e9e3d4135f5139980f..0000000000000000000000000000000000000000 --- a/paddle/framework/tensor_array.cc +++ /dev/null @@ -1,444 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - - - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/framework/tensor_array.h" - -#include -#include -#include - -#include "paddle/framework/eigen.h" - -namespace paddle { -namespace framework { - -namespace detail { - -/* - * Offer an iterator over the length-sorted lod-tensor's top level. The top - * level of a lod-tensor stores batch-size of sequences, each top-level sequence - * may contains several lower-level sequences, sort top-level lod by the numbers - * of lower-level sequences in descending order, so that during RNN's running, - * the batch-size will keep decreasing, the short sentences will end at the tail - * of each batch. - * - * Let's take a simple lod-tensor for example - * - * |(0) |(1) top-level has two instances - * ||| ||||| lower-level - * - * sort by lower-level's length - * - * |(1) |(0) - * ||||| ||| - * - * when RNN runs, it get 5 batches (equals the number of elements the longest - * sequence has) - * - * ||||| - * ||| - * - * the first three batches has two elements, the last two elements just has 1 - * element each. 
- */ -struct DynamicBatchUnpacker { - using value_type = float; - - DynamicBatchUnpacker(const LoDTensor& source, size_t level, - bool descend = true) - : source(&source), level(level) { - BuildLengthSortedMeta(descend); - } - - LoDTensor GetBatch(size_t index); - - std::vector meta; - - LoDTensor const* source; - size_t level; - - protected: - void BuildLengthSortedMeta(bool descend); -}; - -LoDTensor PackDynamicBatch(const std::vector& source, - const std::vector& meta, const LoD& lod, - size_t level); - -std::vector GenDyBatchIndice(const DySeqMetaBatch& meta, int batch_id) { - // collect indice need to copy to the batch - std::vector indice; - for (const auto& seq : meta) { - size_t id = seq.begin + batch_id; - if (id >= seq.end) break; - indice.push_back(id); - } - return indice; -} - -} // namespace detail - -const LoDTensor& TensorArray::Read(size_t index) const { - PADDLE_ENFORCE_LE(index, MAX_SIZE, "index[%d] too large", index); - if (index >= size()) { - values_.resize(index + 1); - } - return values_[index]; -} - -void TensorArray::Write(size_t index, const LoDTensor& value) { - PADDLE_ENFORCE_LE(index, MAX_SIZE, "index[%d] too large", index); - - if (index >= size()) { - values_.resize(index + 1); - } - - values_[index].set_lod(value.lod()); - values_[index].Resize(value.dims()); - values_[index].mutable_data(value.place()); - values_[index].CopyFrom(value, value.place(), platform::CPUDeviceContext()); -} - -void TensorArray::WriteShared(size_t index, const LoDTensor& value) { - PADDLE_ENFORCE_LE(index, MAX_SIZE, "index[%d] too large", index); - if (index >= size()) { - values_.resize(index + 1); - } - - values_[index].set_lod(value.lod()); - values_[index].ShareDataWith(value); -} - -LoDTensor TensorArray::Pack(size_t level, const std::vector& meta, - const LoD& lod) const { - return detail::PackDynamicBatch(values_, meta, lod, level); -} - -DySeqMetaBatch TensorArray::Unpack(const LoDTensor& source, int level, - bool length_desend) { - detail::DynamicBatchUnpacker unpacker(source, level, - length_desend /*descend*/); - - // find max length of all the sequences - size_t max_length = 0; - for (const auto& seq : unpacker.meta) { - max_length = std::max(max_length, seq.end - seq.begin); - } - - // write batches to values - for (size_t batch_id = 0; batch_id < max_length; batch_id++) { - Write(batch_id, unpacker.GetBatch(batch_id)); - } - - PADDLE_ENFORCE(!unpacker.meta.empty()); - return unpacker.meta; -} - -LoDTensor TensorArray::LodPack(size_t level) const { - PADDLE_ENFORCE_GT(size(), 0UL, "no time step exists"); - // the levels should be no less than 2 - LoDTensor merged; - const LoDTensor *pre, *cur; - pre = &Read(0); - - for (size_t step = 1; step < size(); step++) { - cur = &Read(step); - PADDLE_ENFORCE_GT(cur->NumLevels(), 0); - PADDLE_ENFORCE_GT(pre->NumLevels(), 0); - PADDLE_ENFORCE_EQ(pre->NumLevels(), cur->NumLevels()); - PADDLE_ENFORCE_EQ(pre->NumElements(level), cur->NumElements(level)); - - merged = LodPackTwo(*pre, *cur, level); - pre = &merged; - } - return merged; -} - -/* - * NOTE currently, only the lowest level supports packing. - * The lowest LoD will be changed, while the relative offsets in levels above - * stay unchanged. 
- * - * previous step : [0] [1] [3] - * current step: [0 1 2] [2 3] [] - * packed to - * [0 0] [0 1] [0 2] [1 2] [1 3] [3] - */ -LoDTensor TensorArray::LodPackTwo(const LoDTensor& pre, const LoDTensor& cur, - size_t level) const { - PADDLE_ENFORCE_EQ(pre.NumLevels(), cur.NumLevels()); - PADDLE_ENFORCE_EQ(pre.NumLevels(), level + 1, - "Only the lowest LoD level supports pack temporarily."); - // calculate the result tensor's shape first - size_t num_instances = 0; - for (size_t elem = 0; elem < pre.NumElements(level); elem++) { - size_t prefix_size = pre.NumElements(level, elem); - size_t num_candidates = cur.NumElements(level, elem); - if (num_candidates > 0) { - num_instances += num_candidates * (prefix_size + 1); - } else { - num_instances += prefix_size; - } - } - - auto res_dims = pre.dims(); - res_dims[0] = num_instances; - LoDTensor result; - result.Resize(res_dims); - result.mutable_data(cur.place()); - - Vector last_lod_level; - // copy data - size_t index = 0; - last_lod_level.push_back(index); - for (size_t elem = 0; elem < pre.NumElements(level); elem++) { - size_t prefix_size = pre.NumElements(level, elem); - size_t num_candidates = cur.NumElements(level, elem); - - // slice the prefix Tensor - LoDTensor prefix = pre; - prefix.ShrinkInLevel(level, elem, elem + 1); - LoDTensor candidate = cur; - if (num_candidates > 0) { - candidate.ShrinkInLevel(level, elem, elem + 1); - } else { // just push prefix - result.Slice(index, index + prefix_size) - .CopyFrom(prefix, result.place(), platform::CPUDeviceContext()); - index += prefix_size; - last_lod_level.push_back(index); - } - for (size_t candi = 0; candi < num_candidates; candi++) { - // TODO(superjom) support GPU - result.Slice(index, index + prefix_size) - .CopyFrom(prefix, result.place(), platform::CPUDeviceContext()); - index += prefix_size; - // copy candidate record - result.Slice(index, index + 1) - .CopyFrom(candidate.Slice(candi, candi + 1), result.place(), - platform::CPUDeviceContext()); - index++; - last_lod_level.push_back(index); - } - } - - // update lod - auto lod = cur.lod(); - lod.back() = last_lod_level; - result.set_lod(lod); - return result; -} - -/* - * source [0 1 2] [3 4] [5 6 7] will be transformd to a list of LoDTensors such - * as - * [0 3 5] [1 4 6] [2 7] with 1-level LoDs: - * - [0 1 2 3] - * - [0 1 2 3] - * - [0 1 1 2], the [1,1) here means the second sequence is empty - * - * NOTE Unpack a LoDTensor in this approach may result in a big LoD. 
- */ -void TensorArray::LodUnpack(const LoDTensor& source, size_t level) { - PADDLE_ENFORCE_EQ(level, source.NumLevels() - 1, - "only the lowest LoD level supports unpack."); - const size_t non_empty_instances = source.dims()[0]; - size_t index = 0; - Vector lowest_lod_level; - lowest_lod_level.push_back(index); - - for (size_t step = 0; step < non_empty_instances; step++) { - size_t num_instances = 0; - for (size_t id = 0; id < source.NumElements(level); id++) { - auto instance = source; - instance.ShrinkInLevel(level, id, id + 1); - if (static_cast(instance.dims()[0]) > step) { - num_instances++; - index++; - } - lowest_lod_level.push_back(index); - } - - // create tensor for this time step - LoDTensor tensor; - auto dims = source.dims(); - dims[0] = num_instances; - // set lod - auto lod = source.lod(); - lod.back() = lowest_lod_level; - tensor.set_lod(lod); - - index = 0; - for (size_t id = 0; id < source.NumElements(level); id++) { - auto instance = source; - instance.ShrinkInLevel(level, id, id + 1); - if (static_cast(instance.dims()[0]) > step) { - // copy this instance - tensor.Slice(index, index + 1) - .CopyFrom(instance.Slice(step, step + 1), tensor.place(), - platform::CPUDeviceContext()); - index++; - } - } - Write(step, tensor); - } -} - -LoDTensor TensorArray::Stack() const { - LoDTensor result; - if (size() == 0) return result; - - const auto& first_dims = values_.front().dims(); - // check all the values have the same shape - // TODO(superjom) check the same data_type - for (size_t idx = 1; idx < size(); idx++) { - const auto& value_dims = values_[idx].dims(); - PADDLE_ENFORCE_EQ(first_dims, value_dims); - } - - // copy - auto result_dims = vectorize(first_dims); - result_dims.insert(result_dims.begin(), size()); - result.Resize(make_ddim(result_dims)); - result.mutable_data(platform::CPUPlace()); - - for (size_t idx = 0; idx < size(); idx++) { - result.Slice(idx, idx + 1) - .CopyFrom(Read(idx), platform::CPUPlace(), - platform::CPUDeviceContext()); - } - return result; -} - -void TensorArray::Unstack(const LoDTensor& source) const { - Unstack(source, false /*data_shared*/); -} - -void TensorArray::UnstackShared(const LoDTensor& source) const { - Unstack(source, true /*data_shared*/); -} - -void TensorArray::Unstack(const LoDTensor& source, bool data_shared) const { - size_t first_dim = source.dims()[0]; - DDim value_dims = slice_ddim(source.dims(), 1, source.dims().size()); - PADDLE_ENFORCE_GT(first_dim, 0, - "source should have some data to be unstacked"); - - values_.resize(first_dim); - - for (size_t elem = 0; elem < first_dim; elem++) { - // create a new value - auto& value = values_[elem]; - if (data_shared) { - // share memory - value.ShareDataWith(source.Slice(elem, elem + 1)); - } else { - // copy - value.Resize(value_dims); - value.CopyFrom(source.Slice(elem, elem + 1), platform::CPUPlace(), - platform::CPUDeviceContext()); - } - } -} - -size_t TensorArray::size() const { return values_.size(); } - -namespace detail { - -void DynamicBatchUnpacker::BuildLengthSortedMeta(bool descend) { - PADDLE_ENFORCE(meta.empty(), "duplicate build meta"); - // collect meta for each sequence in some level - auto lod = SliceLevels(source->lod(), level, level + 1)[0]; - - for (size_t seq_id = 0; seq_id < lod.size() - 1; seq_id++) { - DySeqMeta seq_meta({lod[seq_id], lod[seq_id + 1], seq_id}); - meta.push_back(seq_meta); - } - - PADDLE_ENFORCE_GT(meta.size(), 0, "meta is empty"); - - // sort by length - sort(meta.begin(), meta.end(), - [descend](const DySeqMeta& a, const DySeqMeta& 
b) { - bool a_ge_b = (a.end - a.begin) > (b.end - b.begin); - return descend ? a_ge_b : !a_ge_b; - }); -} - -LoDTensor DynamicBatchUnpacker::GetBatch(size_t index) { - PADDLE_ENFORCE(!meta.empty(), "should build meta first"); - LoDTensor result; - - auto indice = detail::GenDyBatchIndice(meta, index); - PADDLE_ENFORCE(!indice.empty(), "invalid batch at %d", index); - - // copy the indice of records in LoDTensor - auto record_dims = slice_ddim(source->dims(), 1, source->dims().size()); - auto record_dims_vec = vectorize(record_dims); - record_dims_vec.insert(record_dims_vec.begin(), indice.size()); - result.Resize(make_ddim(record_dims_vec)); - result.mutable_data(platform::CPUPlace()); - - for (size_t i = 0; i < indice.size(); i++) { - auto index = indice[i]; - auto target = result.Slice(i, i + 1); - auto slice = source->Slice(index, index + 1); - - target.CopyFrom(slice, platform::CPUPlace(), platform::CPUDeviceContext()); - } - - return result; -} - -// TODO(supejom) to cache lod if reasonable -LoDTensor PackDynamicBatch(const std::vector& source, - const std::vector& meta, const LoD& lod, - size_t level) { - PADDLE_ENFORCE(!source.empty()); - PADDLE_ENFORCE(!meta.empty()); - PADDLE_ENFORCE(!lod.empty()); - - LoDTensor result; - - // init result space - auto record_dims = slice_ddim(source[0].dims(), 1, source[0].dims().size()); - auto record_dims_vec = vectorize(record_dims); - auto height = lod[level].back(); - record_dims_vec.insert(record_dims_vec.begin(), height); - result.Resize(make_ddim(record_dims_vec)); - result.mutable_data(platform::CPUPlace()); - - for (size_t batch_id = 0; batch_id < source.size(); batch_id++) { - for (size_t seq_id = 0; seq_id < meta.size(); seq_id++) { - const auto& seq_meta = meta[seq_id]; - // source is source[batch_id][seq_id] - // target is result[index] - auto index = seq_meta.begin + batch_id; - if (index >= seq_meta.end) break; - auto source_ = source[batch_id].Slice(seq_id, seq_id + 1); - auto target = result.Slice(index, index + 1); - target.CopyFrom(source_, platform::CPUPlace(), - platform::CPUDeviceContext()); - } - } - - result.set_lod(lod); - return result; -} - -} // namespace detail - -} // namespace framework -} // namespace paddle diff --git a/paddle/framework/tensor_array.h b/paddle/framework/tensor_array.h deleted file mode 100644 index 78fad8cab7e27a7f07ca542c2a083460ee9e2b79..0000000000000000000000000000000000000000 --- a/paddle/framework/tensor_array.h +++ /dev/null @@ -1,132 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once -#include - -#include "paddle/framework/lod_tensor.h" - -namespace paddle { -namespace framework { - -/* - * DyBatchSeqPosition stores indices of the basic element in tensor. It is used - * after lod-tensor's re-assembling, its info can be used to recover the order - * in original lod-tensor. 
- */ -struct DySeqMeta { - DySeqMeta(size_t begin, size_t end, size_t ori_idx) - : begin(begin), end(end), ori_idx(ori_idx) {} - - size_t begin; - size_t end; // not included - size_t ori_idx; -}; - -using DySeqMetaBatch = std::vector; - -/* - * Extract the indices of instances. - */ -std::vector GenDyBatchIndice(const DySeqMetaBatch &metas, int batch_id); - -/* - * TensorArray is a C-array-like array of tensors, it is meant to be used with - * dynamic iteration primitives such as while_loop. It is used to segment inputs - * and store states in all time steps. - * - * By providing some methods similar to a C++ array, the difinition of some - * state-based dynamic models such as RNN cound be more natural and highly - * flexible. - */ -class TensorArray { - public: - using value_type = float; - - // max number of values allowed to store. - const size_t MAX_SIZE{100000}; - - /* - * Read the value at location `index` in the `TensorArray`. - */ - const LoDTensor &Read(size_t index) const; - - /* - * Write value into the index of the TensorArray. - */ - void Write(size_t index, const LoDTensor &value); - - /* - * Write value into the index of the TensorArray, with memory shared. - */ - void WriteShared(size_t index, const LoDTensor &value); - - /* - * Recover the original LoD-arranged LoDTensor with the `values`, `level` and - * `indice_map`. - */ - LoDTensor Pack(size_t level, const DySeqMetaBatch &meta, - const LoD &lod) const; - - /* - * Split LoDTensor in some `level` and write the generated batches to - * `values`, if set `desend`, will sort by length in descending order else in - * ascending order. - */ - DySeqMetaBatch Unpack(const LoDTensor &source, int level, bool length_desend); - - /* - * Pack an array of LoDTensors to a LoDTensor. - */ - LoDTensor LodPack(size_t level) const; - - /* - * Unpack a LoDTensor to an array of LoDTensors. - */ - void LodUnpack(const LoDTensor &source, size_t level); - - /* - * Pack the values into a tensor with rank one higher than each tensor in - * values. - */ - LoDTensor Stack() const; - - /* - * Unstacks the given division of a rank-`R` tensor into rank-`(R-1)` tensors. - */ - void Unstack(const LoDTensor &source) const; - - /* - * Unstacks the given division of a rank-`R` tensor into rank-`(R-1)` tensors, - * with memory of tensors shared. - */ - void UnstackShared(const LoDTensor &source) const; - - /* - * Return the number of values. - */ - size_t size() const; - - protected: - void Unstack(const LoDTensor &source, bool data_shared) const; - - LoDTensor LodPackTwo(const LoDTensor &pre, const LoDTensor &cur, - size_t level) const; - - private: - mutable std::vector values_; -}; // class TensorArray - -} // namespace framework -} // namespace paddle diff --git a/paddle/framework/tensor_array_test.cc b/paddle/framework/tensor_array_test.cc deleted file mode 100644 index 83b52b442daf9b2f1fc40f23e458fcb67c5040e8..0000000000000000000000000000000000000000 --- a/paddle/framework/tensor_array_test.cc +++ /dev/null @@ -1,182 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/framework/tensor_array.h" - -#include - -namespace paddle { -namespace framework { - -class TensorArrayTester : public ::testing::Test { - protected: - void SetUp() override { - LoDTensor source; - source.Resize(make_ddim({batch_size, dim})); - int* data = source.mutable_data(platform::CPUPlace()); - for (int i = 0; i < 16 * 32; i++) { - data[i] = i; - } - ta.Unstack(source); - } - - TensorArray ta; - const int batch_size = 16; - const int dim = 32; -}; - -TEST_F(TensorArrayTester, Read) { - for (int i = 0; i < batch_size; i++) { - const auto& tensor = ta.Read(i); - ASSERT_EQ(tensor.dims()[0], 1); - ASSERT_EQ(tensor.dims()[1], dim); - } -} - -TEST_F(TensorArrayTester, Write) { - LoDTensor source; - source.Resize(make_ddim({1, dim})); - for (int i = 0; i < dim; i++) { - *(source.mutable_data(platform::CPUPlace()) + i) = i; - } - - ta.Write(2, source); - - const auto& tensor = ta.Read(2); - for (int i = 0; i < dim; i++) { - EXPECT_EQ(*(tensor.data() + i), *(source.data() + i)); - } -} - -TEST_F(TensorArrayTester, WriteShared) { - LoDTensor source; - source.Resize(make_ddim({1, dim})); - for (int i = 0; i < dim; i++) { - *(source.mutable_data(platform::CPUPlace()) + i) = i; - } - - ta.WriteShared(2, source); - - const auto& tensor = ta.Read(2); - for (int i = 0; i < dim; i++) { - EXPECT_EQ(*(tensor.data() + i), *(source.data() + i)); - } - - EXPECT_EQ(source.data(), tensor.data()); -} - -class TensorArrayPackTester : public ::testing::Test { - protected: - virtual void SetUp() override { - lod.push_back(std::vector{0, 2, 9, 13}); - - source.set_lod(lod); - source.Resize(make_ddim({13, 128})); - source.mutable_data(platform::CPUPlace()); - - // content of each setence: 0 1 2 3 4 - const auto& level = lod.front(); - for (size_t i = 0; i < level.size() - 1; i++) { - size_t begin = level[i]; - size_t end = level[i + 1]; - for (size_t j = begin; j < end; j++) { - auto record = source.Slice(j, j + 1); - for (int dim = 0; dim < 128; dim++) { - record.mutable_data(platform::CPUPlace())[dim] = j - begin; - } - } - } - - // unpack - meta = ta.Unpack(source, 0, true); - } - - LoD lod; - TensorArray ta; - LoDTensor source; - std::vector meta; -}; - -TEST_F(TensorArrayPackTester, Unpack) { - ASSERT_EQ(ta.size(), 7UL); - - const auto& t0 = ta.Read(0); - const auto& t1 = ta.Read(1); - - ASSERT_EQ(t0.data()[0], int(0)); - ASSERT_EQ(t1.data()[0], int(1)); -} - -TEST_F(TensorArrayPackTester, Pack) { - LoDTensor packed = ta.Pack(0, meta, lod); -} - -TEST_F(TensorArrayTester, size) { - ASSERT_EQ(ta.size(), static_cast(batch_size)); -} - -TEST(TensorArray, LodPack) { - // three time steps, each step stores a LoDTensors - // - [0] [1] - // - [2 3], [4 5] - // - [6 7] [] [8], [9, 10] - // try to get a LoDTensor with content: - // - [0 2 6] - // - [0 2 7] - // - [0 3] - // - [1 4 8] - // - [1 5 9] - // - [1 5 10] - std::array tensors; - tensors[0].Resize(make_ddim({2, 1})); - tensors[1].Resize(make_ddim({4, 1})); - tensors[2].Resize(make_ddim({5, 1})); - int index = 0; - for (auto& t : tensors) { - t.mutable_data(platform::CPUPlace()); - for (int i = 0; i < t.dims()[0]; i++) { - t.data()[i] = index; - index++; - } - } - - std::array lods; - std::vector> levels{ - {0, 1, 2}, {0, 2, 4}, {0, 2, 2, 3, 5}}; - for (int i = 0; i < 3; i++) { - lods[i].emplace_back(levels[i].begin(), levels[i].end()); - } - - TensorArray ta; - for (int i = 0; i < 3; i++) { - tensors[i].set_lod(lods[i]); - ta.Write(i, 
tensors[i]); - } - - auto merged = ta.LodPack(0); - - std::vector target_tensor_data{{0, 2, 6, // 0 - 0, 2, 7, // 1 - 0, 3, // 2 - 1, 4, 8, // 3 - 1, 5, 9, // 5 - 1, 5, 10}}; - EXPECT_EQ(merged.dims()[0], (int)target_tensor_data.size()); - for (size_t i = 0; i < target_tensor_data.size(); i++) { - EXPECT_EQ(target_tensor_data[i], merged.data()[i]); - } -} - -} // namespace framework -} // namespace paddle diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h index 7e88e039611007d17156d10f852eb46f3ee8e7a3..aba1f9f09329f890ef190f8820b958c56f017e89 100644 --- a/paddle/framework/tensor_impl.h +++ b/paddle/framework/tensor_impl.h @@ -150,84 +150,6 @@ inline Tensor& Tensor::ShareDataWith(const Tensor& src) { return *this; } -inline void Tensor::CopyFrom(const Tensor& src, - const platform::Place& dst_place, - const platform::DeviceContext& ctx) { - src.check_memory_size(); - Resize(src.dims()); - - auto src_place = src.holder_->place(); - auto src_ptr = src.data(); - - auto dst_ptr = mutable_data(dst_place, src.type()); - - auto size = src.numel() * SizeOfType(src.type()); - - if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) { - memory::Copy(boost::get(dst_place), dst_ptr, - boost::get(src_place), src_ptr, size); - } -#ifdef PADDLE_WITH_CUDA - else if (platform::is_gpu_place(src_place) && - platform::is_cpu_place(dst_place)) { - auto src_gpu_place = boost::get(src_place); - auto dst_cpu_place = boost::get(dst_place); - auto ctx_place = ctx.GetPlace(); - PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); - auto ctx_gpu_place = boost::get(ctx_place); - PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); - memory::Copy( - dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size, - reinterpret_cast(ctx).stream()); - } else if (platform::is_cpu_place(src_place) && - platform::is_gpu_place(dst_place)) { - auto src_cpu_place = boost::get(src_place); - auto dst_gpu_place = boost::get(dst_place); - auto ctx_place = ctx.GetPlace(); - PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); - auto ctx_gpu_place = boost::get(ctx_place); - PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place); - memory::Copy( - dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, - reinterpret_cast(ctx).stream()); - } else if (platform::is_gpu_place(src_place) && - platform::is_gpu_place(dst_place)) { - auto src_gpu_place = boost::get(src_place); - auto dst_gpu_place = boost::get(dst_place); - auto ctx_place = ctx.GetPlace(); - PADDLE_ENFORCE(platform::is_gpu_place(ctx_place)); - auto ctx_gpu_place = boost::get(ctx_place); - PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place); - memory::Copy( - dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size, - reinterpret_cast(ctx).stream()); - } -#endif -} - -template -inline void Tensor::CopyFromVector(const std::vector& src, - const platform::DeviceContext& ctx) { - auto dst_place = ctx.GetPlace(); - auto src_ptr = static_cast(src.data()); - platform::CPUPlace src_place; - auto dst_ptr = static_cast(mutable_data(dst_place)); - auto size = src.size() * sizeof(T); - - if (platform::is_cpu_place(dst_place)) { - memory::Copy(boost::get(dst_place), dst_ptr, src_place, - src_ptr, size); - } -#ifdef PADDLE_WITH_CUDA - else if (platform::is_gpu_place(dst_place)) { - memory::Copy( - boost::get(dst_place), dst_ptr, src_place, src_ptr, - size, - reinterpret_cast(ctx).stream()); - } -#endif -} - inline Tensor Tensor::Slice(int begin_idx, int end_idx) const { check_memory_size(); PADDLE_ENFORCE_GE(begin_idx, 0, diff --git a/paddle/framework/tensor_test.cc 
b/paddle/framework/tensor_test.cc index 1bb0fb71b079940d35a995b78e04a531c074a8b2..ceca64365a1a628642eb374a3e3bbdff490c955a 100644 --- a/paddle/framework/tensor_test.cc +++ b/paddle/framework/tensor_test.cc @@ -188,178 +188,6 @@ TEST(Tensor, Slice) { #endif } -TEST(Tensor, CopyFrom) { - using namespace paddle::framework; - using namespace paddle::platform; - { - Tensor src_tensor; - Tensor dst_tensor; - CPUDeviceContext cpu_ctx((CPUPlace())); - - int* src_ptr = src_tensor.mutable_data(make_ddim({3, 3}), CPUPlace()); - - int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - memcpy(src_ptr, arr, 9 * sizeof(int)); - - auto cpu_place = new paddle::platform::CPUPlace(); - dst_tensor.CopyFrom(src_tensor, *cpu_place, cpu_ctx); - - const int* dst_ptr = dst_tensor.data(); - ASSERT_NE(src_ptr, dst_ptr); - for (size_t i = 0; i < 9; ++i) { - EXPECT_EQ(src_ptr[i], dst_ptr[i]); - } - - Tensor slice_tensor = src_tensor.Slice(1, 2); - dst_tensor.CopyFrom(slice_tensor, *cpu_place, cpu_ctx); - const int* slice_ptr = slice_tensor.data(); - dst_ptr = dst_tensor.data(); - ASSERT_NE(dst_ptr, slice_ptr); - for (size_t i = 0; i < 3; ++i) { - EXPECT_EQ(dst_ptr[i], slice_ptr[i]); - } - } -#ifdef PADDLE_WITH_CUDA - { - Tensor src_tensor; - Tensor gpu_tensor; - Tensor dst_tensor; - - int* src_ptr = src_tensor.mutable_data(make_ddim({3, 3}), CPUPlace()); - - int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - memcpy(src_ptr, arr, 9 * sizeof(int)); - - // CPU Tensor to GPU Tensor - auto gpu_place = new paddle::platform::GPUPlace(0); - CUDADeviceContext gpu_ctx(*gpu_place); - gpu_tensor.CopyFrom(src_tensor, *gpu_place, gpu_ctx); - - // GPU Tensor to CPU Tensor - auto cpu_place = new paddle::platform::CPUPlace(); - dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); - - // Sync before Compare Tensors - gpu_ctx.Wait(); - const int* dst_ptr = dst_tensor.data(); - ASSERT_NE(src_ptr, dst_ptr); - for (size_t i = 0; i < 9; ++i) { - EXPECT_EQ(src_ptr[i], dst_ptr[i]); - } - - Tensor slice_tensor = src_tensor.Slice(1, 2); - - // CPU Slice Tensor to GPU Tensor - gpu_tensor.CopyFrom(slice_tensor, *gpu_place, gpu_ctx); - - // GPU Tensor to CPU Tensor - dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); - - // Sync before Compare Slice Tensors - gpu_ctx.Wait(); - const int* slice_ptr = slice_tensor.data(); - dst_ptr = dst_tensor.data(); - ASSERT_NE(dst_ptr, slice_ptr); - for (size_t i = 0; i < 3; ++i) { - EXPECT_EQ(dst_ptr[i], slice_ptr[i]); - } - } -#endif -} - -TEST(Tensor, CopyFromVector) { - using namespace paddle::framework; - using namespace paddle::platform; - { - std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - Tensor cpu_tensor; - - // Copy to CPU Tensor - cpu_tensor.Resize(make_ddim({3, 3})); - auto cpu_place = new paddle::platform::CPUPlace(); - CPUDeviceContext cpu_ctx(*cpu_place); - cpu_tensor.CopyFromVector(src_vec, cpu_ctx); - - // Compare Tensors - const int* cpu_ptr = cpu_tensor.data(); - const int* src_ptr = src_vec.data(); - ASSERT_NE(src_ptr, cpu_ptr); - for (size_t i = 0; i < 9; ++i) { - EXPECT_EQ(src_ptr[i], cpu_ptr[i]); - } - - src_vec.erase(src_vec.begin(), src_vec.begin() + 5); - cpu_tensor.Resize(make_ddim({2, 2})); - cpu_tensor.CopyFromVector(src_vec, cpu_ctx); - cpu_ptr = cpu_tensor.data(); - src_ptr = src_vec.data(); - ASSERT_NE(src_ptr, cpu_ptr); - for (size_t i = 0; i < 5; ++i) { - EXPECT_EQ(src_ptr[i], cpu_ptr[i]); - } - - delete cpu_place; - } - -#ifdef PADDLE_WITH_CUDA - { - std::vector src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9}; - Tensor cpu_tensor; - Tensor gpu_tensor; - Tensor dst_tensor; - - // Copy to CPU 
Tensor - cpu_tensor.Resize(make_ddim({3, 3})); - auto cpu_place = new paddle::platform::CPUPlace(); - CPUDeviceContext cpu_ctx(*cpu_place); - cpu_tensor.CopyFromVector(src_vec, cpu_ctx); - - // Copy to GPUTensor - gpu_tensor.Resize(make_ddim({3, 3})); - auto gpu_place = new paddle::platform::GPUPlace(); - CUDADeviceContext gpu_ctx(*gpu_place); - gpu_tensor.CopyFromVector(src_vec, gpu_ctx); - // Copy from GPU to CPU tensor for comparison - dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); - - // Sync before Compare Tensors - gpu_ctx.Wait(); - const int* src_ptr = src_vec.data(); - const int* cpu_ptr = cpu_tensor.data(); - const int* dst_ptr = dst_tensor.data(); - ASSERT_NE(src_ptr, cpu_ptr); - ASSERT_NE(src_ptr, dst_ptr); - for (size_t i = 0; i < 9; ++i) { - EXPECT_EQ(src_ptr[i], cpu_ptr[i]); - EXPECT_EQ(src_ptr[i], dst_ptr[i]); - } - - src_vec.erase(src_vec.begin(), src_vec.begin() + 5); - - cpu_tensor.Resize(make_ddim({2, 2})); - cpu_tensor.CopyFromVector(src_vec, cpu_ctx); - gpu_tensor.Resize(make_ddim({2, 2})); - gpu_tensor.CopyFromVector(src_vec, gpu_ctx); - dst_tensor.CopyFrom(gpu_tensor, *cpu_place, gpu_ctx); - - // Sync before Compare Tensors - gpu_ctx.Wait(); - src_ptr = src_vec.data(); - cpu_ptr = cpu_tensor.data(); - dst_ptr = dst_tensor.data(); - ASSERT_NE(src_ptr, cpu_ptr); - ASSERT_NE(src_ptr, dst_ptr); - for (size_t i = 0; i < 5; ++i) { - EXPECT_EQ(src_ptr[i], cpu_ptr[i]); - EXPECT_EQ(src_ptr[i], dst_ptr[i]); - } - - delete cpu_place; - delete gpu_place; - } -#endif -} - TEST(Tensor, ReshapeToMatrix) { using namespace paddle::framework; using namespace paddle::platform; diff --git a/paddle/framework/tensor_util.h b/paddle/framework/tensor_util.h new file mode 100644 index 0000000000000000000000000000000000000000..4e34b90d57eed8fea84b83045df61a98483c8849 --- /dev/null +++ b/paddle/framework/tensor_util.h @@ -0,0 +1,152 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include "paddle/framework/tensor.h" + +namespace paddle { +namespace framework { + +/** + * @brief Copy the content of external tensor to a new place. + * + * @param[in] src The external tensor. + * @param[in] dst_place The dst place. + * @param[in] ctx The device context contains device resources. + * + * @note CopyFrom supports CPU <-> GPU, GPU <-> GPU. 
+ */
+inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
+                     const platform::DeviceContext& ctx, Tensor* dst) {
+  src.check_memory_size();
+
+  dst->Resize(src.dims());
+  auto src_place = src.place();
+  auto src_ptr = src.data<void>();
+
+  auto dst_ptr = dst->mutable_data(dst_place, src.type());
+
+  auto size = src.numel() * SizeOfType(src.type());
+
+  if (platform::is_cpu_place(src_place) && platform::is_cpu_place(dst_place)) {
+    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
+                 boost::get<platform::CPUPlace>(src_place), src_ptr, size);
+  }
+#ifdef PADDLE_WITH_CUDA
+  else if (platform::is_gpu_place(src_place) &&  // NOLINT
+           platform::is_cpu_place(dst_place)) {
+    auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
+    auto dst_cpu_place = boost::get<platform::CPUPlace>(dst_place);
+    auto ctx_place = ctx.GetPlace();
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
+    memory::Copy(
+        dst_cpu_place, dst_ptr, src_gpu_place, src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  } else if (platform::is_cpu_place(src_place) &&
+             platform::is_gpu_place(dst_place)) {
+    auto src_cpu_place = boost::get<platform::CPUPlace>(src_place);
+    auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
+    auto ctx_place = ctx.GetPlace();
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+    PADDLE_ENFORCE_EQ(dst_gpu_place, ctx_gpu_place);
+    memory::Copy(
+        dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  } else if (platform::is_gpu_place(src_place) &&
+             platform::is_gpu_place(dst_place)) {
+    auto src_gpu_place = boost::get<platform::GPUPlace>(src_place);
+    auto dst_gpu_place = boost::get<platform::GPUPlace>(dst_place);
+    auto ctx_place = ctx.GetPlace();
+    PADDLE_ENFORCE(platform::is_gpu_place(ctx_place));
+    auto ctx_gpu_place = boost::get<platform::GPUPlace>(ctx_place);
+    PADDLE_ENFORCE_EQ(src_gpu_place, ctx_gpu_place);
+    memory::Copy(
+        dst_gpu_place, dst_ptr, src_gpu_place, src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  }
+#endif
+}
+
+/**
+ * @brief Copy the content of an external vector to a tensor.
+ *
+ * @param[in] src  The external vector.
+ * @param[in] ctx  The device context containing device resources.
+ *
+ * @note CopyFromVector resizes dst to a 1-D tensor with src.size()
+ *       elements before copying.
+ */
+template <typename T>
+inline void CopyFromVector(const std::vector<T>& src,
+                           const platform::DeviceContext& ctx, Tensor* dst) {
+  auto dst_place = ctx.GetPlace();
+  auto src_ptr = static_cast<const void*>(src.data());
+  platform::CPUPlace src_place;
+  dst->Resize({static_cast<int64_t>(src.size())});
+  auto dst_ptr = static_cast<void*>(dst->mutable_data<T>(dst_place));
+  auto size = src.size() * sizeof(T);
+
+  if (platform::is_cpu_place(dst_place)) {
+    memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr, src_place,
+                 src_ptr, size);
+  }
+#ifdef PADDLE_WITH_CUDA
+  else if (platform::is_gpu_place(dst_place)) {  // NOLINT
+    memory::Copy(
+        boost::get<platform::GPUPlace>(dst_place), dst_ptr, src_place, src_ptr,
+        size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  }
+#endif
+}
+
+/**
+ * @brief Copy the content of a tensor to a vector.
+ *
+ * @param[in] src  The source tensor.
+ * @param[in] ctx  The device context containing device resources.
+ *
+ * @note CopyToVector resizes dst to src.numel() elements before copying;
+ *       the destination is always host (CPU) memory.
+ */
+template <typename T>
+inline void CopyToVector(const Tensor& src, const platform::DeviceContext& ctx,
+                         std::vector<T>* dst) {
+  auto src_ptr = static_cast<const void*>(src.data<T>());
+  auto size = src.numel() * sizeof(T);
+
+  platform::CPUPlace dst_place;
+  dst->resize(src.numel());
+  auto dst_ptr = static_cast<void*>(dst->data());
+
+  if (platform::is_cpu_place(src.place())) {
+    memory::Copy(dst_place, dst_ptr,
+                 boost::get<platform::CPUPlace>(src.place()), src_ptr, size);
+  }
+#ifdef PADDLE_WITH_CUDA
+  else if (platform::is_gpu_place(src.place())) {  // NOLINT
+    memory::Copy(
+        dst_place, dst_ptr, boost::get<platform::GPUPlace>(src.place()),
+        src_ptr, size,
+        reinterpret_cast<const platform::CUDADeviceContext&>(ctx).stream());
+  }
+#endif
+}
+
+}  // namespace framework
+}  // namespace paddle
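Taken together, the three helpers form a small staging API. A minimal usage sketch, assuming a CPU-only build (the Example function below is an editor's illustration, not part of the patch):

    #include <vector>
    #include "paddle/framework/tensor_util.h"

    void Example() {
      using namespace paddle::framework;
      paddle::platform::CPUPlace cpu;
      paddle::platform::CPUDeviceContext ctx(cpu);

      // Vector -> Tensor: dst becomes a 1-D tensor with src.size() elements.
      std::vector<int> src_vec = {1, 2, 3, 4};
      Tensor t;
      CopyFromVector<int>(src_vec, ctx, &t);

      // Tensor -> Tensor: dst is resized to src.dims(), then the bytes are copied.
      Tensor t2;
      CopyFrom(t, cpu, ctx, &t2);

      // Tensor -> Vector: dst is resized to src.numel(); always lands on the host.
      std::vector<int> out;
      CopyToVector<int>(t2, ctx, &out);  // out == {1, 2, 3, 4}
    }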
diff --git a/paddle/framework/tensor_util_test.cc b/paddle/framework/tensor_util_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..03a70de182d0eb499a81413d38229c81c4378b91
--- /dev/null
+++ b/paddle/framework/tensor_util_test.cc
@@ -0,0 +1,228 @@
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
+#include "paddle/framework/tensor_util.h"
+#include <gtest/gtest.h>
+#include <string>
+
+namespace paddle {
+namespace framework {
+TEST(CopyFrom, Tensor) {
+  Tensor src_tensor;
+  Tensor dst_tensor;
+  platform::CPUDeviceContext cpu_ctx((platform::CPUPlace()));
+
+  int* src_ptr =
+      src_tensor.mutable_data<int>(make_ddim({3, 3}), platform::CPUPlace());
+
+  int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+  memcpy(src_ptr, arr, 9 * sizeof(int));
+
+  auto cpu_place = new platform::CPUPlace();
+  CopyFrom(src_tensor, *cpu_place, cpu_ctx, &dst_tensor);
+
+  const int* dst_ptr = dst_tensor.data<int>();
+  ASSERT_NE(src_ptr, dst_ptr);
+  for (size_t i = 0; i < 9; ++i) {
+    EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+  }
+
+  Tensor slice_tensor = src_tensor.Slice(1, 2);
+  CopyFrom(slice_tensor, *cpu_place, cpu_ctx, &dst_tensor);
+  const int* slice_ptr = slice_tensor.data<int>();
+  dst_ptr = dst_tensor.data<int>();
+  ASSERT_NE(dst_ptr, slice_ptr);
+  for (size_t i = 0; i < 3; ++i) {
+    EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
+  }
+#ifdef PADDLE_WITH_CUDA
+  {
+    Tensor src_tensor;
+    Tensor gpu_tensor;
+    Tensor dst_tensor;
+
+    int* src_ptr =
+        src_tensor.mutable_data<int>(make_ddim({3, 3}), platform::CPUPlace());
+
+    int arr[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    memcpy(src_ptr, arr, 9 * sizeof(int));
+
+    // CPU Tensor to GPU Tensor
+    auto gpu_place = new platform::GPUPlace(0);
+    platform::CUDADeviceContext gpu_ctx(*gpu_place);
+    CopyFrom(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
+
+    // GPU Tensor to CPU Tensor
+    auto cpu_place = new platform::CPUPlace();
+    CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+
+    // Sync before Compare Tensors
+    gpu_ctx.Wait();
+    const int* dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(src_ptr, dst_ptr);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+    }
+
+    Tensor slice_tensor = src_tensor.Slice(1, 2);
+
+    // CPU Slice Tensor to GPU Tensor
+    CopyFrom(slice_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
+
+    // GPU Tensor to CPU Tensor
+    CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+
+    // Sync before Compare Slice Tensors
+    gpu_ctx.Wait();
+    const int* slice_ptr = slice_tensor.data<int>();
+    dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(dst_ptr, slice_ptr);
+    for (size_t i = 0; i < 3; ++i) {
+      EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
+    }
+  }
+#endif
+}
+
+TEST(CopyFromVector, Tensor) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  {
+    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    Tensor cpu_tensor;
+
+    // Copy to CPU Tensor
+    cpu_tensor.Resize(make_ddim({3, 3}));
+    auto cpu_place = new paddle::platform::CPUPlace();
+    CPUDeviceContext cpu_ctx(*cpu_place);
+    CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+
+    // Compare Tensors
+    const int* cpu_ptr = cpu_tensor.data<int>();
+    const int* src_ptr = src_vec.data();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+    }
+
+    src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
+    cpu_tensor.Resize(make_ddim({2, 2}));
+    CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+    cpu_ptr = cpu_tensor.data<int>();
+    src_ptr = src_vec.data();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    for (size_t i = 0; i < 5; ++i) {
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+    }
+
+    delete cpu_place;
+  }
+
+#ifdef PADDLE_WITH_CUDA
+  {
+    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    Tensor cpu_tensor;
+    Tensor gpu_tensor;
+    Tensor dst_tensor;
+
+    // Copy to CPU Tensor
+    cpu_tensor.Resize(make_ddim({3, 3}));
+    auto cpu_place = new paddle::platform::CPUPlace();
+    CPUDeviceContext cpu_ctx(*cpu_place);
+    CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+
+    // Copy to GPU Tensor
+    gpu_tensor.Resize(make_ddim({3, 3}));
+    auto gpu_place = new paddle::platform::GPUPlace();
+    CUDADeviceContext gpu_ctx(*gpu_place);
+    CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
+    // Copy from GPU to CPU tensor for comparison
+    CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+
+    // Sync before Compare Tensors
+    gpu_ctx.Wait();
+    const int* src_ptr = src_vec.data();
+    const int* cpu_ptr = cpu_tensor.data<int>();
+    const int* dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    ASSERT_NE(src_ptr, dst_ptr);
+    for (size_t i = 0; i < 9; ++i) {
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+    }
+
+    src_vec.erase(src_vec.begin(), src_vec.begin() + 5);
+
+    cpu_tensor.Resize(make_ddim({2, 2}));
+    CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
+    gpu_tensor.Resize(make_ddim({2, 2}));
+    CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
+    CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+
+    // Sync before Compare Tensors
+    gpu_ctx.Wait();
+    src_ptr = src_vec.data();
+    cpu_ptr = cpu_tensor.data<int>();
+    dst_ptr = dst_tensor.data<int>();
+    ASSERT_NE(src_ptr, cpu_ptr);
+    ASSERT_NE(src_ptr, dst_ptr);
+    for (size_t i = 0; i < 5; ++i) {
+      EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
+      EXPECT_EQ(src_ptr[i], dst_ptr[i]);
+    }
+
+    delete cpu_place;
+    delete gpu_place;
+  }
+#endif
+}
+
+TEST(CopyToVector, Tensor) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  {
+    Tensor src;
+    int* src_ptr = src.mutable_data<int>({3, 3}, CPUPlace());
+    for (int i = 0; i < 3 * 3; ++i) {
+      src_ptr[i] = i;
+    }
+
+    CPUPlace place;
+    CPUDeviceContext cpu_ctx(place);
+    std::vector<int> dst;
+    CopyToVector<int>(src, cpu_ctx, &dst);
+
+    for (int i = 0; i < 3 * 3; ++i) {
+      EXPECT_EQ(src_ptr[i], dst[i]);
+    }
+  }
+#ifdef PADDLE_WITH_CUDA
+  {
+    std::vector<int> src_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9};
+    Tensor gpu_tensor;
+    GPUPlace place;
+    CUDADeviceContext gpu_ctx(place);
+    CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
+
+    std::vector<int> dst;
+    CopyToVector<int>(gpu_tensor, gpu_ctx, &dst);
+
+    for (int i = 0; i < 3 * 3; ++i) {
+      EXPECT_EQ(src_vec[i], dst[i]);
+    }
+  }
+#endif
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/function/EigenGemm.cpp b/paddle/function/EigenGemm.cpp
index b3e666e860d29d89650d48a23cf44917035a02d7..644098a9e7873fb59b6343e805163e4892f060a8 100644
--- a/paddle/function/EigenGemm.cpp
+++ b/paddle/function/EigenGemm.cpp
@@ -21,7 +21,7 @@ template <class T>
 struct EigenBlasGemm {
   typedef Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, int>,
                            Eigen::Aligned>
-      Matrix;
+      EigenMatrix;
 
   static void compute(const bool transA,
                       const bool transB,
@@ -56,14 +56,13 @@ struct EigenBlasGemm {
       sizeB[1] = N;
       CHECK_EQ(N, ldb);
     }
-    Eigen::array<int, 2> sizeC;
-    sizeC[0] = M;
-    sizeC[1] = N;
-    CHECK_EQ(N, ldc);
+    Eigen::array<int, 2> sizeC = {{M, ldc}};
+    Eigen::array<int, 2> offsetC = {{0, 0}};
+    Eigen::array<int, 2> extentC = {{M, N}};
 
-    const Matrix a(const_cast<T*>(A), sizeA);
-    const Matrix b(const_cast<T*>(B), sizeB);
-    Matrix c(C, sizeC);
+    const EigenMatrix a(const_cast<T*>(A), sizeA);
+    const EigenMatrix b(const_cast<T*>(B), sizeB);
+    EigenMatrix c(C, sizeC);
 
     typedef typename Eigen::Tensor<T, 2>::DimensionPair DimPair;
     Eigen::array<DimPair, 1> dims;
@@ -72,12 +71,23 @@ struct EigenBlasGemm {
     dims[0].second = transB ? 1 : 0;
     Eigen::DefaultDevice device;
-    if (alpha == T(1) && beta == T(0)) {
-      c.device(device) = a.contract(b, dims);
-    } else if (alpha == T(1) && beta == T(1)) {
-      c.device(device) += a.contract(b, dims);
+    if (N == ldc) {
+      if (alpha == T(1) && beta == T(0)) {
+        c.device(device) = a.contract(b, dims);
+      } else if (alpha == T(1) && beta == T(1)) {
+        c.device(device) += a.contract(b, dims);
+      } else {
+        c.device(device) = alpha * a.contract(b, dims) + beta * c;
+      }
     } else {
-      c.device(device) = alpha * a.contract(b, dims) + beta * c;
+      if (alpha == T(1) && beta == T(0)) {
+        c.slice(offsetC, extentC).device(device) = a.contract(b, dims);
+      } else if (alpha == T(1) && beta == T(1)) {
+        c.slice(offsetC, extentC).device(device) += a.contract(b, dims);
+      } else {
+        c.slice(offsetC, extentC).device(device) =
+            alpha * a.contract(b, dims) + beta * c.slice(offsetC, extentC);
+      }
     }
   }
 };
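The ldc branch above exists because the output matrix C may be a view into a wider buffer (leading dimension ldc greater than N); mapping C as M x ldc and assigning through an M x N slice keeps all writes inside the logical block. A standalone Eigen illustration of that slice mechanism (editor's sketch; the names M, N, and ldc mirror the code above):

    #include <unsupported/Eigen/CXX11/Tensor>

    void SliceDemo() {
      const int M = 2, N = 3, ldc = 5;
      float C[M * ldc] = {};  // row-major M x ldc buffer; the result occupies M x N
      Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor>> c(C, M, ldc);

      Eigen::array<int, 2> offsetC = {{0, 0}};
      Eigen::array<int, 2> extentC = {{M, N}};

      Eigen::Tensor<float, 2, Eigen::RowMajor> result(M, N);
      result.setConstant(1.0f);
      // Writes only C[i * ldc + j] for j < N; the ldc - N tail of each row is untouched.
      c.slice(offsetC, extentC) = result;
    }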
diff --git a/paddle/gserver/layers/FactorizationMachineLayer.cpp b/paddle/gserver/layers/FactorizationMachineLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..be26b9ba88c279036f73b0a0baaff164755fe067
--- /dev/null
+++ b/paddle/gserver/layers/FactorizationMachineLayer.cpp
@@ -0,0 +1,158 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "FactorizationMachineLayer.h"
+#include <algorithm>
+#include <vector>
+#include "paddle/math/SparseMatrix.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_LAYER(factorization_machine, FactorizationMachineLayer);
+
+bool FactorizationMachineLayer::init(const LayerMap& layerMap,
+                                     const ParameterMap& parameterMap) {
+  /* Initialize the basic parent class */
+  Layer::init(layerMap, parameterMap);
+
+  factorSize_ = config_.factor_size();
+
+  /* initialize the latentVectors_ */
+  CHECK_EQ(inputLayers_.size(), 1UL);
+  size_t inputSize = inputLayers_[0]->getSize();
+  CHECK_EQ(parameters_[0]->getSize(), inputSize * factorSize_);
+  latentVectors_ = std::unique_ptr<Weight>(
+      new Weight(inputSize, factorSize_, parameters_[0]));
+
+  return true;
+}
+
+void FactorizationMachineLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const MatrixPtr& inputV = getInputValue(0);
+
+  size_t batchSize = inputV->getHeight();
+  size_t outputSize = getSize();
+  size_t inputSize = inputLayers_[0]->getSize();
+  reserveOutput(batchSize, outputSize);
+
+  MatrixPtr outV = getOutputValue();
+
+  Matrix::resizeOrCreate(
+      latentVectorsSquare_, inputSize, factorSize_, false, useGpu_);
+  Matrix::resizeOrCreate(
+      inputMulFactor_, batchSize, factorSize_, false, useGpu_);
+  Matrix::resizeOrCreate(tmpOut_, batchSize, factorSize_, false, useGpu_);
+
+  REGISTER_TIMER_INFO("FmInputMulFactorTimer", getName().c_str());
+  inputMulFactor_->mul(*inputV, *latentVectors_->getW());
+  inputMulFactor_->square2(*tmpOut_);
+  outV->sumRows(*tmpOut_, 0.5, 0);
+
+  if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
+    Matrix::resizeOrCreateSparseMatrix(inputSquare_,
+                                       inputV->getHeight(),
+                                       inputV->getWidth(),
+                                       inputV->getElementCnt(),
+                                       inputV->getValueType());
+    inputSquare_->copyFrom(*inputV);
+    (dynamic_cast<CpuSparseMatrix*>(inputSquare_.get()))->square2();
+  } else {
+    Matrix::resizeOrCreate(
+        inputSquare_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
+    inputV->square2(*inputSquare_);
+  }
+  latentVectors_->getW()->square2(*latentVectorsSquare_);
+  tmpOut_->mul(*inputSquare_, *latentVectorsSquare_);
+  outV->sumRows(*tmpOut_, -0.5, 1.0);
+
+  /* activation */ {
+    REGISTER_TIMER_INFO("FmFwAtvTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+
+void FactorizationMachineLayer::backward(const UpdateCallback& callback) {
+  /* Do derivation */ { backwardActivation(); }
+
+  const MatrixPtr& inputV = getInputValue(0);
+  const MatrixPtr& oGrad = getOutputGrad();
+
+  Matrix::resizeOrCreate(
+      tmpSum_, 1, latentVectors_->getW()->getHeight(), false, useGpu_);
+  MatrixPtr tmpSumTrans = Matrix::create(tmpSum_->getRowBuf(0),
+                                         latentVectors_->getW()->getHeight(),
+                                         1,
+                                         false,
+                                         useGpu_);
+
+  /* Calculate the gradients of the latentVectors_ matrix */
+  if (latentVectors_->getWGrad()) {
+    if (dynamic_cast<CpuSparseMatrix*>(inputV.get())) {
+      Matrix::resizeOrCreateSparseMatrix(tmpInput_,
+                                         inputV->getHeight(),
+                                         inputV->getWidth(),
+                                         inputV->getElementCnt());
+
+      CpuSparseMatrix* sparseInputV =
+          dynamic_cast<CpuSparseMatrix*>(inputV.get());
+      CpuSparseMatrix* sparseInputSquare =
+          dynamic_cast<CpuSparseMatrix*>(inputSquare_.get());
+      CpuSparseMatrix* sparseTmpInput =
+          dynamic_cast<CpuSparseMatrix*>(tmpInput_.get());
+      sparseTmpInput->copyFrom(*sparseInputV);
+
+      sparseTmpInput->rowScale(0, *sparseInputV, *oGrad);
+      latentVectors_->getWGrad()->mul(
+          *sparseTmpInput->getTranspose(), *inputMulFactor_, 1, 1);
+      sparseTmpInput->rowScale(0, *sparseInputSquare, *oGrad);
+
+      Matrix::resizeOrCreate(negOnes_, 1, inputV->getHeight(), false, useGpu_);
+      negOnes_->zeroMem();
+      negOnes_->add(-1);
+      tmpSum_->mul(*negOnes_, *sparseTmpInput, 1, 0);
+    } else {
+      Matrix::resizeOrCreate(
+          tmpInput_, inputV->getHeight(), inputV->getWidth(), false, useGpu_);
+
+      tmpInput_->rowScale(0, *inputV, *oGrad);
+      latentVectors_->getWGrad()->mul(
+          *tmpInput_->getTranspose(), *inputMulFactor_, 1, 1);
+      tmpInput_->rowScale(0, *inputSquare_, *oGrad);
+
+      tmpSum_->sumCols(*tmpInput_, -1, 0);
+    }
+
+    latentVectors_->getWGrad()->addRowScale(
+        0, *latentVectors_->getW(), *tmpSumTrans);
+
+    /* Increasing the number of gradient */
+    latentVectors_->getParameterPtr()->incUpdate(callback);
+  }
+
+  /* Calculate the input layers gradient */
+  MatrixPtr inGrad = getInputGrad(0);
+  if (inGrad != NULL) {
+    inGrad->mul(
+        *inputMulFactor_, *latentVectors_->getW()->getTranspose(), 1, 1);
+    tmpSumTrans->sumRows(*latentVectorsSquare_, -1, 0);
+    inGrad->addColScale(0, *inputV, *tmpSum_);
+    inGrad->rowScale(0, *inGrad, *oGrad);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/FactorizationMachineLayer.h b/paddle/gserver/layers/FactorizationMachineLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..df20a49934d5dd444f127842c8fdb7c77f4ebeb1
--- /dev/null
+++ b/paddle/gserver/layers/FactorizationMachineLayer.h
@@ -0,0 +1,80 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/utils/ThreadLocal.h"
+
+namespace paddle {
+/**
+ * @brief The Factorization Machine models pairwise (order-2) feature
+ * interactions as inner product of the learned latent vectors corresponding
+ * to each input feature.
+ *
+ * The Factorization Machine can effectively capture feature interactions
+ * especially when the input is sparse. While in principle FM can model higher
+ * order feature interaction, in practice usually only order-2 feature
+ * interactions are considered. The Factorization Machine Layer here only
+ * computes the order-2 interactions with the formula:
+ *
+ * \f[
+ *     y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
+ * \f]
+ *
+ * The detailed calculation for forward and backward can be found in the paper:
+ *
+ *     Steffen Rendle. Factorization machines. ICDM 2010.
+ *
+ * The config file api is factorization_machine.
+ */
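The forward pass in the .cpp above does not evaluate this double sum directly; it uses the standard O(kn) factorization, 0.5 * sum_f [ (sum_i v_{if} x_i)^2 - sum_i v_{if}^2 x_i^2 ], which is exactly what the inputMulFactor_ / inputSquare_ code computes with the two sumRows calls. A self-contained numerical check of that identity (editor's sketch, not part of the patch):

    #include <cassert>
    #include <cmath>
    #include <vector>

    // Naive O(n^2 * k) pairwise sum vs. the factored O(n * k) form used by
    // FactorizationMachineLayer::forward.
    int main() {
      const int n = 4, k = 3;
      std::vector<std::vector<double>> v = {
          {.1, .2, .3}, {.4, .5, .6}, {.7, .8, .9}, {.2, .1, .0}};
      std::vector<double> x = {1.0, 2.0, 0.0, 3.0};

      double naive = 0;
      for (int i = 0; i < n; ++i)
        for (int j = i + 1; j < n; ++j) {
          double dot = 0;
          for (int f = 0; f < k; ++f) dot += v[i][f] * v[j][f];
          naive += dot * x[i] * x[j];
        }

      double factored = 0;
      for (int f = 0; f < k; ++f) {
        double s = 0, s2 = 0;
        for (int i = 0; i < n; ++i) {
          s += v[i][f] * x[i];                    // cf. inputMulFactor_
          s2 += v[i][f] * v[i][f] * x[i] * x[i];  // cf. inputSquare_ * V^2
        }
        factored += 0.5 * (s * s - s2);
      }

      assert(std::fabs(naive - factored) < 1e-9);
      return 0;
    }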
+
+class FactorizationMachineLayer : public Layer {
+protected:
+  // The latent vectors, shape: (size, factorSize_)
+  // Each row of the latentVectors_ matrix is the latent vector
+  // corresponding to one input feature dimension
+  std::unique_ptr<Weight> latentVectors_;
+  // The hyperparameter that defines the dimensionality of the factorization
+  size_t factorSize_;
+
+private:
+  // Store the square values of the latent vectors matrix
+  MatrixPtr latentVectorsSquare_;
+  // Store the square values of input matrix
+  MatrixPtr inputSquare_;
+  // The result of input matrix * latent vector matrix that will be used in
+  // both forward and backward step
+  MatrixPtr inputMulFactor_;
+  // Store temporary calculation result
+  MatrixPtr tmpOut_;
+  MatrixPtr tmpSum_;
+  MatrixPtr tmpInput_;
+  // A 1 x batchSize row vector of -1, used to negate row sums of the
+  // sparse gradient in backward()
+  MatrixPtr negOnes_;
+
+public:
+  explicit FactorizationMachineLayer(const LayerConfig& config)
+      : Layer(config) {}
+  ~FactorizationMachineLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
index d62a8d846e5b347aa44ce1951c043d5813a5b3ff..236f8096bdb6e024cf3c9c73eba422616a777a23 100644
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.cpp
@@ -64,49 +64,111 @@ void HierarchicalSigmoidLayer::forward(PassType passType) {
                          batchSize,
                          codeLength_,
                          /* trans */ false,
-                         useGpu(deviceId_));
+                         false);
   Matrix::resizeOrCreate(preOutput_.grad,
                          batchSize,
                          codeLength_,
                          /* trans */ false,
-                         useGpu(deviceId_));
-
+                         false);
   IVectorPtr label = getInput(*getLabelLayer()).ids;
-
   preOutput_.value->zeroMem();
+  if (useGpu_) {
+    Matrix::resizeOrCreate(cpuOutput_,
+                           output_.value->getHeight(),
+                           output_.value->getWidth(),
+                           /* trans */ false,
+                           false);
+    IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
+    cpuLabel_->copyFrom(*label);
+    cpuOutput_->copyFrom(*output_.value);
+  } else {
+    cpuOutput_ = output_.value;
+    cpuLabel_ = label;
+  }
   /* add the bias-vector */
   if (biases_.get() != NULL) {
-    preOutput_.value->addByBitCode(numClasses_, *label, *biases_->getW());
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuBias_,
+                             1,
+                             numClasses_ - 1,
+                             /* trans */ false,
+                             false);
+      cpuBias_->copyFrom(*biases_->getW());
+    } else {
+      cpuBias_ = biases_->getW();
+    }
+    preOutput_.value->addByBitCode(numClasses_, *cpuLabel_, *cpuBias_);
   }
   for (size_t i = 0; i < inputLayers_.size() - 1; ++i) {
     MatrixPtr input = getInputValue(i);
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuInput_,
+                             input->getHeight(),
+                             input->getWidth(),
+                             /* trans */ false,
+                             false);
+      Matrix::resizeOrCreate(cpuWeight_,
+                             weights_[i]->getW()->getHeight(),
+                             weights_[i]->getW()->getWidth(),
+                             /* trans */ false,
+                             false);
+      cpuInput_->copyFrom(*input);
+      cpuWeight_->copyFrom(*weights_[i]->getW());
+    } else {
+      cpuInput_ = input;
+      cpuWeight_ = weights_[i]->getW();
+    }
     preOutput_.value->mulByBitCode(
-        numClasses_, *label, *weights_[i]->getW(), *input);
+        numClasses_, *cpuLabel_, *cpuWeight_, *cpuInput_);
   }
   // keep consistent with the clipping in the following softrelu
   preOutput_.value->clip(-40.0, 40.0);
   preOutput_.value->sumByBitCode(numClasses_,
-                                 *label,
-                                 *output_.value,
+                                 *cpuLabel_,
+                                 *cpuOutput_,
                                  -1);  // scaleSum
   preOutput_.value->softrelu(*preOutput_.value);
-  MatrixPtr sum =
-      Matrix::create(batchSize, 1, /* trans= */ false, useGpu(deviceId_));
+  MatrixPtr sum = Matrix::create(batchSize, 1, /* trans= */ false, false);
   preOutput_.value->rowSum(*sum);
-  output_.value->add(*sum);
+  cpuOutput_->add(*sum);
+  if (useGpu_) {
+    output_.value->copyFrom(*cpuOutput_);
+  } else {
+    output_.value = cpuOutput_;
+  }
 }
 
 void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
   IVectorPtr label = getInput(*getLabelLayer()).ids;
+  if (useGpu_) {
+    IVector::resizeOrCreate(cpuLabel_, label->getSize(), false);
+    cpuLabel_->copyFrom(*label);
+  } else {
+    cpuLabel_ = label;
+  }
   preOutput_.grad->one();
   preOutput_.grad->softreluDerivative(*preOutput_.value);
-  preOutput_.grad->subByBitCode(numClasses_, *label);
+  preOutput_.grad->subByBitCode(numClasses_, *cpuLabel_);
 
   if (biases_ && biases_->getWGrad()) {
-    preOutput_.grad->addByBitCodeBackward(
-        numClasses_, *label, *biases_->getWGrad());
-
+    MatrixPtr biases_grad = biases_->getWGrad();
+    if (useGpu_) {
+      Matrix::resizeOrCreate(cpuBias_,
+                             1,
+                             numClasses_ - 1,
+                             /* trans */ false,
+                             false);
+      cpuBias_->copyFrom(*biases_grad);
+    } else {
+      cpuBias_ = biases_grad;
+    }
+    preOutput_.grad->addByBitCodeBackward(numClasses_, *cpuLabel_, *cpuBias_);
+    if (useGpu_) {
+      biases_grad->copyFrom(*cpuBias_);
+    } else {
+      biases_grad = cpuBias_;
+    }
     /* Increasing the number of gradient */
     biases_->getParameterPtr()->incUpdate(callback);
   }
@@ -115,9 +177,31 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
     /* Calculate the W-gradient for the current layer */
     MatrixPtr input = getInputValue(i);
     if (weights_[i]->getWGrad()) {
+      MatrixPtr weights_grad = weights_[i]->getWGrad();
+      if (useGpu_) {
+        Matrix::resizeOrCreate(cpuInput_,
+                               input->getHeight(),
+                               input->getWidth(),
+                               /* trans */ false,
+                               false);
+        Matrix::resizeOrCreate(cpuWeightGrad_,
+                               weights_grad->getHeight(),
+                               weights_grad->getWidth(),
+                               /* trans */ false,
+                               false);
+        cpuInput_->copyFrom(*input);
+        cpuWeightGrad_->copyFrom(*weights_grad);
+      } else {
+        cpuInput_ = input;
+        cpuWeightGrad_ = weights_grad;
+      }
       preOutput_.grad->mulByBitCodeBackwardWeight(
-          numClasses_, *label, *weights_[i]->getWGrad(), *input);
-
+          numClasses_, *cpuLabel_, *cpuWeightGrad_, *cpuInput_);
+      if (useGpu_) {
+        weights_grad->copyFrom(*cpuWeightGrad_);
+      } else {
+        weights_grad = cpuWeightGrad_;
+      }
       /* Increasing the number of gradient */
       weights_[i]->getParameterPtr()->incUpdate(callback);
     }
@@ -125,8 +209,30 @@ void HierarchicalSigmoidLayer::backward(const UpdateCallback& callback) {
     /* Calculate the input layers error */
     MatrixPtr inputGrad = getInputGrad(i);
     if (inputGrad) {
+      if (useGpu_) {
+        Matrix::resizeOrCreate(cpuInputGrad_,
+                               inputGrad->getHeight(),
+                               inputGrad->getWidth(),
+                               /* trans */ false,
+                               false);
+        Matrix::resizeOrCreate(cpuWeight_,
+                               weights_[i]->getW()->getHeight(),
+                               weights_[i]->getW()->getWidth(),
+                               /* trans */ false,
+                               false);
+        cpuInputGrad_->copyFrom(*inputGrad);
+        cpuWeight_->copyFrom(*weights_[i]->getW());
+      } else {
+        cpuInputGrad_ = inputGrad;
+        cpuWeight_ = weights_[i]->getW();
+      }
       preOutput_.grad->mulByBitCodeBackwardError(
-          numClasses_, *label, *weights_[i]->getW(), *inputGrad);
+          numClasses_, *cpuLabel_, *cpuWeight_, *cpuInputGrad_);
+      if (useGpu_) {
+        inputGrad->copyFrom(*cpuInputGrad_);
+      } else {
+        inputGrad = cpuInputGrad_;
+      }
     }
   }
 }
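Both forward() and backward() above use the same staging pattern: the bit-code kernels (addByBitCode, mulByBitCode, and friends) are CPU-only, so on GPU every operand is copied into a host-side cpu* buffer, the kernel runs there, and the result is copied back. The skeleton in isolation (editor's sketch; runCpuOnlyKernel is a placeholder standing in for the bit-code calls, not a real Paddle API):

    // Types MatrixPtr/Matrix as declared in paddle/math/Matrix.h.
    void runCpuOnlyKernel(Matrix& m);  // hypothetical CPU-only computation

    void StageThroughCpu(MatrixPtr deviceValue, bool useGpu) {
      MatrixPtr cpuValue;
      if (useGpu) {
        Matrix::resizeOrCreate(cpuValue,
                               deviceValue->getHeight(),
                               deviceValue->getWidth(),
                               /* trans */ false,
                               /* useGpu */ false);
        cpuValue->copyFrom(*deviceValue);  // device -> host
      } else {
        cpuValue = deviceValue;  // already on host: share the buffer
      }

      runCpuOnlyKernel(*cpuValue);

      if (useGpu) {
        deviceValue->copyFrom(*cpuValue);  // host -> device
      }
    }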
diff --git a/paddle/gserver/layers/HierarchicalSigmoidLayer.h b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
index 9afd40b1674680da962d6e51caa56b46279b70de..7f896e61ca26e3e22b99b65b1285384a121f7f02 100644
--- a/paddle/gserver/layers/HierarchicalSigmoidLayer.h
+++ b/paddle/gserver/layers/HierarchicalSigmoidLayer.h
@@ -80,6 +80,15 @@ protected:
   int codeLength_;
   /// temporary result of output_
   Argument preOutput_;
+
+  /// The temporary variables in CPU memory.
+  MatrixPtr cpuWeight_;
+  MatrixPtr cpuWeightGrad_;
+  MatrixPtr cpuInput_;
+  MatrixPtr cpuInputGrad_;
+  MatrixPtr cpuBias_;
+  MatrixPtr cpuOutput_;
+  IVectorPtr cpuLabel_;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/layers/ROIPoolLayer.cpp b/paddle/gserver/layers/ROIPoolLayer.cpp
index 02402894d3354a6af221948a3360ef830881bf39..2c8256b91c97b513ce7237b8174c522430094926 100644
--- a/paddle/gserver/layers/ROIPoolLayer.cpp
+++ b/paddle/gserver/layers/ROIPoolLayer.cpp
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "ROIPoolLayer.h"
+#include <cfloat>
 
 namespace paddle {
 
@@ -126,10 +127,8 @@ void ROIPoolLayer::forward(PassType passType) {
           bool isEmpty = (hend <= hstart) || (wend <= wstart);
           size_t poolIndex = ph * pooledWidth_ + pw;
-          if (isEmpty) {
-            outputData[poolIndex] = 0;
-            argmaxData[poolIndex] = -1;
-          }
+          outputData[poolIndex] = isEmpty ? 0 : -FLT_MAX;
+          argmaxData[poolIndex] = -1;
 
           for (size_t h = hstart; h < hend; ++h) {
             for (size_t w = wstart; w < wend; ++w) {
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index c295ea19c9ccb3d05c509a41925d2c36efdba8ef..24e6cae8e69557c42ed5d437edce101709ca3983 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -62,11 +62,11 @@ if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
 endif()
 
 if(NOT MOBILE_INFERENCE)
-################## test_Evaluator #######################
+  ################## test_Evaluator #######################
   add_unittest(test_Evaluator test_Evaluator.cpp)
 
-############### test_RecurrentGradientMachine ###############
+  ############### test_RecurrentGradientMachine ###############
   # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine
   # I will fix it.
add_unittest_without_exec(test_RecurrentGradientMachine @@ -77,7 +77,7 @@ if(NOT MOBILE_INFERENCE) ${CMAKE_CURRENT_BINARY_DIR}/test_RecurrentGradientMachine WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) -############### test_NetworkCompare ############### + ############### test_NetworkCompare ############### add_unittest_without_exec(test_NetworkCompare test_NetworkCompare.cpp) if(WITH_GPU) @@ -89,34 +89,33 @@ if(NOT MOBILE_INFERENCE) COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=false WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle) endif() -endif() + ################# test_CompareSparse ################## + add_unittest_without_exec(test_CompareSparse + test_CompareSparse.cpp) + if(NOT ON_TRAVIS) + add_test(NAME test_CompareSparse + COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d + ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests + ./.set_port.sh -p port -n 6 + ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) + endif() + + ################ test_CompareTwoNets ###################### + add_unittest_without_exec(test_CompareTwoNets + test_CompareTwoNets.cpp) + add_test(NAME test_CompareTwoNets + COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d + ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests + ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets + WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) +endif() +################ test_PyDataProvider2 ###################### add_unittest_without_exec(test_PyDataProvider2 test_PyDataProvider2.cpp) - add_test(NAME test_PyDataProvider2 COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2 WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle ) - -################# test_CompareSparse ################## -add_unittest_without_exec(test_CompareSparse - test_CompareSparse.cpp) -if(NOT ON_TRAVIS) - add_test(NAME test_CompareSparse - COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d - ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests - ./.set_port.sh -p port -n 6 - ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) -endif() - -################ test_CompareTwoNets ###################### -add_unittest_without_exec(test_CompareTwoNets - test_CompareTwoNets.cpp) -add_test(NAME test_CompareTwoNets - COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d - ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests - ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets - WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/) diff --git a/paddle/gserver/tests/sequence_rnn_matched_inputs.py b/paddle/gserver/tests/sequence_rnn_matched_inputs.py index e2635b4400b13517bac716a5a0affeb16c218b09..59e8c91733c42b6f13f723321d21bca98ab78bb7 100644 --- a/paddle/gserver/tests/sequence_rnn_matched_inputs.py +++ b/paddle/gserver/tests/sequence_rnn_matched_inputs.py @@ -41,7 +41,7 @@ nonseq = embedding_layer(input=label, size=word_dim) # This hierarchical RNN is designed to be equivalent to the simple RNN in -# sequence_rnn_multi_unequalength_inputs.conf +# sequence_rnn_mixed_inputs.conf def outer_step(subseq, seq, nonseq, encoding): outer_mem = memory(name="outer_rnn_state", size=hidden_dim) diff --git a/paddle/gserver/tests/sequence_rnn_mixed_inputs.py b/paddle/gserver/tests/sequence_rnn_mixed_inputs.py index 
84a66e294495c01e03dc83b38a531e482bed1292..6fe9dca6e2cb0e14fee346b8307f67b804328471 100644 --- a/paddle/gserver/tests/sequence_rnn_mixed_inputs.py +++ b/paddle/gserver/tests/sequence_rnn_mixed_inputs.py @@ -37,7 +37,7 @@ encoding = embedding_layer(input=data2, size=word_dim) # This hierarchical RNN is designed to be equivalent to the simple RNN in -# sequence_rnn_multi_unequalength_inputs.conf +# sequence_rnn_matched_inputs.conf def outer_step(subseq, seq, nonseq, encoding): outer_mem = memory(name="outer_rnn_state", size=hidden_dim) diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index cacf10692942f5eca2f6c498183f4acc00768460..c5359f272b4bed4d4d2483bf19d7ae482b0d33dd 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -681,12 +681,13 @@ TEST(Layer, hsigmoidLayer) { config.layerConfig.add_inputs(); config.layerConfig.add_inputs(); - // Not support GPU now - testLayerGrad(config, - "hsigmoid", - 100, - /* trans */ false, /* useGpu */ - false); + for (auto useGpu : {false, true}) { + testLayerGrad(config, + "hsigmoid", + 100, + /* trans */ false, + /* useGpu */ useGpu); + } } TEST(Layer, multi_cross) { @@ -2464,6 +2465,25 @@ TEST(Layer, L2DistanceLayer) { } } +void testFactorizationMachineLayer(InputType type, bool useGpu) { + const int FACTOR_SIZE = 10; + TestConfig config; + config.layerConfig.set_type("factorization_machine"); + config.layerConfig.set_factor_size(FACTOR_SIZE); + config.layerConfig.set_size(1); + config.biasSize = 0; + config.inputDefs.push_back({type, "layer_0", 128, 1280}); + config.layerConfig.add_inputs(); + testLayerGrad(config, "factorization_machine", 16, false, useGpu, false); +} + +TEST(Layer, FactorizationMachineLayer) { + for (auto useGpu : {false, true}) { + testFactorizationMachineLayer(INPUT_DATA, useGpu); + } + testFactorizationMachineLayer(INPUT_SPARSE_FLOAT_VALUE_DATA, false); +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); initMain(argc, argv); diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp index bf62229c03bb1d6e2bdf86d8c56a8157938fb832..dc6979cf5a5229fb09866189f28217889d58c2d0 100644 --- a/paddle/math/CpuSparseMatrix.cpp +++ b/paddle/math/CpuSparseMatrix.cpp @@ -260,6 +260,35 @@ void CpuSparseMatrix::printOneRow(std::ostream& os, size_t idx) const { os << ";"; } +void CpuSparseMatrix::rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c) { + CHECK(getFormat() != SPARSE_CSC) << "Not supported"; + CHECK_EQ(height_, b.getHeight()); + CHECK_EQ(width_, b.getWidth()); + real* A = getValue(); + real* B = b.getValue(); + if (b.getValueType() == FLOAT_VALUE) { + for (size_t i = 0; i < height_; i++) { + size_t start = getRowStartIdx(i); + size_t end = getRowStartIdx(i + 1); + CHECK_EQ(start, b.getRowStartIdx(i)); + CHECK_EQ(end, b.getRowStartIdx(i + 1)); + for (size_t j = start; j < end; j++) { + A[j] = B[j] * c.getElement(i, cCol); + } + } + } else if (b.getValueType() == NO_VALUE) { + for (size_t i = 0; i < height_; i++) { + size_t start = getRowStartIdx(i); + size_t end = getRowStartIdx(i + 1); + CHECK_EQ(start, b.getRowStartIdx(i)); + CHECK_EQ(end, b.getRowStartIdx(i + 1)); + for (size_t j = start; j < end; j++) { + A[j] = c.getElement(i, cCol); + } + } + } +} + void CpuSparseMatrix::randomizeUniform() { CHECK_LE(elementCnt_, height_ * width_); if (valueType_ == FLOAT_VALUE) { diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/math/CpuSparseMatrix.h index 
aad1348353d558abca72ed0fa5cf943237e3ac78..522b436a2a69179d3f4f17c919d5ba024102db7b 100644 --- a/paddle/math/CpuSparseMatrix.h +++ b/paddle/math/CpuSparseMatrix.h @@ -239,6 +239,15 @@ public: const unsigned int* cols, const real* values); + /** + * @brief this_row = b_row * c_row[cCol] + * + * @param[in] cCol the column of matrix c used to scale each row of b + * @param[in] b CpuSparseMatrix + * @param[in] c Matrix + */ + void rowScale(size_t cCol, CpuSparseMatrix& b, Matrix& c); + void randomizeUniform(); void copyFrom(const GpuSparseMatrix& src, hl_stream_t stream); diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index 5eb1c44eb6fc45db31ef44bf79e74b79193e08aa..95cfe2525e3e7c128d8652c5c6a0bb3d80a475b9 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -81,18 +81,33 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { } template <> -void* Alloc(platform::GPUPlace place, size_t size) { - return GetGPUBuddyAllocator(place.device)->Alloc(size); +size_t Used(platform::GPUPlace place) { + return GetGPUBuddyAllocator(place.device)->Used(); } template <> -void Free(platform::GPUPlace place, void* p) { - GetGPUBuddyAllocator(place.device)->Free(p); +void* Alloc(platform::GPUPlace place, size_t size) { + auto* buddy_allocator = GetGPUBuddyAllocator(place.device); + auto* ptr = buddy_allocator->Alloc(size); + if (ptr == nullptr) { + int cur_dev = platform::GetCurrentDeviceId(); + platform::SetDeviceId(place.device); + size_t avail, total; + platform::GpuMemoryUsage(avail, total); + LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU " + << place.device << ", available " << avail << " bytes"; + LOG(WARNING) << "total " << total; + LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize(); + LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize(); + LOG(WARNING) << "GPU memory used: " << Used(place); + platform::SetDeviceId(cur_dev); + } + return ptr; } template <> -size_t Used(platform::GPUPlace place) { - return GetGPUBuddyAllocator(place.device)->Used(); +void Free(platform::GPUPlace place, void* p) { + GetGPUBuddyAllocator(place.device)->Free(p); } #endif diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt index 059a6bba84cfb0c1f6cbbba3c88d589b52dc5592..937441b318095eadb9022c1d7578ad8aca2dadc8 100644 --- a/paddle/operators/CMakeLists.txt +++ b/paddle/operators/CMakeLists.txt @@ -73,6 +73,13 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP(conv2d);\n") endif() + # conv_cudnn_op contains several operators + if ("${TARGET}" STREQUAL "conv_cudnn_op") + set(pybind_flag 1) + # It's enough to just adding one operator to pybind + file(APPEND ${pybind_file} "USE_OP(conv2d_cudnn);\n") + endif() + # pool_op contains several operators if ("${TARGET}" STREQUAL "pool_op") set(pybind_flag 1) @@ -178,13 +185,13 @@ set(DEPS_OPS cond_op cross_entropy_op recurrent_op - dynamic_recurrent_op softmax_with_cross_entropy_op softmax_op sequence_softmax_op sum_op pool_op maxout_op + unpool_op pool_with_index_op conv_op conv_transpose_op @@ -194,12 +201,29 @@ set(DEPS_OPS lod_rank_table_op lod_tensor_to_array_op array_to_lod_tensor_op + max_sequence_len_op lstm_op tensor_array_read_write_op gru_op adagrad_op - sgd_op) + sgd_op + save_op + load_op + send_op + recv_op) + +add_subdirectory(detail) +op_library(send_op SRCS send_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf) +set_source_files_properties( + send_op.cc + PROPERTIES + COMPILE_FLAGS "-Wno-error=non-virtual-dtor 
-Wno-error=delete-non-virtual-dtor") +op_library(recv_op SRCS recv_op.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib_target protobuf) +set_source_files_properties( + recv_op.cc + PROPERTIES + COMPILE_FLAGS "-Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op) op_library(cross_entropy_op DEPS cross_entropy) @@ -212,10 +236,12 @@ op_library(adagrad_op DEPS selected_rows_functor) op_library(conv_op DEPS vol2col) op_library(pool_op DEPS pooling) op_library(maxout_op DEPS maxouting) +op_library(unpool_op DEPS unpooling) op_library(pool_with_index_op DEPS pooling) op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table) op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op) op_library(array_to_lod_tensor_op SRCS array_to_lod_tensor_op.cc DEPS lod_rank_table_op) +op_library(max_sequence_len_op SRCS max_sequence_len_op.cc DEPS lod_rank_table) op_library(tensor_array_read_write_op SRCS tensor_array_read_write_op.cc) if(WITH_GPU) op_library(nccl_op DEPS nccl_common) @@ -225,15 +251,12 @@ op_library(sequence_pool_op DEPS sequence_pooling) op_library(lstm_op DEPS sequence2batch lstm_compute) op_library(conv_transpose_op DEPS vol2col) op_library(gru_op DEPS sequence2batch gru_compute) -if(WITH_TESTING) - op_library(dynamic_recurrent_op SRCS dynamic_recurrent_op.cc rnn/recurrent_op_utils.cc - DEPS net_op tensor_array gtest) -else() - op_library(dynamic_recurrent_op SRCS dynamic_recurrent_op.cc rnn/recurrent_op_utils.cc - DEPS net_op tensor_array) -endif() op_library(recurrent_op SRCS recurrent_op.cc DEPS executor) +# FIXME(typhoonzero): save/load depends lodtensor serialization functions +op_library(save_op DEPS lod_tensor) +op_library(load_op DEPS lod_tensor) + list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) foreach(src ${GENERAL_OPS}) op_library(${src}) @@ -241,15 +264,15 @@ endforeach() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") + + cc_test(gather_test SRCS gather_test.cc DEPS tensor) cc_test(net_op_test SRCS net_op_test.cc DEPS net_op) cc_test(scatter_test SRCS scatter_test.cc DEPS tensor) cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_tensor) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memory) -cc_test(dynamic_recurrent_op_test SRCS dynamic_recurrent_op_test.cc - rnn/recurrent_op_utils.cc - DEPS dynamic_recurrent_op) if(WITH_GPU) cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) endif() cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) +cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op recv_op sum_op executor) diff --git a/paddle/operators/activation_op.cc b/paddle/operators/activation_op.cc index c66d575d24bb6b410602c34965ab1db6bc81b41d..154c618e8e7c4650b7f22684d3357de9c52a416c 100644 --- a/paddle/operators/activation_op.cc +++ b/paddle/operators/activation_op.cc @@ -223,6 +223,51 @@ $y = |x|$ } }; +class CeilOpMaker : public framework::OpProtoAndCheckerMaker { + public: + CeilOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "Input of Ceil operator"); + AddOutput("Y", "Output of Ceil operator"); + AddComment(R"DOC( +Ceil Activation Operator. 
+
+$y = ceil(x)$
+
+)DOC");
+  }
+};
+
+class FloorOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  FloorOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Floor operator");
+    AddOutput("Y", "Output of Floor operator");
+    AddComment(R"DOC(
+Floor Activation Operator.
+
+$y = floor(x)$
+
+)DOC");
+  }
+};
+
+class RoundOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  RoundOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "Input of Round operator");
+    AddOutput("Y", "Output of Round operator");
+    AddComment(R"DOC(
+Round Activation Operator.
+
+$y = [x]$
+
+)DOC");
+  }
+};
+
 class ReciprocalOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   ReciprocalOpMaker(framework::OpProto *proto,
@@ -493,6 +538,15 @@ REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad,
 REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad,
             ops::ActivationOpGrad);
 
+REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(floor, ops::ActivationOp, ops::FloorOpMaker, floor_grad,
+            ops::ActivationOpGrad);
+
+REGISTER_OP(round, ops::ActivationOp, ops::RoundOpMaker, round_grad,
+            ops::ActivationOpGrad);
+
 REGISTER_OP(reciprocal, ops::ActivationOp, ops::ReciprocalOpMaker,
             reciprocal_grad, ops::ActivationOpGrad);
 
diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h
index ceb4b4e40b67473f42e67e3f02f8e012e1b1eb50..8cd3bfbbd3f8f3210f94aef3a1586c8295730c1d 100644
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
@@ -283,6 +283,41 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
+// ceil(x): round x up to the nearest integer
+template <typename T>
+struct CeilFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = x.ceil();
+  }
+};
+
+// ceil/floor/round have zero gradient almost everywhere; 0 / x
+// preserves that and yields NaN at x == 0.
+template <typename T>
+struct ZeroGradFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y, typename dY, typename dX>
+  void operator()(Device d, X x, Y y, dY dy, dX dx) const {
+    dx.device(d) = static_cast<T>(0) / x;
+  }
+};
+
+// floor(x): round x down to the nearest integer
+template <typename T>
+struct FloorFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = x.floor();
+  }
+};
+
+// round(x) = [x]: round x to the nearest integer
+template <typename T>
+struct RoundFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename X, typename Y>
+  void operator()(Device d, X x, Y y) const {
+    y.device(d) = x.round();
+  }
+};
+
 // abs(x) = |x|
 template <typename T>
 struct AbsFunctor : public BaseActivationFunctor<T> {
@@ -677,6 +712,9 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
   __macro(softshrink, SoftShrinkFunctor, SoftShrinkGradFunctor);  \
   __macro(sqrt, SqrtFunctor, SqrtGradFunctor);                    \
   __macro(abs, AbsFunctor, AbsGradFunctor);                       \
+  __macro(ceil, CeilFunctor, ZeroGradFunctor);                    \
+  __macro(floor, FloorFunctor, ZeroGradFunctor);                  \
+  __macro(round, RoundFunctor, ZeroGradFunctor);                  \
   __macro(reciprocal, ReciprocalFunctor, ReciprocalGradFunctor);  \
   __macro(log, LogFunctor, LogGradFunctor);                       \
   __macro(square, SquareFunctor, SquareGradFunctor);              \
diff --git a/paddle/operators/array_operator.h b/paddle/operators/array_operator.h
index 233a81198e336d3190565fb18556f96979cec0ce..1f2b4fdb4b4a99d5baf5de1cc226dc196ab4eb2e 100644
--- a/paddle/operators/array_operator.h
+++ b/paddle/operators/array_operator.h
@@ -36,7 +36,7 @@ class ArrayOp : public framework::OperatorBase {
     if (platform::is_gpu_place(i_tensor.place())) {
// FIXME: Avoid copy from GPU to CPU framework::Tensor t; - t.CopyFrom(i_tensor, platform::CPUPlace(), dev_ctx); + framework::CopyFrom(i_tensor, platform::CPUPlace(), dev_ctx, &t); dev_ctx.Wait(); offset = static_cast(*t.data()); } else { diff --git a/paddle/operators/array_to_lod_tensor_op.cc b/paddle/operators/array_to_lod_tensor_op.cc index c0903bb4e5ca7f160e19eefab99af7e3e4a8ed76..faeba7f3ed26d05de16775a1de4d42f802111207 100644 --- a/paddle/operators/array_to_lod_tensor_op.cc +++ b/paddle/operators/array_to_lod_tensor_op.cc @@ -102,8 +102,9 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { if (len == 0) { continue; } - out->Slice(out_offset, out_offset + len) - .CopyFrom(x[x_idx].Slice(start_offset, end_offset), place, dev_ctx); + auto slice = out->Slice(out_offset, out_offset + len); + framework::CopyFrom(x[x_idx].Slice(start_offset, end_offset), place, + dev_ctx, &slice); out_offset += len; } } diff --git a/paddle/operators/assign_op.cc b/paddle/operators/assign_op.cc index 609e915b932e2bc4d5abee1e5f868cc07a7619d3..0a37f18729a93b15623c0a17e3689e518c38b844 100644 --- a/paddle/operators/assign_op.cc +++ b/paddle/operators/assign_op.cc @@ -43,7 +43,8 @@ class AssignFunctor { out_rows.set_rows(rows.rows()); out_rows.set_height(rows.height()); auto &t = rows.value(); - out_rows.mutable_value()->CopyFrom(t, t.place(), dev_ctx_); + auto *m = out_rows.mutable_value(); + framework::CopyFrom(t, t.place(), dev_ctx_, m); } template @@ -55,7 +56,7 @@ class AssignFunctor { void copy_tensor(const framework::LoDTensor &lod_tensor, framework::LoDTensor *out) const { auto &out_tensor = *out; - out_tensor.CopyFrom(lod_tensor, lod_tensor.place(), dev_ctx_); + CopyFrom(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor); out_tensor.set_lod(lod_tensor.lod()); } diff --git a/paddle/operators/batch_norm_op.cc b/paddle/operators/batch_norm_op.cc index f884e6efa917ce3f8554dce0e248f2b29273e3f3..ac97bd83ab7e7838871586cfe5acb832084b6cec 100644 --- a/paddle/operators/batch_norm_op.cc +++ b/paddle/operators/batch_norm_op.cc @@ -62,13 +62,14 @@ class BatchNormOp : public framework::OperatorWithKernel { const auto x_dims = ctx->GetInputDim("X"); const TensorFormat tensor_format = StringToTensorFormat(ctx->Attrs().Get("tensor_format")); + + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "Input X must have 2 to 5 dimensions."); + const int C = (tensor_format == TensorFormat::NCHW ? x_dims[1] : x_dims[x_dims.size() - 1]); - PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5, - "Input X must have 3 to 5 dimensions."); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], C); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); @@ -146,8 +147,8 @@ class BatchNormKernel : public framework::OpKernel { const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); - PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5, - "The Input dim size should be between 3 and 5"); + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); const int N = x_dims[0]; const int C = (tensor_format == TensorFormat::NCHW ? x_dims[1] @@ -339,8 +340,8 @@ class BatchNormGradKernel // Get the size for each dimension. 
// NCHW [batch_size, in_channels, in_height, in_width] const auto &x_dims = x->dims(); - PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5, - "The Input dim size should be between 3 and 5"); + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); const int N = x_dims[0]; const int C = (tensor_format == TensorFormat::NCHW ? x_dims[1] diff --git a/paddle/operators/batch_norm_op.cu.cc b/paddle/operators/batch_norm_op.cu.cc index 726d1ea1b8d7ced93f94bb0e5bb4df9e43b0ac7b..7b2f3187007fa2491afa75de1cde1910c6ce9bb8 100644 --- a/paddle/operators/batch_norm_op.cu.cc +++ b/paddle/operators/batch_norm_op.cu.cc @@ -29,14 +29,21 @@ void ExtractNCWHD(const framework::DDim &dims, const TensorFormat &tensor_format, int *N, int *C, int *H, int *W, int *D) { *N = dims[0]; - *C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1]; - *H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1]; - *W = dims.size() > 3 - ? (tensor_format == TensorFormat::NCHW ? dims[3] : dims[2]) - : 1; - *D = dims.size() > 4 - ? (tensor_format == TensorFormat::NCHW ? dims[4] : dims[3]) - : 1; + if (dims.size() == 2) { + *C = dims[1]; + *H = 1; + *W = 1; + *D = 1; + } else { + *C = tensor_format == TensorFormat::NCHW ? dims[1] : dims[dims.size() - 1]; + *H = tensor_format == TensorFormat::NCHW ? dims[2] : dims[1]; + *W = dims.size() > 3 + ? (tensor_format == TensorFormat::NCHW ? dims[3] : dims[2]) + : 1; + *D = dims.size() > 4 + ? (tensor_format == TensorFormat::NCHW ? dims[4] : dims[3]) + : 1; + } } template @@ -56,8 +63,8 @@ class BatchNormKernel : public framework::OpKernel { // NCHW [batch_size, in_channels, in_height, in_width] const auto *x = ctx.Input("X"); const auto &x_dims = x->dims(); - PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5, - "The Input dim size should be between 3 and 5"); + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); int N, C, H, W, D; ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D); @@ -180,8 +187,8 @@ class BatchNormGradKernel const auto &x_dims = x->dims(); - PADDLE_ENFORCE(x_dims.size() >= 3 && x_dims.size() <= 5, - "The Input dim size should be between 3 and 5"); + PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5, + "The Input dim size should be between 2 and 5"); int N, C, H, W, D; ExtractNCWHD(x_dims, tensor_format, &N, &C, &H, &W, &D); diff --git a/paddle/operators/beam_search_decode_op.h b/paddle/operators/beam_search_decode_op.h index 0f007ec22f9a66572971516a711317f348e1ec5a..3b1c6cd7a1045bfbb896725c79dc1ae2e22f43dc 100644 --- a/paddle/operators/beam_search_decode_op.h +++ b/paddle/operators/beam_search_decode_op.h @@ -232,12 +232,12 @@ void BeamSearchDecoder::ConvertSentenceVectorToLodTensor( id_tensor->set_lod(lod); id_tensor->Resize({static_cast(id_data.size())}); id_tensor->mutable_data(paddle::platform::CPUPlace()); - id_tensor->CopyFromVector(id_data, cpu_ctx); + framework::CopyFromVector(id_data, cpu_ctx, id_tensor); score_tensor->set_lod(lod); score_tensor->Resize({static_cast(score_data.size())}); score_tensor->mutable_data(paddle::platform::CPUPlace()); - score_tensor->CopyFromVector(score_data, cpu_ctx); + framework::CopyFromVector(score_data, cpu_ctx, score_tensor); } template diff --git a/paddle/operators/concat_op.cc b/paddle/operators/concat_op.cc index 5f052689251bc023df635d41c1e64a660a0aa488..6134ac78b145e0c9db0146a38f525204d9f11fed 100644 --- a/paddle/operators/concat_op.cc +++ b/paddle/operators/concat_op.cc @@ 
-25,7 +25,7 @@ class ConcatOp : public framework::OperatorWithKernel {
 
   void InferShape(framework::InferShapeContext *ctx) const override {
     PADDLE_ENFORCE_GE(ctx->Inputs("X").size(), 1UL,
-                      "Inputs(X) of ConcatOp should be empty.")
+                      "Inputs(X) of ConcatOp should not be empty.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of ConcatOp should not be null.");
 
@@ -45,7 +45,7 @@ class ConcatOp : public framework::OperatorWithKernel {
       }
       PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
                         "Input tensors should have the same "
-                        "elements except the specify axis.")
+                        "elements except the specified axis.");
     }
   }
   ctx->SetOutputDim("Out", out_dims);
diff --git a/paddle/operators/conv_cudnn_op.cc b/paddle/operators/conv_cudnn_op.cc
index c03dc3e4fb07ac6ecde42be93a1138d91778edf4..0dd8c13b2ad6ff206066ccb98a4c009e4c3b4fd0 100644
--- a/paddle/operators/conv_cudnn_op.cc
+++ b/paddle/operators/conv_cudnn_op.cc
@@ -17,10 +17,10 @@ namespace paddle {
 namespace operators {
 
-class CudnnConvOpMaker : public Conv2DOpMaker {
+class CudnnConv2DOpMaker : public Conv2DOpMaker {
  public:
-  CudnnConvOpMaker(framework::OpProto* proto,
-                   framework::OpAttrChecker* op_checker)
+  CudnnConv2DOpMaker(framework::OpProto* proto,
+                     framework::OpAttrChecker* op_checker)
       : Conv2DOpMaker(proto, op_checker) {
     AddAttr<int>("workspace_size_MB",
                  "workspace size for cudnn, in MB, "
@@ -32,16 +32,43 @@ class CudnnConvOpMaker : public Conv2DOpMaker {
   }
 };
 
+class CudnnConv3DOpMaker : public Conv3DOpMaker {
+ public:
+  CudnnConv3DOpMaker(framework::OpProto* proto,
+                     framework::OpAttrChecker* op_checker)
+      : Conv3DOpMaker(proto, op_checker) {
+    AddAttr<int>("workspace_size_MB",
+                 "workspace size for cudnn, in MB, "
+                 "workspace is a section of GPU memory which will be "
+                 "allocated/freed each time the operator runs, larger "
+                 "workspace size can increase performance but also requires "
+                 "better hardware. This size should be chosen carefully.")
+        .SetDefault(4096);
+  }
+};
+
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP(conv_cudnn, ops::ConvOp, ops::CudnnConvOpMaker, conv_cudnn_grad,
-            ops::ConvOpGrad);
+REGISTER_OP(conv2d_cudnn, ops::ConvOp, ops::CudnnConv2DOpMaker,
+            conv2d_cudnn_grad, ops::ConvOpGrad);
+
+REGISTER_OP(conv3d_cudnn, ops::ConvOp, ops::CudnnConv3DOpMaker,
+            conv3d_cudnn_grad, ops::ConvOpGrad);
+
+REGISTER_OP_CPU_KERNEL(conv2d_cudnn,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
+                       ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
+REGISTER_OP_CPU_KERNEL(
+    conv2d_cudnn_grad,
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
 
-REGISTER_OP_CPU_KERNEL(conv_cudnn,
+REGISTER_OP_CPU_KERNEL(conv3d_cudnn,
                        ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
                        ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
 REGISTER_OP_CPU_KERNEL(
-    conv_cudnn_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
+    conv3d_cudnn_grad,
+    ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
     ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
diff --git a/paddle/operators/conv_cudnn_op.cu.cc b/paddle/operators/conv_cudnn_op.cu.cc
index 5eaf6b33704eb371fff4b949c6cc32a7a5dbc812..3f97dc7ee0a61944a8a57314b5ec7f33df619bf3 100644
--- a/paddle/operators/conv_cudnn_op.cu.cc
+++ b/paddle/operators/conv_cudnn_op.cu.cc
@@ -56,6 +56,21 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
     ScopedFilterDescriptor filter_desc;
     ScopedConvolutionDescriptor conv_desc;
     DataLayout layout = DataLayout::kNCHW;
+    if (input->dims().size() == 5) {
+      layout = DataLayout::kNCDHW;
+    }
+
+    cudnnConvolutionDescriptor_t cudnn_conv_desc =
+        conv_desc.descriptor<T>(paddings, strides, dilations);
+
+#if CUDNN_VERSION_MIN(7, 0, 1)
+    // cudnn 7 can support groups, no need to do it manually
+    // FIXME(typhoonzero): find a better way to disable groups
+    // rather than setting it to 1.
+    PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount(
+        cudnn_conv_desc, groups));
+    groups = 1;
+#endif
 
     cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>(
         layout, framework::vectorize2int(input->dims()), groups);
@@ -63,19 +78,34 @@ class CudnnConvOpKernel : public framework::OpKernel<T> {
         layout, framework::vectorize2int(output->dims()), groups);
     cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>(
         layout, framework::vectorize2int(filter->dims()), groups);
-    cudnnConvolutionDescriptor_t cudnn_conv_desc =
-        conv_desc.descriptor<T>(paddings, strides, dilations);
 
     int input_channels = input->dims()[1];
-    int input_height = input->dims()[2];
-    int input_width = input->dims()[3];
-    int output_channels = output->dims()[1];
-    int output_height = output->dims()[2];
-    int output_width = output->dims()[3];
+    int input_height, input_width, input_depth;
+    if (input->dims().size() == 5) {
+      input_depth = input->dims()[2];
+      input_height = input->dims()[3];
+      input_width = input->dims()[4];
+    } else {  // dim size is enforced in InferShape
+      input_depth = 1;
+      input_height = input->dims()[2];
+      input_width = input->dims()[3];
+    }
+    int output_channels = filter->dims()[0];
+    int output_height, output_width, output_depth;
+    if (output->dims().size() == 5) {
+      output_depth = output->dims()[2];
+      output_height = output->dims()[3];
+      output_width = output->dims()[4];
+    } else {
+      output_depth = 1;
+      output_height = output->dims()[2];
+      output_width = output->dims()[3];
+    }
 
-    int group_offset_in = input_channels / groups * input_height * input_width;
+    int group_offset_in =
+        input_channels / groups * input_height * input_width * input_depth;
     int group_offset_out =
-        output_channels / groups * output_height * output_width;
+        output_channels / groups * output_height * output_width * output_depth;
output_width * output_depth; int group_offset_filter = filter->numel() / groups; // ------------------- cudnn conv workspace --------------------- void* cudnn_workspace = nullptr; @@ -138,12 +168,26 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> { // ------------------- cudnn descriptors --------------------- ScopedTensorDescriptor input_desc; ScopedTensorDescriptor output_grad_desc; - ScopedTensorDescriptor input_grad_desc; ScopedFilterDescriptor filter_desc; ScopedFilterDescriptor filter_grad_desc; ScopedConvolutionDescriptor conv_desc; DataLayout layout = DataLayout::kNCHW; + if (input->dims().size() == 5) { + layout = DataLayout::kNCDHW; + } + + cudnnConvolutionDescriptor_t cudnn_conv_desc = + conv_desc.descriptor<T>(paddings, strides, dilations); + +#if CUDNN_VERSION_MIN(7, 0, 1) + // cudnn 7 can support groups, no need to do it manually + // FIXME(typhoonzero): find a better way to disable groups + // rather than setting it to 1. + PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionGroupCount( + cudnn_conv_desc, groups)); + groups = 1; +#endif cudnnTensorDescriptor_t cudnn_input_desc = input_desc.descriptor<T>( layout, framework::vectorize2int(input->dims()), groups); @@ -152,22 +196,35 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> { layout, framework::vectorize2int(output_grad->dims()), groups); cudnnFilterDescriptor_t cudnn_filter_desc = filter_desc.descriptor<T>( layout, framework::vectorize2int(filter->dims()), groups); - cudnnTensorDescriptor_t cudnn_input_grad_desc = nullptr; - cudnnFilterDescriptor_t cudnn_filter_grad_desc = nullptr; - - cudnnConvolutionDescriptor_t cudnn_conv_desc = - conv_desc.descriptor<T>(paddings, strides, dilations); int input_channels = input->dims()[1]; - int input_height = input->dims()[2]; - int input_width = input->dims()[3]; + int input_height, input_width, input_depth; + if (input->dims().size() == 5) { + input_depth = input->dims()[2]; + input_height = input->dims()[3]; + input_width = input->dims()[4]; + } else { // dim size is enforced in InferShape + input_depth = 1; + input_height = input->dims()[2]; + input_width = input->dims()[3]; + } + int output_grad_channels = filter->dims()[0]; - int output_grad_height = output_grad->dims()[2]; - int output_grad_width = output_grad->dims()[3]; + int output_grad_height, output_grad_width, output_grad_depth; + if (input->dims().size() == 5) { + output_grad_depth = output_grad->dims()[2]; + output_grad_height = output_grad->dims()[3]; + output_grad_width = output_grad->dims()[4]; + } else { + output_grad_depth = 1; + output_grad_height = output_grad->dims()[2]; + output_grad_width = output_grad->dims()[3]; + } - int group_offset_in = input_channels / groups * input_height * input_width; - int group_offset_out = - output_grad_channels / groups * output_grad_height * output_grad_width; + int group_offset_in = + input_channels / groups * input_height * input_width * input_depth; + int group_offset_out = output_grad_channels / groups * output_grad_height * + output_grad_width * output_grad_depth; int group_offset_filter = filter->numel() / groups; // ------------------- cudnn backward algorithm --------------------- cudnnConvolutionBwdDataAlgo_t data_algo; @@ -180,8 +237,6 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> { auto handle = ctx.cuda_device_context().cudnn_handle(); if (input_grad) { - cudnn_input_grad_desc = input_grad_desc.descriptor<T>( - layout, framework::vectorize2int(input_grad->dims()), groups); PADDLE_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataAlgorithm( handle, cudnn_filter_desc, @@ -190,19 +245,17 @@ class CudnnConvGradOpKernel : public framework::OpKernel { cudnn_output_grad_desc, cudnn_conv_desc, // dxDesc: Handle to the previously initialized output tensor // descriptor. - cudnn_input_grad_desc, + cudnn_input_desc, CUDNN_CONVOLUTION_BWD_DATA_SPECIFY_WORKSPACE_LIMIT, workspace_size_limit, &data_algo)); PADDLE_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize( handle, cudnn_filter_desc, cudnn_output_grad_desc, - cudnn_conv_desc, cudnn_input_grad_desc, data_algo, &tmp_size)); + cudnn_conv_desc, cudnn_input_desc, data_algo, &tmp_size)); workspace_size_in_bytes = std::max(workspace_size_in_bytes, tmp_size); } if (filter_grad) { - cudnn_filter_grad_desc = filter_grad_desc.descriptor( - layout, framework::vectorize2int(filter_grad->dims()), groups); PADDLE_ENFORCE( platform::dynload::cudnnGetConvolutionBackwardFilterAlgorithm( handle, cudnn_input_desc, cudnn_output_grad_desc, cudnn_conv_desc, @@ -222,7 +275,6 @@ class CudnnConvGradOpKernel : public framework::OpKernel { platform::GPUPlace gpu = boost::get(ctx.GetPlace()); cudnn_workspace = paddle::memory::Alloc(gpu, workspace_size_in_bytes); // ------------------- cudnn conv backward data --------------------- - // FIXME(typhoonzero): template type T may not be the same as cudnn call. T alpha = 1.0f, beta = 0.0f; if (input_grad) { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); @@ -233,21 +285,20 @@ class CudnnConvGradOpKernel : public framework::OpKernel { handle, &alpha, cudnn_filter_desc, filter_data + i * group_offset_filter, cudnn_output_grad_desc, output_grad_data + i * group_offset_out, cudnn_conv_desc, data_algo, - cudnn_workspace, workspace_size_in_bytes, &beta, - cudnn_input_grad_desc, input_grad_data + i * group_offset_in)); + cudnn_workspace, workspace_size_in_bytes, &beta, cudnn_input_desc, + input_grad_data + i * group_offset_in)); } } // ------------------- cudnn conv backward filter --------------------- if (filter_grad) { T* filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); // Because beta is zero, it is unnecessary to reset filter_grad. 
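// cudnnConvolutionBackwardFilter blends its result as
//   dw = alpha * conv(x, dy) + beta * dw
// so with beta == 0 the previous contents of filter_grad are never read,
// which is why mutable_data() without a zero-fill is sufficient here.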
- for (int i = 0; i < groups; i++) { PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter( handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in, cudnn_output_grad_desc, output_grad_data + i * group_offset_out, cudnn_conv_desc, filter_algo, cudnn_workspace, - workspace_size_in_bytes, &beta, cudnn_filter_grad_desc, + workspace_size_in_bytes, &beta, cudnn_filter_desc, filter_grad_data + i * group_offset_filter)); } } @@ -259,8 +310,16 @@ } // namespace operators } // namespace paddle -REGISTER_OP_GPU_KERNEL(conv_cudnn, paddle::operators::CudnnConvOpKernel<float>, +REGISTER_OP_GPU_KERNEL(conv2d_cudnn, + paddle::operators::CudnnConvOpKernel<float>, + paddle::operators::CudnnConvOpKernel<double>); +REGISTER_OP_GPU_KERNEL(conv2d_cudnn_grad, + paddle::operators::CudnnConvGradOpKernel<float>, + paddle::operators::CudnnConvGradOpKernel<double>); + +REGISTER_OP_GPU_KERNEL(conv3d_cudnn, + paddle::operators::CudnnConvOpKernel<float>, paddle::operators::CudnnConvOpKernel<double>); -REGISTER_OP_GPU_KERNEL(conv_cudnn_grad, +REGISTER_OP_GPU_KERNEL(conv3d_cudnn_grad, paddle::operators::CudnnConvGradOpKernel<float>, paddle::operators::CudnnConvGradOpKernel<double>); diff --git a/paddle/operators/conv_op.cc b/paddle/operators/conv_op.cc index 7a36a9b21aa6a1b415ac5a232e65eda8051c87f8..462e6d9cbcbe61d9911efe8beff4446620e1e932 100644 --- a/paddle/operators/conv_op.cc +++ b/paddle/operators/conv_op.cc @@ -97,7 +97,7 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto, .SetDefault({0, 0}); AddAttr<int>( "groups", - "(int default:1), the group size of convolution operator. " + "(int default:1), the groups number of the convolution operator. " "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: " "when group=2, the first half of the filters is only connected to the " "first half of the input channels, while the second half of the filters " @@ -112,23 +112,29 @@ Conv2DOpMaker::Conv2DOpMaker(framework::OpProto* proto, Convolution Operator. The convolution operation calculates the output based on the input, filter -and strides, paddings, groups, dilations parameters. The size of each dimension of the +and strides, paddings, dilations, groups parameters. The size of each dimension of the parameters is checked in the infer-shape. -Input(Input, Filter) and output(Output) are in NCHW format. Where N is batch +Input(Input) and Output(Output) are in NCHW format. Where N is batch size, C is the number of channels, H is the height of the feature, and W is -the width of the feature. Parameters(ksize, strides, paddings, dilations) are two elements. -These two elements represent height and width, respectively. +the width of the feature. +Filters(Input) is in MCHW format. Where M is the number of output image channels, C is +the number of input image channels, H is the height of the filter, and W +is the width of the filter. +Parameters(strides, paddings, dilations) are two elements. These two elements represent +height and width, respectively. The input(X) size and output(Out) size may be different.
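The per-group pointer arithmetic used by the cudnn kernels earlier in this patch follows directly from this grouped layout: each group owns a contiguous slab of input channels, output channels, and filter elements, so group i is reached by adding i times a fixed offset to each base pointer. A standalone sketch with made-up sizes (all names here are ours, not Paddle's):

#include <cstdio>

// Per-group offsets for grouped convolution, as computed by the
// conv_cudnn kernels above when cuDNN < 7 loops over groups by hand.
int main() {
  const int groups = 2;
  const int c_in = 8, h_in = 32, w_in = 32;      // input slab per batch item
  const int c_out = 16, h_out = 32, w_out = 32;  // output slab (3x3, pad 1, stride 1)
  const int h_f = 3, w_f = 3;

  const int group_offset_in = c_in / groups * h_in * w_in;
  const int group_offset_out = c_out / groups * h_out * w_out;
  // Each output channel only sees c_in/groups input channels:
  const int filter_numel = c_out * (c_in / groups) * h_f * w_f;
  const int group_offset_filter = filter_numel / groups;

  for (int i = 0; i < groups; ++i) {
    // input + i * group_offset_in, output + i * group_offset_out and
    // filter + i * group_offset_filter are what each per-group
    // cudnnConvolutionForward call would receive.
    std::printf("group %d: in+%d out+%d filter+%d\n", i, i * group_offset_in,
                i * group_offset_out, i * group_offset_filter);
  }
  return 0;
}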
Example: Input: - Input shape: (N, C_in, H_in, W_in) - Filter shape: (C_out, C_in, H_f, W_f) + Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{out}, C_{in}, H_f, W_f)$ Output: - Output shape: (N, C_out, H_out, W_out) - where - H_out = (H_in + 2 * paddings[0] - (dilations[0]*(filter_size[0] - 1) + 1)) / strides[0] + 1; - W_out = (W_in + 2 * paddings[1] - (dilations[1]*(filter_size[1] - 1) + 1)) / strides[1] + 1; + Output shape: $(N, C_{out}, H_{out}, W_{out})$ + Where +$$ + H_{out}= \frac{(H_{in} + 2 * paddings[0] - (dilations[0] * (H_f - 1) + 1))}{strides[0]}+ 1 \\ + W_{out}= \frac{(W_{in} + 2 * paddings[1] - (dilations[1] * (W_f - 1) + 1))}{strides[1]}+ 1 +$$ )DOC"); } @@ -165,7 +171,7 @@ Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto, .SetDefault({0, 0, 0}); AddAttr<int>( "groups", - "(int default:1), the group size of convolution operator. " + "(int default:1), the groups number of the convolution operator. " "According to grouped convolution in Alex Krizhevsky's Deep CNN paper: " "when group=2, the first half of the filters is only connected to the " "first half of the input channels, while the second half of the filters " @@ -174,32 +180,37 @@ Conv3DOpMaker::Conv3DOpMaker(framework::OpProto* proto, AddAttr<std::vector<int>>("dilations", "(vector<int> default:{1, 1, 1}), the " "dilations(d_dilation, h_dilation, w_dilation) of " - "convolution operator. Currently, conv3d doesn't " - "support dilation.") + "convolution operator.") .SetDefault({1, 1, 1}); AddComment(R"DOC( Convolution3D Operator. The convolution operation calculates the output based on the input, filter -and strides, paddings, groups parameters. The size of each dimension of the +and strides, paddings, dilations, groups parameters. The size of each dimension of the parameters is checked in the infer-shape. -Input(Input, Filter) and output(Output) are in NCDHW format. Where N is batch +Input(Input) and output(Output) are in NCDHW format, where N is batch size, C is the number of channels, D is the depth of the feature, H is the height of -the feature, and W is the width of the feature. Parameters(ksize, strides, paddings) -are three elements. These three elements represent depth, height and width, respectively. +the feature, and W is the width of the feature. +Filters(Input) is in MCDHW format, where M is the number of output image channels, +C is the number of input image channels, D is the depth of the filter, +H is the height of the filter, and W is the width of the filter. +Parameters(strides, paddings, dilations) are three elements. These three elements +represent depth, height and width, respectively. The input(X) size and output(Out) size may be different.
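The $H_{out}$/$W_{out}$ expressions above (and the depth analogue in the example that follows) reduce to the same one-axis integer arithmetic. A quick standalone sanity check of that formula; the helper name is ours, not the operator's code:

#include <cassert>
#include <cstdio>

// Output size along one axis for the dilated convolution described above.
// Matches: out = (in + 2*pad - (dilation*(ksize-1) + 1)) / stride + 1
int ConvOutputSize(int in, int ksize, int dilation, int pad, int stride) {
  int dkernel = dilation * (ksize - 1) + 1;  // effective kernel extent
  assert(in + 2 * pad >= dkernel);
  return (in + 2 * pad - dkernel) / stride + 1;
}

int main() {
  // 224 input, 3x3 filter, stride 1, pad 1, dilation 1 -> 224 (shape kept).
  std::printf("%d\n", ConvOutputSize(224, 3, 1, 1, 1));
  // dilation 2 enlarges the effective kernel to 5 -> 222.
  std::printf("%d\n", ConvOutputSize(224, 3, 2, 1, 1));
  return 0;
}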
Example: Input: - Input shape: (N, C_in, D_in, H_in, W_in) - Filter shape: (C_out, C_in, D_f, H_f, W_f) + Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{out}, C_{in}, D_f, H_f, W_f)$ Output: - Output shape: (N, C_out, D_out, H_out, W_out) - where - D_out = (D_in - filter_size[0] + 2 * paddings[0]) / strides[0] + 1; - H_out = (H_in - filter_size[1] + 2 * paddings[1]) / strides[1] + 1; - W_out = (W_in - filter_size[2] + 2 * paddings[2]) / strides[2] + 1; + Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$ + Where + $$ + D_{out}= \frac{(D_{in} + 2 * paddings[0] - (dilations[0] * (D_f - 1) + 1))}{ strides[0]}+ 1 \\ + H_{out}= \frac{(H_{in} + 2 * paddings[1] - (dilations[1] * (H_f - 1) + 1))}{ strides[1]}+ 1 \\ + W_{out}= \frac{(W_{in} + 2 * paddings[2] - (dilations[2] * (W_f - 1) + 1))}{ strides[2]}+ 1 + $$ )DOC"); } diff --git a/paddle/operators/conv_transpose_op.cc b/paddle/operators/conv_transpose_op.cc index 3e55ef036a7fb976117054574d1347fa943acd55..678b192dea78fc6b4a6b54c4bb09a55dfb8f9c38 100644 --- a/paddle/operators/conv_transpose_op.cc +++ b/paddle/operators/conv_transpose_op.cc @@ -39,7 +39,7 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { "ConvTransposeOp input dimension and strides dimension should " "be consistent."); PADDLE_ENFORCE_EQ(paddings.size(), strides.size(), - "ConvTransposeOp paddings dimension and Conv strides " + "ConvTransposeOp paddings dimension and strides " "dimension should be the same."); PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[0], "In ConvTransposeOp, The input channel should be the same " @@ -62,24 +62,25 @@ Conv2DTransposeOpMaker::Conv2DTransposeOpMaker( "The format of input tensor is NCHW. Where N is batch size, C is the " "number of input channels, H is the height of the feature, and " "W is the width of the feature."); - AddInput("Filter", - "(Tensor) The filter tensor of convolution transpose operator. " - "The format of the filter tensor is CMHW, where C is the number of " - "output image channels, M is the number of input image channels, " - "H is the height of the filter, and W is the width of the filter. " - "We enforce groups number == 1 and padding == 0 in " - "the convolution transpose scenario."); + AddInput( + "Filter", + "(Tensor) The filter tensor of convolution transpose operator. " + "The format of the filter tensor is MCHW, where M is the number of " + "input feature channels, C is the number of " + "output feature channels, " + "H is the height of the filter, and W is the width of the filter. " + "We enforce groups number == 1 in the convolution transpose scenario."); AddOutput("Output", "(Tensor) The output tensor of convolution transpose operator. " "The format of output tensor is also NCHW."); AddAttr<std::vector<int>>( "strides", - "(vector<int> defalut:{1, 1}), the strides(h_stride, w_stride) of " + "(vector<int> default:{1, 1}), the strides(h_stride, w_stride) of " "convolution transpose operator.") .SetDefault({1, 1}); AddAttr<std::vector<int>>( "paddings", - "(vector<int> defalut:{0, 0}), the paddings(h_pad, w_pad) of convolution " + "(vector<int> default:{0, 0}), the paddings(h_pad, w_pad) of convolution " "transpose operator.") .SetDefault({0, 0}); AddComment(R"DOC( @@ -88,21 +89,26 @@ Convolution2D Transpose Operator. The convolution transpose operation calculates the output based on the input, filter and strides, paddings, groups parameters. The size of each dimension of the parameters is checked in the infer-shape. - -Input(Input, Filter) and output(Output) are in NCHW format.
Where N is batch -size, C is the number of channels, H is the height of the feature, and -W is the width of the feature. Parameters(ksize, strides, paddings) are two elements. -These two elements represent height and width, respectively. +Input(Input) and output(Output) are in NCHW format. Where N is batch size, C is the +number of channels, H is the height of the feature, and W is the width of the feature. +Filter(Input) is in MCHW format. Where M is the number of input feature channels, +C is the number of output feature channels, H is the height of the filter, +and W is the width of the filter. +Parameters(strides, paddings) are two elements. These two elements represent height +and width, respectively. The input(X) size and output(Out) size may be different. + Example: Input: - Input shape: (N, C_in, H_in, W_in) - Filter shape: (C_in, C_out, H_f, W_f) + Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{in}, C_{out}, H_f, W_f)$ Output: - Output shape: (N, C_out, H_out, W_out) - where - H_out = (H_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0]; - W_out = (W_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1]; + Output shape: $(N, C_{out}, H_{out}, W_{out})$ + Where + $$ + H_{out} = (H_{in} - 1) * strides[0] - 2 * paddings[0] + H_f \\ + W_{out} = (W_{in} - 1) * strides[1] - 2 * paddings[1] + W_f + $$ )DOC"); } @@ -117,8 +123,9 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker( "W is the width of the feature."); AddInput("Filter", "(Tensor) The filter tensor of convolution transpose operator." - "The format of the filter tensor is CMDHW, where C is the number of " - "output image channels, M is the number of input image channels, D " + "The format of the filter tensor is MCDHW, where M is the number of " + "input feature channels, C is the number of " + "output feature channels, D " "is the depth of the filter, H is the height of the filter, and " "W is the width of the filter." "We enforce groups number == 1 and padding == 0 in " @@ -130,12 +137,12 @@ Conv3DTransposeOpMaker::Conv3DTransposeOpMaker( "the number of channels, D is the depth of the feature, H is the " "height of the feature, and W is the width of the feature."); AddAttr<std::vector<int>>("strides", - "(vector<int> defalut:{1, 1, 1}), the " + "(vector<int> default:{1, 1, 1}), the " "strides{d_stride, h_stride, w_stride} of " "convolution transpose operator.") .SetDefault({1, 1, 1}); AddAttr<std::vector<int>>("paddings", - "(vector<int> defalut:{0, 0, 0}), paddings(d_pad, " + "(vector<int> default:{0, 0, 0}), paddings(d_pad, " "h_pad, w_pad) of convolution transpose operator.") .SetDefault({0, 0, 0}); AddComment(R"DOC( @@ -144,23 +151,28 @@ Convolution3D Transpose Operator. The convolution transpose operation calculates the output based on the input, filter and strides, paddings, groups parameters. The size of each dimension of the parameters is checked in the infer-shape. - -Input(Input, Filter) and output(Output) are in NCDHW format. Where N is batch -size, C is the number of channels, D is the depth of the feature, -H is the height of the feature, and W is the width of the feature. -Parameters(ksize, strides, paddings) are three elements. -These three elements represent depth, height and width, respectively. +Input(Input) and output(Output) are in NCDHW format. Where N is batch size, C is the +number of channels, D is the depth of the feature, H is the height of the feature, +and W is the width of the feature. +Filter(Input) is in MCDHW format.
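The transpose formulas just shown invert the forward relation: for unit dilation, out = (in - 1) * stride - 2 * pad + ksize recovers, up to stride rounding, an input size that the forward convolution would map back. A minimal standalone sketch (the helper name is ours):

#include <cstdio>

// Transposed-convolution output size along one axis, per the formula above:
// out = (in - 1) * stride - 2 * pad + ksize
int ConvTransposeOutputSize(int in, int ksize, int pad, int stride) {
  return (in - 1) * stride - 2 * pad + ksize;
}

int main() {
  // Upsampling a 7-wide feature map with a 4-wide filter, stride 2, pad 1 -> 14.
  std::printf("%d\n", ConvTransposeOutputSize(7, 4, 1, 2));
  return 0;
}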
Where M is the number of input feature channels, +C is the number of output feature channels, D is the depth of the filter, H is the +height of the filter, and W is the width of the filter. +Parameters(strides, paddings) are three elements. These three elements represent +depth, height and width, respectively. The input(X) size and output(Out) size may be different. -Example: + +Example: Input: - Input shape: (N, C_in, D_in, H_in, W_in) - Filter shape: (C_in, C_out, D_f, H_f, W_f) + Input shape: $(N, C_{in}, D_{in}, H_{in}, W_{in})$ + Filter shape: $(C_{in}, C_{out}, D_f, H_f, W_f)$ Output: - Output shape: (N, C_out, D_out, H_out, W_out) - where - D_out = (D_in - 1) * strides[0] - 2 * paddings[0] + filter_size[0]; - H_out = (H_in - 1) * strides[1] - 2 * paddings[1] + filter_size[1]; - W_out = (W_in - 1) * strides[2] - 2 * paddings[2] + filter_size[2]; + Output shape: $(N, C_{out}, D_{out}, H_{out}, W_{out})$ + Where + $$ + D_{out} = (D_{in} - 1) * strides[0] - 2 * paddings[0] + D_f \\ + H_{out} = (H_{in} - 1) * strides[1] - 2 * paddings[1] + H_f \\ + W_{out} = (W_{in} - 1) * strides[2] - 2 * paddings[2] + W_f + $$ )DOC"); } diff --git a/paddle/operators/conv_transpose_op.h b/paddle/operators/conv_transpose_op.h index 0fc0735788c499c2d520c0cc689e1ce07ba67ce8..1cacb770e6af3ad3c99ab81c5598ffcd228f59b2 100644 --- a/paddle/operators/conv_transpose_op.h +++ b/paddle/operators/conv_transpose_op.h @@ -63,7 +63,6 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> { std::vector<int> strides = context.Attr<std::vector<int>>("strides"); std::vector<int> paddings = context.Attr<std::vector<int>>("paddings"); - // TODO(Zhuoyuan): Paddings can be added in future. // groups will alway be disabled in conv2dtranspose. const int batch_size = static_cast<int>(input->dims()[0]); diff --git a/paddle/operators/detail/CMakeLists.txt b/paddle/operators/detail/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..f6bdc63cc2cfae526fe911ee4d989675452d5c5d --- /dev/null +++ b/paddle/operators/detail/CMakeLists.txt @@ -0,0 +1 @@ +grpc_library(sendrecvop_grpc SRCS recv_impl.cc send_impl.cc PROTO send_recv.proto DEPS lod_tensor selected_rows) diff --git a/paddle/operators/detail/recv_impl.cc b/paddle/operators/detail/recv_impl.cc new file mode 100644 index 0000000000000000000000000000000000000000..89dc5045221156eed7aa9411bc96ad86f91136d2 --- /dev/null +++ b/paddle/operators/detail/recv_impl.cc @@ -0,0 +1,44 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "send_recv_impl.h" + +namespace paddle { +namespace operators { +namespace detail { + +Status SendRecvServerImpl::SendVariable(ServerContext *context, + const VariableMessage *in_var, + VariableMessage *out_var) { + framework::LoDTensor t; + // TODO(typhoonzero): deserialize in_tensor and run pserver network. + std::istringstream iss(in_var->serialized()); + framework::DeserializeFromStream(iss, &t); + lodtensor_queue_.Push(std::move(t)); + // Block until the sub graph is done.
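The two queues make SendVariable a synchronous rendezvous: the gRPC thread parks the deserialized tensor, blocks on the return queue while the pserver side executes the sub-graph, and only then serializes the reply. A self-contained sketch of that handshake, using a stand-in for the SimpleBlockQueue defined later in this patch (the stand-in class and its names are ours):

#include <condition_variable>
#include <cstdio>
#include <deque>
#include <mutex>
#include <thread>

// Minimal stand-in for SimpleBlockQueue: Pop() blocks until a value arrives.
template <typename T>
class BlockQueue {
 public:
  void Push(T v) {
    {
      std::lock_guard<std::mutex> l(mu_);
      q_.push_front(std::move(v));
    }
    cv_.notify_one();
  }
  T Pop() {
    std::unique_lock<std::mutex> l(mu_);
    cv_.wait(l, [this] { return !q_.empty(); });
    T v = std::move(q_.back());
    q_.pop_back();
    return v;
  }

 private:
  std::mutex mu_;
  std::condition_variable cv_;
  std::deque<T> q_;
};

int main() {
  BlockQueue<int> in, out;
  // Worker thread plays the role of the pserver sub-graph.
  std::thread server([&] { out.Push(in.Pop() + 1); });
  in.Push(41);                    // hand the "tensor" to the worker
  std::printf("%d\n", out.Pop()); // blocks until the worker is done -> 42
  server.join();
  return 0;
}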
+ t = lodtensor_return_queue_.Pop(); + std::ostringstream oss; + // FIXME(typhoonzero): get context from op. + framework::SerializeToStream(oss, t, platform::CPUDeviceContext()); + std::string *varname = out_var->mutable_varname(); + *varname = in_var->varname(); + std::string *serialized = out_var->mutable_serialized(); + *serialized = oss.str(); + + return Status::OK; +} + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/detail/send_impl.cc b/paddle/operators/detail/send_impl.cc new file mode 100644 index 0000000000000000000000000000000000000000..da1ddf75d2afb85670c5ea0c9884376415f28208 --- /dev/null +++ b/paddle/operators/detail/send_impl.cc @@ -0,0 +1,54 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "send_recv_impl.h" + +namespace paddle { +namespace operators { +namespace detail { + +bool RPCClient::SendVariable(const framework::Scope& scope, + const std::string& inname, + const std::string& outname) { + ClientContext context; + VariableMessage msg, out_msg; + // FIXME(typhoonzero): pass device context to here. + auto ctx = platform::CPUDeviceContext(); + auto* var = scope.FindVar(inname); + PADDLE_ENFORCE(var); + // TODO(typhoonzero): support SelectedRows + PADDLE_ENFORCE(var->IsType(), + "Only support LoDTensor, %s has wrong type", inname); + const framework::LoDTensor& tensor = var->Get(); + std::ostringstream oss; + framework::SerializeToStream(oss, tensor, ctx); + msg.set_varname(inname); + msg.set_serialized(oss.str()); + Status status = stub_->SendVariable(&context, msg, &out_msg); + if (!status.ok()) { + return false; + } + std::istringstream iss(out_msg.serialized()); + framework::LoDTensor ret_tensor; + framework::DeserializeFromStream(iss, &ret_tensor); + auto* outvar = scope.FindVar(outname); + framework::LoDTensor* out_tensor = outvar->GetMutable(); + // FIXME(typhoonzero): do not copy. + framework::CopyFrom(ret_tensor, ctx.GetPlace(), ctx, out_tensor); + return true; +} + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/detail/send_recv.proto b/paddle/operators/detail/send_recv.proto new file mode 100644 index 0000000000000000000000000000000000000000..07ff9d2c621a2dfb51792821a0d3fc398c315835 --- /dev/null +++ b/paddle/operators/detail/send_recv.proto @@ -0,0 +1,35 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +syntax = "proto3"; + +package sendrecv; + +service SendRecvService { + // For parameter server round-robin like hashing, do not split tensors. + // Send and recv only one tensor + rpc SendVariable(VariableMessage) returns (VariableMessage) {} +} + +// VariableMessage is serialized paddle variable message. +// It can be: +// Tensor +// LoDTensor +// SelectedRows +message VariableMessage { + string varname = 1; + bytes serialized = 2; +} + +message VoidMessage {} diff --git a/paddle/operators/detail/send_recv_impl.h b/paddle/operators/detail/send_recv_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..b9a5340a8636db7b5d6ec7b21368632d3916b4aa --- /dev/null +++ b/paddle/operators/detail/send_recv_impl.h @@ -0,0 +1,87 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include "paddle/framework/data_type.h" +#include "paddle/framework/lod_tensor.h" +#include "paddle/framework/scope.h" +#include "paddle/framework/selected_rows.h" +#include "paddle/operators/detail/simple_block_queue.h" + +// #include +// #include +// #include +// #include +#include "paddle/operators/detail/send_recv.grpc.pb.h" +#include "paddle/operators/detail/send_recv.pb.h" + +#include + +using grpc::Channel; +using grpc::Server; +using grpc::ServerContext; +using grpc::ServerReader; +using grpc::ServerBuilder; + +using grpc::ClientContext; +using grpc::ClientReader; +using grpc::ClientReaderWriter; +using grpc::ClientWriter; +using grpc::Status; +using sendrecv::SendRecvService; +using sendrecv::VariableMessage; +using sendrecv::VoidMessage; + +namespace paddle { +namespace operators { +namespace detail { + +class SendRecvServerImpl final : public SendRecvService::Service { + public: + explicit SendRecvServerImpl() {} + + Status SendVariable(ServerContext *context, const VariableMessage *in_var, + VariableMessage *out_var) override; + + const framework::LoDTensor Get() { return this->lodtensor_queue_.Pop(); } + + void Push(const framework::LoDTensor &tensor) { + this->lodtensor_return_queue_.Push(tensor); + } + + private: + SimpleBlockQueue lodtensor_queue_; + SimpleBlockQueue lodtensor_return_queue_; + SimpleBlockQueue selected_rows_queue_; + SimpleBlockQueue selected_rows_return_queue_; +}; + +// RPCClient is a class to send tensors to pserver sub-network +// using different hashing methods. 
+class RPCClient { + public: + RPCClient(std::shared_ptr channel) + : stub_(SendRecvService::NewStub(channel)) {} + + bool SendVariable(const framework::Scope &scope, const std::string &inname, + const std::string &outname); + + private: + std::unique_ptr stub_; +}; + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/detail/simple_block_queue.h b/paddle/operators/detail/simple_block_queue.h new file mode 100644 index 0000000000000000000000000000000000000000..44899217579532af2c1d2e6074ec0e08231e7b86 --- /dev/null +++ b/paddle/operators/detail/simple_block_queue.h @@ -0,0 +1,52 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once + +#include +#include +#include + +namespace paddle { +namespace operators { +namespace detail { + +template +class SimpleBlockQueue { + private: + std::mutex mutex_; + std::condition_variable condition_; + std::deque queue_; + + public: + void Push(T const& value) { + { + std::unique_lock lock(this->mutex_); + queue_.push_front(value); + } + this->condition_.notify_one(); + } + + T Pop() { + std::unique_lock lock(this->mutex_); + this->condition_.wait(lock, [=] { return !this->queue_.empty(); }); + T rc(std::move(this->queue_.back())); + this->queue_.pop_back(); + return rc; + } +}; + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/dynamic_recurrent_op.cc b/paddle/operators/dynamic_recurrent_op.cc deleted file mode 100644 index d48cc4e8df587708ab93e7d788145adc01c1d3e5..0000000000000000000000000000000000000000 --- a/paddle/operators/dynamic_recurrent_op.cc +++ /dev/null @@ -1,418 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve . - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/operators/dynamic_recurrent_op.h" - -#include "paddle/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using framework::Scope; -using framework::TensorArray; -using framework::LoDTensor; -using framework::Variable; -using framework::OperatorBase; -using framework::DySeqMetaBatch; - -namespace detail { - -inline void CreateVariables(Scope& scope, - const std::vector& var_names) { - for (const auto& name : var_names) { - scope.Var(name); - } -} - -/* - * The inputs with sequence should be reordered when they are split, so the - * boot_states should be reordered in the same order. 
- * - * NOTE This may require that the `pre_state` of the first time step should just - * copy the `boot_state` rather than reference it, for that the content should - * be reordered, but the RNN op should not change the `boot_state` as an input - * variable's content. - */ -inline void ReorderInitialState(const DySeqMetaBatch& metas, - const LoDTensor& boot_state, LoDTensor* tensor, - const platform::Place& dst_place) { - for (size_t seq_id = 0; seq_id < metas.size(); seq_id++) { - auto slice = tensor->Slice(seq_id, seq_id + 1); - auto boot_slice = - boot_state.Slice(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1); - // TODO(superjom) pass in device context as an argument - slice.CopyFrom(boot_slice, dst_place, platform::CPUDeviceContext()); - } -} - -inline void RestoreInitialState(const DySeqMetaBatch& metas, - const LoDTensor& tensor, LoDTensor* boot_state, - const platform::Place& dst_place) { - for (size_t seq_id = 0; seq_id < metas.size(); seq_id++) { - auto slice = tensor.Slice(seq_id, seq_id + 1); - auto boot_slice = - boot_state->Slice(metas[seq_id].ori_idx, metas[seq_id].ori_idx + 1); - boot_slice.CopyFrom(slice, dst_place, platform::CPUDeviceContext()); - } -} - -} // namespace detail - -// Implementation for forward propagation. -template <> -void RNNAlgorithm::Run( - const framework::Scope& scope, const framework::OperatorBase& op, - const platform::DeviceContext& dev_ctx) { - SetComputeMode(ComputeMode::kForward); - cache_.Init(kArgNames[mode_], op, scope, &dev_ctx, &arg_); - SplitInputs(); - CreateScopes(); - WriteStepInputs(); - InitStates(); - WriteStepOutputs(); - RunSteps(); - ConcatOutputs(); -} - -// Implementation for backward propagation. -template <> -void RNNAlgorithm::Run( - const framework::Scope& scope, const framework::OperatorBase& op, - const platform::DeviceContext& dev_ctx) { - SetComputeMode(ComputeMode::kBackward); - cache_.Init(kArgNames[mode_], op, scope, &dev_ctx, &arg_); - SplitInputs(); - WriteStepInputs(); - InitStates(); - WriteStepOutputs(); - RunSteps(); - // copy boot-states' gradients back. 
- for (const auto& state : arg_.states) { - ExportInitialStateGradient(state); - } - - ConcatOutputs(); -} - -void RNNAlgorithm::SplitInputs() { - // TODO(superjom) make level a config - // TODO(superjom) check all the inputs has the same LoD - int level = 0; - for (const auto& item : cache_.inputs) { - const auto& var = item.second; - const auto& tensor = var->Get(); - TensorArray& ta = step_inputs_[item.first]; - - dy_seq_metas_[item.first] = - ta.Unpack(tensor, level, true /*length_descend*/); - - if (cache_.num_steps) { - PADDLE_ENFORCE_EQ(ta.size(), cache_.num_steps, - "inputs should have the same steps"); - } else { - cache_.num_steps = ta.size(); - } - } -} - -void RNNAlgorithm::WriteStepInputs() { - for (const auto& item : cache_.inputs) { - auto ta_it = step_inputs_.find(item.first); - PADDLE_ENFORCE(ta_it != step_inputs_.end(), - "step_inputs_ not compatible with memory set"); - TensorArray& ta = ta_it->second; - for (size_t step = 0; step < ta.size(); step++) { - auto tensor = ta.Read(step); - auto& step_scope = cache_.GetScope(step); - Variable* var = step_scope.FindVar(item.first); - if (var == nullptr) { - var = step_scope.Var(item.first); - } - var->GetMutable()->ShareDataWith(tensor); - } - } -} - -void RNNAlgorithm::WriteStepOutputs() { - // initialize step outputs - for (const auto& item : cache_.outputs) { - step_outputs_.emplace(item.first, TensorArray()); - } - PADDLE_ENFORCE_GT(step_outputs_.size(), 0UL); -} - -void RNNAlgorithm::CreateScopes() { - PADDLE_ENFORCE_GT(cache_.num_steps, 0); - // resize scopes - size_t num_scopes_need_create = cache_.num_steps - cache_.scopes->size(); - for (size_t i = 0; i < num_scopes_need_create; i++) { - cache_.scopes->emplace_back(&cache_.scope->NewScope()); - } - - // init temporary inputs - PADDLE_ENFORCE_NOT_NULL(step_unit_, "stepnet should be set first"); - std::vector states; - std::vector ex_states; - std::vector step_unit_outputs; - std::transform(arg_.states.begin(), arg_.states.end(), - std::back_inserter(states), - [](const rnn::StateAttr& m) { return m.var; }); - std::transform(arg_.states.begin(), arg_.states.end(), - std::back_inserter(ex_states), - [](const rnn::StateAttr& m) { return m.pre_var; }); - for (const auto& item : step_unit_->Outputs()) { - for (const auto& var : item.second) { - step_unit_outputs.push_back(var); - } - } - - for (size_t step = 0; step < cache_.num_steps; step++) { - auto& scope = cache_.GetScope(step); - detail::CreateVariables(scope, arg_.inlinks); - detail::CreateVariables(scope, arg_.outlinks); - detail::CreateVariables(scope, states); - detail::CreateVariables(scope, ex_states); - detail::CreateVariables(scope, step_unit_outputs); - } -} - -void RNNAlgorithm::ConcatOutputs() { - // TODO(superjom) transform this to a config - int level = 0; - for (size_t step = 0; step < cache_.num_steps; step++) { - auto& scope = cache_.GetScope(step); - for (auto& item : step_outputs_) { - auto* var = scope.FindVar(item.first); - PADDLE_ENFORCE_NOT_NULL(var); - auto* tensor = var->GetMutable(); - tensor->mutable_data(platform::CPUPlace()); - item.second.WriteShared(step, *tensor); - } - } - // the inputs' lods should be the same, so randomly get one lod. 
- const auto& some_lod = - cache_.scope->FindVar(arg_.inlinks.front())->Get().lod(); - const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()]; - for (auto& item : step_outputs_) { - auto tensor = item.second.Pack(level, some_meta, some_lod); - auto* output = cache_.outputs[item.first]->GetMutable(); - const_cast(output)->ShareDataWith(tensor); - } -} - -void RNNAlgorithm::RunSteps() { - if (IsBackward()) { - // call stepnet in all the time steps reversely - for (int step = cache_.num_steps - 1; step >= 0; step--) { - auto& step_scope = cache_.GetScope(step); - step_unit_->Run(step_scope, *cache_.dev_ctx); - } - } else { - for (size_t step = 0; step < cache_.num_steps; step++) { - auto& step_scope = cache_.GetScope(step); - step_unit_->Run(step_scope, *cache_.dev_ctx); - } - } -} - -void RNNAlgorithm::InitStates() { - for (size_t step = 0; step < cache_.num_steps; step++) { - for (const auto& state : arg_.states) { - CreateState(state, step); - LinkState(state, step); - } - } -} - -void RNNAlgorithm::CreateState(const rnn::StateAttr& state_attr, size_t step) { - auto& scope = cache_.GetScope(step); - auto& state = *cache_.GetTensor(scope, state_attr.var); - auto& boot_state = *cache_.GetTensor(*cache_.scope, state_attr.boot_var); - - size_t num_instances = - step_inputs_[arg_.inlinks.front()].Read(step).dims()[0]; - auto dims = boot_state.dims(); - dims[0] = num_instances; - - state.Resize(dims); - state.mutable_data(platform::CPUPlace()); - states_[state_attr.var].WriteShared(step, state); -} - -void RNNAlgorithm::LinkState(const rnn::StateAttr& state, size_t step) { - auto& scope = cache_.GetScope(step); - auto& state_pre = *cache_.GetTensor(scope, state.pre_var); - - // process the first state's boot-state(the 0-step in forward mode or the - // last step in backward mode) - // Only forward mode need to link the boot-state to the `pre-state` in first - // time step. In backward mode, need to copy the gradient of `pre-state` in - // first time step to the gradient of `boot-state`. - if (step == 0 && IsForward()) { - LinkInitialState(state); - } else { - size_t num_instances = - step_inputs_[arg_.inlinks.front()].Read(step).dims()[0]; - auto* pre_state = cache_.GetTensor(cache_.GetScope(step - 1), state.var); - // shink and share from previous state - auto shrinked_pre_state = pre_state->Slice(0, num_instances); - state_pre.ShareDataWith(shrinked_pre_state); - } -} - -void RNNAlgorithm::LinkInitialState(const rnn::StateAttr& state) { - // all the step_inputs' metas should be the same, just randomly select one - // and get the dyseq meta. - const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()]; - auto& scope = cache_.GetScope(0); - auto& state_pre = *cache_.GetTensor(scope, state.pre_var); - auto* pre_state = cache_.GetTensor(*cache_.scope, state.boot_var); - pre_state->mutable_data(platform::CPUPlace()); - // allocate state - state_pre.Resize(pre_state->dims()); - state_pre.mutable_data(platform::CPUPlace()); - detail::ReorderInitialState(some_meta, *pre_state, &state_pre, - pre_state->place()); -} - -void RNNAlgorithm::ExportInitialStateGradient(const rnn::StateAttr& state) { - // all the step_inputs' metas should be the same, just randomly select one - // and get the dyseq meta. 
- const auto& some_meta = dy_seq_metas_[arg_.inlinks.front()]; - auto& scope = cache_.GetScope(0); - - auto& state_pre = *cache_.GetTensor(scope, state.pre_var); - auto& pre_state = *cache_.GetTensor(*cache_.scope, state.boot_var); - pre_state.Resize(state_pre.dims()); - detail::RestoreInitialState(some_meta, state_pre, &pre_state, - pre_state.place()); -} - -void RNNAlgorithm::ArgCache::Init(const rnn::ArgumentName& name, - const paddle::framework::OperatorBase& op, - const paddle::framework::Scope& scope, - platform::DeviceContext const* dev_ctx, - rnn::Argument* arg) { - this->scope = &scope; - InitArgument(name, op, arg); - CacheScopes(scope, *arg); - CacheInlinks(scope, arg->inlinks); - CacheOutlinks(scope, arg->outlinks); - this->dev_ctx = dev_ctx; -} - -void RNNAlgorithm::ArgCache::InitArgument(const rnn::ArgumentName& name, - const OperatorBase& op, - rnn::Argument* arg) { - rnn::InitArgument(name, arg, op, false /*is_grad*/); -} - -void RNNAlgorithm::ArgCache::CacheScopes(const Scope& scope, - const rnn::Argument& arg) { - auto scopes_var = scope.FindVar(arg.step_scopes); - PADDLE_ENFORCE(scopes_var != nullptr, - "the step_scopes output argument [%s] should be created first " - "by framework.", - arg.step_scopes); - this->scopes = scopes_var->GetMutable>(); -} - -void RNNAlgorithm::ArgCache::CacheInlinks( - const Scope& scope, const std::vector& names) { - for (auto name : names) { - auto* var = GetVariable(scope, name); - inputs[name] = var; - } -} - -void RNNAlgorithm::ArgCache::CacheOutlinks( - const Scope& scope, const std::vector& names) { - for (auto name : names) { - auto* var = GetVariable(scope, name); - outputs[name] = var; - } -} - -Variable* RNNAlgorithm::ArgCache::GetVariable(const Scope& scope, - const std::string& name) { - auto* var = scope.FindVar(name); - PADDLE_ENFORCE_NOT_NULL(var, "variable [%s] not exist in scope", name); - return var; -} - -LoDTensor* RNNAlgorithm::ArgCache::GetTensor(const framework::Scope& scope, - const std::string& name) { - auto* var = GetVariable(scope, name); - return var->GetMutable(); -} - -const std::array RNNAlgorithm::kArgNames{ - {rnn::ArgumentName{"step_unit", "step_scopes", "inputs", "outputs", - "states", "ex_states", "initial_states"}, - rnn::ArgumentName{"step_unit", "step_scopes@GRAD", "outputs@GRAD", - "inputs@GRAD", "states", "ex_states", - "initial_states@GRAD"}}}; - -void DynamicRecurrentOp::Run(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const { - rnn.Run( - scope, *dynamic_cast(this), dev_ctx); -} - -void DynamicRecurrentGradientOp::Run( - const Scope& scope, const platform::DeviceContext& dev_ctx) const { - rnn.Run( - scope, *dynamic_cast(this), dev_ctx); -} - -class DynamicRecurrentOpProtoAndCheckerMaker - : public framework::OpProtoAndCheckerMaker { - public: - DynamicRecurrentOpProtoAndCheckerMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) - : OpProtoAndCheckerMaker(proto, op_checker) { - const auto& name = - RNNAlgorithm::kArgNames[RNNAlgorithm::ComputeMode::kForward]; - // inputs and outputs stored in proto - AddInput(name.inlinks, - "The inputs that need to be segmented for each step.") - .AsDuplicable(); - AddInput(name.initial_states, "Variables to initialize the states.") - .AsDuplicable(); - - AddOutput(name.outlinks, - "The outputs that need to be concatenated for all steps.") - .AsDuplicable(); - AddOutput(name.step_scopes, "step scopes"); - - // Attributes stored in AttributeMap - AddAttr>(name.ex_states, "names of ex_states"); - 
AddAttr>(name.states, "names of states"); - - AddComment(R"DOC( -Dynamic Recurrent Operator. - -This is a RNN operator for varience-length sequences. - -)DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP(dynamic_recurrent, paddle::operators::DynamicRecurrentOp, - paddle::operators::DynamicRecurrentOpProtoAndCheckerMaker, - dynamic_recurrent_grad, - paddle::operators::DynamicRecurrentGradientOp); diff --git a/paddle/operators/dynamic_recurrent_op.h b/paddle/operators/dynamic_recurrent_op.h deleted file mode 100644 index 5b0548c3a44c9f58838ecc567ee41a587883c26a..0000000000000000000000000000000000000000 --- a/paddle/operators/dynamic_recurrent_op.h +++ /dev/null @@ -1,233 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once - -#ifdef PADDLE_WITH_TESTING -#include "gtest/gtest.h" -#endif - -#include "paddle/framework/lod_tensor.h" -#include "paddle/framework/operator.h" -#include "paddle/framework/tensor_array.h" -#include "paddle/framework/variable.h" -#include "paddle/operators/rnn/recurrent_op_utils.h" - -namespace paddle { -namespace operators { - -class RNNAlgorithm { - public: - enum ComputeMode { kForward = 0, kBackward = 1 }; - static const std::array kArgNames; - using value_type = float; - - /* - * Different `Run` method for forward and backward, `_` is just for template - * specifialization. - */ - template - void Run(const framework::Scope& scope, const framework::OperatorBase& op, - const platform::DeviceContext& dev_ctx); - /* - * Split the inputs(LoDTensors) to segments for each time step. - */ - void SplitInputs(); - - /* - * Create step-scopes to store temporary outputs in each time steps. - */ - void CreateScopes(); - - /* - * Link TensorArray steps to the corresponding variables located in - * step-scopes. - */ - void WriteStepInputs(); - - /* - * Write output of each step to the corresponding TensorArray. - */ - void WriteStepOutputs(); - - /* - * Initialize the states, each state will have a corresponding pre-state, - * which share the memory with the state in the previous time state. The - * pre-state in the first time step will be initialized with an zero tensor or - * a tensor in parent scope if is provided. - */ - void InitStates(); - - /* - * Create state variables for each time step. - */ - void CreateState(const rnn::StateAttr& state, size_t step); - - /* - * Link pre-state variable in current scope to the state variable in the - * previous time step (scope) by reference. - */ - void LinkState(const rnn::StateAttr& state, size_t step); - - /* - * Link the pre-state of the first time step to the `boot-state` in parent's - * scope. - */ - void LinkInitialState(const rnn::StateAttr& state); - - /* - * Copy the gradient from `pre-state` in the first step-scope to the - * `boot-state` in parent's scope. - */ - void ExportInitialStateGradient(const rnn::StateAttr& state); - - /* - * Calculate time steps. 
- */ - void RunSteps(); - - /* - * Concatenate outputs in each time step and generate a LoDTensor. - */ - void ConcatOutputs(); - - void SetComputeMode(ComputeMode mode) { mode_ = mode; } - bool IsForward() const { return mode_ == ComputeMode::kForward; } - bool IsBackward() const { return mode_ == ComputeMode::kBackward; } - - /* - * set a step unit that is created according to a RecurrentOp's step unit. - */ - void SetStepUnit(std::unique_ptr step_unit) { - PADDLE_ENFORCE_NOT_NULL(step_unit); - step_unit_ = std::move(step_unit); - } - const framework::OperatorBase& GetStepUnit() const { return *step_unit_; } - - const framework::TensorArray& state(const std::string& name) const { - auto it = states_.find(name); - PADDLE_ENFORCE(it != states_.end()); - return it->second; - } - const framework::TensorArray& step_input(const std::string& name) const { - auto it = step_inputs_.find(name); - PADDLE_ENFORCE(it != step_inputs_.end()); - return it->second; - } - const framework::TensorArray& step_output(const std::string& name) const { - auto it = step_outputs_.find(name); - PADDLE_ENFORCE(it != step_outputs_.end()); - return it->second; - } - - protected: - struct ArgCache { - framework::Scope const* scope; - std::vector* scopes; - std::map inputs; - std::map outputs; - platform::DeviceContext const* dev_ctx; - - size_t num_steps{0}; - - void Init(const rnn::ArgumentName& name, const framework::OperatorBase& op, - const framework::Scope& scope, - platform::DeviceContext const* dev_ctx, rnn::Argument* arg); - - framework::Scope& GetScope(size_t index) { - PADDLE_ENFORCE_LT(index, num_steps); - return *scopes->at(index); - } - - framework::LoDTensor* GetTensor(const framework::Scope& scope, - const std::string& name); - - private: - void InitArgument(const rnn::ArgumentName& name, - const framework::OperatorBase& op, rnn::Argument* arg); - void CacheScopes(const framework::Scope& scope, const rnn::Argument& arg); - void CacheInlinks(const framework::Scope& scope, - const std::vector& names); - void CacheOutlinks(const framework::Scope& scope, - const std::vector& names); - framework::Variable* GetVariable(const framework::Scope& scope, - const std::string& name); - }; - - private: - std::unique_ptr step_unit_; - std::map states_; - std::map step_inputs_; - std::map step_outputs_; - std::map> dy_seq_metas_; - rnn::Argument arg_; - ArgCache cache_; - ComputeMode mode_{ComputeMode::kForward}; - -#ifdef PADDLE_WITH_TESTING - // test forward - friend class RNNAlgorithmTestHelper; - FRIEND_TEST(RNNAlgorithmTestHelper, SplitInputs); - FRIEND_TEST(RNNAlgorithmTestHelper, CreateCache); - FRIEND_TEST(RNNAlgorithmTestHelper, CreateScopes); - FRIEND_TEST(RNNAlgorithmTestHelper, WriteStepInputs); - FRIEND_TEST(RNNAlgorithmTestHelper, WriteStepOutputs); - FRIEND_TEST(RNNAlgorithmTestHelper, InitStates); - FRIEND_TEST(RNNAlgorithmTestHelper, ConcatOutputs); -// TODO(superjom) test backward -#endif -}; - -class DynamicRecurrentOp : public framework::OperatorBase { - public: - DynamicRecurrentOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - DynamicRecurrentOp(const DynamicRecurrentOp& o) - : framework::OperatorBase( - static_cast(o)) { - PADDLE_THROW("Not implemented"); - } - - void Run(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const override; - - mutable RNNAlgorithm rnn; -}; - -class DynamicRecurrentGradientOp : public 
framework::OperatorBase { - public: - DynamicRecurrentGradientOp(const std::string& type, - const framework::VariableNameMap& inputs, - const framework::VariableNameMap& outputs, - const framework::AttributeMap& attrs) - : OperatorBase(type, inputs, outputs, attrs) {} - - DynamicRecurrentGradientOp(const DynamicRecurrentGradientOp& o) - : framework::OperatorBase( - static_cast(o)) { - PADDLE_THROW("Not implemented"); - } - - void Run(const framework::Scope& scope, - const platform::DeviceContext& dev_ctx) const override; - - mutable RNNAlgorithm rnn; -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/operators/dynamic_recurrent_op_test.cc b/paddle/operators/dynamic_recurrent_op_test.cc deleted file mode 100644 index 8d840e259b190ead86a66df8ab31c5170db4d824..0000000000000000000000000000000000000000 --- a/paddle/operators/dynamic_recurrent_op_test.cc +++ /dev/null @@ -1,217 +0,0 @@ -#include "paddle/operators/dynamic_recurrent_op.h" - -#include - -#include "paddle/framework/ddim.h" -#include "paddle/framework/lod_tensor.h" -#include "paddle/framework/op_desc.h" -#include "paddle/framework/op_registry.h" -#include "paddle/operators/net_op.h" - -namespace paddle { -namespace operators { - -using framework::Scope; -using framework::TensorArray; -using framework::LoDTensor; -using framework::Variable; - -class TestOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - DEFINE_OP_CLONE_METHOD(TestOp); - void Run(const Scope& scope, - const platform::DeviceContext& dev_ctx) const override {} -}; - -void OpDescNewVar(const std::string& param_name, - std::initializer_list arguments, - paddle::framework::OpDesc::Var* var) { - var->set_parameter(param_name); - for (auto& arg_name : arguments) { - var->add_arguments(arg_name); - } -} - -// create a LoD tensor in scope with specific dims -LoDTensor* CreateVar(Scope& scope, std::string name, framework::DDim dims, - const platform::Place& place) { - auto* var = scope.Var(name); - auto* tensor = var->GetMutable(); - tensor->Resize(dims); - tensor->mutable_data(place); - return tensor; -} - -class RNNAlgorithmTestHelper : public ::testing::Test { - protected: - const rnn::ArgumentName argname = RNNAlgorithm::kArgNames[0]; - - virtual void SetUp() override { - CreateGlobalVariables(); - - auto op_desc = CreateOpDesc(); - op = paddle::framework::OpRegistry::CreateOp(op_desc); - dop = &(dynamic_cast(op.get())->rnn); - InitCacheManually(); - InitStepNet(); - } - - framework::OpDesc CreateOpDesc() { - // create op - paddle::framework::OpDesc op_desc; - op_desc.set_type("dynamic_recurrent"); - - OpDescNewVar(argname.inlinks, {"in0"}, op_desc.add_inputs()); - OpDescNewVar(argname.initial_states, {"boot_mem"}, op_desc.add_inputs()); - OpDescNewVar(argname.step_scopes, {"step_scopes"}, op_desc.add_outputs()); - OpDescNewVar(argname.outlinks, {"out0"}, op_desc.add_outputs()); - - // set pre-states - auto pre_memories = op_desc.mutable_attrs()->Add(); - pre_memories->set_name(argname.ex_states); - pre_memories->set_type(paddle::framework::AttrType::STRINGS); - auto pre_memories_item = pre_memories->add_strings(); - *pre_memories_item = "mem@pre"; - - // set states - auto memories = op_desc.mutable_attrs()->Add(); - memories->set_name(argname.states); - memories->set_type(paddle::framework::AttrType::STRINGS); - auto memories_item = memories->add_strings(); - *memories_item = "mem"; - return op_desc; - } - - void CreateGlobalVariables() { - platform::CPUPlace place; - scope.Var("step_scopes"); - 
CreateVar(scope, "boot_mem", framework::make_ddim({10, 20}), place); - CreateVar(scope, "out0", framework::make_ddim({10, 20}), place); - auto* in0 = CreateVar(scope, "in0", framework::make_ddim({10, 8}), place); - // 10 instanes with 4 sentences, length is 4, 3, 2, 1 respectively. - framework::LoD in0_lod(1); - for (int x : std::vector{0, 4, 7, 9, 10}) { - in0_lod[0].push_back(x); - } - in0->set_lod(in0_lod); - in0->Resize(framework::make_ddim({10, 8})); - // set the content, each sentence content is seqid.batchid - // the seqid starts from 0 - int start = 0; - for (size_t seqid = 0; seqid < in0_lod.size() - 1; seqid++) { - for (size_t batchid = 0; - batchid < in0_lod[0][seqid + 1] - in0_lod[0][seqid]; batchid++) { - float v = seqid + batchid * 0.1; - - for (size_t dim = 0; dim < 8; dim++) { - in0->data()[start * 8 + dim] = v; - } - start++; - } - } - } - - void InitCacheManually() { - dop->cache_.Init(RNNAlgorithm::kArgNames[0], *op, scope, &device_context, - &dop->arg_); - } - - void InitStepNet() { - std::unique_ptr stepnet{new NetOp}; - dynamic_cast(stepnet.get()) - ->AppendOp(std::unique_ptr(new TestOp( - "test", {{"inputs", {"in0"}}, {"initial_states", {"boot_mem"}}}, - {{"outputs", {"out0"}}, {"step_scopes", {"step_scopes"}}}, {}))); - dop->SetStepUnit(std::move(stepnet)); - } - - protected: - RNNAlgorithm* dop; - std::unique_ptr op; - paddle::platform::CPUDeviceContext device_context; - paddle::framework::Scope scope; -}; - -TEST_F(RNNAlgorithmTestHelper, CreateCache) { - const rnn::Argument& arg = dop->arg_; - ASSERT_EQ(arg.inlinks.size(), 1UL); - ASSERT_EQ(arg.outlinks.size(), 1UL); -} - -TEST_F(RNNAlgorithmTestHelper, SplitInputs) { - dop->SplitInputs(); - auto& in0_ta = dop->step_inputs_["in0"]; - ASSERT_EQ(in0_ta.size(), 4UL); - - const auto& batch0 = in0_ta.Read(0); - const auto& batch1 = in0_ta.Read(1); - const auto& batch2 = in0_ta.Read(2); - const auto& batch3 = in0_ta.Read(3); - EXPECT_EQ(batch0.dims()[0], 4); - EXPECT_EQ(batch1.dims()[0], 3); - EXPECT_EQ(batch2.dims()[0], 2); - EXPECT_EQ(batch3.dims()[0], 1); -} - -TEST_F(RNNAlgorithmTestHelper, CreateScopes) { - dop->SplitInputs(); - dop->CreateScopes(); - ASSERT_EQ(dop->cache_.num_steps, 4UL); - ASSERT_EQ(dop->cache_.scopes->size(), 4UL); -} - -TEST_F(RNNAlgorithmTestHelper, WriteStepInputs) { - dop->SplitInputs(); - dop->CreateScopes(); - dop->WriteStepInputs(); - - for (size_t step = 0; step < dop->cache_.num_steps; step++) { - auto& scope = dop->cache_.GetScope(step); - for (auto name : std::vector({"in0"})) { - ASSERT_TRUE(scope.FindVar(name) != nullptr); - } - } -} - -TEST_F(RNNAlgorithmTestHelper, WriteStepOutputs) { - dop->SplitInputs(); - dop->CreateScopes(); - dop->WriteStepInputs(); - dop->WriteStepOutputs(); - - for (size_t step = 0; step < dop->cache_.num_steps; step++) { - auto& scope = dop->cache_.GetScope(step); - for (auto name : std::vector({"out0"})) { - ASSERT_TRUE(scope.FindVar(name)); - } - } -} - -TEST_F(RNNAlgorithmTestHelper, ConcatOutputs) { - // Let's leave this test to python unittest. 
-} - -TEST_F(RNNAlgorithmTestHelper, InitStates) { - dop->SetComputeMode(RNNAlgorithm::ComputeMode::kForward); - dop->SplitInputs(); - dop->CreateScopes(); - dop->WriteStepInputs(); - dop->WriteStepOutputs(); - dop->InitStates(); - - for (size_t step = 0; step < dop->cache_.num_steps; step++) { - auto& scope = dop->cache_.GetScope(step); - auto state = scope.FindVar("mem"); - ASSERT_TRUE(state != nullptr); - - auto* pre_state = scope.FindVar("mem@pre"); - ASSERT_TRUE(pre_state != nullptr); - - auto* boot_state = scope.FindVar("boot_mem"); - ASSERT_TRUE(boot_state != nullptr); - } -} - -} // operators -} // namespace paddle diff --git a/paddle/operators/elementwise_op.h b/paddle/operators/elementwise_op.h index 56e5eb69bc382a2c15d88b759fa6987f02c6cabb..ea533503e4916cae7e1157ed34da9629dcff3513 100644 --- a/paddle/operators/elementwise_op.h +++ b/paddle/operators/elementwise_op.h @@ -35,7 +35,7 @@ class ElementwiseOp : public framework::OperatorWithKernel { auto x_dim = ctx->GetInputDim("X"); auto y_dim = ctx->GetInputDim("Y"); PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), - "Rank of first input must >= rank of second input.") + "Rank of first input must >= rank of second input."); ctx->SetOutputDim("Out", x_dim); ctx->ShareLoD("X", /*->*/ "Out"); } @@ -120,7 +120,7 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), - "Rank of first input must >= rank of second input.") + "Rank of first input must >= rank of second input."); auto x_grad_name = framework::GradVarName("X"); auto y_grad_name = framework::GradVarName("Y"); diff --git a/paddle/operators/elementwise_op_function.h b/paddle/operators/elementwise_op_function.h index 488a35aafc8600bb8bb252fc3a5161c72a2f6df1..8aa35b2c466785c8749739635fcd1c2b19292f3e 100644 --- a/paddle/operators/elementwise_op_function.h +++ b/paddle/operators/elementwise_op_function.h @@ -106,7 +106,7 @@ void ElementwiseCompute(const framework::ExecutionContext& ctx) { auto x_dims = x->dims(); auto y_dims = y->dims(); PADDLE_ENFORCE_GE(x_dims.size(), y_dims.size(), - "Rank of first input must >= rank of second input.") + "Rank of first input must >= rank of second input."); if (x_dims == y_dims) { functor f; diff --git a/paddle/operators/expand_op.h b/paddle/operators/expand_op.h index 8ae2c11a5d31dafc1b90d129054ebfabfb761bfe..4d7996ad1e744fead1329c35ce6ea43bf0683ce6 100644 --- a/paddle/operators/expand_op.h +++ b/paddle/operators/expand_op.h @@ -125,7 +125,8 @@ class ExpandGradKernel : public framework::OpKernel { auto* in0 = context.Input(framework::GradVarName("Out")); auto* out0 = context.Output(framework::GradVarName("X")); out0->mutable_data(context.GetPlace()); - out0->CopyFrom(*in0, context.GetPlace(), context.device_context()); + framework::CopyFrom(*in0, context.GetPlace(), context.device_context(), + out0); } else { switch (dims) { REP_EXPAND_GRAD_TEMPLATE(72) diff --git a/paddle/operators/feed_op.cc b/paddle/operators/feed_op.cc index 0dd84cbeaafbafd45132b0a0b744554ce7475411..ee43c22fb13e203c7de1a7e6d1586423fcbfb25a 100644 --- a/paddle/operators/feed_op.cc +++ b/paddle/operators/feed_op.cc @@ -47,7 +47,7 @@ class FeedOp : public framework::OperatorBase { auto &feed_list = feed_var->Get(); auto &feed_item = feed_list.at(static_cast(col)); auto *out_item = out_var->GetMutable(); - out_item->CopyFrom(feed_item, dev_ctx.GetPlace(), dev_ctx); + framework::CopyFrom(feed_item, dev_ctx.GetPlace(), dev_ctx, out_item); 
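+    // Tensor::CopyFrom is replaced throughout this patch by the free function
+    // framework::CopyFrom(src, dst_place, dev_ctx, &dst), which takes the
+    // destination as an explicit out-parameter. A minimal sketch of the new
+    // call pattern (the variable names here are illustrative only):
+    //
+    //   framework::LoDTensor src, dst;
+    //   src.mutable_data<float>(framework::make_ddim({2, 3}),
+    //                           platform::CPUPlace());
+    //   framework::CopyFrom(src, platform::CPUPlace(), dev_ctx, &dst);
+    //   dst.set_lod(src.lod());  // as above, LoD is restored by the caller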
out_item->set_lod(feed_item.lod()); } }; diff --git a/paddle/operators/fetch_op.cc b/paddle/operators/fetch_op.cc index 8108ae69dec4bafd1c04d5ab05eef6f467d4c6e8..1ae07194c235ce6724f59c9c60df80f957787cda 100644 --- a/paddle/operators/fetch_op.cc +++ b/paddle/operators/fetch_op.cc @@ -51,7 +51,7 @@ class FetchOp : public framework::OperatorBase { // FIXME(yuyang18): Should we assume the fetch operator always generate // CPU outputs? - dst_item.CopyFrom(src_item, platform::CPUPlace(), dev_ctx); + CopyFrom(src_item, platform::CPUPlace(), dev_ctx, &dst_item); dev_ctx.Wait(); dst_item.set_lod(src_item.lod()); diff --git a/paddle/operators/gru_op.h b/paddle/operators/gru_op.h index 1b18368e0e16365682520b62a7f6adab0cbb527f..564489d3a98b59e3e527be5613a73d23d6dbbf31 100644 --- a/paddle/operators/gru_op.h +++ b/paddle/operators/gru_op.h @@ -71,8 +71,8 @@ class GRUKernel : public framework::OpKernel { int frame_size = hidden_dims[1]; math::hl_gru_value gru_value; - gru_value.gateWeight = const_cast(weight_data); - gru_value.stateWeight = + gru_value.gate_weight = const_cast(weight_data); + gru_value.state_weight = const_cast(weight_data + 2 * frame_size * frame_size); Tensor ordered_h0; const size_t* order = batch_gate->lod()[2].data(); @@ -82,9 +82,9 @@ class GRUKernel : public framework::OpKernel { // to reorder. ReorderInitState(context.device_context(), *h0, order, &ordered_h0, true); - gru_value.prevOutValue = ordered_h0.data(); + gru_value.prev_out_value = ordered_h0.data(); } else { - gru_value.prevOutValue = nullptr; + gru_value.prev_out_value = nullptr; } auto batch_starts = batch_gate->lod()[0]; size_t num_batch = batch_starts.size() - 1; @@ -96,14 +96,14 @@ class GRUKernel : public framework::OpKernel { Tensor gate_t = batch_gate->Slice(bstart, bend); Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); Tensor hidden_t = batch_hidden->Slice(bstart, bend); - gru_value.outputValue = hidden_t.data(); - gru_value.gateValue = gate_t.data(); - gru_value.resetOutputValue = reset_hidden_prev_t.data(); + gru_value.output_value = hidden_t.data(); + gru_value.gate_value = gate_t.data(); + gru_value.reset_output_value = reset_hidden_prev_t.data(); math::GRUUnitFunctor::compute( dev_ctx, gru_value, frame_size, cur_batch_size, math::ActiveType(context.Attr("activation")), math::ActiveType(context.Attr("gate_activation"))); - gru_value.prevOutValue = gru_value.outputValue; + gru_value.prev_out_value = gru_value.output_value; } math::Batch2LoDTensorFunctor to_seq; @@ -169,20 +169,20 @@ class GRUGradKernel : public framework::OpKernel { to_batch(dev_ctx, *hidden_grad, batch_hidden_grad, false, is_reverse); math::hl_gru_value gru_value; - gru_value.gateWeight = const_cast(weight_data); - gru_value.stateWeight = + gru_value.gate_weight = const_cast(weight_data); + gru_value.state_weight = const_cast(weight_data + 2 * frame_size * frame_size); math::hl_gru_grad gru_grad; if (weight_grad) { - gru_grad.gateWeightGrad = + gru_grad.gate_weight_grad = weight_grad->mutable_data(context.GetPlace()); zero(dev_ctx, weight_grad, static_cast(0.0)); - gru_grad.stateWeightGrad = + gru_grad.state_weight_grad = weight_grad->data() + 2 * frame_size * frame_size; } else { - gru_grad.gateWeightGrad = nullptr; - gru_grad.stateWeightGrad = nullptr; + gru_grad.gate_weight_grad = nullptr; + gru_grad.state_weight_grad = nullptr; } auto batch_starts = batch_hidden_grad.lod()[0]; @@ -193,27 +193,27 @@ class GRUGradKernel : public framework::OpKernel { int cur_batch_size = bend - bstart; Tensor gate_t = 
batch_gate->Slice(bstart, bend); - gru_value.gateValue = gate_t.data(); + gru_value.gate_value = gate_t.data(); Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend); - gru_value.resetOutputValue = reset_hidden_prev_t.data(); + gru_value.reset_output_value = reset_hidden_prev_t.data(); Tensor hidden_grad_t = batch_hidden_grad.Slice(bstart, bend); - gru_grad.outputGrad = hidden_grad_t.data(); + gru_grad.output_grad = hidden_grad_t.data(); Tensor gate_grad_t = batch_gate_grad.Slice(bstart, bend); - gru_grad.gateGrad = gate_grad_t.data(); + gru_grad.gate_grad = gate_grad_t.data(); Tensor reset_hidden_prev_grad_t = batch_reset_hidden_prev_grad.Slice(bstart, bend); - gru_grad.resetOutputGrad = reset_hidden_prev_grad_t.data(); + gru_grad.reset_output_grad = reset_hidden_prev_grad_t.data(); if (n == 0) { - gru_value.prevOutValue = h0 ? ordered_h0.data() : nullptr; - gru_grad.prevOutGrad = + gru_value.prev_out_value = h0 ? ordered_h0.data() : nullptr; + gru_grad.prev_out_grad = h0 && h0_grad ? ordered_h0_grad.data() : nullptr; } else { int bstart_pre = static_cast(batch_starts[n - 1]); Tensor hidden_prev_t = batch_hidden->Slice(bstart_pre, bstart); - gru_value.prevOutValue = hidden_prev_t.data(); + gru_value.prev_out_value = hidden_prev_t.data(); Tensor hidden_prev_grad_t = batch_hidden_grad.Slice(bstart_pre, bstart); - gru_grad.prevOutGrad = hidden_prev_grad_t.data(); + gru_grad.prev_out_grad = hidden_prev_grad_t.data(); } math::GRUUnitGradFunctor::compute( diff --git a/paddle/operators/gru_unit_op.h b/paddle/operators/gru_unit_op.h index 050430d3252d05236219cd5ced5a792c21413c1f..3398c0934e250cfc292776d08773204bb9b4d87e 100644 --- a/paddle/operators/gru_unit_op.h +++ b/paddle/operators/gru_unit_op.h @@ -28,6 +28,10 @@ template using EigenMatrix = framework::EigenMatrix; +template +using EigenVector = framework::EigenVector; + enum GRUActivationType { identity = 0, sigmoid = 1, tanh = 2, relu = 3 }; template @@ -226,7 +230,7 @@ class GRUUnitGradKernel : public framework::OpKernel { // backward for bias if (bias_grad) { bias_grad->mutable_data(context.GetPlace()); - auto d_b = EigenMatrix::From(*bias_grad); + auto d_b = EigenVector::Flatten(*bias_grad); d_b.device(place) = d_g.sum(Eigen::array({{0}})); } } diff --git a/paddle/operators/linear_chain_crf_op.h b/paddle/operators/linear_chain_crf_op.h index 872f659fed40d7479d9d8bed6c8469fb28282253..014bbfa7580011e38a2f546e30d1e584965a7815 100644 --- a/paddle/operators/linear_chain_crf_op.h +++ b/paddle/operators/linear_chain_crf_op.h @@ -195,7 +195,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel { auto copyLoDTensor = [](const platform::DeviceContext& ctx, const LoDTensor& src, LoDTensor* dst) { dst->mutable_data(src.dims(), platform::CPUPlace()); - dst->CopyFrom(src, platform::CPUPlace(), ctx); + framework::CopyFrom(src, platform::CPUPlace(), ctx, dst); }; copyLoDTensor(ctx, emission_weights_src, emission_weights_dst); @@ -203,8 +203,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel { transition_weights_dst->mutable_data(transition_weights_src.dims(), platform::CPUPlace()); - transition_weights_dst->CopyFrom(transition_weights_src, - platform::CPUPlace(), ctx); + framework::CopyFrom(transition_weights_src, platform::CPUPlace(), ctx, + transition_weights_dst); } void CopyOutputsToGpuMemory(const platform::DeviceContext& ctx, @@ -219,7 +219,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel { auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src, Tensor* dst) { 
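// The CRF kernels themselves only run on CPU; this lambda stages one CPU-side
// result tensor back into GPU memory after the computation, mirroring the
// GPU-to-CPU copies of the inputs above.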
dst->mutable_data(platform::GPUPlace()); - dst->CopyFrom(src, platform::GPUPlace(), ctx); + framework::CopyFrom(src, platform::GPUPlace(), ctx, dst); }; copyTensor(ctx, emission_exps_src, emission_exps_dst); copyTensor(ctx, transition_exps_src, transition_exps_dst); @@ -410,12 +410,12 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { // Copy the inputs from GPU memory to CPU memory when this operators runs on // GPU device. label_dst->mutable_data(label_src.dims(), platform::CPUPlace()); - label_dst->CopyFrom(label_src, platform::CPUPlace(), ctx); + framework::CopyFrom(label_src, platform::CPUPlace(), ctx, label_dst); auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src, Tensor* dst) { dst->mutable_data(src.dims(), platform::CPUPlace()); - dst->CopyFrom(src, platform::CPUPlace(), ctx); + framework::CopyFrom(src, platform::CPUPlace(), ctx, dst); }; copyTensor(ctx, emission_exps_src, emission_exps_dst); copyTensor(ctx, transition_exps_src, transition_exps_dst); @@ -434,7 +434,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { Tensor* dst) { if (src && dst) { dst->mutable_data(platform::GPUPlace()); - dst->CopyFrom(*src, platform::GPUPlace(), ctx); + framework::CopyFrom(*src, platform::GPUPlace(), ctx, dst); } }; copyTensor(ctx, emission_grad_src, emission_grad_dst); diff --git a/paddle/operators/load_op.cc b/paddle/operators/load_op.cc index b71a33a6b1ce80b545e6d7a4020dafc941dc55d2..4e58b84430f2a8697bbbc1acf971fd063120f563 100644 --- a/paddle/operators/load_op.cc +++ b/paddle/operators/load_op.cc @@ -38,61 +38,7 @@ class LoadOp : public framework::OperatorBase { out_var_name); auto *tensor = out_var->GetMutable(); - - uint32_t version; - fin.read(reinterpret_cast(&version), sizeof(version)); - PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); - framework::TensorDesc desc; - { // int32_t size - // proto buffer - int32_t size; - fin.read(reinterpret_cast(&size), sizeof(size)); - std::unique_ptr buf(new char[size]); - fin.read(reinterpret_cast(buf.get()), size); - PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size), - "Cannot parse tensor desc"); - } - { // read tensor - std::vector dims; - dims.reserve(static_cast(desc.dims().size())); - std::copy(desc.dims().begin(), desc.dims().end(), - std::back_inserter(dims)); - tensor->Resize(framework::make_ddim(dims)); - - void *buf; - platform::Place cpu = platform::CPUPlace(); - switch (desc.data_type()) { - case framework::FP32: - buf = tensor->mutable_data(cpu); - break; - case framework::FP64: - buf = tensor->mutable_data(cpu); - break; - case framework::INT32: - buf = tensor->mutable_data(cpu); - break; - case framework::INT64: - buf = tensor->mutable_data(cpu); - break; - default: - PADDLE_THROW("DataType %d not supported", desc.data_type()); - } - fin.read(static_cast(buf), tensor->memory_size()); - } - { // read lod - uint64_t lod_level; - fin.read(reinterpret_cast(&lod_level), sizeof(lod_level)); - auto &lod = *tensor->mutable_lod(); - lod.resize(lod_level); - for (uint64_t i = 0; i < lod_level; ++i) { - uint64_t size; - fin.read(reinterpret_cast(&size), sizeof(size)); - std::vector tmp(size / sizeof(size_t)); - fin.read(reinterpret_cast(tmp.data()), - static_cast(size)); - lod[i] = tmp; - } - } + framework::DeserializeFromStream(fin, tensor); auto place = dev_ctx.GetPlace(); if (platform::is_gpu_place(place)) { @@ -105,7 +51,7 @@ class LoadOp : public framework::OperatorBase { out_var->Clear(); tensor = out_var->GetMutable(); tensor->set_lod(cpu_tensor.lod()); - 
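// framework::DeserializeFromStream now encapsulates the wire format that
// LoadOp used to parse inline. As the deleted block above shows, a saved
// LoDTensor is laid out as:
//
//   uint32_t version;                       // currently must be 0
//   int32_t  desc_size;                     // byte length of TensorDesc proto
//   char     desc[desc_size];               // dims and data type
//   char     data[tensor->memory_size()];   // raw tensor contents
//   uint64_t lod_level;                     // then, for each level:
//   uint64_t size; size_t offsets[size / sizeof(size_t)];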
tensor->CopyFrom(cpu_tensor, place, dev_ctx); + CopyFrom(cpu_tensor, place, dev_ctx, tensor); } } }; diff --git a/paddle/operators/lod_reset_op.h b/paddle/operators/lod_reset_op.h index 2bb916ccee80c83a02ea429fe95f5fafc86ccfa6..cbcbf80adc3cf68f9eb28bbe2a69168cc8798347 100644 --- a/paddle/operators/lod_reset_op.h +++ b/paddle/operators/lod_reset_op.h @@ -33,7 +33,8 @@ class LoDResetKernel : public framework::OpKernel { auto* lod = lod_t->data(); if (platform::is_gpu_place(ctx.GetPlace())) { framework::Tensor lod_cpu; - lod_cpu.CopyFrom(*lod_t, platform::CPUPlace(), ctx.device_context()); + framework::CopyFrom(*lod_t, platform::CPUPlace(), ctx.device_context(), + &lod_cpu); lod = lod_cpu.data(); } level0 = std::vector(lod, lod + lod_t->numel()); diff --git a/paddle/operators/lod_tensor_to_array_op.cc b/paddle/operators/lod_tensor_to_array_op.cc index 58af35564d83b9699af4f7783fb6367ff9590682..010c79d4e153463d4b2e48e5fd798d3bc4febaf1 100644 --- a/paddle/operators/lod_tensor_to_array_op.cc +++ b/paddle/operators/lod_tensor_to_array_op.cc @@ -81,11 +81,11 @@ class LoDTensorToArrayOp : public framework::OperatorBase { continue; } // out[i][offset: offset+len] = x[each_range.begin: each_range.end] - out[i] - .Slice(static_cast(offset), static_cast(offset + len)) - .CopyFrom(x.Slice(static_cast(each_range.begin), - static_cast(each_range.end)), - x.place(), dev_ctx); + auto slice = out[i].Slice(static_cast(offset), + static_cast(offset + len)); + framework::CopyFrom(x.Slice(static_cast(each_range.begin), + static_cast(each_range.end)), + x.place(), dev_ctx, &slice); offset += len; } } diff --git a/paddle/operators/log_loss_op.cc b/paddle/operators/log_loss_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..257e5c8a49e935dcbdc33e5060118ef1804fa8d7 --- /dev/null +++ b/paddle/operators/log_loss_op.cc @@ -0,0 +1,115 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "paddle/operators/log_loss_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LogLossOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Predicted"),
+                   "Input(Predicted) must be initialized.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) must be initialized.");
+
+    auto pred_dims = ctx->GetInputDim("Predicted");
+    auto label_dims = ctx->GetInputDim("Labels");
+
+    PADDLE_ENFORCE_EQ(pred_dims, label_dims,
+                      "The dimensions of Input(Predicted) and Input(Labels) "
+                      "must be the same.");
+    PADDLE_ENFORCE_EQ(pred_dims.size(), 2,
+                      "The rank of Input(Predicted) must be 2 and the shape "
+                      "is [batch_size, 1].");
+    PADDLE_ENFORCE_EQ(pred_dims[1], 1,
+                      "Each row of Input(Predicted) contains a real value, "
+                      "so the 2nd dimension of Input(Predicted) must be 1.");
+
+    ctx->SetOutputDim("Loss", {pred_dims[0], 1});
+    ctx->ShareLoD("Predicted", "Loss");
+  }
+};
+
+template <typename AttrType>
+class LogLossOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  LogLossOpMaker(framework::OpProto* proto,
+                 framework::OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("Predicted",
+             "The input value (Predicted) of Log loss op. "
+             "Predicted is a 2-D tensor with shape [batch_size, 1].");
+    AddInput("Labels",
+             "The target value (Labels) of Log loss op. "
+             "Labels is a 2-D tensor with shape [batch_size, 1].");
+    AddOutput("Loss",
+              "The output tensor with shape [batch_size, 1] "
+              "which represents the log loss.");
+    AddAttr<AttrType>("epsilon", "Epsilon in log loss.");
+    AddComment(R"DOC(
+LogLoss Operator.
+
+Log loss is a loss function used for binary classification. It quantifies the
+accuracy of a classifier by penalising false classifications; minimising it
+pushes the predicted probabilities towards the ground-truth labels. We define
+Predicted as the values predicted by our model and Labels as the ground-truth
+target values. Log loss measures how close the predicted values are to the
+target. The shapes of Predicted and Labels are both [batch_size, 1].
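+For instance, taking epsilon as 0 for illustration, a confident correct
+prediction (Labels = 1.0, Predicted = 0.9) contributes -log(0.9), roughly
+0.105, while a confident wrong one (Labels = 1.0, Predicted = 0.1) contributes
+-log(0.1), roughly 2.303, so mistakes are penalised sharply.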
+The equation is:
+
+$$
+Loss = -Labels \cdot \log(Predicted + \epsilon) -
+       (1 - Labels) \cdot \log(1 - Predicted + \epsilon)
+$$
+
+)DOC");
+  }
+};
+
+class LogLossGradOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Predicted"),
+                   "Input(Predicted) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Labels"),
+                   "Input(Labels) should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")),
+                   "Input(Loss@GRAD) should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Predicted")),
+                   "Output(Predicted@GRAD) should not be null.");
+
+    auto pred_dims = ctx->GetInputDim("Predicted");
+    auto label_dims = ctx->GetInputDim("Labels");
+    auto loss_grad_dims = ctx->GetInputDim(framework::GradVarName("Loss"));
+    PADDLE_ENFORCE_EQ(loss_grad_dims, pred_dims,
+                      "The dimensions of Input(Loss@GRAD) and "
+                      "Input(Predicted) must be the same.");
+
+    auto pred_grad_name = framework::GradVarName("Predicted");
+    ctx->SetOutputDim(pred_grad_name, pred_dims);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(log_loss, ops::LogLossOp, ops::LogLossOpMaker<float>, log_loss_grad,
+            ops::LogLossGradOp);
+REGISTER_OP_CPU_KERNEL(log_loss,
+                       ops::LogLossKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    log_loss_grad, ops::LogLossGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/log_loss_op.cu b/paddle/operators/log_loss_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6c189ef3412d7a56205502c7913e93218a03b929
--- /dev/null
+++ b/paddle/operators/log_loss_op.cu
@@ -0,0 +1,22 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#define EIGEN_USE_GPU
+#include "paddle/operators/log_loss_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(log_loss,
+                       ops::LogLossKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    log_loss_grad, ops::LogLossGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/log_loss_op.h b/paddle/operators/log_loss_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..73404fce9157fa750a51451fa93646bc4059481a
--- /dev/null
+++ b/paddle/operators/log_loss_op.h
@@ -0,0 +1,75 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#pragma once
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
+template <typename Place, typename T, typename AttrType = T>
+class LogLossKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* loss_out = ctx.Output<Tensor>("Loss");
+
+    loss_out->mutable_data<T>(ctx.GetPlace());
+
+    auto epsilon = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
+
+    auto prediction = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Predicted"));
+    auto label = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Labels"));
+
+    auto loss = EigenVector<T>::Flatten(*loss_out);
+    auto place = ctx.GetEigenDevice<Place>();
+
+    // loss = -label * log(pred + eps) - (1 - label) * log(1 - pred + eps)
+    loss.device(place) = (-(label * (prediction + epsilon).log()) -
+                          ((static_cast<T>(1) - label) *
+                           (static_cast<T>(1) - prediction + epsilon).log()));
+  }
+};
+
+template <typename Place, typename T, typename AttrType = T>
+class LogLossGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto epsilon = static_cast<T>(ctx.Attr<AttrType>("epsilon"));
+
+    auto prediction = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Predicted"));
+    auto label = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Labels"));
+
+    auto* dloss = ctx.Input<Tensor>(framework::GradVarName("Loss"));
+    auto* dpred = ctx.Output<Tensor>(framework::GradVarName("Predicted"));
+
+    auto dl = EigenVector<T>::Flatten(*dloss);
+    auto place = ctx.GetEigenDevice<Place>();
+
+    if (dpred) {
+      dpred->mutable_data<T>(ctx.GetPlace());
+      auto dx = framework::EigenVector<T>::Flatten(*dpred);
+      // dLoss/dPredicted = -label / (pred + eps)
+      //                    + (1 - label) / (1 - pred + eps),
+      // scaled elementwise by the incoming gradient dl (chain rule).
+      dx.device(place) = dl * (-(label / (prediction + epsilon)) +
+                               ((static_cast<T>(1) - label) /
+                                (static_cast<T>(1) - prediction + epsilon)));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/math/CMakeLists.txt b/paddle/operators/math/CMakeLists.txt
index 3017f133afc5d4dcd484c78b44591a876ab4d667..bf47879f772a3013bd7ce78c6f8a6aefe65298f9 100644
--- a/paddle/operators/math/CMakeLists.txt
+++ b/paddle/operators/math/CMakeLists.txt
@@ -13,8 +13,9 @@ if(WITH_GPU)
     nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)
     nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context)
     nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
-    nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
     nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context)
+    nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context)
+    nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
 else()
     cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto)
     cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
@@ -26,8 +27,9 @@ else()
     cc_library(context_project SRCS context_project.cc DEPS device_context math_function)
     cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context)
     cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
-    cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
     cc_library(maxouting SRCS maxouting.cc DEPS device_context)
+    cc_library(unpooling SRCS unpooling.cc DEPS device_context)
+    cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
 endif()
 cc_test(math_function_test SRCS
math_function_test.cc DEPS math_function tensor) diff --git a/paddle/operators/math/context_project.h b/paddle/operators/math/context_project.h index 72f4202bace4461d2597204feaa2a21e355bd1ac..d853507188cf8c80aede1e7646736036e30c9678 100644 --- a/paddle/operators/math/context_project.h +++ b/paddle/operators/math/context_project.h @@ -149,7 +149,7 @@ class ContextProjectFunctor { Tensor out_t_sub = out_t.Slice(k * context_length, k * context_length + padding_size); Tensor w_sub = padding_data.Slice(k, k + padding_size); - out_t_sub.CopyFrom(w_sub, context.GetPlace(), context); + framework::CopyFrom(w_sub, context.GetPlace(), context, &out_t_sub); } } if (down_pad > 0) { // add down pad @@ -179,7 +179,7 @@ class ContextProjectFunctor { (down_pad_begin_row + t) * context_length); Tensor w_sub = padding_data.Slice( up_pad + padding_idx, up_pad + padding_idx + padding_size); - out_t_sub.CopyFrom(w_sub, context.GetPlace(), context); + framework::CopyFrom(w_sub, context.GetPlace(), context, &out_t_sub); } } out_t.Resize({sequence_height, context_length * sequence_width}); diff --git a/paddle/operators/math/detail/gru_cpu_kernel.h b/paddle/operators/math/detail/gru_cpu_kernel.h index 51af140cf4d5e6581765bea00033fa53d383230d..4c67dec9cbeb48f400f79f5ed7ba3c939fa2540c 100644 --- a/paddle/operators/math/detail/gru_cpu_kernel.h +++ b/paddle/operators/math/detail/gru_cpu_kernel.h @@ -25,393 +25,397 @@ namespace detail { #ifndef __NVCC__ template -void hl_naive_gru_forward_reset_output(OpResetOutput opResetOutput, - T *gateValue, T *resetOutputValue, - T *prevOutputValue, int frameSize, +void hl_naive_gru_forward_reset_output(OpResetOutput op_reset_output, + T *gate_value, T *reset_output_value, + T *prev_output_value, int frame_size, activation_mode_t active_gate) { - T rValueUpdateGate; - T rValueResetGate; - T rValueResetOutput; - T rPrevOut = 0; - T *updateGate = gateValue; - T *resetGate = gateValue + frameSize; - - for (int i = 0; i < frameSize; i++) { - rValueUpdateGate = updateGate[i]; - rValueResetGate = resetGate[i]; - if (prevOutputValue) { - rPrevOut = prevOutputValue[i]; + T r_value_update_gate; + T r_value_reset_gate; + T r_value_reset_output; + T r_prev_out = 0; + T *update_gate = gate_value; + T *reset_gate = gate_value + frame_size; + + for (int i = 0; i < frame_size; i++) { + r_value_update_gate = update_gate[i]; + r_value_reset_gate = reset_gate[i]; + if (prev_output_value) { + r_prev_out = prev_output_value[i]; } - opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut, - rValueResetOutput, active_gate); + op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out, + r_value_reset_output, active_gate); - updateGate[i] = rValueUpdateGate; - resetGate[i] = rValueResetGate; - resetOutputValue[i] = rValueResetOutput; + update_gate[i] = r_value_update_gate; + reset_gate[i] = r_value_reset_gate; + reset_output_value[i] = r_value_reset_output; } } template -void hl_naive_gru_forward_final_output(OpFinalOutput opFinalOutput, - T *gateValue, T *prevOutputValue, - T *outputValue, int frameSize, +void hl_naive_gru_forward_final_output(OpFinalOutput op_final_output, + T *gate_value, T *prev_output_value, + T *output_value, int frame_size, activation_mode_t active_node) { - T rValueUpdateGate; - T rValueFrameState; - T rPrevOut = 0; - T rOutput; - T *updateGate = gateValue; - T *frameState = gateValue + frameSize * 2; - - for (int i = 0; i < frameSize; i++) { - rValueUpdateGate = updateGate[i]; - rValueFrameState = frameState[i]; - if (prevOutputValue) { - rPrevOut = 
prevOutputValue[i]; + T r_value_update_gate; + T r_value_frame_state; + T r_prev_out = 0; + T r_output; + T *update_gate = gate_value; + T *frame_state = gate_value + frame_size * 2; + + for (int i = 0; i < frame_size; i++) { + r_value_update_gate = update_gate[i]; + r_value_frame_state = frame_state[i]; + if (prev_output_value) { + r_prev_out = prev_output_value[i]; } - opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput, - active_node); + op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out, + r_output, active_node); - frameState[i] = rValueFrameState; - outputValue[i] = rOutput; + frame_state[i] = r_value_frame_state; + output_value[i] = r_output; } } template -void hl_avx_gru_forward_reset_output(OpResetOutput opResetOutput, T *gateValue, - T *resetOutputValue, T *prevOutputValue, - int frameSize, +void hl_avx_gru_forward_reset_output(OpResetOutput op_reset_output, + T *gate_value, T *reset_output_value, + T *prev_output_value, int frame_size, activation_mode_t active_gate) { #ifdef __AVX__ - __m256 rValueUpdateGate; - __m256 rValueResetGate; - __m256 rValueResetOutput; - __m256 rPrevOut = _mm256_set1_ps(0.0f); - __m256 *updateGate = (__m256 *)gateValue; - __m256 *resetGate = (__m256 *)(gateValue + frameSize); - - for (int i = 0; i < frameSize / 8; i++) { - rValueUpdateGate = updateGate[i]; - rValueResetGate = resetGate[i]; - if (prevOutputValue) { - rPrevOut = ((__m256 *)prevOutputValue)[i]; + __m256 r_value_update_gate; + __m256 r_value_reset_gate; + __m256 r_value_reset_output; + __m256 r_prev_out = _mm256_set1_ps(0.0f); + __m256 *update_gate = (__m256 *)gate_value; + __m256 *reset_gate = (__m256 *)(gate_value + frame_size); + + for (int i = 0; i < frame_size / 8; i++) { + r_value_update_gate = update_gate[i]; + r_value_reset_gate = reset_gate[i]; + if (prev_output_value) { + r_prev_out = ((__m256 *)prev_output_value)[i]; } - opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut, - rValueResetOutput, active_gate); + op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out, + r_value_reset_output, active_gate); - updateGate[i] = rValueUpdateGate; - resetGate[i] = rValueResetGate; - ((__m256 *)resetOutputValue)[i] = rValueResetOutput; + update_gate[i] = r_value_update_gate; + reset_gate[i] = r_value_reset_gate; + ((__m256 *)reset_output_value)[i] = r_value_reset_output; } #endif } template -void hl_avx_gru_forward_final_output(OpFinalOutput opFinalOutput, T *gateValue, - T *prevOutputValue, T *outputValue, - int frameSize, +void hl_avx_gru_forward_final_output(OpFinalOutput op_final_output, + T *gate_value, T *prev_output_value, + T *output_value, int frame_size, activation_mode_t active_node) { #ifdef __AVX__ - __m256 rValueUpdateGate; - __m256 rValueFrameState; - __m256 rPrevOut = _mm256_set1_ps(0.0f); - __m256 rOutput; - __m256 *updateGate = (__m256 *)gateValue; - __m256 *frameState = (__m256 *)(gateValue + frameSize * 2); - - for (int i = 0; i < frameSize / 8; i++) { - rValueUpdateGate = updateGate[i]; - rValueFrameState = frameState[i]; - if (prevOutputValue) { - rPrevOut = ((__m256 *)prevOutputValue)[i]; + __m256 r_value_update_gate; + __m256 r_value_frame_state; + __m256 r_prev_out = _mm256_set1_ps(0.0f); + __m256 r_output; + __m256 *update_gate = (__m256 *)gate_value; + __m256 *frame_state = (__m256 *)(gate_value + frame_size * 2); + + for (int i = 0; i < frame_size / 8; i++) { + r_value_update_gate = update_gate[i]; + r_value_frame_state = frame_state[i]; + if (prev_output_value) { + r_prev_out = ((__m256 *)prev_output_value)[i]; 
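+    // Each __m256 register packs eight single-precision lanes, so this AVX
+    // path runs frame_size / 8 iterations; the dispatchers below only take
+    // it when frame_size is a multiple of 8 and sizeof(T) == 4.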
} - opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput, - active_node); + op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out, + r_output, active_node); - frameState[i] = rValueFrameState; - ((__m256 *)outputValue)[i] = rOutput; + frame_state[i] = r_value_frame_state; + ((__m256 *)output_value)[i] = r_output; } #endif } template -inline void forward_reset_output(OpResetOutput opResetOutput, - hl_gru_value value, int frameSize, - int batchSize, activation_mode_t active_gate) { - for (int b = 0; b < batchSize; b++) { - if (OpResetOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) { +inline void forward_reset_output(OpResetOutput op_reset_output, + hl_gru_value value, int frame_size, + int batch_size, + activation_mode_t active_gate) { + for (int b = 0; b < batch_size; b++) { + if (OpResetOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { hl_avx_gru_forward_reset_output( - opResetOutput, value.gateValue, value.resetOutputValue, - value.prevOutValue, frameSize, active_gate); + op_reset_output, value.gate_value, value.reset_output_value, + value.prev_out_value, frame_size, active_gate); } else { hl_naive_gru_forward_reset_output( - opResetOutput, value.gateValue, value.resetOutputValue, - value.prevOutValue, frameSize, active_gate); + op_reset_output, value.gate_value, value.reset_output_value, + value.prev_out_value, frame_size, active_gate); } - value.gateValue += frameSize * 3; - value.resetOutputValue += frameSize; - if (value.prevOutValue) { - value.prevOutValue += frameSize; + value.gate_value += frame_size * 3; + value.reset_output_value += frame_size; + if (value.prev_out_value) { + value.prev_out_value += frame_size; } } } template -inline void forward_final_output(OpFinalOutput opFinalOutput, - hl_gru_value value, int frameSize, - int batchSize, activation_mode_t active_node) { - for (int b = 0; b < batchSize; b++) { - if (OpFinalOutput::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) { - hl_avx_gru_forward_final_output(opFinalOutput, value.gateValue, - value.prevOutValue, value.outputValue, - frameSize, active_node); +inline void forward_final_output(OpFinalOutput op_final_output, + hl_gru_value value, int frame_size, + int batch_size, + activation_mode_t active_node) { + for (int b = 0; b < batch_size; b++) { + if (OpFinalOutput::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { + hl_avx_gru_forward_final_output(op_final_output, value.gate_value, + value.prev_out_value, value.output_value, + frame_size, active_node); } else { - hl_naive_gru_forward_final_output(opFinalOutput, value.gateValue, - value.prevOutValue, value.outputValue, - frameSize, active_node); + hl_naive_gru_forward_final_output( + op_final_output, value.gate_value, value.prev_out_value, + value.output_value, frame_size, active_node); } - value.gateValue += frameSize * 3; - value.outputValue += frameSize; - if (value.prevOutValue) { - value.prevOutValue += frameSize; + value.gate_value += frame_size * 3; + value.output_value += frame_size; + if (value.prev_out_value) { + value.prev_out_value += frame_size; } } } template -void hl_naive_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue, - T *gateGrad, T *prevOutValue, - T *prevOutGrad, T *outputGrad, - int frameSize, +void hl_naive_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *output_grad, + int frame_size, activation_mode_t active_node) { - T rUpdateGateValue; - T rUpdateGateGrad; - T rFrameStateValue; - T 
rFrameStateGrad; - T rOutGrad; - T rPrevOutValue = 0; - T rPrevOutGrad = 0; - T *updateGateValue = gateValue; - T *updateGateGrad = gateGrad; - T *frameStateValue = gateValue + frameSize * 2; - T *frameStateGrad = gateGrad + frameSize * 2; - - for (int i = 0; i < frameSize; i++) { - rUpdateGateValue = updateGateValue[i]; - rFrameStateValue = frameStateValue[i]; - rOutGrad = outputGrad[i]; - if (prevOutValue) { - rPrevOutValue = prevOutValue[i]; + T r_update_gate_value; + T r_update_gate_grad; + T r_frame_state_value; + T r_frame_state_grad; + T r_out_grad; + T r_prev_out_value = 0; + T r_prev_out_grad = 0; + T *update_gate_value = gate_value; + T *update_gate_grad = gate_grad; + T *frame_state_value = gate_value + frame_size * 2; + T *frame_state_grad = gate_grad + frame_size * 2; + + for (int i = 0; i < frame_size; i++) { + r_update_gate_value = update_gate_value[i]; + r_frame_state_value = frame_state_value[i]; + r_out_grad = output_grad[i]; + if (prev_out_value) { + r_prev_out_value = prev_out_value[i]; } - if (prevOutGrad) { - rPrevOutGrad = prevOutGrad[i]; + if (prev_out_grad) { + r_prev_out_grad = prev_out_grad[i]; } - opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue, - rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad, - active_node); + op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value, + r_frame_state_grad, r_prev_out_value, r_prev_out_grad, + r_out_grad, active_node); - updateGateGrad[i] = rUpdateGateGrad; - frameStateGrad[i] = rFrameStateGrad; - if (prevOutGrad) { - prevOutGrad[i] = rPrevOutGrad; + update_gate_grad[i] = r_update_gate_grad; + frame_state_grad[i] = r_frame_state_grad; + if (prev_out_grad) { + prev_out_grad[i] = r_prev_out_grad; } } } template -void hl_naive_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue, - T *gateGrad, T *prevOutValue, - T *prevOutGrad, T *resetOutputGrad, - int frameSize, +void hl_naive_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *reset_output_grad, + int frame_size, activation_mode_t active_gate) { - T rUpdateGateValue; - T rUpdateGateGrad; - T rResetGateValue; - T rResetGateGrad; - T rResetOutputGrad = 0; - T rPrevOutValue = 0; - T rPrevOutGrad = 0; - T *updateGateValue = gateValue; - T *updateGateGrad = gateGrad; - T *resetGateValue = gateValue + frameSize; - T *resetGateGrad = gateGrad + frameSize; - - for (int i = 0; i < frameSize; i++) { - rUpdateGateValue = updateGateValue[i]; - rUpdateGateGrad = updateGateGrad[i]; - rResetGateValue = resetGateValue[i]; - - if (prevOutValue && prevOutGrad) { - rResetOutputGrad = resetOutputGrad[i]; + T r_update_gate_value; + T r_update_gate_grad; + T r_reset_gate_value; + T r_reset_gate_grad; + T r_reset_output_grad = 0; + T r_prev_out_value = 0; + T r_prev_out_grad = 0; + T *update_gate_value = gate_value; + T *update_gate_grad = gate_grad; + T *reset_gate_value = gate_value + frame_size; + T *reset_gate_grad = gate_grad + frame_size; + + for (int i = 0; i < frame_size; i++) { + r_update_gate_value = update_gate_value[i]; + r_update_gate_grad = update_gate_grad[i]; + r_reset_gate_value = reset_gate_value[i]; + + if (prev_out_value && prev_out_grad) { + r_reset_output_grad = reset_output_grad[i]; } - if (prevOutValue) { - rPrevOutValue = prevOutValue[i]; + if (prev_out_value) { + r_prev_out_value = prev_out_value[i]; } - if (prevOutGrad) { - rPrevOutGrad = prevOutGrad[i]; + if (prev_out_grad) { + r_prev_out_grad = prev_out_grad[i]; } - opResetGrad(rUpdateGateValue, 
rUpdateGateGrad, rResetGateValue, - rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad, - active_gate); + op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value, + r_reset_gate_grad, r_prev_out_value, r_prev_out_grad, + r_reset_output_grad, active_gate); - updateGateGrad[i] = rUpdateGateGrad; - resetGateGrad[i] = rResetGateGrad; - if (prevOutGrad) { - prevOutGrad[i] = rPrevOutGrad; + update_gate_grad[i] = r_update_gate_grad; + reset_gate_grad[i] = r_reset_gate_grad; + if (prev_out_grad) { + prev_out_grad[i] = r_prev_out_grad; } } } template -void hl_avx_gru_backward_state_grad(OpStateGrad opStateGrad, T *gateValue, - T *gateGrad, T *prevOutValue, - T *prevOutGrad, T *outputGrad, - int frameSize, +void hl_avx_gru_backward_state_grad(OpStateGrad op_state_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *output_grad, + int frame_size, activation_mode_t active_node) { #ifdef __AVX__ - __m256 rUpdateGateValue; - __m256 rUpdateGateGrad; - __m256 rFrameStateValue; - __m256 rFrameStateGrad; - __m256 rOutGrad; - __m256 rPrevOutValue = _mm256_set1_ps(0.0f); - __m256 rPrevOutGrad = _mm256_set1_ps(0.0f); - __m256 *updateGateValue = (__m256 *)gateValue; - __m256 *updateGateGrad = (__m256 *)gateGrad; - __m256 *frameStateValue = (__m256 *)(gateValue + frameSize * 2); - __m256 *frameStateGrad = (__m256 *)(gateGrad + frameSize * 2); - - for (int i = 0; i < frameSize / 8; i++) { - rUpdateGateValue = updateGateValue[i]; - rFrameStateValue = frameStateValue[i]; - rOutGrad = ((__m256 *)outputGrad)[i]; - if (prevOutValue) { - rPrevOutValue = ((__m256 *)prevOutValue)[i]; + __m256 r_update_gate_value; + __m256 r_update_gate_grad; + __m256 r_frame_state_value; + __m256 r_frame_state_grad; + __m256 r_out_grad; + __m256 r_prev_out_value = _mm256_set1_ps(0.0f); + __m256 r_prev_out_grad = _mm256_set1_ps(0.0f); + __m256 *update_gate_value = (__m256 *)gate_value; + __m256 *update_gate_grad = (__m256 *)gate_grad; + __m256 *frame_state_value = (__m256 *)(gate_value + frame_size * 2); + __m256 *frame_state_grad = (__m256 *)(gate_grad + frame_size * 2); + + for (int i = 0; i < frame_size / 8; i++) { + r_update_gate_value = update_gate_value[i]; + r_frame_state_value = frame_state_value[i]; + r_out_grad = ((__m256 *)output_grad)[i]; + if (prev_out_value) { + r_prev_out_value = ((__m256 *)prev_out_value)[i]; } - if (prevOutGrad) { - rPrevOutGrad = ((__m256 *)prevOutGrad)[i]; + if (prev_out_grad) { + r_prev_out_grad = ((__m256 *)prev_out_grad)[i]; } - opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue, - rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad, - active_node); + op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value, + r_frame_state_grad, r_prev_out_value, r_prev_out_grad, + r_out_grad, active_node); - updateGateGrad[i] = rUpdateGateGrad; - frameStateGrad[i] = rFrameStateGrad; - if (prevOutGrad) { - ((__m256 *)prevOutGrad)[i] = rPrevOutGrad; + update_gate_grad[i] = r_update_gate_grad; + frame_state_grad[i] = r_frame_state_grad; + if (prev_out_grad) { + ((__m256 *)prev_out_grad)[i] = r_prev_out_grad; } } #endif } template -void hl_avx_gru_backward_reset_grad(OpResetGrad opResetGrad, T *gateValue, - T *gateGrad, T *prevOutValue, - T *prevOutGrad, T *resetOutputGrad, - int frameSize, +void hl_avx_gru_backward_reset_grad(OpResetGrad op_reset_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *reset_output_grad, + int frame_size, activation_mode_t active_gate) { #ifdef __AVX__ - __m256 
rUpdateGateValue; - __m256 rUpdateGateGrad; - __m256 rResetGateValue; - __m256 rResetGateGrad; - __m256 rResetOutputGrad = _mm256_set1_ps(0.0f); - __m256 rPrevOutValue = _mm256_set1_ps(0.0f); - __m256 rPrevOutGrad = _mm256_set1_ps(0.0f); - __m256 *updateGateValue = (__m256 *)gateValue; - __m256 *updateGateGrad = (__m256 *)gateGrad; - __m256 *resetGateValue = (__m256 *)(gateValue + frameSize); - __m256 *resetGateGrad = (__m256 *)(gateGrad + frameSize); - - for (int i = 0; i < frameSize / 8; i++) { - rUpdateGateValue = updateGateValue[i]; - rUpdateGateGrad = updateGateGrad[i]; - rResetGateValue = resetGateValue[i]; - - if (prevOutValue && prevOutGrad) { - rResetOutputGrad = ((__m256 *)resetOutputGrad)[i]; + __m256 r_update_gate_value; + __m256 r_update_gate_grad; + __m256 r_reset_gate_value; + __m256 r_reset_gate_grad; + __m256 r_reset_output_grad = _mm256_set1_ps(0.0f); + __m256 r_prev_out_value = _mm256_set1_ps(0.0f); + __m256 r_prev_out_grad = _mm256_set1_ps(0.0f); + __m256 *update_gate_value = (__m256 *)gate_value; + __m256 *update_gate_grad = (__m256 *)gate_grad; + __m256 *reset_gate_value = (__m256 *)(gate_value + frame_size); + __m256 *reset_gate_grad = (__m256 *)(gate_grad + frame_size); + + for (int i = 0; i < frame_size / 8; i++) { + r_update_gate_value = update_gate_value[i]; + r_update_gate_grad = update_gate_grad[i]; + r_reset_gate_value = reset_gate_value[i]; + + if (prev_out_value && prev_out_grad) { + r_reset_output_grad = ((__m256 *)reset_output_grad)[i]; } - if (prevOutValue) { - rPrevOutValue = ((__m256 *)prevOutValue)[i]; + if (prev_out_value) { + r_prev_out_value = ((__m256 *)prev_out_value)[i]; } - if (prevOutGrad) { - rPrevOutGrad = ((__m256 *)prevOutGrad)[i]; + if (prev_out_grad) { + r_prev_out_grad = ((__m256 *)prev_out_grad)[i]; } - opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue, - rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad, - active_gate); + op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value, + r_reset_gate_grad, r_prev_out_value, r_prev_out_grad, + r_reset_output_grad, active_gate); - updateGateGrad[i] = rUpdateGateGrad; - resetGateGrad[i] = rResetGateGrad; - if (prevOutGrad) { - ((__m256 *)prevOutGrad)[i] = rPrevOutGrad; + update_gate_grad[i] = r_update_gate_grad; + reset_gate_grad[i] = r_reset_gate_grad; + if (prev_out_grad) { + ((__m256 *)prev_out_grad)[i] = r_prev_out_grad; } } #endif } template -inline void backward_state_grad(OpStateGrad opStateGrad, hl_gru_value value, - hl_gru_grad grad, int frameSize, - int batchSize, activation_mode_t active_node) { - for (int b = 0; b < batchSize; b++) { - if (OpStateGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) { +inline void backward_state_grad(OpStateGrad op_state_grad, + hl_gru_value value, hl_gru_grad grad, + int frame_size, int batch_size, + activation_mode_t active_node) { + for (int b = 0; b < batch_size; b++) { + if (OpStateGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { hl_avx_gru_backward_state_grad( - opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue, - grad.prevOutGrad, grad.outputGrad, frameSize, active_node); + op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value, + grad.prev_out_grad, grad.output_grad, frame_size, active_node); } else { hl_naive_gru_backward_state_grad( - opStateGrad, value.gateValue, grad.gateGrad, value.prevOutValue, - grad.prevOutGrad, grad.outputGrad, frameSize, active_node); + op_state_grad, value.gate_value, grad.gate_grad, value.prev_out_value, + grad.prev_out_grad, 
grad.output_grad, frame_size, active_node); } - value.gateValue += frameSize * 3; - if (value.prevOutValue) { - value.prevOutValue += frameSize; + value.gate_value += frame_size * 3; + if (value.prev_out_value) { + value.prev_out_value += frame_size; } - grad.gateGrad += frameSize * 3; - grad.outputGrad += frameSize; - if (grad.prevOutGrad) { - grad.prevOutGrad += frameSize; + grad.gate_grad += frame_size * 3; + grad.output_grad += frame_size; + if (grad.prev_out_grad) { + grad.prev_out_grad += frame_size; } } } template -inline void backward_reset_grad(OpResetGrad opResetGrad, hl_gru_value value, - hl_gru_grad grad, int frameSize, - int batchSize, activation_mode_t active_gate) { - for (int b = 0; b < batchSize; b++) { - if (OpResetGrad::avx && !(frameSize & (8 - 1)) && (sizeof(T) == 4)) { +inline void backward_reset_grad(OpResetGrad op_reset_grad, + hl_gru_value value, hl_gru_grad grad, + int frame_size, int batch_size, + activation_mode_t active_gate) { + for (int b = 0; b < batch_size; b++) { + if (OpResetGrad::avx && !(frame_size & (8 - 1)) && (sizeof(T) == 4)) { hl_avx_gru_backward_reset_grad( - opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue, - grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate); + op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value, + grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate); } else { hl_naive_gru_backward_reset_grad( - opResetGrad, value.gateValue, grad.gateGrad, value.prevOutValue, - grad.prevOutGrad, grad.resetOutputGrad, frameSize, active_gate); + op_reset_grad, value.gate_value, grad.gate_grad, value.prev_out_value, + grad.prev_out_grad, grad.reset_output_grad, frame_size, active_gate); } - value.gateValue += frameSize * 3; - if (value.prevOutValue) { - value.prevOutValue += frameSize; + value.gate_value += frame_size * 3; + if (value.prev_out_value) { + value.prev_out_value += frame_size; } - grad.gateGrad += frameSize * 3; - grad.resetOutputGrad += frameSize; - if (grad.prevOutGrad) { - grad.prevOutGrad += frameSize; + grad.gate_grad += frame_size * 3; + grad.reset_output_grad += frame_size; + if (grad.prev_out_grad) { + grad.prev_out_grad += frame_size; } } } diff --git a/paddle/operators/math/detail/gru_gpu_kernel.h b/paddle/operators/math/detail/gru_gpu_kernel.h index 6441c648b048422c110872a85aa8cb719f11a8d7..d2edcb7f258b387530799b967fc0fff61acc5b83 100644 --- a/paddle/operators/math/detail/gru_gpu_kernel.h +++ b/paddle/operators/math/detail/gru_gpu_kernel.h @@ -27,174 +27,174 @@ namespace math { namespace detail { /* - * threads(framePerBlock, batchPerBlock) - * grid(frameBlocks, batchBlocks) + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) */ -template -__global__ void KeGruForwardResetOutput(OpResetOutput opResetOutput, - T *gateValue, T *resetOutputValue, - T *prevOutputValue, int frameSize, - int batchSize, +template +__global__ void KeGruForwardResetOutput(OpResetOutput op_reset_output, + T *gate_value, T *reset_output_value, + T *prev_output_value, int frame_size, + int batch_size, activation_mode_t active_gate) { - const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; - if (frameIdx >= frameSize) return; - - int batchIdx = 0; - if (isBatch) { - batchIdx = blockIdx.y * blockDim.y + threadIdx.y; - if (batchIdx >= batchSize) return; - gateValue += batchIdx * 3 * frameSize; - resetOutputValue += batchIdx * frameSize; + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + + int batch_idx = 0; 
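+  // One thread handles one frame element: per the comment above, kernels are
+  // launched with threads(frame_per_block, batch_per_block) and
+  // grid(frame_blocks, batch_blocks), so in batch mode the y-dimension walks
+  // the batch and each batch item owns a contiguous 3 * frame_size slice of
+  // gate_value.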
+ if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + reset_output_value += batch_idx * frame_size; } - T rPrevOut = 0; - T rValueResetOutput; - T rValueUpdateGate = gateValue[frameIdx + frameSize * 0]; - T rValueResetGate = gateValue[frameIdx + frameSize * 1]; + T r_prev_out = 0; + T r_value_reset_output; + T r_value_update_gate = gate_value[frame_idx + frame_size * 0]; + T r_value_reset_gate = gate_value[frame_idx + frame_size * 1]; - if (prevOutputValue) { - if (isBatch) prevOutputValue += batchIdx * frameSize; - rPrevOut = prevOutputValue[frameIdx]; + if (prev_output_value) { + if (is_batch) prev_output_value += batch_idx * frame_size; + r_prev_out = prev_output_value[frame_idx]; } - opResetOutput(rValueUpdateGate, rValueResetGate, rPrevOut, rValueResetOutput, - active_gate); + op_reset_output(r_value_update_gate, r_value_reset_gate, r_prev_out, + r_value_reset_output, active_gate); - gateValue[frameIdx + frameSize * 0] = rValueUpdateGate; - gateValue[frameIdx + frameSize * 1] = rValueResetGate; - resetOutputValue[frameIdx] = rValueResetOutput; + gate_value[frame_idx + frame_size * 0] = r_value_update_gate; + gate_value[frame_idx + frame_size * 1] = r_value_reset_gate; + reset_output_value[frame_idx] = r_value_reset_output; } /* - * threads(framePerBlock, batchPerBlock) - * grid(frameBlocks, batchBlocks) + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) */ -template -__global__ void KeGruForwardFinalOutput(OpFinalOutput opFinalOutput, - T *gateValue, T *prevOutputValue, - T *outputValue, int frameSize, - int batchSize, +template +__global__ void KeGruForwardFinalOutput(OpFinalOutput op_final_output, + T *gate_value, T *prev_output_value, + T *output_value, int frame_size, + int batch_size, activation_mode_t active_node) { - const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; - if (frameIdx >= frameSize) return; - int batchIdx = 0; - if (isBatch) { - batchIdx = blockIdx.y * blockDim.y + threadIdx.y; - if (batchIdx >= batchSize) return; - gateValue += batchIdx * 3 * frameSize; - outputValue += batchIdx * frameSize; + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + output_value += batch_idx * frame_size; } - T rOutput; - T rPrevOut = 0; - T rValueUpdateGate = gateValue[frameIdx + frameSize * 0]; - T rValueFrameState = gateValue[frameIdx + frameSize * 2]; + T r_output; + T r_prev_out = 0; + T r_value_update_gate = gate_value[frame_idx + frame_size * 0]; + T r_value_frame_state = gate_value[frame_idx + frame_size * 2]; - if (prevOutputValue) { - if (isBatch) prevOutputValue += batchIdx * frameSize; - rPrevOut = prevOutputValue[frameIdx]; + if (prev_output_value) { + if (is_batch) prev_output_value += batch_idx * frame_size; + r_prev_out = prev_output_value[frame_idx]; } - opFinalOutput(rValueUpdateGate, rValueFrameState, rPrevOut, rOutput, - active_node); + op_final_output(r_value_update_gate, r_value_frame_state, r_prev_out, + r_output, active_node); - gateValue[frameIdx + frameSize * 2] = rValueFrameState; - outputValue[frameIdx] = rOutput; + gate_value[frame_idx + frame_size * 2] = r_value_frame_state; + output_value[frame_idx] = r_output; } /* - * threads(framePerBlock, batchPerBlock) - * grid(frameBlocks, batchBlocks) + 
* threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) */ -template -__global__ void KeGruBackwardStateGrad(OpStateGrad opStateGrad, T *gateValue, - T *gateGrad, T *prevOutValue, - T *prevOutGrad, T *outputGrad, - int frameSize, int batchSize, +template +__global__ void KeGruBackwardStateGrad(OpStateGrad op_state_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *output_grad, + int frame_size, int batch_size, activation_mode_t active_node) { - const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; - if (frameIdx >= frameSize) return; - int batchIdx = 0; - if (isBatch) { - batchIdx = blockIdx.y * blockDim.y + threadIdx.y; - if (batchIdx >= batchSize) return; - gateValue += batchIdx * 3 * frameSize; - gateGrad += batchIdx * 3 * frameSize; - outputGrad += batchIdx * frameSize; + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + gate_grad += batch_idx * 3 * frame_size; + output_grad += batch_idx * frame_size; } - T rUpdateGateGrad; - T rFrameStateGrad; - T rPrevOutValue = 0; - T rPrevOutGrad = 0; - T rUpdateGateValue = gateValue[frameIdx + frameSize * 0]; - T rFrameStateValue = gateValue[frameIdx + frameSize * 2]; - T rOutGrad = outputGrad[frameIdx]; + T r_update_gate_grad; + T r_frame_state_grad; + T r_prev_out_value = 0; + T r_prev_out_grad = 0; + T r_update_gate_value = gate_value[frame_idx + frame_size * 0]; + T r_frame_state_value = gate_value[frame_idx + frame_size * 2]; + T r_out_grad = output_grad[frame_idx]; - if (prevOutValue && prevOutGrad) { - if (isBatch) prevOutValue += batchIdx * frameSize; - rPrevOutValue = prevOutValue[frameIdx]; + if (prev_out_value && prev_out_grad) { + if (is_batch) prev_out_value += batch_idx * frame_size; + r_prev_out_value = prev_out_value[frame_idx]; - if (isBatch) prevOutGrad += batchIdx * frameSize; - rPrevOutGrad = prevOutGrad[frameIdx]; + if (is_batch) prev_out_grad += batch_idx * frame_size; + r_prev_out_grad = prev_out_grad[frame_idx]; } - opStateGrad(rUpdateGateValue, rUpdateGateGrad, rFrameStateValue, - rFrameStateGrad, rPrevOutValue, rPrevOutGrad, rOutGrad, - active_node); + op_state_grad(r_update_gate_value, r_update_gate_grad, r_frame_state_value, + r_frame_state_grad, r_prev_out_value, r_prev_out_grad, + r_out_grad, active_node); - gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad; - gateGrad[frameIdx + frameSize * 2] = rFrameStateGrad; - if (prevOutGrad) { - prevOutGrad[frameIdx] = rPrevOutGrad; + gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad; + gate_grad[frame_idx + frame_size * 2] = r_frame_state_grad; + if (prev_out_grad) { + prev_out_grad[frame_idx] = r_prev_out_grad; } } /* - * threads(framePerBlock, batchPerBlock) - * grid(frameBlocks, batchBlocks) + * threads(frame_per_block, batch_per_block) + * grid(frame_blocks, batch_blocks) */ -template -__global__ void KeGruBackwardResetGrad(OpResetGrad opResetGrad, T *gateValue, - T *gateGrad, T *prevOutValue, - T *prevOutGrad, T *resetOutputGrad, - int frameSize, int batchSize, +template +__global__ void KeGruBackwardResetGrad(OpResetGrad op_reset_grad, T *gate_value, + T *gate_grad, T *prev_out_value, + T *prev_out_grad, T *reset_output_grad, + int frame_size, int batch_size, activation_mode_t active_gate) { - const int frameIdx = blockIdx.x * blockDim.x + threadIdx.x; - if (frameIdx >= 
frameSize) return; - int batchIdx = 0; - if (isBatch) { - batchIdx = blockIdx.y * blockDim.y + threadIdx.y; - if (batchIdx >= batchSize) return; - gateValue += batchIdx * 3 * frameSize; - gateGrad += batchIdx * 3 * frameSize; - resetOutputGrad += batchIdx * frameSize; + const int frame_idx = blockIdx.x * blockDim.x + threadIdx.x; + if (frame_idx >= frame_size) return; + int batch_idx = 0; + if (is_batch) { + batch_idx = blockIdx.y * blockDim.y + threadIdx.y; + if (batch_idx >= batch_size) return; + gate_value += batch_idx * 3 * frame_size; + gate_grad += batch_idx * 3 * frame_size; + reset_output_grad += batch_idx * frame_size; } - T rResetGateGrad; - T rPrevOutValue = 0; - T rPrevOutGrad = 0; - T rResetOutputGrad = 0; - T rUpdateGateValue = gateValue[frameIdx + frameSize * 0]; - T rUpdateGateGrad = gateGrad[frameIdx + frameSize * 0]; - T rResetGateValue = gateValue[frameIdx + frameSize * 1]; - - if (prevOutValue && prevOutGrad) { - if (isBatch) prevOutValue += batchIdx * frameSize; - if (isBatch) prevOutGrad += batchIdx * frameSize; - rPrevOutValue = prevOutValue[frameIdx]; - rPrevOutGrad = prevOutGrad[frameIdx]; - rResetOutputGrad = resetOutputGrad[frameIdx]; + T r_reset_gate_grad; + T r_prev_out_value = 0; + T r_prev_out_grad = 0; + T r_reset_output_grad = 0; + T r_update_gate_value = gate_value[frame_idx + frame_size * 0]; + T r_update_gate_grad = gate_grad[frame_idx + frame_size * 0]; + T r_reset_gate_value = gate_value[frame_idx + frame_size * 1]; + + if (prev_out_value && prev_out_grad) { + if (is_batch) prev_out_value += batch_idx * frame_size; + if (is_batch) prev_out_grad += batch_idx * frame_size; + r_prev_out_value = prev_out_value[frame_idx]; + r_prev_out_grad = prev_out_grad[frame_idx]; + r_reset_output_grad = reset_output_grad[frame_idx]; } - opResetGrad(rUpdateGateValue, rUpdateGateGrad, rResetGateValue, - rResetGateGrad, rPrevOutValue, rPrevOutGrad, rResetOutputGrad, - active_gate); + op_reset_grad(r_update_gate_value, r_update_gate_grad, r_reset_gate_value, + r_reset_gate_grad, r_prev_out_value, r_prev_out_grad, + r_reset_output_grad, active_gate); - gateGrad[frameIdx + frameSize * 0] = rUpdateGateGrad; - gateGrad[frameIdx + frameSize * 1] = rResetGateGrad; - if (prevOutGrad) { - prevOutGrad[frameIdx] = rPrevOutGrad; + gate_grad[frame_idx + frame_size * 0] = r_update_gate_grad; + gate_grad[frame_idx + frame_size * 1] = r_reset_gate_grad; + if (prev_out_grad) { + prev_out_grad[frame_idx] = r_prev_out_grad; } } } // namespace detail diff --git a/paddle/operators/math/detail/gru_kernel.h b/paddle/operators/math/detail/gru_kernel.h index 8a681d8d8bced72e1296f863489f6ccbc7913167..acd84be01db9ddaf06d165d8be353b253f324dd2 100644 --- a/paddle/operators/math/detail/gru_kernel.h +++ b/paddle/operators/math/detail/gru_kernel.h @@ -28,23 +28,25 @@ namespace forward { template class gru_resetOutput { public: - HOSTDEVICE void operator()(T &valueUpdateGate, T &valueResetGate, T &prevOut, - T &valueResetOutput, activation_mode_t actGate) { - valueUpdateGate = activation(valueUpdateGate, actGate); - valueResetGate = activation(valueResetGate, actGate); - valueResetOutput = prevOut * valueResetGate; + HOSTDEVICE void operator()(T &value_update_gate, T &value_reset_gate, + T &prev_out, T &value_reset_output, + activation_mode_t act_gate) { + value_update_gate = activation(value_update_gate, act_gate); + value_reset_gate = activation(value_reset_gate, act_gate); + value_reset_output = prev_out * value_reset_gate; } #ifndef __NVCC__ #ifndef __AVX__ static const bool avx = false; #else static 
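The state-gradient kernel above is the chain rule applied to that final-output formula. A hedged one-element sketch, again assuming a tanh candidate:

// Hypothetical mirror of KeGruBackwardStateGrad for one element.
// From h = h_prev - z * h_prev + z * s and incoming dh = dL/dh:
//   dz       = dh * (s - h_prev)
//   dh_prev += dh * (1 - z)
//   ds       = dh * z, then through the activation's derivative
void GruStateGradReference(float z, float s, float h_prev, float dh,
                           float* dz, float* dh_prev, float* ds_preact) {
  *dz = dh * s - dh * h_prev;  // grad_update_gate in the kernel
  *dh_prev += dh - dh * z;     // grad_prev_out accumulation
  // The kernel folds the derivative in via activation(grad, value, act);
  // for a tanh candidate, s = tanh(.) gives tanh' = 1 - s * s.
  *ds_preact = dh * z * (1.0f - s * s);
}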
const bool avx = true; - HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueResetGate, - __m256 &prevOut, __m256 &valueResetOutput, - activation_mode_t actGate) { - valueUpdateGate = activation(valueUpdateGate, actGate); - valueResetGate = activation(valueResetGate, actGate); - valueResetOutput = _mm256_mul_ps(prevOut, valueResetGate); + HOSTDEVICE void operator()(__m256 &value_update_gate, + __m256 &value_reset_gate, __m256 &prev_out, + __m256 &value_reset_output, + activation_mode_t act_gate) { + value_update_gate = activation(value_update_gate, act_gate); + value_reset_gate = activation(value_reset_gate, act_gate); + value_reset_output = _mm256_mul_ps(prev_out, value_reset_gate); } #endif #endif @@ -53,24 +55,26 @@ class gru_resetOutput { template class gru_finalOutput { public: - HOSTDEVICE void operator()(T &valueUpdateGate, T &valueFrameState, T &prevOut, - T &valueOutput, activation_mode_t actInput) { - valueFrameState = activation(valueFrameState, actInput); - valueOutput = prevOut - (valueUpdateGate * prevOut) + - (valueUpdateGate * valueFrameState); + HOSTDEVICE void operator()(T &value_update_gate, T &value_frame_state, + T &prev_out, T &value_output, + activation_mode_t act_input) { + value_frame_state = activation(value_frame_state, act_input); + value_output = prev_out - (value_update_gate * prev_out) + + (value_update_gate * value_frame_state); } #ifndef __NVCC__ #ifndef __AVX__ static const bool avx = false; #else static const bool avx = true; - HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &valueFrameState, - __m256 &prevOut, __m256 &valueOutput, - activation_mode_t actInput) { - valueFrameState = activation(valueFrameState, actInput); - valueOutput = _mm256_add_ps( - _mm256_sub_ps(prevOut, _mm256_mul_ps(valueUpdateGate, prevOut)), - _mm256_mul_ps(valueUpdateGate, valueFrameState)); + HOSTDEVICE void operator()(__m256 &value_update_gate, + __m256 &value_frame_state, __m256 &prev_out, + __m256 &value_output, + activation_mode_t act_input) { + value_frame_state = activation(value_frame_state, act_input); + value_output = _mm256_add_ps( + _mm256_sub_ps(prev_out, _mm256_mul_ps(value_update_gate, prev_out)), + _mm256_mul_ps(value_update_gate, value_frame_state)); } #endif #endif @@ -82,34 +86,37 @@ namespace backward { template class gru_stateGrad { public: - HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate, - T &valueFrameState, T &gradFrameState, - T &valuePrevOut, T &gradPrevOut, T &gradOutput, - activation_mode_t actInput) { - gradUpdateGate = (gradOutput * valueFrameState); - gradUpdateGate -= (gradOutput * valuePrevOut); - gradPrevOut -= (gradOutput * valueUpdateGate); - gradPrevOut += gradOutput; - gradFrameState = - activation(gradOutput * valueUpdateGate, valueFrameState, actInput); + HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate, + T &value_frame_state, T &grad_frame_state, + T &value_prev_out, T &grad_prev_out, + T &grad_output, activation_mode_t act_input) { + grad_update_gate = (grad_output * value_frame_state); + grad_update_gate -= (grad_output * value_prev_out); + grad_prev_out -= (grad_output * value_update_gate); + grad_prev_out += grad_output; + grad_frame_state = activation(grad_output * value_update_gate, + value_frame_state, act_input); } #ifndef __NVCC__ #ifndef __AVX__ static const bool avx = false; #else static const bool avx = true; - HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate, - __m256 &valueFrameState, __m256 &gradFrameState, - __m256 &valuePrevOut, 
__m256 &gradPrevOut, - __m256 &gradOutput, activation_mode_t actInput) { - gradUpdateGate = _mm256_mul_ps(gradOutput, valueFrameState); - gradUpdateGate = - _mm256_sub_ps(gradUpdateGate, _mm256_mul_ps(gradOutput, valuePrevOut)); - gradPrevOut = _mm256_add_ps( - _mm256_sub_ps(gradPrevOut, _mm256_mul_ps(gradOutput, valueUpdateGate)), - gradOutput); - gradFrameState = activation(_mm256_mul_ps(gradOutput, valueUpdateGate), - valueFrameState, actInput); + HOSTDEVICE void operator()(__m256 &value_update_gate, + __m256 &grad_update_gate, + __m256 &value_frame_state, + __m256 &grad_frame_state, __m256 &value_prev_out, + __m256 &grad_prev_out, __m256 &grad_output, + activation_mode_t act_input) { + grad_update_gate = _mm256_mul_ps(grad_output, value_frame_state); + grad_update_gate = _mm256_sub_ps( + grad_update_gate, _mm256_mul_ps(grad_output, value_prev_out)); + grad_prev_out = _mm256_add_ps( + _mm256_sub_ps(grad_prev_out, + _mm256_mul_ps(grad_output, value_update_gate)), + grad_output); + grad_frame_state = activation(_mm256_mul_ps(grad_output, value_update_gate), + value_frame_state, act_input); } #endif #endif @@ -118,30 +125,32 @@ class gru_stateGrad { template class gru_resetGrad { public: - HOSTDEVICE void operator()(T &valueUpdateGate, T &gradUpdateGate, - T &valueResetGate, T &gradResetGate, - T &valuePrevOut, T &gradPrevOut, - T &gradResetOutput, activation_mode_t actGate) { - gradResetGate = (gradResetOutput * valuePrevOut); - gradPrevOut += (gradResetOutput * valueResetGate); - gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate); - gradResetGate = activation(gradResetGate, valueResetGate, actGate); + HOSTDEVICE void operator()(T &value_update_gate, T &grad_update_gate, + T &value_reset_gate, T &grad_reset_gate, + T &value_prev_out, T &grad_prev_out, + T &grad_reset_output, activation_mode_t act_gate) { + grad_reset_gate = (grad_reset_output * value_prev_out); + grad_prev_out += (grad_reset_output * value_reset_gate); + grad_update_gate = + activation(grad_update_gate, value_update_gate, act_gate); + grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate); } #ifndef __NVCC__ #ifndef __AVX__ static const bool avx = false; #else static const bool avx = true; - HOSTDEVICE void operator()(__m256 &valueUpdateGate, __m256 &gradUpdateGate, - __m256 &valueResetGate, __m256 &gradResetGate, - __m256 &valuePrevOut, __m256 &gradPrevOut, - __m256 &gradResetOutput, - activation_mode_t actGate) { - gradResetGate = _mm256_mul_ps(gradResetOutput, valuePrevOut); - gradPrevOut = _mm256_add_ps(gradPrevOut, - _mm256_mul_ps(gradResetOutput, valueResetGate)); - gradUpdateGate = activation(gradUpdateGate, valueUpdateGate, actGate); - gradResetGate = activation(gradResetGate, valueResetGate, actGate); + HOSTDEVICE void operator()(__m256 &value_update_gate, + __m256 &grad_update_gate, __m256 &value_reset_gate, + __m256 &grad_reset_gate, __m256 &value_prev_out, + __m256 &grad_prev_out, __m256 &grad_reset_output, + activation_mode_t act_gate) { + grad_reset_gate = _mm256_mul_ps(grad_reset_output, value_prev_out); + grad_prev_out = _mm256_add_ps( + grad_prev_out, _mm256_mul_ps(grad_reset_output, value_reset_gate)); + grad_update_gate = + activation(grad_update_gate, value_update_gate, act_gate); + grad_reset_gate = activation(grad_reset_gate, value_reset_gate, act_gate); } #endif #endif diff --git a/paddle/operators/math/gru_compute.cc b/paddle/operators/math/gru_compute.cc index 125af449d3f700e24be5e4b7615c3b0e03fd4e5b..ae4e47b014a9cd1f656dd9332086aa4d1b7cbb52 100644 --- 
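gru_resetGrad closes the loop: differentiating reset_out = h_prev * r gives the two products below, after which both gate gradients pass through the gate activation's derivative. A hedged sketch assuming sigmoid gates, where sigmoid' = y * (1 - y) in terms of the activated value y:

// Hypothetical mirror of gru_resetGrad for one element.
void GruResetGradReference(float z, float r, float h_prev, float dz,
                           float dreset, float* dz_preact, float* dr_preact,
                           float* dh_prev) {
  float dr = dreset * h_prev;        // grad_reset_gate before activation
  *dh_prev += dreset * r;            // grad_prev_out accumulation
  *dz_preact = dz * z * (1.0f - z);  // update-gate grad through sigmoid'
  *dr_preact = dr * r * (1.0f - r);  // reset-gate grad through sigmoid'
}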
a/paddle/operators/math/gru_compute.cc +++ b/paddle/operators/math/gru_compute.cc @@ -21,29 +21,29 @@ namespace math { template struct GRUUnitFunctor { static void compute(const platform::DeviceContext &context, - hl_gru_value value, int frameSize, int batchSize, + hl_gru_value value, int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate) { #ifndef __NVCC__ - if (value.prevOutValue) { + if (value.prev_out_value) { math::gemm( - context, false, false, batchSize, frameSize * 2, frameSize, 1, - value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1, - value.gateValue, frameSize * 3); + context, false, false, batch_size, frame_size * 2, frame_size, 1, + value.prev_out_value, frame_size, value.gate_weight, frame_size * 2, + 1, value.gate_value, frame_size * 3); } detail::forward_reset_output(detail::forward::gru_resetOutput(), value, - frameSize, batchSize, active_gate); + frame_size, batch_size, active_gate); - if (value.prevOutValue) { + if (value.prev_out_value) { math::gemm( - context, false, false, batchSize, frameSize, frameSize, 1, - value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1, - value.gateValue + frameSize * 2, frameSize * 3); + context, false, false, batch_size, frame_size, frame_size, 1, + value.reset_output_value, frame_size, value.state_weight, frame_size, + 1, value.gate_value + frame_size * 2, frame_size * 3); } detail::forward_final_output(detail::forward::gru_finalOutput(), value, - frameSize, batchSize, active_node); + frame_size, batch_size, active_node); #endif } }; @@ -51,41 +51,43 @@ struct GRUUnitFunctor { template struct GRUUnitGradFunctor { static void compute(const platform::DeviceContext &context, - hl_gru_value value, hl_gru_grad grad, int frameSize, - int batchSize, activation_mode_t active_node, + hl_gru_value value, hl_gru_grad grad, + int frame_size, int batch_size, + activation_mode_t active_node, activation_mode_t active_gate) { #ifndef __NVCC__ detail::backward_state_grad(detail::backward::gru_stateGrad(), value, - grad, frameSize, batchSize, active_node); + grad, frame_size, batch_size, active_node); - if (value.prevOutValue && grad.prevOutGrad) { + if (value.prev_out_value && grad.prev_out_grad) { math::gemm( - context, false, true, batchSize, frameSize, frameSize, 1, - grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight, - frameSize, 0, grad.resetOutputGrad, frameSize); + context, false, true, batch_size, frame_size, frame_size, 1, + grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight, + frame_size, 0, grad.reset_output_grad, frame_size); - if (grad.stateWeightGrad) { + if (grad.state_weight_grad) { math::gemm( - context, true, false, frameSize, frameSize, batchSize, 1, - value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2, - frameSize * 3, 1, grad.stateWeightGrad, frameSize); + context, true, false, frame_size, frame_size, batch_size, 1, + value.reset_output_value, frame_size, + grad.gate_grad + frame_size * 2, frame_size * 3, 1, + grad.state_weight_grad, frame_size); } } detail::backward_reset_grad(detail::backward::gru_resetGrad(), value, - grad, frameSize, batchSize, active_gate); + grad, frame_size, batch_size, active_gate); - if (grad.prevOutGrad && value.prevOutValue) { + if (grad.prev_out_grad && value.prev_out_value) { math::gemm( - context, false, true, batchSize, frameSize, frameSize * 2, 1, - grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1, - grad.prevOutGrad, frameSize); + context, false, true, batch_size, 
frame_size, frame_size * 2, 1, + grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1, + grad.prev_out_grad, frame_size); - if (grad.gateWeightGrad) { + if (grad.gate_weight_grad) { math::gemm( - context, true, false, frameSize, frameSize * 2, batchSize, 1, - value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1, - grad.gateWeightGrad, frameSize * 2); + context, true, false, frame_size, frame_size * 2, batch_size, 1, + value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1, + grad.gate_weight_grad, frame_size * 2); } } #endif diff --git a/paddle/operators/math/gru_compute.cu b/paddle/operators/math/gru_compute.cu index 7b9e54ac029f6aa00553338435684097d6d02b25..0252bdbdb63fef2e4754057fc5b6d415cef0c29f 100644 --- a/paddle/operators/math/gru_compute.cu +++ b/paddle/operators/math/gru_compute.cu @@ -21,66 +21,66 @@ namespace math { template struct GRUUnitFunctor { static void compute(const platform::DeviceContext &context, - hl_gru_value value, int frameSize, int batchSize, + hl_gru_value value, int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate) { auto stream = reinterpret_cast(context).stream(); dim3 threads; dim3 grid; - if (batchSize == 1) { - int framePerBlock = frameSize <= 1024 ? frameSize : 1024; - int frameBlocks = (frameSize + 1024 - 1) / 1024; - threads = dim3(framePerBlock, 1); - grid = dim3(frameBlocks, 1); + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grid = dim3(frame_blocks, 1); } else { threads = dim3(32, 32); - grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); } - if (value.prevOutValue) { + if (value.prev_out_value) { math::gemm( - context, false, false, batchSize, frameSize * 2, frameSize, 1, - value.prevOutValue, frameSize, value.gateWeight, frameSize * 2, 1, - value.gateValue, frameSize * 3); + context, false, false, batch_size, frame_size * 2, frame_size, 1, + value.prev_out_value, frame_size, value.gate_weight, frame_size * 2, + 1, value.gate_value, frame_size * 3); } - if (batchSize == 1) { + if (batch_size == 1) { detail::KeGruForwardResetOutput, - /* isBatch= */ false, + /* is_batch= */ false, T><<>>( - detail::forward::gru_resetOutput(), value.gateValue, - value.resetOutputValue, value.prevOutValue, frameSize, batchSize, - active_gate); + detail::forward::gru_resetOutput(), value.gate_value, + value.reset_output_value, value.prev_out_value, frame_size, + batch_size, active_gate); } else { detail::KeGruForwardResetOutput, - /* isBatch= */ true, + /* is_batch= */ true, T><<>>( - detail::forward::gru_resetOutput(), value.gateValue, - value.resetOutputValue, value.prevOutValue, frameSize, batchSize, - active_gate); + detail::forward::gru_resetOutput(), value.gate_value, + value.reset_output_value, value.prev_out_value, frame_size, + batch_size, active_gate); } - if (value.prevOutValue) { + if (value.prev_out_value) { math::gemm( - context, false, false, batchSize, frameSize, frameSize, 1, - value.resetOutputValue, frameSize, value.stateWeight, frameSize, 1, - value.gateValue + frameSize * 2, frameSize * 3); + context, false, false, batch_size, frame_size, frame_size, 1, + value.reset_output_value, frame_size, value.state_weight, frame_size, + 1, value.gate_value + frame_size * 2, frame_size * 3); } - if (batchSize == 1) { + if (batch_size == 1) { 
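The gemm calls in these functors are easiest to read as explicit shapes. Here is a self-contained sketch of the leading-dimension trick they rely on: gate_value rows pack three frame_size slabs, so the forward product writes through ldc = 3 * frame_size while touching only the first two slabs.

// Minimal row-major gemm: C[M][N] += A[M][K] * B[K][N], each matrix
// addressed through an explicit row stride (leading dimension).
void NaiveGemm(int M, int N, int K, const float* A, int lda,
               const float* B, int ldb, float* C, int ldc) {
  for (int m = 0; m < M; ++m)
    for (int n = 0; n < N; ++n)
      for (int k = 0; k < K; ++k)
        C[m * ldc + n] += A[m * lda + k] * B[k * ldb + n];
}
// Mirroring the first forward call in GRUUnitFunctor:
//   NaiveGemm(batch_size, frame_size * 2, frame_size,
//             prev_out_value, frame_size, gate_weight, frame_size * 2,
//             gate_value, frame_size * 3);
// i.e. the update/reset pre-activations accumulate h_prev * gate_weight.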
detail::KeGruForwardFinalOutput, - /* isBatch= */ false, + /* is_batch= */ false, T><<>>( - detail::forward::gru_finalOutput(), value.gateValue, - value.prevOutValue, value.outputValue, frameSize, batchSize, + detail::forward::gru_finalOutput(), value.gate_value, + value.prev_out_value, value.output_value, frame_size, batch_size, active_node); } else { detail::KeGruForwardFinalOutput, - /* isBatch= */ true, + /* is_batch= */ true, T><<>>( - detail::forward::gru_finalOutput(), value.gateValue, - value.prevOutValue, value.outputValue, frameSize, batchSize, + detail::forward::gru_finalOutput(), value.gate_value, + value.prev_out_value, value.output_value, frame_size, batch_size, active_node); } } @@ -89,80 +89,82 @@ struct GRUUnitFunctor { template struct GRUUnitGradFunctor { static void compute(const platform::DeviceContext &context, - hl_gru_value value, hl_gru_grad grad, int frameSize, - int batchSize, activation_mode_t active_node, + hl_gru_value value, hl_gru_grad grad, + int frame_size, int batch_size, + activation_mode_t active_node, activation_mode_t active_gate) { auto stream = reinterpret_cast(context).stream(); dim3 threads; dim3 grid; - if (batchSize == 1) { - int framePerBlock = frameSize <= 1024 ? frameSize : 1024; - int frameBlocks = (frameSize + 1024 - 1) / 1024; - threads = dim3(framePerBlock, 1); - grid = dim3(frameBlocks, 1); + if (batch_size == 1) { + int frame_per_block = frame_size <= 1024 ? frame_size : 1024; + int frame_blocks = (frame_size + 1024 - 1) / 1024; + threads = dim3(frame_per_block, 1); + grid = dim3(frame_blocks, 1); } else { threads = dim3(32, 32); - grid = dim3((frameSize + 32 - 1) / 32, (batchSize + 32 - 1) / 32); + grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32); } - if (batchSize == 1) { + if (batch_size == 1) { detail::KeGruBackwardStateGrad< detail::backward::gru_stateGrad, - /* isBatch= */ false><<>>( - detail::backward::gru_stateGrad(), value.gateValue, grad.gateGrad, - value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize, - batchSize, active_node); + /* is_batch= */ false><<>>( + detail::backward::gru_stateGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.output_grad, frame_size, batch_size, active_node); } else { detail::KeGruBackwardStateGrad< detail::backward::gru_stateGrad, - /* isBatch= */ true><<>>( - detail::backward::gru_stateGrad(), value.gateValue, grad.gateGrad, - value.prevOutValue, grad.prevOutGrad, grad.outputGrad, frameSize, - batchSize, active_node); + /* is_batch= */ true><<>>( + detail::backward::gru_stateGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.output_grad, frame_size, batch_size, active_node); } - if (value.prevOutValue && grad.prevOutGrad) { + if (value.prev_out_value && grad.prev_out_grad) { math::gemm( - context, false, true, batchSize, frameSize, frameSize, 1, - grad.gateGrad + frameSize * 2, frameSize * 3, value.stateWeight, - frameSize, 0, grad.resetOutputGrad, frameSize); + context, false, true, batch_size, frame_size, frame_size, 1, + grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight, + frame_size, 0, grad.reset_output_grad, frame_size); - if (grad.stateWeightGrad) { + if (grad.state_weight_grad) { math::gemm( - context, true, false, frameSize, frameSize, batchSize, 1, - value.resetOutputValue, frameSize, grad.gateGrad + frameSize * 2, - frameSize * 3, 1, grad.stateWeightGrad, frameSize); + context, true, false, frame_size, frame_size, batch_size, 1, + value.reset_output_value, 
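Both CUDA functors pick their launch shape the same way; factored into a helper for illustration, with the logic copied from the code above:

#include <cuda_runtime.h>

// 1-D grid over frames when batch_size == 1, otherwise a 32x32 tile
// over (frame, batch), matching the is_batch template flag on the kernels.
inline void GruLaunchShape(int frame_size, int batch_size, dim3* threads,
                           dim3* grid) {
  if (batch_size == 1) {
    int frame_per_block = frame_size <= 1024 ? frame_size : 1024;
    int frame_blocks = (frame_size + 1024 - 1) / 1024;
    *threads = dim3(frame_per_block, 1);
    *grid = dim3(frame_blocks, 1);
  } else {
    *threads = dim3(32, 32);
    *grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
  }
}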
frame_size, + grad.gate_grad + frame_size * 2, frame_size * 3, 1, + grad.state_weight_grad, frame_size); } } - if (batchSize == 1) { + if (batch_size == 1) { detail::KeGruBackwardResetGrad< detail::backward::gru_resetGrad, - /* isBatch= */ false><<>>( - detail::backward::gru_resetGrad(), value.gateValue, grad.gateGrad, - value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize, - batchSize, active_gate); + /* is_batch= */ false><<>>( + detail::backward::gru_resetGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.reset_output_grad, frame_size, batch_size, active_gate); } else { detail::KeGruBackwardResetGrad< detail::backward::gru_resetGrad, - /* isBatch= */ true><<>>( - detail::backward::gru_resetGrad(), value.gateValue, grad.gateGrad, - value.prevOutValue, grad.prevOutGrad, grad.resetOutputGrad, frameSize, - batchSize, active_gate); + /* is_batch= */ true><<>>( + detail::backward::gru_resetGrad(), value.gate_value, + grad.gate_grad, value.prev_out_value, grad.prev_out_grad, + grad.reset_output_grad, frame_size, batch_size, active_gate); } - if (grad.prevOutGrad && value.prevOutValue) { + if (grad.prev_out_grad && value.prev_out_value) { math::gemm( - context, false, true, batchSize, frameSize, frameSize * 2, 1, - grad.gateGrad, frameSize * 3, value.gateWeight, frameSize * 2, 1, - grad.prevOutGrad, frameSize); + context, false, true, batch_size, frame_size, frame_size * 2, 1, + grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2, 1, + grad.prev_out_grad, frame_size); - if (grad.gateWeightGrad) { + if (grad.gate_weight_grad) { math::gemm( - context, true, false, frameSize, frameSize * 2, batchSize, 1, - value.prevOutValue, frameSize, grad.gateGrad, frameSize * 3, 1, - grad.gateWeightGrad, frameSize * 2); + context, true, false, frame_size, frame_size * 2, batch_size, 1, + value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3, 1, + grad.gate_weight_grad, frame_size * 2); } } } diff --git a/paddle/operators/math/gru_compute.h b/paddle/operators/math/gru_compute.h index 1475fb38104f353857dfd968e46af98a6d52c52a..58ea59f68e91c647a6b29ce3e8bc7e5d25db9b9b 100644 --- a/paddle/operators/math/gru_compute.h +++ b/paddle/operators/math/gru_compute.h @@ -22,28 +22,28 @@ namespace math { // TODO(guosheng): refine code style in gru_compute template struct hl_gru_value { - T *gateWeight; - T *stateWeight; - T *gateValue; - T *resetOutputValue; - T *outputValue; - T *prevOutValue; + T *gate_weight; + T *state_weight; + T *gate_value; + T *reset_output_value; + T *output_value; + T *prev_out_value; }; template struct hl_gru_grad { - T *gateWeightGrad; - T *stateWeightGrad; - T *gateGrad; - T *resetOutputGrad; - T *outputGrad; - T *prevOutGrad; + T *gate_weight_grad; + T *state_weight_grad; + T *gate_grad; + T *reset_output_grad; + T *output_grad; + T *prev_out_grad; }; template struct GRUUnitFunctor { static void compute(const platform::DeviceContext &context, - hl_gru_value value, int frameSize, int batchSize, + hl_gru_value value, int frame_size, int batch_size, activation_mode_t active_node, activation_mode_t active_gate); }; @@ -51,8 +51,9 @@ struct GRUUnitFunctor { template struct GRUUnitGradFunctor { static void compute(const platform::DeviceContext &context, - hl_gru_value value, hl_gru_grad grad, int frameSize, - int batchSize, activation_mode_t active_node, + hl_gru_value value, hl_gru_grad grad, + int frame_size, int batch_size, + activation_mode_t active_node, activation_mode_t active_gate); }; diff --git 
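The packed layouts declared in gru_compute.h are worth spelling out once. A hypothetical indexing helper, consistent with how the kernels offset into gate_value:

// gate_value is [batch, 3 * frame_size]: update gate, reset gate, then
// candidate state. gate_weight is [frame, 2 * frame] and state_weight
// is [frame, frame].
template <typename T>
inline T* GateSlab(T* gate_value, int batch_idx, int frame_size, int slab) {
  return gate_value + batch_idx * 3 * frame_size + slab * frame_size;
}
// slab 0: update gate row, slab 1: reset gate row, slab 2: candidate row.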
a/paddle/operators/math/im2col.h b/paddle/operators/math/im2col.h index deb60051beef56437cf75f0fa2cef90bbc0a209a..24fd9a06e9f5fbd50483429379cf3f46ff88bcaa 100644 --- a/paddle/operators/math/im2col.h +++ b/paddle/operators/math/im2col.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/framework/tensor.h" +#include "paddle/framework/tensor_util.h" #include "paddle/platform/device_context.h" namespace paddle { diff --git a/paddle/operators/math/im2col_test.cc b/paddle/operators/math/im2col_test.cc index 10c28da72ba9d3b94bb59c5cf00e7f5a2f28fd06..ae197a97ed8aa089b51be77a59a8ba6a98ac70ec 100644 --- a/paddle/operators/math/im2col_test.cc +++ b/paddle/operators/math/im2col_test.cc @@ -74,7 +74,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - input.CopyFrom(input_tmp, *place, *context); + CopyFrom(input_tmp, *place, *context, &input); } output_cfo.mutable_data( {1, filter_size, filter_size, output_height, output_width}, *place); @@ -99,7 +99,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { out_cfo_ptr = output_cfo.data(); } else { - output_tmp.CopyFrom(output_cfo, paddle::platform::CPUPlace(), *context); + CopyFrom(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp); out_cfo_ptr = output_tmp.data(); } for (int i = 0; i < 6; ++i) { @@ -110,7 +110,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { out_ocf_ptr = output_ocf.data(); } else { - output_tmp.CopyFrom(output_ocf, paddle::platform::CPUPlace(), *context); + CopyFrom(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp); out_ocf_ptr = output_tmp.data(); } for (int i = 0; i < 6; ++i) { @@ -130,7 +130,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - input.CopyFrom(input_tmp, *place, *context); + CopyFrom(input_tmp, *place, *context, &input); } col2im(*context, output_cfo, dilation, stride, padding, &input); @@ -139,7 +139,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context); + CopyFrom(input, paddle::platform::CPUPlace(), *context, &input_tmp); in_ptr = input_tmp.data(); } for (int i = 0; i < 6; ++i) { @@ -151,7 +151,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - input.CopyFrom(input_tmp, *place, *context); + CopyFrom(input_tmp, *place, *context, &input); } col2im_ocf(*context, output_ocf, dilation, stride, padding, &input); @@ -159,7 +159,7 @@ void testIm2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context); + CopyFrom(input, paddle::platform::CPUPlace(), *context, &input_tmp); in_ptr = input_tmp.data(); } for (int i = 0; i < 6; ++i) { diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h index ffb99f53808c4316ede96b04e57aec4dae4134de..5a42854f22234629b3405ec2397143ef761a9d08 100644 --- a/paddle/operators/math/math_function.h +++ b/paddle/operators/math/math_function.h @@ -49,6 +49,7 @@ int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda, #include "paddle/framework/eigen.h" #include "paddle/framework/tensor.h" +#include "paddle/framework/tensor_util.h" #include "paddle/platform/device_context.h" #include "paddle/platform/enforce.h" diff --git a/paddle/operators/math/math_function_test.cu 
b/paddle/operators/math/math_function_test.cu index 780d17ffc6539c5f4d67ebab5476d6f646840b41..d5d6f0c73bc6bce7a74db2c98fa9f884a0bcd9a2 100644 --- a/paddle/operators/math/math_function_test.cu +++ b/paddle/operators/math/math_function_test.cu @@ -16,15 +16,15 @@ TEST(math_function, notrans_mul_trans) { auto* gpu_place = new paddle::platform::GPUPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - input1_gpu.CopyFrom(input1, *gpu_place, context); - input2_gpu.CopyFrom(input1, *gpu_place, context); + paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); + paddle::framework::CopyFrom(input1, *gpu_place, context, &input2_gpu); out_gpu.mutable_data({2, 2}, *gpu_place); paddle::operators::math::matmul( context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0); - out.CopyFrom(out_gpu, *cpu_place, context); + paddle::framework::CopyFrom(out_gpu, *cpu_place, context, &out); float* out_ptr = out.data(); context.Wait(); @@ -50,15 +50,15 @@ TEST(math_function, trans_mul_notrans) { auto* gpu_place = new paddle::platform::GPUPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - input1_gpu.CopyFrom(input1, *gpu_place, context); - input2_gpu.CopyFrom(input1, *gpu_place, context); + paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); + paddle::framework::CopyFrom(input1, *gpu_place, context, &input2_gpu); out_gpu.mutable_data({3, 3}, *gpu_place); paddle::operators::math::matmul( context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0); - out.CopyFrom(out_gpu, *cpu_place, context); + paddle::framework::CopyFrom(out_gpu, *cpu_place, context, &out); float* out_ptr = out.data(); context.Wait(); @@ -99,9 +99,9 @@ TEST(math_function, gemm_notrans_cublas) { auto* gpu_place = new paddle::platform::GPUPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - input1_gpu.CopyFrom(input1, *gpu_place, context); - input2_gpu.CopyFrom(input2, *gpu_place, context); - input3_gpu.CopyFrom(input3, *gpu_place, context); + paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); + paddle::framework::CopyFrom(input2, *gpu_place, context, &input2_gpu); + paddle::framework::CopyFrom(input3, *gpu_place, context, &input3_gpu); float* a = input1_gpu.data(); float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(*gpu_place); @@ -109,7 +109,7 @@ TEST(math_function, gemm_notrans_cublas) { paddle::operators::math::gemm( context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); - input3.CopyFrom(input3_gpu, *cpu_place, context); + paddle::framework::CopyFrom(input3_gpu, *cpu_place, context, &input3); // numpy code: // a = np.arange(6).reshape(2, 3) @@ -154,9 +154,9 @@ TEST(math_function, gemm_trans_cublas) { auto* gpu_place = new paddle::platform::GPUPlace(0); paddle::platform::CUDADeviceContext context(*gpu_place); - input1_gpu.CopyFrom(input1, *gpu_place, context); - input2_gpu.CopyFrom(input2, *gpu_place, context); - input3_gpu.CopyFrom(input3, *gpu_place, context); + paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); + paddle::framework::CopyFrom(input2, *gpu_place, context, &input2_gpu); + paddle::framework::CopyFrom(input3, *gpu_place, context, &input3_gpu); float* a = input1_gpu.data(); float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(*gpu_place); @@ -164,7 +164,7 @@ TEST(math_function, gemm_trans_cublas) { paddle::operators::math::gemm( context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); - input3.CopyFrom(input3_gpu, *cpu_place, context); + 
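Every CopyFrom change in these tests is the same mechanical rewrite: the Tensor member function becomes a free function from paddle/framework/tensor_util.h, with the destination passed explicitly as the final out-parameter.

// Before: destination is the implicit object.
//   dst.CopyFrom(src, place, ctx);
// After: destination is an explicit out-parameter.
//   paddle::framework::CopyFrom(src, place, ctx, &dst);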
paddle::framework::CopyFrom(input3_gpu, *cpu_place, context, &input3); context.Wait(); EXPECT_EQ(input3_ptr[0], 0); @@ -205,14 +205,15 @@ void GemvTest(int m, int n, bool trans) { } paddle::platform::CUDADeviceContext context(*gpu_place); - g_mat_a.CopyFrom(mat_a, *gpu_place, context); - g_vec_b.CopyFrom(vec_b, *gpu_place, context); + paddle::framework::CopyFrom(mat_a, *gpu_place, context, &g_mat_a); + paddle::framework::CopyFrom(vec_b, *gpu_place, context, &g_vec_b); paddle::operators::math::gemv( context, trans, static_cast(m), static_cast(n), 1., g_data_a, g_data_b, 0., g_data_c); - vec_c.CopyFrom(g_vec_c, paddle::platform::CPUPlace(), context); + paddle::framework::CopyFrom(g_vec_c, paddle::platform::CPUPlace(), context, + &vec_c); if (!trans) { for (int i = 0; i < m; ++i) { diff --git a/paddle/operators/math/maxouting.cc b/paddle/operators/math/maxouting.cc index e5168ce7afd4139475afa6edd5999b9974407c9b..c9003962d33b70b8e21a0d6b78bf5a77981df409 100644 --- a/paddle/operators/math/maxouting.cc +++ b/paddle/operators/math/maxouting.cc @@ -23,8 +23,7 @@ template class MaxOutFunctor { public: void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - framework::Tensor * output, + const framework::Tensor& input, framework::Tensor* output, int groups) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; @@ -37,34 +36,30 @@ class MaxOutFunctor { T* output_data = output->mutable_data(context.GetPlace()); for (int i = 0; i < batch_size; ++i) { - int new_bindex = c_size * i; + int new_bindex = c_size * i; for (int c = 0; c < output_channels; ++c) { int new_cindex = fea_size * c; for (int f = 0; f < fea_size; ++f) { T ele = static_cast(-FLT_MAX); for (int ph = 0; ph < groups; ++ph) { - T x = input_data[(new_bindex + new_cindex) * groups - + ph * fea_size + f]; + T x = input_data[(new_bindex + new_cindex) * groups + + ph * fea_size + f]; ele = ele > x ? 
ele : x; } - output_data[(new_bindex+new_cindex+f)] = ele; + output_data[(new_bindex + new_cindex + f)] = ele; } } } } }; - - template class MaxOutGradFunctor { -public: + public: void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - framework::Tensor * input_grad, + const framework::Tensor& input, framework::Tensor* input_grad, const framework::Tensor& output, - const framework::Tensor& output_grad, - int groups) { + const framework::Tensor& output_grad, int groups) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -84,11 +79,11 @@ public: bool continue_match = true; int output_idx = blen + clen + f; for (int g = 0; g < groups && continue_match; ++g) { - int input_idx = input_idx0 + fea_size * g; - if (input_data[input_idx] == output_data[output_idx]) { - input_grad_data[input_idx] += output_grad_data[output_idx]; - continue_match = false; - } + int input_idx = input_idx0 + fea_size * g; + if (input_data[input_idx] == output_data[output_idx]) { + input_grad_data[input_idx] += output_grad_data[output_idx]; + continue_match = false; + } } } } diff --git a/paddle/operators/math/maxouting.cu b/paddle/operators/math/maxouting.cu index 7c698577b8a8258a58ba9a2b6c675457b2458a5b..c3fabcae081e24d92d50d0e2a2cad4a2e9872125 100644 --- a/paddle/operators/math/maxouting.cu +++ b/paddle/operators/math/maxouting.cu @@ -21,9 +21,9 @@ namespace math { template __global__ void KernelMaxOut(const int nthreads, const T* input_data, - const int channels, - const int input_height, const int input_width, - int groups, T* output_data ) { + const int channels, const int input_height, + const int input_width, int groups, + T* output_data) { const int size = input_height * input_width * channels / groups; const int feat_len = input_height * input_width; int index = blockIdx.x * blockDim.x + threadIdx.x; @@ -34,7 +34,7 @@ __global__ void KernelMaxOut(const int nthreads, const T* input_data, int channel_idx = batch_offset / feat_len; int feat_idx = batch_offset % feat_len; int data_idx = - (batch_idx * size + channel_idx * feat_len) * groups + feat_idx; + (batch_idx * size + channel_idx * feat_len) * groups + feat_idx; T ele = static_cast(-FLT_MAX); for (int g = 0; g < groups; ++g) { T x = input_data[data_idx + g * feat_len]; @@ -44,34 +44,35 @@ __global__ void KernelMaxOut(const int nthreads, const T* input_data, } } template -__global__ void KernelMaxoutGrad( - const int nthreads, const T* input_data, const T* output_data, - const T* output_grad, T* input_grad, const int channels, - const int input_height, const int input_width, int groups) { - const int size = input_height * input_width * channels / groups; - const int feat_len = input_height * input_width; - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (int i = index; i < nthreads; i += offset) { - int batch_idx = i / size; - int batch_offset = i % size; - int channel_idx = batch_offset / feat_len; - int feat_idx = batch_offset % feat_len; - int data_idx = +__global__ void KernelMaxoutGrad(const int nthreads, const T* input_data, + const T* output_data, const T* output_grad, + T* input_grad, const int channels, + const int input_height, const int input_width, + int groups) { + const int size = input_height * input_width * channels / groups; + const int feat_len = input_height * input_width; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = 
index; i < nthreads; i += offset) { + int batch_idx = i / size; + int batch_offset = i % size; + int channel_idx = batch_offset / feat_len; + int feat_idx = batch_offset % feat_len; + int data_idx = (batch_idx * size + channel_idx * feat_len) * groups + feat_idx; - int max_index = -1; - bool continue_match = true; - for (int g = 0; g < groups && continue_match; ++g) { - if (input_data[data_idx + g * feat_len] == output_data[i]) { - max_index = data_idx + g * feat_len; - continue_match = false; - break; - } - } - if (max_index != -1) { - input_grad[max_index] += output_grad[index]; + int max_index = -1; + bool continue_match = true; + for (int g = 0; g < groups && continue_match; ++g) { + if (input_data[data_idx + g * feat_len] == output_data[i]) { + max_index = data_idx + g * feat_len; + continue_match = false; + break; } } + if (max_index != -1) { + input_grad[max_index] += output_grad[index]; + } + } } /* * All tensors are in NCHW format. @@ -80,7 +81,7 @@ template class MaxOutFunctor { public: void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, framework::Tensor * output, + const framework::Tensor& input, framework::Tensor* output, int groups) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; @@ -92,7 +93,7 @@ class MaxOutFunctor { const T* input_data = input.data(); T* output_data = output->mutable_data(context.GetPlace()); - int nthreads = output->numel(); + int nthreads = output->numel(); int blocks = (nthreads + 1024 - 1) / 1024; dim3 threads(1024, 1); dim3 grid(blocks, 1); @@ -101,8 +102,7 @@ class MaxOutFunctor { T><<(context) .stream()>>>(nthreads, input_data, input_channels, - input_height, input_width, groups, - output_data); + input_height, input_width, groups, output_data); } }; /* @@ -112,11 +112,9 @@ template class MaxOutGradFunctor { public: void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - framework::Tensor * input_grad, + const framework::Tensor& input, framework::Tensor* input_grad, const framework::Tensor& output, - const framework::Tensor& output_grad, - int groups) { + const framework::Tensor& output_grad, int groups) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -129,7 +127,7 @@ class MaxOutGradFunctor { const T* output_data = output.data(); const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - int nthreads = output.numel(); + int nthreads = output.numel(); int blocks = (nthreads + 1024 - 1) / 1024; dim3 threads(1024, 1); dim3 grid(blocks, 1); @@ -137,9 +135,9 @@ class MaxOutGradFunctor { KernelMaxoutGrad< T><<(context) - .stream()>>>( - nthreads, input_data, output_data, output_grad_data, input_grad_data, - input_channels, input_height, input_width, groups); + .stream()>>>(nthreads, input_data, output_data, + output_grad_data, input_grad_data, input_channels, + input_height, input_width, groups); } }; diff --git a/paddle/operators/math/maxouting.h b/paddle/operators/math/maxouting.h index d4c9da38ab8f8d88ed461d805ae64a015db968c4..2d9069b0b3ca3e7bad3b21a46985c52ef00f50e6 100644 --- a/paddle/operators/math/maxouting.h +++ b/paddle/operators/math/maxouting.h @@ -21,15 +21,14 @@ namespace paddle { namespace operators { namespace math { -#define FLT_MAX \ - __FLT_MAX__ +#define FLT_MAX __FLT_MAX__ template class MaxOutFunctor { public: void operator()(const platform::DeviceContext& context, - const 
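For orientation, maxout over NCHW data takes, for output channel c, the elementwise max of input channels [c * groups, (c + 1) * groups). A small CPU reference consistent with the indexing in the functors above:

#include <algorithm>
#include <vector>

void MaxOutReference(const std::vector<float>& in, std::vector<float>* out,
                     int batch, int in_channels, int height, int width,
                     int groups) {
  int out_channels = in_channels / groups;
  int feat = height * width;
  out->assign(batch * out_channels * feat, 0.0f);
  for (int n = 0; n < batch; ++n)
    for (int c = 0; c < out_channels; ++c)
      for (int f = 0; f < feat; ++f) {
        // The groups feeding output channel c are stored contiguously.
        int base = (n * out_channels + c) * groups * feat + f;
        float best = in[base];
        for (int g = 1; g < groups; ++g)
          best = std::max(best, in[base + g * feat]);
        (*out)[(n * out_channels + c) * feat + f] = best;
      }
}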
framework::Tensor& input, framework::Tensor * output, + const framework::Tensor& input, framework::Tensor* output, int groups); }; @@ -37,8 +36,7 @@ template class MaxOutGradFunctor { public: void operator()(const platform::DeviceContext& context, - const framework::Tensor& input, - framework::Tensor * input_grad, + const framework::Tensor& input, framework::Tensor* input_grad, const framework::Tensor& output, const framework::Tensor& output_grad, int groups); }; diff --git a/paddle/operators/math/selected_rows_functor.cu b/paddle/operators/math/selected_rows_functor.cu index c40649e55ef93dec852ff6949b5cb134495e4ebf..c1dd323ba29e03e3ab4a3e4d7248388b408fb9d6 100644 --- a/paddle/operators/math/selected_rows_functor.cu +++ b/paddle/operators/math/selected_rows_functor.cu @@ -227,7 +227,6 @@ template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; - } // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/operators/math/selected_rows_functor_test.cu b/paddle/operators/math/selected_rows_functor_test.cu index 09de9dc53a1de9537b5109b3cc7cf9744f9c7908..7de9291c17d3f09a3c6076f00f2457f240e6f0af 100644 --- a/paddle/operators/math/selected_rows_functor_test.cu +++ b/paddle/operators/math/selected_rows_functor_test.cu @@ -67,7 +67,7 @@ TEST(selected_rows_functor, gpu_add) { EXPECT_EQ(out_rows[6], 9); Tensor out_cpu; - out_cpu.CopyFrom(*out_value, cpu_place, ctx); + CopyFrom(*out_value, cpu_place, ctx, &out_cpu); ctx.Wait(); auto* out_cpu_data = out_cpu.data(); @@ -94,7 +94,7 @@ TEST(selected_rows_functor, gpu_add) { add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); Tensor tensor2_cpu; - tensor2_cpu.CopyFrom(*tensor2, cpu_place, ctx); + CopyFrom(*tensor2, cpu_place, ctx, &tensor2_cpu); ctx.Wait(); auto* tensor2_cpu_data = tensor2_cpu.data(); @@ -167,7 +167,7 @@ TEST(selected_rows_functor, gpu_add_to) { EXPECT_EQ(out_rows[6], 9); Tensor out_cpu; - out_cpu.CopyFrom(*out_value, cpu_place, ctx); + CopyFrom(*out_value, cpu_place, ctx, &out_cpu); ctx.Wait(); auto* out_cpu_data = out_cpu.data(); @@ -191,7 +191,7 @@ TEST(selected_rows_functor, gpu_add_to) { add_to_tensor_functor(ctx, *output, tensor1.get()); Tensor tensor1_cpu; - tensor1_cpu.CopyFrom(*tensor1, cpu_place, ctx); + CopyFrom(*tensor1, cpu_place, ctx, &tensor1_cpu); ctx.Wait(); auto* tensor1_cpu_data = tensor1_cpu.data(); diff --git a/paddle/operators/math/unpooling.cc b/paddle/operators/math/unpooling.cc new file mode 100644 index 0000000000000000000000000000000000000000..b57d3dc1414cff492db8d7d503a7fce370a3f151 --- /dev/null +++ b/paddle/operators/math/unpooling.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/operators/math/unpooling.h" +namespace paddle { +namespace operators { +namespace math { +template +class Unpool2dMaxFunctor { + public: + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + int input_feasize = input_height * input_width; + int output_feasize = output_height * output_width; + const T* input_data = input.data(); + const int* indices_data = indices.data(); + T* output_data = output->mutable_data(context.GetPlace()); + for (int b = 0; b < batch_size; ++b) { + for (int c = 0; c < output_channels; ++c) { + for (int i = 0; i < input_feasize; ++i) { + int index = indices_data[i]; + PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); + output_data[index] = input_data[i]; + } + input_data += input_feasize; + indices_data += input_feasize; + output_data += output_feasize; + } + } + } +}; +template +class Unpool2dMaxGradFunctor { + public: + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + const framework::Tensor& output, + const framework::Tensor& output_grad, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + int input_feasize = input_height * input_width; + int output_feasize = output_height * output_width; + const int* indices_data = indices.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + + for (int b = 0; b < batch_size; ++b) { + for (int c = 0; c < output_channels; ++c) { + for (int i = 0; i < input_feasize; ++i) { + int index = indices_data[i]; + PADDLE_ENFORCE(index < output_feasize, "err index in unpooling!"); + input_grad_data[i] = output_grad_data[index]; + } + input_grad_data += input_feasize; + indices_data += input_feasize; + output_grad_data += output_feasize; + } + } + } +}; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/unpooling.cu b/paddle/operators/math/unpooling.cu new file mode 100644 index 0000000000000000000000000000000000000000..37c3c8b689f9a69b68ddffd23813fa9ad8ced0e7 --- /dev/null +++ b/paddle/operators/math/unpooling.cu @@ -0,0 +1,134 @@ +/* Copyright (c) 2016 paddlepaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
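Forward max unpooling is a scatter: each input value lands at the flat spatial offset its paired max-pool index recorded, and every other output position stays zero. A per-channel reference sketch:

#include <cassert>
#include <vector>

void Unpool2dMaxReference(const std::vector<float>& in,
                          const std::vector<int>& indices,
                          std::vector<float>* out, int out_h, int out_w) {
  out->assign(out_h * out_w, 0.0f);
  for (size_t i = 0; i < in.size(); ++i) {
    assert(indices[i] < out_h * out_w);  // mirrors the PADDLE_ENFORCE above
    (*out)[indices[i]] = in[i];
  }
}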
*/ + +#include "paddle/operators/math/unpooling.h" +#include "paddle/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { +template +__global__ void KernelUnpool2dMax(const int nthreads, const T* input_data, + const int* indices_data, + const int input_height, const int input_width, + const int channels, T* output_data, + const int output_height, + const int output_width) { + int in_n_stride = input_height * input_width * channels; + int in_c_stride = input_height * input_width; + int out_n_stride = output_height * output_width * channels; + int out_c_stride = output_height * output_width; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int bidx = i / in_n_stride; + int boffset = i % in_n_stride; + int cidx = boffset / in_c_stride; + int out_offset = bidx * out_n_stride + cidx * out_c_stride; + int out_index = indices_data[i]; + PADDLE_ASSERT(out_index < out_c_stride); + output_data[out_offset + out_index] = input_data[i]; + } +} +template +__global__ void KernelUnpool2dMaxGrad( + const int nthreads, const T* input_data, const int* indices_data, + const int input_height, const int input_width, const int channels, + const T* output_data, const T* output_grad, const int output_height, + const int output_width, T* input_grad) { + int in_n_stride = input_height * input_width * channels; + int in_c_stride = input_height * input_width; + int out_n_stride = output_height * output_width * channels; + int out_c_stride = output_height * output_width; + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int bidx = i / in_n_stride; + int boffset = i % in_n_stride; + int cidx = boffset / in_c_stride; + int out_offset = bidx * out_n_stride + cidx * out_c_stride; + int out_index = indices_data[i]; + PADDLE_ASSERT(out_index < out_c_stride); + input_grad[i] = output_grad[out_offset + out_index]; + } +} +/* + * All tensors are in NCHW format. + */ +template +class Unpool2dMaxFunctor { + public: + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output->dims()[1]; + const int output_height = output->dims()[2]; + const int output_width = output->dims()[3]; + const T* input_data = input.data(); + const int* indices_data = indices.data(); + T* output_data = output->mutable_data(context.GetPlace()); + int threads = 1024; + int grid = (input.numel() + threads - 1) / threads; + KernelUnpool2dMax< + T><<(context) + .stream()>>>(input.numel(), input_data, indices_data, + input_height, input_width, output_channels, + output_data, output_height, output_width); + } +}; +/* + * All tensors are in NCHW format. 
+ */ +template +class Unpool2dMaxGradFunctor { + public: + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + const framework::Tensor& output, + const framework::Tensor& output_grad, + framework::Tensor* input_grad) { + const int batch_size = input.dims()[0]; + const int input_height = input.dims()[2]; + const int input_width = input.dims()[3]; + const int output_channels = output.dims()[1]; + const int output_height = output.dims()[2]; + const int output_width = output.dims()[3]; + const T* input_data = input.data(); + const int* indices_data = indices.data(); + const T* output_data = output.data(); + const T* output_grad_data = output_grad.data(); + T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + int threads = 1024; + int grid = (input.numel() + threads - 1) / threads; + KernelUnpool2dMaxGrad< + T><<(context) + .stream()>>>(input.numel(), input_data, indices_data, + input_height, input_width, output_channels, + output_data, output_grad_data, output_height, + output_width, input_grad_data); + } +}; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxGradFunctor; +template class Unpool2dMaxFunctor; +template class Unpool2dMaxFunctor; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/unpooling.h b/paddle/operators/math/unpooling.h new file mode 100644 index 0000000000000000000000000000000000000000..7077d7c2274fd9e02b69ef343f310f4ffbbcff1a --- /dev/null +++ b/paddle/operators/math/unpooling.h @@ -0,0 +1,40 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/framework/tensor.h" + +namespace paddle { +namespace operators { +namespace math { +template +class Unpool2dMaxFunctor { + public: + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, framework::Tensor* output); +}; +template +class Unpool2dMaxGradFunctor { + public: + void operator()(const platform::DeviceContext& context, + const framework::Tensor& input, + const framework::Tensor& indices, + const framework::Tensor& output, + const framework::Tensor& output_grad, + framework::Tensor* input_grad); +}; +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/math/vol2col.h b/paddle/operators/math/vol2col.h index cbc30bd754608dd6e6def1a4097d69bdf0c942c3..dc64d1d9776261541a380ed15207904d6b4e641c 100644 --- a/paddle/operators/math/vol2col.h +++ b/paddle/operators/math/vol2col.h @@ -15,6 +15,7 @@ limitations under the License. 
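The backward pass is the gather dual of that scatter, input_grad[i] = output_grad[idx[i]], which is why neither CUDA kernel needs atomics: each thread owns exactly one input element in both directions. A per-channel sketch:

#include <vector>

void Unpool2dMaxGradReference(const std::vector<float>& out_grad,
                              const std::vector<int>& indices,
                              std::vector<float>* in_grad) {
  in_grad->assign(indices.size(), 0.0f);
  for (size_t i = 0; i < indices.size(); ++i)
    (*in_grad)[i] = out_grad[indices[i]];
}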
*/ #pragma once #include "paddle/framework/tensor.h" +#include "paddle/framework/tensor_util.h" #include "paddle/platform/device_context.h" namespace paddle { diff --git a/paddle/operators/math/vol2col_test.cc b/paddle/operators/math/vol2col_test.cc index c31c716842f30de67c29b803866b8c82ddcf4a41..62c3152304ad7fe946c996be413e102f3dd92bb2 100644 --- a/paddle/operators/math/vol2col_test.cc +++ b/paddle/operators/math/vol2col_test.cc @@ -82,7 +82,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - input.CopyFrom(input_tmp, *place, *context); + CopyFrom(input_tmp, *place, *context, &input); } output.mutable_data({1, filter_size, filter_size, filter_size, output_depth, output_height, output_width}, @@ -96,7 +96,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { out_cfo_ptr = output.data(); } else { - output_tmp.CopyFrom(output, paddle::platform::CPUPlace(), *context); + CopyFrom(output, paddle::platform::CPUPlace(), *context, &output_tmp); out_cfo_ptr = output_tmp.data(); } @@ -110,7 +110,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { input = input_tmp; } else { - input.CopyFrom(input_tmp, *place, *context); + CopyFrom(input_tmp, *place, *context, &input); } paddle::operators::math::Col2VolFunctor col2vol; @@ -120,7 +120,7 @@ void testVol2col() { if (paddle::platform::is_cpu_place(*place)) { in_ptr = input.data(); } else { - input_tmp.CopyFrom(input, paddle::platform::CPUPlace(), *context); + CopyFrom(input, paddle::platform::CPUPlace(), *context, &input_tmp); in_ptr = input_tmp.data(); } diff --git a/paddle/operators/max_sequence_len_op.cc b/paddle/operators/max_sequence_len_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..798022c9dd904a0ac189b4b550a94264a433ebf2 --- /dev/null +++ b/paddle/operators/max_sequence_len_op.cc @@ -0,0 +1,66 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/framework/lod_rank_table.h" +#include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" + +namespace paddle { +namespace operators { + +class MaxSeqenceLenOp : public framework::OperatorBase { + public: + MaxSeqenceLenOp(const std::string &type, + const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + void Run(const framework::Scope &scope, + const platform::DeviceContext &dev_ctx) const override { + auto &rank_table = + scope.FindVar(Input("RankTable"))->Get(); + auto *out = + scope.FindVar(Output("Out"))->GetMutable(); + int64_t *out_ptr = out->mutable_data({1}, platform::CPUPlace()); + *out_ptr = rank_table.items()[0].length; + } +}; + +class MaxSeqenceLenOpProtoMaker : public framework::OpProtoAndCheckerMaker { + public: + MaxSeqenceLenOpProtoMaker(framework::OpProto *proto, + framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("RankTable", "The lod_rank_table."); + AddOutput("Out", "The max sequence length."); + AddComment( + R"DOC(Calculate the max sequence length through lod_rank_table.)DOC"); + } +}; + +class MaxSeqenceLenInferShape : public framework::InferShapeBase { + public: + void operator()(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE(context->HasInput("RankTable")); + context->SetOutputDim("Out", {1}); + } +}; +} // namespace operators +} // namespace paddle + +REGISTER_OPERATOR(max_sequence_len, paddle::operators::MaxSeqenceLenOp, + paddle::operators::MaxSeqenceLenOpProtoMaker, + paddle::operators::MaxSeqenceLenInferShape, + paddle::framework::EmptyGradOpMaker); diff --git a/paddle/operators/maxout_op.cc b/paddle/operators/maxout_op.cc index 95467f2e69093906980d075b6a41c5d2934dd5a2..e203a25d544372220e8246e5e17ffbc6408d2998 100644 --- a/paddle/operators/maxout_op.cc +++ b/paddle/operators/maxout_op.cc @@ -22,16 +22,17 @@ class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker { public: MaxOutOpMaker(framework::OpProto* proto, framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", + AddInput( + "X", "(Tensor) The input tensor of maxout operator. " "The format of input tensor is NCHW. Where N is batch size, C is the " "number of channels, H and W is the height and width of feature."); AddOutput("Out", - "(Tensor) The output tensor of maxout operator." - "The format of output tensor is also NCHW." - "Where N is batch size, C is " - "the number of channels, H and W is the height and " - "width of feature."); + "(Tensor) The output tensor of maxout operator." + "The format of output tensor is also NCHW." 
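MaxSeqenceLenOp reads items()[0].length on the assumption that the LoD rank table keeps its items sorted by sequence length, longest first; a toy sketch of that assumed invariant (RankTableItem is a hypothetical stand-in for the framework type):

#include <algorithm>
#include <cstdint>
#include <vector>

struct RankTableItem {
  size_t index;
  int64_t length;
};

// Assumes at least one sequence in the table.
int64_t MaxSequenceLen(std::vector<RankTableItem> items) {
  std::sort(items.begin(), items.end(),
            [](const RankTableItem& a, const RankTableItem& b) {
              return a.length > b.length;  // descending by length
            });
  return items.front().length;  // head of the sorted list is the maximum
}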
+ "Where N is batch size, C is " + "the number of channels, H and W is the height and " + "width of feature."); AddAttr( "groups", R"DOC("Specifies how many groups the input tensor will be split" @@ -59,21 +60,19 @@ class MaxOutOpMaker : public framework::OpProtoAndCheckerMaker { } }; - class MaxOutOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of MaxoutOp" + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of MaxoutOp" "should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of MaxoutOp should not be null."); auto in_x_dims = ctx->GetInputDim("X"); int groups = ctx->Attrs().Get("groups"); // check groups > 1 - PADDLE_ENFORCE_GT( - groups, 1, - "groups should be larger than 1 in maxoutop"); + PADDLE_ENFORCE_GT(groups, 1, "groups should be larger than 1 in maxoutop"); std::vector output_shape({in_x_dims[0], in_x_dims[1] / groups}); output_shape.push_back(in_x_dims[2]); output_shape.push_back(in_x_dims[3]); @@ -87,18 +86,17 @@ class MaxOutOpGrad : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), - "Input(X@GRAD) should not be null."); + "Input(X@GRAD) should not be null."); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } }; -} // namespace operators -} // namespace paddle +} // namespace operators +} // namespace paddle namespace ops = paddle::operators; REGISTER_OP(maxout, ops::MaxOutOp, ops::MaxOutOpMaker, maxout_grad, - ops::MaxOutOpGrad); -REGISTER_OP_CPU_KERNEL(maxout, ops::MaxOutKernel); -REGISTER_OP_CPU_KERNEL(maxout_grad, - ops::MaxOutGradKernel); + ops::MaxOutOpGrad); +REGISTER_OP_CPU_KERNEL(maxout, + ops::MaxOutKernel); +REGISTER_OP_CPU_KERNEL( + maxout_grad, ops::MaxOutGradKernel); diff --git a/paddle/operators/maxout_op.cu.cc b/paddle/operators/maxout_op.cu.cc index a5823fba6848a0d42a743c90d7d683e3e4ae4422..decd43913d69d122330886e07178778d03f7fef5 100644 --- a/paddle/operators/maxout_op.cu.cc +++ b/paddle/operators/maxout_op.cu.cc @@ -18,8 +18,6 @@ namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL(maxout, ops::MaxOutKernel, ops::MaxOutKernel); -REGISTER_OP_GPU_KERNEL(maxout_grad, - ops::MaxOutGradKernel, - ops::MaxOutGradKernel); +REGISTER_OP_GPU_KERNEL( + maxout_grad, ops::MaxOutGradKernel, + ops::MaxOutGradKernel); diff --git a/paddle/operators/maxout_op.h b/paddle/operators/maxout_op.h index c404cd16a9b2372ea4c6a17eb5ac82cf8f3bf27c..44a0d073dda642f6e261ce5760013f3e1055f43d 100644 --- a/paddle/operators/maxout_op.h +++ b/paddle/operators/maxout_op.h @@ -53,7 +53,7 @@ class MaxOutGradKernel : public framework::OpKernel { zero(device_ctx, in_x_grad, static_cast(0.0)); math::MaxOutGradFunctor maxout_backward; maxout_backward(context.device_context(), *in_x, in_x_grad, *out, - *out_grad, groups); + *out_grad, groups); } } }; diff --git a/paddle/operators/merge_lod_tensor_op.cc b/paddle/operators/merge_lod_tensor_op.cc index 80460c476921b63ec5228a9780880c7db3c85217..adc688dbd5e13a2203d6842a12acdb8625288275 100644 --- a/paddle/operators/merge_lod_tensor_op.cc +++ b/paddle/operators/merge_lod_tensor_op.cc @@ -45,7 +45,7 @@ class MergeLoDTensorOp : public framework::OperatorBase { cpu_mask->ShareDataWith(mask); } else if (platform::is_gpu_place(mask.place())) { #ifdef 
PADDLE_WITH_CUDA - cpu_mask->CopyFrom(mask, platform::CPUPlace(), dev_ctx); + framework::CopyFrom(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); #else PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option"); #endif @@ -99,8 +99,9 @@ class MergeLoDTensorOp : public framework::OperatorBase { if (len == 0) { continue; } - out->Slice(out_offset, out_offset + len) - .CopyFrom(input->Slice(start_offset, end_offset), place, dev_ctx); + auto slice = out->Slice(out_offset, out_offset + len); + framework::CopyFrom(input->Slice(start_offset, end_offset), place, + dev_ctx, &slice); out_offset += len; (*in_idx) += 1; } diff --git a/paddle/operators/multiplex_op.cu b/paddle/operators/multiplex_op.cu index 49ed8a8879527fd32dd8b001ea256e46a0353487..10dff8d021d0394702cc8b92e779c012a4cf3eb2 100644 --- a/paddle/operators/multiplex_op.cu +++ b/paddle/operators/multiplex_op.cu @@ -33,7 +33,7 @@ class MultiplexGPUKernel : public framework::OpKernel { auto cols = ins[0]->numel() / rows; // copy index to cpu Tensor index_t_cpu; - index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), ctx.device_context()); + CopyFrom(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); Place place = boost::get(ctx.GetPlace()); @@ -68,7 +68,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel { auto cols = ins[0]->numel() / rows; // copy index to cpu Tensor index_t_cpu; - index_t_cpu.CopyFrom(*ids, platform::CPUPlace(), ctx.device_context()); + CopyFrom(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu); auto* index = index_t_cpu.data(); auto stream = ctx.cuda_device_context().stream(); diff --git a/paddle/operators/nccl_op_test.cu.cc b/paddle/operators/nccl_op_test.cu.cc index 56ba57854955c08031214d1f751c17fbb8bb882c..bb7ae20286dd8e52f72b79cbf353bd812a2cc092 100644 --- a/paddle/operators/nccl_op_test.cu.cc +++ b/paddle/operators/nccl_op_test.cu.cc @@ -97,7 +97,7 @@ class NCCLTester : public ::testing::Test { send_tensor->mutable_data(kDims, place); std::vector send_vector(f::product(kDims), gpu_id); - send_tensor->CopyFromVector(send_vector, *ctx); + paddle::framework::CopyFromVector(send_vector, *ctx, send_tensor); ctx->Wait(); VLOG(1) << "Send Tensor filled with elements " << send_tensor->numel(); } diff --git a/paddle/operators/pool_op.cc b/paddle/operators/pool_op.cc index d8c58618cf703d086d3cabc927ebc5eb038b1aec..e26ffd86e5b5645e361070ca9fd9d8dc49d1ed30 100644 --- a/paddle/operators/pool_op.cc +++ b/paddle/operators/pool_op.cc @@ -105,7 +105,7 @@ Pool2dOpMaker::Pool2dOpMaker(framework::OpProto *proto, // TypedAttrChecker don't support vector type.) AddAttr>( "paddings", - "(vector, defalut {0,0}), paddings(height, width) of pooling " + "(vector, default {0,0}), paddings(height, width) of pooling " "operator." "If global_pooling = true, paddings and ksize will be ignored.") .SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently, @@ -122,15 +122,15 @@ Parameters(ksize, strides, paddings) are two elements. These two elements represent height and width, respectively. The input(X) size and output(Out) size may be different. 
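One consequence of the free-function signature shows up in the merge_lod_tensor hunk above: the old chained form `out->Slice(...).CopyFrom(...)` cannot be migrated mechanically, because unary `&` cannot be applied to the temporary `Tensor` returned by `Slice`. The slice has to be bound to a named variable first, exactly as the hunk does (names below are taken from that hunk; this works because a slice shares the underlying allocation of `out`):

```cpp
// Before: copy into the temporary returned by Slice().
//   out->Slice(out_offset, out_offset + len)
//       .CopyFrom(input->Slice(start_offset, end_offset), place, dev_ctx);
// After: bind the slice to an lvalue so its address can be taken.
auto slice = out->Slice(out_offset, out_offset + len);
framework::CopyFrom(input->Slice(start_offset, end_offset), place, dev_ctx,
                    &slice);
```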
-Example: +Example: Input: X shape: $(N, C, H_{in}, W_{in})$ Output: Out shape: $(N, C, H_{out}, W_{out})$ - where + Where $$ - H_{out} = (H_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\ - W_{out} = (W_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 + H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ + W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 $$ )DOC"); @@ -177,7 +177,7 @@ Pool3dOpMaker::Pool3dOpMaker(framework::OpProto *proto, // TypedAttrChecker don't support vector type.) AddAttr>( "paddings", - "(vector, defalut {0,0,0}), paddings(depth, height, " + "(vector, default {0,0,0}), paddings(depth, height, " "width) of pooling operator. " "If global_pooling = true, ksize and paddings will be ignored.") .SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. (Currently, @@ -199,12 +199,12 @@ Example: X shape: $(N, C, D_{in}, H_{in}, W_{in})$ Output: Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ - where - $$ - D_{out} = (D_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\ - H_{out} = (H_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 \\ - W_{out} = (W_{in} - ksize[2] + 2 * paddings[2]) / strides[2] + 1 - $$ + Where + $$ + D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ + H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\ + W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 + $$ )DOC"); } diff --git a/paddle/operators/pool_with_index_op.cc b/paddle/operators/pool_with_index_op.cc index 4958fa645405db0798f37165030eae95da371477..b9c42a69128a26ff5942748e11fb87c57d3e3f58 100644 --- a/paddle/operators/pool_with_index_op.cc +++ b/paddle/operators/pool_with_index_op.cc @@ -142,7 +142,7 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { // TypedAttrChecker don't support vector type.) AddAttr>( "paddings", - "(vector, defalut:{0, 0}), paddings(height, width) of pooling " + "(vector, default:{0, 0}), paddings(height, width) of pooling " "operator. " "If global_pooling = true, paddings and will be ignored.") .SetDefault({0, 0}); // TODO(Chengduo): Add checker. (Currently, @@ -166,10 +166,10 @@ Example: Output: Out shape: $(N, C, H_{out}, W_{out})$ Mask shape: $(N, C, H_{out}, W_{out})$ - where + Where $$ - H_{out} = (H_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\ - W_{out} = (W_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 + H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ + W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 $$ )DOC"); @@ -220,7 +220,7 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { // TypedAttrChecker don't support vector type.) AddAttr>( "paddings", - "(vector, defalut {0,0,0}), paddings(depth, " + "(vector, default {0,0,0}), paddings(depth, " "height, width) of pooling operator. " "If global_pooling = true, paddings and ksize will be ignored.") .SetDefault({0, 0, 0}); // TODO(Chengduo): Add checker. 
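As a quick sanity check of the reformatted output-size formulas above, take hypothetical values $H_{in} = 224$, $ksize[0] = 3$, $paddings[0] = 1$, $strides[0] = 2$; with the integer (floor) division the implementation performs:

$$
H_{out} = \frac{224 - 3 + 2 \cdot 1}{2} + 1 = \left\lfloor \frac{223}{2} \right\rfloor + 1 = 111 + 1 = 112
$$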
(Currently, @@ -244,11 +244,11 @@ Example: Output: Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ Mask shape: $(N, C, D_{out}, H_{out}, W_{out})$ - where + Where $$ - D_{out} = (D_{in} - ksize[0] + 2 * paddings[0]) / strides[0] + 1 \\ - H_{out} = (H_{in} - ksize[1] + 2 * paddings[1]) / strides[1] + 1 \\ - W_{out} = (W_{in} - ksize[2] + 2 * paddings[2]) / strides[2] + 1 + D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ + H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\ + W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 $$ )DOC"); diff --git a/paddle/operators/rank_loss_op.cc b/paddle/operators/rank_loss_op.cc index 061e82412ea5f4f17fd26a7094e68b97138cc09c..912f88f455252effbdb12ecfc45e4afefa60e03e 100644 --- a/paddle/operators/rank_loss_op.cc +++ b/paddle/operators/rank_loss_op.cc @@ -4,7 +4,7 @@ you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -35,9 +35,10 @@ class RankLossOp : public framework::OperatorWithKernel { auto right_dims = ctx->GetInputDim("Right"); PADDLE_ENFORCE((label_dims == left_dims) && (left_dims == right_dims), - "All inputs must have the same size"); - PADDLE_ENFORCE((label_dims.size() == 2) && (label_dims[1] == 1), - "All inputs must be row vector with size batch_size x 1."); + "All inputs must have the same size."); + PADDLE_ENFORCE( + (label_dims.size() == 2) && (label_dims[1] == 1), + "All inputs must be 2-D tensors with shape [batch_size x 1]."); ctx->SetOutputDim("Out", label_dims); } }; @@ -48,10 +49,17 @@ class RankLossOpMaker : public framework::OpProtoAndCheckerMaker { framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("Label", - "The label indicating A ranked higher than B or not, row vector."); - AddInput("Left", "The output of RankNet for doc A, vector."); - AddInput("Right", "The output of RankNet for doc B, vetor."); - AddOutput("Out", "The output loss of RankLoss operator, vector."); + "(2-D Tensor with shape [batch_size x 1]) " + "The label indicating A ranked higher than B or not."); + AddInput("Left", + "(2-D Tensor with shape [batch_size x 1]) " + "The output of RankNet for doc A."); + AddInput("Right", + "(2-D Tensor with shape [batch_size x 1]) " + "The output of RankNet for doc B."); + AddOutput("Out", + "(2-D Tensor with shape [batch_size x 1]) " + "The output loss of RankLoss operator."); AddComment(R"DOC( RankLoss Operator. @@ -65,16 +73,17 @@ P = {0, 1} or {0, 0.5, 1}, where 0.5 means no information about the rank of the input pair. The RankLoss operator takes three inputs: Left (o_i), Right (o_j) and Label -(P_{i,j}), which represent the output of RankNet for the two docs and the label, -respectively, and yields the rank loss C_{i,j} using the following equation: +(P_{i,j}), which represent the output score of RankNet for the two docs and +the label respectively, and yields the rank loss C_{i,j} using the following +equation: -\f$$ - C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + log(1 + e^{o_{i,j}}) \\ +$$ + C_{i,j} = -\tilde{P_{ij}} * o_{i,j} + \log(1 + e^{o_{i,j}}) \\ o_{i,j} = o_i - o_j \\ \tilde{P_{i,j}} = \left \{0, 0.5, 1 \right \} \ or \ \left \{0, 1 \right \} -\f$$ +$$ -The operator can take inputs of one sample or in batch. 
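For intuition about the rank loss equation above (and its `\log` fix), a scalar sketch with hypothetical scores, not part of the patch:

```cpp
#include <cmath>
#include <cstdio>

// C_{i,j} = -P~_{i,j} * o_{i,j} + log(1 + exp(o_{i,j})),  o_{i,j} = o_i - o_j.
double RankLoss(double o_i, double o_j, double p) {
  const double o_ij = o_i - o_j;
  return -p * o_ij + std::log(1.0 + std::exp(o_ij));
}

int main() {
  // Correctly ordered pair (label 1, o_i > o_j): small loss, ~0.3133.
  std::printf("%f\n", RankLoss(2.0, 1.0, 1.0));
  // Mis-ordered pair (label 1 but o_i < o_j): larger loss, ~1.3133.
  std::printf("%f\n", RankLoss(1.0, 2.0, 1.0));
  return 0;
}
```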
+The operator can take batch inputs with size batch_size (batch_size >= 1). )DOC"); } diff --git a/paddle/operators/rank_loss_op.cu b/paddle/operators/rank_loss_op.cu index 779588ff36c792b8925a535d60f1cfbbe3c66d86..5382e3a6296acd257211104d8ec6835c11b90bdd 100644 --- a/paddle/operators/rank_loss_op.cu +++ b/paddle/operators/rank_loss_op.cu @@ -4,7 +4,7 @@ you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, diff --git a/paddle/operators/rank_loss_op.h b/paddle/operators/rank_loss_op.h index f184d6efcb496a1d7f38540712b6c431f816482e..703c77a0b21f2b2f0b0ae6fae86aae819ea824b5 100644 --- a/paddle/operators/rank_loss_op.h +++ b/paddle/operators/rank_loss_op.h @@ -4,7 +4,7 @@ you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, diff --git a/paddle/operators/recurrent_op.cc b/paddle/operators/recurrent_op.cc index ea60665e398978c612321df1f340e5669038e28f..c976e22c7740ad11279ab5ee75e4d130be8fa0c5 100644 --- a/paddle/operators/recurrent_op.cc +++ b/paddle/operators/recurrent_op.cc @@ -284,7 +284,8 @@ class RecurrentOp : public RecurrentBase { auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1); // Explicit copy output since the local RNN scope can be destroyed // early. - dst_out.CopyFrom(src_tensor, dev_ctx.GetPlace(), dev_ctx); + framework::CopyFrom(src_tensor, dev_ctx.GetPlace(), dev_ctx, + &dst_out); }); scopes.Next(); @@ -365,7 +366,8 @@ class RecurrentGradOp : public RecurrentBase { auto *cur_grad_var = cur_scope.Var(cur_grad); auto cur_grad_tensor = cur_grad_var->GetMutable(); - cur_grad_tensor->CopyFrom(ex_tensor, dev_ctx.GetPlace(), dev_ctx); + framework::CopyFrom(ex_tensor, dev_ctx.GetPlace(), dev_ctx, + cur_grad_tensor); } } @@ -438,7 +440,7 @@ class RecurrentGradOp : public RecurrentBase { } auto dst = outside->Slice(seq_offset, seq_offset + 1); - dst.CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx); + framework::CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx, &dst); }); VLOG(5) << "Link outside gradient finished "; @@ -451,7 +453,7 @@ class RecurrentGradOp : public RecurrentBase { framework::LoDTensor *outside) { outside->Resize(inside.dims()); outside->mutable_data(dev_ctx.GetPlace(), inside.type()); - outside->CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx); + framework::CopyFrom(inside, dev_ctx.GetPlace(), dev_ctx, outside); }); VLOG(5) << "Link initialize state gradient finished "; } diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c69e416e10f2a9ced1f1b22c39235e4c9338e77c --- /dev/null +++ b/paddle/operators/recv_op.cc @@ -0,0 +1,121 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include +#include +#include + +#include + +#include "paddle/framework/data_type.h" +#include "paddle/framework/executor.h" +#include "paddle/framework/framework.pb.h" +#include "paddle/framework/lod_tensor.h" +#include "paddle/framework/op_registry.h" +#include "paddle/operators/detail/send_recv_impl.h" +#include "paddle/operators/detail/simple_block_queue.h" + +namespace paddle { +namespace operators { + +void RunServer(Server **rpc_server, + std::shared_ptr service, + const std::string &server_address) { + ServerBuilder builder; + builder.AddListeningPort(server_address, grpc::InsecureServerCredentials()); + builder.RegisterService(service.get()); + std::unique_ptr server(builder.BuildAndStart()); + *rpc_server = server.get(); + LOG(INFO) << "Server listening on " << server_address << std::endl; + server->Wait(); +} + +class RecvOp : public framework::OperatorBase { + public: + RecvOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) { + if (!rpc_service_) { + rpc_service_.reset(new detail::SendRecvServerImpl()); + std::string endpoint = Attr("endpoint"); + server_thread_.reset( + new std::thread(RunServer, &rpc_server_, rpc_service_, endpoint)); + } + } + + virtual ~RecvOp() { + rpc_server_->Shutdown(); + server_thread_->join(); + } + + void Run(const framework::Scope &scope, + const platform::DeviceContext &dev_ctx) const override { + // Blocking: get one variable from the client. + const framework::LoDTensor &t = rpc_service_->Get(); + framework::Scope &recv_scope = scope.NewScope(); + // set the graph input variable + auto *var = recv_scope.Var(Input("RX")); + auto *tensor = var->GetMutable(); + // FIXME(typhoonzero): do not copy + framework::CopyFrom(t, dev_ctx.GetPlace(), dev_ctx, tensor); + + auto *block = Attr("OptimizeBlock"); + auto *program = block->Program(); + framework::Executor executor(dev_ctx); + // Run sub graph to get optimized tensor + executor.Run(*program, &recv_scope, block->ID(), + false /*create_local_scope*/); + + auto *out_var = recv_scope.FindVar("Out"); + // push the result back to the client + rpc_service_->Push(out_var->Get()); + } + + protected: + // grpc server instance to track status and gracefully shut down. + // borrows a pointer from the server thread. + Server *rpc_server_{nullptr};
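The `RunServer` helper above is the standard grpc++ bootstrap; its template arguments were lost in this rendering, so here is a self-contained sketch of the same pattern, with a generic `grpc::Service` standing in for the diff's `SendRecvServerImpl`:

```cpp
#include <memory>
#include <string>
#include <grpc++/grpc++.h>

// Sketch: start a gRPC server and block until another thread calls Shutdown().
void RunServerSketch(grpc::Service* service, const std::string& address) {
  grpc::ServerBuilder builder;
  builder.AddListeningPort(address, grpc::InsecureServerCredentials());
  builder.RegisterService(service);
  std::unique_ptr<grpc::Server> server = builder.BuildAndStart();
  server->Wait();  // returns only after Shutdown() is invoked elsewhere
}
```

This mirrors RecvOp's lifecycle: the constructor spawns the server thread, and the destructor calls `Shutdown()` and joins it.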
+ // grpc send/recv service implementation to register. + std::shared_ptr rpc_service_; + std::shared_ptr server_thread_; +}; + +class RecvOpMaker : public framework::OpProtoAndCheckerMaker { + public: + RecvOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("RX", "(Tensor) Input tensor to be received"); + AddComment(R"DOC( +Recv operator + +This operator will receive a tensor from send_op +)DOC"); + AddAttr("endpoint", + "(string, default 127.0.0.1:6164)" + "IP address to listen on.") + .SetDefault("127.0.0.1:6164") + .AddCustomChecker([](const std::string &ip) { return !ip.empty(); }); + AddAttr("OptimizeBlock", "type BlockDescBind*", + "the optimize block to run on the server side"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(recv, ops::RecvOp, ops::RecvOpMaker); diff --git a/paddle/operators/reshape_op.cc b/paddle/operators/reshape_op.cc index ba774ec2160c0460867de42f7ad9d5cd65ad8d6a..39bf2118d603881531bf583ae468e8dc9b8bd181 100644 --- a/paddle/operators/reshape_op.cc +++ b/paddle/operators/reshape_op.cc @@ -1,11 +1,10 @@ - /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -38,8 +37,8 @@ class ReshapeOp : public framework::OperatorWithKernel { // TODO(qiao) change batch_size for (size_t i = 1; i < shape.size(); ++i) { PADDLE_ENFORCE(shape[i] > 0, - "Each dimension of shape " - "must be positiv except the first."); + "Each dimension of Attr(shape) " + "must be positive except the first one."); } if (shape[0] < 0) { shape[0] = x_dims[0]; diff --git a/paddle/operators/reshape_op.cu.cc b/paddle/operators/reshape_op.cu similarity index 94% rename from paddle/operators/reshape_op.cu.cc rename to paddle/operators/reshape_op.cu index 23dbe089d3b37aabedf9ef166f7bbfbf67da7e0a..dca6c15007a64808248443af32141b4a677f95d7 100644 --- a/paddle/operators/reshape_op.cu.cc +++ b/paddle/operators/reshape_op.cu @@ -4,7 +4,7 @@ you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, diff --git a/paddle/operators/reshape_op.h b/paddle/operators/reshape_op.h index beb951713ae2a9fd83fe7c1a5e97ee8c642158a8..73fd1da6428f55976a397b7f6f92bb0c796bfe02 100644 --- a/paddle/operators/reshape_op.h +++ b/paddle/operators/reshape_op.h @@ -4,7 +4,7 @@ you may not use this file except in compliance with the License.
You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -28,7 +28,7 @@ class ReshapeKernel : public framework::OpKernel { auto* in = ctx.Input("X"); auto out_dims = out->dims(); out->mutable_data(ctx.GetPlace()); - out->CopyFrom(*in, ctx.GetPlace(), ctx.device_context()); + framework::CopyFrom(*in, ctx.GetPlace(), ctx.device_context(), out); out->Resize(out_dims); } }; @@ -42,7 +42,7 @@ class ReshapeGradKernel : public framework::OpKernel { d_x->mutable_data(ctx.GetPlace()); auto in_dims = d_x->dims(); - d_x->CopyFrom(*d_out, ctx.GetPlace(), ctx.device_context()); + framework::CopyFrom(*d_out, ctx.GetPlace(), ctx.device_context(), d_x); d_x->Resize(in_dims); } }; diff --git a/paddle/operators/rnn/recurrent_op_utils.cc b/paddle/operators/rnn/recurrent_op_utils.cc deleted file mode 100644 index ee61ea300c33722471189d06eb09f67a083d2a4d..0000000000000000000000000000000000000000 --- a/paddle/operators/rnn/recurrent_op_utils.cc +++ /dev/null @@ -1,134 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#include "paddle/operators/rnn/recurrent_op_utils.h" - -namespace paddle { -namespace operators { -namespace rnn { - -namespace f = paddle::framework; - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -void SegmentInputs(const std::vector& step_scopes, - const std::vector& inlinks, - const size_t seq_len) { - PADDLE_ENFORCE(!inlinks.empty(), "no in links are provided."); - for (size_t i = 0; i < inlinks.size(); ++i) { - // global inputs - auto input_var = step_scopes[0]->parent().FindVar(inlinks[i]); - PADDLE_ENFORCE_NOT_NULL(input_var, "input link [%s] is not in scope.", - inlinks[i]); - - LoDTensor* input = input_var->GetMutable(); - f::DDim dims = input->dims(); - PADDLE_ENFORCE_EQ(static_cast(dims[0]), seq_len, - "all the inputs be the same length"); - f::DDim step_dims = slice_ddim(dims, 1, dims.size()); - for (size_t j = 0; j < seq_len; j++) { - Tensor* step_input = - step_scopes[j]->Var(inlinks[i])->GetMutable(); - // The input of operators of each step is Tensor here. - // Maybe need to modify Slice function. 
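A note on the `ReshapeKernel` hunks above: `CopyFrom` also copies the source dims into the destination (an assumption consistent with its use across this patch), which is why the kernel records `out_dims` before the copy and re-applies `Resize` afterwards; reshape is a plain data copy plus a metadata change. A hedged sketch of the pattern, with hypothetical names:

```cpp
#include "paddle/framework/tensor.h"
#include "paddle/framework/tensor_util.h"
#include "paddle/platform/device_context.h"

// Sketch: reshape == copy, then restore the target dims.
void ReshapeCopy(const paddle::framework::Tensor& in,
                 const paddle::framework::DDim& new_dims,
                 const paddle::platform::DeviceContext& ctx,
                 paddle::framework::Tensor* out) {
  // After the copy, out's dims equal in's dims...
  paddle::framework::CopyFrom(in, ctx.GetPlace(), ctx, out);
  // ...so the reshaped metadata has to be re-applied.
  out->Resize(new_dims);
}
```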
- *step_input = input->Slice(j, j + 1); - step_input->Resize(step_dims); - } - } -} - -void ConcatOutputs(const std::vector& step_scopes, - const std::vector& outlinks, - const size_t seq_len, const platform::DeviceContext& ctx) { - for (size_t i = 0; i < outlinks.size(); i++) { - auto* output_var = step_scopes[0]->parent().FindVar(outlinks[i]); - PADDLE_ENFORCE_NOT_NULL(output_var, "output link [%s] is not in scope.", - outlinks[i]); - LoDTensor* output = output_var->GetMutable(); - - auto* step_scope_var = step_scopes[0]->FindVar(outlinks[i]); - PADDLE_ENFORCE_NOT_NULL(step_scope_var, "%s not in scope", outlinks[i]); - f::DDim step_dims = - step_scope_var->template GetMutable()->dims(); - std::vector dims_vec = vectorize(step_dims); - dims_vec.insert(dims_vec.begin(), seq_len); - output->Resize(f::make_ddim(dims_vec)); - output->mutable_data(platform::CPUPlace()); - for (size_t j = 0; j < seq_len; j++) { - LoDTensor* step_output = - step_scopes[j]->FindVar(outlinks[i])->GetMutable(); - // TODO(luotao02) data type and platform::DeviceContext() should set - // correctly - (output->Slice(j, j + 1)) - .CopyFrom(*step_output, platform::CPUPlace(), ctx); - } - } -} - -void LinkMemories(const std::vector& scopes, - const std::vector& memories, - const size_t step_id, const int offset) { - PADDLE_ENFORCE_LT(step_id, scopes.size(), - "step [%d] is out of range of step scopes' size [%d]", - step_id, scopes.size()); - PADDLE_ENFORCE_GE(static_cast(step_id) + offset, 0, - "offset [%d] must be large than -[%d]", offset, step_id); - PADDLE_ENFORCE_LT( - step_id + offset, scopes.size(), - "offset [%d] is out of range, it must be less than (%d - %d)", offset, - scopes.size(), step_id); - auto* scope = scopes[step_id]; - auto* linked_scope = scopes[step_id + offset]; - for (auto& attr : memories) { - auto* mem = scope->FindVar(attr.pre_var)->GetMutable(); - auto* linked_mem = linked_scope->FindVar(attr.var)->GetMutable(); - mem->Resize(linked_mem->dims()); - mem->ShareDataWith(*linked_mem); - } -} - -void InitArgument(const ArgumentName& name, Argument* arg, - const framework::OperatorBase& op, bool is_grad) { - arg->step_scopes = - is_grad ? op.Input(name.step_scopes) : op.Output(name.step_scopes); - arg->inlinks = op.Inputs(name.inlinks); - arg->outlinks = op.Outputs(name.outlinks); - - auto& boot_memories = is_grad ? 
op.Outputs(name.initial_states) - : op.Inputs(name.initial_states); - // attributes - auto& memories = op.Attr>(name.states); - auto& pre_memories = op.Attr>(name.ex_states); - - PADDLE_ENFORCE(memories.size() == boot_memories.size(), - "the size of states, initial_states don't match:%d,%d", - memories.size(), boot_memories.size()); - PADDLE_ENFORCE(pre_memories.size() == boot_memories.size(), - "the size of ex_states, initial_states don't match:%d,%d", - pre_memories.size(), boot_memories.size()); - PADDLE_ENFORCE(memories.size() > 0, "more than 1 states should be set"); - - for (size_t i = 0; i < memories.size(); ++i) { - rnn::StateAttr mem_attr; - mem_attr.var = memories[i]; - mem_attr.pre_var = pre_memories[i]; - mem_attr.boot_var = boot_memories[i]; - (arg->states).push_back(mem_attr); - } -} - -} // namespace rnn -} // namespace operators -} // namespace paddle diff --git a/paddle/operators/rnn/recurrent_op_utils.h b/paddle/operators/rnn/recurrent_op_utils.h deleted file mode 100644 index fb0e158e07745d58c6211d33e385b324e492b95e..0000000000000000000000000000000000000000 --- a/paddle/operators/rnn/recurrent_op_utils.h +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#pragma once - -#include - -#include "paddle/framework/operator.h" - -namespace paddle { -namespace operators { -namespace rnn { - -using Scope = framework::Scope; - -/** - * Memory of a RNN (same as the role of `Momory` in PaddlePaddle). - * - * Memory attributes cached by this op, dims will be infered from - * boot memories in father scope. Other attributes are copied from Op's proto - * attributes. - */ -struct StateAttr { - // name of current state variable - std::string var; - // name of previous step's state variable - std::string pre_var; - // name of the variables to init this memory (same role of `boot_layer` in - // PaddlePaddle), which is store in father's scope. - std::string boot_var; -}; - -struct Argument { - std::string step_net; - std::string step_scopes; - std::vector inlinks; - std::vector outlinks; - std::vector states; -}; - -struct ArgumentName { - std::string step_net; - std::string step_scopes; - std::string inlinks; - std::string outlinks; - std::string states; // the memory name - std::string ex_states; // the previous memory name - std::string initial_states; // the boot memory name -}; - -/** - * Prepare inputs for each step net. - */ -void SegmentInputs(const std::vector& step_scopes, - const std::vector& inlinks, - const size_t seq_len); - -/** - * Process outputs of step nets and merge to variables. 
- */ -void ConcatOutputs(const std::vector& step_scopes, - const std::vector& outlinks, - const size_t seq_len, const platform::DeviceContext& ctx); - -void LinkMemories(const std::vector& step_scopes, - const std::vector& memories, const size_t step_id, - const int offset); - -void InitArgument(const ArgumentName& name, Argument* arg, - const framework::OperatorBase& op, bool is_grad = false); - -} // namespace rnn -} // namespace operators -} // namespace paddle diff --git a/paddle/operators/roi_pool_op.cc b/paddle/operators/roi_pool_op.cc old mode 100755 new mode 100644 index 156db9358689c90293311b8f08a7576b680c9472..2b5e66c96b726a3c1fdb2596a244c5395db85279 --- a/paddle/operators/roi_pool_op.cc +++ b/paddle/operators/roi_pool_op.cc @@ -43,8 +43,8 @@ class ROIPoolOp : public framework::OperatorWithKernel { "ROIs should be a 2-D tensor of shape (num_rois, 5)" "given as [[batch_id, x1, y1, x2, y2], …]."); PADDLE_ENFORCE(rois_dims[1] == kROISize, - "ROIs should be a 2-D tensor of shape (num_rois, 5)" - "given as [[batch_id, x1, y1, x2, y2], …]."); + "ROIs should be a 2-D tensor of shape (num_rois, 5)" + "given as [[batch_id, x1, y1, x2, y2], …]."); int pooled_height = ctx->Attrs().Get("pooled_height"); int pooled_width = ctx->Attrs().Get("pooled_width"); @@ -65,7 +65,7 @@ class ROIPoolOp : public framework::OperatorWithKernel { ctx->SetOutputDim("Out", out_dims); ctx->SetOutputDim("Argmax", out_dims); - } + } protected: framework::OpKernelType GetKernelType( @@ -100,7 +100,7 @@ class ROIPoolGradOp : public framework::OperatorWithKernel { class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { public: ROIPoolOpMaker(framework::OpProto* proto, - framework::OpAttrChecker* op_checker) + framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", "(Tensor), " @@ -125,21 +125,22 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor), " "Argmaxes corresponding to indices in X used " "for gradient computation. 
Only output " - "if arg “is_test” is false.").AsIntermediate(); + "if arg “is_test” is false.") + .AsIntermediate(); AddAttr("spatial_scale", "(float, default 1.0), " "Multiplicative spatial scale factor " "to translate ROI coords from their input scale " "to the scale used when pooling.") - .SetDefault(1.0); + .SetDefault(1.0); AddAttr("pooled_height", "(int, default 1), " "The pooled output height.") - .SetDefault(1); + .SetDefault(1); AddAttr("pooled_width", "(int, default 1), " "The pooled output width.") - .SetDefault(1); + .SetDefault(1); AddComment(R"DOC( ROIPool operator @@ -153,11 +154,10 @@ https://stackoverflow.com/questions/43430056/what-is-roi-layer-in-fast-rcnn } // namespace paddle namespace ops = paddle::operators; -REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, - roi_pool_grad, ops::ROIPoolGradOp); +REGISTER_OP(roi_pool, ops::ROIPoolOp, ops::ROIPoolOpMaker, roi_pool_grad, + ops::ROIPoolGradOp); REGISTER_OP_CPU_KERNEL( - roi_pool, - ops::CPUROIPoolOpKernel, + roi_pool, ops::CPUROIPoolOpKernel, ops::CPUROIPoolOpKernel); REGISTER_OP_CPU_KERNEL( roi_pool_grad, diff --git a/paddle/operators/roi_pool_op.cu b/paddle/operators/roi_pool_op.cu old mode 100755 new mode 100644 index 97df45f1b5779d5e28e36814450a9577edf85135..9a4c8ca752bb7abc4f44d4815743769bc989703a --- a/paddle/operators/roi_pool_op.cu +++ b/paddle/operators/roi_pool_op.cu @@ -29,101 +29,95 @@ static inline int NumBlocks(const int N) { kNumMaxinumNumBlocks); } - template - __global__ void GPUROIPoolForward( - const int nthreads, const T* input_data, const int64_t* input_rois, - const float spatial_scale, const int channels, const int height, - const int width, const int pooled_height, const int pooled_width, - T* output_data, int64_t* argmax_data) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (size_t i = index; i < nthreads; i += offset) { - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - - const int64_t* offset_input_rois = input_rois + n * kROISize; - int roi_batch_ind = offset_input_rois[0]; - int roi_start_w = round(offset_input_rois[1] * spatial_scale); - int roi_start_h = round(offset_input_rois[2] * spatial_scale); - int roi_end_w = round(offset_input_rois[3] * spatial_scale); - int roi_end_h = round(offset_input_rois[4] * spatial_scale); - - int roi_width = max(roi_end_w - roi_start_w + 1, 1); - int roi_height = max(roi_end_h - roi_start_h + 1, 1); - T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); - T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); - - int hstart = static_cast(floor(static_cast(ph) * bin_size_h)); - int wstart = static_cast(floor(static_cast(pw) * bin_size_w)); - int hend = static_cast(ceil(static_cast(ph + 1) * bin_size_h)); - int wend = static_cast(ceil(static_cast(pw + 1) * bin_size_w)); - - hstart = min(max(hstart + roi_start_h, 0), height); - hend = min(max(hend + roi_start_h, 0), height); - wstart = min(max(wstart + roi_start_w, 0), width); - wend = min(max(wend + roi_start_w, 0), width); - bool is_empty = (hend <= hstart) || (wend <= wstart); - - T maxval = is_empty ? 
0 : -std::numeric_limits::max(); - int maxidx = -1; - const T* offset_input_data = - input_data + (roi_batch_ind * channels + c) * height * width; - for (int h = hstart; h < hend; ++h) { - for (int w = wstart; w < wend; ++w) { - int input_data_index = h * width + w; - if (offset_input_data[input_data_index] > maxval) { - maxval = offset_input_data[input_data_index]; - maxidx = input_data_index; - } +template +__global__ void GPUROIPoolForward(const int nthreads, const T* input_data, + const int64_t* input_rois, + const float spatial_scale, const int channels, + const int height, const int width, + const int pooled_height, + const int pooled_width, T* output_data, + int64_t* argmax_data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t i = index; i < nthreads; i += offset) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const int64_t* offset_input_rois = input_rois + n * kROISize; + int roi_batch_ind = offset_input_rois[0]; + int roi_start_w = round(offset_input_rois[1] * spatial_scale); + int roi_start_h = round(offset_input_rois[2] * spatial_scale); + int roi_end_w = round(offset_input_rois[3] * spatial_scale); + int roi_end_h = round(offset_input_rois[4] * spatial_scale); + + int roi_width = max(roi_end_w - roi_start_w + 1, 1); + int roi_height = max(roi_end_h - roi_start_h + 1, 1); + T bin_size_h = static_cast(roi_height) / static_cast(pooled_height); + T bin_size_w = static_cast(roi_width) / static_cast(pooled_width); + + int hstart = static_cast(floor(static_cast(ph) * bin_size_h)); + int wstart = static_cast(floor(static_cast(pw) * bin_size_w)); + int hend = static_cast(ceil(static_cast(ph + 1) * bin_size_h)); + int wend = static_cast(ceil(static_cast(pw + 1) * bin_size_w)); + + hstart = min(max(hstart + roi_start_h, 0), height); + hend = min(max(hend + roi_start_h, 0), height); + wstart = min(max(wstart + roi_start_w, 0), width); + wend = min(max(wend + roi_start_w, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + T maxval = is_empty ? 
0 : -std::numeric_limits::max(); + int maxidx = -1; + const T* offset_input_data = + input_data + (roi_batch_ind * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int input_data_index = h * width + w; + if (offset_input_data[input_data_index] > maxval) { + maxval = offset_input_data[input_data_index]; + maxidx = input_data_index; } } - output_data[index] = maxval; - if (argmax_data) { - argmax_data[index] = maxidx; - } + } + output_data[index] = maxval; + if (argmax_data) { + argmax_data[index] = maxidx; } } +} template __global__ void GPUROIPoolBackward( - const int nthreads, - const int64_t* input_rois, - const T* output_grad, - const int64_t* argmax_data, - const int num_rois, - const float spatial_scale, - const int channels, - const int height, - const int width, - const int pooled_height, - const int pooled_width, - T* input_grad) { - int index = blockIdx.x * blockDim.x + threadIdx.x; - int offset = blockDim.x * gridDim.x; - for (int i = index; i < nthreads; i += offset) { - int pw = index % pooled_width; - int ph = (index / pooled_width) % pooled_height; - int c = (index / pooled_width / pooled_height) % channels; - int n = index / pooled_width / pooled_height / channels; - - const int64_t* offset_input_rois = input_rois + n * kROISize; - int roi_batch_ind = offset_input_rois[0]; - int input_offset = (roi_batch_ind * channels + c) * height * width; - int output_offset = (n * channels + c) * pooled_height * pooled_width; - const T* offset_output_grad = output_grad + output_offset; - T* offset_input_grad = input_grad + input_offset; - const int64_t* offset_argmax_data = argmax_data + output_offset; - - int argmax = offset_argmax_data[ph * pooled_width + pw]; - if (argmax != -1) { - platform::CudaAtomicAdd(offset_input_grad + argmax, + const int nthreads, const int64_t* input_rois, const T* output_grad, + const int64_t* argmax_data, const int num_rois, const float spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, T* input_grad) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int c = (index / pooled_width / pooled_height) % channels; + int n = index / pooled_width / pooled_height / channels; + + const int64_t* offset_input_rois = input_rois + n * kROISize; + int roi_batch_ind = offset_input_rois[0]; + int input_offset = (roi_batch_ind * channels + c) * height * width; + int output_offset = (n * channels + c) * pooled_height * pooled_width; + const T* offset_output_grad = output_grad + output_offset; + T* offset_input_grad = input_grad + input_offset; + const int64_t* offset_argmax_data = argmax_data + output_offset; + + int argmax = offset_argmax_data[ph * pooled_width + pw]; + if (argmax != -1) { + platform::CudaAtomicAdd( + offset_input_grad + argmax, static_cast(offset_output_grad[ph * pooled_width + pw])); - } } } - +} template class GPUROIPoolOpKernel : public framework::OpKernel { @@ -145,25 +139,18 @@ class GPUROIPoolOpKernel : public framework::OpKernel { int width = in_dims[3]; size_t rois_num = rois->dims()[0]; - if (rois_num== 0) return; + if (rois_num == 0) return; int output_size = out->numel(); int blocks = NumBlocks(output_size); int threads = kNumCUDAThreads; - GPUROIPoolForward - <<>>( - output_size, - in->data(), - rois->data(), - spatial_scale, - 
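The reflowed forward kernel above computes, for every output element, an adaptive bin over the spatially scaled ROI and max-pools it. A CPU-side sketch of the per-bin computation under the same conventions (hypothetical standalone helper, single channel, not part of the patch):

```cpp
#include <algorithm>
#include <cmath>
#include <limits>

// Max over one (ph, pw) bin of an ROI on a height x width feature map.
float RoiPoolBin(const float* feat, int height, int width, int roi_start_h,
                 int roi_start_w, int roi_height, int roi_width, int ph,
                 int pw, int pooled_height, int pooled_width) {
  const float bin_h = static_cast<float>(roi_height) / pooled_height;
  const float bin_w = static_cast<float>(roi_width) / pooled_width;
  // Bin boundaries, clipped to the feature map as in the CUDA kernel.
  int hstart = std::min(
      std::max(roi_start_h + static_cast<int>(std::floor(ph * bin_h)), 0),
      height);
  int hend = std::min(
      std::max(roi_start_h + static_cast<int>(std::ceil((ph + 1) * bin_h)), 0),
      height);
  int wstart = std::min(
      std::max(roi_start_w + static_cast<int>(std::floor(pw * bin_w)), 0),
      width);
  int wend = std::min(
      std::max(roi_start_w + static_cast<int>(std::ceil((pw + 1) * bin_w)), 0),
      width);
  const bool is_empty = (hend <= hstart) || (wend <= wstart);
  float maxval = is_empty ? 0.0f : -std::numeric_limits<float>::max();
  for (int h = hstart; h < hend; ++h)
    for (int w = wstart; w < wend; ++w)
      maxval = std::max(maxval, feat[h * width + w]);
  return maxval;
}
```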
channels, - height, - width, - pooled_height, - pooled_width, - out->mutable_data(ctx.GetPlace()), - argmax->mutable_data(ctx.GetPlace())); + GPUROIPoolForward< + T><<>>( + output_size, in->data(), rois->data(), spatial_scale, + channels, height, width, pooled_height, pooled_width, + out->mutable_data(ctx.GetPlace()), + argmax->mutable_data(ctx.GetPlace())); } }; @@ -175,10 +162,8 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { auto* rois = ctx.Input("ROIs"); auto* argmax = ctx.Input("Argmax"); - auto* out_grad = - ctx.Input(framework::GradVarName("Out")); - auto* x_grad = - ctx.Output(framework::GradVarName("X")); + auto* out_grad = ctx.Input(framework::GradVarName("Out")); + auto* x_grad = ctx.Output(framework::GradVarName("X")); auto pooled_height = ctx.Attr("pooled_height"); auto pooled_width = ctx.Attr("pooled_width"); @@ -199,21 +184,13 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { int threads = kNumCUDAThreads; if (output_grad_size > 0) { - GPUROIPoolBackward - <<>>( - output_grad_size, - rois->data(), - out_grad->data(), - argmax->data(), - rois_num, - spatial_scale, - channels, - height, - width, - pooled_height, - pooled_width, - x_grad->mutable_data(ctx.GetPlace())); - } + GPUROIPoolBackward< + T><<>>( + output_grad_size, rois->data(), out_grad->data(), + argmax->data(), rois_num, spatial_scale, channels, height, + width, pooled_height, pooled_width, + x_grad->mutable_data(ctx.GetPlace())); + } } } }; @@ -223,8 +200,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_GPU_KERNEL( - roi_pool, - ops::GPUROIPoolOpKernel, + roi_pool, ops::GPUROIPoolOpKernel, ops::GPUROIPoolOpKernel); REGISTER_OP_GPU_KERNEL( roi_pool_grad, diff --git a/paddle/operators/roi_pool_op.h b/paddle/operators/roi_pool_op.h old mode 100755 new mode 100644 index bd7736d63125f1be57c8af5141208f66d0592adb..3812c66c65457b9d1337690d1a82759aab9a9732 --- a/paddle/operators/roi_pool_op.h +++ b/paddle/operators/roi_pool_op.h @@ -133,54 +133,47 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel { auto* in = ctx.Input("X"); auto* rois = ctx.Input("ROIs"); auto* argmax = ctx.Input("Argmax"); - auto* out_grad = ctx.Input(framework::GradVarName("Out")); - auto* x_grad = - ctx.Output(framework::GradVarName("X")); + auto* in_grad = ctx.Output(framework::GradVarName("X")); auto pooled_height = ctx.Attr("pooled_height"); auto pooled_width = ctx.Attr("pooled_width"); - if (x_grad) { - int channels = in->dims()[1]; - auto in_stride = framework::stride(in->dims()); - auto roi_stride = framework::stride(rois->dims()); - + if (in_grad) { const int64_t* rois_data = rois->data(); - int rois_num = rois->dims()[0]; - - T* x_grad_data = x_grad->mutable_data(ctx.GetPlace()); + const T* out_grad_data = out_grad->data(); + const int64_t* argmax_data = argmax->data(); + T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); math::SetConstant set_zero; - set_zero(ctx.device_context(), x_grad, static_cast(0)); + set_zero(ctx.device_context(), in_grad, static_cast(0)); - size_t roi_offset = roi_stride[0]; - size_t batch_offset = in_stride[0]; - size_t channel_offset = in_stride[1]; + auto in_stride = framework::stride(in->dims()); + auto argmax_stride = framework::stride(argmax->dims()); + auto roi_stride = framework::stride(rois->dims()); + auto out_stride = framework::stride(out_grad->dims()); - const T* out_grad_data = out_grad->data(); - size_t pool_channel_offset = pooled_height * pooled_width; - const int64_t* argmax_data = 
argmax->data(); + int rois_num = rois->dims()[0]; + int channels = in->dims()[1]; - for (size_t n = 0; n < rois_num; ++n) { - size_t roi_batch_idx = rois_data[0]; - T* batch_grad_data = x_grad_data + batch_offset * roi_batch_idx; + for (int n = 0; n < rois_num; ++n) { + int roi_batch_idx = rois_data[0]; + T* batch_grad_data = in_grad_data + roi_batch_idx * in_stride[0]; for (int c = 0; c < channels; ++c) { for (int ph = 0; ph < pooled_height; ++ph) { for (int pw = 0; pw < pooled_width; ++pw) { - size_t pool_index = ph * pooled_width + pw; - + int pool_index = ph * pooled_width + pw; if (argmax_data[pool_index] >= 0) { - size_t index = static_cast(argmax_data[pool_index]); + auto index = argmax_data[pool_index]; batch_grad_data[index] += out_grad_data[pool_index]; } } } - batch_grad_data += channel_offset; - out_grad_data += pool_channel_offset; - argmax_data += pool_channel_offset; + batch_grad_data += in_stride[1]; + out_grad_data += out_stride[1]; + argmax_data += argmax_stride[1]; } - rois_data += roi_offset; + rois_data += roi_stride[0]; } } } diff --git a/paddle/operators/save_op.cc b/paddle/operators/save_op.cc index 56909fb65f44ad00314103e21bee9535fbd59317..d4921cb80c8d78c52ae1887c36819b52621470eb 100644 --- a/paddle/operators/save_op.cc +++ b/paddle/operators/save_op.cc @@ -88,73 +88,7 @@ class SaveOp : public framework::OperatorBase { "SaveOp only support LoDTensor, %s has wrong type", iname); auto &tensor = var->Get(); - - { // the 1st field, uint32_t version - constexpr uint32_t version = 0; - fout.write(reinterpret_cast(&version), sizeof(version)); - } - { // the 2nd field, tensor description - // int32_t size - // void* protobuf message - framework::TensorDesc desc; - desc.set_data_type(framework::ToDataType(tensor.type())); - auto dims = framework::vectorize(tensor.dims()); - auto *pb_dims = desc.mutable_dims(); - pb_dims->Resize(static_cast(dims.size()), 0); - std::copy(dims.begin(), dims.end(), pb_dims->begin()); - int32_t size = desc.ByteSize(); - fout.write(reinterpret_cast(&size), sizeof(size)); - auto out = desc.SerializeAsString(); - fout.write(out.data(), size); - } - { // the 3rd field, tensor data - uint64_t size = tensor.memory_size(); - auto *data_ptr = tensor.data(); - PADDLE_ENFORCE(size < std::numeric_limits::max(), - "Index overflow when writing tensor"); - if (platform::is_gpu_place(tensor.place())) { -#ifdef PADDLE_WITH_CUDA - constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB - std::unique_ptr buf(new char[kBufSize]); - auto &gpu_dev_ctx = - static_cast(dev_ctx); - platform::CPUPlace cpu; - uintptr_t data = reinterpret_cast(data_ptr); - while (size != 0) { - size_t size_to_write = std::min(kBufSize, static_cast(size)); - memory::Copy(cpu, buf.get(), - boost::get(tensor.place()), - reinterpret_cast(data), size_to_write, - gpu_dev_ctx.stream()); - gpu_dev_ctx.Wait(); - fout.write(buf.get(), size_to_write); - data += size_to_write; - size -= size_to_write; - } -#else - PADDLE_THROW("Unexpected branch"); -#endif - } else { - fout.write(static_cast(data_ptr), - static_cast(size)); - } - } - { // the 4th field, lod information - // uint64_t lod_level - // uint64_t lod_level_1 size in byte. - // int* lod_level_1 data - // ... 
- auto lod = tensor.lod(); - uint64_t size = lod.size(); - fout.write(reinterpret_cast(&size), sizeof(size)); - - for (auto &each : lod) { - size = each.size() * sizeof(framework::LoD::value_type::value_type); - fout.write(reinterpret_cast(&size), sizeof(size)); - fout.write(reinterpret_cast(each.data()), - static_cast(size)); - } - } + framework::SerializeToStream(fout, tensor, dev_ctx); } }; diff --git a/paddle/operators/scale_op.cc b/paddle/operators/scale_op.cc index 5745580504fb9bda551f21665bff5c65ae82aeb9..e5c10fec4d840c58a74758a65ddfa93421ab4827 100644 --- a/paddle/operators/scale_op.cc +++ b/paddle/operators/scale_op.cc @@ -77,4 +77,6 @@ REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker); REGISTER_OP_CPU_KERNEL(scale, ops::ScaleKernel, - ops::ScaleKernel); + ops::ScaleKernel, + ops::ScaleKernel, + ops::ScaleKernel); diff --git a/paddle/operators/scale_op.cu b/paddle/operators/scale_op.cu index 820fd4e6855bb192ec3292ea6983d5ecae73b6e6..0d707751598e65bc56bf73a435c10b4acd6d8ed0 100644 --- a/paddle/operators/scale_op.cu +++ b/paddle/operators/scale_op.cu @@ -16,4 +16,6 @@ REGISTER_OP_GPU_KERNEL( scale, paddle::operators::ScaleKernel, - paddle::operators::ScaleKernel); + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel, + paddle::operators::ScaleKernel); diff --git a/paddle/operators/send_op.cc b/paddle/operators/send_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a3059847f2d420359b347e3a5d514d8a3829a4e2 --- /dev/null +++ b/paddle/operators/send_op.cc @@ -0,0 +1,84 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include + +#include "paddle/framework/data_type.h" +#include "paddle/framework/framework.pb.h" +#include "paddle/framework/lod_tensor.h" +#include "paddle/framework/op_registry.h" + +#include "paddle/operators/detail/send_recv_impl.h" +#include "paddle/operators/detail/simple_block_queue.h" + +namespace paddle { +namespace operators { + +// TODO(typhoonzero): this is a simple implementation which only send +// one tensor +class SendOp : public framework::OperatorBase { + public: + SendOp(const std::string &type, const framework::VariableNameMap &inputs, + const framework::VariableNameMap &outputs, + const framework::AttributeMap &attrs) + : OperatorBase(type, inputs, outputs, attrs) { + // init client when the operator is created at runtime. + if (!client_) { + std::string endpoint = Attr("endpoint"); + client_.reset(new detail::RPCClient( + grpc::CreateChannel(endpoint, grpc::InsecureChannelCredentials()))); + // TODO(typhoonzero): how to call InitVariables + } + } + void Run(const framework::Scope &scope, + const platform::DeviceContext &dev_ctx) const override { + auto iname = Input("X"); + auto oname = Output("Out"); + // TODO(typhoonzero): currently it's non-blocking, + // should block until server responds. 
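The save_op hunk above collapses roughly seventy lines of hand-rolled serialization into a single `framework::SerializeToStream` call. The deleted code remains the clearest record of the stream layout the helper presumably preserves; summarized from the removed lines:

```cpp
// Tensor stream layout, as implemented by the code this hunk deletes:
//   field 1: uint32_t version (currently 0)
//   field 2: int32_t size, then the serialized framework::TensorDesc proto
//   field 3: raw tensor bytes (tensor.memory_size() of them; GPU tensors
//            are staged to the host through a 64MB bounce buffer)
//   field 4: uint64_t lod_level, then one (uint64_t byte size, data) pair
//            per LoD level
```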
+ bool ret = client_->SendVariable(scope, iname, oname); + if (!ret) { + LOG(ERROR) << "send variable error"; + } + } + + protected: + std::shared_ptr client_{nullptr}; +}; + +class SendOpMaker : public framework::OpProtoAndCheckerMaker { + public: + SendOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput("X", "(Tensor) Input tensor to be sent"); + AddOutput("Out", "(Tensor) Output fetched from server"); + AddComment(R"DOC( +Send operator + +This operator will send a tensor to recv_op and fetch the result back into Out. +)DOC"); + AddAttr("endpoint", + "(string, default 127.0.0.1:6164)" + "Server endpoint to connect to.") + .SetDefault("127.0.0.1:6164") + .AddCustomChecker([](const std::string &ip) { return !ip.empty(); }); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OPERATOR(send, ops::SendOp, ops::SendOpMaker); diff --git a/paddle/operators/send_recv_op_test.cc b/paddle/operators/send_recv_op_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..ac03eb3752e7cd31dd80f4caa39dc0625f0409d5 --- /dev/null +++ b/paddle/operators/send_recv_op_test.cc @@ -0,0 +1,125 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +// TODO(typhoonzero): add python bindings for this test as +// a RemoteOptimizer. + +#include +#include + +#include "gtest/gtest.h" +#include "paddle/framework/op_registry.h" +#include "paddle/framework/operator.h" +#include "paddle/framework/program_desc.h" + +USE_NO_KERNEL_OP(send); +USE_NO_KERNEL_OP(recv); +USE_OP(sum); + +// global for simplicity.
+std::unique_ptr recv_op; + +void InitTensorsInScope(paddle::framework::Scope &scope, + paddle::platform::CPUPlace &place) { + paddle::platform::CPUDeviceContext ctx(place); + auto var = scope.Var("X"); + auto tensor = var->GetMutable(); + tensor->Resize({10, 10}); + float *expect = tensor->mutable_data(place); + for (int64_t i = 0; i < tensor->numel(); ++i) { + expect[i] = static_cast(i); + } + + auto out_var = scope.Var("Out"); + auto out_tensor = out_var->GetMutable(); + out_tensor->Resize({10, 10}); + out_tensor->mutable_data(place); // allocate +} + +void AddOp(const std::string &type, + const paddle::framework::VariableNameMap &inputs, + const paddle::framework::VariableNameMap &outputs, + paddle::framework::AttributeMap attrs, + paddle::framework::BlockDescBind *block) { + // insert output + for (auto kv : outputs) { + for (auto v : kv.second) { + auto var = block->Var(v); + var->SetDataType(paddle::framework::DataType::FP32); + } + } + + // insert op + auto op = block->AppendOp(); + op->SetType(type); + for (auto &kv : inputs) { + op->SetInput(kv.first, kv.second); + } + for (auto &kv : outputs) { + op->SetOutput(kv.first, kv.second); + } + op->SetAttrMap(attrs); +} + +void StartServerNet() { + paddle::framework::Scope scope; + paddle::platform::CPUPlace place; + InitTensorsInScope(scope, place); + + // Sub-program run inside recv_op; for this simple test we use sum. + paddle::framework::ProgramDescBind program; + paddle::framework::BlockDescBind *block = program.MutableBlock(0); + // X holds the server-side tensor, RX the received one; both must have + // the same shape. + AddOp("sum", {{"X", {"X", "RX"}}}, {{"Out", {"Out"}}}, {}, block); + + paddle::framework::AttributeMap attrs; + attrs.insert({"endpoint", std::string("127.0.0.1:6174")}); + attrs.insert({"OptimizeBlock", block}); + recv_op = paddle::framework::OpRegistry::CreateOp("recv", {{"RX", {"RX"}}}, + {{"Out", {"Out"}}}, attrs); + paddle::platform::CPUDeviceContext ctx(place); + recv_op->Run(scope, ctx); +} + +TEST(SendRecvOp, CPU) { + std::thread server_thread(StartServerNet); + sleep(5); // wait for the server to start + // local net + paddle::framework::Scope scope; + paddle::platform::CPUPlace place; + InitTensorsInScope(scope, place); + + paddle::framework::AttributeMap attrs; + attrs.insert({"endpoint", std::string("127.0.0.1:6174")}); + + auto send_op = paddle::framework::OpRegistry::CreateOp( + "send", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs); + paddle::platform::CPUDeviceContext ctx(place); + send_op->Run(scope, ctx); + + auto in_var = scope.Var("X"); + auto tensor = in_var->GetMutable(); + float *expected = tensor->data(); + + auto out_var = scope.Var("Out"); + auto target = out_var->GetMutable(); + // A failed send would leave the output empty. + EXPECT_NE(target->memory_size(), size_t(0)); + float *actual = target->data(); + // The server block sums its own X with the received RX (identical + // values), so every output element should be exactly doubled. + for (int64_t i = 0; i < target->numel(); ++i) { + EXPECT_EQ(expected[i] * 2, actual[i]); + } + recv_op.reset(); // the dtor shuts down and joins the server thread.
+ server_thread.join(); +} diff --git a/paddle/operators/sequence_slice_op.cc b/paddle/operators/sequence_slice_op.cc old mode 100755 new mode 100644 index cbe0b4233160dd1f3ebdf6db8b5f6df392efdfe7..255683a572c0e8d54791cb0c905d85239920d992 --- a/paddle/operators/sequence_slice_op.cc +++ b/paddle/operators/sequence_slice_op.cc @@ -45,7 +45,7 @@ class SequenceSliceOp : public framework::OperatorWithKernel { // Initialize the output's dims to maximum, // and re-set to real dims by the value of Offset and Length at kernel ctx->SetOutputDim("Out", input_dims); - } + } protected: framework::OpKernelType GetKernelType( @@ -93,8 +93,7 @@ class SequenceSliceOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor), " "a vector to describe the length of every input sequence for " "sub sequence item."); - AddOutput("Out", - "(LoDTensor), the output of SequenceSliceOp."); + AddOutput("Out", "(LoDTensor), the output of SequenceSliceOp."); AddComment(R"DOC( Sequence slice operator diff --git a/paddle/operators/sequence_slice_op.h b/paddle/operators/sequence_slice_op.h old mode 100755 new mode 100644 index 2c9b8464a1236a054cf1a38b9dc1d73588f8dd38..428ef556daa248a918f58dde608dc024144e773c --- a/paddle/operators/sequence_slice_op.h +++ b/paddle/operators/sequence_slice_op.h @@ -26,7 +26,7 @@ using LoD = framework::LoD; template inline LoD SequenceSliceLoD(const T& in, const int64_t* offset_data, - const int64_t* length_data) { + const int64_t* length_data) { auto out_lod = in.lod(); size_t lod_offset = 0; @@ -34,7 +34,7 @@ inline LoD SequenceSliceLoD(const T& in, const int64_t* offset_data, out_lod[0][0] = 0; for (size_t i = 0; i < n; ++i) { lod_offset += length_data[i]; - out_lod[0][i+1] = lod_offset; + out_lod[0][i + 1] = lod_offset; } return out_lod; } @@ -51,14 +51,13 @@ class SequenceSliceOpKernel : public framework::OpKernel { auto lod = in->lod(); auto n = lod[0].size() - 1; - PADDLE_ENFORCE_EQ(lod.size(), 1UL, - "Only support one level sequence now."); + PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now."); PADDLE_ENFORCE_EQ( n, static_cast(length->dims()[0]), - "The size of input-sequence and length-array should be the same") + "The size of input-sequence and length-array should be the same"); PADDLE_ENFORCE_EQ( n, static_cast(offset->dims()[0]), - "The size of input-sequence and offset-array should be the same") + "The size of input-sequence and offset-array should be the same"); const int64_t* offset_data = offset->data(); const int64_t* length_data = length->data(); @@ -67,23 +66,23 @@ class SequenceSliceOpKernel : public framework::OpKernel { if (platform::is_gpu_place(ctx.GetPlace())) { offset_cpu.mutable_data(offset->dims(), platform::CPUPlace()); - offset_cpu.CopyFrom(*offset, platform::CPUPlace(), ctx.device_context()); + framework::CopyFrom(*offset, platform::CPUPlace(), ctx.device_context(), + &offset_cpu); offset_data = offset_cpu.data(); length_cpu.mutable_data(length->dims(), platform::CPUPlace()); - length_cpu.CopyFrom(*length, platform::CPUPlace(), ctx.device_context()); + framework::CopyFrom(*length, platform::CPUPlace(), ctx.device_context(), + &length_cpu); length_data = length_cpu.data(); } for (size_t i = 0; i < n; ++i) { PADDLE_ENFORCE_LT(0, offset_data[i], - "The offset[%d] must greater than zero.", i) + "The offset[%d] must greater than zero.", i); PADDLE_ENFORCE_LT(0, length_data[i], - "The length[%d] must greater than zero.", i) - PADDLE_ENFORCE_LT( - lod[0][i] + offset_data[i] + length_data[i], - lod[0][i + 1], - "The target tensor's length 
overflow.") + "The length[%d] must greater than zero.", i); + PADDLE_ENFORCE_LT(lod[0][i] + offset_data[i] + length_data[i], + lod[0][i + 1], "The target tensor's length overflow."); } out->mutable_data(ctx.GetPlace()); @@ -98,14 +97,12 @@ class SequenceSliceOpKernel : public framework::OpKernel { size_t out_offset = 0; for (size_t i = 0; i < n; ++i) { - Tensor in_t = - in->Slice(static_cast(lod[0][i] + offset_data[i]), - static_cast(lod[0][i] + offset_data[i] + - length_data[i])); - - StridedMemcpy(ctx.device_context(), in_t.data(), - in_stride, in_t.dims(), out_stride, - out->data() + out_offset); + Tensor in_t = in->Slice( + static_cast(lod[0][i] + offset_data[i]), + static_cast(lod[0][i] + offset_data[i] + length_data[i])); + + StridedMemcpy(ctx.device_context(), in_t.data(), in_stride, + in_t.dims(), out_stride, out->data() + out_offset); out_offset += length_data[i] * in_stride[0]; } } @@ -130,11 +127,13 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { if (platform::is_gpu_place(ctx.GetPlace())) { offset_cpu.mutable_data(offset->dims(), platform::CPUPlace()); - offset_cpu.CopyFrom(*offset, platform::CPUPlace(), ctx.device_context()); + framework::CopyFrom(*offset, platform::CPUPlace(), ctx.device_context(), + &offset_cpu); offset_data = offset_cpu.data(); length_cpu.mutable_data(length->dims(), platform::CPUPlace()); - length_cpu.CopyFrom(*length, platform::CPUPlace(), ctx.device_context()); + framework::CopyFrom(*length, platform::CPUPlace(), ctx.device_context(), + &length_cpu); length_data = length_cpu.data(); } @@ -162,8 +161,8 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { static_cast(lod[0][i] + offset_data[i] + length_data[i])); StridedMemcpy(ctx.device_context(), out_grad_t.data(), - out_grad_stride, out_grad_t.dims(), x_grad_stride, - x_grad_t.data()); + out_grad_stride, out_grad_t.dims(), x_grad_stride, + x_grad_t.data()); } } } diff --git a/paddle/operators/sgd_op.cc b/paddle/operators/sgd_op.cc index 72f4e4d5cbcd692423fa2a3e9ec8e7033b552c3c..5576d7b8be060a3c58cb18ed667041562cf853b8 100644 --- a/paddle/operators/sgd_op.cc +++ b/paddle/operators/sgd_op.cc @@ -55,7 +55,7 @@ SGD operator This operator implements one step of the stochastic gradient descent algorithm. -$$param_out = param - learning_rate * grad$$ +$$param\_out = param - learning\_rate * grad$$ )DOC"); } diff --git a/paddle/operators/shrink_rnn_memory_op.cc b/paddle/operators/shrink_rnn_memory_op.cc index 65bccc0c81d0ad9674649933a20ec7b09fec5b37..c380e606869fd2c559c7d5f378857ca74fa8d8d3 100644 --- a/paddle/operators/shrink_rnn_memory_op.cc +++ b/paddle/operators/shrink_rnn_memory_op.cc @@ -57,11 +57,21 @@ class ShrinkRNNMemoryOpProtoMaker : public framework::OpProtoAndCheckerMaker { ShrinkRNNMemoryOpProtoMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { - AddInput("X", ""); - AddInput("RankTable", ""); - AddInput("I", ""); - AddOutput("Out", ""); - AddComment(""); + AddInput("X", "(LoDTensor) The RNN step memory to be shrinked."); + AddInput("RankTable", "(LoDRankTable) The lod_rank_table of dynamic RNN."); + AddInput("I", + "(LoDTensor) The step index. The RNN step memory 'X' will be " + "shrinked to match the size of the input of the index'th step."); + AddOutput("Out", "(LoDTensor) The shrinked RNN step memory."); + AddComment( + R"DOC( + In dynamic RNN, we are able to handle sequences of different lengths. 
+ Because of the multiple lengths, the size of each step input can be + different, which may lead to a mismatching between the input of + the current step and the memory generated by the previous one. This + operator shrinks memory according to the size of the next step input, + to make sure that they can match each other. + )DOC"); } }; @@ -101,8 +111,8 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { } else { auto &dout_tensor = dout_var->Get(); auto height = dout_tensor.dims()[0]; - dx_tensor.Slice(0, static_cast(height)) - .CopyFrom(dout_tensor, dout_tensor.place(), dev_ctx); + auto slice = dx_tensor.Slice(0, static_cast(height)); + framework::CopyFrom(dout_tensor, dout_tensor.place(), dev_ctx, &slice); if (dx_tensor.dims()[0] < height) { auto rest_tensor = dx_tensor.Slice( static_cast(height), static_cast(dout_tensor.dims()[0])); diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc index d9e40546523c60b0a7eec2e0593446258996ba58..782f4c79361b3255cc686ec3b1edf31ce37f5a2d 100644 --- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -25,20 +25,19 @@ class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); - PADDLE_ENFORCE(ctx->HasInput("Labels"), - "Input(Labels) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should be not null."); auto x_dims = ctx->GetInputDim("X"); - auto labels_dims = ctx->GetInputDim("Labels"); + auto labels_dims = ctx->GetInputDim("Label"); PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); PADDLE_ENFORCE_EQ(labels_dims.size(), 2, - "Input(Labels)'s rank should be 2."); + "Input(Label)'s rank should be 2."); PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0], - "The 1st dimension of Input(X) and Input(Labels) should " + "The 1st dimension of Input(X) and Input(Label) should " "be equal."); PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1], - "The 2nd dimension of Input(X) and Input(Labels) should " + "The 2nd dimension of Input(X) and Input(Label) should " "be equal."); ctx->SetOutputDim("Out", x_dims); @@ -53,26 +52,25 @@ class SigmoidCrossEntropyWithLogitsGradOp void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); - PADDLE_ENFORCE(ctx->HasInput("Labels"), - "Input(Labels) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), "Input(Out@GRAD) shoudl be not null."); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "Output(X@GRAD) should be not null."); auto x_dims = ctx->GetInputDim("X"); - auto labels_dims = ctx->GetInputDim("Labels"); + auto labels_dims = ctx->GetInputDim("Label"); auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); PADDLE_ENFORCE_EQ(labels_dims.size(), 2, - "Input(Labels)'s rank should be 2."); + "Input(Label)'s rank should be 2."); PADDLE_ENFORCE_EQ(dout_dims.size(), 2, "Input(Out@Grad)'s rank should be 2."); PADDLE_ENFORCE_EQ(x_dims[0], labels_dims[0], - "The 1st dimension of Input(X) and Input(Labels) should " + "The 1st dimension of Input(X) and 
Input(Label) should " "be equal."); PADDLE_ENFORCE_EQ(x_dims[1], labels_dims[1], - "The 2nd dimension of Input(X) and Input(Labels) should " + "The 2nd dimension of Input(X) and Input(Label) should " "be equal."); PADDLE_ENFORCE_EQ(x_dims[0], dout_dims[0], "The 1st dimension of Input(X) and Input(Out@Grad) " @@ -97,7 +95,7 @@ class SigmoidCrossEntropyWithLogitsOpMaker "This input is a tensor of logits computed by the previous " " operator. Logits are unscaled log probabilities given as " "log(p/(1-p))."); - AddInput("Labels", + AddInput("Label", "(Tensor, default Tensor), a 2-D tensor of the same type " "and shape as X. This input is a tensor of probabalistic labels " "for each logit"); diff --git a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h index 41c619f181c878f08959a8ca461c60af5ffdff2a..2a9d9bbc77266c8ecfba82663c396bbd8e4dbe27 100644 --- a/paddle/operators/sigmoid_cross_entropy_with_logits_op.h +++ b/paddle/operators/sigmoid_cross_entropy_with_logits_op.h @@ -25,8 +25,7 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { const framework::Tensor *X = context.Input("X"); - const framework::Tensor *Labels = - context.Input("Labels"); + const framework::Tensor *Labels = context.Input("Label"); framework::Tensor *Out = context.Output("Out"); Out->mutable_data(context.GetPlace()); @@ -52,8 +51,7 @@ class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { const framework::Tensor *X = context.Input("X"); - const framework::Tensor *Labels = - context.Input("Labels"); + const framework::Tensor *Labels = context.Input("Label"); const framework::Tensor *dOut = context.Input(framework::GradVarName("Out")); framework::Tensor *dX = diff --git a/paddle/operators/smooth_l1_loss_op.cc b/paddle/operators/smooth_l1_loss_op.cc index ebf7b43700a7498aa18b5f648b0b8c2c4e7b442b..50543fcc148698c42e15259ba20bdacdd50ac1af 100644 --- a/paddle/operators/smooth_l1_loss_op.cc +++ b/paddle/operators/smooth_l1_loss_op.cc @@ -22,22 +22,20 @@ class SmoothL1LossOp : public framework::OperatorWithKernel { using framework::OperatorWithKernel::OperatorWithKernel; void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("X"), "X must be initialized."); - PADDLE_ENFORCE(ctx->HasInput("Y"), "Y must be initialized."); + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null."); auto x_dims = ctx->GetInputDim("X"); auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ(x_dims, y_dims, "The shape of X and Y must be the same."); + PADDLE_ENFORCE_EQ(x_dims, y_dims); PADDLE_ENFORCE_GE(x_dims.size(), 2, - "The tensor rank of X must be at least 2."); + "The tensor rank of Input(X) should not be less than 2."); if (ctx->HasInput("InsideWeight")) { PADDLE_ENFORCE(ctx->HasInput("OutsideWeight"), "If weights are provided, must specify both " "inside and outside weights."); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims, - "The shape of InsideWeight must be same as X."); - PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims, - "The shape of OutsideWeight must be same as X."); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims); + PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims); } ctx->SetOutputDim("Diff", 
x_dims); @@ -53,25 +51,29 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker { framework::OpAttrChecker* op_checker) : OpProtoAndCheckerMaker(proto, op_checker) { AddInput("X", - "The input tensor of smooth l1 loss op." - "The rank should be greater or equal to 2 with shape " - "[batch_size, value_dim1, value_dim2, ..., value_dimN]"); + "(Tensor, default Tensor<float>) A tensor with rank at least 2. " + "The input value of smooth l1 loss op with shape " + "[batch_size, dim1, ..., dimN]."); AddInput("Y", - "The target tensor of smooth l1 loss op " - "with the same shape as X."); + "(Tensor, default Tensor<float>) A tensor with rank at least 2. " + "The target value of smooth l1 loss op with the same shape as X."); AddInput("InsideWeight", - "Optional input tensor of smooth l1 loss op with the same shape " - "as X. If provided, the result of (X - Y) will be multiplied " + "(Tensor, default Tensor<float>) A tensor with rank at least 2. " + "This input is optional and should have the same shape as X. " + "If provided, the result of (X - Y) will be multiplied " "by this tensor element by element.") .AsDispensable(); AddInput("OutsideWeight", - "Optinal input of smooth l1 loss op with the same shape as X." - "If provided, the output smooth l1 loss will be multiplied by " - "this tensor element by element.") + "(Tensor, default Tensor<float>) A tensor with rank at least 2. " + "This input is optional and should have the same shape as X. " + "If provided, the output smooth l1 loss will be multiplied by this " + "tensor element by element.") .AsDispensable(); - AddOutput("Diff", "Intermediate variable to cache InsideWeight*(X-Y).") + AddOutput("Diff", "Intermediate variable to cache InsideWeight * (X - Y).") .AsIntermediate(); - AddOutput("Out", "Smooth l1 loss."); + AddOutput("Out", + "(Tensor, default Tensor<float>) A tensor of rank 2. " + "The output smooth l1 loss with shape [batch_size, 1]."); AddAttr<float>("sigma", "Hyperparameter of smooth l1 loss op. " "A float scalar with default value 3.0.") @@ -79,15 +81,23 @@ class SmoothL1LossOpMaker : public framework::OpProtoAndCheckerMaker { AddComment(R"DOC( Smooth L1 Loss Operator. -This operator computes the smooth l1 loss for input and target. -The operator takes the first dimension of input as the batch size. +This operator computes the smooth l1 loss for X and Y. +The operator takes the first dimension of X and Y as batch size. For each instance, it computes the smooth l1 loss element by element first -and then sums all the losses. So the resulting output shape -is [batch_size, 1]. +and then sums all the losses. So the shape of Out is [batch_size, 1]. The equation is: -loss = $$0.5 * (\sigma * (x-y))^2$$ if $$|x - y| < 1 /({\sigma}^2)$$ - $$\frac{|x - y| - 0.5}{{\sigma}^2}$$ otherwise +$$ +Out_{\sigma}(X, Y)_i = \begin{cases} +0.5 * (\sigma * (X_i - Y_i))^2, +\quad \text{if } |X_i - Y_i| < \frac{1}{\sigma^2} \\ +|X_i - Y_i| - \frac{0.5}{\sigma^2}, +\quad \text{otherwise} +\end{cases} +$$ + +In the above equation, $Out_{\sigma}(X, Y)_i$, $X_i$ and $Y_i$ represent the $i$-th +element of Out, X and Y.
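A minimal scalar sketch of the piecewise definition above (standalone C++, not
the Paddle kernel; names are illustrative):

#include <cassert>
#include <cmath>

// Elementwise smooth L1 with the sigma parametrization used above:
// quadratic for |x - y| < 1 / sigma^2, linear beyond, continuous at the edge.
inline float SmoothL1(float x, float y, float sigma) {
  const float sigma2 = sigma * sigma;
  const float diff = std::fabs(x - y);
  if (diff < 1.0f / sigma2) {
    return 0.5f * (sigma * diff) * (sigma * diff);  // 0.5 * (sigma * d)^2
  }
  return diff - 0.5f / sigma2;  // |d| - 0.5 / sigma^2
}

int main() {
  const float sigma = 3.0f;
  const float edge = 1.0f / (sigma * sigma);  // switch point
  // Both branches agree at the switch point: 0.5 / sigma^2.
  assert(std::fabs(SmoothL1(edge, 0.0f, sigma) - 0.5f / (sigma * sigma)) <
         1e-6f);
  return 0;
}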
)DOC"); } diff --git a/paddle/operators/split_lod_tensor_op.cc b/paddle/operators/split_lod_tensor_op.cc index db635f2ba0804143c9a2e04ff006dfbc8744f3fc..f164a4771186635232fea46327ca1fb8b86f2852 100644 --- a/paddle/operators/split_lod_tensor_op.cc +++ b/paddle/operators/split_lod_tensor_op.cc @@ -49,7 +49,7 @@ class SplitLoDTensorOp : public framework::OperatorBase { cpu_mask->ShareDataWith(mask); } else if (platform::is_gpu_place(mask.place())) { #ifdef PADDLE_WITH_CUDA - cpu_mask->CopyFrom(mask, platform::CPUPlace(), dev_ctx); + framework::CopyFrom(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get()); #else PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option"); #endif @@ -105,10 +105,11 @@ class SplitLoDTensorOp : public framework::OperatorBase { continue; } // out[offset: offset+len] = x[each_range.begin: each_range.end] - out->Slice(static_cast(offset), static_cast(offset + len)) - .CopyFrom(x.Slice(static_cast(each_range.begin), - static_cast(each_range.end)), - x.place(), dev_ctx); + auto slice = out->Slice(static_cast(offset), + static_cast(offset + len)); + framework::CopyFrom(x.Slice(static_cast(each_range.begin), + static_cast(each_range.end)), + x.place(), dev_ctx, &slice); offset += len; } } diff --git a/paddle/operators/sum_op.h b/paddle/operators/sum_op.h index 4ca15611392b3117aa6c92cba95911eb8bebeb15..a1eb3b014edc65b7ed604c8b7f17d72f7e460f70 100644 --- a/paddle/operators/sum_op.h +++ b/paddle/operators/sum_op.h @@ -84,7 +84,7 @@ class SumKernel : public framework::OpKernel { int64_t offset = 0; for (int i = 0; i < N; i++) { PADDLE_ENFORCE_EQ(out->height(), - in_vars[i]->Get().height()) + in_vars[i]->Get().height()); functor(context.device_context(), in_vars[i]->Get(), offset, out); offset += in_vars[i]->Get().value().numel(); @@ -102,8 +102,8 @@ class SumKernel : public framework::OpKernel { out_array.resize(i + 1); } if (out_array[i].numel() == 0) { - out_array[i].CopyFrom(in_array[i], in_array[i].place(), - context.device_context()); + framework::CopyFrom(in_array[i], in_array[i].place(), + context.device_context(), &out_array[i]); out_array[i].set_lod(in_array[i].lod()); } else { PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod()); diff --git a/paddle/operators/tensor.save b/paddle/operators/tensor.save new file mode 100644 index 0000000000000000000000000000000000000000..c24308a7d0131b84c28c0a9857cce4949afb2091 Binary files /dev/null and b/paddle/operators/tensor.save differ diff --git a/paddle/operators/tensor_array_read_write_op.cc b/paddle/operators/tensor_array_read_write_op.cc index ae1b48d7a8e3d573a5134a822a2ed5ef70511077..efde850143ce188300667b21e4b539b1d150d9ae 100644 --- a/paddle/operators/tensor_array_read_write_op.cc +++ b/paddle/operators/tensor_array_read_write_op.cc @@ -37,9 +37,15 @@ class WriteToArrayOp : public ArrayOp { << " to " << offset + 1; out->resize(offset + 1); } - auto *out_tensor = &out->at(offset); - out_tensor->CopyFrom(x_tensor, dev_ctx.GetPlace(), dev_ctx); - out_tensor->set_lod(x_tensor.lod()); + if (x_tensor.memory_size() > 0) { + auto *out_tensor = &out->at(offset); + CopyFrom(x_tensor, dev_ctx.GetPlace(), dev_ctx, out_tensor); + out_tensor->set_lod(x_tensor.lod()); + } else { + VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " + "nothing has been written to output array[" + << offset << "]."; + } } }; @@ -116,7 +122,8 @@ class ReadFromArrayOp : public ArrayOp { auto *out_tensor = out->GetMutable(); size_t offset = GetOffset(scope, dev_ctx); PADDLE_ENFORCE_LT(offset, x_array.size()); - 
out_tensor->CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx); + framework::CopyFrom(x_array[offset], dev_ctx.GetPlace(), dev_ctx, + out_tensor); out_tensor->set_lod(x_array[offset].lod()); } }; diff --git a/paddle/operators/unpool_op.cc b/paddle/operators/unpool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..89c48e071cf351f7d7b9cf26a5d4989af291da57 --- /dev/null +++ b/paddle/operators/unpool_op.cc @@ -0,0 +1,143 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/unpool_op.h" +namespace paddle { +namespace operators { + +class Unpool2dOpMaker : public framework::OpProtoAndCheckerMaker { + public: + Unpool2dOpMaker(framework::OpProto* proto, + framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(proto, op_checker) { + AddInput( + "X", + "(Tensor) The input tensor of unpool operator. " + "The format of the input tensor is NCHW, where N is batch size, C is " + "the number of channels, and H and W are the height and width of the " + "feature."); + AddInput( + "Indices", + "(Tensor) The input tensor of the indices given out by MaxPool2d. " + "The format of the input tensor is NCHW, where N is batch size, C is " + "the number of channels, and H and W are the height and width of the " + "feature."); + AddOutput("Out", + "(Tensor) The output tensor of unpool operator. " + "The format of the output tensor is also NCHW, " + "where N is batch size, C is " + "the number of channels, and H and W are the height and " + "width of the feature."); + AddAttr<std::vector<int>>( + "ksize", + "(vector<int>), the unpooling window size (height, width) " + "of the unpooling operator."); + AddAttr<std::vector<int>>("strides", + "(vector<int>, default: {1, 1}), " + "strides (height, width) of the unpooling operator.") + .SetDefault({1, 1}); + AddAttr<std::vector<int>>("paddings", + "(vector<int>, default: {0, 0}), " + "paddings (height, width) of the unpooling operator.") + .SetDefault({0, 0}); + AddAttr<std::string>( + "unpooling_type", + "(string), the unpooling type; only \"max\" for max-unpooling " + "is supported.") + .InEnum({"max"}); + AddComment(R"DOC( + Input shape: $(N, C_{in}, H_{in}, W_{in})$ + Output shape: $(N, C_{out}, H_{out}, W_{out})$ + Where + $$ + H_{out} = (H_{in} - 1) * strides[0] - 2 * paddings[0] + ksize[0] \\ + W_{out} = (W_{in} - 1) * strides[1] - 2 * paddings[1] + ksize[1] + $$ + Paper: http://www.matthewzeiler.com/wp-content/uploads/2017/07/iccv2011.pdf + )DOC"); + } +}; + +int OutputSize(int input_size, int ksize, int padding, int stride) { + int output_size = (input_size - 1) * stride - 2 * padding + ksize; + return output_size; +} + +class UnpoolOp : public framework::OperatorWithKernel { + protected: + framework::OpKernelType GetKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()), + ctx.device_context()); + } + + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of UnpoolOp " + "should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Indices"), + "Input(Indices) of UnpoolOp " + "should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of UnpoolOp should not be null."); + auto in_x_dims = ctx->GetInputDim("X"); + auto in_y_dims = ctx->GetInputDim("Indices"); + std::string unpooling_type = + ctx->Attrs().Get<std::string>("unpooling_type"); + std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize"); + std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides"); + std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings"); + PADDLE_ENFORCE(in_x_dims.size() == 4, + "Unpooling input must be 4-dimensional."); + PADDLE_ENFORCE_EQ(in_x_dims, in_y_dims); + std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]}); + for (size_t i = 0; i < ksize.size(); ++i) { + output_shape.push_back( + OutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); + } + ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + } +}; + +class UnpoolOpGrad : public framework::OperatorWithKernel { + protected: + framework::OpKernelType GetKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()), + ctx.device_context()); + } + + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@GRAD) should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP(unpool, ops::UnpoolOp, ops::Unpool2dOpMaker, unpool_grad, + ops::UnpoolOpGrad); +REGISTER_OP_CPU_KERNEL(unpool, + ops::UnpoolKernel<paddle::platform::CPUPlace, float>, + ops::UnpoolKernel<paddle::platform::CPUPlace, double>); +REGISTER_OP_CPU_KERNEL( + unpool_grad, ops::UnpoolGradKernel<paddle::platform::CPUPlace, float>, + ops::UnpoolGradKernel<paddle::platform::CPUPlace, double>); diff --git a/paddle/operators/unpool_op.cu.cc b/paddle/operators/unpool_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..18aafb7dc74ed474ed3ec5e8a388ecdb71b9a8f5 --- /dev/null +++ b/paddle/operators/unpool_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/operators/unpool_op.h" + +namespace ops = paddle::operators; +REGISTER_OP_GPU_KERNEL(unpool, + ops::UnpoolKernel<paddle::platform::GPUPlace, float>, + ops::UnpoolKernel<paddle::platform::GPUPlace, double>); +REGISTER_OP_GPU_KERNEL( + unpool_grad, ops::UnpoolGradKernel<paddle::platform::GPUPlace, float>, + ops::UnpoolGradKernel<paddle::platform::GPUPlace, double>); diff --git a/paddle/operators/unpool_op.h b/paddle/operators/unpool_op.h new file mode 100644 index 0000000000000000000000000000000000000000..243eb7e532c5149db4fb1b381fd8664ae4bdd81a --- /dev/null +++ b/paddle/operators/unpool_op.h @@ -0,0 +1,71 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#pragma once + +#include "paddle/framework/op_registry.h" +#include "paddle/operators/math/math_function.h" +#include "paddle/operators/math/unpooling.h" + +namespace paddle { +namespace operators { +template +class UnpoolKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* in_y = context.Input("Indices"); + auto* out = context.Output("Out"); + std::string unpooling_type = context.Attr("unpooling_type"); + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + T* output_data = out->mutable_data(context.GetPlace()); + if (output_data) { + math::SetConstant set_zero; + set_zero(context.device_context(), out, static_cast(0)); + } + math::Unpool2dMaxFunctor unpool2d_max_forward; + unpool2d_max_forward(context.device_context(), *in_x, *in_y, out); + } +}; +template +class UnpoolGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const framework::Tensor* in_x = context.Input("X"); + const framework::Tensor* in_y = context.Input("Indices"); + const framework::Tensor* out = context.Input("Out"); + const framework::Tensor* out_grad = + context.Input(framework::GradVarName("Out")); + framework::Tensor* in_x_grad = + context.Output(framework::GradVarName("X")); + std::string unpooling_type = context.Attr("unpooling_type"); + std::vector ksize = context.Attr>("ksize"); + std::vector strides = context.Attr>("strides"); + std::vector paddings = context.Attr>("paddings"); + + auto& device_ctx = context.device_context(); + math::SetConstant zero; + if (in_x_grad) { + in_x_grad->mutable_data(context.GetPlace()); + zero(device_ctx, in_x_grad, static_cast(0)); + } + math::Unpool2dMaxGradFunctor unpool2d_max_backward; + unpool2d_max_backward(context.device_context(), *in_x, *in_y, *out, + *out_grad, in_x_grad); + } +}; +} // namespace operators +} // namespace paddle diff --git a/paddle/operators/while_op.cc b/paddle/operators/while_op.cc index 68b4f7705995e5ecb6c9b8216db7373c1777a31e..59460f6c879cf2c14fd27e33ecb1ba45b21da485 100644 --- a/paddle/operators/while_op.cc +++ b/paddle/operators/while_op.cc @@ -287,7 +287,6 @@ class WhileGradOpShapeInference : public framework::InferShapeBase { auto p_names = ctx->Inputs(kParameters); auto pg_names = ctx->Outputs(kParamGrads); - auto dims = ctx->GetInputsDim(kParameters); auto var_types = ctx->GetInputsVarType(kParameters); std::vector names_to_set; std::vector dims_to_set; @@ -295,13 +294,14 @@ class WhileGradOpShapeInference : public framework::InferShapeBase { if (pg_names[i] == framework::kEmptyVarName) { continue; } + auto dims = ctx->GetInputsElementDim(kParameters, i); if (var_types[i] == framework::VarDesc::LOD_TENSOR) { names_to_set.push_back(pg_names[i]); - dims_to_set.push_back(dims[i]); + dims_to_set.push_back(dims); } else if (var_types[i] == framework::VarDesc::LOD_TENSOR_ARRAY) { // not sure how to set the dim of LOD_TENSOR_ARRAY names_to_set.push_back(pg_names[i]); - dims_to_set.push_back(dims[i]); + dims_to_set.push_back(dims); } } ctx->SetDims(names_to_set, dims_to_set); diff --git a/paddle/optimizer/parameter_optimizer_test.cc b/paddle/optimizer/parameter_optimizer_test.cc index f29e5317120642e3790a6f6c1976bdda67093a0c..83757a391784453341f22eca73bc73c14ce4174f 100644 --- a/paddle/optimizer/parameter_optimizer_test.cc +++ 
b/paddle/optimizer/parameter_optimizer_test.cc @@ -127,8 +127,3 @@ TEST_F(OptimizerTest, TestGetWeight) { TestGetWeight(); } TEST_F(OptimizerTest, TestUpdate) { TestUpdate(); } TEST_F(OptimizerTest, TestCheckPoint) { TestCheckPoint(); } - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/paddle/optimizer/serialization_test.cc b/paddle/optimizer/serialization_test.cc index 4c416f55ee0bd70f9ec6e288b08a5399d8b2bf39..940e941e9042d8a37363311867df5bb477b3dac0 100644 --- a/paddle/optimizer/serialization_test.cc +++ b/paddle/optimizer/serialization_test.cc @@ -46,8 +46,3 @@ TEST(TensorToProto, Case2) { EXPECT_EQ(t1[i], t[i]); } } - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/paddle/platform/cuda_profiler.h b/paddle/platform/cuda_profiler.h new file mode 100644 index 0000000000000000000000000000000000000000..b6311cb23d695c3cd851bcca120c24cced7fdd62 --- /dev/null +++ b/paddle/platform/cuda_profiler.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include + +namespace paddle { +namespace platform { + +void CudaProfilerInit(std::string output_file, std::string output_mode, + std::vector config_flags) { + std::array buf; + std::string tmpl = "/tmp/cuda_profile_config.XXXXXX"; + PADDLE_ENFORCE_LT(tmpl.size(), buf.size()); + memcpy(buf.data(), tmpl.data(), tmpl.size()); + auto result = mktemp(buf.data()); + PADDLE_ENFORCE(strlen(result) != 0); + std::string config_file = result; + + { + std::ofstream ofs(config_file, std::ios::out | std::ios::trunc); + PADDLE_ENFORCE(ofs.is_open(), "ofstream: ", ofs.rdstate()); + for (const auto& line : config_flags) { + ofs << line << std::endl; + } + } + + PADDLE_ENFORCE(output_mode == "kvp" || output_mode == "csv"); + cudaOutputMode_t mode = output_mode == "csv" ? 
cudaCSV : cudaKeyValuePair; + PADDLE_ENFORCE( + cudaProfilerInitialize(config_file.c_str(), output_file.c_str(), mode)); +} + +void CudaProfilerStart() { PADDLE_ENFORCE(cudaProfilerStart()); } + +void CudaProfilerStop() { PADDLE_ENFORCE(cudaProfilerStop()); } + +} // namespace platform +} // namespace paddle diff --git a/paddle/platform/cudnn_helper.h b/paddle/platform/cudnn_helper.h index c5d8a6066ef3becb601344590f977a38c2af0a63..80a4c9bb4bbcd03cf849d86118db4e502382f031 100644 --- a/paddle/platform/cudnn_helper.h +++ b/paddle/platform/cudnn_helper.h @@ -116,7 +116,7 @@ inline cudnnTensorFormat_t GetCudnnTensorFormat( case DataLayout::kNCHW: return CUDNN_TENSOR_NCHW; case DataLayout::kNCDHW: - return CUDNN_TENSOR_NCHW; // TODO(chengduoZH) : add CUDNN_TENSOR_NCDHW + return CUDNN_TENSOR_NCHW; // NOTE: cudnn treat NdTensor as the same default: PADDLE_THROW("Unknown cudnn equivalent for order"); } @@ -143,7 +143,7 @@ class ScopedTensorDescriptor { strides[i] = dims[i + 1] * strides[i + 1]; } // Update tensor descriptor dims setting if groups > 1 - // FIXME(typhoonzero): Assume using NCHW or NCDHW order + // NOTE: Assume using NCHW or NCDHW order std::vector dims_with_group(dims.begin(), dims.end()); // copy if (groups > 1) { dims_with_group[1] = dims_with_group[1] / groups; @@ -186,7 +186,6 @@ class ScopedFilterDescriptor { // width of the filter. std::vector kernel_with_group(kernel.begin(), kernel.end()); if (groups > 1) { - // M /= groups kernel_with_group[0] /= groups; // NOTE: input filter(C) of the filter is already asserted to be C/groups. } diff --git a/paddle/platform/dynload/cudnn.cc b/paddle/platform/dynload/cudnn.cc index d3e4cb567d71b987724366b6a0896f5df0eb6055..761d9edd87f428ba140d29a566fc3401199bab15 100644 --- a/paddle/platform/dynload/cudnn.cc +++ b/paddle/platform/dynload/cudnn.cc @@ -37,6 +37,10 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DEFINE_WRAP); CUDNN_DNN_ROUTINE_EACH_R5(DEFINE_WRAP); #endif +#ifdef CUDNN_DNN_ROUTINE_EACH_R7 +CUDNN_DNN_ROUTINE_EACH_R7(DEFINE_WRAP); +#endif + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/platform/dynload/cudnn.h b/paddle/platform/dynload/cudnn.h index b2d69da93bcd4a5c8e694a18ca648ddc4bd947af..61caac545014db2a09e2ada0b508419578c49740 100644 --- a/paddle/platform/dynload/cudnn.h +++ b/paddle/platform/dynload/cudnn.h @@ -135,6 +135,12 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif +#if CUDNN_VERSION >= 7001 +#define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \ + __macro(cudnnSetConvolutionGroupCount); +CUDNN_DNN_ROUTINE_EACH_R7(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/platform/enforce.h b/paddle/platform/enforce.h index 415020ab965fa976c37870b7ad5794aab947fb4e..97338a4ce68da46ead7d66f7e01a3d5c8623b3c4 100644 --- a/paddle/platform/enforce.h +++ b/paddle/platform/enforce.h @@ -234,16 +234,24 @@ inline void throw_on_error(T e) { __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__) #define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \ __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__) -#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \ - PADDLE_ENFORCE(nullptr != (__VAL), #__VAL " should not be null\n%s", \ - paddle::string::Sprintf("" __VA_ARGS__)); - -#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) 
\ - PADDLE_ENFORCE(__VAL0 __CMP __VAL1, \ - "enforce %s " #__CMP " %s failed, %s " #__INV_CMP " %s\n%s", \ - #__VAL0, #__VAL1, paddle::string::to_string(__VAL0), \ - paddle::string::to_string(__VAL1), \ - paddle::string::Sprintf("" __VA_ARGS__)); +#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \ + do { \ + if (UNLIKELY(nullptr == (__VAL))) { \ + PADDLE_THROW(#__VAL " should not be null\n%s", \ + paddle::string::Sprintf("" __VA_ARGS__)); \ + } \ + } while (0) + +#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \ + do { \ + if (!UNLIKELY((__VAL0)__CMP(__VAL1))) { \ + PADDLE_THROW("enforce %s " #__CMP " %s failed, %s " #__INV_CMP \ + " %s\n%s", \ + #__VAL0, #__VAL1, paddle::string::to_string(__VAL0), \ + paddle::string::to_string(__VAL1), \ + paddle::string::Sprintf("" __VA_ARGS__)); \ + } \ + } while (0) } // namespace platform } // namespace paddle diff --git a/paddle/pybind/CMakeLists.txt b/paddle/pybind/CMakeLists.txt index a9bcc474387513a8ca019bc9382b88c93e08ff8d..a54dc0d9fdb3c30391b01966ad493540c8ad1375 100644 --- a/paddle/pybind/CMakeLists.txt +++ b/paddle/pybind/CMakeLists.txt @@ -1,8 +1,8 @@ if(WITH_PYTHON) cc_library(paddle_pybind SHARED SRCS pybind.cc exception.cc protobuf.cc - DEPS pybind python backward proto_desc tensor_array paddle_memory executor prune + DEPS pybind python backward proto_desc paddle_memory executor prune ${GLOB_OP_LIB}) endif(WITH_PYTHON) -cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB} tensor_array) +cc_binary(print_operators_doc SRCS print_operators_doc.cc DEPS ${GLOB_OP_LIB}) diff --git a/paddle/pybind/pybind.cc b/paddle/pybind/pybind.cc index e697739cc6814b24710f7caa173d200f2e5d823d..c16d3e0cbe01f90a5aa9a5d7a523cd4e282e4771 100644 --- a/paddle/pybind/pybind.cc +++ b/paddle/pybind/pybind.cc @@ -26,9 +26,7 @@ limitations under the License. */ #include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/prune.h" #include "paddle/framework/selected_rows.h" -#include "paddle/framework/tensor_array.h" #include "paddle/operators/cond_op.h" -#include "paddle/operators/dynamic_recurrent_op.h" #include "paddle/operators/net_op.h" #include "paddle/platform/enforce.h" #include "paddle/platform/place.h" @@ -39,6 +37,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include "paddle/operators/nccl/nccl_gpu_common.h" +#include "paddle/platform/cuda_profiler.h" #include "paddle/platform/gpu_info.h" #endif @@ -395,83 +394,6 @@ All parameter, weight, gradient are variables in Paddle. 
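// The do { ... } while (0) wrapping added to PADDLE_ENFORCE_NOT_NULL and
// __PADDLE_BINARY_COMPARE above makes each macro expand to exactly one
// statement. A minimal sketch of the dangling-else pitfall it prevents
// (illustrative macros, not Paddle code):
//
//   #include <cstdio>
//
//   #define CHECK_BAD(cond) \
//     if (!(cond)) { std::printf("failed: %s\n", #cond); }
//
//   #define CHECK_GOOD(cond)                             \
//     do {                                               \
//       if (!(cond)) std::printf("failed: %s\n", #cond); \
//     } while (0)
//
//   // `if (x) CHECK_BAD(x > 0); else ...` fails to compile: the `;` ends the
//   // outer `if`, leaving the `else` unattached. The do/while form expands
//   // to a single statement, so `if (x) CHECK_GOOD(x > 0); else ...` works.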
self->CompleteAddOp(); }); - py::class_(m, "TensorArray") - .def("__init__", - [](TensorArray &instance) { new (&instance) TensorArray(); }) - .def("read", - [](TensorArray &self, size_t index) { return self.Read(index); }) - .def("write", [](TensorArray &self, size_t index, - LoDTensor &value) { self.Write(index, value); }) - .def("write_shared", - [](TensorArray &self, size_t index, const LoDTensor &value) { - self.WriteShared(index, value); - }) - .def("size", [](TensorArray &self) { return self.size(); }) - .def("pack", - [](TensorArray &self, size_t level, - const std::vector> &meta_info, - const std::vector> &lod) { - std::vector meta; - for (auto &info : meta_info) { - PADDLE_ENFORCE_EQ(info.size(), 3UL); - meta.emplace_back(info[0], info[1], info[2]); - } -#ifndef PADDLE_WITH_CUDA - return self.Pack(level, meta, lod); -#else - LoD new_lod; - new_lod.reserve(lod.size()); - std::copy(lod.begin(), lod.end(), std::back_inserter(new_lod)); - return self.Pack(level, meta, new_lod); -#endif - }) - .def("unpack", - [](TensorArray &self, const LoDTensor &source, int level, - bool length_descend) { - auto metas = self.Unpack(source, level, length_descend); - std::vector> meta_info; - for (auto meta : metas) { - meta_info.emplace_back( - std::vector({meta.begin, meta.end, meta.ori_idx})); - } - return meta_info; - }) - .def("stack", [](TensorArray &self) { return self.Stack(); }) - .def("unstack", - [](TensorArray &self, const LoDTensor &source) { - return self.Unstack(source); - }) - .def("unstack_shared", [](TensorArray &self, const LoDTensor &source) { - return self.UnstackShared(source); - }); - - py::class_(m, - "DynamicRecurrentOp") - .def_static("create", - [](py::bytes protobin) -> operators::DynamicRecurrentOp * { - OpDesc desc; - PADDLE_ENFORCE(desc.ParsePartialFromString(protobin), - "Cannot parse user input to OpDesc"); - PADDLE_ENFORCE(desc.IsInitialized(), - "User OpDesc is not initialized, reason %s", - desc.InitializationErrorString()); - auto rnn_op = OpRegistry::CreateOp(desc); - return static_cast( - rnn_op.release()); - }) - .def("set_step_unit", - [](operators::DynamicRecurrentOp &self, const operators::NetOp &net) - -> void { self.rnn.SetStepUnit(net.Clone()); }) - .def("get_state", - [](operators::DynamicRecurrentOp &self, const std::string &name) - -> const TensorArray & { return self.rnn.state(name); }) - .def("get_step_input", - [](operators::DynamicRecurrentOp &self, const std::string &name) - -> const TensorArray & { return self.rnn.step_input(name); }) - .def("get_step_output", - [](operators::DynamicRecurrentOp &self, const std::string &name) - -> const TensorArray & { return self.rnn.step_output(name); }); - // cond_op py::class_(m, "CondOp") .def_static("create", @@ -539,6 +461,10 @@ All parameter, weight, gradient are variables in Paddle. 
m.def("op_support_gpu", OpSupportGPU); #ifdef PADDLE_WITH_CUDA m.def("get_cuda_device_count", platform::GetCUDADeviceCount); + + m.def("nvprof_init", platform::CudaProfilerInit); + m.def("nvprof_start", platform::CudaProfilerStart); + m.def("nvprof_stop", platform::CudaProfilerStop); #endif return m.ptr(); diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh index fda2a2f1b764106a7a108e8c56bc90ce3459e9b5..502637c881208e53dd832a9759b3873ef1988395 100644 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -16,11 +16,13 @@ function cmake_gen() { echo "using python abi: $1" if [ "$1" == "cp27-cp27m" ]; then export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs4/lib:} + export PATH=/opt/python/cp27-cp27m/bin/:${PATH} PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27m/bin/python -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27m/include/python2.7 -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs2/lib/libpython2.7.so" elif [ "$1" == "cp27-cp27mu" ]; then export LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH#/opt/_internal/cpython-2.7.11-ucs2/lib:} + export PATH=/opt/python/cp27-cp27mu/bin/:${PATH} PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/python/cp27-cp27mu/bin/python -DPYTHON_INCLUDE_DIR:PATH=/opt/python/cp27-cp27mu/include/python2.7 -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-2.7.11-ucs4/lib/libpython2.7.so" @@ -181,6 +183,7 @@ EOF ${DOCKERFILE_GPU_ENV} ADD go/cmd/pserver/pserver /usr/bin/ ADD go/cmd/master/master /usr/bin/ + ADD paddle/pybind/print_operators_doc /usr/bin/ # default command shows the paddle version and exit CMD ["paddle", "version"] EOF diff --git a/paddle/scripts/travis/build_doc.sh b/paddle/scripts/travis/build_doc.sh index 28d82343ed32273740d0c52d0451681e43b3675e..7d54f0254c8ea9367a34233602293db5b8593f9a 100755 --- a/paddle/scripts/travis/build_doc.sh +++ b/paddle/scripts/travis/build_doc.sh @@ -11,8 +11,9 @@ make -j `nproc` gen_proto_py make -j `nproc` paddle_docs paddle_docs_cn # check websites for broken links -linkchecker doc/en/html/index.html -linkchecker doc/cn/html/index.html +# It will be failed now! +#linkchecker doc/en/html/index.html +#linkchecker doc/cn/html/index.html # Parse Github URL REPO=`git config remote.origin.url` diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt index 4245df5ab72bf0fd67261818b307f0babdb5d685..2275c950ba13d131fa17fef1845a50a22fd55a2f 100644 --- a/paddle/testing/CMakeLists.txt +++ b/paddle/testing/CMakeLists.txt @@ -5,4 +5,6 @@ if(WITH_TESTING) add_dependencies(paddle_test_main paddle_proto ${external_project_dependencies}) add_library(paddle_test_util STATIC TestUtil.cpp) add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies}) + add_library(paddle_gtest_main STATIC paddle_gtest_main.cc) + add_dependencies(paddle_gtest_main paddle_memory gtest gflags) endif() diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc new file mode 100644 index 0000000000000000000000000000000000000000..a491322b7e533f7a9c263a249494440269391003 --- /dev/null +++ b/paddle/testing/paddle_gtest_main.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include <cstring> +#include "gflags/gflags.h" +#include "gtest/gtest.h" +#include "paddle/memory/memory.h" + +int main(int argc, char** argv) { + std::vector<char*> new_argv; + std::string gflags_env; + new_argv.push_back(argv[0]); +#ifdef PADDLE_WITH_CUDA + new_argv.push_back( + strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory")); +#else + new_argv.push_back(strdup("--tryfromenv=use_pinned_memory")); +#endif + int new_argc = static_cast<int>(new_argv.size()); + char** new_argv_address = new_argv.data(); + google::ParseCommandLineFlags(&new_argc, &new_argv_address, false); + testing::InitGoogleTest(&argc, argv); + paddle::memory::Used(paddle::platform::CPUPlace()); +#ifdef PADDLE_WITH_CUDA + paddle::memory::Used(paddle::platform::GPUPlace(0)); +#endif + return RUN_ALL_TESTS(); +} diff --git a/proto/ModelConfig.proto b/proto/ModelConfig.proto index e2f5592248fd0b6166c2d11af02cef7815673def..2fcdbbc8bd671f8ae911cf82c7a91091f252a82f 100644 --- a/proto/ModelConfig.proto +++ b/proto/ModelConfig.proto @@ -544,6 +544,9 @@ message LayerConfig { // for batch normalization layer // The small constant added to the variance to improve numeric stability. optional double epsilon = 60 [ default = 0.00001 ]; + + // for factorization machine layer + optional uint32 factor_size = 61; } message EvaluatorConfig { diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index 5ba0e50c6ba0f84a3ea87d5a5199fef23a5b05ea..5b173694dd0e4a52c0179f12f5edd74e2c41cb8c 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -2400,6 +2400,14 @@ class CropLayer(LayerBase): image_conf.img_size_y = input_layer.height image_conf.channels = input_layer.size / (input_layer.width * input_layer.height) + # only supports 4-D inputs in NCHW order + if (len(self.config.inputs) == 2): + self.set_layer_height_width( + self.get_input_layer(1).height, self.get_input_layer(1).width) + self.set_layer_size(self.get_input_layer(1).size) + else: + self.set_layer_height_width(shape[-2], shape[-1]) + self.set_layer_size(reduce(lambda x, y: x * y, shape[1:])) @@ -2798,19 +2806,18 @@ class AddToLayer(LayerBase): name, self.layer_type, 0, inputs=inputs, **xargs) config_assert(len(inputs) > 0, 'inputs cannot be empty for AddToLayer') - if len(self.inputs) > 1: - for input_index in xrange(len(self.inputs)): - assert self.get_input_layer(0).height == self.get_input_layer( - input_index).height - assert self.get_input_layer(0).width == self.get_input_layer( - input_index).width - assert self.get_input_layer(0).depth == self.get_input_layer( - input_index).depth + layer_size = self.get_input_layer(0).size + # To preserve height, width and depth.
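# (All inputs must share the same size; the loop below keeps the first
# input that also defines geometry, so its height/width/depth can be
# propagated to this layer.)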
+ layer_with_hwc = self.get_input_layer(0) + for input_index in xrange(len(self.inputs)): + input_layer = self.get_input_layer(input_index) + assert layer_size == input_layer.size + if input_layer.height and input_layer.width and input_layer.depth: + layer_with_hwc = input_layer - self.set_layer_size(self.get_input_layer(0).size) - self.set_layer_height_width(self.get_input_layer(0).height, \ - self.get_input_layer(0).width) - self.set_layer_depth(self.get_input_layer(0).depth) + self.set_layer_size(layer_with_hwc.size) + self.set_layer_height_width(layer_with_hwc.height, layer_with_hwc.width) + self.set_layer_depth(layer_with_hwc.depth) self.create_bias_parameter(bias, self.config.size) @@ -3850,6 +3857,26 @@ class SwitchOrderLayer(LayerBase): name, 'switch_order', 0, inputs=inputs, **xargs) self.config.reshape_conf.height_axis.extend(reshape['height']) self.config.reshape_conf.width_axis.extend(reshape['width']) + input_layer = self.get_input_layer(0) + if reshape is None: + self.set_layer_size(input_layer.size) + else: + in_h = input_layer.height + in_w = input_layer.width + out_dims = None + if input_layer.has_depth(): + in_d = input_layer.depth + in_c = input_layer.size / in_h / in_w / in_d + # batch_size, depth, height, width, channel + out_dims = [0, in_d, in_h, in_w, in_c] + else: + in_c = input_layer.size / in_h / in_w + # batch_size, height, width, channel + out_dims = [0, in_h, in_w, in_c] + # Because (reshape['width'][0] > 0) is always true, + # out_dims[0] is never used. + size = reduce(lambda x, y: x * y, out_dims[reshape['width'][0]:]) + self.set_layer_size(size) @@ -3871,6 +3898,21 @@ class ScaleSubRegionLayer(LayerBase): image_conf.channels) +@config_layer('factorization_machine') +class FactorizationMachineLayer(LayerBase): + def __init__(self, name, inputs, factor_size, **xargs): + super(FactorizationMachineLayer, self).__init__( + name, 'factorization_machine', size=1, inputs=inputs, **xargs) + config_assert( + len(self.inputs) == 1, + 'factorization machine layer must have one and only one input.') + self.config.factor_size = factor_size + input_layer = self.get_input_layer(0) + psize = input_layer.size * factor_size + dims = [input_layer.size, factor_size] + self.create_input_parameter(0, psize, dims) + + # Deprecated, use a new layer specific class instead @config_func def Layer(name, type, **xargs): diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 8e127c9489ca5a4ed190e6d4e12ec4c9b28ad9cf..f6dc58b9c0ed0b14ad9db098892af14274aed0c1 100644 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -148,6 +148,7 @@ __all__ = [ 'resize_layer', 'sub_seq_layer', 'scale_sub_region_layer', + 'factorization_machine', ] @@ -264,6 +265,8 @@ class LayerType(object): SCALE_SUB_REGION_LAYER = 'scale_sub_region' + + FACTORIZATION_MACHINE = 'factorization_machine' + @staticmethod def is_layer_type(type_name): """ @@ -1900,9 +1903,12 @@ def repeat_layer(input, A layer for repeating the input num_repeats times. If as_row_vector: + .. math:: y = [x_1,\cdots, x_n, \cdots, x_1, \cdots, x_n] + If not as_row_vector: + .. math:: y = [x_1,\cdots, x_1, \cdots, x_n, \cdots, x_n] @@ -1915,19 +1921,19 @@ def repeat_layer(input, :param input: The input of this layer. :type input: LayerOutput - :param num_repeats: Repeat the input so many times + :param num_repeats: The number of times to repeat the input.
:type num_repeats: int :param name: The name of this layer. It is optional. - :param as_row_vector: True for treating input as row vector and repeating - in the column direction. This is equivalent to apply - concat_layer() with num_repeats same input. - False for treating input as column vector and repeating - in the row direction. + :type name: basestring + :param as_row_vector: Whether to treat the input as row vectors or not. If + the parameter is set to True, the repeating operation + will be performed in the column direction. Otherwise, + it will be performed in the row direction. :type as_row_vector: bool :param act: Activation type. IdentityActivation is the default activation. :type act: BaseActivation - :type name: basestring - :param layer_attr: extra layer attributes. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute. :return: LayerOutput object. :rtype: LayerOutput @@ -1974,13 +1980,14 @@ def seq_reshape_layer(input, :param input: The input of this layer. :type input: LayerOutput - :param reshape_size: the size of reshaped sequence. + :param reshape_size: The dimension of the reshaped sequence. :type reshape_size: int :param name: The name of this layer. It is optional. :type name: basestring :param act: Activation type. IdentityActivation is the default activation. :type act: BaseActivation - :param layer_attr: extra layer attributes. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute. :param bias_attr: The bias attribute. If the parameter is set to False or an object whose type is not ParameterAttribute, no bias is defined. If the @@ -2008,7 +2015,7 @@ def seq_reshape_layer(input, @layer_support() def interpolation_layer(input, weight, name=None, layer_attr=None): """ - This layer is for linear interpolation with two inputs, + This layer performs linear interpolation on two inputs, which is used in NEURAL TURING MACHINE. .. math:: @@ -2030,7 +2037,8 @@ def interpolation_layer(input, weight, name=None, layer_attr=None): :type weight: LayerOutput :param name: The name of this layer. It is optional. :type name: basestring - :param layer_attr: extra layer attributes. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute. :return: LayerOutput object. :rtype: LayerOutput @@ -2064,7 +2072,7 @@ def bilinear_interp_layer(input, name=None, layer_attr=None): """ - This layer is to implement bilinear interpolation on conv layer output. + This layer implements bilinear interpolation on convolutional layer's output. Please refer to Wikipedia: https://en.wikipedia.org/wiki/Bilinear_interpolation @@ -2074,18 +2082,19 @@ def bilinear_interp_layer(input, bilinear = bilinear_interp_layer(input=layer1, out_size_x=64, out_size_y=64) - :param input: A input layer. - :type input: LayerOutput. - :param out_size_x: bilinear interpolation output width. - :type out_size_x: int | None - :param out_size_y: bilinear interpolation output height. - :type out_size_y: int | None - :param name: The layer's name, which cna not be specified. - :type name: None | basestring - :param layer_attr: Extra Layer attribute. - :type layer_attr: ExtraLayerAttribute + :param input: The input of this layer. + :type input: LayerOutput. + :param out_size_x: The width of the output. + :type out_size_x: int + :param out_size_y: The height of the output. 
+ :type out_size_y: int + :param name: The name of this layer. It is optional. + :type name: basestring + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. + :type layer_attr: ExtraLayerAttribute :return: LayerOutput object. - :rtype: LayerOutput + :rtype: LayerOutput """ assert input.layer_type == LayerType.CONV_LAYER assert isinstance(input.activation, LinearActivation) @@ -2120,8 +2129,8 @@ def power_layer(input, weight, name=None, layer_attr=None): .. math:: y = x^w - where :math:`x` is a input vector, :math:`w` is scalar weight, - and :math:`y` is a output vector. + where :math:`x` is an input vector, :math:`w` is a scalar exponent, + and :math:`y` is an output vector. The example usage is: @@ -2131,11 +2140,12 @@ def power_layer(input, weight, name=None, layer_attr=None): :param input: The input of this layer. :type input: LayerOutput - :param weight: Weight layer. + :param weight: The exponent of the power. :type weight: LayerOutput :param name: The name of this layer. It is optional. :type name: basestring - :param layer_attr: extra layer attributes. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute. :return: LayerOutput object. :rtype: LayerOutput @@ -2175,11 +2185,12 @@ def scaling_layer(input, weight, name=None, layer_attr=None): :param input: The input of this layer. :type input: LayerOutput - :param weight: Weight layer. + :param weight: The weight of each sample. :type weight: LayerOutput :param name: The name of this layer. It is optional. :type name: basestring - :param layer_attr: extra layer attributes. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute. :return: LayerOutput object. :rtype: LayerOutput @@ -2217,7 +2228,8 @@ def trans_layer(input, name=None, layer_attr=None): :type input: LayerOutput :param name: The name of this layer. It is optional. :type name: basestring - :param layer_attr: extra layer attributes. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute. :return: LayerOutput object. :rtype: LayerOutput @@ -2253,11 +2265,14 @@ def rotate_layer(input, height, width, name=None, layer_attr=None): :param input: The input of this layer. :type input: LayerOutput - :param height: The height of the sample matrix + :param height: The height of the sample matrix. :type height: int + :param width: The width of the sample matrix. + :type width: int :param name: The name of this layer. It is optional. :type name: basestring - :param layer_attr: extra layer attributes. + :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for + details. :type layer_attr: ExtraLayerAttribute. :return: LayerOutput object. :rtype: LayerOutput @@ -2302,15 +2317,15 @@ def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None): :param name: The name of this layer. It is optional. :type name: basestring - :param a: input layer a + :param a: The first input of this layer. :type a: LayerOutput - :param b: input layer b + :param b: The second input of this layer. :type b: LayerOutput - :param scale: scale for cosine value. default is 5. + :param scale: The scale of the cosine similarity. 1 is the default value. :type scale: float - :param size: layer size. NOTE size_a * size should equal size_b. + :param size: The dimension of this layer. NOTE size_a * size should equal size_b. 
:type size: int
- :param layer_attr: Extra Layer Attribute.
+ :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
:type layer_attr: ExtraLayerAttribute :return: LayerOutput object. :rtype: LayerOutput
@@ -2395,8 +2410,10 @@ def hsigmoid(input,
""" Organize the classes into a binary tree. At each node, a sigmoid function is used to calculate the probability of belonging to the right branch.
- This idea is from "F. Morin, Y. Bengio (AISTATS 05):
- Hierarchical Probabilistic Neural Network Language Model."
+
+ Reference:
+ `Hierarchical Probabilistic Neural Network Language Model
+ `_
The example usage is:
@@ -2407,19 +2424,21 @@ def hsigmoid(input,
:param input: The input of this layer. :type input: LayerOutput | list | tuple
- :param label: Label layer.
+ :param label: The input label.
:type label: LayerOutput
- :param num_classes: number of classes.
- :type num_classes: int | None
+ :param num_classes: The number of classes, which should be larger than 2. If the parameter
+ is not set or set to None, its actual value will be automatically set to
+ the number of labels.
+ :type num_classes: int
:param name: The name of this layer. It is optional. :type name: basestring :param bias_attr: The bias attribute. If the parameter is set to False or an object whose type is not ParameterAttribute, no bias is defined. If the parameter is set to True, the bias is initialized to zero. :type bias_attr: ParameterAttribute | None | bool | Any
- :param param_attr: Parameter Attribute. None means default parameter.
- :type param_attr: ParameterAttribute | None
- :param layer_attr: Extra Layer Attribute.
+ :param param_attr: The parameter attribute. See ParameterAttribute for details.
+ :type param_attr: ParameterAttribute
+ :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for details.
:type layer_attr: ExtraLayerAttribute :return: LayerOutput object. :rtype: LayerOutput
@@ -2969,8 +2988,8 @@ def spp_layer(input,
A layer performs spatial pyramid pooling. Reference:
- Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition
- https://arxiv.org/abs/1406.4729
+ `Spatial Pyramid Pooling in Deep Convolutional Networks for Visual Recognition
+ <https://arxiv.org/abs/1406.4729>`_
The example usage is:
@@ -3071,8 +3090,8 @@ def img_cmrnorm_layer(input,
Response normalization across feature maps. Reference:
- ImageNet Classification with Deep Convolutional Neural Networks
- http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf
+ `ImageNet Classification with Deep Convolutional Neural Networks
+ <http://www.cs.toronto.edu/~fritz/absps/imagenet.pdf>`_
The example usage is:
@@ -3138,9 +3157,9 @@ def batch_norm_layer(input,
y_i &\\gets \\gamma \\hat{x_i} + \\beta \\qquad &//\ scale\ and\ shift Reference:
- Batch Normalization: Accelerating Deep Network Training by Reducing
+ `Batch Normalization: Accelerating Deep Network Training by Reducing
Internal Covariate Shift
- http://arxiv.org/abs/1502.03167
+ <http://arxiv.org/abs/1502.03167>`_
The example usage is:
@@ -4241,7 +4260,7 @@ def dot_prod_layer(input1, input2, name=None, layer_attr=None):
:param name: The name of this layer. It is optional. :type name: basestring :param input1: The first input layer.
- :type input: LayerOutput
+ :type input1: LayerOutput
:param input2: The second input layer. :type input2: LayerOutput :param layer_attr: The extra layer attribute. See ExtraLayerAttribute for
+ details.
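A minimal usage sketch for dot_prod_layer, with illustrative layer names (both inputs must have the same size; the output is one value per sample):

.. code-block:: python

    vec1 = data_layer(name='vec1', size=100)
    vec2 = data_layer(name='vec2', size=100)
    dot = dot_prod_layer(input1=vec1, input2=vec2)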
@@ -5397,10 +5416,10 @@ def maxout_layer(input, groups, num_channels=None, name=None, layer_attr=None):
to be devided by groups. Reference:
- Maxout Networks
- http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf
- Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks
- https://arxiv.org/pdf/1312.6082v4.pdf
+ `Maxout Networks
+ <http://www.jmlr.org/proceedings/papers/v28/goodfellow13.pdf>`_
+ `Multi-digit Number Recognition from Street View Imagery using Deep Convolutional Neural Networks
+ <https://arxiv.org/pdf/1312.6082v4.pdf>`_
.. math:: y_{si+j} = \max_k x_{gsi + sk + j}
@@ -5465,9 +5484,9 @@ def ctc_layer(input,
alignment between the inputs and the target labels is unknown. Reference:
- Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
+ `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
with Recurrent Neural Networks
- http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf
+ <http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf>`_
Note: Considering the 'blank' label needed by CTC, you need to use (num_classes + 1)
@@ -5539,9 +5558,9 @@ def warp_ctc_layer(input,
install it to :code:`third_party/install/warpctc` directory. Reference:
- Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
+ `Connectionist Temporal Classification: Labelling Unsegmented Sequence Data
with Recurrent Neural Networks
- http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf
+ <http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf>`_
Note: - Let num_classes represents the category number. Considering the 'blank'
@@ -5761,8 +5780,8 @@ def nce_layer(input,
Noise-contrastive estimation. Reference:
- A fast and simple algorithm for training neural probabilistic language
- models. https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf
+ `A fast and simple algorithm for training neural probabilistic language
+ models. <https://www.cs.toronto.edu/~amnih/papers/ncelm.pdf>`_
The example usage is:
@@ -5877,8 +5896,8 @@ def rank_cost(left,
A cost Layer for learning to rank using gradient descent. Reference:
- Learning to Rank using Gradient Descent
- http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf
+ `Learning to Rank using Gradient Descent
+ <http://research.microsoft.com/en-us/um/people/cburges/papers/ICML_ranking.pdf>`_
.. math::
@@ -6413,8 +6432,8 @@ def smooth_l1_cost(input, label, name=None, coeff=1.0, layer_attr=None):
smooth_{L1}(x) = \\begin{cases} 0.5x^2& \\text{if} \\ |x| < 1 \\\\ |x|-0.5& \\text{otherwise} \end{cases} Reference:
- Fast R-CNN
- https://arxiv.org/pdf/1504.08083v2.pdf
+ `Fast R-CNN
+ <https://arxiv.org/pdf/1504.08083v2.pdf>`_
The example usage is:
@@ -6620,8 +6639,8 @@ def prelu_layer(input,
The Parametric Relu activation that actives outputs with a learnable weight. Reference:
- Delving Deep into Rectifiers: Surpassing Human-Level Performance on
- ImageNet Classification http://arxiv.org/pdf/1502.01852v1.pdf
+ `Delving Deep into Rectifiers: Surpassing Human-Level Performance on
+ ImageNet Classification <http://arxiv.org/pdf/1502.01852v1.pdf>`_
.. math:: z_i &\\quad if \\quad z_i > 0 \\\\
@@ -6717,8 +6736,8 @@ def gated_unit_layer(input,
product between :match:`X'` and :math:`\sigma` is finally returned.
Reference:
- Language Modeling with Gated Convolutional Networks
- https://arxiv.org/abs/1612.08083
+ `Language Modeling with Gated Convolutional Networks
+ <https://arxiv.org/abs/1612.08083>`_
.. math:: y=\\text{act}(X \cdot W + b)\otimes \sigma(X \cdot V + c)
@@ -6854,6 +6873,7 @@ def crop_layer(input, offset, axis=2, shape=None, name=None, layer_attr=None):
:param input: The input of this layer. If two inputs are given, the second one will be regarded as the reference.
+ The input must be a 4-D tensor in NCHW order.
:type input: LayerOutput | Sequence :param offset: The crop offset. :type offset: Sequence
@@ -7387,3 +7407,73 @@ def scale_sub_region_layer(input, indices, value, name=None):
parents=[input, indices], num_filters=input.num_filters, size=input.size)
+
+
+@wrap_name_default()
+@wrap_act_default(act=LinearActivation())
+@wrap_param_attr_default()
+@layer_support()
+def factorization_machine(input,
+                          factor_size,
+                          act=None,
+                          name=None,
+                          param_attr=None,
+                          layer_attr=None):
+    """
+    The Factorization Machine models pairwise feature interactions as inner
+    product of the learned latent vectors corresponding to each input feature.
+    The Factorization Machine can effectively capture feature interactions
+    especially when the input is sparse.
+
+    This implementation only considers the second-order feature interactions,
+    using the formula:
+
+    .. math::
+        y = \sum_{i=1}^{n-1}\sum_{j=i+1}^n\langle v_i, v_j \rangle x_i x_j
+
+    Note:
+        X is the input vector with size n. V is the factor matrix. Each row of V
+        is the latent vector corresponding to each input dimension. The size of
+        each latent vector is k.
+
+    For details of the Factorization Machine, please refer to the paper:
+    Factorization Machines.
+
+    .. code-block:: python
+       first_order = paddle.layer.fc(input=input,
+                                     size=1,
+                                     act=paddle.activation.Linear())
+       second_order = paddle.layer.factorization_machine(input=input,
+                                                         factor_size=10)
+       fm = paddle.layer.addto(input=[first_order, second_order],
+                               act=paddle.activation.Linear(),
+                               bias_attr=False)
+
+    :param input: The input layer. Supported input types: all input data types
+                  on CPU, and only dense input types on GPU.
+    :type input: LayerOutput
+    :param factor_size: The hyperparameter that defines the dimensionality of
+                        the latent vectors.
+    :type factor_size: int
+    :param act: Activation Type. Default is linear activation.
+    :type act: BaseActivation
+    :param param_attr: The parameter attribute. See ParameterAttribute for
+                       details.
+    :type param_attr: ParameterAttribute
+    :param layer_attr: Extra Layer config.
+    :type layer_attr: ExtraLayerAttribute|None
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput)
+    assert factor_size > 0, "the factor_size must be greater than 0."
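+    # A sketch of what the op computes, for reference: with n input dimensions
+    # and the learned factor matrix V of shape [n, factor_size], the output is
+    # the scalar sum over all pairs i < j of <v_i, v_j> * x_i * x_j, which can
+    # be evaluated in O(n * factor_size) via the identity
+    # 0.5 * (||V^T x||^2 - sum_i ||v_i||^2 * x_i^2).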
+ + Layer( + inputs=[Input(input.name, **param_attr.attr)], + name=name, + factor_size=factor_size, + type=LayerType.FACTORIZATION_MACHINE, + active_type=act.name, + **ExtraLayerAttribute.to_kwargs(layer_attr)) + return LayerOutput( + name, LayerType.FACTORIZATION_MACHINE, input, activation=act, size=1) diff --git a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh index a21f67a2d99e7eab39708e2a571d30d7e9f20ce6..10c941f707498ec45e79bed9d3f8054eea19887d 100755 --- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh +++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh @@ -11,6 +11,7 @@ test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_l test_kmax_seq_socre_layer test_sub_nested_seq_select_layer test_scale_shift_layer test_seq_slice_layer test_cross_entropy_over_beam test_roi_pool_layer test_pooling3D_layer test_conv3d_layer test_deconv3d_layer test_BatchNorm3D test_resize_layer -test_scale_sub_region_layer test_dot_prod_layer test_l2_distance_layer) +test_scale_sub_region_layer test_dot_prod_layer test_l2_distance_layer +test_factorization_machine) export whole_configs=(test_split_datasource) diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr new file mode 100644 index 0000000000000000000000000000000000000000..4f3002b19942ed58970bfd64e5978c1601273992 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_factorization_machine.protostr @@ -0,0 +1,39 @@ +type: "nn" +layers { + name: "data" + type: "data" + size: 1024 + active_type: "" +} +layers { + name: "__factorization_machine_0__" + type: "factorization_machine" + size: 1 + active_type: "" + inputs { + input_layer_name: "data" + input_parameter_name: "___factorization_machine_0__.w0" + } + factor_size: 10 +} +parameters { + name: "___factorization_machine_0__.w0" + size: 10240 + initial_mean: 0.0 + initial_std: 0.03125 + dims: 1024 + dims: 10 + initial_strategy: 0 + initial_smart: true +} +input_layer_names: "data" +output_layer_names: "__factorization_machine_0__" +sub_models { + name: "root" + layer_names: "data" + layer_names: "__factorization_machine_0__" + input_layer_names: "data" + output_layer_names: "__factorization_machine_0__" + is_recurrent_layer_group: false +} + diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py new file mode 100644 index 0000000000000000000000000000000000000000..b249de0fee3c8ca4ad0520872fa2497c493d31b5 --- /dev/null +++ b/python/paddle/trainer_config_helpers/tests/configs/test_factorization_machine.py @@ -0,0 +1,7 @@ +from paddle.trainer_config_helpers import * + +data = data_layer(name='data', size=1024) + +fm = factorization_machine(input=data, factor_size=10) + +outputs(fm) diff --git a/python/paddle/v2/__init__.py b/python/paddle/v2/__init__.py index 33a0829ba8d635ebd68b50f3da07da958fb79dcb..70f61e84997efdbe3d6f268d249be8bac15b9ecd 100644 --- a/python/paddle/v2/__init__.py +++ b/python/paddle/v2/__init__.py @@ -83,11 +83,10 @@ def set_omp_mkl_env_vars(trainer_count): '''Get the number of physical cores''' if platform.system() == "Linux": num_sockets = int( - os.popen("lscpu |grep \"Socket\" |awk -F':' '{print $2}'|xargs") + os.popen("grep 'physical id' 
/proc/cpuinfo | sort -u | wc -l") .read()) num_cores_per_socket = int(
- os.popen(
- "lscpu |grep \"per socket\" |awk -F':' '{print $2}'|xargs")
+ os.popen("grep 'core id' /proc/cpuinfo | sort -u | wc -l")
.read()) return num_sockets * num_cores_per_socket else:
diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py index 98b97c75ca72f11c105535e0f2a5fa0201db5d42..f10bf7e42a1ead09b3eba0d61e55701215e4360f 100644 --- a/python/paddle/v2/dataset/uci_housing.py +++ b/python/paddle/v2/dataset/uci_housing.py
@@ -38,6 +38,7 @@ UCI_TEST_DATA = None
URL_MODEL = 'https://github.com/PaddlePaddle/book/raw/develop/01.fit_a_line/fit_a_line.tar' MD5_MODEL = '52fc3da8ef3937822fcdd87ee05c0c9b'
+
def feature_range(maximums, minimums): import matplotlib matplotlib.use('Agg')
@@ -114,7 +115,8 @@ def test():
def model():
- tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar', MD5_MODEL)
+ tar_file = paddle.v2.dataset.common.download(URL_MODEL, 'fit_a_line.tar',
+                                              MD5_MODEL)
with open(tar_file, 'r') as f: parameters = Parameters.from_tar(f) return parameters
diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py index 5df612bf3530c843c16b337f2b8f83445fcf39b5..dd25bc19ec5f4fd6eb3e04f304b1de488e988f41 100644 --- a/python/paddle/v2/fluid/__init__.py +++ b/python/paddle/v2/fluid/__init__.py
@@ -1,11 +1,43 @@
-import sys
-import core
-__all__ = ['proto']
-argv = []
-if core.is_compile_gpu():
-    argv = list(sys.argv) + [
-        "--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory"
-    ]
-else:
-    argv = list(sys.argv) + ["--tryfromenv=use_pinned_memory"]
-core.init_gflags(argv)
+# import all classes inside framework into the fluid module
+import framework
+from framework import *
+# import all classes inside executor into the fluid module
+import executor
+from executor import *
+
+import io
+import evaluator
+import initializer
+import layers
+import nets
+import optimizer
+import backward
+import regularizer
+from param_attr import ParamAttr
+
+from core import LoDTensor, CPUPlace, GPUPlace
+
+Tensor = LoDTensor
+__all__ = framework.__all__ + executor.__all__ + [
+    'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward',
+    'regularizer', 'LoDTensor', 'CPUPlace', 'GPUPlace', 'Tensor', 'ParamAttr'
+]
+
+
+def __read_gflags_from_env__():
+    """
+    Enable reading gflags from environment variables.
+
+    Returns:
+        None
+    """
+    import sys
+    import core
+    read_env_flags = ['use_pinned_memory']
+    if core.is_compile_gpu():
+        read_env_flags.append('fraction_of_gpu_memory_to_use')
+    core.init_gflags([sys.argv[0]] +
+                     ["--tryfromenv=" + ",".join(read_env_flags)])
+
+
+__read_gflags_from_env__()
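With this change, core flags can be set from the environment instead of the command line. A minimal sketch, assuming gflags' standard behavior of reading `--tryfromenv` entries from `FLAGS_`-prefixed environment variables (the value here is illustrative only):

.. code-block:: python

    import os
    # must be set before the first `import paddle.v2.fluid`, which triggers
    # __read_gflags_from_env__()
    os.environ['FLAGS_fraction_of_gpu_memory_to_use'] = '0.5'
    import paddle.v2.fluid as fluid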
diff --git a/python/paddle/v2/fluid/evaluator.py b/python/paddle/v2/fluid/evaluator.py index f78d2f814c89aa6b5ee8387f2558a97c754e655c..137c5736226b689340748d5098ca51659d5acff8 100644 --- a/python/paddle/v2/fluid/evaluator.py +++ b/python/paddle/v2/fluid/evaluator.py
@@ -1,9 +1,13 @@
import numpy as np
-from paddle.v2.fluid.framework import Program, g_main_program, unique_name, Variable
-import paddle.v2.fluid.core as core
+import layers
+from framework import Program, unique_name, Variable
+from layer_helper import LayerHelper
-def _clone_var_in_block_(block, var):
+__all__ = ['Accuracy']
+
+
+def _clone_var_(block, var):
assert isinstance(var, Variable) return block.create_var( name=var.name,
@@ -16,175 +20,115 @@
class Evaluator(object): """
-    Evalutor Base class.
-
-    create metric states
-    add mini-batch evaluator caculate operator
-    add increment operator to accumulate the metric states
+    Base Class for all evaluators
+
+    Args:
+        name(str): The name of the evaluator, such as "accuracy". Used to
+            generate temporary variable names.
+        main_program(Program, optional): The evaluator should be added to this
+            main_program. Default default_main_program()
+        startup_program(Program, optional): The parameter should be added to this
+            startup_program. Default default_startup_program()
+
+    Attributes:
+        states(list): The list of state variables. states will be reset to zero
+            when `reset` is invoked.
+        metrics(list): The list of metrics variables. They will be calculated
+            every mini-batch.
"""
def __init__(self, name, **kwargs):
+        self.states = []
+        self.metrics = []
+        self.helper = LayerHelper(name, **kwargs)
+
+    def reset(self, executor, reset_program=None):
"""
-        init the global states
+        Reset metric states at the beginning of each pass or user-specified batch.
"""
-        self._states = {}
-        if kwargs.has_key("main_program"):
-            self._main_program = kwargs.get("main_program")
-        else:
-            self._main_program = g_main_program
+        if reset_program is None:
+            reset_program = Program()
+
+        for var in self.states:
+            assert isinstance(var, Variable)
+            g_var = _clone_var_(reset_program.current_block(), var)
+            layers.fill_constant(
+                shape=g_var.shape,
+                value=0.0,
+                dtype=g_var.dtype,
+                out=g_var,
+                main_program=reset_program)
-    def states(self):
-        return self._states
+        executor.run(reset_program)
-    def _update_ops(self, *args, **kwargs):
+    def eval(self, executor, eval_program=None):
"""
-        append update ops to the global states
+        Evaluate the statistics merged by multiple mini-batches.
"""
raise NotImplementedError()
-    def reset(self, executor, reset_program=None):
+    def create_state(self, suffix, dtype, shape):
"""
-        Clear metric states at the begin of each pass/user specified batch
-        """
-        if reset_program == None:
-            reset_program = Program()
-        else:
-            reset_program = program
-        block = reset_program.global_block()
-        for k, var in self._states.iteritems():
-            g_var = _clone_var_in_block_(block, var)
-            zeros = block.create_var(dtype="float32", persistable=True)
-            block.append_op(
-                type="fill_constant",
-                outputs={"Out": [zeros]},
-                attrs={
-                    "shape": g_var.shape,
-                    "value": .0,
-                    "dtype": 5,
-                })
-            block.append_op(
-                type="scale", inputs={"X": zeros}, outputs={"Out": g_var})
-        executor.run(reset_program, fetch_list=self._states.values())
+        Create state variable.
+
+        NOTE: It is not a public API.
+
+        Args:
+            suffix(str): the state suffix.
+            dtype(str|core.DataType): the state data type
+            shape(tuple|list): the shape of state
+
+        Returns: State variable
-    def eval(self, executor, eval_program=None):
-        """
-        Merge the mini-batch statistics to form the evaluation result for multiple mini-batches.
-        """
-        raise NotImplementedError()
+        state = self.helper.create_variable(
+            name="_".join([unique_name(self.helper.name), suffix]),
+            persistable=True,
+            dtype=dtype,
+            shape=shape)
+        self.states.append(state)
+        return state
class Accuracy(Evaluator): """
-    Accuracy need two state variable Total, Correct
+    Average Accuracy for multiple mini-batches.
""" - def __init__(self, *args, **kwargs): + def __init__(self, input, label, k=1, **kwargs): super(Accuracy, self).__init__("accuracy", **kwargs) - block = self._main_program.global_block() - g_total = block.create_var( - name=unique_name("Total"), - persistable=True, - dtype="int64", - shape=[1]) - g_correct = block.create_var( - name=unique_name("Correct"), - persistable=True, - dtype="int64", - shape=[1]) - self._states["Total"] = g_total - self._states["Correct"] = g_correct - - def _update_ops(self, input, label, k=1, **kwargs): - block = self._main_program.global_block() - topk_out = block.create_var(dtype=input.dtype) - topk_indices = block.create_var(dtype="int64") - block.append_op( - type="top_k", - inputs={"X": [input]}, - outputs={"Out": [topk_out], - "Indices": [topk_indices]}, - attrs={"k": k}) - acc_out = block.create_var(dtype=kwargs.get("out_dtype", "float32")) - correct = block.create_var(dtype="int64", persistable=True) - total = block.create_var(dtype="int64", persistable=True) - block.append_op( - type="accuracy", - inputs={ - "Out": [topk_out], - "Indices": [topk_indices], - "Label": [label] - }, - outputs={ - "Accuracy": [acc_out], - "Correct": [correct], - "Total": [total], - }) - - block.append_op( - type="cast", - inputs={"X": [self._states["Total"]]}, - outputs={"Out": [self._states["Total"]]}, - attrs={ - "in_dtype": 5, # float32 - "out_dtype": 2, # int32 - }) - block.append_op( - type="cast", - inputs={"X": [self._states["Correct"]]}, - outputs={"Out": [self._states["Correct"]]}, - attrs={ - "in_dtype": 5, - "out_dtype": 2, - }) - - block.append_op( - type="elementwise_add", - inputs={"X": [self._states["Total"]], - "Y": [total]}, - outputs={"Out": [self._states["Total"]]}) - block.append_op( - type="elementwise_add", - inputs={"X": [self._states["Correct"]], - "Y": [correct]}, - outputs={"Out": [self._states["Correct"]]}) - - return acc_out + main_program = self.helper.main_program + if main_program.current_block().idx != 0: + raise ValueError("You can only invoke Evaluator in root block") + + self.total = self.create_state(dtype='int64', shape=[1], suffix='total') + self.correct = self.create_state( + dtype='int64', shape=[1], suffix='correct') + kwargs = {'main_program': main_program} + total = self.helper.create_tmp_variable(dtype='int') + correct = self.helper.create_tmp_variable(dtype='int') + acc = layers.accuracy( + input=input, + label=label, + k=k, + total=total, + correct=correct, + **kwargs) + total = layers.cast(x=total, dtype='int64', **kwargs) + correct = layers.cast(x=correct, dtype='int64', **kwargs) + layers.sums(input=[self.total, total], out=self.total, **kwargs) + layers.sums(input=[self.correct, correct], out=self.correct, **kwargs) + + self.metrics.append(acc) def eval(self, executor, eval_program=None): - if eval_program != None: - eval_program = eval_program - else: + if eval_program is None: eval_program = Program() - block = eval_program.global_block() - eval_out = block.create_var(dtype=self._states["Total"].dtype) - e_total = _clone_var_in_block_(block, self._states["Total"]) - e_correct = _clone_var_in_block_(block, self._states["Correct"]) - block.append_op( - type="cast", - inputs={"X": [e_total]}, - outputs={"Out": [e_total]}, - attrs={ - "in_dtype": 2, # int32 - "out_dtype": 5, # float32 - }) - block.append_op( - type="cast", - inputs={"X": [e_correct]}, - outputs={"Out": [e_correct]}, - attrs={ - "in_dtype": 2, - "out_dtype": 5, - }) - block.append_op( - type="elementwise_div", - inputs={"X": e_correct, - "Y": e_total}, - 
outputs={"Out": eval_out}) - out = executor.run(eval_program, fetch_list=[eval_out]) - return np.array(out[0]) - - -def accuracy(*args, **kwargs): - cls = Accuracy(*args, **kwargs) - out = cls._update_ops(*args, **kwargs) - return cls, out + block = eval_program.current_block() + kwargs = {'main_program': eval_program} + total = _clone_var_(block, self.total) + correct = _clone_var_(block, self.correct) + total = layers.cast(total, dtype='float32', **kwargs) + correct = layers.cast(correct, dtype='float32', **kwargs) + out = layers.elementwise_div(x=correct, y=total, **kwargs) + return np.array(executor.run(eval_program, fetch_list=[out])[0]) diff --git a/python/paddle/v2/fluid/executor.py b/python/paddle/v2/fluid/executor.py index ed1c2c06daa7ede97e138049a1f7044d071c31e8..bdc82eede9d93a7cf904999a6b869ce2d23c90dc 100644 --- a/python/paddle/v2/fluid/executor.py +++ b/python/paddle/v2/fluid/executor.py @@ -1,9 +1,40 @@ -import paddle.v2.fluid.core as core -from paddle.v2.fluid.framework import Block, Program, g_main_program +import numpy as np +from . import core +from framework import Program, default_main_program + +__all__ = ['Executor', 'g_scope'] g_scope = core.Scope() +def as_numpy(tensor): + if isinstance(tensor, list): + return [as_numpy(t) for t in tensor] + assert isinstance(tensor, core.LoDTensor) + lod = tensor.lod() + tensor_data = np.array(tensor) + if len(lod) == 0: + ans = tensor_data + else: + raise RuntimeError("LoD Calculate lacks unit tests and buggy") + # elif len(lod) == 1: + # ans = [] + # idx = 0 + # while idx < len(lod) - 1: + # ans.append(tensor_data[lod[idx]:lod[idx + 1]]) + # idx += 1 + # else: + # for l in reversed(lod): + # ans = [] + # idx = 0 + # while idx < len(l) - 1: + # ans.append(tensor_data[l[idx]:l[idx + 1]]) + # idx += 1 + # tensor_data = ans + # ans = tensor_data + return ans + + class Executor(object): def __init__(self, places): if not isinstance(places, list) and not isinstance(places, tuple): @@ -16,6 +47,47 @@ class Executor(object): act_places.append(p) self.executor = core.Executor(act_places) + self.places = places + + def aslodtensor(self, data): + def accumulate(data): + if not isinstance(data, list): + return 1 + return sum([accumulate(sub) for sub in data]) + + def parselod(data): + seq_lens = [accumulate(seq) for seq in data] + cur_len = 0 + lod = [cur_len] + for l in seq_lens: + cur_len += l + lod.append(cur_len) + return lod + + assert len(self.places) != 0 + if not isinstance(data, list): + # pure tensor case + tensor = core.LoDTensor() + tensor.set(data, self.places[0]) + return tensor + else: + raise RuntimeError("Current implementation lacks unittests") + # lodtensor case + lod = [] + if not isinstance(data[0], list): + lod.append(parselod(data)) + flattened_data = np.concatenate(data, axis=0).astype("int64") + else: + while isinstance(data[0], list): + lod.append(parselod(seq)) + flattened_data = [item for seq in data for item in seq] + data = flattened_data + flattened_data = np.concatenate(data, axis=0).astype("int64") + flattened_data = flattened_data.reshape([len(flattened_data), 1]) + tensor = core.LoDTensor() + tensor.set(flattened_data, self.places[0]) + tensor.set_lod(lod) + return tensor def run(self, program=None, @@ -23,14 +95,15 @@ class Executor(object): fetch_list=None, feed_var_name='feed', fetch_var_name='fetch', - scope=None): + scope=None, + return_numpy=True): if feed is None: feed = {} if fetch_list is None: fetch_list = [] if program is None: - program = g_main_program + program = default_main_program() if not 
def run(self, program=None,
@@ -23,14 +95,15 @@
feed=None, fetch_list=None, feed_var_name='feed', fetch_var_name='fetch',
- scope=None):
+ scope=None,
+ return_numpy=True):
if feed is None: feed = {} if fetch_list is None: fetch_list = [] if program is None:
- program = g_main_program
+ program = default_main_program()
if not isinstance(program, Program): raise TypeError()
@@ -52,7 +125,10 @@ class Executor(object):
inputs={'X': [feed_var]}, outputs={'Out': [out]}, attrs={'col': i})
- core.set_feed_variable(scope, feed[name], feed_var.name, i)
+ cur_feed = feed[name]
+ if not isinstance(cur_feed, core.LoDTensor):
+     cur_feed = self.aslodtensor(cur_feed)
+ core.set_feed_variable(scope, cur_feed, feed_var.name, i)
fetch_var = global_block.create_var( name=fetch_var_name,
@@ -66,7 +142,11 @@
attrs={'col': i}) self.executor.run(program.desc, scope, 0, True)
- return [
+ outs = [
core.get_fetch_variable(scope, fetch_var_name, i) for i in xrange(len(fetch_list)) ]
+
+ if return_numpy:
+     outs = as_numpy(outs)
+ return outs
diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index 872c19c2f6f4afbd25a5f7a9df38bd3dd0b61d5f..49c6d8983457fa9c29451b8d020dd0c581481f9c 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py
@@ -1,12 +1,14 @@
-import paddle.v2.fluid.core as core
-import paddle.v2.fluid.proto.framework_pb2 as framework_pb2
import collections
+
import numpy as np
-import copy
+from . import core
+import proto.framework_pb2 as framework_pb2
+import contextlib
__all__ = [ 'Block', 'Variable', 'Program', 'Operator', 'default_startup_program',
- 'default_main_program'
+ 'default_main_program', 'program_guard', 'switch_startup_program',
+ 'switch_main_program'
]
@@ -395,7 +397,11 @@ class Block(object):
return v def all_parameters():
- return {v for k, v in self.vars.iteritems() if isinstance(v, Parameter)}
+ return list(self.iter_parameters())
+
+ def iter_parameters(self):
+     return (item[1] for item in self.vars.iteritems()
+             if isinstance(item[1], Parameter))
def create_var(self, *args, **kwargs): var = Variable(self, *args, **kwargs)
@@ -469,6 +475,37 @@ class Block(object):
for index in range(len(self.ops)): assert self.ops[index].desc == ops_in_cpp[index]
+    def copy_param_info_from(self, other):
+        """
+        Copy the information of parameters from the other block.
+        Args:
+            other(Block): the other block
+
+        Returns:
+            None
+        """
+        if not isinstance(other, Block):
+            raise TypeError("copy_param_info_from should be invoked with Block")
+        for p in other.iter_parameters():
+            assert isinstance(p, Parameter)
+            v = self.vars.get(p.name, None)
+            if v is None:
+                raise ValueError("copy_param_info_from should be invoked with "
+                                 "a block of the same topology")
+            assert isinstance(v, Variable)
+            new_p = Parameter(
+                block=self,
+                shape=v.shape,
+                dtype=v.dtype,
+                type=v.type,
+                lod_level=v.lod_level,
+                stop_gradient=p.stop_gradient,
+                trainable=p.trainable,
+                optimize_attr=p.optimize_attr,
+                regularizer=p.regularizer,
+                name=v.name)
+            self.vars[new_p.name] = new_p
+
class Program(object): def __init__(self):
@@ -489,6 +526,7 @@
p.desc = core.ProgramDesc(self.desc) p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())] p.sync_with_cpp()
+ p.copy_param_info_from(self)
return p def prune(self, targets):
@@ -572,6 +610,24 @@
for block in self.blocks: block.sync_with_cpp()
+    def copy_param_info_from(self, other):
+        """
+        Copy the information of parameters from the other program.
+        Args:
+            other(Program): Other program
+
+        Returns:
+            None
+        """
+        if not isinstance(other, Program):
+            raise TypeError("copy_param_info_from should be invoked with "
+                            "Program")
+
+        if len(self.blocks) != len(other.blocks):
+            raise ValueError("copy_param_info_from should be invoked with two "
+                             "programs that represent the same topology")
+        self.global_block().copy_param_info_from(other.global_block())
+
def list_vars(self): for each_block in self.blocks: for each_var in each_block.vars.itervalues():
@@ -600,13 +656,88 @@ class Parameter(Variable):
# program is a global instance.
-g_main_program = Program()
-g_startup_program = Program()
+_main_program_ = Program()
+_startup_program_ = Program()
def default_startup_program():
-    return g_startup_program
+    """
+    Get default startup program. In startup program, Paddle will initialize
+    parameters, initialize nccl handle, etc.
+
+    Returns:
+        Program: startup program
+    """
+    return _startup_program_
def default_main_program():
-    return g_main_program
+    """
+    Get default main program. The main program is used for training or testing.
+
+    Returns:
+        Program: main program
+    """
+    return _main_program_
+
+
+def switch_main_program(program):
+    """
+    Switch the main program to a new program.
+
+    Args:
+        program(Program): The new main program
+
+    Returns:
+        Program: The previous main program
+    """
+    global _main_program_
+    prev_program = _main_program_
+    _main_program_ = program
+    return prev_program
+
+
+def switch_startup_program(program):
+    """
+    Switch the startup program to a new program.
+    Args:
+        program(Program): The new startup program
+
+    Returns:
+        Program: The previous startup program
+    """
+    global _startup_program_
+    prev_program = _startup_program_
+    _startup_program_ = program
+    return prev_program
+
+
+@contextlib.contextmanager
+def program_guard(main_program, startup_program=None):
+    """
+    Switch program with `with` statement
+
+    Examples:
+        >>> with program_guard(Program()):
+        >>>     data = fluid.layers.data(...)
+        >>>     hidden = fluid.layers.fc(...)
+
+    Args:
+        main_program(Program): New main program inside `with` statement
+        startup_program(Program): New startup program inside `with` statement.
+            None means do not change startup program.
+
+    Returns:
+        None
+    """
+    if not isinstance(main_program, Program):
+        raise TypeError("main_program should be Program")
+    main_program = switch_main_program(main_program)
+    if startup_program is not None:
+        if not isinstance(startup_program, Program):
+            raise TypeError("startup_program should be Program")
+        startup_program = switch_startup_program(startup_program)
+    yield
+    switch_main_program(main_program)
+    if startup_program is not None:
+        switch_startup_program(startup_program)
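A small sketch of the switching semantics introduced here (the layer call is illustrative only):

.. code-block:: python

    import paddle.v2.fluid as fluid
    from paddle.v2.fluid.framework import Program, default_main_program, program_guard

    prev = default_main_program()
    with program_guard(Program()):
        # ops built here go into the temporary program, not the default one
        x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    assert default_main_program() is prev  # the previous main program is restored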
diff --git a/python/paddle/v2/fluid/initializer.py b/python/paddle/v2/fluid/initializer.py index 9f23e68a7635b6e6ae927603dbcc47d63f9c7f3d..d3f648f8460814a3f251d7aa9560d748af85235c 100644 --- a/python/paddle/v2/fluid/initializer.py +++ b/python/paddle/v2/fluid/initializer.py
@@ -1,10 +1,7 @@
-import paddle.v2.fluid.framework as framework
+import framework
import numpy as np
-__all__ = [
-    'ConstantInitializer', 'UniformInitializer', 'NormalInitializer',
-    'XavierInitializer'
-]
+__all__ = ['Constant', 'Uniform', 'Normal', 'Xavier']
class Initializer(object):
@@ -368,3 +365,19 @@ class MSRAInitializer(Initializer):
}) var.op = op return op
+
+
+# We shorten the class names, since users will use the initializers with the
+# package name. Sample code:
+#
+# import paddle.fluid as fluid
+#
+# hidden = fluid.layers.fc(...,
+#                          param_attr=ParamAttr(fluid.initializer.Xavier()))
+#
+# There is no need to add `Initializer` as the class suffix.
+Constant = ConstantInitializer
+Uniform = UniformInitializer
+Normal = NormalInitializer
+Xavier = XavierInitializer
+MSRA = MSRAInitializer
diff --git a/python/paddle/v2/fluid/io.py b/python/paddle/v2/fluid/io.py index e5b2aa3b919df4cec1091c0bbd39b7e400cc6867..e147ac22ad289eb00c83def66974d875fcdc31f8 100644 --- a/python/paddle/v2/fluid/io.py +++ b/python/paddle/v2/fluid/io.py
@@ -1,8 +1,7 @@
import os import cPickle as pickle
-from paddle.v2.fluid.framework import Program, Parameter, g_main_program, \
-    Variable
+from paddle.v2.fluid.framework import Program, Parameter, default_main_program, Variable
__all__ = [ 'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
@@ -46,7 +45,7 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
""" if vars is None: if main_program is None:
- main_program = g_main_program
+ main_program = default_main_program()
if not isinstance(main_program, Program): raise TypeError("program should be as Program type or None")
@@ -98,7 +97,7 @@ def load_vars(executor, dirname, main_program=None, vars=None, predicate=None):
:param executor: executor that save variable :param dirname: directory path :param main_program: program. If vars is None, then filter all variables in this
- program which fit `predicate`. Default g_program.
+ program which fit `predicate`. Default default_main_program().
:param predicate: The Predicate describes a callable that returns a variable as a bool. If it returns true, the variables will be loaded. :param vars: variables need to be loaded. If specify vars, program &
@@ -107,7 +106,7 @@
""" if vars is None: if main_program is None:
- main_program = g_main_program
+ main_program = default_main_program()
if not isinstance(main_program, Program): raise TypeError("program's type should be Program")
@@ -154,7 +153,7 @@ def load_persistables(executor, dirname, main_program=None):
def get_inference_program(target_vars, main_program=None): if main_program is None:
- main_program = g_main_program
+ main_program = default_main_program()
if not isinstance(target_vars, list): target_vars = [target_vars]
@@ -177,12 +176,12 @@ def save_inference_model(dirname,
:param target_vars: Variables from which we can get inference results. :param executor: executor that save inference model :param main_program: original program, which will be pruned to build the inference model.
- Default g_main_program.
+ Default default_main_program().
:return: None """ if main_program is None:
- main_program = g_main_program
+ main_program = default_main_program()
if not isinstance(target_vars, list): target_vars = [target_vars]
@@ -272,10 +271,10 @@ def get_parameter_value_by_name(name, executor, program=None):
:param executor: executor for retrieving the value :param name: the name of the parameter :param program: the program where the variable is found
- Default g_main_program.
+ Default default_main_program().
:return: the LoDTensor for the variable """ if program is None: - program = g_main_program + program = default_main_program() var = program.global_block().var(name) return get_parameter_value(var, executor) diff --git a/python/paddle/v2/fluid/layer_helper.py b/python/paddle/v2/fluid/layer_helper.py index e0880354fbc5a09bd49de7ec9c5dffc1e3c6259e..5b384e5cf5df5e5abc7f0ef81ff11cd8a31cfa2d 100644 --- a/python/paddle/v2/fluid/layer_helper.py +++ b/python/paddle/v2/fluid/layer_helper.py @@ -1,10 +1,10 @@ import copy import itertools -from paddle.v2.fluid.framework import Variable, g_main_program, \ - g_startup_program, unique_name, Program, dtype_is_floating -from paddle.v2.fluid.initializer import ConstantInitializer, \ - UniformInitializer, XavierInitializer +from framework import Variable, default_main_program, default_startup_program, \ + unique_name, dtype_is_floating +from paddle.v2.fluid.initializer import Constant, Xavier +from param_attr import ParamAttr class LayerHelper(object): @@ -23,7 +23,7 @@ class LayerHelper(object): def main_program(self): prog = self.kwargs.get('main_program', None) if prog is None: - return g_main_program + return default_main_program() else: return prog @@ -31,7 +31,7 @@ class LayerHelper(object): def startup_program(self): prog = self.kwargs.get('startup_program', None) if prog is None: - return g_startup_program + return default_startup_program() else: return prog @@ -61,31 +61,15 @@ class LayerHelper(object): @property def param_attr(self): - default = {'name': None} - actual = self.kwargs.get('param_attr', None) - if actual is None: - actual = default - for default_field in default.keys(): - if default_field not in actual: - actual[default_field] = default[default_field] - return actual + return ParamAttr.to_attr(self.kwargs.get('param_attr', None)) @property def bias_attr(self): - default = {'name': None} - bias_attr = self.kwargs.get('bias_attr', None) - if bias_attr is None: - bias_attr = default - - if isinstance(bias_attr, dict): - for default_field in default.keys(): - if default_field not in bias_attr: - bias_attr[default_field] = default[default_field] - return bias_attr + return ParamAttr.to_attr(self.kwargs.get('bias_attr', None)) def multiple_param_attr(self, length): param_attr = self.param_attr - if isinstance(param_attr, dict): + if isinstance(param_attr, ParamAttr): param_attr = [param_attr] if len(param_attr) != 1 and len(param_attr) != length: @@ -113,23 +97,30 @@ class LayerHelper(object): raise ValueError("Data Type mismatch") return dtype - def create_parameter(self, attr, shape, dtype, suffix='w', - initializer=None): + def create_parameter(self, + attr, + shape, + dtype, + is_bias=False, + default_initializer=None): # Deepcopy the attr so that parameters can be shared in program - attr_copy = copy.deepcopy(attr) - if initializer is not None: - attr_copy['initializer'] = initializer + assert isinstance(attr, ParamAttr) + suffix = 'b' if is_bias else 'w' + + if default_initializer is None: + if is_bias: + attr.set_default_bias_initializer() + else: + attr.set_default_param_initializer() else: - attr_copy['initializer'] = self._get_default_initializer(dtype) - if attr_copy['name'] is None: - attr_copy['name'] = unique_name(".".join([self.name, suffix])) + attr.set_default_initializer(default_initializer) + if attr.name is None: + attr.name = unique_name(".".join([self.name, suffix])) + self.startup_program.global_block().create_parameter( - dtype=dtype, shape=shape, **attr_copy) + dtype=dtype, shape=shape, 
**attr.to_kwargs(with_initializer=True)) return self.main_program.global_block().create_parameter( - name=attr_copy['name'], - dtype=dtype, - shape=shape, - trainable=attr_copy.get('trainable', True)) + dtype=dtype, shape=shape, **attr.to_kwargs()) def create_tmp_variable(self, dtype): return self.main_program.current_block().create_var( @@ -154,11 +145,7 @@ class LayerHelper(object): persistable=True, initializer=initializer) - def append_bias_op(self, - input_var, - bias_initializer, - dim_start=1, - dim_end=None): + def append_bias_op(self, input_var, dim_start=1, dim_end=None): """ Append bias operator and return its output. If the user does not set bias_attr, append_bias_op will return input_var @@ -178,11 +165,7 @@ class LayerHelper(object): return input_var b = self.create_parameter( - attr=bias_attr, - shape=size, - dtype=input_var.dtype, - suffix='b', - initializer=bias_initializer) + attr=bias_attr, shape=size, dtype=input_var.dtype, is_bias=True) tmp = self.create_tmp_variable(dtype=input_var.dtype) self.append_op( type='elementwise_add', @@ -209,7 +192,7 @@ class LayerHelper(object): def _get_default_initializer(self, dtype): if dtype is None or dtype_is_floating(dtype) is True: - return XavierInitializer() + return Xavier() else: # For integer and boolean types, initialize with all zeros - return ConstantInitializer() + return Constant() diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py index d094035fe5cae2e77fc2364e8ccb03c350f1301a..e41bfae285a5b8f711d3ea90d9341f0f3a938c1d 100644 --- a/python/paddle/v2/fluid/layers.py +++ b/python/paddle/v2/fluid/layers.py @@ -1,12 +1,11 @@ -import paddle.v2.fluid.core as core -import paddle.v2.fluid.proto.framework_pb2 as framework_pb2 -from paddle.v2.fluid.framework import OpProtoHolder, Variable, Program, \ - Operator -from paddle.v2.fluid.initializer import ConstantInitializer, \ - NormalInitializer, XavierInitializer +import core +import proto.framework_pb2 as framework_pb2 +from framework import OpProtoHolder, Variable, Program, Operator +from initializer import Constant, Normal, Xavier, Initializer from paddle.v2.fluid.layer_helper import LayerHelper, unique_name import re import cStringIO +from param_attr import ParamAttr __all__ = [ 'fc', 'data', 'cross_entropy', 'conv2d', 'pool2d', 'embedding', 'concat', @@ -19,9 +18,7 @@ def fc(input, size, num_flatten_dims=1, param_attr=None, - param_initializer=None, bias_attr=None, - bias_initializer=None, act=None, name=None, main_program=None, @@ -56,23 +53,10 @@ def fc(input, to the LayerHelper constructor. 
""" - - def _get_default_param_initializer(): - return XavierInitializer() - - def _get_default_bias_initializer(): - return ConstantInitializer() - helper = LayerHelper('fc', **locals()) dtype = helper.input_dtype() - if param_initializer is None: - param_initializer = _get_default_param_initializer() - - if bias_initializer is None: - bias_initializer = _get_default_bias_initializer() - mul_results = [] for input_var, param_attr in helper.iter_inputs_and_params(): input_shape = input_var.shape @@ -80,10 +64,7 @@ def fc(input, reduce(lambda a, b: a * b, input_shape[num_flatten_dims:], 1) ] + [size] w = helper.create_parameter( - attr=param_attr, - initializer=param_initializer, - shape=param_shape, - dtype=dtype) + attr=param_attr, shape=param_shape, dtype=dtype, is_bias=False) tmp = helper.create_tmp_variable(dtype) helper.append_op( type="mul", @@ -104,7 +85,7 @@ def fc(input, helper.append_op( type="sum", inputs={"X": mul_results}, outputs={"Out": pre_bias}) # add bias - pre_activation = helper.append_bias_op(pre_bias, bias_initializer) + pre_activation = helper.append_bias_op(pre_bias) # add activation return helper.append_activation(pre_activation) @@ -112,7 +93,6 @@ def fc(input, def embedding(input, size, is_sparse=False, - param_initializer=None, param_attr=None, dtype='float32', main_program=None, @@ -121,6 +101,7 @@ def embedding(input, Embedding Layer. Args: + param_initializer: input: The input to the function size: The size of the layer is_sparse: A flag that decleares whether the input is sparse @@ -138,15 +119,9 @@ def embedding(input, """ - def _get_default_param_initializer(): - return XavierInitializer() - helper = LayerHelper('embedding', **locals()) w = helper.create_parameter( - attr=helper.param_attr, - shape=size, - dtype=dtype, - initializer=param_initializer or _get_default_param_initializer()) + attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False) tmp = helper.create_tmp_variable(dtype) helper.append_op( type='lookup_table', @@ -178,7 +153,7 @@ def dynamic_lstm(input, if not use_peepholes: bias_size[1] = 4 * size bias = helper.create_parameter( - attr=helper.bias_attr, shape=bias_size, dtype=dtype, suffix='b') + attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) hidden = helper.create_tmp_variable(dtype) cell = helper.create_tmp_variable(dtype) @@ -210,6 +185,7 @@ def data(name, shape, append_batch_size=True, dtype='float32', + lod_level=0, type=core.VarDesc.VarType.LOD_TENSOR, main_program=None, startup_program=None, @@ -223,6 +199,7 @@ def data(name, append_batch_size: Whether or not to append the data as a batch. dtype: The type of data : float32, float_16, int etc type: The output type. By default it is LOD_TENSOR. + lod_level(int): The LoD Level. 0 means the input data is not a sequence. main_program: Name of the main program that calls this startup_program: Name of the startup program stop_gradient: A boolean that mentions whether gradient should flow. 
def create_tensor(dtype, name=None, main_program=None, startup_program=None):
@@ -418,12 +396,14 @@ def _create_op_func_(op_type):
_create_op_func_('mean') _create_op_func_('mul') _create_op_func_('elementwise_add')
+_create_op_func_('elementwise_div')
_create_op_func_('dropout') _create_op_func_('reshape') _create_op_func_('sigmoid') _create_op_func_('scale') _create_op_func_('reshape') _create_op_func_('transpose')
+_create_op_func_('sigmoid_cross_entropy_with_logits')
def cast(x, dtype, main_program=None):
@@ -457,13 +437,14 @@ def concat(input, axis, main_program=None, startup_program=None):
return out
-def sums(input, main_program=None, startup_program=None):
+def sums(input, out=None, main_program=None, startup_program=None):
""" This function takes in the input and performs the sum operation on it and returns that as the output.
+ If out is provided, the result is accumulated into it instead of a newly
+ created temporary variable.
""" helper = LayerHelper('sum', **locals())
- out = helper.create_tmp_variable(dtype=helper.input_dtype())
+ if out is None:
+     out = helper.create_tmp_variable(dtype=helper.input_dtype())
helper.append_op(type='sum', inputs={'X': input}, outputs={'Out': out}) return out
@@ -471,19 +452,14 @@
def linear_chain_crf(input, label, param_attr=None,
- param_initializer=None,
main_program=None, startup_program=None):
-    def _get_default_param_initializer():
-        return XavierInitializer()
-
helper = LayerHelper('linear_chain_crf', **locals()) size = input.shape[1] transition = helper.create_parameter( attr=helper.param_attr, shape=[size + 2, size],
- dtype=helper.input_dtype(),
- initializer=param_initializer or _get_default_param_initializer())
+ dtype=helper.input_dtype())
alpha = helper.create_tmp_variable(dtype=helper.input_dtype()) emission_exps = helper.create_tmp_variable(dtype=helper.input_dtype()) transition_exps = helper.create_tmp_variable(dtype=helper.input_dtype())
@@ -606,7 +582,7 @@ def square_error_cost(input, label, **kwargs):
return square_out
-def accuracy(input, label, k=1, **kwargs):
+def accuracy(input, label, k=1, correct=None, total=None, **kwargs):
""" This function computes the accuracy using the input and label. The output is the top_k inputs and their indices.
@@ -620,10 +596,11 @@
outputs={"Out": [topk_out], "Indices": [topk_indices]}, attrs={"k": k})
- acc_out_dtype = kwargs.get("out_dtype", "float32")
acc_out = helper.create_tmp_variable(dtype="float32")
- correct = helper.create_tmp_variable(dtype="int64")
- total = helper.create_tmp_variable(dtype="int64")
+ if correct is None:
+     correct = helper.create_tmp_variable(dtype="int64")
+ if total is None:
+     total = helper.create_tmp_variable(dtype="int64")
helper.append_op( type="accuracy", inputs={
@@ -645,9 +622,7 @@ def sequence_conv(input,
filter_stride=1, padding=None, bias_attr=None,
- bias_initializer=None,
param_attr=None,
- param_initializer=None,
act=None, main_program=None, startup_program=None):
@@ -657,30 +632,15 @@
in the input parameters to the function. """
-    def _get_default_bias_initializer():
-        return ConstantInitializer()
-
-    def _get_default_param_initializer():
-        return XavierInitializer()
-
# FIXME(dzh) : want to unify the argument of python layer # function. So we ignore some unnecessary attributes. # such as, padding_trainable, context_start.
helper = LayerHelper('sequence_conv', **locals()) dtype = helper.input_dtype() - - if param_initializer is None: - param_initializer = _get_default_param_initializer() - if bias_initializer is None: - bias_initializer = _get_default_bias_initializer() - filter_shape = [filter_size * input.shape[1], num_filters] filter = helper.create_parameter( - attr=helper.param_attr, - shape=filter_shape, - dtype=dtype, - initializer=param_initializer) + attr=helper.param_attr, shape=filter_shape, dtype=dtype) pre_bias = helper.create_tmp_variable(dtype) helper.append_op( @@ -695,7 +655,7 @@ def sequence_conv(input, 'contextStart': -int(filter_size / 2), 'contextLength': filter_size }) - pre_act = helper.append_bias_op(pre_bias, bias_initializer) + pre_act = helper.append_bias_op(pre_bias) return helper.append_activation(pre_act) @@ -706,9 +666,7 @@ def conv2d(input, padding=None, groups=None, param_attr=None, - param_initializer=None, bias_attr=None, - bias_initializer=None, act=None, name=None, main_program=None, @@ -721,13 +679,6 @@ def conv2d(input, conv-2d output, if mentioned in the input parameters. """ - def _get_default_bias_initializer(): - return ConstantInitializer() - - def _get_default_param_initializer(filter_size, num_channels): - std = (2.0 / (filter_size[0]**2 * num_channels))**0.5 - return NormalInitializer(0.0, std, 0) - helper = LayerHelper('conv2d', **locals()) dtype = helper.input_dtype() @@ -749,17 +700,16 @@ def conv2d(input, input_shape = input.shape filter_shape = [num_filters, num_filter_channels] + filter_size - if param_initializer is None: - param_initializer = _get_default_param_initializer(filter_size, - num_channels) - if bias_initializer is None: - bias_initializer = _get_default_bias_initializer() + def _get_default_param_initializer(): + std = (2.0 / (filter_size[0]**2 * num_channels))**0.5 + return Normal(0.0, std, 0) filter = helper.create_parameter( attr=helper.param_attr, shape=filter_shape, dtype=dtype, - initializer=param_initializer) + default_initializer=_get_default_param_initializer()) + pre_bias = helper.create_tmp_variable(dtype) helper.append_op( @@ -773,8 +723,7 @@ def conv2d(input, 'paddings': padding, 'groups': groups}) - pre_act = helper.append_bias_op( - pre_bias, bias_initializer, dim_start=1, dim_end=2) + pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) return helper.append_activation(pre_act) @@ -875,22 +824,18 @@ def batch_norm(input, attr=helper.param_attr, shape=param_shape, dtype=dtype, - initializer=ConstantInitializer(1.0)) + default_initializer=Constant(1.0)) + bias = helper.create_parameter( - attr=helper.param_attr, - shape=param_shape, - dtype=dtype, - initializer=ConstantInitializer(0.0)) + attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=True) mean = helper.create_global_variable( dtype=input.dtype, shape=param_shape, persistable=True) - helper.set_variable_initializer( - var=mean, initializer=ConstantInitializer(0.0)) + helper.set_variable_initializer(var=mean, initializer=Constant(0.0)) variance = helper.create_global_variable( dtype=input.dtype, shape=param_shape, persistable=True) - helper.set_variable_initializer( - var=variance, initializer=ConstantInitializer(1.0)) + helper.set_variable_initializer(var=variance, initializer=Constant(1.0)) # create output # mean and mean_out share the same memory @@ -1355,6 +1300,33 @@ def lod_rank_table(x, level=0, main_program=None): return table +def max_sequence_len(rank_table, main_program=None): + """ + This function creates an operator to calculate the 
length of
+ the longest sequence in the input rank_table (which should be a lod_rank_table)
+    """
+    helper = LayerHelper("max_sequence_len", **locals())
+    res = helper.create_tmp_variable(dtype="int64")
+    helper.append_op(
+        type="max_sequence_len",
+        inputs={"RankTable": rank_table},
+        outputs={"Out": res})
+    return res
+
+
+def topk(input, k, main_program=None, startup_program=None):
+    helper = LayerHelper('topk', **locals())
+    topk_out = helper.create_tmp_variable(dtype=input.dtype)
+    topk_indices = helper.create_tmp_variable(dtype='int64')
+    helper.append_op(
+        type='top_k',
+        inputs={'X': [input]},
+        outputs={'Out': [topk_out],
+                 'Indices': [topk_indices]},
+        attrs={'k': k})
+    return topk_out, topk_indices
+
+
def lod_tensor_to_array(x, table, main_program=None): """ This function creates an operator to convert an LOD_Tensor to
@@ -1388,14 +1360,20 @@ def array_to_lod_tensor(x, table, main_program=None):
return tmp
-def fill_constant(shape, dtype, value, main_program=None, startup_program=None):
+def fill_constant(shape,
+                  dtype,
+                  value,
+                  out=None,
+                  main_program=None,
+                  startup_program=None):
""" This function creates a tensor , with shape as mentioned in the input and specified dtype and fills this up with a constant value that comes in the input. It also sets the stop_gradient to be True. """ helper = LayerHelper("fill_constant", **locals())
- out = helper.create_tmp_variable(dtype=dtype)
+ if out is None:
+     out = helper.create_tmp_variable(dtype=dtype)
helper.append_op( type='fill_constant', inputs={},
@@ -1555,6 +1533,93 @@ def array_length(array, main_program=None):
return tmp
+
+def conv2d_transpose(input,
+                     num_filters,
+                     output_size=None,
+                     filter_size=None,
+                     padding=None,
+                     stride=None,
+                     param_attr=None,
+                     main_program=None,
+                     startup_program=None):
+    """
+    The transpose of the conv2d layer.
+
+    This layer is also known as a deconvolution layer.
+
+    Args:
+        input(Variable): The input image with [N, C, H, W] format.
+        num_filters(int): The number of filters. It is the same as the number
+            of output image channels.
+        output_size(int|tuple|None): The output image size. If output size is a
+            tuple, it must contain two integers, (image_H, image_W). This
+            parameter only works when filter_size is None.
+        filter_size(int|tuple|None): The filter size. If filter_size is a tuple,
+            it must contain two integers, (filter_size_H, filter_size_W).
+            Otherwise, the filter will be a square. If None, the filter size is
+            calculated from output_size.
+        padding(int|tuple): The padding size. If padding is a tuple, it must
+            contain two integers, (padding_H, padding_W). Otherwise, the
+            padding_H = padding_W = padding.
+        stride(int|tuple): The stride size. If stride is a tuple, it must
+            contain two integers, (stride_H, stride_W). Otherwise, the
+            stride_H = stride_W = stride.
+        param_attr: Parameter Attribute.
+        main_program(Program): the main program
+        startup_program(Program): the startup program
+
+    Returns:
+        Variable: Output image.
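+
+    As a worked check of the shape arithmetic used below (numbers are
+    illustrative only): with h_in = 7, stride = (2, 2), padding = (1, 1) and
+    output_size = (14, 14), filter_size_h = 14 - (7 - 1) * 2 + 2 * 1 = 4, and
+    a 4 x 4 filter indeed maps a 7 x 7 input to a 14 x 14 output.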
+ """ + helper = LayerHelper("conv2d_transpose", **locals()) + if not isinstance(input, Variable): + raise TypeError("Input of conv2d_transpose must be Variable") + input_channel = input.shape[1] + + op_attr = dict() + + if isinstance(padding, int): + op_attr['paddings'] = [padding, padding] + elif padding is not None: + op_attr['paddings'] = padding + + if isinstance(stride, int): + op_attr['strides'] = stride + elif stride is not None: + op_attr['strides'] = stride + + if filter_size is None: + if output_size is None: + raise ValueError("output_size must be set when filter_size is None") + if isinstance(output_size, int): + output_size = [output_size, output_size] + + padding = op_attr.get('paddings', [0, 0]) + stride = op_attr.get('strides', [1, 1]) + + h_in = input.shape[2] + w_in = input.shape[3] + filter_size_h = output_size[0] - (h_in - 1) * stride[0] + 2 * padding[0] + filter_size_w = output_size[1] - (w_in - 1) * stride[1] + 2 * padding[1] + filter_size = [filter_size_h, filter_size_w] + elif isinstance(filter_size, int): + filter_size = [filter_size, filter_size] + + filter_shape = [input_channel, num_filters] + filter_size + img_filter = helper.create_parameter( + dtype=input.dtype, shape=filter_shape, attr=helper.param_attr) + + out = helper.create_tmp_variable(dtype=input.dtype) + helper.append_op( + type='conv2d_transpose', + inputs={'Input': [input], + 'Filter': [img_filter]}, + outputs={'Output': out}, + attrs=op_attr) + + return out + + class ConditionalBlockGuard(BlockGuard): def __init__(self, block): if not isinstance(block, ConditionalBlock): diff --git a/python/paddle/v2/fluid/nets.py b/python/paddle/v2/fluid/nets.py index 5e14ca594bc7965dc29039ba57bb7b26b1ce6871..05728ad75a5bd1e87aa3c75ffcc4eac34b6b956c 100644 --- a/python/paddle/v2/fluid/nets.py +++ b/python/paddle/v2/fluid/nets.py @@ -1,4 +1,4 @@ -import paddle.v2.fluid.layers as layers +import layers __all__ = ["simple_img_conv_pool", "sequence_conv_pool"] diff --git a/python/paddle/v2/fluid/optimizer.py b/python/paddle/v2/fluid/optimizer.py index e82f0f060de6af63f63d5601ae94059192076e6f..934e024742fd00bf05cc0d7caaaa870c18a68074 100644 --- a/python/paddle/v2/fluid/optimizer.py +++ b/python/paddle/v2/fluid/optimizer.py @@ -1,16 +1,13 @@ from collections import defaultdict -import paddle.v2.fluid.framework as framework -from paddle.v2.fluid.framework import unique_name, Program -from paddle.v2.fluid.backward import append_backward_ops -from paddle.v2.fluid.initializer import ConstantInitializer -from paddle.v2.fluid.regularizer import append_regularization_ops -from paddle.v2.fluid.layer_helper import LayerHelper +import framework +from backward import append_backward_ops +from framework import unique_name +from initializer import Constant +from layer_helper import LayerHelper +from regularizer import append_regularization_ops -__all__ = [ - 'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer', - 'AdamaxOptimizer', 'DecayedAdagradOptimizer' -] +__all__ = ['SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad'] class Optimizer(object): @@ -48,7 +45,7 @@ class Optimizer(object): persistable=True) param_lr = param_lr * self._learning_rate self.helper.set_variable_initializer( - var=param_lr_var, initializer=ConstantInitializer(param_lr)) + var=param_lr_var, initializer=Constant(param_lr)) return param_lr_var def _create_accumulators(self, block, parameters): @@ -96,7 +93,7 @@ class Optimizer(object): type=param.type, shape=param.shape) self.helper.set_variable_initializer( - var, 
initializer=ConstantInitializer(value=float(fill_value))) + var, initializer=Constant(value=float(fill_value))) self._accumulators[name][param.name] = var def _get_accumulator(self, name, param): @@ -360,7 +357,7 @@ class AdamOptimizer(Optimizer): lod_level=0, persistable=True) self.helper.set_variable_initializer( - self._beta1_pow_acc, initializer=ConstantInitializer(self._beta1)) + self._beta1_pow_acc, initializer=Constant(self._beta1)) self._beta2_pow_acc = self.helper.create_global_variable( name=unique_name('beta2_pow_acc'), @@ -370,7 +367,7 @@ persistable=True) self.helper.set_variable_initializer( - self._beta2_pow_acc, initializer=ConstantInitializer(self._beta2)) + self._beta2_pow_acc, initializer=Constant(self._beta2)) # Create accumulator tensors for first and second moments for p in parameters: @@ -462,7 +459,7 @@ class AdamaxOptimizer(Optimizer): lod_level=0, persistable=True) self.helper.set_variable_initializer( - self._beta1_pow_acc, initializer=ConstantInitializer(self._beta1)) + self._beta1_pow_acc, initializer=Constant(self._beta1)) # Create accumulator tensors for first moment and infinity norm for p in parameters: @@ -559,3 +556,19 @@ class DecayedAdagradOptimizer(Optimizer): attrs={"epsilon": self._epsilon}) return decayed_adagrad_op + + +# We shorten the class names, since users will use the optimizer with the +# package name. The sample code: +# +# import paddle.fluid as fluid +# +# sgd = fluid.optimizer.SGD(...) +# +# There is no need to add an `Optimizer` suffix to the class name +SGD = SGDOptimizer +Momentum = MomentumOptimizer +Adagrad = AdagradOptimizer +Adam = AdamOptimizer +Adamax = AdamaxOptimizer +DecayedAdagrad = DecayedAdagradOptimizer diff --git a/python/paddle/v2/fluid/param_attr.py b/python/paddle/v2/fluid/param_attr.py new file mode 100644 index 0000000000000000000000000000000000000000..86088fdd7ce17b8b7a9688dc838e69b2aa754013 --- /dev/null +++ b/python/paddle/v2/fluid/param_attr.py @@ -0,0 +1,61 @@ +from initializer import Initializer, Xavier, Constant +from regularizer import WeightDecayRegularizer + + +class ParamAttr(object): + def __init__(self, + name=None, + initializer=None, + learning_rate=1.0, + regularizer=None, + trainable=True): + self.name = name + self.initializer = initializer + self.learning_rate = learning_rate + self.regularizer = regularizer + self.trainable = trainable + + def set_default_initializer(self, initializer): + if initializer is None: + if self.initializer is None: + raise ValueError("ParamAttr.initializer is not set") + return + + if self.initializer is not None: + return + + self.initializer = initializer + + def set_default_param_initializer(self): + self.set_default_initializer(Xavier()) + + def set_default_bias_initializer(self): + self.set_default_initializer(Constant(0.0)) + + @staticmethod + def to_attr(arg): + if arg is None: + return ParamAttr() + elif isinstance(arg, ParamAttr): + return arg + elif isinstance(arg, str) or isinstance(arg, unicode): + return ParamAttr(name=arg) + elif isinstance(arg, Initializer): + return ParamAttr(initializer=arg) + elif isinstance(arg, WeightDecayRegularizer): + return ParamAttr(regularizer=arg) + elif isinstance(arg, bool): + return ParamAttr.to_attr(None) if arg else False + else: + raise TypeError("{0} cannot be cast to ParamAttr".format(type(arg))) + + def to_kwargs(self, with_initializer=False): + kwargs = { + 'name': self.name, + 'learning_rate': self.learning_rate, + 'regularizer': self.regularizer, + 'trainable': self.trainable + } + if with_initializer:
+ kwargs['initializer'] = self.initializer + return kwargs diff --git a/python/paddle/v2/fluid/profiler.py b/python/paddle/v2/fluid/profiler.py new file mode 100644 index 0000000000000000000000000000000000000000..2069b713faf41c5c00ceaf47e030864b98c678da --- /dev/null +++ b/python/paddle/v2/fluid/profiler.py @@ -0,0 +1,46 @@ +import paddle.v2.fluid.core as core +from contextlib import contextmanager + +__all__ = ['CudaProfiler'] + +NVPROF_CONFIG = [ + "gpustarttimestamp", + "gpuendtimestamp", + "gridsize3d", + "threadblocksize", + "streamid", + "enableonstart 0", + "conckerneltrace", +] + + +@contextmanager +def cuda_profiler(output_file, output_mode=None, config=None): + """The CUDA profiler. + This function is used to profile a CUDA program through the CUDA runtime + application programming interface. The profiling result will be written + into `output_file` in key-value pair format or comma-separated-values + format. The user can set the output mode by the `output_mode` argument and + set the counters/options for profiling by the `config` argument. The + default config is ['gpustarttimestamp', 'gpuendtimestamp', 'gridsize3d', + 'threadblocksize', 'streamid', 'enableonstart 0', 'conckerneltrace']. + + Args: + output_file (string) : The output file name; the result will be + written into this file. + output_mode (string) : The output mode, either key-value pair format + or comma-separated-values format. It should be 'kvp' or 'csv'. + config (list of string) : The profiler options and counters; refer to + the "Compute Command Line Profiler User Guide". + """ + if output_mode is None: + output_mode = 'csv' + if output_mode not in ['kvp', 'csv']: + raise ValueError("The output mode must be 'kvp' or 'csv'.") + config = NVPROF_CONFIG if config is None else config + core.nvprof_init(output_file, output_mode, config) + # Enables profiler collection by the active CUDA profiling tool. + core.nvprof_start() + yield + # Disables profiler collection. + core.nvprof_stop() diff --git a/python/paddle/v2/fluid/regularizer.py b/python/paddle/v2/fluid/regularizer.py index 098cd0dd6439554f49e429ab75fb11bfa2c9d28c..c2c18e1951234f7160ff9f92d6dd6922a56683dd 100644 --- a/python/paddle/v2/fluid/regularizer.py +++ b/python/paddle/v2/fluid/regularizer.py @@ -1,8 +1,6 @@ -import paddle.v2.fluid.framework as framework +import framework -__all__ = [ - 'append_regularization_ops', 'L2DecayRegularizer', 'L1DecayRegularizer' -] +__all__ = ['append_regularization_ops', 'L1Decay', 'L2Decay'] def append_regularization_ops(parameters_and_grads): @@ -139,3 +137,16 @@ class L1DecayRegularizer(WeightDecayRegularizer): attrs={"scale": self._regularization_coeff}) return decay + + +# We shorten the class names, since users will use the regularizer with the +# package name.
The sample code: +# +# import paddle.fluid as fluid +# +# hidden = fluid.layers.fc(..., +# param_attr=fluid.regularizer.L2Decay(1e-4)) +# +# There is no need to add a `Regularizer` suffix to the class name +L1Decay = L1DecayRegularizer +L2Decay = L2DecayRegularizer diff --git a/python/paddle/v2/fluid/tests/.gitignore b/python/paddle/v2/fluid/tests/.gitignore index fcc52c04886865d96c1bfe1597a9dc99c181de1f..a648f2b387c2c7b9422eea6749e43e7b8871f60f 100644 --- a/python/paddle/v2/fluid/tests/.gitignore +++ b/python/paddle/v2/fluid/tests/.gitignore @@ -1,2 +1,3 @@ image/ fit_a_line.model/ +tmp diff --git a/python/paddle/v2/fluid/tests/book/CMakeLists.txt b/python/paddle/v2/fluid/tests/book/CMakeLists.txt index 4d7664469e481344cf9eea84688f068b4fb99dee..a35abe3e0c436be4eaed01c9b9183344c6d3b275 100644 --- a/python/paddle/v2/fluid/tests/book/CMakeLists.txt +++ b/python/paddle/v2/fluid/tests/book/CMakeLists.txt @@ -1,5 +1,11 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +list(REMOVE_ITEM TEST_OPS test_image_classification_train) +py_test(test_image_classification_train_resnet SRCS test_image_classification_train.py ARGS resnet) +py_test(test_image_classification_train_vgg SRCS test_image_classification_train.py ARGS vgg) + +# default test foreach(src ${TEST_OPS}) py_test(${src} SRCS ${src}.py) endforeach() diff --git a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py index a899f1088d77c4ca6462cf5306393444ea114e6c..9f98493adb21a03b8efde0f88c490e77c9d303e7 100644 --- a/python/paddle/v2/fluid/tests/book/test_fit_a_line.py +++ b/python/paddle/v2/fluid/tests/book/test_fit_a_line.py @@ -1,23 +1,18 @@ import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.core as core -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.io import save_persistables, load_persistables -from paddle.v2.fluid.optimizer import SGDOptimizer +import paddle.v2.fluid as fluid -x = layers.data(name='x', shape=[13], dtype='float32') +x = fluid.layers.data(name='x', shape=[13], dtype='float32') -y_predict = layers.fc(input=x, size=1, act=None) +y_predict = fluid.layers.fc(input=x, size=1, act=None) -y = layers.data(name='y', shape=[1], dtype='float32') +y = fluid.layers.data(name='y', shape=[1], dtype='float32') -cost = layers.square_error_cost(input=y_predict, label=y) -avg_cost = layers.mean(x=cost) +cost = fluid.layers.square_error_cost(input=y_predict, label=y) +avg_cost = fluid.layers.mean(x=cost) -sgd_optimizer = SGDOptimizer(learning_rate=0.001) -opts = sgd_optimizer.minimize(avg_cost) +sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) +sgd_optimizer.minimize(avg_cost) BATCH_SIZE = 20 @@ -26,32 +21,24 @@ train_reader = paddle.batch( paddle.dataset.uci_housing.train(), buf_size=500), batch_size=BATCH_SIZE) -place = core.CPUPlace() -exe = Executor(place) +place = fluid.CPUPlace() +exe = fluid.Executor(place) -exe.run(framework.default_startup_program()) +exe.run(fluid.default_startup_program()) PASS_NUM = 100 for pass_id in range(PASS_NUM): - save_persistables(exe, "./fit_a_line.model/") - load_persistables(exe, "./fit_a_line.model/") + fluid.io.save_persistables(exe, "./fit_a_line.model/") + fluid.io.load_persistables(exe, "./fit_a_line.model/") for data in train_reader(): - x_data = np.array(map(lambda x: x[0], data)).astype("float32") - y_data = np.array(map(lambda x:
x[1], data)).astype("float32") - - tensor_x = core.LoDTensor() - tensor_x.set(x_data, place) - # print tensor_x.get_dims() - - tensor_y = core.LoDTensor() - tensor_y.set(y_data, place) - # print tensor_y.get_dims() - outs = exe.run(framework.default_main_program(), - feed={'x': tensor_x, - 'y': tensor_y}, - fetch_list=[avg_cost]) - out = np.array(outs[0]) - - if out[0] < 10.0: + x_data = np.array(map(lambda _: _[0], data)).astype("float32") + y_data = np.array(map(lambda _: _[1], data)).astype("float32") + + avg_loss_value, = exe.run(fluid.default_main_program(), + feed={'x': x_data, + 'y': y_data}, + fetch_list=[avg_cost]) + + if avg_loss_value[0] < 10.0: exit(0) # if avg cost less than 10.0, we think our code is good. exit(1) diff --git a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py index 76cbd410f94a4be04ba71d1e3175eaed590ac80a..0f0cc5b5406ef51ac3504a95ea716056ae8730af 100644 --- a/python/paddle/v2/fluid/tests/book/test_image_classification_train.py +++ b/python/paddle/v2/fluid/tests/book/test_image_classification_train.py @@ -1,19 +1,14 @@ +from __future__ import print_function + import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.core as core -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.nets as nets -import paddle.v2.fluid.evaluator as evaluator -from paddle.v2.fluid.io import get_inference_program -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.initializer import XavierInitializer -from paddle.v2.fluid.optimizer import AdamOptimizer +import paddle.v2.fluid as fluid +import sys def resnet_cifar10(input, depth=32): def conv_bn_layer(input, ch_out, filter_size, stride, padding, act='relu'): - tmp = layers.conv2d( + tmp = fluid.layers.conv2d( input=input, filter_size=filter_size, num_filters=ch_out, @@ -21,12 +16,11 @@ def resnet_cifar10(input, depth=32): padding=padding, act=None, bias_attr=False) - return layers.batch_norm(input=tmp, act=act) + return fluid.layers.batch_norm(input=tmp, act=act) - def shortcut(input, ch_in, ch_out, stride, program, init_program): + def shortcut(input, ch_in, ch_out, stride): if ch_in != ch_out: - return conv_bn_layer(input, ch_out, 1, stride, 0, None, program, - init_program) + return conv_bn_layer(input, ch_out, 1, stride, 0, None) else: return input @@ -34,7 +28,7 @@ def resnet_cifar10(input, depth=32): tmp = conv_bn_layer(input, ch_out, 3, stride, 1) tmp = conv_bn_layer(tmp, ch_out, 3, 1, 1, act=None) short = shortcut(input, ch_in, ch_out, stride) - return layers.elementwise_add(x=tmp, y=short, act='relu') + return fluid.layers.elementwise_add(x=tmp, y=short, act='relu') def layer_warp(block_func, input, ch_in, ch_out, count, stride): tmp = block_func(input, ch_in, ch_out, stride) @@ -49,14 +43,14 @@ def resnet_cifar10(input, depth=32): res1 = layer_warp(basicblock, conv1, 16, 16, n, 1) res2 = layer_warp(basicblock, res1, 16, 32, n, 2) res3 = layer_warp(basicblock, res2, 32, 64, n, 2) - pool = layers.pool2d( + pool = fluid.layers.pool2d( input=res3, pool_size=8, pool_type='avg', pool_stride=1) return pool def vgg16_bn_drop(input): def conv_block(input, num_filter, groups, dropouts): - return nets.img_conv_group( + return fluid.nets.img_conv_group( input=input, pool_size=2, pool_stride=2, @@ -73,62 +67,56 @@ def vgg16_bn_drop(input): conv4 = conv_block(conv3, 512, 3, [0.4, 0.4, 0]) conv5 = conv_block(conv4, 512, 3, [0.4, 0.4, 0]) - drop = 
layers.dropout(x=conv5, dropout_prob=0.5) - fc1 = layers.fc(input=drop, - size=512, - act=None, - param_attr={"initializer": XavierInitializer()}) - reshape1 = layers.reshape(x=fc1, shape=list(fc1.shape + (1, 1))) - bn = layers.batch_norm(input=reshape1, act='relu') - drop2 = layers.dropout(x=bn, dropout_prob=0.5) - fc2 = layers.fc(input=drop2, - size=512, - act=None, - param_attr={"initializer": XavierInitializer()}) + drop = fluid.layers.dropout(x=conv5, dropout_prob=0.5) + fc1 = fluid.layers.fc(input=drop, size=512, act=None) + bn = fluid.layers.batch_norm(input=fc1, act='relu') + drop2 = fluid.layers.dropout(x=bn, dropout_prob=0.5) + fc2 = fluid.layers.fc(input=drop2, size=512, act=None) return fc2 classdim = 10 data_shape = [3, 32, 32] -images = layers.data(name='pixel', shape=data_shape, dtype='float32') -label = layers.data(name='label', shape=[1], dtype='int64') +images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32') +label = fluid.layers.data(name='label', shape=[1], dtype='int64') -# Add neural network config -# option 1. resnet -# net = resnet_cifar10(images, 32) -# option 2. vgg -net = vgg16_bn_drop(images) +net_type = "vgg" +if len(sys.argv) >= 2: + net_type = sys.argv[1] -# print(program) +if net_type == "vgg": + print("train vgg net") + net = vgg16_bn_drop(images) +elif net_type == "resnet": + print("train resnet") + net = resnet_cifar10(images, 32) +else: + raise ValueError("%s network is not supported" % net_type) -predict = layers.fc(input=net, size=classdim, act='softmax') -cost = layers.cross_entropy(input=predict, label=label) -avg_cost = layers.mean(x=cost) +predict = fluid.layers.fc(input=net, size=classdim, act='softmax') +cost = fluid.layers.cross_entropy(input=predict, label=label) +avg_cost = fluid.layers.mean(x=cost) -# optimizer = SGDOptimizer(learning_rate=0.001) -optimizer = AdamOptimizer(learning_rate=0.001) +optimizer = fluid.optimizer.Adam(learning_rate=0.001) opts = optimizer.minimize(avg_cost) -accuracy, acc_out = evaluator.accuracy(input=predict, label=label) +accuracy = fluid.evaluator.Accuracy(input=predict, label=label) BATCH_SIZE = 128 PASS_NUM = 1 train_reader = paddle.batch( paddle.reader.shuffle( - paddle.dataset.cifar.train10(), buf_size=BATCH_SIZE * 10), + paddle.dataset.cifar.train10(), buf_size=128 * 10), batch_size=BATCH_SIZE) -test_reader = paddle.batch(paddle.dataset.cifar.test10(), batch_size=BATCH_SIZE) - -place = core.CPUPlace() -exe = Executor(place) +place = fluid.CPUPlace() +exe = fluid.Executor(place) -exe.run(framework.default_startup_program()) +exe.run(fluid.default_startup_program()) for pass_id in range(PASS_NUM): - batch_id = 0 accuracy.reset(exe) for data in train_reader(): img_data = np.array(map(lambda x: x[0].reshape(data_shape), @@ -139,56 +127,13 @@ for pass_id in range(PASS_NUM): batch_size = batch_size * i y_data = y_data.reshape([batch_size, 1]) - tensor_img = core.LoDTensor() - tensor_y = core.LoDTensor() - tensor_img.set(img_data, place) - tensor_y.set(y_data, place) - - outs = exe.run(framework.default_main_program(), - feed={"pixel": tensor_img, - "label": tensor_y}, - fetch_list=[avg_cost, acc_out]) - - loss = np.array(outs[0]) - acc = np.array(outs[1]) + loss, acc = exe.run(fluid.default_main_program(), + feed={"pixel": img_data, + "label": y_data}, + fetch_list=[avg_cost] + accuracy.metrics) pass_acc = accuracy.eval(exe) - - batch_id = batch_id + 1 - - test_accuracy, test_acc_out = evaluator.accuracy( - input=predict, label=label) - - test_target = [avg_cost, test_acc_out] + 
test_accuracy.states().values() - inference_program = get_inference_program(test_target) - - test_accuracy.reset(exe) - - for data in test_reader(): - x_data = np.array(map(lambda x: x[0].reshape(data_shape), - data)).astype("float32") - y_data = np.array(map(lambda x: x[1], data)).astype("int64") - y_data = np.expand_dims(y_data, axis=1) - - tensor_x = core.LoDTensor() - tensor_x.set(x_data, place) - - tensor_y = core.LoDTensor() - tensor_y.set(y_data, place) - - outs = exe.run(inference_program, - feed={'pixel': tensor_x, - 'label': tensor_y}, - fetch_list=[avg_cost, test_acc_out]) - out = np.array(outs[0]) - acc = np.array(outs[1]) - - test_pass_acc = test_accuracy.eval(exe) - - print("pass_id:" + str(pass_id) + " batch_id:" + str(batch_id) + - " loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str( - pass_acc) + " test_pass_acc:" + str(test_pass_acc)) - - if batch_id > 1: - # this model is slow, so if we can train two mini batch, we think it works properly. - exit(0) + print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str( + pass_acc)) + # this model is slow, so if we can train on two mini-batches, we consider it to work properly. + exit(0) exit(1) diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py index 9c9064ba9639829ef3afd8111278b17035bee84a..bcd6f4d6bc66fd01406332bd1d6d7a5c4b0ddb5a 100644 --- a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py +++ b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py @@ -1,11 +1,7 @@ import numpy as np import paddle.v2 as paddle import paddle.v2.dataset.conll05 as conll05 -import paddle.v2.fluid.core as core -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.executor import Executor, g_scope -from paddle.v2.fluid.optimizer import SGDOptimizer +import paddle.v2.fluid as fluid word_dict, verb_dict, label_dict = conll05.get_dict() word_dict_len = len(word_dict) @@ -34,23 +30,23 @@ def load_parameter(file_name, h, w): def db_lstm(): # 8 features - word = layers.data(name='word_data', shape=[1], dtype='int64') - predicate = layers.data(name='verb_data', shape=[1], dtype='int64') - ctx_n2 = layers.data(name='ctx_n2_data', shape=[1], dtype='int64') - ctx_n1 = layers.data(name='ctx_n1_data', shape=[1], dtype='int64') - ctx_0 = layers.data(name='ctx_0_data', shape=[1], dtype='int64') - ctx_p1 = layers.data(name='ctx_p1_data', shape=[1], dtype='int64') - ctx_p2 = layers.data(name='ctx_p2_data', shape=[1], dtype='int64') - mark = layers.data(name='mark_data', shape=[1], dtype='int64') - - predicate_embedding = layers.embedding( + word = fluid.layers.data(name='word_data', shape=[1], dtype='int64') + predicate = fluid.layers.data(name='verb_data', shape=[1], dtype='int64') + ctx_n2 = fluid.layers.data(name='ctx_n2_data', shape=[1], dtype='int64') + ctx_n1 = fluid.layers.data(name='ctx_n1_data', shape=[1], dtype='int64') + ctx_0 = fluid.layers.data(name='ctx_0_data', shape=[1], dtype='int64') + ctx_p1 = fluid.layers.data(name='ctx_p1_data', shape=[1], dtype='int64') + ctx_p2 = fluid.layers.data(name='ctx_p2_data', shape=[1], dtype='int64') + mark = fluid.layers.data(name='mark_data', shape=[1], dtype='int64') + + predicate_embedding = fluid.layers.embedding( input=predicate, size=[pred_len, word_dim], dtype='float32', is_sparse=IS_SPARSE, - param_attr={'name': 'vemb'}) + param_attr='vemb') - mark_embedding = fluid.layers.embedding( + mark_embedding = fluid.layers.embedding( input=mark,
size=[mark_dict_len, mark_dim], dtype='float32', @@ -58,22 +54,22 @@ def db_lstm(): word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2] emb_layers = [ - layers.embedding( + fluid.layers.embedding( size=[word_dict_len, word_dim], input=x, - param_attr={'name': embedding_name, - 'trainable': False}) for x in word_input + param_attr=fluid.ParamAttr( + name=embedding_name, trainable=False)) for x in word_input ] emb_layers.append(predicate_embedding) emb_layers.append(mark_embedding) hidden_0_layers = [ - layers.fc(input=emb, size=hidden_dim) for emb in emb_layers + fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers ] - hidden_0 = layers.sums(input=hidden_0_layers) + hidden_0 = fluid.layers.sums(input=hidden_0_layers) - lstm_0 = layers.dynamic_lstm( + lstm_0 = fluid.layers.dynamic_lstm( input=hidden_0, size=hidden_dim, candidate_activation='relu', @@ -84,12 +80,12 @@ def db_lstm(): input_tmp = [hidden_0, lstm_0] for i in range(1, depth): - mix_hidden = layers.sums(input=[ - layers.fc(input=input_tmp[0], size=hidden_dim), - layers.fc(input=input_tmp[1], size=hidden_dim) + mix_hidden = fluid.layers.sums(input=[ + fluid.layers.fc(input=input_tmp[0], size=hidden_dim), + fluid.layers.fc(input=input_tmp[1], size=hidden_dim) ]) - lstm = layers.dynamic_lstm( + lstm = fluid.layers.dynamic_lstm( input=mix_hidden, size=hidden_dim, candidate_activation='relu', @@ -99,9 +95,9 @@ def db_lstm(): input_tmp = [mix_hidden, lstm] - feature_out = layers.sums(input=[ - layers.fc(input=input_tmp[0], size=label_dict_len), - layers.fc(input=input_tmp[1], size=label_dict_len) + feature_out = fluid.layers.sums(input=[ + fluid.layers.fc(input=input_tmp[0], size=label_dict_len), + fluid.layers.fc(input=input_tmp[1], size=label_dict_len) ]) return feature_out @@ -116,7 +112,7 @@ def to_lodtensor(data, place): lod.append(cur_len) flattened_data = np.concatenate(data, axis=0).astype("int64") flattened_data = flattened_data.reshape([len(flattened_data), 1]) - res = core.LoDTensor() + res = fluid.LoDTensor() res.set(flattened_data, place) res.set_lod([lod]) return res @@ -125,29 +121,29 @@ def to_lodtensor(data, place): def main(): # define network topology feature_out = db_lstm() - target = layers.data(name='target', shape=[1], dtype='int64') - crf_cost = layers.linear_chain_crf( + target = fluid.layers.data(name='target', shape=[1], dtype='int64') + crf_cost = fluid.layers.linear_chain_crf( input=feature_out, label=target, - param_attr={"name": 'crfw', - "learning_rate": mix_hidden_lr}) - avg_cost = layers.mean(x=crf_cost) + param_attr=fluid.ParamAttr( + name='crfw', learning_rate=mix_hidden_lr)) + avg_cost = fluid.layers.mean(x=crf_cost) # TODO(qiao) # 1. add crf_decode_layer and evaluator # 2. 
use other optimizer and check why out will be NAN - sgd_optimizer = SGDOptimizer(learning_rate=0.0001) - opts = sgd_optimizer.minimize(avg_cost) + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001) + sgd_optimizer.minimize(avg_cost) train_data = paddle.batch( paddle.reader.shuffle( paddle.dataset.conll05.test(), buf_size=8192), batch_size=BATCH_SIZE) - place = core.CPUPlace() - exe = Executor(place) + place = fluid.CPUPlace() + exe = fluid.Executor(place) - exe.run(framework.default_startup_program()) + exe.run(fluid.default_startup_program()) - embedding_param = g_scope.find_var(embedding_name).get_tensor() + embedding_param = fluid.g_scope.find_var(embedding_name).get_tensor() embedding_param.set( load_parameter(conll05.get_embedding(), word_dict_len, word_dim), place) @@ -164,7 +160,7 @@ def main(): mark_data = to_lodtensor(map(lambda x: x[7], data), place) target = to_lodtensor(map(lambda x: x[8], data), place) - outs = exe.run(framework.default_main_program(), + outs = exe.run(fluid.default_main_program(), feed={ 'word_data': word_data, 'ctx_n2_data': ctx_n2_data, diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py index 0bea5f95c895b278db86f25f54e2795d3ec0af69..ba686b56f8603834c12f5ed24e0ef7308c78585d 100644 --- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py +++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_conv.py @@ -1,23 +1,18 @@ +from __future__ import print_function import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.core as core -import paddle.v2.fluid.evaluator as evaluator -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.nets as nets -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.optimizer import AdamOptimizer +import paddle.v2.fluid as fluid -images = layers.data(name='pixel', shape=[1, 28, 28], dtype='float32') -label = layers.data(name='label', shape=[1], dtype='int64') -conv_pool_1 = nets.simple_img_conv_pool( +images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32') +label = fluid.layers.data(name='label', shape=[1], dtype='int64') +conv_pool_1 = fluid.nets.simple_img_conv_pool( input=images, filter_size=5, num_filters=20, pool_size=2, pool_stride=2, act="relu") -conv_pool_2 = nets.simple_img_conv_pool( +conv_pool_2 = fluid.nets.simple_img_conv_pool( input=conv_pool_1, filter_size=5, num_filters=50, @@ -25,13 +20,13 @@ conv_pool_2 = nets.simple_img_conv_pool( pool_stride=2, act="relu") -predict = layers.fc(input=conv_pool_2, size=10, act="softmax") -cost = layers.cross_entropy(input=predict, label=label) -avg_cost = layers.mean(x=cost) -optimizer = AdamOptimizer(learning_rate=0.01, beta1=0.9, beta2=0.999) -opts = optimizer.minimize(avg_cost) +predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax") +cost = fluid.layers.cross_entropy(input=predict, label=label) +avg_cost = fluid.layers.mean(x=cost) +optimizer = fluid.optimizer.Adam(learning_rate=0.01) +optimizer.minimize(avg_cost) -accuracy, acc_out = evaluator.accuracy(input=predict, label=label) +accuracy = fluid.evaluator.Accuracy(input=predict, label=label) BATCH_SIZE = 50 PASS_NUM = 3 @@ -40,10 +35,10 @@ train_reader = paddle.batch( paddle.dataset.mnist.train(), buf_size=500), batch_size=BATCH_SIZE) -place = core.CPUPlace() -exe = Executor(place) +place = fluid.CPUPlace() +exe = fluid.Executor(place) -exe.run(framework.default_startup_program()) 
+exe.run(fluid.default_startup_program()) for pass_id in range(PASS_NUM): accuracy.reset(exe) @@ -53,17 +48,10 @@ for pass_id in range(PASS_NUM): y_data = np.array(map(lambda x: x[1], data)).astype("int64") y_data = y_data.reshape([BATCH_SIZE, 1]) - tensor_img = core.LoDTensor() - tensor_y = core.LoDTensor() - tensor_img.set(img_data, place) - tensor_y.set(y_data, place) - - outs = exe.run(framework.default_main_program(), - feed={"pixel": tensor_img, - "label": tensor_y}, - fetch_list=[avg_cost, acc_out]) - loss = np.array(outs[0]) - acc = np.array(outs[1]) + loss, acc = exe.run(fluid.default_main_program(), + feed={"pixel": img_data, + "label": y_data}, + fetch_list=[avg_cost] + accuracy.metrics) pass_acc = accuracy.eval(exe) print("pass_id=" + str(pass_id) + " acc=" + str(acc) + " pass_acc=" + str(pass_acc)) diff --git a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py index f57a5c8d98cd8b89e1d300b4d1fe00d6b24b0d68..fa18965aac667c0829b9e6ee56ece585564f9060 100644 --- a/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py +++ b/python/paddle/v2/fluid/tests/book/test_recognize_digits_mlp.py @@ -1,42 +1,43 @@ +from __future__ import print_function import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.core as core -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.evaluator as evaluator -from paddle.v2.fluid.io import get_inference_program -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.initializer import UniformInitializer -from paddle.v2.fluid.optimizer import MomentumOptimizer -from paddle.v2.fluid.regularizer import L2DecayRegularizer +import paddle.v2.fluid as fluid BATCH_SIZE = 128 -image = layers.data(name='x', shape=[784], dtype='float32') +image = fluid.layers.data(name='x', shape=[784], dtype='float32') -param_attr = { - 'name': None, - 'initializer': UniformInitializer( - low=-1.0, high=1.0), - 'regularization': L2DecayRegularizer(0.0005 * BATCH_SIZE) -} +regularizer = fluid.regularizer.L2Decay(0.0005 * BATCH_SIZE) -hidden1 = layers.fc(input=image, size=128, act='relu', param_attr=param_attr) -hidden2 = layers.fc(input=hidden1, size=64, act='relu', param_attr=param_attr) +hidden1 = fluid.layers.fc(input=image, + size=128, + act='relu', + param_attr=regularizer) +hidden2 = fluid.layers.fc(input=hidden1, + size=64, + act='relu', + param_attr=regularizer) -predict = layers.fc(input=hidden2, - size=10, - act='softmax', - param_attr=param_attr) +predict = fluid.layers.fc(input=hidden2, + size=10, + act='softmax', + param_attr=regularizer) -label = layers.data(name='y', shape=[1], dtype='int64') +label = fluid.layers.data(name='y', shape=[1], dtype='int64') -cost = layers.cross_entropy(input=predict, label=label) -avg_cost = layers.mean(x=cost) +cost = fluid.layers.cross_entropy(input=predict, label=label) +avg_cost = fluid.layers.mean(x=cost) -optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9) +optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9) opts = optimizer.minimize(avg_cost) -accuracy, acc_out = evaluator.accuracy(input=predict, label=label) +accuracy = fluid.evaluator.Accuracy(input=predict, label=label) + +inference_program = fluid.default_main_program().clone() +test_accuracy = fluid.evaluator.Accuracy( + input=predict, label=label, main_program=inference_program) +test_target = [avg_cost] + test_accuracy.metrics + test_accuracy.states +inference_program = 
fluid.io.get_inference_program( + test_target, main_program=inference_program) train_reader = paddle.batch( paddle.reader.shuffle( @@ -45,10 +46,10 @@ train_reader = paddle.batch( test_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=128) -place = core.CPUPlace() -exe = Executor(place) +place = fluid.CPUPlace() +exe = fluid.Executor(place) -exe.run(framework.default_startup_program()) +exe.run(fluid.default_startup_program()) PASS_NUM = 100 for pass_id in range(PASS_NUM): @@ -58,44 +59,30 @@ for pass_id in range(PASS_NUM): y_data = np.array(map(lambda x: x[1], data)).astype("int64") y_data = np.expand_dims(y_data, axis=1) - tensor_x = core.LoDTensor() + tensor_x = fluid.LoDTensor() tensor_x.set(x_data, place) - tensor_y = core.LoDTensor() + tensor_y = fluid.LoDTensor() tensor_y.set(y_data, place) - outs = exe.run(framework.default_main_program(), + outs = exe.run(fluid.default_main_program(), feed={'x': tensor_x, 'y': tensor_y}, - fetch_list=[avg_cost, acc_out]) + fetch_list=[avg_cost] + accuracy.metrics) out = np.array(outs[0]) acc = np.array(outs[1]) pass_acc = accuracy.eval(exe) - test_accuracy, test_acc_out = evaluator.accuracy( - input=predict, label=label) - - test_target = [avg_cost, test_acc_out] + test_accuracy.states().values() - inference_program = get_inference_program(test_target) - test_accuracy.reset(exe) for data in test_reader(): x_data = np.array(map(lambda x: x[0], data)).astype("float32") y_data = np.array(map(lambda x: x[1], data)).astype("int64") y_data = np.expand_dims(y_data, axis=1) - tensor_x = core.LoDTensor() - tensor_x.set(x_data, place) - - tensor_y = core.LoDTensor() - tensor_y.set(y_data, place) - - outs = exe.run(inference_program, - feed={'x': tensor_x, - 'y': tensor_y}, - fetch_list=[avg_cost, test_acc_out]) - out = np.array(outs[0]) - acc = np.array(outs[1]) + out, acc = exe.run(inference_program, + feed={'x': x_data, + 'y': y_data}, + fetch_list=[avg_cost] + test_accuracy.metrics) test_pass_acc = test_accuracy.eval(exe) print("pass_id=" + str(pass_id) + " train_cost=" + str( diff --git a/python/paddle/v2/fluid/tests/book/test_recommender_system.py b/python/paddle/v2/fluid/tests/book/test_recommender_system.py index f8dc1518579d5a9d7a8d0498dcc5fd8a6d1692c4..db91ca4f9c7d17fb51fc5d65a0464e976d98523c 100644 --- a/python/paddle/v2/fluid/tests/book/test_recommender_system.py +++ b/python/paddle/v2/fluid/tests/book/test_recommender_system.py @@ -24,7 +24,7 @@ def get_usr_combined_features(): input=uid, dtype='float32', size=[USR_DICT_SIZE, 32], - param_attr={'name': 'user_table'}, + param_attr='user_table', is_sparse=IS_SPARSE) usr_fc = layers.fc(input=usr_emb, size=32) @@ -36,7 +36,7 @@ def get_usr_combined_features(): usr_gender_emb = layers.embedding( input=usr_gender_id, size=[USR_GENDER_DICT_SIZE, 16], - param_attr={'name': 'gender_table'}, + param_attr='gender_table', is_sparse=IS_SPARSE) usr_gender_fc = layers.fc(input=usr_gender_emb, size=16) @@ -48,7 +48,7 @@ def get_usr_combined_features(): input=usr_age_id, size=[USR_AGE_DICT_SIZE, 16], is_sparse=IS_SPARSE, - param_attr={'name': 'age_table'}) + param_attr='age_table') usr_age_fc = layers.fc(input=usr_age_emb, size=16) @@ -58,7 +58,7 @@ def get_usr_combined_features(): usr_job_emb = layers.embedding( input=usr_job_id, size=[USR_JOB_DICT_SIZE, 16], - param_attr={'name': 'job_table'}, + param_attr='job_table', is_sparse=IS_SPARSE) usr_job_fc = layers.fc(input=usr_job_emb, size=16) @@ -81,7 +81,7 @@ def get_mov_combined_features(): input=mov_id, dtype='float32', size=[MOV_DICT_SIZE, 32], - 
param_attr={'name': 'movie_table'}, + param_attr='movie_table', is_sparse=IS_SPARSE) mov_fc = layers.fc(input=mov_emb, size=32) diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py index 3103be83a63d64fcba87132ddc5d830b92047b27..be875a952b7086ee64984525d70ffd3f1ecb5fae 100644 --- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py +++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_conv.py @@ -1,40 +1,35 @@ +from __future__ import print_function import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.core as core -import paddle.v2.fluid.evaluator as evaluator -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.layers as layers -import paddle.v2.fluid.nets as nets -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.optimizer import AdamOptimizer +import paddle.v2.fluid as fluid def convolution_net(input_dim, class_dim=2, emb_dim=32, hid_dim=32): - data = layers.data(name="words", shape=[1], dtype="int64") - label = layers.data(name="label", shape=[1], dtype="int64") + data = fluid.layers.data(name="words", shape=[1], dtype="int64") + label = fluid.layers.data(name="label", shape=[1], dtype="int64") - emb = layers.embedding(input=data, size=[input_dim, emb_dim]) - conv_3 = nets.sequence_conv_pool( + emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim]) + conv_3 = fluid.nets.sequence_conv_pool( input=emb, num_filters=hid_dim, filter_size=3, act="tanh", pool_type="sqrt") - conv_4 = nets.sequence_conv_pool( + conv_4 = fluid.nets.sequence_conv_pool( input=emb, num_filters=hid_dim, filter_size=4, act="tanh", pool_type="sqrt") - prediction = layers.fc(input=[conv_3, conv_4], - size=class_dim, - act="softmax") - cost = layers.cross_entropy(input=prediction, label=label) - avg_cost = layers.mean(x=cost) - adam_optimizer = AdamOptimizer(learning_rate=0.002) - opts = adam_optimizer.minimize(avg_cost) - accuracy, acc_out = evaluator.accuracy(input=prediction, label=label) - return avg_cost, accuracy, acc_out + prediction = fluid.layers.fc(input=[conv_3, conv_4], + size=class_dim, + act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002) + adam_optimizer.minimize(avg_cost) + accuracy = fluid.evaluator.Accuracy(input=prediction, label=label) + return avg_cost, accuracy, accuracy.metrics[0] def to_lodtensor(data, place): @@ -46,7 +41,7 @@ def to_lodtensor(data, place): lod.append(cur_len) flattened_data = np.concatenate(data, axis=0).astype("int64") flattened_data = flattened_data.reshape([len(flattened_data), 1]) - res = core.LoDTensor() + res = fluid.LoDTensor() res.set(flattened_data, place) res.set_lod([lod]) return res @@ -67,10 +62,10 @@ def main(): paddle.reader.shuffle( paddle.dataset.imdb.train(word_dict), buf_size=1000), batch_size=BATCH_SIZE) - place = core.CPUPlace() - exe = Executor(place) + place = fluid.CPUPlace() + exe = fluid.Executor(place) - exe.run(framework.default_startup_program()) + exe.run(fluid.default_startup_program()) for pass_id in xrange(PASS_NUM): accuracy.reset(exe) @@ -80,15 +75,14 @@ def main(): label = np.array(map(lambda x: x[1], data)).astype("int64") label = label.reshape([BATCH_SIZE, 1]) - tensor_label = core.LoDTensor() + tensor_label = fluid.LoDTensor() tensor_label.set(label, place) - outs = exe.run(framework.default_main_program(), - feed={"words": 
tensor_words, - "label": tensor_label}, - fetch_list=[cost, acc_out]) - cost_val = np.array(outs[0]) - acc_val = np.array(outs[1]) + cost_val, acc_val = exe.run( + fluid.default_main_program(), + feed={"words": tensor_words, + "label": tensor_label}, + fetch_list=[cost, acc_out]) pass_acc = accuracy.eval(exe) print("cost=" + str(cost_val) + " acc=" + str(acc_val) + " pass_acc=" + str(pass_acc)) diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py index 208978224f4e83a23efadae37fbe51d0d59dafe8..094a3cdcda12eaee351476e99a388c44b3c81cd6 100644 --- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py +++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_dynamic_lstm.py @@ -1,11 +1,6 @@ import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.core as core -import paddle.v2.fluid.evaluator as evaluator -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.optimizer import AdamOptimizer +import paddle.v2.fluid as fluid def stacked_lstm_net(input_dim, @@ -14,36 +9,36 @@ def stacked_lstm_net(input_dim, hid_dim=512, stacked_num=3): assert stacked_num % 2 == 1 - data = layers.data(name="words", shape=[1], dtype="int64") - label = layers.data(name="label", shape=[1], dtype="int64") + data = fluid.layers.data(name="words", shape=[1], dtype="int64") + label = fluid.layers.data(name="label", shape=[1], dtype="int64") - emb = layers.embedding(input=data, size=[input_dim, emb_dim]) + emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim]) # add bias attr # TODO(qijun) linear act - fc1 = layers.fc(input=emb, size=hid_dim) - lstm1, cell1 = layers.dynamic_lstm(input=fc1, size=hid_dim) + fc1 = fluid.layers.fc(input=emb, size=hid_dim) + lstm1, cell1 = fluid.layers.dynamic_lstm(input=fc1, size=hid_dim) inputs = [fc1, lstm1] for i in range(2, stacked_num + 1): - fc = layers.fc(input=inputs, size=hid_dim) - lstm, cell = layers.dynamic_lstm( + fc = fluid.layers.fc(input=inputs, size=hid_dim) + lstm, cell = fluid.layers.dynamic_lstm( input=fc, size=hid_dim, is_reverse=(i % 2) == 0) inputs = [fc, lstm] - fc_last = layers.sequence_pool(input=inputs[0], pool_type='max') - lstm_last = layers.sequence_pool(input=inputs[1], pool_type='max') + fc_last = fluid.layers.sequence_pool(input=inputs[0], pool_type='max') + lstm_last = fluid.layers.sequence_pool(input=inputs[1], pool_type='max') - prediction = layers.fc(input=[fc_last, lstm_last], - size=class_dim, - act='softmax') - cost = layers.cross_entropy(input=prediction, label=label) - avg_cost = layers.mean(x=cost) - adam_optimizer = AdamOptimizer(learning_rate=0.002) - opts = adam_optimizer.minimize(avg_cost) - accuracy, acc_out = evaluator.accuracy(input=prediction, label=label) - return avg_cost, accuracy, acc_out + prediction = fluid.layers.fc(input=[fc_last, lstm_last], + size=class_dim, + act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002) + adam_optimizer.minimize(avg_cost) + accuracy = fluid.evaluator.Accuracy(input=prediction, label=label) + return avg_cost, accuracy, accuracy.metrics[0] def to_lodtensor(data, place): @@ -55,7 +50,7 @@ def to_lodtensor(data, place): lod.append(cur_len) flattened_data = np.concatenate(data, axis=0).astype("int64") flattened_data = 
flattened_data.reshape([len(flattened_data), 1]) - res = core.LoDTensor() + res = fluid.LoDTensor() res.set(flattened_data, place) res.set_lod([lod]) return res @@ -77,10 +72,10 @@ def main(): paddle.reader.shuffle( paddle.dataset.imdb.train(word_dict), buf_size=1000), batch_size=BATCH_SIZE) - place = core.CPUPlace() - exe = Executor(place) + place = fluid.CPUPlace() + exe = fluid.Executor(place) - exe.run(framework.default_startup_program()) + exe.run(fluid.default_startup_program()) for pass_id in xrange(PASS_NUM): accuracy.reset(exe) @@ -90,15 +85,14 @@ def main(): label = np.array(map(lambda x: x[1], data)).astype("int64") label = label.reshape([BATCH_SIZE, 1]) - tensor_label = core.LoDTensor() + tensor_label = fluid.LoDTensor() tensor_label.set(label, place) - outs = exe.run(framework.default_main_program(), - feed={"words": tensor_words, - "label": tensor_label}, - fetch_list=[cost, acc_out]) - cost_val = np.array(outs[0]) - acc_val = np.array(outs[1]) + cost_val, acc_val = exe.run( + fluid.default_main_program(), + feed={"words": tensor_words, + "label": tensor_label}, + fetch_list=[cost, acc_out]) pass_acc = accuracy.eval(exe) print("cost=" + str(cost_val) + " acc=" + str(acc_val) + " pass_acc=" + str(pass_acc)) diff --git a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py index 8aebeba653cf49438929fa51312b5af33c3b438d..b2479320330bde5771c3d4a8e2923b5ab1eecf2e 100644 --- a/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py +++ b/python/paddle/v2/fluid/tests/book/test_understand_sentiment_lstm.py @@ -1,40 +1,39 @@ import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.core as core -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.optimizer import AdamOptimizer +import paddle.v2.fluid as fluid def lstm_net(dict_dim, class_dim=2, emb_dim=32, seq_len=80, batch_size=50): - data = layers.data( + data = fluid.layers.data( name="words", shape=[seq_len * batch_size, 1], append_batch_size=False, dtype="int64") - label = layers.data( + label = fluid.layers.data( name="label", shape=[batch_size, 1], append_batch_size=False, dtype="int64") - emb = layers.embedding(input=data, size=[dict_dim, emb_dim]) - emb = layers.reshape(x=emb, shape=[batch_size, seq_len, emb_dim]) - emb = layers.transpose(x=emb, axis=[1, 0, 2]) + emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) + emb = fluid.layers.reshape(x=emb, shape=[batch_size, seq_len, emb_dim]) + emb = fluid.layers.transpose(x=emb, axis=[1, 0, 2]) - c_pre_init = layers.fill_constant( + c_pre_init = fluid.layers.fill_constant( dtype=emb.dtype, shape=[batch_size, emb_dim], value=0.0) - layer_1_out = layers.lstm(emb, c_pre_init=c_pre_init, hidden_dim=emb_dim) - layer_1_out = layers.transpose(x=layer_1_out, axis=[1, 0, 2]) + layer_1_out = fluid.layers.lstm( + emb, c_pre_init=c_pre_init, hidden_dim=emb_dim) + layer_1_out = fluid.layers.transpose(x=layer_1_out, axis=[1, 0, 2]) - prediction = layers.fc(input=layer_1_out, size=class_dim, act="softmax") - cost = layers.cross_entropy(input=prediction, label=label) + prediction = fluid.layers.fc(input=layer_1_out, + size=class_dim, + act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) - avg_cost = layers.mean(x=cost) - adam_optimizer = AdamOptimizer(learning_rate=0.002) - opts = adam_optimizer.minimize(avg_cost) - acc = 
layers.accuracy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002) + adam_optimizer.minimize(avg_cost) + acc = fluid.layers.accuracy(input=prediction, label=label) return avg_cost, acc @@ -48,7 +47,7 @@ def to_lodtensor(data, place): lod.append(cur_len) flattened_data = np.concatenate(data, axis=0).astype("int64") flattened_data = flattened_data.reshape([len(flattened_data), 1]) - res = core.LoDTensor() + res = fluid.LoDTensor() res.set(flattened_data, place) res.set_lod([lod]) return res @@ -65,7 +64,7 @@ def prepare_feed_data(data, place): label = np.array(map(lambda x: x[1], data)).astype("int64") label = label.reshape([len(label), 1]) - tensor_label = core.LoDTensor() + tensor_label = fluid.LoDTensor() tensor_label.set(label, place) return tensor_words, tensor_label @@ -86,17 +85,17 @@ def main(): paddle.reader.shuffle( paddle.dataset.imdb.train(word_dict), buf_size=BATCH_SIZE * 10), batch_size=BATCH_SIZE) - place = core.CPUPlace() - exe = Executor(place) + place = fluid.CPUPlace() + exe = fluid.Executor(place) - exe.run(framework.default_startup_program()) + exe.run(fluid.default_startup_program()) for pass_id in xrange(PASS_NUM): for data in train_data(): chopped_data = chop_data(data) tensor_words, tensor_label = prepare_feed_data(chopped_data, place) - outs = exe.run(framework.default_main_program(), + outs = exe.run(fluid.default_main_program(), feed={"words": tensor_words, "label": tensor_label}, fetch_list=[cost, acc]) diff --git a/python/paddle/v2/fluid/tests/book/test_word2vec.py b/python/paddle/v2/fluid/tests/book/test_word2vec.py index 0629e1cab7fd7e501d9cbf3ae8ee22fe9383ad2b..1b441e15c72c85c3d44c39b0f685a88db2304eef 100644 --- a/python/paddle/v2/fluid/tests/book/test_word2vec.py +++ b/python/paddle/v2/fluid/tests/book/test_word2vec.py @@ -1,10 +1,6 @@ import numpy as np import paddle.v2 as paddle -import paddle.v2.fluid.core as core -import paddle.v2.fluid.framework as framework -import paddle.v2.fluid.layers as layers -from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.optimizer import SGDOptimizer +import paddle.v2.fluid as fluid PASS_NUM = 100 EMBED_SIZE = 32 @@ -16,57 +12,53 @@ IS_SPARSE = True word_dict = paddle.dataset.imikolov.build_dict() dict_size = len(word_dict) -first_word = layers.data(name='firstw', shape=[1], dtype='int64') -second_word = layers.data(name='secondw', shape=[1], dtype='int64') -third_word = layers.data(name='thirdw', shape=[1], dtype='int64') -forth_word = layers.data(name='forthw', shape=[1], dtype='int64') -next_word = layers.data(name='nextw', shape=[1], dtype='int64') +first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64') +second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64') +third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64') +forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64') +next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64') -embed_first = layers.embedding( +embed_first = fluid.layers.embedding( input=first_word, size=[dict_size, EMBED_SIZE], dtype='float32', is_sparse=IS_SPARSE, - param_attr={'name': 'shared_w'}) -embed_second = layers.embedding( + param_attr='shared_w') +embed_second = fluid.layers.embedding( input=second_word, size=[dict_size, EMBED_SIZE], dtype='float32', is_sparse=IS_SPARSE, - param_attr={'name': 'shared_w'}) -embed_third = layers.embedding( + param_attr='shared_w') +embed_third = fluid.layers.embedding( input=third_word, 
size=[dict_size, EMBED_SIZE], dtype='float32', is_sparse=IS_SPARSE, - param_attr={'name': 'shared_w'}) -embed_forth = layers.embedding( + param_attr='shared_w') +embed_forth = fluid.layers.embedding( input=forth_word, size=[dict_size, EMBED_SIZE], dtype='float32', is_sparse=IS_SPARSE, - param_attr={'name': 'shared_w'}) + param_attr='shared_w') -concat_embed = layers.concat( +concat_embed = fluid.layers.concat( input=[embed_first, embed_second, embed_third, embed_forth], axis=1) -hidden1 = layers.fc(input=concat_embed, size=HIDDEN_SIZE, act='sigmoid') -predict_word = layers.fc(input=hidden1, size=dict_size, act='softmax') -cost = layers.cross_entropy(input=predict_word, label=next_word) -avg_cost = layers.mean(x=cost) -sgd_optimizer = SGDOptimizer(learning_rate=0.001) -opts = sgd_optimizer.minimize(avg_cost) +hidden1 = fluid.layers.fc(input=concat_embed, size=HIDDEN_SIZE, act='sigmoid') +predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax') +cost = fluid.layers.cross_entropy(input=predict_word, label=next_word) +avg_cost = fluid.layers.mean(x=cost) +sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001) +sgd_optimizer.minimize(avg_cost) train_reader = paddle.batch( paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE) -place = core.CPUPlace() -exe = Executor(place) +place = fluid.CPUPlace() +exe = fluid.Executor(place) -# fix https://github.com/PaddlePaddle/Paddle/issues/5434 then remove -# below exit line. -exit(0) - -exe.run(framework.default_startup_program()) +exe.run(fluid.default_startup_program()) for pass_id in range(PASS_NUM): for data in train_reader(): @@ -74,36 +66,15 @@ for pass_id in range(PASS_NUM): input_data = map(lambda x: np.array(x).astype("int64"), input_data) input_data = map(lambda x: np.expand_dims(x, axis=1), input_data) - first_data = input_data[0] - first_tensor = core.LoDTensor() - first_tensor.set(first_data, place) - - second_data = input_data[1] - second_tensor = core.LoDTensor() - second_tensor.set(second_data, place) - - third_data = input_data[2] - third_tensor = core.LoDTensor() - third_tensor.set(third_data, place) - - forth_data = input_data[3] - forth_tensor = core.LoDTensor() - forth_tensor.set(forth_data, place) - - next_data = input_data[4] - next_tensor = core.LoDTensor() - next_tensor.set(next_data, place) - - outs = exe.run(framework.default_main_program(), - feed={ - 'firstw': first_tensor, - 'secondw': second_tensor, - 'thirdw': third_tensor, - 'forthw': forth_tensor, - 'nextw': next_tensor - }, - fetch_list=[avg_cost]) - out = np.array(outs[0]) - if out[0] < 10.0: + avg_cost_np = exe.run(fluid.default_main_program(), + feed={ + 'firstw': input_data[0], + 'secondw': input_data[1], + 'thirdw': input_data[2], + 'forthw': input_data[3], + 'nextw': input_data[4] + }, + fetch_list=[avg_cost]) + if avg_cost_np[0] < 5.0: exit(0) # if avg cost is less than 5.0, we think our code is good.
exit(1) diff --git a/python/paddle/v2/fluid/tests/op_test.py b/python/paddle/v2/fluid/tests/op_test.py index 51023bd19a8326152335eabc9e96600427527f26..e83c4a0622013cbfebdf39434ef252412697acb1 100644 --- a/python/paddle/v2/fluid/tests/op_test.py +++ b/python/paddle/v2/fluid/tests/op_test.py @@ -261,7 +261,10 @@ class OpTest(unittest.TestCase): feed_map = self.feed_var(inputs, place) exe = Executor(place) - outs = exe.run(program, feed=feed_map, fetch_list=fetch_list) + outs = exe.run(program, + feed=feed_map, + fetch_list=fetch_list, + return_numpy=False) for out_name, out_dup in Operator.get_op_outputs(self.op_type): if out_name not in self.outputs: @@ -500,5 +503,6 @@ class OpTest(unittest.TestCase): fetch_list = [g for p, g in param_grad_list] executor = Executor(place) - result = executor.run(prog, feed_dict, fetch_list) - return map(np.array, result) + return map( + np.array, + executor.run(prog, feed_dict, fetch_list, return_numpy=False)) diff --git a/python/paddle/v2/fluid/tests/test_activation_op.py b/python/paddle/v2/fluid/tests/test_activation_op.py index 7649e60a3833e34523d87cb963af3888c3cef65d..bd52bef2605874d26e880fb09e589891fc1934d5 100644 --- a/python/paddle/v2/fluid/tests/test_activation_op.py +++ b/python/paddle/v2/fluid/tests/test_activation_op.py @@ -152,6 +152,49 @@ class TestAbs(OpTest): self.check_grad(['X'], 'Y', max_relative_error=0.007) +class TestCeil(OpTest): + def setUp(self): + self.op_type = "ceil" + x = np.random.uniform(-1, 1, [4, 4]).astype("float32") + self.inputs = {'X': x} + self.outputs = {'Y': np.ceil(self.inputs['X'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y', max_relative_error=0.007) + + +class TestFloor(OpTest): + def setUp(self): + self.op_type = "floor" + x = np.random.uniform(-1, 1, [4, 4]).astype("float32") + self.inputs = {'X': x} + # numpy floor need +1 + self.outputs = {'Y': np.floor(self.inputs['X']) + 1.0} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y', max_relative_error=0.007) + + +class TestRound(OpTest): + def setUp(self): + self.op_type = "round" + x = np.random.uniform(-1, 1, [4, 4]).astype("float32") + self.inputs = {'X': x} + self.outputs = {'Y': np.round(self.inputs['X'])} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Y', max_relative_error=0.007) + + class TestRelu(OpTest): def setUp(self): self.op_type = "relu" diff --git a/python/paddle/v2/fluid/tests/test_array_read_write_op.py b/python/paddle/v2/fluid/tests/test_array_read_write_op.py index e019a4e15f0e25deaedf30911b44e576c8f89013..f6120aedecf1015c279b8f218f5e37f2e598ab91 100644 --- a/python/paddle/v2/fluid/tests/test_array_read_write_op.py +++ b/python/paddle/v2/fluid/tests/test_array_read_write_op.py @@ -3,7 +3,7 @@ import paddle.v2.fluid.core as core import paddle.v2.fluid.layers as layers from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.backward import append_backward_ops -from paddle.v2.fluid.framework import g_main_program +from paddle.v2.fluid.framework import default_main_program import numpy @@ -52,15 +52,13 @@ class TestArrayReadWrite(unittest.TestCase): exe = Executor(cpu) - tensor = core.LoDTensor() - tensor.set(numpy.random.random(size=(100, 100)).astype('float32'), cpu) + tensor = numpy.random.random(size=(100, 100)).astype('float32') - outs = map(numpy.array, - exe.run(feed={'x0': tensor, - 'x1': tensor, - 'x2': tensor}, - 
fetch_list=[a_sum, x_sum], - scope=scope)) + outs = exe.run(feed={'x0': tensor, + 'x1': tensor, + 'x2': tensor}, + fetch_list=[a_sum, x_sum], + scope=scope) self.assertEqual(outs[0], outs[1]) total_sum = layers.sums(input=[a_sum, x_sum]) @@ -68,16 +66,15 @@ class TestArrayReadWrite(unittest.TestCase): append_backward_ops(total_sum_scaled) - g_vars = map(g_main_program.global_block().var, + g_vars = map(default_main_program().global_block().var, [each_x.name + "@GRAD" for each_x in x]) g_out = [ item.sum() - for item in map( - numpy.array, - exe.run(feed={'x0': tensor, - 'x1': tensor, - 'x2': tensor}, - fetch_list=g_vars)) + for item in exe.run( + feed={'x0': tensor, + 'x1': tensor, + 'x2': tensor}, + fetch_list=g_vars) ] g_out_sum = numpy.array(g_out).sum() diff --git a/python/paddle/v2/fluid/tests/test_batch_norm_op.py b/python/paddle/v2/fluid/tests/test_batch_norm_op.py index 71f9599e0de83c86808f7e62547f80d3d50ffc7d..e766a68c0e338b07e47260e40edc544c98555382 100644 --- a/python/paddle/v2/fluid/tests/test_batch_norm_op.py +++ b/python/paddle/v2/fluid/tests/test_batch_norm_op.py @@ -21,6 +21,13 @@ def get_backward_op(scope, op, no_grad_set): def _reference_training(x, scale, offset, epsilon, data_format): + x_shape = x.shape + if len(x_shape) == 2: + if data_format == "NCHW": + x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1)) + else: + x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1])) + if data_format == "NCHW": n, c, h, w = x.shape x_square = x * x @@ -39,6 +46,8 @@ def _reference_training(x, scale, offset, epsilon, data_format): offset_tile = np.reshape(offset, (1, c, 1, 1)) offset_tile = np.reshape(offset_tile, (1, c, 1, 1)) y = normalized * scale_tile + offset_tile + if len(x_shape) == 2: + y = np.reshape(y, (y.shape[0], y.shape[1])) return y, mean, var elif data_format == "NHWC": x_square = x * x @@ -48,7 +57,10 @@ def _reference_training(x, scale, offset, epsilon, data_format): mean = x_sum / element_count var = x_square_sum / element_count - mean * mean normalized = (x - mean) / np.sqrt(var + epsilon) - return (normalized * scale + offset), mean, var + y = normalized * scale + offset + if len(x_shape) == 2: + y = np.reshape(y, x_shape) + return y, mean, var else: raise ValueError("Unknown data order.") @@ -65,6 +77,18 @@ def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format): # (x - mean) * sum(grad_y * (x - mean)) / (var + epsilon)) # transfer from (N, C, H, W) to (N, H, W, C) to simplify computation + x_shape = x.shape + + if len(x_shape) == 2: + if data_format == "NCHW": + x = np.reshape(x, (x.shape[0], x.shape[1], 1, 1)) + grad_y = np.reshape(grad_y, + (grad_y.shape[0], grad_y.shape[1], 1, 1)) + else: + x = np.reshape(x, (x.shape[0], 1, 1, x.shape[1])) + grad_y = np.reshape(grad_y, + (grad_y.shape[0], 1, 1, grad_y.shape[1])) + if data_format == "NCHW": x = np.transpose(x, (0, 2, 3, 1)) grad_y = np.transpose(grad_y, (0, 2, 3, 1)) @@ -83,6 +107,9 @@ def _reference_grad(x, grad_y, scale, mean, var, epsilon, data_format): grad_x = np.transpose(grad_x, (0, 3, 1, 2)) x = np.transpose(x, (0, 3, 1, 2)) grad_y = np.transpose(grad_y, (0, 3, 1, 2)) + + if len(x_shape) == 2: + grad_x = np.reshape(grad_x, x_shape) return grad_x, grad_scale, grad_offset @@ -127,7 +154,7 @@ class TestBatchNormOp(OpTest): momentum = 0.9 # N, H, W, C: 2, 3, 4, 2 - n, h, w, c = 2, 3, 4, 2 + n, h, w, c = 2, 3, 4, 5 x_shape = [n, h, w, c] scale_shape = [c] @@ -184,20 +211,23 @@ class TestBatchNormOp(OpTest): print 'python: NHWC, NCHW, backward checking passed' def test_forward_backward(self): - def 
test_with_place(place, tensor_format): + def test_with_place(place, tensor_format, shape): # attr epsilon = 0.00001 momentum = 0.9 - # N, H, W, C: 12, 3, 4, 2 - n, h, w, c = 2, 3, 4, 2 - - if data_format == "NHWC": - x_shape = [n, h, w, c] - elif data_format == "NCHW": - x_shape = [n, c, h, w] + if len(shape) == 2: + x_shape = shape + c = shape[1] else: - raise ValueError("Unknown data type.") + # n, h, w, c = 2, 3, 4, 2 + n, h, w, c = shape[0], shape[1], shape[2], shape[3] + if data_format == "NHWC": + x_shape = [n, h, w, c] + elif data_format == "NCHW": + x_shape = [n, c, h, w] + else: + raise ValueError("Unknown data type.") scale_shape = [c] x_val = np.random.random_sample(x_shape).astype(np.float32) @@ -219,7 +249,10 @@ class TestBatchNormOp(OpTest): # for gradient test # y_grad = np.ones(x_shape).astype(np.float32) y_grad = np.zeros(x_shape).astype(np.float32) - y_grad[0, 0, 0, 0] = 1. + if len(y_grad.shape) == 2: + y_grad[0, 0] = 1. + else: + y_grad[0, 0, 0, 0] = 1. # y_grad = np.random.random_sample(x_shape).astype(np.float32) x_grad_ref, scale_grad_ref, bias_grad_ref = _reference_grad( x_val, y_grad, scale_val, saved_mean, var_ref, epsilon, @@ -313,7 +346,8 @@ class TestBatchNormOp(OpTest): places.append(core.GPUPlace(0)) for place in places: for data_format in ["NCHW", "NHWC"]: - test_with_place(place, data_format) + test_with_place(place, data_format, [2, 3, 4, 5]) + test_with_place(place, data_format, [2, 3]) if __name__ == '__main__': diff --git a/python/paddle/v2/fluid/tests/test_conditional_block.py b/python/paddle/v2/fluid/tests/test_conditional_block.py index 2a30fd107968ce0fa188bda44e731ad760dce1f5..2b9d8f351a2836cd723d629d4790de1e068d0ea3 100644 --- a/python/paddle/v2/fluid/tests/test_conditional_block.py +++ b/python/paddle/v2/fluid/tests/test_conditional_block.py @@ -1,7 +1,7 @@ import unittest import paddle.v2.fluid.layers as layers import paddle.v2.fluid.core as core -from paddle.v2.fluid.framework import g_startup_program, g_main_program +from paddle.v2.fluid.framework import default_startup_program, default_main_program from paddle.v2.fluid.executor import Executor from paddle.v2.fluid.backward import append_backward_ops import numpy @@ -19,20 +19,19 @@ class ConditionalBlock(unittest.TestCase): cpu = core.CPUPlace() exe = Executor(cpu) - exe.run(g_startup_program) + exe.run(default_startup_program()) - x = core.LoDTensor() - x.set(numpy.random.random(size=(10, 1)).astype('float32'), cpu) + x = numpy.random.random(size=(10, 1)).astype('float32') - outs = map(numpy.array, exe.run(feed={'X': x}, fetch_list=[out]))[0] + outs = exe.run(feed={'X': x}, fetch_list=[out])[0] print outs loss = layers.mean(x=out) append_backward_ops(loss=loss) - outs = map(numpy.array, - exe.run(feed={'X': x}, - fetch_list=[ - g_main_program.block(0).var(data.name + "@GRAD") - ]))[0] + outs = exe.run( + feed={'X': x}, + fetch_list=[ + default_main_program().block(0).var(data.name + "@GRAD") + ])[0] print outs diff --git a/python/paddle/v2/fluid/tests/test_conv2d_op.py b/python/paddle/v2/fluid/tests/test_conv2d_op.py index 2240dc73cdd31f320fed174dd811e93c6640137f..e82e3ab0c9c0bc75a13a8948fda925bc4f0b6512 100644 --- a/python/paddle/v2/fluid/tests/test_conv2d_op.py +++ b/python/paddle/v2/fluid/tests/test_conv2d_op.py @@ -16,8 +16,8 @@ def conv2d_forward_naive(input, filter, group, conv_param): out_w = 1 + (in_w + 2 * pad[1] - (dilation[1] * (f_w - 1) + 1)) / stride[1] out = np.zeros((in_n, out_c, out_h, out_w)) - d_bolck_w = (dilation[0] * (f_h - 1) + 1) - d_bolck_h = (dilation[1] * (f_w - 1) + 
1) + d_bolck_h = (dilation[0] * (f_h - 1) + 1) + d_bolck_w = (dilation[1] * (f_w - 1) + 1) input_pad = np.pad(input, ((0, ), (0, ), (pad[0], ), (pad[1], )), mode='constant', @@ -167,27 +167,27 @@ class TestWithDilation(TestConv2dOp): #----------------Conv2dCudnn---------------- class TestCudnn(TestConv2dOp): def init_op_type(self): - self.op_type = "conv_cudnn" + self.op_type = "conv2d_cudnn" class TestCudnnWithPad(TestWithPad): def init_op_type(self): - self.op_type = "conv_cudnn" + self.op_type = "conv2d_cudnn" class TestCudnnWithStride(TestWithStride): def init_op_type(self): - self.op_type = "conv_cudnn" + self.op_type = "conv2d_cudnn" class TestCudnnWithGroup(TestWithGroup): def init_op_type(self): - self.op_type = "conv_cudnn" + self.op_type = "conv2d_cudnn" class TestCudnnWith1x1(TestWith1x1): def init_op_type(self): - self.op_type = "conv_cudnn" + self.op_type = "conv2d_cudnn" # cudnn v5 does not support dilation conv. diff --git a/python/paddle/v2/fluid/tests/test_conv3d_op.py b/python/paddle/v2/fluid/tests/test_conv3d_op.py index 934ea46437d67b78309a86a2779e0c6577399136..8593dff20b5c283d5862206dfb0c0d2501039d07 100644 --- a/python/paddle/v2/fluid/tests/test_conv3d_op.py +++ b/python/paddle/v2/fluid/tests/test_conv3d_op.py @@ -169,5 +169,31 @@ class TestWithDilation(TestConv3dOp): self.groups = 3 +class TestCudnn(TestConv3dOp): + def init_op_type(self): + self.op_type = "conv3d_cudnn" + + +class TestWithGroup1Cudnn(TestWithGroup1): + def init_op_type(self): + self.op_type = "conv3d_cudnn" + + +class TestWithGroup2Cudnn(TestWithGroup2): + def init_op_type(self): + self.op_type = "conv3d_cudnn" + + +class TestWith1x1Cudnn(TestWith1x1): + def init_op_type(self): + self.op_type = "conv3d_cudnn" + + +# FIXME(typhoonzero): find a way to determine if +# using cudnn > 6 in python +# class TestWithDilationCudnn(TestWithDilation): +# def init_op_type(self): +# self.op_type = "conv3d_cudnn" + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_dynamic_recurrent_op.py b/python/paddle/v2/fluid/tests/test_dynamic_recurrent_op.py deleted file mode 100644 index c2d8b48ea944ae40a451492b8e9fad38dda0835c..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/tests/test_dynamic_recurrent_op.py +++ /dev/null @@ -1,171 +0,0 @@ -import logging -import paddle.v2.fluid.core as core -import unittest -from paddle.v2.fluid.op import Operator, DynamicRecurrentOp -import numpy as np - -# for siplicity, just one level LoD -lod_py = [[0, 4, 7, 9, 10]] -input_dim = 30 -num_sents = len(lod_py[0]) - 1 -weight_dim = 15 - - -def create_tensor(scope, name, shape, np_data): - tensor = scope.var(name).get_tensor() - tensor.set_dims(shape) - tensor.set(np_data, core.CPUPlace()) - return tensor - - -class PyRNNStep(object): - def __init__(self): - - self.x = np.random.normal(size=(lod_py[0][-1], - input_dim)).astype("float32") - self.W = np.random.normal(size=(input_dim, input_dim)).astype("float32") - self.U = np.random.normal(size=(input_dim, input_dim)).astype("float32") - self.h_boot = np.random.normal(size=(num_sents, - input_dim)).astype("float32") - - -class DynamicRecurrentOpTest(unittest.TestCase): - ''' - Test RNNOp - - equation: - h_t = \sigma (W x_t + U h_{t-1}) - weights: - - W - - U - vars: - - x - states: - - h - outputs: - - h - ''' - - py = PyRNNStep() - - def forward(self): - self.scope = core.Scope() - self.create_global_variables() - self.create_rnn_op() - self.create_step_net() - ctx = core.DeviceContext.create(core.CPUPlace()) - 
self.rnnop.run(self.scope, ctx) - state = self.rnnop.get_state("h@state") - print 'state size: ', state.size() - - step_inputs = self.rnnop.get_step_input("x") - print "x size ", step_inputs.size() - for i in range(step_inputs.size()): - print "x %d" % i, np.array(step_inputs.read(i).get_dims()) - step_outputs = self.rnnop.get_step_output('h@state') - print 'step_outputs.size ', step_outputs.size() - output = self.scope.find_var("h@state").get_tensor() - print 'output', np.array(output).shape - - def create_global_variables(self): - # create inlink - x_tensor = create_tensor(self.scope, "x", [num_sents, input_dim], - self.py.x) - x_tensor.set_lod(lod_py) - create_tensor(self.scope, "W", [input_dim, input_dim], self.py.W) - create_tensor(self.scope, "U", [input_dim, input_dim], self.py.U) - create_tensor(self.scope, "h_boot", [num_sents, input_dim], - self.py.h_boot) - self.scope.var("step_scopes") - self.scope.var("h@state") - - def create_rnn_op(self): - # create RNNOp - self.rnnop = DynamicRecurrentOp( - # inputs - inputs=["x"], - initial_states=["h_boot"], - step_net="step_unit", - # outputs - outputs=["h@state"], - step_scopes="step_scopes", - # attributes - ex_states=["h@pre"], - states=["h@state"]) - - def create_step_net(self): - step_unit = core.Net.create() - x_fc_op = Operator("mul", X="x", Y="W", Out="Wx") - h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh") - sum_op = Operator("sum", X=["Wx", "Uh"], Out="sum") - sig_op = Operator("sigmoid", X="sum", Y="h@state") - - for op in [x_fc_op, h_fc_op, sum_op, sig_op]: - step_unit.append_op(op) - step_unit.complete_add_op(True) - self.rnnop.set_step_unit(step_unit) - - def test_forward(self): - print 'test recurrent op forward' - pd_output = self.forward() - print 'pd_output', pd_output - - -class RecurrentGradientOpTest(unittest.TestCase): - py = PyRNNStep() - - def create_forward_op(self): - # create RNNOp - self.forward_op = DynamicRecurrentOp( - # inputs - inputs=["x"], - initial_states=["h_boot"], - step_net="step_unit", - # outputs - outputs=["h@state"], - step_scopes="step_scopes", - # attributes - ex_states=["h@pre"], - states=["h@state"]) - - def create_gradient_op(self): - a = set() - backward_op = core.DynamicRecurrentOp.backward(self.forward_op, a) - - def create_step_net(self): - step_unit = core.Net.create() - x_fc_op = Operator("mul", X="x", Y="W", Out="Wx") - h_fc_op = Operator("mul", X="h@pre", Y="U", Out="Uh") - sum_op = Operator("sum", X=["Wx", "Uh"], Out="sum") - sig_op = Operator("sigmoid", X="sum", Y="h@state") - - for op in [x_fc_op, h_fc_op, sum_op, sig_op]: - step_unit.append_op(op) - step_unit.complete_add_op(True) - self.forward_op.set_step_unit(step_unit) - - def create_global_variables(self): - # create inlink - x_tensor = create_tensor(self.scope, "x", [num_sents, input_dim], - self.py.x) - x_tensor.set_lod(lod_py) - create_tensor(self.scope, "W", [input_dim, input_dim], self.py.W) - create_tensor(self.scope, "U", [input_dim, input_dim], self.py.U) - create_tensor(self.scope, "h_boot", [num_sents, input_dim], - self.py.h_boot) - self.scope.var("step_scopes") - self.scope.var("h@state") - - def test_grad(self): - self.scope = core.Scope() - self.create_forward_op() - self.create_global_variables() - self.create_step_net() - self.create_gradient_op() - - -if __name__ == '__main__': - exit( - 0 - ) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957 - unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_executor_and_mul.py 
b/python/paddle/v2/fluid/tests/test_executor_and_mul.py index da64739de5eb4eca8db8ac8370276c41692a7242..b1ef87c5cb1711c419b401c5950839816f7f4160 100644 --- a/python/paddle/v2/fluid/tests/test_executor_and_mul.py +++ b/python/paddle/v2/fluid/tests/test_executor_and_mul.py @@ -1,9 +1,10 @@ import unittest -from paddle.v2.fluid.layers import mul, data + +import numpy import paddle.v2.fluid.core as core + from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.framework import g_main_program -import numpy +from paddle.v2.fluid.layers import mul, data class TestExecutor(unittest.TestCase): @@ -17,17 +18,10 @@ class TestExecutor(unittest.TestCase): out = mul(x=a, y=b) place = core.CPUPlace() a_np = numpy.random.random((100, 784)).astype('float32') - tensor_a = core.LoDTensor() - tensor_a.set(a_np, place) b_np = numpy.random.random((784, 100)).astype('float32') - tensor_b = core.LoDTensor() - tensor_b.set(b_np, place) exe = Executor(place) - outs = exe.run(g_main_program, - feed={'a': tensor_a, - 'b': tensor_b}, - fetch_list=[out]) - out = numpy.array(outs[0]) + outs = exe.run(feed={'a': a_np, 'b': b_np}, fetch_list=[out]) + out = outs[0] self.assertEqual((100, 100), out.shape) self.assertTrue(numpy.allclose(out, numpy.dot(a_np, b_np))) diff --git a/python/paddle/v2/fluid/tests/test_image_classification_layer.py b/python/paddle/v2/fluid/tests/test_image_classification_layer.py index 8e8e1b0a8c07a60cb1404462f976d10fe26e87f6..2fd609d4474e97ecd96adcd146f2f550e0772740 100644 --- a/python/paddle/v2/fluid/tests/test_image_classification_layer.py +++ b/python/paddle/v2/fluid/tests/test_image_classification_layer.py @@ -1,6 +1,6 @@ import unittest -import paddle.v2.fluid.layers as layers +import paddle.v2.fluid as fluid import paddle.v2.fluid.nets as nets from paddle.v2.fluid.framework import Program @@ -29,27 +29,35 @@ class TestLayer(unittest.TestCase): def test_batch_norm_layer(self): main_program = Program() startup_program = Program() - images = layers.data( + images = fluid.layers.data( name='pixel', shape=[3, 48, 48], dtype='float32', main_program=main_program) - layers.batch_norm( + hidden1 = fluid.layers.batch_norm( input=images, main_program=main_program, startup_program=startup_program) + hidden2 = fluid.layers.fc(input=hidden1, + size=128, + act='relu', + main_program=main_program) + hidden3 = fluid.layers.batch_norm( + input=hidden2, + main_program=main_program, + startup_program=startup_program) - # print str(main_program) + print str(main_program) def test_dropout_layer(self): main_program = Program() startup_program = Program() - images = layers.data( + images = fluid.layers.data( name='pixel', shape=[3, 48, 48], dtype='float32', main_program=main_program) - layers.dropout( + fluid.layers.dropout( x=images, dropout_prob=0.5, main_program=main_program, @@ -61,7 +69,7 @@ class TestLayer(unittest.TestCase): main_program = Program() startup_program = Program() - images = layers.data( + images = fluid.layers.data( name='pixel', shape=[3, 48, 48], dtype='float32', @@ -77,19 +85,19 @@ class TestLayer(unittest.TestCase): def test_elementwise_add_with_act(self): main_program = Program() startup_program = Program() - image1 = layers.data( + image1 = fluid.layers.data( name='pixel1', shape=[3, 48, 48], dtype='float32', main_program=main_program, startup_program=startup_program) - image2 = layers.data( + image2 = fluid.layers.data( name='pixel2', shape=[3, 48, 48], dtype='float32', main_program=main_program, startup_program=startup_program) - out = layers.elementwise_add( + out = 
fluid.layers.elementwise_add( x=image1, y=image2, act='relu', diff --git a/python/paddle/v2/fluid/tests/test_inference_model_io.py b/python/paddle/v2/fluid/tests/test_inference_model_io.py index 74f1ce23262bbc969f9544885a7390534c76cdf6..60aed62ead83dedbeb9438c431ec292558d88ce5 100644 --- a/python/paddle/v2/fluid/tests/test_inference_model_io.py +++ b/python/paddle/v2/fluid/tests/test_inference_model_io.py @@ -1,13 +1,13 @@ -import paddle.v2 as paddle -import paddle.v2.fluid.layers as layers +import unittest + +import numpy as np import paddle.v2.fluid.core as core -import paddle.v2.fluid.optimizer as optimizer +import paddle.v2.fluid.executor as executor +import paddle.v2.fluid.layers as layers +import paddle.v2.fluid.optimizer as optimizer from paddle.v2.fluid.framework import Program from paddle.v2.fluid.io import save_inference_model, load_inference_model -import paddle.v2.fluid.executor as executor -import unittest -import numpy as np class TestBook(unittest.TestCase): @@ -44,7 +44,7 @@ class TestBook(unittest.TestCase): x=cost, main_program=program, startup_program=init_program) sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.001) - opts = sgd_optimizer.minimize(avg_cost, init_program) + sgd_optimizer.minimize(avg_cost, init_program) place = core.CPUPlace() exe = executor.Executor(place) @@ -52,25 +52,20 @@ class TestBook(unittest.TestCase): exe.run(init_program, feed={}, fetch_list=[]) for i in xrange(100): - x_data = np.array( + tensor_x = np.array( [[1, 1], [1, 2], [3, 4], [5, 2]]).astype("float32") - y_data = np.array([[-2], [-3], [-7], [-7]]).astype("float32") + tensor_y = np.array([[-2], [-3], [-7], [-7]]).astype("float32") - tensor_x = core.LoDTensor() - tensor_x.set(x_data, place) - tensor_y = core.LoDTensor() - tensor_y.set(y_data, place) exe.run(program, feed={'x': tensor_x, 'y': tensor_y}, fetch_list=[avg_cost]) save_inference_model(MODEL_DIR, ["x", "y"], [avg_cost], exe, program) - outs = exe.run(program, - feed={'x': tensor_x, - 'y': tensor_y}, - fetch_list=[avg_cost]) - expected = np.array(outs[0]) + expected = exe.run(program, + feed={'x': tensor_x, + 'y': tensor_y}, + fetch_list=[avg_cost])[0] reload(executor) # reload to build a new scope exe = executor.Executor(place) @@ -83,7 +78,7 @@ class TestBook(unittest.TestCase): feed={feed_var_names[0]: tensor_x, feed_var_names[1]: tensor_y}, fetch_list=fetch_vars) - actual = np.array(outs[0]) + actual = outs[0] self.assertEqual(feed_var_names, ["x", "y"]) self.assertEqual(len(fetch_vars), 1) diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py index 87dc6d1a6270e0f8425b56601d04049450c73380..a9d9d369c7377e8c758b7eea5aacdbfcee269f18 100644 --- a/python/paddle/v2/fluid/tests/test_layers.py +++ b/python/paddle/v2/fluid/tests/test_layers.py @@ -1,183 +1,151 @@ +from __future__ import print_function import unittest import paddle.v2.fluid.layers as layers import paddle.v2.fluid.nets as nets -from paddle.v2.fluid.framework import Program +from paddle.v2.fluid.framework import Program, program_guard class TestBook(unittest.TestCase): def test_fit_a_line(self): program = Program() - x = layers.data( - name='x', shape=[13], dtype='float32', main_program=program) - y_predict = layers.fc(input=x, size=1, act=None, main_program=program) + with program_guard(program, startup_program=Program()): + x = layers.data(name='x', shape=[13], dtype='float32') + y_predict = layers.fc(input=x, size=1, act=None) + y = layers.data(name='y', shape=[1], dtype='float32') + cost = 
layers.square_error_cost(input=y_predict, label=y) + avg_cost = layers.mean(x=cost) + self.assertIsNotNone(avg_cost) + program.append_backward(avg_cost) - y = layers.data( - name='y', shape=[1], dtype='float32', main_program=program) - cost = layers.square_error_cost( - input=y_predict, label=y, main_program=program) - - avg_cost = layers.mean(x=cost, main_program=program) - self.assertIsNotNone(avg_cost) - program.append_backward(avg_cost) - - print str(program) + print(str(program)) def test_recognize_digits_mlp(self): program = Program() - - # Change g_program, so the rest layers use `g_program` - images = layers.data( - name='pixel', shape=[784], dtype='float32', main_program=program) - label = layers.data( - name='label', shape=[1], dtype='int32', main_program=program) - hidden1 = layers.fc(input=images, - size=128, - act='relu', - main_program=program) - hidden2 = layers.fc(input=hidden1, - size=64, - act='relu', - main_program=program) - predict = layers.fc(input=hidden2, - size=10, - act='softmax', - main_program=program) - cost = layers.cross_entropy( - input=predict, label=label, main_program=program) - avg_cost = layers.mean(x=cost, main_program=program) - self.assertIsNotNone(avg_cost) - - print str(program) + with program_guard(program, startup_program=Program()): + # Change g_program, so the rest layers use `g_program` + images = layers.data(name='pixel', shape=[784], dtype='float32') + label = layers.data(name='label', shape=[1], dtype='int32') + hidden1 = layers.fc(input=images, size=128, act='relu') + hidden2 = layers.fc(input=hidden1, size=64, act='relu') + predict = layers.fc(input=hidden2, size=10, act='softmax') + cost = layers.cross_entropy(input=predict, label=label) + avg_cost = layers.mean(x=cost) + self.assertIsNotNone(avg_cost) + + print(str(program)) def test_simple_conv2d(self): program = Program() - images = layers.data( - name='pixel', - shape=[3, 48, 48], - dtype='int32', - main_program=program) - layers.conv2d( - input=images, - num_filters=3, - filter_size=[4, 4], - main_program=program) - - print str(program) + with program_guard(program, startup_program=Program()): + images = layers.data(name='pixel', shape=[3, 48, 48], dtype='int32') + layers.conv2d(input=images, num_filters=3, filter_size=[4, 4]) - def test_recognize_digits_conv(self): + print(str(program)) + + def test_conv2d_transpose(self): program = Program() + with program_guard(program): + img = layers.data(name='pixel', shape=[3, 2, 2], dtype='float32') + layers.conv2d_transpose(input=img, num_filters=10, output_size=28) + print(str(program)) - images = layers.data( - name='pixel', - shape=[1, 28, 28], - dtype='float32', - main_program=program) - label = layers.data( - name='label', shape=[1], dtype='int32', main_program=program) - conv_pool_1 = nets.simple_img_conv_pool( - input=images, - filter_size=5, - num_filters=2, - pool_size=2, - pool_stride=2, - act="relu", - main_program=program) - conv_pool_2 = nets.simple_img_conv_pool( - input=conv_pool_1, - filter_size=5, - num_filters=4, - pool_size=2, - pool_stride=2, - act="relu", - main_program=program) - - predict = layers.fc(input=conv_pool_2, - size=10, - act="softmax", - main_program=program) - cost = layers.cross_entropy( - input=predict, label=label, main_program=program) - avg_cost = layers.mean(x=cost, main_program=program) - - program.append_backward(avg_cost) - - print str(program) + def test_recognize_digits_conv(self): + program = Program() + with program_guard(program, startup_program=Program()): + images = layers.data( + 
name='pixel', shape=[1, 28, 28], dtype='float32') + label = layers.data(name='label', shape=[1], dtype='int32') + conv_pool_1 = nets.simple_img_conv_pool( + input=images, + filter_size=5, + num_filters=2, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_2 = nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=4, + pool_size=2, + pool_stride=2, + act="relu") + + predict = layers.fc(input=conv_pool_2, size=10, act="softmax") + cost = layers.cross_entropy(input=predict, label=label) + avg_cost = layers.mean(x=cost) + + program.append_backward(avg_cost) + + print(str(program)) def test_word_embedding(self): program = Program() - dict_size = 10000 - embed_size = 32 - first_word = layers.data( - name='firstw', shape=[1], dtype='int64', main_program=program) - second_word = layers.data( - name='secondw', shape=[1], dtype='int64', main_program=program) - third_word = layers.data( - name='thirdw', shape=[1], dtype='int64', main_program=program) - forth_word = layers.data( - name='forthw', shape=[1], dtype='int64', main_program=program) - next_word = layers.data( - name='nextw', shape=[1], dtype='int64', main_program=program) - - embed_first = layers.embedding( - input=first_word, - size=[dict_size, embed_size], - dtype='float32', - param_attr={'name': 'shared_w'}, - main_program=program) - embed_second = layers.embedding( - input=second_word, - size=[dict_size, embed_size], - dtype='float32', - param_attr={'name': 'shared_w'}, - main_program=program) - - embed_third = layers.embedding( - input=third_word, - size=[dict_size, embed_size], - dtype='float32', - param_attr={'name': 'shared_w'}, - main_program=program) - embed_forth = layers.embedding( - input=forth_word, - size=[dict_size, embed_size], - dtype='float32', - param_attr={'name': 'shared_w'}, - main_program=program) - - concat_embed = layers.concat( - input=[embed_first, embed_second, embed_third, embed_forth], - axis=1, - main_program=program) - - hidden1 = layers.fc(input=concat_embed, - size=256, - act='sigmoid', - main_program=program) - predict_word = layers.fc(input=hidden1, - size=dict_size, - act='softmax', - main_program=program) - cost = layers.cross_entropy( - input=predict_word, label=next_word, main_program=program) - avg_cost = layers.mean(x=cost, main_program=program) - self.assertIsNotNone(avg_cost) - - print str(program) + with program_guard(program, startup_program=Program()): + dict_size = 10000 + embed_size = 32 + first_word = layers.data(name='firstw', shape=[1], dtype='int64') + second_word = layers.data(name='secondw', shape=[1], dtype='int64') + third_word = layers.data(name='thirdw', shape=[1], dtype='int64') + forth_word = layers.data(name='forthw', shape=[1], dtype='int64') + next_word = layers.data(name='nextw', shape=[1], dtype='int64') + + embed_first = layers.embedding( + input=first_word, + size=[dict_size, embed_size], + dtype='float32', + param_attr='shared_w') + embed_second = layers.embedding( + input=second_word, + size=[dict_size, embed_size], + dtype='float32', + param_attr='shared_w') + + embed_third = layers.embedding( + input=third_word, + size=[dict_size, embed_size], + dtype='float32', + param_attr='shared_w') + embed_forth = layers.embedding( + input=forth_word, + size=[dict_size, embed_size], + dtype='float32', + param_attr='shared_w') + + concat_embed = layers.concat( + input=[embed_first, embed_second, embed_third, embed_forth], + axis=1) + + hidden1 = layers.fc(input=concat_embed, size=256, act='sigmoid') + predict_word = layers.fc(input=hidden1, + 
size=dict_size, + act='softmax') + cost = layers.cross_entropy(input=predict_word, label=next_word) + avg_cost = layers.mean(x=cost) + self.assertIsNotNone(avg_cost) + + print(str(program)) def test_linear_chain_crf(self): program = Program() + with program_guard(program, startup_program=Program()): + images = layers.data(name='pixel', shape=[784], dtype='float32') + label = layers.data(name='label', shape=[1], dtype='int32') + hidden = layers.fc(input=images, size=128) + crf = layers.linear_chain_crf(input=hidden, label=label) + self.assertNotEqual(crf, None) - # Change g_program, so the rest layers use `g_program` - images = layers.data( - name='pixel', shape=[784], dtype='float32', main_program=program) - label = layers.data( - name='label', shape=[1], dtype='int32', main_program=program) - hidden = layers.fc(input=images, size=128, main_program=program) - crf = layers.linear_chain_crf( - input=hidden, label=label, main_program=program) + print(str(program)) - print str(program) + def test_sigmoid_cross_entropy(self): + program = Program() + with program_guard(program): + dat = layers.data(name='data', shape=[10], dtype='float32') + lbl = layers.data(name='label', shape=[10], dtype='float32') + self.assertIsNotNone( + layers.sigmoid_cross_entropy_with_logits( + x=dat, label=lbl)) + print(str(program)) if __name__ == '__main__': diff --git a/python/paddle/v2/fluid/tests/test_lod_array_length_op.py b/python/paddle/v2/fluid/tests/test_lod_array_length_op.py index a01ae83772185df218b8c453557dc0cac719673b..8a4be545eda841dbda33b7c8cae9f91a4199f2f8 100644 --- a/python/paddle/v2/fluid/tests/test_lod_array_length_op.py +++ b/python/paddle/v2/fluid/tests/test_lod_array_length_op.py @@ -13,7 +13,7 @@ class TestLoDArrayLength(unittest.TestCase): arr_len = layers.array_length(arr) cpu = core.CPUPlace() exe = Executor(cpu) - result = numpy.array(exe.run(fetch_list=[arr_len])[0]) + result = exe.run(fetch_list=[arr_len])[0] self.assertEqual(11, result[0]) diff --git a/python/paddle/v2/fluid/tests/test_lod_rank_table.py b/python/paddle/v2/fluid/tests/test_lod_rank_table.py index bbc11930b9e804c2769cc590c298c6e90dc36ca6..30d619fe318517345195281b17f88e9916b6afb3 100644 --- a/python/paddle/v2/fluid/tests/test_lod_rank_table.py +++ b/python/paddle/v2/fluid/tests/test_lod_rank_table.py @@ -1,6 +1,5 @@ from paddle.v2.fluid.layers import lod_rank_table, data from paddle.v2.fluid.executor import Executor -from paddle.v2.fluid.framework import g_main_program import paddle.v2.fluid.core as core import numpy import unittest @@ -18,7 +17,7 @@ class TestLoDRankTable(unittest.TestCase): tensor = core.LoDTensor() tensor.set(numpy.random.random(size=(17, 100)), cpu) tensor.set_lod([[0, 1, 3], [0, 5, 6, 7], [0, 3, 4, 9, 10, 13, 16, 17]]) - exe.run(g_main_program, scope=scope, feed={'x': tensor}) + exe.run(scope=scope, feed={'x': tensor}) var = scope.find_var(rank_table.name) table = var.get_lod_rank_table() self.assertEqual([(0, 5), (1, 1), (2, 1)], table.items()) diff --git a/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py index 16e64b8cd52d72a3bbc84e43d772b843dad0129a..0a916a55bc3d097e17fb504b0d6b2f2818f030c9 100644 --- a/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py +++ b/python/paddle/v2/fluid/tests/test_lod_tensor_array_ops.py @@ -18,7 +18,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): tensor.set_lod([[0, 3, 9, 10]]) expect = map(lambda x: numpy.array(x).astype('int32'), [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]]) - 
self.main(tensor=tensor, expect_array=expect, expect_lod=[] * 6) + self.main( + tensor=tensor, + expect_array=expect, + expect_lod=[] * 6, + expect_max_len=6) def test_lod_tensor_to_array_level_0_empty_seq(self): tensor = core.LoDTensor() @@ -27,7 +31,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): tensor.set_lod([[0, 3, 9, 9, 10]]) expect = map(lambda x: numpy.array(x).astype('int32'), [[3, 0, 9], [4, 1], [5, 2], [6], [7], [8]]) - self.main(tensor=tensor, expect_array=expect, expect_lod=[] * 6) + self.main( + tensor=tensor, + expect_array=expect, + expect_lod=[] * 6, + expect_max_len=6) def test_lod_tensor_to_array_level_1(self): tensor = core.LoDTensor() @@ -44,7 +52,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): ] lod = [[[0, 2, 5]], [[0, 6, 12]], [[0, 3]]] - self.main(tensor=tensor, expect_array=expect, expect_lod=lod) + self.main( + tensor=tensor, + expect_array=expect, + expect_lod=lod, + expect_max_len=3) def test_lod_tensor_to_array_level_1_empty_seq(self): tensor = core.LoDTensor() @@ -63,7 +75,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): ] lod = [[[0, 5, 8, 8, 15]], [[0, 2, 6, 7, 8]], [[0, 2, 6]], [[0, 2]]] - self.main(tensor=tensor, expect_array=expect, expect_lod=lod) + self.main( + tensor=tensor, + expect_array=expect, + expect_lod=lod, + expect_max_len=4) def test_lod_tensor_to_array_level_2(self): tensor = core.LoDTensor() @@ -80,7 +96,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): ] lod = [[[0, 1, 3, 4], [0, 1, 4, 8, 12]], [[0, 4, 7], [0, 1, 5, 9, 17, 21, 27, 31]], [[0, 2], [0, 6, 7]]] - self.main(tensor=tensor, expect_array=expect, expect_lod=lod) + self.main( + tensor=tensor, + expect_array=expect, + expect_lod=lod, + expect_max_len=3) def test_lod_tensor_to_array_level_2_skip_level(self): tensor = core.LoDTensor() @@ -88,14 +108,21 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): numpy.arange(50).reshape(50, 1).astype('int32'), self.place()) tensor.set_lod([[0, 2, 5, 6], [0, 2, 5, 6, 10, 12, 13], [0, 3, 7, 11, 17, 21, 22, 23, 27, 31, 39, 45, 46, 50]]) - self.main(tensor=tensor, expect_array=None, expect_lod=None, level=1) - - def main(self, tensor, expect_array, expect_lod, level=0): + self.main( + tensor=tensor, + expect_array=None, + expect_lod=None, + expect_max_len=4, + level=1) + + def main(self, tensor, expect_array, expect_lod, expect_max_len, level=0): place = self.place() program = Program() x = layers.data(name='x', shape=[10], main_program=program) x.persistable = True table = layers.lod_rank_table(x, level=level, main_program=program) + max_len = layers.max_sequence_len(table, main_program=program) + max_len.persistable = True array = layers.lod_tensor_to_array(x, table, main_program=program) array.persistable = True @@ -110,6 +137,10 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): self.check_array_same(array, expect_array, expect_lod) self.check_tensor_same(scope.find_var(result.name).get_tensor(), tensor) + self.assertEqual( + numpy.array(scope.find_var(max_len.name).get_tensor())[0], + expect_max_len) + def check_array_same(self, array, expect_tensor, expect_lod): self.assertEqual(len(expect_tensor), len(array)) for i, exp in enumerate(zip(expect_tensor, expect_lod)): @@ -151,10 +182,11 @@ class TestCPULoDTensorArrayOpGrad(unittest.TestCase): exe = Executor(place) g_out = [ - item.sum() - for item in map( - numpy.array, - exe.run(program, feed={'x': tensor}, fetch_list=[g_vars])) + numpy.array(item).sum() + for item in exe.run(program, + feed={'x': tensor}, + fetch_list=[g_vars], + return_numpy=False) ] 
g_out_sum = numpy.array(g_out).sum() diff --git a/python/paddle/v2/fluid/tests/test_log_loss_op.py b/python/paddle/v2/fluid/tests/test_log_loss_op.py new file mode 100644 index 0000000000000000000000000000000000000000..2eeaa90758c57ef0d92a8ad7b0a4c1b1f2c38be3 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_log_loss_op.py @@ -0,0 +1,33 @@ +import unittest +import numpy as np +from op_test import OpTest + + +class TestLogLossOp(OpTest): + def setUp(self): + self.op_type = 'log_loss' + samples_num = 32 + + predicted = np.random.uniform(0.1, 1.0, + (samples_num, 1)).astype("float32") + labels = np.random.randint(0, 2, (samples_num, 1)).astype("float32") + epsilon = 1e-4 + self.inputs = { + 'Predicted': predicted, + 'Labels': labels, + } + + self.attrs = {'epsilon': epsilon} + loss = -labels * np.log(predicted + epsilon) - ( + 1 - labels) * np.log(1 - predicted + epsilon) + self.outputs = {'Loss': loss} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['Predicted'], 'Loss', max_relative_error=0.03) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_maxout_op.py b/python/paddle/v2/fluid/tests/test_maxout_op.py index 05e42f315833cab5bc5272cbd2173ea8012ff7f5..5fbed43e254b811d38e441e946a73c24f87373de 100644 --- a/python/paddle/v2/fluid/tests/test_maxout_op.py +++ b/python/paddle/v2/fluid/tests/test_maxout_op.py @@ -30,9 +30,7 @@ class TestMaxOutOp(OpTest): def init_test_case(self): self.MaxOut_forward_naive = maxout_forward_naive self.shape = [100, 6, 2, 2] - self.groups=2 - - + self.groups = 2 if __name__ == '__main__': diff --git a/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py b/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py index e76357a5be07d79eafee4c3a27911efe8a3eaef4..50fcc4a72ddbd6d7a3d3b73434c6ac8de5a006e2 100644 --- a/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py +++ b/python/paddle/v2/fluid/tests/test_mnist_if_else_op.py @@ -65,17 +65,10 @@ class TestMNISTIfElseOp(unittest.TestCase): y_data = np.array(map(lambda x: x[1], data)).astype("int64") y_data = np.expand_dims(y_data, axis=1) - tensor_x = core.LoDTensor() - tensor_x.set(x_data, place) - - tensor_y = core.LoDTensor() - tensor_y.set(y_data, place) - - outs = map(np.array, - exe.run(kwargs['main_program'], - feed={'x': tensor_x, - 'y': tensor_y}, - fetch_list=[avg_loss])) + outs = exe.run(kwargs['main_program'], + feed={'x': x_data, + 'y': y_data}, + fetch_list=[avg_loss]) print outs[0] if outs[0] < 1.0: return @@ -129,19 +122,12 @@ class TestMNISTIfElseOp(unittest.TestCase): for data in train_reader(): x_data = np.array(map(lambda x: x[0], data)).astype("float32") y_data = np.array(map(lambda x: x[1], data)).astype("int64") - y_data = np.expand_dims(y_data, axis=1) - - tensor_x = core.LoDTensor() - tensor_x.set(x_data, place) - - tensor_y = core.LoDTensor() - tensor_y.set(y_data, place) + y_data = y_data.reshape((y_data.shape[0], 1)) - outs = map(np.array, - exe.run(kwargs['main_program'], - feed={'x': tensor_x, - 'y': tensor_y}, - fetch_list=[avg_loss])) + outs = exe.run(kwargs['main_program'], + feed={'x': x_data, + 'y': y_data}, + fetch_list=[avg_loss]) print outs[0] if outs[0] < 1.0: return diff --git a/python/paddle/v2/fluid/tests/test_nccl_init_op.py b/python/paddle/v2/fluid/tests/test_nccl_init_op.py deleted file mode 100644 index a536800ccd81fdc2f3b7c8320cede4f8ecf3a8cb..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/tests/test_nccl_init_op.py +++ /dev/null @@ -1,39 +0,0 @@ 
-import unittest, os -import numpy as np -import paddle.v2 as paddle -from paddle.v2.fluid.op import Operator -import paddle.v2.fluid.core as core -from op_test import OpTest, create_op, set_input - -if not core.is_compile_gpu(): - exit(0) - -gpu_count = core.get_cuda_device_count() - -if gpu_count <= 1: - exit(0) - -g_scope = core.Scope() -g_ctx = core.DeviceContext.create(core.CPUPlace()) - - -class TestNCCLInit(unittest.TestCase): - def test_init(self): - self.op_type = "ncclInit" - self.gpus = range(gpu_count) - - self.inputs = {} - self.attrs = {"gpus": self.gpus} - g_scope.var("Communicator").get_communicator() - self.outputs = {"Communicator": g_scope.find_var("Communicator")} - nccl_init = create_op( - g_scope, - op_type=self.op_type, - inputs=self.inputs, - outputs=self.outputs, - attrs=self.attrs) - nccl_init.run(g_scope, g_ctx) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_operator_desc.py b/python/paddle/v2/fluid/tests/test_operator_desc.py index e8362d2e9c6038c04c24dce35de8c53bfde78142..ce34d95ac8cb2644dee9c551cd8e85b33609919a 100644 --- a/python/paddle/v2/fluid/tests/test_operator_desc.py +++ b/python/paddle/v2/fluid/tests/test_operator_desc.py @@ -1,11 +1,15 @@ import unittest -from paddle.v2.fluid.framework import Variable, Program, g_main_program + import paddle.v2.fluid.core as core +from paddle.v2.fluid.framework import Program, default_startup_program + +main_program = default_startup_program() + class TestOperator(unittest.TestCase): def test_error_type(self): - block = g_main_program.create_block() + block = main_program.create_block() try: block.append_op() self.assertFail() diff --git a/python/paddle/v2/fluid/tests/test_parameter.py b/python/paddle/v2/fluid/tests/test_parameter.py index d467e4bbb79b291c442c643158ef6c0d630920dd..694344acbbd3b7c80cb0ff48ada843f794061282 100644 --- a/python/paddle/v2/fluid/tests/test_parameter.py +++ b/python/paddle/v2/fluid/tests/test_parameter.py @@ -1,17 +1,19 @@ import unittest -from paddle.v2.fluid.framework import g_main_program +from paddle.v2.fluid.framework import default_main_program import paddle.v2.fluid.core as core from paddle.v2.fluid.executor import Executor import paddle.v2.fluid.io as io from paddle.v2.fluid.initializer import ConstantInitializer import numpy as np +main_program = default_main_program() + class TestParameter(unittest.TestCase): def test_param(self): shape = [784, 100] val = 1.0625 - b = g_main_program.global_block() + b = main_program.global_block() param = b.create_parameter( name='fc.w', shape=shape, @@ -23,9 +25,9 @@ class TestParameter(unittest.TestCase): self.assertEqual(core.DataType.FP32, param.dtype) self.assertEqual(0, param.block.idx) exe = Executor(core.CPUPlace()) - p = exe.run(g_main_program, fetch_list=[param])[0] - self.assertTrue(np.allclose(np.array(p), np.ones(shape) * val)) - p = io.get_parameter_value_by_name('fc.w', exe, g_main_program) + p = exe.run(main_program, fetch_list=[param])[0] + self.assertTrue(np.allclose(p, np.ones(shape) * val)) + p = io.get_parameter_value_by_name('fc.w', exe, main_program) self.assertTrue(np.allclose(np.array(p), np.ones(shape) * val)) diff --git a/python/paddle/v2/fluid/tests/test_profiler.py b/python/paddle/v2/fluid/tests/test_profiler.py new file mode 100644 index 0000000000000000000000000000000000000000..395d0dc36a3d1d6fbfebb4cdf34395c4edee412d --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_profiler.py @@ -0,0 +1,28 @@ +import unittest +import numpy as np +import paddle.v2.fluid as fluid 
+import paddle.v2.fluid.profiler as profiler +import paddle.v2.fluid.layers as layers + + +class TestProfiler(unittest.TestCase): + def test_nvprof(self): + if not fluid.core.is_compile_gpu(): + return + epoc = 8 + dshape = [4, 3, 28, 28] + data = layers.data(name='data', shape=[3, 28, 28], dtype='float32') + conv = layers.conv2d(data, 20, 3, stride=[1, 1], padding=[1, 1]) + + place = fluid.GPUPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + with profiler.cuda_profiler("cuda_profiler.txt", 'csv') as nvprof: + for i in range(epoc): + input = np.random.random(dshape).astype('float32') + exe.run(fluid.default_main_program(), feed={'data': input}) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_program.py b/python/paddle/v2/fluid/tests/test_program.py index e9bcefd21569aaa9225c676ea03b5c8e37d00333..1a9313c68aab165d85ae29051faeacb4927ac2c9 100644 --- a/python/paddle/v2/fluid/tests/test_program.py +++ b/python/paddle/v2/fluid/tests/test_program.py @@ -1,35 +1,38 @@ +from __future__ import print_function import unittest -from paddle.v2.fluid.framework import Program -from paddle.v2.fluid.framework import g_main_program +from paddle.v2.fluid.framework import Program, default_main_program +import paddle.v2.fluid.layers as layers + +main_program = default_main_program() class TestProgram(unittest.TestCase): def test_program(self): - b = g_main_program.current_block() + b = main_program.current_block() self.assertEqual(-1, b.parent_idx) self.assertEqual(0, b.idx) - b = g_main_program.create_block() + b = main_program.create_block() self.assertEqual(1, b.idx) self.assertEqual(0, b.parent_idx) - b = g_main_program.create_block() + b = main_program.create_block() self.assertEqual(2, b.idx) self.assertEqual(1, b.parent_idx) - g_main_program.rollback() + main_program.rollback() - b = g_main_program.current_block() + b = main_program.current_block() self.assertEqual(1, b.idx) self.assertEqual(0, b.parent_idx) - b = g_main_program.create_block() + b = main_program.create_block() self.assertEqual(3, b.idx) self.assertEqual(1, b.parent_idx) - g_main_program.rollback() - b = g_main_program.current_block() + main_program.rollback() + b = main_program.current_block() self.assertEqual(1, b.idx) self.assertEqual(0, b.parent_idx) @@ -48,8 +51,8 @@ class TestProgram(unittest.TestCase): # FIXME(yuyang18): We manual compare the output string, since the order # of variable could be changed. 
- print prog - print prog.clone() + print(prog) + print(prog.clone()) def test_parse_program_from_string(self): prog = Program() @@ -67,8 +70,8 @@ class TestProgram(unittest.TestCase): binary_str = prog.desc.serialize_to_string() prog_restored = Program.parse_from_string(binary_str) - print prog - print prog_restored + print(prog) + print(prog_restored) def test_append_backward(self): prog = Program() @@ -123,6 +126,20 @@ class TestProgram(unittest.TestCase): actual_ops.append(op.type) self.assertEqual(actual_ops, expect_ops) + def test_program_clone_with_parameter(self): + main_program = Program() + startup_program = Program() + kwargs = { + 'main_program': main_program, + 'startup_program': startup_program + } + d = layers.data(name='x', shape=[784], dtype='float32', **kwargs) + hidden = layers.fc(input=d, size=100, **kwargs) + layers.fc(input=hidden, size=100, **kwargs) + + new_program = main_program.clone() + self.assertNotEqual(0, len(new_program.blocks[0].all_parameters())) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_recurrent_op.py b/python/paddle/v2/fluid/tests/test_recurrent_op.py index 88bcdc3e6a21881ace2be53c22a62d78df30a974..36e0c84c0b8e7d40aa56d75c8904a38694881be4 100644 --- a/python/paddle/v2/fluid/tests/test_recurrent_op.py +++ b/python/paddle/v2/fluid/tests/test_recurrent_op.py @@ -156,7 +156,7 @@ class RecurrentOpTest1(unittest.TestCase): feed=self.feed_map, fetch_list=[self.output]) - return np.array(out[0]) + return out[0] def backward(self): self.feed_map = { @@ -171,7 +171,8 @@ class RecurrentOpTest1(unittest.TestCase): exe = Executor(self.place) return exe.run(self.main_program, feed=self.feed_map, - fetch_list=fetch_list) + fetch_list=fetch_list, + return_numpy=False) def test_backward(self): self.check_forward() @@ -270,12 +271,12 @@ class RecurrentOpTest2(RecurrentOpTest1): temp_l = layers.fc(input=x_t, size=self.input_dim, - param_attr={'name': 'W'}, + param_attr='W', bias_attr=False, **self.p_info) temp_r = layers.fc(input=h_pre, size=self.input_dim, - param_attr={'name': 'U'}, + param_attr='U', bias_attr=False, **self.p_info) diff --git a/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py index a3cba92504a28590083df57e69f7662a887d94a6..9999165ed509aa40f31f26aa676f381561bd0016 100644 --- a/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py +++ b/python/paddle/v2/fluid/tests/test_rnn_memory_helper_op.py @@ -7,12 +7,6 @@ import numpy as np import paddle.v2.fluid.core as core -def create_tensor(np_data, place): - tensor = core.LoDTensor() - tensor.set(np_data, place) - return tensor - - class RNNMemoryHelperOpTest(unittest.TestCase): def setUp(self): self.program = Program() @@ -30,13 +24,13 @@ class RNNMemoryHelperOpTest(unittest.TestCase): def test_forward(self): x_np = np.random.normal(size=(2, 3)).astype("float32") - self.feed_map = {'X': create_tensor(x_np, self.place)} + self.feed_map = {'X': x_np} self.fetch_list = [self.Out] exe = Executor(self.place) out = exe.run(self.program, feed=self.feed_map, fetch_list=self.fetch_list) - np.isclose(np.array(out[0]), x_np, rtol=1e-5) + self.assertTrue(np.allclose(out[0], x_np, rtol=1e-5)) class RNNMemoryHelperGradOpTest(unittest.TestCase): @@ -66,8 +60,7 @@ class RNNMemoryHelperGradOpTest(unittest.TestCase): def test_backward(self): self.feed_map = { - name: create_tensor( - np.random.normal(size=(2, 3)).astype("float32"), self.place) + name: np.random.normal(size=(2, 3)).astype("float32") for name 
in self.input_names } self.fetch_list = [self.output_vars['X@GRAD']] @@ -76,7 +69,7 @@ class RNNMemoryHelperGradOpTest(unittest.TestCase): out = exe.run(self.program, feed=self.feed_map, fetch_list=self.fetch_list) - np.isclose(np.array(out[0]), self.feed_map['Out@GRAD'], rtol=1e-5) + np.isclose(out[0], self.feed_map['Out@GRAD'], rtol=1e-5) class RNNMemoryHelperGradOpWithoutInputTest(unittest.TestCase): @@ -110,8 +103,7 @@ class RNNMemoryHelperGradOpWithoutInputTest(unittest.TestCase): def test_backward(self): self.feed_map = { - name: create_tensor( - np.random.normal(size=(2, 3)).astype("float32"), self.place) + name: np.random.normal(size=(2, 3)).astype("float32") for name in ['X', 'Out'] } self.fetch_list = [self.output_vars['X@GRAD']] @@ -120,10 +112,9 @@ class RNNMemoryHelperGradOpWithoutInputTest(unittest.TestCase): out = exe.run(self.program, feed=self.feed_map, fetch_list=self.fetch_list) - np.isclose( - np.array(out[0]), - np.zeros(shape=(2, 3)).astype("float32"), - rtol=1e-5) + self.assertTrue( + np.allclose( + out[0], np.zeros(shape=(2, 3)).astype("float32"), rtol=1e-5)) if __name__ == '__main__': diff --git a/python/paddle/v2/fluid/tests/test_roi_pool_op.py b/python/paddle/v2/fluid/tests/test_roi_pool_op.py index 7cedb930ca861aed95c355931d80cb4d265c8235..a28d9c7f82d3735c410369eb61e350168c267cea 100644 --- a/python/paddle/v2/fluid/tests/test_roi_pool_op.py +++ b/python/paddle/v2/fluid/tests/test_roi_pool_op.py @@ -4,24 +4,22 @@ import math import sys from op_test import OpTest + class TestROIPoolOp(OpTest): def set_data(self): self.init_test_case() self.make_rois() self.calc_roi_pool() - self.inputs = { - 'X': self.x, - 'ROIs': self.rois} - + self.inputs = {'X': self.x, 'ROIs': self.rois} + self.attrs = { 'spatial_scale': self.spatial_scale, 'pooled_height': self.pooled_height, - 'pooled_width': self.pooled_width} + 'pooled_width': self.pooled_width + } - self.outputs = { - 'Out': self.outs, - 'Argmax': self.argmaxes} + self.outputs = {'Out': self.outs, 'Argmax': self.argmaxes} def init_test_case(self): self.batch_size = 5 @@ -30,10 +28,9 @@ class TestROIPoolOp(OpTest): self.width = 4 # n, c, h, w - self.x_dim = (self.batch_size, self.channels, - self.height, self.width) + self.x_dim = (self.batch_size, self.channels, self.height, self.width) - self.spatial_scale = 1.0/4.0 + self.spatial_scale = 1.0 / 4.0 self.pooled_height = 2 self.pooled_width = 2 self.rois_num = 2 @@ -41,13 +38,11 @@ class TestROIPoolOp(OpTest): self.x = np.random.random(self.x_dim).astype('float32') def calc_roi_pool(self): - out_data = np.zeros( - (self.rois_num, self.channels, - self.pooled_height, self.pooled_width)) - argmax_data = np.zeros( - (self.rois_num, self.channels, - self.pooled_height, self.pooled_width)) - + out_data = np.zeros((self.rois_num, self.channels, self.pooled_height, + self.pooled_width)) + argmax_data = np.zeros((self.rois_num, self.channels, + self.pooled_height, self.pooled_width)) + for i in range(self.rois_num): roi = self.rois[i] roi_batch_id = roi[0] @@ -56,8 +51,8 @@ class TestROIPoolOp(OpTest): roi_end_w = int(round(roi[3] * self.spatial_scale)) roi_end_h = int(round(roi[4] * self.spatial_scale)) - roi_height = int(max(roi_end_h - roi_start_h + 1, 1)); - roi_width = int(max(roi_end_w - roi_start_w + 1, 1)); + roi_height = int(max(roi_end_h - roi_start_h + 1, 1)) + roi_width = int(max(roi_end_w - roi_start_w + 1, 1)) x_i = self.x[roi_batch_id] @@ -84,7 +79,7 @@ class TestROIPoolOp(OpTest): out_data[i, c, ph, pw] = -sys.float_info.max argmax_data[i, c, ph, pw] = -1 - + for h 
in range(hstart, hend): for w in range(wstart, wend): if x_i[c, h, w] > out_data[i, c, ph, pw]: @@ -104,11 +99,11 @@ class TestROIPoolOp(OpTest): y1 = np.random.random_integers( 0, self.height / self.spatial_scale - self.pooled_height) - x2 = np.random.random_integers( - x1 + self.pooled_width, self.width / self.spatial_scale) - y2 = np.random.random_integers( - y1 + self.pooled_height, self.height / self.spatial_scale) - + x2 = np.random.random_integers(x1 + self.pooled_width, + self.width / self.spatial_scale) + y2 = np.random.random_integers(y1 + self.pooled_height, + self.height / self.spatial_scale) + roi = [batch_ids[i], x1, y1, x2, y2] rois.append(roi) self.rois = np.array(rois).astype("int64") @@ -123,5 +118,6 @@ class TestROIPoolOp(OpTest): def test_check_grad(self): self.check_grad(['X'], 'Out') + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py index 953629d610e183cdddf97081f94a77951fe979d8..86db4c64b493d94cc675ed4bcee7e2925fef1977 100644 --- a/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py +++ b/python/paddle/v2/fluid/tests/test_shrink_rnn_memory.py @@ -3,9 +3,11 @@ import paddle.v2.fluid.core as core from paddle.v2.fluid.executor import Executor import paddle.v2.fluid.layers as layers from paddle.v2.fluid.backward import append_backward_ops -from paddle.v2.fluid.framework import g_main_program +from paddle.v2.fluid.framework import default_main_program import numpy +main_program = default_main_program() + class TestShrinkRNNMemory(unittest.TestCase): def test_shrink_rnn_memory(self): @@ -27,19 +29,16 @@ class TestShrinkRNNMemory(unittest.TestCase): tensor_np = numpy.random.random(size=(3, 100)).astype('float32') tensor.set(tensor_np, cpu) exe = Executor(cpu) - outs = map(numpy.array, - exe.run(feed={'x': tensor}, fetch_list=[mem1, mem2, mem3])) + outs = exe.run(feed={'x': tensor}, fetch_list=[mem1, mem2, mem3]) self.assertTrue(numpy.allclose(tensor_np[0:3], outs[0])) self.assertTrue(numpy.allclose(tensor_np[0:2], outs[1])) self.assertTrue(numpy.allclose(tensor_np[0:1], outs[2])) mem3_mean = layers.mean(x=mem3) append_backward_ops(loss=mem3_mean) - x_grad = map(numpy.array, - exe.run(feed={'x': tensor}, - fetch_list=[ - g_main_program.global_block().var('x@GRAD') - ]))[0] + x_grad = exe.run( + feed={'x': tensor}, + fetch_list=[main_program.global_block().var('x@GRAD')])[0] self.assertAlmostEqual(1.0, x_grad.sum(), delta=0.1) diff --git a/python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py index e53856b38aa5ddd6061b350a66e9fe86bc23923c..c42f578f72cb121a24d6b852334cbd8a977f2730 100644 --- a/python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py +++ b/python/paddle/v2/fluid/tests/test_sigmoid_cross_entropy_with_logits_op.py @@ -2,11 +2,12 @@ import numpy as np from op_test import OpTest from scipy.special import logit from scipy.special import expit +import unittest class TestSigmoidCrossEntropyWithLogitsOp1(OpTest): - '''Test sigmoid_cross_entropy_with_logit_op with binary labels - ''' + """Test sigmoid_cross_entropy_with_logit_op with binary label + """ def setUp(self): self.op_type = "sigmoid_cross_entropy_with_logits" @@ -16,16 +17,16 @@ class TestSigmoidCrossEntropyWithLogitsOp1(OpTest): 'X': logit( np.random.uniform(0, 1, (batch_size, num_classes)) .astype("float32")), - 'Labels': np.random.randint(0, 2, (batch_size, num_classes)) + 
'Label': np.random.randint(0, 2, (batch_size, num_classes)) .astype("float32") } # Fw Pass is implemented as elementwise sigmoid followed by # elementwise logistic loss - # Labels * -log(sigmoid(X)) + (1 - labels) * -log(1 - sigmoid(X)) + # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X)) sigmoid_X = expit(self.inputs['X']) - term1 = self.inputs['Labels'] * np.log(sigmoid_X) - term2 = (1 - self.inputs['Labels']) * np.log(1 - sigmoid_X) + term1 = self.inputs['Label'] * np.log(sigmoid_X) + term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X) self.outputs = {'Out': -term1 - term2} def test_check_output(self): @@ -36,8 +37,8 @@ class TestSigmoidCrossEntropyWithLogitsOp1(OpTest): class TestSigmoidCrossEntropyWithLogitsOp2(OpTest): - '''Test sigmoid_cross_entropy_with_logit_op with probabalistic labels - ''' + """Test sigmoid_cross_entropy_with_logit_op with probabilistic label + """ def setUp(self): self.op_type = "sigmoid_cross_entropy_with_logits" @@ -47,16 +48,16 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest): 'X': logit( np.random.uniform(0, 1, (batch_size, num_classes)) .astype("float32")), - 'Labels': np.random.uniform(0, 1, (batch_size, num_classes)) + 'Label': np.random.uniform(0, 1, (batch_size, num_classes)) .astype("float32") } # Fw Pass is implemented as elementwise sigmoid followed by # elementwise logistic loss - # Labels * -log(sigmoid(X)) + (1 - labels) * -log(1 - sigmoid(X)) + # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X)) sigmoid_X = expit(self.inputs['X']) - term1 = self.inputs['Labels'] * np.log(sigmoid_X) - term2 = (1 - self.inputs['Labels']) * np.log(1 - sigmoid_X) + term1 = self.inputs['Label'] * np.log(sigmoid_X) + term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X) self.outputs = {'Out': -term1 - term2} def test_check_output(self): @@ -64,3 +65,7 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest): def test_check_grad(self): self.check_grad(['X'], 'Out') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py index a98cb3bbab8442886206b59a2b591fee96deeb9f..f5da4e408f0a83dbf6da530b478e91bbf9cd5ab2 100644 --- a/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py +++ b/python/paddle/v2/fluid/tests/test_split_and_merge_lod_tensor_op.py @@ -98,7 +98,11 @@ class TestCPULoDTensorArrayOps(unittest.TestCase): exe = Executor(place) scope = core.Scope() - exe.run(program, feed={'x': tensor, 'y': mask}, scope=scope) + exe.run(program, + feed={'x': tensor, + 'y': mask}, + scope=scope, + return_numpy=False) var_true = scope.find_var(out_true.name).get_tensor() @@ -169,7 +173,8 @@ class TestCPUSplitMergeLoDTensorGrad(unittest.TestCase): feed={'x': tensor, 'y': mask}, fetch_list=[g_vars], - scope=scope)) + scope=scope, + return_numpy=False)) ] g_out_sum = np.array(g_out).sum() diff --git a/python/paddle/v2/fluid/tests/test_tensor_array.py b/python/paddle/v2/fluid/tests/test_tensor_array.py deleted file mode 100644 index d6929ba16e4dae0c57adcceb4f0e78c094eee55c..0000000000000000000000000000000000000000 --- a/python/paddle/v2/fluid/tests/test_tensor_array.py +++ /dev/null @@ -1,106 +0,0 @@ -import logging -import paddle.v2.fluid.core as core -import unittest -import numpy as np - - -class TestTensorArray(unittest.TestCase): - def setUp(self): - self.ta = core.TensorArray() - - self.batch_size = 10 - self.dim = 2 - - # create a LoDTensor - self.scope = core.Scope() - var = 
self.scope.var("test_tensor") - self.place = core.CPUPlace() - tensor = var.get_tensor() - tensor.set_dims([self.batch_size, self.dim]) - tensor.alloc_float(self.place) - tensor_array = np.array(tensor) - tensor_array[0, 0] = 0 - tensor_array[1, 0] = 1 - tensor_array[2, 0] = 2 - tensor_array[3, 0] = 3 - tensor_array[4, 0] = 4 - tensor_array[5, 0] = 5 - tensor_array[6, 0] = 6 - tensor_array[7, 0] = 7 - tensor_array[8, 0] = 8 - tensor_array[9, 0] = 9 - - lod_py = [[0, 2, 5, 10]] - lod_tensor = core.LoDTensor(lod_py) - lod_tensor.set(tensor_array, self.place) - - self.py_seq_meta = [[5, 10, 2], [2, 5, 1], [0, 2, 0]] - - self.tensor = lod_tensor - - def test_unstack(self): - self.ta.unstack(self.tensor) - self.assertEqual(self.tensor.get_dims()[0], self.ta.size()) - - def test_read(self): - self.ta.unstack(self.tensor) - for i in range(self.batch_size): - tensor = self.ta.read(i) - - def test_write(self): - self.ta.unstack(self.tensor) - - # create a tensor with shape of [1, self.dim] - var = self.scope.var("hell") - tensor = var.get_tensor() - tensor.set_dims([1, self.dim]) - tensor.alloc_float(self.place) - tensor_array = np.array(tensor) - for i in range(self.dim): - tensor_array[0, i] = i - tensor.set(tensor_array, self.place) - - self.ta.write(2, tensor) - - ta_tensor = self.ta.read(2) - ta_tensor_array = np.array(ta_tensor) - self.assertEqual(ta_tensor.get_dims(), [1, self.dim]) - self.assertTrue((tensor_array == ta_tensor_array).all()) - - def test_write_shared(self): - self.ta.unstack(self.tensor) - - # create a tensor with shape of [1, self.dim] - var = self.scope.var("hell") - tensor = var.get_tensor() - tensor.set_dims([1, self.dim]) - tensor.alloc_float(self.place) - tensor_array = np.array(tensor) - for i in range(self.dim): - tensor_array[0, i] = i - tensor.set(tensor_array, self.place) - - self.ta.write_shared(2, tensor) - - ta_tensor = self.ta.read(2) - ta_tensor_array = np.array(ta_tensor) - self.assertEqual(ta_tensor.get_dims(), [1, self.dim]) - self.assertTrue((tensor_array == ta_tensor_array).all()) - - def test_unpack(self): - meta = self.ta.unpack(self.tensor, 0, True) - self.assertEqual(self.ta.size(), 5) - self.assertEqual(meta, self.py_seq_meta) - - def test_pack(self): - meta = self.ta.unpack(self.tensor, 0, True) - print "meta", meta - tensor = self.ta.pack(0, meta, self.tensor.lod()) - print np.array(self.tensor) - print np.array(tensor) - self.assertTrue((np.array(self.tensor) == np.array(tensor)).all()) - self.assertTrue(tensor.lod(), self.tensor.lod()) - - -if __name__ == '__main__': - unittest.main() diff --git a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py new file mode 100644 index 0000000000000000000000000000000000000000..e87f283042c081ed9f232d140ff8c303cd3d1858 --- /dev/null +++ b/python/paddle/v2/fluid/tests/test_unpool_op.py @@ -0,0 +1,83 @@ +import unittest +import numpy as np +from op_test import OpTest + + +def unpool2dmax_forward_naive(input, indices, ksize, strides, paddings): + s0, s1, s2, s3 = input.shape + out_hsize = (s2 - 1) * strides[0] - 2 * paddings[0] + ksize[0] + out_wsize = (s2 - 1) * strides[1] - 2 * paddings[1] + ksize[1] + out = np.zeros((s0, s1, out_hsize, out_wsize)) + for nidx in xrange(s0): + for cidx in xrange(s1): + for h in xrange(s2): + for w in xrange(s3): + index = indices[nidx, cidx, h, w] + hidx = (index - index % out_wsize) / out_wsize + widx = index % out_wsize + out[nidx, cidx, int(hidx), int(widx)] = \ + input[nidx, cidx, h, w] + + return out + + +class 
diff --git a/python/paddle/v2/fluid/tests/test_unpool_op.py b/python/paddle/v2/fluid/tests/test_unpool_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..e87f283042c081ed9f232d140ff8c303cd3d1858
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_unpool_op.py
@@ -0,0 +1,83 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def unpool2dmax_forward_naive(input, indices, ksize, strides, paddings):
+    s0, s1, s2, s3 = input.shape
+    out_hsize = (s2 - 1) * strides[0] - 2 * paddings[0] + ksize[0]
+    out_wsize = (s3 - 1) * strides[1] - 2 * paddings[1] + ksize[1]
+    out = np.zeros((s0, s1, out_hsize, out_wsize))
+    for nidx in xrange(s0):
+        for cidx in xrange(s1):
+            for h in xrange(s2):
+                for w in xrange(s3):
+                    index = indices[nidx, cidx, h, w]
+                    hidx = (index - index % out_wsize) / out_wsize
+                    widx = index % out_wsize
+                    out[nidx, cidx, int(hidx), int(widx)] = \
+                        input[nidx, cidx, h, w]
+
+    return out
+
+
+class TestUnpoolOp(OpTest):
+    def setUp(self):
+        self.op_type = "unpool"
+        self.init_test_case()
+        pre_input = np.random.random(self.shape).astype("float32")
+        nsize, csize, hsize, wsize = pre_input.shape
+        hsize_out = (hsize - self.ksize[0] + 2 * self.paddings[0]) / \
+            self.strides[0] + 1
+        wsize_out = (wsize - self.ksize[1] + 2 * self.paddings[1]) / \
+            self.strides[1] + 1
+        input = np.zeros((nsize, csize, hsize_out, wsize_out))
+        indices = np.zeros((nsize, csize, hsize_out, wsize_out))
+        for i in xrange(hsize_out):
+            for j in xrange(wsize_out):
+                r_start = np.max((i * self.strides[0] - self.paddings[0], 0))
+                r_end = np.min((i * self.strides[0] + self.ksize[0] - \
+                    self.paddings[0], hsize))
+                c_start = np.max((j * self.strides[1] - self.paddings[1], 0))
+                c_end = np.min((j * self.strides[1] + self.ksize[1] - \
+                    self.paddings[1], wsize))
+                for nidx in xrange(nsize):
+                    for cidx in xrange(csize):
+                        x_masked = pre_input[nidx, cidx, r_start:r_end, \
+                            c_start:c_end]
+                        input[nidx, cidx, i, j] = x_masked.max()
+                        arg = x_masked.argmax()
+                        indices[nidx, cidx, i, j] = \
+                            (r_start + arg / self.ksize[1]) * wsize + \
+                            c_start + arg % self.ksize[1]
+        output = self.unpool2d_forward_naive(input, indices, self.ksize, \
+            self.strides, self.paddings).astype("float32")
+        self.inputs = {
+            'X': input.astype('float32'),
+            'Indices': indices.astype('int32')
+        }
+        self.attrs = {
+            'strides': self.strides,
+            'paddings': self.paddings,
+            'ksize': self.ksize,
+            'unpooling_type': self.unpooling_type,
+        }
+        self.outputs = {'Out': output.astype('float32')}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+    def init_test_case(self):
+        self.unpool2d_forward_naive = unpool2dmax_forward_naive
+        self.unpooling_type = "max"
+        self.shape = [6, 4, 5, 5]
+        self.ksize = [3, 3]
+        self.strides = [2, 2]
+        self.paddings = [0, 0]
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_variable.py b/python/paddle/v2/fluid/tests/test_variable.py
index 92ffdceb6c84fb2669f8c1bb556c46fb1c03c411..f1e4c0ba21d5c4f10d2b5011bdb5abaebaec5431 100644
--- a/python/paddle/v2/fluid/tests/test_variable.py
+++ b/python/paddle/v2/fluid/tests/test_variable.py
@@ -1,5 +1,5 @@
 import unittest
-from paddle.v2.fluid.framework import g_main_program, Program, convert_np_dtype_to_dtype_
+from paddle.v2.fluid.framework import default_main_program, Program, convert_np_dtype_to_dtype_
 import paddle.v2.fluid.core as core
 import numpy as np
@@ -18,7 +18,7 @@ class TestVariable(unittest.TestCase):
         self.assertRaises(ValueError, lambda: convert("int8"))

     def test_var(self):
-        b = g_main_program.current_block()
+        b = default_main_program().current_block()
         w = b.create_var(
             dtype="float64", shape=[784, 100], lod_level=0, name="fc.w")
         self.assertNotEqual(str(w), "")
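The change above replaces the g_main_program module global with the default_main_program() accessor. A minimal sketch of the new call style, mirroring the test:

    from paddle.v2.fluid.framework import default_main_program

    # default_main_program() returns the Program that variables and ops
    # attach to when no program is passed explicitly.
    block = default_main_program().current_block()
    w = block.create_var(
        dtype="float64", shape=[784, 100], lod_level=0, name="fc.w")
    print(w.name)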
diff --git a/python/paddle/v2/fluid/tests/test_while_op.py b/python/paddle/v2/fluid/tests/test_while_op.py
index fca0cdcc319ff661ced33b6bcd242c894941576c..033b03a4957131e1155c61e8ed2f10eefb23fda4 100644
--- a/python/paddle/v2/fluid/tests/test_while_op.py
+++ b/python/paddle/v2/fluid/tests/test_while_op.py
@@ -55,19 +55,10 @@ class TestWhileOp(unittest.TestCase):
         for i in xrange(3):
             d.append(numpy.random.random(size=[10]).astype('float32'))

-        d_tensor = []
-        for item in d:
-            t = core.LoDTensor()
-            t.set(item, cpu)
-            d_tensor.append(t)
-
-        outs = map(numpy.array,
-                   exe.run(feed={
-                       'd0': d_tensor[0],
-                       'd1': d_tensor[1],
-                       'd2': d_tensor[2]
-                   },
-                           fetch_list=[sum_result]))
+        outs = exe.run(feed={'d0': d[0],
+                             'd1': d[1],
+                             'd2': d[2]},
+                       fetch_list=[sum_result])

         self.assertAlmostEqual(numpy.sum(d), numpy.sum(outs[0]), delta=0.01)
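The simplification above relies on the updated Executor.run accepting plain numpy arrays as feeds (it wraps them into LoDTensors internally) and returning numpy arrays unless return_numpy=False is passed, as the LoD-tensor tests earlier in this patch do. A minimal sketch of the resulting call pattern, assuming a program with inputs 'd0'..'d2' and a fetch target sum_result has been built as in the test:

    import numpy as np
    import paddle.v2.fluid.core as core
    from paddle.v2.fluid.executor import Executor

    exe = Executor(core.CPUPlace())
    d = [np.random.random(size=[10]).astype('float32') for _ in range(3)]
    # outs = exe.run(feed={'d0': d[0], 'd1': d[1], 'd2': d[2]},
    #                fetch_list=[sum_result])
    # outs[0] is already a numpy array; no map(numpy.array, ...) wrapper needed.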