diff --git a/.gitignore b/.gitignore
index 35bed0accdaa274f5966ca5b4b7180106325449b..1c9730a5ad57cd70613c0692529bcb1ccf056d59 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,6 @@ build/
 .pydevproject
 Makefile
 .test_env/
+
+*~
+bazel-*
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 942669c41ff154c91e88c937739b0f604f21d545..b9902a863d864b28f0fad0fefe64248e356010e4 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,7 +6,8 @@
 -   repo: https://github.com/reyoung/mirrors-yapf.git
     sha: v0.13.2
     hooks:
-    -   id: yapf
+    - id: yapf
+      files: (.*\.(py|bzl)|BUILD|.*\.BUILD|WORKSPACE)$  # Bazel BUILD files follow Python syntax.
 -   repo: https://github.com/pre-commit/pre-commit-hooks
     sha: 7539d8bd1a00a3c1bfd34cdb606d3a6372e83469
     hooks:
diff --git a/.travis.yml b/.travis.yml
index 6215060e336c7cff9689951c918dc7ec02b2a2fb..5b14f8e61e6143bb22a3aad5e0a9b11688b1b4be 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -8,10 +8,13 @@ os:
 env:
   - JOB=DOCS
   - JOB=BUILD_AND_TEST
+  - JOB=PRE_COMMIT
 matrix:
   exclude:
     - os: osx
-      env: JOB=DOCS  # Only generate documentation in linux
+      env: JOB=DOCS  # Only generate documentation in linux.
+    - os: osx
+      env: JOB=PRE_COMMIT # Only check pre-commit hook in linux
 
 addons:
   apt:
@@ -26,10 +29,6 @@ addons:
       - python-pip
       - python2.7-dev
       - m4
-      - libprotobuf-dev
-      - doxygen
-      - protobuf-compiler
-      - python-protobuf
       - python-numpy
       - python-wheel
       - libgoogle-glog-dev
@@ -39,18 +38,25 @@ addons:
       - lcov
       - graphviz
       - swig
+      - clang-format-3.8
+      - automake
+      - libtool
 before_install:
   - |
     if [ ${JOB} == "BUILD_AND_TEST" ]; then
-      if ! git diff --name-only $TRAVIS_COMMIT_RANGE | grep -qvE '(\.md$)|(\.rst$)|(\.jpg$)|(\.png$)'
-      then
-        echo "Only markdown docs were updated, stopping build process."
-        exit
+      local change_list=`git diff --name-only $TRAVIS_COMMIT_RANGE`
+      if [ $? -eq 0 ]; then  # if git diff return no zero, then rerun unit test.
+        if ! echo ${change_list} | grep -qvE '(\.md$)|(\.rst$)|(\.jpg$)|(\.png$)'
+        then
+          echo "Only markdown docs were updated, stopping build process."
+          exit
+        fi
       fi
     fi
   - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
   - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
-  - pip install wheel protobuf sphinx breathe recommonmark virtualenv numpy sphinx_rtd_theme
+  - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
+  - pip install wheel protobuf sphinx recommonmark virtualenv numpy sphinx_rtd_theme pre-commit
 script:
   - paddle/scripts/travis/main.sh
 notifications:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0a44e56719baa433a5c45df2082fa86296b3da1c..65fbbb481c432f7b905f4dec7ea39c51ec853ae8 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -11,7 +11,7 @@ find_package(Protobuf REQUIRED)
 
 # Check protobuf library version.
 execute_process(COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --version
-	OUTPUT_VARIABLE PROTOBUF_VERSION)
+    OUTPUT_VARIABLE PROTOBUF_VERSION)
 string(REPLACE "libprotoc " "" PROTOBUF_VERSION ${PROTOBUF_VERSION})
 
 set(PROTOBUF_3 OFF)
@@ -25,8 +25,8 @@ find_package(ZLIB REQUIRED)
 find_package(NumPy REQUIRED)
 find_package(Threads REQUIRED)
 find_package(AVX QUIET)
-find_package(Glog)
-find_package(Gflags QUIET)
+find_package(Glog REQUIRED)
+find_package(Gflags REQUIRED)
 find_package(GTest)
 find_package(Sphinx)
 find_package(Doxygen)
@@ -40,8 +40,6 @@ option(WITH_AVX "Compile PaddlePaddle with avx intrinsics" ${AVX_FOUND})
 option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
 option(WITH_STYLE_CHECK "Style Check for PaddlePaddle" ${PYTHONINTERP_FOUND})
 option(WITH_RDMA "Compile PaddlePaddle with rdma support" OFF)
-option(WITH_GLOG "Compile PaddlePaddle use glog, otherwise use a log implement internally" ${LIBGLOG_FOUND})
-option(WITH_GFLAGS "Compile PaddlePaddle use gflags, otherwise use a flag implement internally" ${GFLAGS_FOUND})
 option(WITH_TIMER "Compile PaddlePaddle use timer" OFF)
 option(WITH_PROFILER "Compile PaddlePaddle use gpu profiler" OFF)
 option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND})
@@ -136,16 +134,12 @@ else(WITH_RDMA)
   add_definitions(-DPADDLE_DISABLE_RDMA)
 endif(WITH_RDMA)
 
-if(WITH_GLOG)
-    add_definitions(-DPADDLE_USE_GLOG)
-    include_directories(${LIBGLOG_INCLUDE_DIR})
-endif()
+# glog
+include_directories(${LIBGLOG_INCLUDE_DIR})
 
-if(WITH_GFLAGS)
-    add_definitions(-DPADDLE_USE_GFLAGS)
-    add_definitions(-DGFLAGS_NS=${GFLAGS_NAMESPACE})
-    include_directories(${GFLAGS_INCLUDE_DIRS})
-endif()
+#gflags
+add_definitions(-DGFLAGS_NS=${GFLAGS_NAMESPACE})
+include_directories(${GFLAGS_INCLUDE_DIRS})
 
 if(WITH_TESTING)
     enable_testing()
@@ -169,5 +163,4 @@ add_subdirectory(paddle)
 add_subdirectory(python)
 if(WITH_DOC)
     add_subdirectory(doc)
-    add_subdirectory(doc_cn)
 endif()
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000000000000000000000000000000000000..0d4bb973ae87bb45ef4386a63c26ed62602f2cee
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1 @@
+./doc/howto/dev/contribute_to_paddle_en.md
diff --git a/WORKSPACE b/WORKSPACE
new file mode 100644
index 0000000000000000000000000000000000000000..f097c41da85affd1ff0b24200dbdbc63bf9c3ab6
--- /dev/null
+++ b/WORKSPACE
@@ -0,0 +1,31 @@
+# External dependency to Google protobuf.
+http_archive(
+    name="protobuf",
+    url="http://github.com/google/protobuf/archive/v3.1.0.tar.gz",
+    sha256="0a0ae63cbffc274efb573bdde9a253e3f32e458c41261df51c5dbc5ad541e8f7",
+    strip_prefix="protobuf-3.1.0")
+
+# External dependency to gtest 1.7.0.  This method comes from
+# https://www.bazel.io/versions/master/docs/tutorial/cpp.html.
+new_http_archive(
+    name="gtest",
+    url="https://github.com/google/googletest/archive/release-1.7.0.zip",
+    sha256="b58cb7547a28b2c718d1e38aee18a3659c9e3ff52440297e965f5edffe34b6d0",
+    build_file="third_party/gtest.BUILD",
+    strip_prefix="googletest-release-1.7.0")
+
+# External dependency to gflags.  This method comes from
+# https://github.com/gflags/example/blob/master/WORKSPACE.
+new_git_repository(
+    name="gflags",
+    tag="v2.2.0",
+    remote="https://github.com/gflags/gflags.git",
+    build_file="third_party/gflags.BUILD")
+
+# External dependency to glog.  This method comes from
+# https://github.com/reyoung/bazel_playground/blob/master/WORKSPACE
+new_git_repository(
+    name="glog",
+    remote="https://github.com/google/glog.git",
+    commit="b6a5e0524c28178985f0d228e9eaa43808dbec3c",
+    build_file="third_party/glog.BUILD")
diff --git a/benchmark/tensorflow/rnn/run_multi.sh b/benchmark/tensorflow/rnn/run_multi.sh
index f7f52e01e38d304bb3bf8185c53bd0da26014d3a..c2d7dd597e6da54cd5c4cda311fbbd18486b4647 100755
--- a/benchmark/tensorflow/rnn/run_multi.sh
+++ b/benchmark/tensorflow/rnn/run_multi.sh
@@ -25,4 +25,3 @@ test 4 2 256 512
 test 4 2 512 128 
 test 4 2 512 256 
 test 4 2 512 512 
-
diff --git a/cmake/FindSphinx.cmake b/cmake/FindSphinx.cmake
index 6702f45a168bf0dfc6cfca3ff8e68fbc79c92b11..d319442ef10b38b9edf5844e5540a92c7094c7ce 100644
--- a/cmake/FindSphinx.cmake
+++ b/cmake/FindSphinx.cmake
@@ -72,6 +72,7 @@ function( Sphinx_add_target target_name builder conf cache source destination )
     ${source}
     ${destination}
     COMMENT "Generating sphinx documentation: ${builder}"
+    COMMAND ln -sf ${destination}/index_*.html ${destination}/index.html
     )
 
   set_property(
@@ -143,4 +144,4 @@ function( Sphinx_add_targets target_base_name conf source base_destination )
 
     add_dependencies( ${target_base_name}_linkcheck ${_dependencies} )
   endif()
-endfunction()
\ No newline at end of file
+endfunction()
diff --git a/cmake/check_packages.cmake b/cmake/check_packages.cmake
index 3bc0c1fd18448e3c2f0799295ac77d57cdc1bee7..afb84c6ff52af05769a99246d2e93380832c04e0 100644
--- a/cmake/check_packages.cmake
+++ b/cmake/check_packages.cmake
@@ -14,13 +14,9 @@ if(WITH_STYLE_CHECK)
   find_package(PythonInterp REQUIRED)
 endif()
 
-if(WITH_GLOG)
-  find_package(Glog REQUIRED)
-endif()
+find_package(Glog REQUIRED)
 
-if(WITH_GFLAGS)
-  find_package(Gflags REQUIRED)
-endif()
+find_package(Gflags REQUIRED)
 
 if(WITH_TESTING)
   find_package(GTest REQUIRED)
@@ -28,9 +24,7 @@ endif()
 
 if(WITH_DOC)
   find_package(Sphinx REQUIRED)
-  find_package(Doxygen REQUIRED)
   find_python_module(recommonmark REQUIRED)
-  find_python_module(breathe REQUIRED)
 endif()
 
 if(WITH_SWIG_PY)
diff --git a/cmake/util.cmake b/cmake/util.cmake
index 11641f6064b9db36e14293460a1f05067e373661..38366373c6dbcc1d05c359484ae73ace1bbc59be 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -65,7 +65,7 @@ endmacro()
 # link_paddle_exe
 # add paddle library for a paddle executable, such as trainer, pserver.
 #
-# It will handle WITH_PYTHON/WITH_GLOG etc.
+# It will handle WITH_PYTHON etc.
 function(link_paddle_exe TARGET_NAME)
     if(WITH_RDMA)
         generate_rdma_links()
@@ -108,6 +108,8 @@ function(link_paddle_exe TARGET_NAME)
         paddle_cuda
         ${METRIC_LIBS}
         ${PROTOBUF_LIBRARY}
+        ${LIBGLOG_LIBRARY}
+        ${GFLAGS_LIBRARIES}
         ${CMAKE_THREAD_LIBS_INIT}
         ${CBLAS_LIBS}
         ${ZLIB_LIBRARIES}
@@ -119,27 +121,17 @@ function(link_paddle_exe TARGET_NAME)
             ${RDMA_LD_FLAGS}
             ${RDMA_LIBS})
     endif()
-    
+
     if(WITH_PYTHON)
         target_link_libraries(${TARGET_NAME}
             ${PYTHON_LIBRARIES})
     endif()
 
-    if(WITH_GLOG)
-        target_link_libraries(${TARGET_NAME}
-            ${LIBGLOG_LIBRARY})
-    endif()
-
-    if(WITH_GFLAGS)
-        target_link_libraries(${TARGET_NAME}
-            ${GFLAGS_LIBRARIES})
-    endif()
-
     if(WITH_GPU)
-        if(NOT WITH_DSO OR WITH_METRIC) 
+        if(NOT WITH_DSO OR WITH_METRIC)
             target_link_libraries(${TARGET_NAME}
                 ${CUDNN_LIBRARY}
-                ${CUDA_curand_LIBRARY}) 
+                ${CUDA_curand_LIBRARY})
             CUDA_ADD_CUBLAS_TO_TARGET(${TARGET_NAME})
         endif()
 
@@ -206,5 +198,5 @@ function(create_resources res_file output)
     # Convert hex data for C compatibility
     string(REGEX REPLACE "([0-9a-f][0-9a-f])" "0x\\1," filedata ${filedata})
     # Append data to output file
-    file(APPEND ${output} "const unsigned char ${filename}[] = {${filedata}};\nconst unsigned ${filename}_size = sizeof(${filename});\n")
+    file(APPEND ${output} "const unsigned char ${filename}[] = {${filedata}0};\nconst unsigned ${filename}_size = sizeof(${filename});\n")
 endfunction()
diff --git a/demo/gan/README.md b/demo/gan/README.md
index fdc970a07b488c3a4146c9baa76a133a456fc9ab..1908b534b0c1f63904d5503399b961d74ce0037c 100644
--- a/demo/gan/README.md
+++ b/demo/gan/README.md
@@ -10,4 +10,4 @@ Then you can run the command below. The flag -d specifies the training data (cif
 $python gan_trainer.py -d cifar --use_gpu 1
 
 The generated images will be stored in ./cifar_samples/
-The corresponding models will be stored in ./cifar_params/
\ No newline at end of file
+The corresponding models will be stored in ./cifar_params/
diff --git a/demo/gan/data/download_cifar.sh b/demo/gan/data/download_cifar.sh
index 32e73b3d8e50ec845c79e4ce93f220583f364360..bbadc7c10c73e45a0948018b8812f79040d14bc4 100755
--- a/demo/gan/data/download_cifar.sh
+++ b/demo/gan/data/download_cifar.sh
@@ -1,3 +1,4 @@
+#!/bin/bash
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -15,4 +16,3 @@ set -e
 wget https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz
 tar zxf cifar-10-python.tar.gz
 rm cifar-10-python.tar.gz
-
diff --git a/demo/gan/data/get_mnist_data.sh b/demo/gan/data/get_mnist_data.sh
old mode 100644
new mode 100755
index d21bf7067135f1f8be486ef0f13fc3ec94ffc4ed..a77c81bf5af9ddb6634ff89460797ca543c5e517
--- a/demo/gan/data/get_mnist_data.sh
+++ b/demo/gan/data/get_mnist_data.sh
@@ -15,5 +15,3 @@ do
         gunzip ${fname}.gz
     fi
 done
-
-
diff --git a/demo/gan/gan_conf.py b/demo/gan/gan_conf.py
index 58ba9dde58bafb90a4bd1d76f5d8138e8948dd3a..86ac2dffe5f4490a88e12d1fa5e8cd9fa61a69f4 100644
--- a/demo/gan/gan_conf.py
+++ b/demo/gan/gan_conf.py
@@ -14,10 +14,9 @@
 from paddle.trainer_config_helpers import *
 
 mode = get_config_arg("mode", str, "generator")
-assert mode in set(["generator",
-                    "discriminator",
-                    "generator_training",
-                    "discriminator_training"])
+assert mode in set([
+    "generator", "discriminator", "generator_training", "discriminator_training"
+])
 
 is_generator_training = mode == "generator_training"
 is_discriminator_training = mode == "discriminator_training"
@@ -38,8 +37,8 @@ sample_dim = 2
 settings(
     batch_size=128,
     learning_rate=1e-4,
-    learning_method=AdamOptimizer(beta1=0.5)
-)
+    learning_method=AdamOptimizer(beta1=0.5))
+
 
 def discriminator(sample):
     """
@@ -50,70 +49,87 @@ def discriminator(sample):
     of the sample is from real data.
     """
     param_attr = ParamAttr(is_static=is_generator_training)
-    bias_attr = ParamAttr(is_static=is_generator_training,
-                          initial_mean=1.0,
-                          initial_std=0)
-
-    hidden = fc_layer(input=sample, name="dis_hidden", size=hidden_dim,
-                    bias_attr=bias_attr,
-                    param_attr=param_attr,
-                    act=ReluActivation())
-
-    hidden2 = fc_layer(input=hidden, name="dis_hidden2", size=hidden_dim,
-                    bias_attr=bias_attr,
-                    param_attr=param_attr,
-                    act=LinearActivation())
-    
-    hidden_bn = batch_norm_layer(hidden2, 
-                     act=ReluActivation(), 
-                     name="dis_hidden_bn", 
-                     bias_attr=bias_attr, 
-                     param_attr=ParamAttr(is_static=is_generator_training,
-                           initial_mean=1.0,
-                           initial_std=0.02),
-                     use_global_stats=False)
-    
-    return fc_layer(input=hidden_bn, name="dis_prob", size=2,
-                    bias_attr=bias_attr,
-                    param_attr=param_attr,
-                    act=SoftmaxActivation())
+    bias_attr = ParamAttr(
+        is_static=is_generator_training, initial_mean=1.0, initial_std=0)
+
+    hidden = fc_layer(
+        input=sample,
+        name="dis_hidden",
+        size=hidden_dim,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        act=ReluActivation())
+
+    hidden2 = fc_layer(
+        input=hidden,
+        name="dis_hidden2",
+        size=hidden_dim,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        act=LinearActivation())
+
+    hidden_bn = batch_norm_layer(
+        hidden2,
+        act=ReluActivation(),
+        name="dis_hidden_bn",
+        bias_attr=bias_attr,
+        param_attr=ParamAttr(
+            is_static=is_generator_training, initial_mean=1.0,
+            initial_std=0.02),
+        use_global_stats=False)
+
+    return fc_layer(
+        input=hidden_bn,
+        name="dis_prob",
+        size=2,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        act=SoftmaxActivation())
+
 
 def generator(noise):
     """
     generator generates a sample given noise
     """
     param_attr = ParamAttr(is_static=is_discriminator_training)
-    bias_attr = ParamAttr(is_static=is_discriminator_training,
-                           initial_mean=1.0,
-                           initial_std=0)
-    
-    hidden = fc_layer(input=noise,
-                    name="gen_layer_hidden",
-                    size=hidden_dim,
-                    bias_attr=bias_attr,
-                    param_attr=param_attr,
-                    act=ReluActivation())
-
-    hidden2 = fc_layer(input=hidden, name="gen_hidden2", size=hidden_dim,
-                    bias_attr=bias_attr,
-                    param_attr=param_attr,
-                    act=LinearActivation())
-    
-    hidden_bn = batch_norm_layer(hidden2, 
-                     act=ReluActivation(), 
-                     name="gen_layer_hidden_bn", 
-                     bias_attr=bias_attr, 
-                     param_attr=ParamAttr(is_static=is_discriminator_training,
-                           initial_mean=1.0,
-                           initial_std=0.02),
-                     use_global_stats=False)
-    
-    return fc_layer(input=hidden_bn,
-                    name="gen_layer1",
-                    size=sample_dim,
-                    bias_attr=bias_attr,
-                    param_attr=param_attr,
-                    act=LinearActivation())
+    bias_attr = ParamAttr(
+        is_static=is_discriminator_training, initial_mean=1.0, initial_std=0)
+
+    hidden = fc_layer(
+        input=noise,
+        name="gen_layer_hidden",
+        size=hidden_dim,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        act=ReluActivation())
+
+    hidden2 = fc_layer(
+        input=hidden,
+        name="gen_hidden2",
+        size=hidden_dim,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        act=LinearActivation())
+
+    hidden_bn = batch_norm_layer(
+        hidden2,
+        act=ReluActivation(),
+        name="gen_layer_hidden_bn",
+        bias_attr=bias_attr,
+        param_attr=ParamAttr(
+            is_static=is_discriminator_training,
+            initial_mean=1.0,
+            initial_std=0.02),
+        use_global_stats=False)
+
+    return fc_layer(
+        input=hidden_bn,
+        name="gen_layer1",
+        size=sample_dim,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        act=LinearActivation())
+
 
 if is_generator_training:
     noise = data_layer(name="noise", size=noise_dim)
@@ -126,7 +142,8 @@ if is_generator_training or is_discriminator_training:
     label = data_layer(name="label", size=1)
     prob = discriminator(sample)
     cost = cross_entropy(input=prob, label=label)
-    classification_error_evaluator(input=prob, label=label, name=mode+'_error')
+    classification_error_evaluator(
+        input=prob, label=label, name=mode + '_error')
     outputs(cost)
 
 if is_generator:
diff --git a/demo/gan/gan_conf_image.py b/demo/gan/gan_conf_image.py
index 5c2b140537418d52760719c7b605e778790cb7a6..f89a4e706c3b7eeaa7858f54f8fa04a5e038b66e 100644
--- a/demo/gan/gan_conf_image.py
+++ b/demo/gan/gan_conf_image.py
@@ -15,10 +15,9 @@ from paddle.trainer_config_helpers import *
 
 mode = get_config_arg("mode", str, "generator")
 dataSource = get_config_arg("data", str, "mnist")
-assert mode in set(["generator",
-                    "discriminator",
-                    "generator_training",
-                    "discriminator_training"])
+assert mode in set([
+    "generator", "discriminator", "generator_training", "discriminator_training"
+])
 
 is_generator_training = mode == "generator_training"
 is_discriminator_training = mode == "discriminator_training"
@@ -36,24 +35,33 @@ noise_dim = 100
 gf_dim = 64
 df_dim = 64
 if dataSource == "mnist":
-    sample_dim = 28 # image dim
-    c_dim = 1 # image color
+    sample_dim = 28  # image dim
+    c_dim = 1  # image color
 else:
     sample_dim = 32
     c_dim = 3
-s2, s4 = int(sample_dim/2), int(sample_dim/4), 
-s8, s16 = int(sample_dim/8), int(sample_dim/16)
+s2, s4 = int(sample_dim / 2), int(sample_dim / 4),
+s8, s16 = int(sample_dim / 8), int(sample_dim / 16)
 
 settings(
     batch_size=128,
     learning_rate=2e-4,
-    learning_method=AdamOptimizer(beta1=0.5)
-)
+    learning_method=AdamOptimizer(beta1=0.5))
 
-def conv_bn(input, channels, imgSize, num_filters, output_x, stride, name, 
-                 param_attr, bias_attr, param_attr_bn, bn, trans=False, 
-                 act=ReluActivation()):
-    
+
+def conv_bn(input,
+            channels,
+            imgSize,
+            num_filters,
+            output_x,
+            stride,
+            name,
+            param_attr,
+            bias_attr,
+            param_attr_bn,
+            bn,
+            trans=False,
+            act=ReluActivation()):
     """
     conv_bn is a utility function that constructs a convolution/deconv layer 
     with an optional batch_norm layer
@@ -63,10 +71,10 @@ def conv_bn(input, channels, imgSize, num_filters, output_x, stride, name,
     :param trans: whether to use conv (False) or deconv (True)
     :type trans: bool
     """
-    
+
     # calculate the filter_size and padding size based on the given
     # imgSize and ouput size
-    tmp =  imgSize - (output_x - 1) * stride
+    tmp = imgSize - (output_x - 1) * stride
     if tmp <= 1 or tmp > 5:
         raise ValueError("conv input-output dimension does not fit")
     elif tmp <= 3:
@@ -76,111 +84,134 @@ def conv_bn(input, channels, imgSize, num_filters, output_x, stride, name,
         filter_size = tmp
         padding = 0
 
-    print (imgSize, output_x, stride, filter_size, padding)
-    
+    print(imgSize, output_x, stride, filter_size, padding)
+
     if trans:
         nameApx = "_conv"
     else:
         nameApx = "_convt"
-    
+
     if bn:
-        conv = img_conv_layer(input, filter_size=filter_size, 
-                   num_filters=num_filters,
-                   name=name + nameApx, num_channels=channels,
-                   act=LinearActivation(), groups=1, stride=stride, 
-                   padding=padding, bias_attr=bias_attr,
-                   param_attr=param_attr, shared_biases=True, layer_attr=None,
-                   filter_size_y=None, stride_y=None, padding_y=None, 
-                   trans=trans)
-        
-        conv_bn = batch_norm_layer(conv, 
-                         act=act, 
-                         name=name + nameApx + "_bn", 
-                         bias_attr=bias_attr, 
-                         param_attr=param_attr_bn,
-                         use_global_stats=False)
-        
+        conv = img_conv_layer(
+            input,
+            filter_size=filter_size,
+            num_filters=num_filters,
+            name=name + nameApx,
+            num_channels=channels,
+            act=LinearActivation(),
+            groups=1,
+            stride=stride,
+            padding=padding,
+            bias_attr=bias_attr,
+            param_attr=param_attr,
+            shared_biases=True,
+            layer_attr=None,
+            filter_size_y=None,
+            stride_y=None,
+            padding_y=None,
+            trans=trans)
+
+        conv_bn = batch_norm_layer(
+            conv,
+            act=act,
+            name=name + nameApx + "_bn",
+            bias_attr=bias_attr,
+            param_attr=param_attr_bn,
+            use_global_stats=False)
+
         return conv_bn
     else:
-        conv = img_conv_layer(input, filter_size=filter_size, 
-                   num_filters=num_filters,
-                   name=name + nameApx, num_channels=channels,
-                   act=act, groups=1, stride=stride, 
-                   padding=padding, bias_attr=bias_attr,
-                   param_attr=param_attr, shared_biases=True, layer_attr=None,
-                   filter_size_y=None, stride_y=None, padding_y=None,
-                   trans=trans)
+        conv = img_conv_layer(
+            input,
+            filter_size=filter_size,
+            num_filters=num_filters,
+            name=name + nameApx,
+            num_channels=channels,
+            act=act,
+            groups=1,
+            stride=stride,
+            padding=padding,
+            bias_attr=bias_attr,
+            param_attr=param_attr,
+            shared_biases=True,
+            layer_attr=None,
+            filter_size_y=None,
+            stride_y=None,
+            padding_y=None,
+            trans=trans)
         return conv
-    
+
+
 def generator(noise):
     """
     generator generates a sample given noise
     """
-    param_attr = ParamAttr(is_static=is_discriminator_training,
-                           initial_mean=0.0,
-                           initial_std=0.02)
-    bias_attr = ParamAttr(is_static=is_discriminator_training,
-                           initial_mean=0.0,
-                           initial_std=0.0)
-    
-    param_attr_bn=ParamAttr(is_static=is_discriminator_training,
-                           initial_mean=1.0,
-                           initial_std=0.02)
-    
-    h1 = fc_layer(input=noise,
-                    name="gen_layer_h1",
-                    size=s8 * s8 * gf_dim * 4,
-                    bias_attr=bias_attr,
-                    param_attr=param_attr,
-                    act=LinearActivation())
-    
-    h1_bn = batch_norm_layer(h1, 
-                     act=ReluActivation(), 
-                     name="gen_layer_h1_bn", 
-                     bias_attr=bias_attr, 
-                     param_attr=param_attr_bn,
-                     use_global_stats=False)
-    
-    h2_bn = conv_bn(h1_bn, 
-                    channels=gf_dim*4, 
-                    output_x=s8,
-                    num_filters=gf_dim*2, 
-                    imgSize=s4,
-                    stride=2,
-                    name="gen_layer_h2", 
-                    param_attr=param_attr, 
-                    bias_attr=bias_attr, 
-                    param_attr_bn=param_attr_bn,
-                    bn=True,
-                    trans=True)
-    
-    h3_bn = conv_bn(h2_bn, 
-                    channels=gf_dim*2, 
-                    output_x=s4,
-                    num_filters=gf_dim, 
-                    imgSize=s2,
-                    stride=2,
-                    name="gen_layer_h3", 
-                    param_attr=param_attr, 
-                    bias_attr=bias_attr, 
-                    param_attr_bn=param_attr_bn,
-                    bn=True,
-                    trans=True)
-     
-    
-    return conv_bn(h3_bn,
-                   channels=gf_dim, 
-                   output_x=s2,
-                   num_filters=c_dim, 
-                   imgSize=sample_dim,
-                   stride=2,
-                   name="gen_layer_h4", 
-                   param_attr=param_attr, 
-                   bias_attr=bias_attr, 
-                   param_attr_bn=param_attr_bn,
-                   bn=False,
-                   trans=True,
-                   act=TanhActivation())
+    param_attr = ParamAttr(
+        is_static=is_discriminator_training, initial_mean=0.0, initial_std=0.02)
+    bias_attr = ParamAttr(
+        is_static=is_discriminator_training, initial_mean=0.0, initial_std=0.0)
+
+    param_attr_bn = ParamAttr(
+        is_static=is_discriminator_training, initial_mean=1.0, initial_std=0.02)
+
+    h1 = fc_layer(
+        input=noise,
+        name="gen_layer_h1",
+        size=s8 * s8 * gf_dim * 4,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        act=LinearActivation())
+
+    h1_bn = batch_norm_layer(
+        h1,
+        act=ReluActivation(),
+        name="gen_layer_h1_bn",
+        bias_attr=bias_attr,
+        param_attr=param_attr_bn,
+        use_global_stats=False)
+
+    h2_bn = conv_bn(
+        h1_bn,
+        channels=gf_dim * 4,
+        output_x=s8,
+        num_filters=gf_dim * 2,
+        imgSize=s4,
+        stride=2,
+        name="gen_layer_h2",
+        param_attr=param_attr,
+        bias_attr=bias_attr,
+        param_attr_bn=param_attr_bn,
+        bn=True,
+        trans=True)
+
+    h3_bn = conv_bn(
+        h2_bn,
+        channels=gf_dim * 2,
+        output_x=s4,
+        num_filters=gf_dim,
+        imgSize=s2,
+        stride=2,
+        name="gen_layer_h3",
+        param_attr=param_attr,
+        bias_attr=bias_attr,
+        param_attr_bn=param_attr_bn,
+        bn=True,
+        trans=True)
+
+    return conv_bn(
+        h3_bn,
+        channels=gf_dim,
+        output_x=s2,
+        num_filters=c_dim,
+        imgSize=sample_dim,
+        stride=2,
+        name="gen_layer_h4",
+        param_attr=param_attr,
+        bias_attr=bias_attr,
+        param_attr_bn=param_attr_bn,
+        bn=False,
+        trans=True,
+        act=TanhActivation())
 
 
 def discriminator(sample):
@@ -191,58 +222,60 @@ def discriminator(sample):
     of the sample is from generator and dimension 1 is the probabblity
     of the sample is from real data.
     """
-    param_attr = ParamAttr(is_static=is_generator_training,
-                           initial_mean=0.0,
-                           initial_std=0.02)
-    bias_attr = ParamAttr(is_static=is_generator_training,
-                          initial_mean=0.0,
-                          initial_std=0.0)
-    
-    param_attr_bn=ParamAttr(is_static=is_generator_training,
-                           initial_mean=1.0,
-                           initial_std=0.02)
-    
-    h0 = conv_bn(sample, 
-                 channels=c_dim, 
-                 imgSize=sample_dim,
-                 num_filters=df_dim, 
-                 output_x=s2, 
-                 stride=2, 
-                 name="dis_h0", 
-                 param_attr=param_attr, 
-                 bias_attr=bias_attr, 
-                 param_attr_bn=param_attr_bn, 
-                 bn=False)
-    
-    h1_bn = conv_bn(h0, 
-                 channels=df_dim,
-                 imgSize=s2,
-                 num_filters=df_dim*2, 
-                 output_x=s4, 
-                 stride=2, 
-                 name="dis_h1", 
-                 param_attr=param_attr, 
-                 bias_attr=bias_attr, 
-                 param_attr_bn=param_attr_bn, 
-                 bn=True)
-
-    h2_bn = conv_bn(h1_bn, 
-                 channels=df_dim*2,
-                 imgSize=s4,
-                 num_filters=df_dim*4, 
-                 output_x=s8, 
-                 stride=2, 
-                 name="dis_h2", 
-                 param_attr=param_attr, 
-                 bias_attr=bias_attr, 
-                 param_attr_bn=param_attr_bn, 
-                 bn=True)
-        
-    return fc_layer(input=h2_bn, name="dis_prob", size=2,
-                    bias_attr=bias_attr,
-                    param_attr=param_attr,
-                    act=SoftmaxActivation())
+    param_attr = ParamAttr(
+        is_static=is_generator_training, initial_mean=0.0, initial_std=0.02)
+    bias_attr = ParamAttr(
+        is_static=is_generator_training, initial_mean=0.0, initial_std=0.0)
+
+    param_attr_bn = ParamAttr(
+        is_static=is_generator_training, initial_mean=1.0, initial_std=0.02)
+
+    h0 = conv_bn(
+        sample,
+        channels=c_dim,
+        imgSize=sample_dim,
+        num_filters=df_dim,
+        output_x=s2,
+        stride=2,
+        name="dis_h0",
+        param_attr=param_attr,
+        bias_attr=bias_attr,
+        param_attr_bn=param_attr_bn,
+        bn=False)
+
+    h1_bn = conv_bn(
+        h0,
+        channels=df_dim,
+        imgSize=s2,
+        num_filters=df_dim * 2,
+        output_x=s4,
+        stride=2,
+        name="dis_h1",
+        param_attr=param_attr,
+        bias_attr=bias_attr,
+        param_attr_bn=param_attr_bn,
+        bn=True)
+
+    h2_bn = conv_bn(
+        h1_bn,
+        channels=df_dim * 2,
+        imgSize=s4,
+        num_filters=df_dim * 4,
+        output_x=s8,
+        stride=2,
+        name="dis_h2",
+        param_attr=param_attr,
+        bias_attr=bias_attr,
+        param_attr_bn=param_attr_bn,
+        bn=True)
 
+    return fc_layer(
+        input=h2_bn,
+        name="dis_prob",
+        size=2,
+        bias_attr=bias_attr,
+        param_attr=param_attr,
+        act=SoftmaxActivation())
 
 
 if is_generator_training:
@@ -250,13 +283,14 @@ if is_generator_training:
     sample = generator(noise)
 
 if is_discriminator_training:
-    sample = data_layer(name="sample", size=sample_dim * sample_dim*c_dim)
+    sample = data_layer(name="sample", size=sample_dim * sample_dim * c_dim)
 
 if is_generator_training or is_discriminator_training:
     label = data_layer(name="label", size=1)
     prob = discriminator(sample)
     cost = cross_entropy(input=prob, label=label)
-    classification_error_evaluator(input=prob, label=label, name=mode+'_error')
+    classification_error_evaluator(
+        input=prob, label=label, name=mode + '_error')
     outputs(cost)
 
 if is_generator:
diff --git a/demo/gan/gan_trainer.py b/demo/gan/gan_trainer.py
index a8c1bd0414529f48feb23bdb850751782de52c04..4a26c230f7a21cc6dd4a3cdb52e32730b1ce73ca 100644
--- a/demo/gan/gan_trainer.py
+++ b/demo/gan/gan_trainer.py
@@ -16,7 +16,7 @@ import argparse
 import random
 import numpy
 import cPickle
-import sys,os
+import sys, os
 from PIL import Image
 
 from paddle.trainer.config_parser import parse_config
@@ -24,6 +24,7 @@ from paddle.trainer.config_parser import logger
 import py_paddle.swig_paddle as api
 import matplotlib.pyplot as plt
 
+
 def plot2DScatter(data, outputfile):
     '''
     Plot the data as a 2D scatter plot and save to outputfile
@@ -41,9 +42,11 @@ def plot2DScatter(data, outputfile):
     plt.scatter(x, y)
     plt.savefig(outputfile, bbox_inches='tight')
 
+
 def CHECK_EQ(a, b):
     assert a == b, "a=%s, b=%s" % (a, b)
 
+
 def copy_shared_parameters(src, dst):
     '''
     copy the parameters from src to dst
@@ -52,11 +55,9 @@ def copy_shared_parameters(src, dst):
     :param dst: the destination of the parameters
     :type dst: GradientMachine
     '''
-    src_params = [src.getParameter(i)
-               for i in xrange(src.getParameterSize())]
+    src_params = [src.getParameter(i) for i in xrange(src.getParameterSize())]
     src_params = dict([(p.getName(), p) for p in src_params])
 
-
     for i in xrange(dst.getParameterSize()):
         dst_param = dst.getParameter(i)
         src_param = src_params.get(dst_param.getName(), None)
@@ -67,15 +68,17 @@ def copy_shared_parameters(src, dst):
         CHECK_EQ(len(src_value), len(dst_value))
         dst_value.copyFrom(src_value)
         dst_param.setValueUpdated()
-        
+
+
 def print_parameters(src):
-    src_params = [src.getParameter(i)
-               for i in xrange(src.getParameterSize())]
+    src_params = [src.getParameter(i) for i in xrange(src.getParameterSize())]
 
     print "***************"
     for p in src_params:
         print "Name is %s" % p.getName()
-        print "value is %s \n" % p.getBuf(api.PARAMETER_VALUE).copyToNumpyArray()
+        print "value is %s \n" % p.getBuf(api.PARAMETER_VALUE).copyToNumpyArray(
+        )
+
 
 def load_mnist_data(imageFile):
     f = open(imageFile, "rb")
@@ -86,33 +89,36 @@ def load_mnist_data(imageFile):
         n = 60000
     else:
         n = 10000
-    
-    data = numpy.fromfile(f, 'ubyte', count=n*28*28).reshape((n, 28*28))
+
+    data = numpy.fromfile(f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28))
     data = data / 255.0 * 2.0 - 1.0
 
     f.close()
     return data.astype('float32')
 
+
 def load_cifar_data(cifar_path):
     batch_size = 10000
-    data = numpy.zeros((5*batch_size, 32*32*3), dtype = "float32")
+    data = numpy.zeros((5 * batch_size, 32 * 32 * 3), dtype="float32")
     for i in range(1, 6):
         file = cifar_path + "/data_batch_" + str(i)
         fo = open(file, 'rb')
         dict = cPickle.load(fo)
         fo.close()
-        data[(i - 1)*batch_size:(i*batch_size), :] = dict["data"]
-    
+        data[(i - 1) * batch_size:(i * batch_size), :] = dict["data"]
+
     data = data / 255.0 * 2.0 - 1.0
     return data
 
+
 # synthesize 2-D uniform data
 def load_uniform_data():
     data = numpy.random.rand(1000000, 2).astype('float32')
     return data
 
+
 def merge(images, size):
-    if images.shape[1] == 28*28:
+    if images.shape[1] == 28 * 28:
         h, w, c = 28, 28, 1
     else:
         h, w, c = 32, 32, 3
@@ -124,6 +130,7 @@ def merge(images, size):
           ((images[idx, :].reshape((h, w, c), order="F").transpose(1, 0, 2) + 1.0) / 2.0 * 255.0)
     return img.astype('uint8')
 
+
 def save_images(images, path):
     merged_img = merge(images, [8, 8])
     if merged_img.shape[2] == 1:
@@ -131,14 +138,17 @@ def save_images(images, path):
     else:
         im = Image.fromarray(merged_img, mode="RGB")
     im.save(path)
-    
+
+
 def get_real_samples(batch_size, data_np):
-    return data_np[numpy.random.choice(data_np.shape[0], batch_size, 
-                                       replace=False),:]
-    
+    return data_np[numpy.random.choice(
+        data_np.shape[0], batch_size, replace=False), :]
+
+
 def get_noise(batch_size, noise_dim):
     return numpy.random.normal(size=(batch_size, noise_dim)).astype('float32')
 
+
 def get_fake_samples(generator_machine, batch_size, noise):
     gen_inputs = api.Arguments.createArguments(1)
     gen_inputs.setSlotValue(0, api.Matrix.createDenseFromNumpy(noise))
@@ -147,12 +157,14 @@ def get_fake_samples(generator_machine, batch_size, noise):
     fake_samples = gen_outputs.getSlotValue(0).copyToNumpyMat()
     return fake_samples
 
+
 def get_training_loss(training_machine, inputs):
     outputs = api.Arguments.createArguments(0)
     training_machine.forward(inputs, outputs, api.PASS_TEST)
     loss = outputs.getSlotValue(0).copyToNumpyMat()
     return numpy.mean(loss)
 
+
 def prepare_discriminator_data_batch_pos(batch_size, data_np):
     real_samples = get_real_samples(batch_size, data_np)
     labels = numpy.ones(batch_size, dtype='int32')
@@ -161,6 +173,7 @@ def prepare_discriminator_data_batch_pos(batch_size, data_np):
     inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(labels))
     return inputs
 
+
 def prepare_discriminator_data_batch_neg(generator_machine, batch_size, noise):
     fake_samples = get_fake_samples(generator_machine, batch_size, noise)
     labels = numpy.zeros(batch_size, dtype='int32')
@@ -169,6 +182,7 @@ def prepare_discriminator_data_batch_neg(generator_machine, batch_size, noise):
     inputs.setSlotIds(1, api.IVector.createVectorFromNumpy(labels))
     return inputs
 
+
 def prepare_generator_data_batch(batch_size, noise):
     label = numpy.ones(batch_size, dtype='int32')
     inputs = api.Arguments.createArguments(2)
@@ -193,10 +207,9 @@ def get_layer_size(model_conf, layer_name):
 def main():
     parser = argparse.ArgumentParser()
     parser.add_argument("-d", "--data_source", help="mnist or cifar or uniform")
-    parser.add_argument("--use_gpu", default="1", 
-                        help="1 means use gpu for training")
-    parser.add_argument("--gpu_id", default="0", 
-                        help="the gpu_id parameter")
+    parser.add_argument(
+        "--use_gpu", default="1", help="1 means use gpu for training")
+    parser.add_argument("--gpu_id", default="0", help="the gpu_id parameter")
     args = parser.parse_args()
     data_source = args.data_source
     use_gpu = args.use_gpu
@@ -208,30 +221,32 @@ def main():
 
     if not os.path.exists("./%s_params/" % data_source):
         os.makedirs("./%s_params/" % data_source)
-        
-    api.initPaddle('--use_gpu=' + use_gpu, '--dot_period=10', '--log_period=100', 
-                   '--gpu_id=' + args.gpu_id, '--save_dir=' + "./%s_params/" % data_source)
-    
+
+    api.initPaddle('--use_gpu=' + use_gpu, '--dot_period=10',
+                   '--log_period=100', '--gpu_id=' + args.gpu_id,
+                   '--save_dir=' + "./%s_params/" % data_source)
+
     if data_source == "uniform":
         conf = "gan_conf.py"
         num_iter = 10000
     else:
         conf = "gan_conf_image.py"
         num_iter = 1000
-        
+
     gen_conf = parse_config(conf, "mode=generator_training,data=" + data_source)
-    dis_conf = parse_config(conf, "mode=discriminator_training,data=" + data_source)
+    dis_conf = parse_config(conf,
+                            "mode=discriminator_training,data=" + data_source)
     generator_conf = parse_config(conf, "mode=generator,data=" + data_source)
     batch_size = dis_conf.opt_config.batch_size
     noise_dim = get_layer_size(gen_conf.model_config, "noise")
-    
+
     if data_source == "mnist":
         data_np = load_mnist_data("./data/mnist_data/train-images-idx3-ubyte")
     elif data_source == "cifar":
         data_np = load_cifar_data("./data/cifar-10-batches-py/")
     else:
         data_np = load_uniform_data()
-    
+
     # this creates a gradient machine for discriminator
     dis_training_machine = api.GradientMachine.createFromConfigProto(
         dis_conf.model_config)
@@ -244,26 +259,24 @@ def main():
     logger.info(str(generator_conf.model_config))
     generator_machine = api.GradientMachine.createFromConfigProto(
         generator_conf.model_config)
-    
-    dis_trainer = api.Trainer.create(
-        dis_conf, dis_training_machine)
 
-    gen_trainer = api.Trainer.create(
-        gen_conf, gen_training_machine)
-    
+    dis_trainer = api.Trainer.create(dis_conf, dis_training_machine)
+
+    gen_trainer = api.Trainer.create(gen_conf, gen_training_machine)
+
     dis_trainer.startTrain()
     gen_trainer.startTrain()
-    
+
     # Sync parameters between networks (GradientMachine) at the beginning
     copy_shared_parameters(gen_training_machine, dis_training_machine)
     copy_shared_parameters(gen_training_machine, generator_machine)
-    
+
     # constrain that either discriminator or generator can not be trained
     # consecutively more than MAX_strike times
     curr_train = "dis"
     curr_strike = 0
     MAX_strike = 5
-     
+
     for train_pass in xrange(100):
         dis_trainer.startTrainPass()
         gen_trainer.startTrainPass()
@@ -272,23 +285,25 @@ def main():
             noise = get_noise(batch_size, noise_dim)
             data_batch_dis_pos = prepare_discriminator_data_batch_pos(
                 batch_size, data_np)
-            dis_loss_pos = get_training_loss(dis_training_machine, data_batch_dis_pos)
-            
+            dis_loss_pos = get_training_loss(dis_training_machine,
+                                             data_batch_dis_pos)
+
             data_batch_dis_neg = prepare_discriminator_data_batch_neg(
                 generator_machine, batch_size, noise)
-            dis_loss_neg = get_training_loss(dis_training_machine, data_batch_dis_neg)            
-                         
+            dis_loss_neg = get_training_loss(dis_training_machine,
+                                             data_batch_dis_neg)
+
             dis_loss = (dis_loss_pos + dis_loss_neg) / 2.0
-            
+
             # Do forward pass in generator to get the gen_loss
-            data_batch_gen = prepare_generator_data_batch(
-                    batch_size, noise)
+            data_batch_gen = prepare_generator_data_batch(batch_size, noise)
             gen_loss = get_training_loss(gen_training_machine, data_batch_gen)
-             
+
             if i % 100 == 0:
-                print "d_pos_loss is %s     d_neg_loss is %s" % (dis_loss_pos, dis_loss_neg) 
+                print "d_pos_loss is %s     d_neg_loss is %s" % (dis_loss_pos,
+                                                                 dis_loss_neg)
                 print "d_loss is %s    g_loss is %s" % (dis_loss, gen_loss)
-            
+
             # Decide which network to train based on the training history
             # And the relative size of the loss        
             if (not (curr_train == "dis" and curr_strike == MAX_strike)) and \
@@ -297,11 +312,12 @@ def main():
                     curr_strike += 1
                 else:
                     curr_train = "dis"
-                    curr_strike = 1                
+                    curr_strike = 1
                 dis_trainer.trainOneDataBatch(batch_size, data_batch_dis_neg)
-                dis_trainer.trainOneDataBatch(batch_size, data_batch_dis_pos)               
-                copy_shared_parameters(dis_training_machine, gen_training_machine)
- 
+                dis_trainer.trainOneDataBatch(batch_size, data_batch_dis_pos)
+                copy_shared_parameters(dis_training_machine,
+                                       gen_training_machine)
+
             else:
                 if curr_train == "gen":
                     curr_strike += 1
@@ -311,19 +327,23 @@ def main():
                 gen_trainer.trainOneDataBatch(batch_size, data_batch_gen)
                 # TODO: add API for paddle to allow true parameter sharing between different GradientMachines 
                 # so that we do not need to copy shared parameters. 
-                copy_shared_parameters(gen_training_machine, dis_training_machine)
+                copy_shared_parameters(gen_training_machine,
+                                       dis_training_machine)
                 copy_shared_parameters(gen_training_machine, generator_machine)
- 
+
         dis_trainer.finishTrainPass()
         gen_trainer.finishTrainPass()
         # At the end of each pass, save the generated samples/images
         fake_samples = get_fake_samples(generator_machine, batch_size, noise)
         if data_source == "uniform":
-            plot2DScatter(fake_samples, "./%s_samples/train_pass%s.png" % (data_source, train_pass))
+            plot2DScatter(fake_samples, "./%s_samples/train_pass%s.png" %
+                          (data_source, train_pass))
         else:
-            save_images(fake_samples, "./%s_samples/train_pass%s.png" % (data_source, train_pass))
+            save_images(fake_samples, "./%s_samples/train_pass%s.png" %
+                        (data_source, train_pass))
     dis_trainer.finishTrain()
     gen_trainer.finishTrain()
 
+
 if __name__ == '__main__':
     main()
diff --git a/demo/image_classification/data/download_cifar.sh b/demo/image_classification/data/download_cifar.sh
index 52e82d0d9812c88e5c85cffc0585e3425b862809..532178d627fe19ab8ea79ecae73e5328b5294bea 100755
--- a/demo/image_classification/data/download_cifar.sh
+++ b/demo/image_classification/data/download_cifar.sh
@@ -1,3 +1,4 @@
+#!/bin/bash
 # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/demo/image_classification/image_provider.py b/demo/image_classification/image_provider.py
index 87eed5eebd7680e578c822083efb8b9eab16b266..6a315ff094c1af5f8250d8a22ff96740dddd9808 100644
--- a/demo/image_classification/image_provider.py
+++ b/demo/image_classification/image_provider.py
@@ -21,7 +21,7 @@ from paddle.trainer.PyDataProvider2 import *
 
 #
 # {'img_size': 32,
-# 'settings': <paddle.trainer.PyDataProviderWrapper.Cls instance at 0x7fea27cb6050>,
+# 'settings': a global object,
 # 'color': True,
 # 'mean_img_size': 32,
 # 'meta': './data/cifar-out/batches/batches.meta',
@@ -50,10 +50,10 @@ def hook(settings, img_size, mean_img_size, num_classes, color, meta, use_jpeg,
 
     settings.logger.info('Image size: %s', settings.img_size)
     settings.logger.info('Meta path: %s', settings.meta_path)
-    settings.input_types = [
-        dense_vector(settings.img_raw_size),  # image feature
-        integer_value(settings.num_classes)
-    ]  # labels
+    settings.input_types = {
+        'image': dense_vector(settings.img_raw_size),
+        'label': integer_value(settings.num_classes)
+    }
 
     settings.logger.info('DataProvider Initialization finished')
 
@@ -83,4 +83,7 @@ def processData(settings, file_list):
                         img, settings.img_mean, settings.img_size,
                         settings.is_train, settings.color)
                     label = data['labels'][i]
-                    yield img_feat.astype('float32'), int(label)
+                    yield {
+                        'image': img_feat.astype('float32'),
+                        'label': int(label)
+                    }
diff --git a/demo/introduction/.gitignore b/demo/introduction/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..c54f3f9480ce4ceefda98f77a812ec2d6cd4a5e3
--- /dev/null
+++ b/demo/introduction/.gitignore
@@ -0,0 +1,5 @@
+dataprovider.pyc
+empty.list
+train.log
+output
+train.list
diff --git a/demo/introduction/dataprovider.py b/demo/introduction/dataprovider.py
index 03c920cc34b397643e97ad41cf06458245c7ca7b..5b48aad0408800676ae7da16eba2dcbb2124f25f 100644
--- a/demo/introduction/dataprovider.py
+++ b/demo/introduction/dataprovider.py
@@ -17,8 +17,10 @@ import random
 
 
 # define data types of input: 2 real numbers
-@provider(input_types=[dense_vector(1), dense_vector(1)], use_seq=False)
+@provider(
+    input_types={'x': dense_vector(1),
+                 'y': dense_vector(1)}, use_seq=False)
 def process(settings, input_file):
     for i in xrange(2000):
         x = random.random()
-        yield [x], [2 * x + 0.3]
+        yield {'x': [x], 'y': [2 * x + 0.3]}
diff --git a/demo/introduction/trainer_config.py b/demo/introduction/trainer_config.py
index 41cebcf6e146e55efb89c2ceea429fa003ff206e..ecafe955f9e5c1062168d5d7b6b4c639d6e72a99 100644
--- a/demo/introduction/trainer_config.py
+++ b/demo/introduction/trainer_config.py
@@ -15,11 +15,8 @@
 from paddle.trainer_config_helpers import *
 
 # 1. read data. Suppose you saved above python code as dataprovider.py
-data_file = 'empty.list'
-with open(data_file, 'w') as f:
-    f.writelines(' ')
 define_py_data_sources2(
-    train_list=data_file,
+    train_list=['no_matter.txt'],
     test_list=None,
     module='dataprovider',
     obj='process',
diff --git a/demo/mnist/mnist_provider.py b/demo/mnist/mnist_provider.py
index 6df4676da3bdc2e6949cc911fa3720cb51ddc568..4635833d36b9f21c992d96910f3ac9094ccefd2c 100644
--- a/demo/mnist/mnist_provider.py
+++ b/demo/mnist/mnist_provider.py
@@ -1,10 +1,12 @@
 from paddle.trainer.PyDataProvider2 import *
+import numpy
 
 
 # Define a py data provider
 @provider(
     input_types={'pixel': dense_vector(28 * 28),
-                 'label': integer_value(10)})
+                 'label': integer_value(10)},
+    cache=CacheType.CACHE_PASS_IN_MEM)
 def process(settings, filename):  # settings is not used currently.
     imgf = filename + "-images-idx3-ubyte"
     labelf = filename + "-labels-idx1-ubyte"
@@ -20,12 +22,13 @@ def process(settings, filename):  # settings is not used currently.
     else:
         n = 10000
 
-    for i in range(n):
-        label = ord(l.read(1))
-        pixels = []
-        for j in range(28 * 28):
-            pixels.append(float(ord(f.read(1))) / 255.0)
-        yield {"pixel": pixels, 'label': label}
+    images = numpy.fromfile(
+        f, 'ubyte', count=n * 28 * 28).reshape((n, 28 * 28)).astype('float32')
+    images = images / 255.0 * 2.0 - 1.0
+    labels = numpy.fromfile(l, 'ubyte', count=n).astype("int")
+
+    for i in xrange(n):
+        yield {"pixel": images[i, :], 'label': labels[i]}
 
     f.close()
     l.close()
diff --git a/demo/quick_start/.gitignore b/demo/quick_start/.gitignore
index d6bc73105b1abfdae3067b7fecd656079a56b57c..f71662563ff96d6227dd568d9951a90b0d09456e 100644
--- a/demo/quick_start/.gitignore
+++ b/demo/quick_start/.gitignore
@@ -8,6 +8,8 @@ data/test.list
 data/test.txt
 data/train.list
 data/train.txt
+data/pred.list
+data/pred.txt
 dataprovider_copy_1.py
 train.log
 output
diff --git a/demo/quick_start/api_predict.py b/demo/quick_start/api_predict.py
new file mode 100755
index 0000000000000000000000000000000000000000..9bdffe1006281c58a595e2771561ba62e4c2d6bd
--- /dev/null
+++ b/demo/quick_start/api_predict.py
@@ -0,0 +1,147 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os, sys
+import numpy as np
+from optparse import OptionParser
+from py_paddle import swig_paddle, DataProviderConverter
+from paddle.trainer.PyDataProvider2 import sparse_binary_vector
+from paddle.trainer.config_parser import parse_config
+"""
+Usage: run following command to show help message.
+  python api_predict.py -h
+"""
+
+
+class QuickStartPrediction():
+    def __init__(self, train_conf, dict_file, model_dir=None, label_file=None):
+        """
+        train_conf: trainer configure.
+        dict_file: word dictionary file name.
+        model_dir: directory of model.
+        """
+        self.train_conf = train_conf
+        self.dict_file = dict_file
+        self.word_dict = {}
+        self.dict_dim = self.load_dict()
+        self.model_dir = model_dir
+        if model_dir is None:
+            self.model_dir = os.path.dirname(train_conf)
+
+        self.label = None
+        if label_file is not None:
+            self.load_label(label_file)
+
+        conf = parse_config(train_conf, "is_predict=1")
+        self.network = swig_paddle.GradientMachine.createFromConfigProto(
+            conf.model_config)
+        self.network.loadParameters(self.model_dir)
+        input_types = [sparse_binary_vector(self.dict_dim)]
+        self.converter = DataProviderConverter(input_types)
+
+    def load_dict(self):
+        """
+        Load dictionary from self.dict_file.
+        """
+        for line_count, line in enumerate(open(self.dict_file, 'r')):
+            self.word_dict[line.strip().split('\t')[0]] = line_count
+        return len(self.word_dict)
+
+    def load_label(self, label_file):
+        """
+        Load label.
+        """
+        self.label = {}
+        for v in open(label_file, 'r'):
+            self.label[int(v.split('\t')[1])] = v.split('\t')[0]
+
+    def get_index(self, data):
+        """
+        transform word into integer index according to the dictionary.
+        """
+        words = data.strip().split()
+        word_slot = [self.word_dict[w] for w in words if w in self.word_dict]
+        return word_slot
+
+    def batch_predict(self, data_batch):
+        input = self.converter(data_batch)
+        output = self.network.forwardTest(input)
+        prob = output[0]["id"].tolist()
+        print("predicting labels is:")
+        print prob
+
+
+def option_parser():
+    usage = "python predict.py -n config -w model_dir -d dictionary -i input_file "
+    parser = OptionParser(usage="usage: %s [options]" % usage)
+    parser.add_option(
+        "-n",
+        "--tconf",
+        action="store",
+        dest="train_conf",
+        help="network config")
+    parser.add_option(
+        "-d",
+        "--dict",
+        action="store",
+        dest="dict_file",
+        help="dictionary file")
+    parser.add_option(
+        "-b",
+        "--label",
+        action="store",
+        dest="label",
+        default=None,
+        help="dictionary file")
+    parser.add_option(
+        "-c",
+        "--batch_size",
+        type="int",
+        action="store",
+        dest="batch_size",
+        default=1,
+        help="the batch size for prediction")
+    parser.add_option(
+        "-w",
+        "--model",
+        action="store",
+        dest="model_path",
+        default=None,
+        help="model path")
+    return parser.parse_args()
+
+
+def main():
+    options, args = option_parser()
+    train_conf = options.train_conf
+    batch_size = options.batch_size
+    dict_file = options.dict_file
+    model_path = options.model_path
+    label = options.label
+    swig_paddle.initPaddle("--use_gpu=0")
+    predict = QuickStartPrediction(train_conf, dict_file, model_path, label)
+
+    batch = []
+    labels = []
+    for line in sys.stdin:
+        [label, text] = line.split("\t")
+        labels.append(int(label))
+        batch.append([predict.get_index(text)])
+    print("labels is:")
+    print labels
+    predict.batch_predict(batch)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/demo/quick_start/api_predict.sh b/demo/quick_start/api_predict.sh
new file mode 100755
index 0000000000000000000000000000000000000000..c90d3b70548b3ef2a7e0e423c74cd97f1886c0fc
--- /dev/null
+++ b/demo/quick_start/api_predict.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+set -e
+
+#Note the default model is pass-00002, you shold make sure the model path
+#exists or change the mode path.
+#only test on trainer_config.lr.py
+model=output/pass-00001/
+config=trainer_config.lr.py
+label=data/labels.list
+dict=data/dict.txt
+batch_size=20
+head -n$batch_size data/test.txt | python api_predict.py \
+     --tconf=$config\
+     --model=$model \
+     --label=$label \
+     --dict=$dict \
+     --batch_size=$batch_size
diff --git a/demo/quick_start/dataprovider_bow.py b/demo/quick_start/dataprovider_bow.py
index 8e651d77bf3fd3bbd990ef314456ec14bd77cfeb..2745495586449b5d1eb64ae570f73eb6b14dbdfe 100644
--- a/demo/quick_start/dataprovider_bow.py
+++ b/demo/quick_start/dataprovider_bow.py
@@ -31,16 +31,16 @@ def initializer(settings, dictionary, **kwargs):
 
     # setting.input_types specifies what the data types the data provider
     # generates.
-    settings.input_types = [
+    settings.input_types = {
         # The first input is a sparse_binary_vector,
         # which means each dimension of the vector is either 0 or 1. It is the
         # bag-of-words (BOW) representation of the texts.
-        sparse_binary_vector(len(dictionary)),
+        'word': sparse_binary_vector(len(dictionary)),
         # The second input is an integer. It represents the category id of the
         # sample. 2 means there are two labels in the dataset.
         # (1 for positive and 0 for negative)
-        integer_value(2)
-    ]
+        'label': integer_value(2)
+    }
 
 
 # Delaring a data provider. It has an initializer 'data_initialzer'.
@@ -67,12 +67,12 @@ def process(settings, file_name):
             # Return the features for the current comment. The first is a list
             # of ids representing a 0-1 binary sparse vector of the text,
             # the second is the integer id of the label.
-            yield word_vector, int(label)
+            yield {'word': word_vector, 'label': int(label)}
 
 
 def predict_initializer(settings, dictionary, **kwargs):
     settings.word_dict = dictionary
-    settings.input_types = [sparse_binary_vector(len(dictionary))]
+    settings.input_types = {'word': sparse_binary_vector(len(dictionary))}
 
 
 # Declaring a data provider for prediction. The difference with process
@@ -83,4 +83,4 @@ def process_predict(settings, file_name):
         for line in f:
             comment = line.strip().split()
             word_vector = [settings.word_dict.get(w, UNK_IDX) for w in comment]
-            yield word_vector
+            yield {'word': word_vector}
diff --git a/demo/quick_start/dataprovider_emb.py b/demo/quick_start/dataprovider_emb.py
index b010253a8a764ede4ff0416231ac6aa2fd8f94e3..ddfa3ce9b73555cb3b7f5a44314ca35b12d41ede 100755
--- a/demo/quick_start/dataprovider_emb.py
+++ b/demo/quick_start/dataprovider_emb.py
@@ -19,13 +19,13 @@ UNK_IDX = 0
 
 def initializer(settings, dictionary, **kwargs):
     settings.word_dict = dictionary
-    settings.input_types = [
+    settings.input_types = {
         # Define the type of the first input as sequence of integer.
         # The value of the integers range from 0 to len(dictrionary)-1
-        integer_value_sequence(len(dictionary)),
+        'word': integer_value_sequence(len(dictionary)),
         # Define the second input for label id
-        integer_value(2)
-    ]
+        'label': integer_value(2)
+    }
 
 
 @provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
@@ -35,15 +35,12 @@ def process(settings, file_name):
             label, comment = line.strip().split('\t')
             words = comment.split()
             word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
-            yield word_slot, int(label)
+            yield {'word': word_slot, 'label': int(label)}
 
 
 def predict_initializer(settings, dictionary, **kwargs):
     settings.word_dict = dictionary
-    settings.input_types = [
-        integer_value(
-            len(dictionary), seq_type=SequenceType.SEQUENCE)
-    ]
+    settings.input_types = {'word': integer_value_sequence(len(dictionary))}
 
 
 @provider(init_hook=predict_initializer, should_shuffle=False)
@@ -52,4 +49,4 @@ def process_predict(settings, file_name):
         for line in f:
             comment = line.strip().split()
             word_slot = [settings.word_dict.get(w, UNK_IDX) for w in comment]
-            yield word_slot
+            yield {'word': word_slot}
diff --git a/demo/quick_start/trainer_config.resnet-lstm.py b/demo/quick_start/trainer_config.resnet-lstm.py
index 5bed925d84a0a6d94da446e1a8c64061ad54ae55..89a837abb7cdeaaa249160123e1f2001d23d7aa1 100644
--- a/demo/quick_start/trainer_config.resnet-lstm.py
+++ b/demo/quick_start/trainer_config.resnet-lstm.py
@@ -13,7 +13,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
 """
 This configuration is a demonstration of how to implement the stacked LSTM
 with residual connections, i.e. an LSTM layer takes the sum of the hidden states
@@ -46,11 +45,12 @@ is_predict = get_config_arg('is_predict', bool, False)
 trn = 'data/train.list' if not is_predict else None
 tst = 'data/test.list' if not is_predict else 'data/pred.list'
 process = 'process' if not is_predict else 'process_predict'
-define_py_data_sources2(train_list=trn,
-                        test_list=tst,
-                        module="dataprovider_emb",
-                        obj=process,
-                        args={"dictionary": word_dict})
+define_py_data_sources2(
+    train_list=trn,
+    test_list=tst,
+    module="dataprovider_emb",
+    obj=process,
+    args={"dictionary": word_dict})
 
 batch_size = 128 if not is_predict else 1
 settings(
@@ -58,10 +58,9 @@ settings(
     learning_rate=2e-3,
     learning_method=AdamOptimizer(),
     regularization=L2Regularization(8e-4),
-    gradient_clipping_threshold=25
-)
+    gradient_clipping_threshold=25)
 
-bias_attr = ParamAttr(initial_std=0.,l2_rate=0.)
+bias_attr = ParamAttr(initial_std=0., l2_rate=0.)
 
 data = data_layer(name="word", size=len(word_dict))
 emb = embedding_layer(input=data, size=128)
@@ -73,17 +72,15 @@ for i in range(3):
     # The input to the current layer is the sum of the hidden state
     # and input of the previous layer.
     current_input = addto_layer(input=[previous_input, previous_hidden_state])
-    hidden_state = simple_lstm(input=current_input, size=128,
-                               lstm_cell_attr=ExtraAttr(drop_rate=0.1))
+    hidden_state = simple_lstm(
+        input=current_input, size=128, lstm_cell_attr=ExtraAttr(drop_rate=0.1))
     previous_input, previous_hidden_state = current_input, hidden_state
 
 lstm = previous_hidden_state
 
 lstm_last = pooling_layer(input=lstm, pooling_type=MaxPooling())
-output = fc_layer(input=lstm_last, size=2,
-                  bias_attr=bias_attr,
-                  act=SoftmaxActivation())
-
+output = fc_layer(
+    input=lstm_last, size=2, bias_attr=bias_attr, act=SoftmaxActivation())
 
 if is_predict:
     maxid = maxid_layer(output)
diff --git a/demo/recommendation/common_utils.py b/demo/recommendation/common_utils.py
index d4fbdad1d7ac53b35d9478c65ab61c2d28845261..c20c65286621d701ad58409b539bbe9c813d453a 100755
--- a/demo/recommendation/common_utils.py
+++ b/demo/recommendation/common_utils.py
@@ -17,13 +17,14 @@ from paddle.trainer.PyDataProvider2 import *
 def meta_to_header(meta, name):
     metas = meta[name]['__meta__']['raw_meta']
     for each_meta in metas:
+        slot_name = each_meta.get('name', '%s_id' % name)
         if each_meta['type'] == 'id':
-            yield integer_value(each_meta['max'])
+            yield slot_name, integer_value(each_meta['max'])
         elif each_meta['type'] == 'embedding':
             is_seq = each_meta['seq'] == 'sequence'
-            yield integer_value(
+            yield slot_name, integer_value(
                 len(each_meta['dict']),
                 seq_type=SequenceType.SEQUENCE
                 if is_seq else SequenceType.NO_SEQUENCE)
         elif each_meta['type'] == 'one_hot_dense':
-            yield dense_vector(len(each_meta['dict']))
+            yield slot_name, dense_vector(len(each_meta['dict']))
diff --git a/demo/recommendation/dataprovider.py b/demo/recommendation/dataprovider.py
index 80c62d75612e544c5197f878a83284f8e08d1a99..c4ff96d80e81926049c9a71d6d9d991c0b568c25 100755
--- a/demo/recommendation/dataprovider.py
+++ b/demo/recommendation/dataprovider.py
@@ -16,6 +16,14 @@ from paddle.trainer.PyDataProvider2 import *
 import common_utils  # parse
 
 
+def __list_to_map__(lst):
+    ret_val = dict()
+    for each in lst:
+        k, v = each
+        ret_val[k] = v
+    return ret_val
+
+
 def hook(settings, meta, **kwargs):
     """
     Init hook is invoked before process data. It will set obj.slots and store
@@ -34,12 +42,16 @@ def hook(settings, meta, **kwargs):
     #    second part is user features.
     #    final part is rating score.
     # header is a list of [USE_SEQ_OR_NOT?, SlotType]
-    headers = list(common_utils.meta_to_header(meta, 'movie'))
-    headers.extend(list(common_utils.meta_to_header(meta, 'user')))
-    headers.append(dense_vector(1))  # Score
+    movie_headers = list(common_utils.meta_to_header(meta, 'movie'))
+    settings.movie_names = [h[0] for h in movie_headers]
+    headers = movie_headers
+    user_headers = list(common_utils.meta_to_header(meta, 'user'))
+    settings.user_names = [h[0] for h in user_headers]
+    headers.extend(user_headers)
+    headers.append(("rating", dense_vector(1)))  # Score
 
     # slot types.
-    settings.input_types = headers
+    settings.input_types = __list_to_map__(headers)
     settings.meta = meta
 
 
@@ -57,20 +69,20 @@ def process(settings, filename):
             movie_meta = settings.meta['movie'][movie_id]
             user_meta = settings.meta['user'][user_id]
 
-            outputs = [movie_id - 1]
+            outputs = [('movie_id', movie_id - 1)]
 
             # Then add movie features
-            for each_meta in movie_meta:
-                outputs.append(each_meta)
+            for i, each_meta in enumerate(movie_meta):
+                outputs.append((settings.movie_names[i + 1], each_meta))
 
             # Then add user id.
-            outputs.append(user_id - 1)
+            outputs.append(('user_id', user_id - 1))
 
             # Then add user features.
-            for each_meta in user_meta:
-                outputs.append(each_meta)
+            for i, each_meta in enumerate(user_meta):
+                outputs.append((settings.user_names[i + 1], each_meta))
 
             # Finally, add score
-            outputs.append([score])
+            outputs.append(('rating', [score]))
             # Return data to paddle
-            yield outputs
+            yield __list_to_map__(outputs)
diff --git a/demo/recommendation/prediction.py b/demo/recommendation/prediction.py
index 191120188ef5dbddf4c42a1356a9fa46e16c5ca1..8ad993eab3a9f637cfff752bfedbbc62eaf3c8d5 100755
--- a/demo/recommendation/prediction.py
+++ b/demo/recommendation/prediction.py
@@ -34,8 +34,8 @@ if __name__ == '__main__':
     network.loadParameters(model_path)
     with open('./data/meta.bin', 'rb') as f:
         meta = pickle.load(f)
-        headers = list(meta_to_header(meta, 'movie'))
-        headers.extend(list(meta_to_header(meta, 'user')))
+        headers = [h[1] for h in meta_to_header(meta, 'movie')]
+        headers.extend([h[1] for h in meta_to_header(meta, 'user')])
         cvt = DataProviderConverter(headers)
         while True:
             movie_id = int(raw_input("Input movie_id: "))
diff --git a/demo/recommendation/preprocess.sh b/demo/recommendation/preprocess.sh
index e121e470193fa1e73c000fe612d6858e28f9261f..dc6b2cdfc13273026bb9b2b36677ef8cac454d3a 100755
--- a/demo/recommendation/preprocess.sh
+++ b/demo/recommendation/preprocess.sh
@@ -25,7 +25,7 @@ python meta_generator.py $dir meta.bin --config=meta_config.json
 echo 'split train/test file'
 python split.py $dir/ratings.dat --delimiter=${delimiter} --test_ratio=0.1
 echo 'shuffle train file'
-shuf $dir/ratings.dat.train > ratings.dat.train
+gshuf $dir/ratings.dat.train > ratings.dat.train
 cp $dir/ratings.dat.test .
 echo "./data/ratings.dat.train" > train.list
 echo "./data/ratings.dat.test" > test.list
diff --git a/demo/semantic_role_labeling/.gitignore b/demo/semantic_role_labeling/.gitignore
index cd90ca7bbe9be46f54cb656a8067c794a55d8cfc..65c9b674c7d1dad53b7d1c6ee1dcbdb72553888d 100644
--- a/demo/semantic_role_labeling/.gitignore
+++ b/demo/semantic_role_labeling/.gitignore
@@ -8,3 +8,7 @@ data/test.wsj.seq_pair
 data/test.wsj.words
 data/tgt.dict
 output
+data/emb
+data/targetDict.txt
+data/verbDict.txt
+data/wordDict.txt
diff --git a/demo/semantic_role_labeling/data/extract_dict_feature.py b/demo/semantic_role_labeling/data/extract_dict_feature.py
index 123df022f508cad1d4557b845619dd18761f357e..da44111976a0dec68345fc139d0aa459ca9211c2 100644
--- a/demo/semantic_role_labeling/data/extract_dict_feature.py
+++ b/demo/semantic_role_labeling/data/extract_dict_feature.py
@@ -33,7 +33,7 @@ def extract_dict_features(pair_file, feature_file):
                 ctx_n1 = sentence_list[verb_index - 1]
             else:
                 ctx_n1 = 'bos'
-            
+
             if verb_index > 1:
                 mark[verb_index - 2] = 1
                 ctx_n2 = sentence_list[verb_index - 2]
@@ -43,13 +43,13 @@ def extract_dict_features(pair_file, feature_file):
             mark[verb_index] = 1
             ctx_0 = sentence_list[verb_index]
 
-            if verb_index < len(labels_list) - 2:
+            if verb_index < len(labels_list) - 1:
                 mark[verb_index + 1] = 1
                 ctx_p1 = sentence_list[verb_index + 1]
             else:
                 ctx_p1 = 'eos'
-            
-            if verb_index < len(labels_list) - 3:
+
+            if verb_index < len(labels_list) - 2:
                 mark[verb_index + 2] = 1
                 ctx_p2 = sentence_list[verb_index + 2]
             else:
@@ -69,7 +69,6 @@ def extract_dict_features(pair_file, feature_file):
             feature_out.write(feature_str + '\n')
 
 
-
 if __name__ == '__main__':
 
     usage = '-p pair_file -f feature_file'
diff --git a/demo/semantic_role_labeling/data/extract_pairs.py b/demo/semantic_role_labeling/data/extract_pairs.py
index 2d0d535c53a74a9fbf9ea2521930333b7f89581b..94a8488c16734eb1882d54f7ec36f4b9308c09d4 100644
--- a/demo/semantic_role_labeling/data/extract_pairs.py
+++ b/demo/semantic_role_labeling/data/extract_pairs.py
@@ -66,8 +66,8 @@ def transform_labels(sentences, labels):
         else:
             verb_list = []
             for x in labels[i][0]:
-                if x !='-':
-                   verb_list.append(x)
+                if x != '-':
+                    verb_list.append(x)
 
             for j in xrange(1, len(labels[i])):
                 label_list = labels[i][j]
@@ -93,7 +93,7 @@ def transform_labels(sentences, labels):
                         is_in_bracket = True
                     else:
                         print 'error:', ll
-                sen_lab_pair.append((sentences[i], verb_list[j-1], label_seq))
+                sen_lab_pair.append((sentences[i], verb_list[j - 1], label_seq))
     return sen_lab_pair
 
 
@@ -103,7 +103,7 @@ def write_file(sen_lab_pair, output_file):
             sentence = x[0]
             label_seq = ' '.join(x[2])
             assert len(sentence.split()) == len(x[2])
-            fout.write(sentence + '\t' + x[1]+'\t' +label_seq + '\n')
+            fout.write(sentence + '\t' + x[1] + '\t' + label_seq + '\n')
 
 
 if __name__ == '__main__':
diff --git a/demo/semantic_role_labeling/data/get_data.sh b/demo/semantic_role_labeling/data/get_data.sh
old mode 100644
new mode 100755
diff --git a/demo/semantic_role_labeling/dataprovider.py b/demo/semantic_role_labeling/dataprovider.py
index d12f10bfcb65e25972035d863997bb9d26ba86eb..042cd4e7a9e256cd597ac34eed423040f1d7ccd5 100644
--- a/demo/semantic_role_labeling/dataprovider.py
+++ b/demo/semantic_role_labeling/dataprovider.py
@@ -21,7 +21,7 @@ def hook(settings, word_dict, label_dict, predicate_dict, **kwargs):
     settings.word_dict = word_dict
     settings.label_dict = label_dict
     settings.predicate_dict = predicate_dict
-   
+
     #all inputs are integral and sequential type
     settings.slots = [
         integer_value_sequence(len(word_dict)),
@@ -29,25 +29,28 @@ def hook(settings, word_dict, label_dict, predicate_dict, **kwargs):
         integer_value_sequence(len(word_dict)),
         integer_value_sequence(len(word_dict)),
         integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)), 
-        integer_value_sequence(len(predicate_dict)),
-        integer_value_sequence(2),
+        integer_value_sequence(len(word_dict)),
+        integer_value_sequence(len(predicate_dict)), integer_value_sequence(2),
         integer_value_sequence(len(label_dict))
     ]
 
 
 def get_batch_size(yeild_data):
     return len(yeild_data[0])
-    
 
-@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size, 
-          can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM)
+
+@provider(
+    init_hook=hook,
+    should_shuffle=True,
+    calc_batch_size=get_batch_size,
+    can_over_batch_size=False,
+    cache=CacheType.CACHE_PASS_IN_MEM)
 def process(settings, file_name):
     with open(file_name, 'r') as fdata:
         for line in fdata:
             sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2,  mark, label = \
                 line.strip().split('\t')
-           
+
             words = sentence.split()
             sen_len = len(words)
             word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
diff --git a/demo/semantic_role_labeling/db_lstm.py b/demo/semantic_role_labeling/db_lstm.py
index 75946bd72e04341c189f6e88fdde98e03f4a8bfb..04e2a559b19bd4b9aec0242eb43edf6ab1e7624e 100644
--- a/demo/semantic_role_labeling/db_lstm.py
+++ b/demo/semantic_role_labeling/db_lstm.py
@@ -20,7 +20,7 @@ from paddle.trainer_config_helpers import *
 #file paths
 word_dict_file = './data/wordDict.txt'
 label_dict_file = './data/targetDict.txt'
-predicate_file= './data/verbDict.txt'
+predicate_file = './data/verbDict.txt'
 train_list_file = './data/train.list'
 test_list_file = './data/test.list'
 
@@ -47,7 +47,6 @@ if not is_predict:
             w = line.strip()
             predicate_dict[w] = i
 
-
     if is_test:
         train_list_file = None
 
@@ -57,9 +56,11 @@ if not is_predict:
         test_list=test_list_file,
         module='dataprovider',
         obj='process',
-        args={'word_dict': word_dict,
-              'label_dict': label_dict,
-              'predicate_dict': predicate_dict })
+        args={
+            'word_dict': word_dict,
+            'label_dict': label_dict,
+            'predicate_dict': predicate_dict
+        })
 
     word_dict_len = len(word_dict)
     label_dict_len = len(label_dict)
@@ -77,24 +78,16 @@ mark_dim = 5
 hidden_dim = 512
 depth = 8
 
-
-
 ########################### Optimizer #######################################
 
-
 settings(
     batch_size=150,
     learning_method=MomentumOptimizer(momentum=0),
     learning_rate=2e-2,
     regularization=L2Regularization(8e-4),
     is_async=False,
-    model_average=ModelAverage(average_window=0.5,
-                               max_average_window=10000),
-                               
-)
-
-
-
+    model_average=ModelAverage(
+        average_window=0.5, max_average_window=10000), )
 
 ####################################### network ##############################
 #8 features and 1 target
@@ -108,22 +101,28 @@ ctx_p1 = data_layer(name='ctx_p1_data', size=word_dict_len)
 ctx_p2 = data_layer(name='ctx_p2_data', size=word_dict_len)
 mark = data_layer(name='mark_data', size=mark_dict_len)
 
-
 if not is_predict:
     target = data_layer(name='target', size=label_dict_len)
 
-
-default_std=1/math.sqrt(hidden_dim)/3.0
+default_std = 1 / math.sqrt(hidden_dim) / 3.0
 
 emb_para = ParameterAttribute(name='emb', initial_std=0., learning_rate=0.)
 std_0 = ParameterAttribute(initial_std=0.)
-std_default = ParameterAttribute(initial_std=default_std) 
-
-predicate_embedding = embedding_layer(size=word_dim, input=predicate, param_attr=ParameterAttribute(name='vemb',initial_std=default_std))
-mark_embedding = embedding_layer(name='word_ctx-in_embedding', size=mark_dim, input=mark, param_attr=std_0)
-
-word_input=[word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
-emb_layers = [embedding_layer(size=word_dim, input=x, param_attr=emb_para) for x in word_input]
+std_default = ParameterAttribute(initial_std=default_std)
+
+predicate_embedding = embedding_layer(
+    size=word_dim,
+    input=predicate,
+    param_attr=ParameterAttribute(
+        name='vemb', initial_std=default_std))
+mark_embedding = embedding_layer(
+    name='word_ctx-in_embedding', size=mark_dim, input=mark, param_attr=std_0)
+
+word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
+emb_layers = [
+    embedding_layer(
+        size=word_dim, input=x, param_attr=emb_para) for x in word_input
+]
 emb_layers.append(predicate_embedding)
 emb_layers.append(mark_embedding)
 
@@ -131,84 +130,89 @@ hidden_0 = mixed_layer(
     name='hidden0',
     size=hidden_dim,
     bias_attr=std_default,
-    input=[ full_matrix_projection(input=emb, param_attr=std_default ) for emb in emb_layers ])
-
+    input=[
+        full_matrix_projection(
+            input=emb, param_attr=std_default) for emb in emb_layers
+    ])
 
 mix_hidden_lr = 1e-3
 lstm_para_attr = ParameterAttribute(initial_std=0.0, learning_rate=1.0)
-hidden_para_attr = ParameterAttribute(initial_std=default_std, learning_rate=mix_hidden_lr)
-
-lstm_0 = lstmemory(name='lstm0',
-                   input=hidden_0, 
-                   act=ReluActivation(),
-                   gate_act=SigmoidActivation(),
-                   state_act=SigmoidActivation(),
-                   bias_attr=std_0,
-                   param_attr=lstm_para_attr)
+hidden_para_attr = ParameterAttribute(
+    initial_std=default_std, learning_rate=mix_hidden_lr)
+
+lstm_0 = lstmemory(
+    name='lstm0',
+    input=hidden_0,
+    act=ReluActivation(),
+    gate_act=SigmoidActivation(),
+    state_act=SigmoidActivation(),
+    bias_attr=std_0,
+    param_attr=lstm_para_attr)
 
 #stack L-LSTM and R-LSTM with direct edges
 input_tmp = [hidden_0, lstm_0]
 
-
 for i in range(1, depth):
 
-    mix_hidden = mixed_layer(name='hidden'+str(i),
-                             size=hidden_dim, 
-                             bias_attr=std_default,
-                             input=[full_matrix_projection(input=input_tmp[0], param_attr=hidden_para_attr),
-                                    full_matrix_projection(input=input_tmp[1], param_attr=lstm_para_attr)
-                                   ]
-                             )
-
-    lstm = lstmemory(name='lstm'+str(i),
-                     input=mix_hidden,
-                     act=ReluActivation(),
-                     gate_act=SigmoidActivation(),
-                     state_act=SigmoidActivation(),
-                     reverse=((i % 2)==1),
-                     bias_attr=std_0,
-                     param_attr=lstm_para_attr)
+    mix_hidden = mixed_layer(
+        name='hidden' + str(i),
+        size=hidden_dim,
+        bias_attr=std_default,
+        input=[
+            full_matrix_projection(
+                input=input_tmp[0], param_attr=hidden_para_attr),
+            full_matrix_projection(
+                input=input_tmp[1], param_attr=lstm_para_attr)
+        ])
+
+    lstm = lstmemory(
+        name='lstm' + str(i),
+        input=mix_hidden,
+        act=ReluActivation(),
+        gate_act=SigmoidActivation(),
+        state_act=SigmoidActivation(),
+        reverse=((i % 2) == 1),
+        bias_attr=std_0,
+        param_attr=lstm_para_attr)
 
     input_tmp = [mix_hidden, lstm]
 
-feature_out = mixed_layer(name='output',
-                          size=label_dict_len,
-                          bias_attr=std_default, 
-                          input=[full_matrix_projection(input=input_tmp[0], param_attr=hidden_para_attr),
-                                 full_matrix_projection(input=input_tmp[1], param_attr=lstm_para_attr)
-                                ],
-                          )
-
-
+feature_out = mixed_layer(
+    name='output',
+    size=label_dict_len,
+    bias_attr=std_default,
+    input=[
+        full_matrix_projection(
+            input=input_tmp[0], param_attr=hidden_para_attr),
+        full_matrix_projection(
+            input=input_tmp[1], param_attr=lstm_para_attr)
+    ], )
 
 if not is_predict:
-    crf_l = crf_layer( name = 'crf',
-                       size = label_dict_len,
-                       input = feature_out, 
-                       label = target,
-                       param_attr=ParameterAttribute(name='crfw',initial_std=default_std, learning_rate=mix_hidden_lr)
-
-                      )
-
-    
-    crf_dec_l = crf_decoding_layer(name = 'crf_dec_l',
-                                   size = label_dict_len,
-                                   input = feature_out,
-                                   label = target,
-                                   param_attr=ParameterAttribute(name='crfw')
-                                       )
-
+    crf_l = crf_layer(
+        name='crf',
+        size=label_dict_len,
+        input=feature_out,
+        label=target,
+        param_attr=ParameterAttribute(
+            name='crfw', initial_std=default_std, learning_rate=mix_hidden_lr))
+
+    crf_dec_l = crf_decoding_layer(
+        name='crf_dec_l',
+        size=label_dict_len,
+        input=feature_out,
+        label=target,
+        param_attr=ParameterAttribute(name='crfw'))
 
     eval = sum_evaluator(input=crf_dec_l)
-        
+
     outputs(crf_l)
 
 else:
-    crf_dec_l = crf_decoding_layer(name = 'crf_dec_l',
-                                   size = label_dict_len,
-                                   input = feature_out,
-                                   param_attr=ParameterAttribute(name='crfw')
-                                       )
+    crf_dec_l = crf_decoding_layer(
+        name='crf_dec_l',
+        size=label_dict_len,
+        input=feature_out,
+        param_attr=ParameterAttribute(name='crfw'))
 
     outputs(crf_dec_l)
-
diff --git a/demo/semantic_role_labeling/predict.py b/demo/semantic_role_labeling/predict.py
index 15145fafceb2422ee201684e85ef5d1043a7bf7d..372fd090b6e8f08f5bb34697772c2e4976810595 100644
--- a/demo/semantic_role_labeling/predict.py
+++ b/demo/semantic_role_labeling/predict.py
@@ -26,7 +26,8 @@ UNK_IDX = 0
 
 
 class Prediction():
-    def __init__(self, train_conf, dict_file, model_dir, label_file, predicate_dict_file):
+    def __init__(self, train_conf, dict_file, model_dir, label_file,
+                 predicate_dict_file):
         """
         train_conf: trainer configure.
         dict_file: word dictionary file name.
@@ -35,7 +36,7 @@ class Prediction():
 
         self.dict = {}
         self.labels = {}
-        self.predicate_dict={}
+        self.predicate_dict = {}
         self.labels_reverse = {}
         self.load_dict_label(dict_file, label_file, predicate_dict_file)
 
@@ -44,25 +45,18 @@ class Prediction():
         len_pred = len(self.predicate_dict)
 
         conf = parse_config(
-            train_conf,
-            'dict_len=' + str(len_dict) + 
-            ',label_len=' + str(len_label) +
-            ',pred_len=' + str(len_pred) +
-            ',is_predict=True')
+            train_conf, 'dict_len=' + str(len_dict) + ',label_len=' +
+            str(len_label) + ',pred_len=' + str(len_pred) + ',is_predict=True')
         self.network = swig_paddle.GradientMachine.createFromConfigProto(
             conf.model_config)
         self.network.loadParameters(model_dir)
 
         slots = [
-            integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict),
-            integer_value_sequence(len_dict), 
-            integer_value_sequence(len_pred),
-            integer_value_sequence(2)
-            ]
+            integer_value_sequence(len_dict), integer_value_sequence(len_dict),
+            integer_value_sequence(len_dict), integer_value_sequence(len_dict),
+            integer_value_sequence(len_dict), integer_value_sequence(len_dict),
+            integer_value_sequence(len_pred), integer_value_sequence(2)
+        ]
         self.converter = DataProviderConverter(slots)
 
     def load_dict_label(self, dict_file, label_file, predicate_dict_file):
@@ -78,6 +72,7 @@ class Prediction():
 
         for line_count, line in enumerate(open(predicate_dict_file, 'r')):
             self.predicate_dict[line.strip()] = line_count
+
     def get_data(self, data_file):
         """
         Get input data of paddle format.
@@ -88,9 +83,10 @@ class Prediction():
                 ).split('\t')
                 words = sentence.split()
                 sen_len = len(words)
-                 
+
                 word_slot = [self.dict.get(w, UNK_IDX) for w in words]
-                predicate_slot = [self.predicate_dict.get(predicate, UNK_IDX)] * sen_len
+                predicate_slot = [self.predicate_dict.get(predicate, UNK_IDX)
+                                  ] * sen_len
                 ctx_n2_slot = [self.dict.get(ctx_n2, UNK_IDX)] * sen_len
                 ctx_n1_slot = [self.dict.get(ctx_n1, UNK_IDX)] * sen_len
                 ctx_0_slot = [self.dict.get(ctx_0, UNK_IDX)] * sen_len
@@ -99,7 +95,7 @@ class Prediction():
 
                 marks = mark.split()
                 mark_slot = [int(w) for w in marks]
-                
+
                 yield word_slot, ctx_n2_slot, ctx_n1_slot, \
                       ctx_0_slot, ctx_p1_slot, ctx_p2_slot, predicate_slot, mark_slot
 
@@ -123,8 +119,9 @@ class Prediction():
 
 
 def option_parser():
-    usage = ("python predict.py -c config -w model_dir " 
-             "-d word dictionary -l label_file -i input_file  -p pred_dict_file")
+    usage = (
+        "python predict.py -c config -w model_dir "
+        "-d word dictionary -l label_file -i input_file  -p pred_dict_file")
     parser = OptionParser(usage="usage: %s [options]" % usage)
     parser.add_option(
         "-c",
@@ -187,8 +184,9 @@ def main():
     output_file = options.output_file
 
     swig_paddle.initPaddle("--use_gpu=0")
-    predict = Prediction(train_conf, dict_file, model_path, label_file, predict_dict_file)
-    predict.predict(data_file,output_file)
+    predict = Prediction(train_conf, dict_file, model_path, label_file,
+                         predict_dict_file)
+    predict.predict(data_file, output_file)
 
 
 if __name__ == '__main__':
diff --git a/demo/sentiment/predict.py b/demo/sentiment/predict.py
index 00239c6009b8503cf445d9847abde92db12db2fe..8ec490f64691924013200a3d0038d39aa834b038 100755
--- a/demo/sentiment/predict.py
+++ b/demo/sentiment/predict.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
+import os, sys
 import numpy as np
 from optparse import OptionParser
 from py_paddle import swig_paddle, DataProviderConverter
@@ -66,34 +66,24 @@ class SentimentPrediction():
         for v in open(label_file, 'r'):
             self.label[int(v.split('\t')[1])] = v.split('\t')[0]
 
-    def get_data(self, data_file):
+    def get_index(self, data):
         """
-        Get input data of paddle format.
+        transform word into integer index according to the dictionary.
         """
-        with open(data_file, 'r') as fdata:
-            for line in fdata:
-                words = line.strip().split()
-                word_slot = [
-                    self.word_dict[w] for w in words if w in self.word_dict
-                ]
-                if not word_slot:
-                    print "all words are not in dictionary: %s", line
-                    continue
-                yield [word_slot]
-
-    def predict(self, data_file):
-        """
-        data_file: file name of input data.
-        """
-        input = self.converter(self.get_data(data_file))
+        words = data.strip().split()
+        word_slot = [self.word_dict[w] for w in words if w in self.word_dict]
+        return word_slot
+
+    def batch_predict(self, data_batch):
+        input = self.converter(data_batch)
         output = self.network.forwardTest(input)
         prob = output[0]["value"]
-        lab = np.argsort(-prob)
-        if self.label is None:
-            print("%s: predicting label is %d" % (data_file, lab[0][0]))
-        else:
-            print("%s: predicting label is %s" %
-                  (data_file, self.label[lab[0][0]]))
+        labs = np.argsort(-prob)
+        for idx, lab in enumerate(labs):
+            if self.label is None:
+                print("predicting label is %d" % (lab[0]))
+            else:
+                print("predicting label is %s" % (self.label[lab[0]]))
 
 
 def option_parser():
@@ -119,11 +109,13 @@ def option_parser():
         default=None,
         help="dictionary file")
     parser.add_option(
-        "-i",
-        "--data",
+        "-c",
+        "--batch_size",
+        type="int",
         action="store",
-        dest="data",
-        help="data file to predict")
+        dest="batch_size",
+        default=1,
+        help="the batch size for prediction")
     parser.add_option(
         "-w",
         "--model",
@@ -137,13 +129,21 @@ def option_parser():
 def main():
     options, args = option_parser()
     train_conf = options.train_conf
-    data = options.data
+    batch_size = options.batch_size
     dict_file = options.dict_file
     model_path = options.model_path
     label = options.label
     swig_paddle.initPaddle("--use_gpu=0")
     predict = SentimentPrediction(train_conf, dict_file, model_path, label)
-    predict.predict(data)
+
+    batch = []
+    for line in sys.stdin:
+        batch.append([predict.get_index(line)])
+        if len(batch) == batch_size:
+            predict.batch_predict(batch)
+            batch = []
+    if len(batch) > 0:
+        predict.batch_predict(batch)
 
 
 if __name__ == '__main__':
diff --git a/demo/sentiment/predict.sh b/demo/sentiment/predict.sh
index a889dfe3ec6635bd1ab2b60ae7207815cd205416..c72a8e8641516543ef267fcb4b448630246d1e8d 100755
--- a/demo/sentiment/predict.sh
+++ b/demo/sentiment/predict.sh
@@ -19,9 +19,9 @@ set -e
 model=model_output/pass-00002/
 config=trainer_config.py
 label=data/pre-imdb/labels.list
-python predict.py \
-     -n $config\
-     -w $model \
-     -b $label \
-     -d ./data/pre-imdb/dict.txt \
-     -i ./data/aclImdb/test/pos/10007_10.txt 
+cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \
+     --tconf=$config\
+     --model=$model \
+     --label=$label \
+     --dict=./data/pre-imdb/dict.txt \
+     --batch_size=1
diff --git a/demo/sequence_tagging/linear_crf.py b/demo/sequence_tagging/linear_crf.py
index 736b580bb87a3f2c12b369e231a10893fa95ce08..0624b17787aaf90732707e5f4fc6c2195b8f65ee 100644
--- a/demo/sequence_tagging/linear_crf.py
+++ b/demo/sequence_tagging/linear_crf.py
@@ -74,7 +74,8 @@ sum_evaluator(
 
 chunk_evaluator(
     name="chunk_f1",
-    input=[crf_decoding, chunk],
+    input=crf_decoding,
+    label=chunk,
     chunk_scheme="IOB",
     num_chunk_types=11, )
 
diff --git a/demo/sequence_tagging/rnn_crf.py b/demo/sequence_tagging/rnn_crf.py
index ad1e7b68e78ae202575623e139ad3727b0b9d30c..b9b41b2433461eb1bfb309659834661c2ae43253 100644
--- a/demo/sequence_tagging/rnn_crf.py
+++ b/demo/sequence_tagging/rnn_crf.py
@@ -112,7 +112,8 @@ sum_evaluator(
 
 chunk_evaluator(
     name="chunk_f1",
-    input=[crf_decoding, chunk],
+    input=crf_decoding,
+    label=chunk,
     chunk_scheme="IOB",
     num_chunk_types=11, )
 
diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt
index efcf8b0ad3d6f2f831fe71f3c09163015cc1ac96..6fa42fd0c71e78cc2fa6b0fe2cb970baf4ac89ed 100644
--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -7,25 +7,50 @@ if(NOT DEFINED SPHINX_THEME_DIR)
 endif()
 
 # configured documentation tools and intermediate build results
-set(BINARY_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/_build")
+set(BINARY_BUILD_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_build")
 
 # Sphinx cache with pickled ReST documents
-set(SPHINX_CACHE_DIR "${CMAKE_CURRENT_BINARY_DIR}/_doctrees")
+set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
 
-# HTML output directory
-set(SPHINX_HTML_DIR "${CMAKE_CURRENT_BINARY_DIR}/html")
+# HTML output director
+set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")
 
 configure_file(
-    "${CMAKE_CURRENT_SOURCE_DIR}/conf.py.in"
-    "${BINARY_BUILD_DIR}/conf.py"
+    "${CMAKE_CURRENT_SOURCE_DIR}/templates/conf.py.en.in"
+    "${BINARY_BUILD_DIR_EN}/conf.py"
     @ONLY)
 
 sphinx_add_target(paddle_docs
                   html
-                  ${BINARY_BUILD_DIR}
-                  ${SPHINX_CACHE_DIR}
+                  ${BINARY_BUILD_DIR_EN}
+                  ${SPHINX_CACHE_DIR_EN}
                   ${CMAKE_CURRENT_SOURCE_DIR}
-                  ${SPHINX_HTML_DIR})
+                  ${SPHINX_HTML_DIR_EN})
 
 add_dependencies(paddle_docs
   gen_proto_py)
+
+
+# configured documentation tools and intermediate build results
+set(BINARY_BUILD_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_build")
+
+# Sphinx cache with pickled ReST documents
+set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
+
+# HTML output directory
+set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")
+
+configure_file(
+    "${CMAKE_CURRENT_SOURCE_DIR}/templates/conf.py.cn.in"
+    "${BINARY_BUILD_DIR_CN}/conf.py"
+    @ONLY)
+
+sphinx_add_target(paddle_docs_cn
+                  html
+                  ${BINARY_BUILD_DIR_CN}
+                  ${SPHINX_CACHE_DIR_CN}
+                  ${CMAKE_CURRENT_SOURCE_DIR}
+                  ${SPHINX_HTML_DIR_CN})
+
+add_dependencies(paddle_docs_cn
+  gen_proto_py)
diff --git a/doc/about/index_cn.md b/doc/about/index_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..3bf030004d4de8c6f3cb773c6e78c09f40878c5f
--- /dev/null
+++ b/doc/about/index_cn.md
@@ -0,0 +1,11 @@
+关于PaddlePaddle
+================
+
+PaddlePaddle是一个最早由百度科学家和工程师共同研发的并行分布式深度学习平台，兼备易用性、高效性、灵活性和可扩展性，目前已被百度内部多个产品线广泛使用。
+PaddlePaddle目前已经开放源码, 但是远未完善，我们希望能在这个基础上不断的改进、扩展和延伸。
+同时我们希望广大开发者积极提供反馈和贡献源代码，建立一个活跃的开源社区。
+
+致谢
+--------
+
+在此，特别感谢PaddlePaddle的[所有贡献者](https://github.com/PaddlePaddle/Paddle/graphs/contributors)。
diff --git a/doc/about/index_en.rst b/doc/about/index_en.rst
index 8a372d2bc2b2c54b021ed63941482cbad8d8f719..065c430cdea802ed3c9f487cd00255b85a5598a5 100644
--- a/doc/about/index_en.rst
+++ b/doc/about/index_en.rst
@@ -11,4 +11,4 @@ We hope to build an active open source community both by providing feedback and
 Credits
 --------
 
-We owe many thanks to `all contributors and developers <https://github.com/PaddlePaddle/Paddle/blob/develop/authors>`_ of PaddlePaddle!
+We owe many thanks to `all contributors and developers <https://github.com/PaddlePaddle/Paddle/graphs/contributors>`_ of PaddlePaddle!
diff --git a/doc_cn/ui/data_provider/dataprovider.rst b/doc/api/data_provider/dataprovider_cn.rst
similarity index 99%
rename from doc_cn/ui/data_provider/dataprovider.rst
rename to doc/api/data_provider/dataprovider_cn.rst
index e6796429a78801eba5e5fb776dd6fbe3413115ea..6861ecece8cad19aa8a1e4e67e819f40873ef07c 100644
--- a/doc_cn/ui/data_provider/dataprovider.rst
+++ b/doc/api/data_provider/dataprovider_cn.rst
@@ -1,13 +1,13 @@
 DataProvider的介绍
 ==================
 
-DataProvider是PaddlePaddle负责提供数据的模块。其作用是将数据传入内存或显存，让神经网络可以进行训练或预测。用户可以通过简单使用Python接口 `PyDataProvider2 <pydataprovider2.html>`_ ，来自定义传数据的过程。如果有更复杂的使用，或者需要更高的效率，用户也可以在C++端自定义一个 ``DataProvider`` 。
+DataProvider是PaddlePaddle负责提供数据的模块。其作用是将数据传入内存或显存，让神经网络可以进行训练或预测。用户可以通过简单使用Python接口 `PyDataProvider2 <pydataprovider2.html>`_ ，来自定义传数据的过程。如果有更复杂的使用，或者需要更高的效率，用户也可以在C++端自定义一个 ``DataProvider`` 。
 
 PaddlePaddle需要用户在网络配置（trainer_config.py）中定义使用哪种DataProvider，并且在DataProvider中实现如何访问训练文件列表（train.list）或测试文件列表（test.list）。
 
-- train.list和test.list存放在本地（推荐直接存放到训练目录，以相对路径引用)。一般情况下，两者均为纯文本文件，其中每一行对应一个数据文件地址：
-  
-  - 如果数据文件存于本地磁盘，这个地址则为它的绝对路径或相对路径(相对于PaddlePaddle程序运行时的路径)。
-  - 地址也可以为hdfs文件路径，或者数据库连接路径等。
-  - 由于这个地址会被DataProvider使用，因此，如何解析该地址也是用户自定义DataProvider时需要考虑的地方。
+- train.list和test.list存放在本地（推荐直接存放到训练目录，以相对路径引用)。一般情况下，两者均为纯文本文件，其中每一行对应一个数据文件地址：
+  
+  - 如果数据文件存于本地磁盘，这个地址则为它的绝对路径或相对路径(相对于PaddlePaddle程序运行时的路径)。
+  - 地址也可以为hdfs文件路径，或者数据库连接路径等。
+  - 由于这个地址会被DataProvider使用，因此，如何解析该地址也是用户自定义DataProvider时需要考虑的地方。
 - 如果没有设置test.list，或设置为None，那么在训练过程中不会执行测试操作；否则，会根据命令行参数指定的测试方式，在训练过程中进行测试，从而防止过拟合。
diff --git a/doc/api/data_provider/index_en.rst b/doc/api/data_provider/dataprovider_en.rst
similarity index 100%
rename from doc/api/data_provider/index_en.rst
rename to doc/api/data_provider/dataprovider_en.rst
diff --git a/doc_cn/ui/data_provider/pydataprovider2.rst b/doc/api/data_provider/pydataprovider2_cn.rst
similarity index 95%
rename from doc_cn/ui/data_provider/pydataprovider2.rst
rename to doc/api/data_provider/pydataprovider2_cn.rst
index dce373118c5ae01c7ecf9afc15e1d9af9bf4ebe4..f243ea775a6b4c0961a8948653ad54ea9b531dcb 100644
--- a/doc_cn/ui/data_provider/pydataprovider2.rst
+++ b/doc/api/data_provider/pydataprovider2_cn.rst
@@ -1,227 +1,227 @@
-PyDataProvider2的使用
-=====================
-
-PyDataProvider2是PaddlePaddle使用Python提供数据的推荐接口。该接口使用多线程读取数据，并提供了简单的Cache功能；同时可以使用户只关注如何从文件中读取每一条数据，而不用关心数据如何传输，如何存储等等。
-
-..  contents::
-
-MNIST的使用场景
----------------
-
-我们以MNIST手写识别为例，来说明PyDataProvider2的简单使用场景。
-
-样例数据
-++++++++
-
-MNIST是一个包含有70,000张灰度图片的数字分类数据集。样例数据 ``mnist_train.txt`` 如下：
-
-..  literalinclude:: mnist_train.txt
-
-其中每行数据代表一张图片，行内使用 ``;`` 分成两部分。第一部分是图片的标签，为0-9中的一个数字；第二部分是28*28的图片像素灰度值。 对应的 ``train.list`` 即为这个数据文件的名字：
-
-..  literalinclude:: train.list
-
-dataprovider的使用
-++++++++++++++++++
-
-..  literalinclude:: mnist_provider.dict.py
-
-- 首先，引入PaddlePaddle的PyDataProvider2包。
-- 其次，定义一个Python的 `Decorator <http://www.learnpython.org/en/Decorators>`_ `@provider`_ 。用于将下一行的数据输入函数标记成一个PyDataProvider2，同时设置它的input_types属性。
-  
-  - `input_types`_：设置这个PyDataProvider2返回什么样的数据。本例根据网络配置中 ``data_layer`` 的名字，显式指定返回的是一个28*28维的稠密浮点数向量和一个[0-9]的10维整数标签。
-
-    ..  literalinclude:: mnist_config.py
-         :lines: 9-10
-
-  - 注意：如果用户不显示指定返回数据的对应关系，那么PaddlePaddle会根据layer的声明顺序，来确定对应关系。但这个关系可能不正确，所以推荐使用显式指定的方式来设置input_types。
-- 最后，实现数据输入函数（如本例的 ``process`` 函数）。
-
-  - 该函数的功能是：打开文本文件，读取每一行，将行中的数据转换成与input_types一致的格式，然后返回给PaddlePaddle进程。注意，
-    
-    - 返回的顺序需要和input_types中定义的顺序一致。
-    - 返回时，必须使用Python关键词 ``yield`` ，相关概念是 ``generator`` 。
-    - 一次yield调用，返回一条完整的样本。如果想为一个数据文件返回多条样本，只需要在函数中调用多次yield即可（本例中使用for循环进行多次调用）。
-  
-  - 该函数具有两个参数：
-  
-    - settings：在本例中没有使用，具体可以参考 `init_hook`_ 中的说明。
-    - filename：为 ``train.list`` 或 ``test.list`` 中的一行，即若干数据文件路径的某一个。
-
-网络配置中的调用
-++++++++++++++++
-
-在网络配置里，只需要一行代码就可以调用这个PyDataProvider2，如，
-
-..  literalinclude:: mnist_config.py
-     :lines: 1-7
-
-训练数据是 ``train.list`` ，没有测试数据，调用的PyDataProvider2是 ``mnist_provider`` 模块中的 ``process`` 函数。
-
-小结
-+++++
-
-至此，简单的PyDataProvider2样例就说明完毕了。对用户来说，仅需要知道如何从 **一个文件** 中读取 **一条样本** ，就可以将数据传送给PaddlePaddle了。而PaddlePaddle则会帮用户做以下工作：
-
-* 将数据组合成Batch进行训练
-* 对训练数据进行Shuffle
-* 多线程的数据读取
-* 缓存训练数据到内存(可选)
-* CPU->GPU双缓存
-
-是不是很简单呢？
-
-时序模型的使用场景
-------------------
-样例数据
-++++++++
-
-时序模型是指数据的某一维度是一个序列形式，即包含时间步信息。所谓时间步信息，不一定和时间有关系，只是说明数据的顺序是重要的。例如，文本信息就是一个序列数据。
-
-本例采用英文情感分类的数据，即将一段英文文本数据，分类成正面情绪和负面情绪两类(用0和1表示)。样例数据 ``sentimental_train.txt`` 如下：
-
-..  literalinclude:: sentimental_train.txt
-
-dataprovider的使用
-++++++++++++++++++
-
-相对MNIST而言，这个dataprovider较复杂，主要原因是增加了初始化机制 `init_hook`_。本例的 ``on_init`` 函数就是根据该机制配置的，它会在dataprovider创建的时候执行。
-
-- 其中 ``input_types`` 和在 `@provider`_ 中配置的效果一致。本例中的输入特征是词ID的序列，因此使用 ``integer_value_sequence`` 类型来设置。
-- 将 ``dictionary`` 存入settings对象，在 ``process`` 函数中使用。 dictionary是从网络配置中传入的dict对象，即一个将单词字符串映射到单词ID的字典。
-
-..  literalinclude:: sentimental_provider.py
-
-网络配置中的调用
-++++++++++++++++
-
-调用这个PyDataProvider2的方法，基本上和MNIST样例一致，除了
-
-* 在配置中需要读取外部字典。
-* 在声明DataProvider的时候传入dictionary作为参数。
-
-..  literalinclude:: sentimental_config.py
-     :emphasize-lines: 12-14
-
-参考(Reference)
----------------
-
-@provider
-+++++++++
-
-``@provider`` 是一个Python的 `Decorator`_ ，可以将某一个函数标记成一个PyDataProvider2。如果不了解 `Decorator`_ 是什么也没关系，只需知道这是一个标记属性的方法就可以了。它包含的属性参数如下:
-
-*  input_types：数据输入格式。具体的格式说明，请参考 `input_types`_ 。
-*  should_shuffle：是不是要对数据做Shuffle。训练时默认shuffle，测试时默认不shuffle。
-*  min_pool_size：设置内存中最小暂存的数据条数，也是PaddlePaddle所能够保证的shuffle粒度。如果为-1，则会预先读取全部数据到内存中。
-*  pool_size： 设置内存中暂存的数据条数。如果为-1（默认），则不在乎内存暂存多少条数据。如果设置，则推荐大于训练时batch size的值，并且在内存足够的情况下越大越好。
-*  can_over_batch_size：是否允许暂存略微多余pool_size的数据。由于这样做可以避免很多死锁问题，一般推荐设置成True。
-*  calc_batch_size：可以传入一个函数，用于自定义每条数据的batch size（默认为1）。
-*  cache： 数据缓存的策略，具体请参考 `cache`_ 。
-*  init_hook：初始化时调用的函数，具体请参考 `init_hook`_ 。
-*  check：如果为true，会根据input_types检查数据的合法性。
-*  check_fail_continue：如果为true，那么当check出数据不合法时，会扔到这条数据，继续训练或预测。（对check=false的情况，没有作用）
-
-input_types
-+++++++++++
-
-PaddlePaddle的数据包括四种主要类型，和三种序列模式。
-
-四种数据类型：
-
-* dense_vector：稠密的浮点数向量。
-* sparse_binary_vector：稀疏的01向量，即大部分值为0，但有值的地方必须为1。
-* sparse_float_vector：稀疏的向量，即大部分值为0，但有值的部分可以是任何浮点数。
-* integer：整数标签。
-
-三种序列模式：
-
-* SequenceType.NO_SEQUENCE：不是一条序列
-* SequenceType.SEQUENCE：是一条时间序列
-* SequenceType.SUB_SEQUENCE： 是一条时间序列，且序列的每一个元素还是一个时间序列。
-
-不同的数据类型和序列模式返回的格式不同，列表如下：
-
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-|                      | NO_SEQUENCE         | SEQUENCE                          |  SUB_SEQUENCE                                  |
-+======================+=====================+===================================+================================================+
-| dense_vector         | [f, f, ...]         | [[f, ...], [f, ...], ...]         | [[[f, ...], ...], [[f, ...], ...],...]         |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-| sparse_binary_vector | [i, i, ...]         | [[i, ...], [i, ...], ...]         | [[[i, ...], ...], [[i, ...], ...],...]         |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-| sparse_float_vector  | [(i,f), (i,f), ...] | [[(i,f), ...], [(i,f), ...], ...] | [[[(i,f), ...], ...], [[(i,f), ...], ...],...] |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-| integer_value        |  i                  | [i, i, ...]                       | [[i, ...], [i, ...], ...]                      |
-+----------------------+---------------------+-----------------------------------+------------------------------------------------+
-
-其中，f代表一个浮点数，i代表一个整数。
-
-注意：对sparse_binary_vector和sparse_float_vector，PaddlePaddle存的是有值位置的索引。例如，
-
-- 对一个5维非序列的稀疏01向量 ``[0, 1, 1, 0, 0]`` ，类型是sparse_binary_vector，返回的是 ``[1, 2]`` 。
-- 对一个5维非序列的稀疏浮点向量 ``[0, 0.5, 0.7, 0, 0]`` ，类型是sparse_float_vector，返回的是 ``[(1, 0.5), (2, 0.7)]`` 。
-
-init_hook
-+++++++++
-
-init_hook可以传入一个函数。该函数在初始化的时候会被调用，其参数如下:
-
-* 第一个参数是settings对象，它和数据传入函数的第一个参数（如本例中 ``process`` 函数的 ``settings`` 参数）必须一致。该对象具有以下两个属性：
-    * settings.input_types：数据输入格式，具体请参考 `input_types`_ 。
-    * settings.logger：一个logging对象。
-* 其他参数使用 ``kwargs`` （key word arguments）传入，包括以下两种：
-    * PaddlePaddle定义的参数: 1）is_train：bool型参数，表示用于训练或预测；2）file_list：所有文件列表。
-    * 用户定义的参数：使用args在网络配置中设置。
-
-注意：PaddlePaddle保留添加参数的权力，因此init_hook尽量使用 ``**kwargs`` 来接受不使用的函数以保证兼容性。
-
-cache
-+++++
-
-PyDataProvider2提供了两种简单的Cache策略：
-
-* CacheType.NO_CACHE：不缓存任何数据，每次都会从python端读取数据
-* CacheType.CACHE_PASS_IN_MEM：第一个pass会从python端读取数据，剩下的pass会直接从内存里
-  读取数据。 
-
-
-注意事项
---------
-
-可能的内存泄露问题
-++++++++++++++++++
-
-PaddlePaddle将train.list中的每一行都传递给process函数，从而生成多个generator。当训练数据非常多时，就会生成非常多的generator。
-
-虽然每个generator在没有调用的时候，是几乎不占内存的；但当调用过一次后，generator便会存下当前的上下文(Context)，而这个Context可能会非常大。并且，generator至少需要调用两次才会知道是否停止。所以，即使process函数里面只有一个yield，也需要两次随机选择到相同generator的时候，才会释放该段内存。
-
-..  code-block:: python
-
-    def func():
-        yield 0
-
-    f = func()  # 创建generator
-    tmp = next(f)  # 调用一次，返回0
-    tmp = next(f)  # 调用第二次的时候，才会Stop Iteration
-
-由于顺序调用这些generator不会出现上述问题，因此有两种解决方案：
-
-1. **最佳推荐**：将样本的地址放入另一个文本文件，train.list写入那个文本文件的地址。即不要将每一个样本都放入train.list。
-2. 在generator的上下文中尽量留下非常少的变量引用，例如
-
-..  code-block:: python
-
-    def real_process(fn):
-        # ... read from fn
-        return result   # 当函数返回的时候，python可以解除掉内部变量的引用。
-
-    def process(fn):
-        yield real_process(fn)
-
-注意：这个问题是PyDataProvider读数据时候的逻辑问题，很难整体修正。
-
-内存不够用的情况
-++++++++++++++++
-
-PyDataProvider2会尽可能多的使用内存。因此，对于内存较小的机器，推荐使用 ``pool_size`` 变量来设置内存中暂存的数据条。具体请参考 `@provider`_ 中的说明。
-
+PyDataProvider2的使用
+=====================
+
+PyDataProvider2是PaddlePaddle使用Python提供数据的推荐接口。该接口使用多线程读取数据，并提供了简单的Cache功能；同时可以使用户只关注如何从文件中读取每一条数据，而不用关心数据如何传输，如何存储等等。
+
+..  contents::
+
+MNIST的使用场景
+---------------
+
+我们以MNIST手写识别为例，来说明PyDataProvider2的简单使用场景。
+
+样例数据
+++++++++
+
+MNIST是一个包含有70,000张灰度图片的数字分类数据集。样例数据 ``mnist_train.txt`` 如下：
+
+..  literalinclude:: src/mnist_train.txt
+
+其中每行数据代表一张图片，行内使用 ``;`` 分成两部分。第一部分是图片的标签，为0-9中的一个数字；第二部分是28*28的图片像素灰度值。 对应的 ``train.list`` 即为这个数据文件的名字：
+
+..  literalinclude:: src/train.list
+
+dataprovider的使用
+++++++++++++++++++
+
+..  literalinclude:: src/mnist_provider.dict.py
+
+- 首先，引入PaddlePaddle的PyDataProvider2包。
+- 其次，定义一个Python的 `Decorator <http://www.learnpython.org/en/Decorators>`_ `@provider`_ 。用于将下一行的数据输入函数标记成一个PyDataProvider2，同时设置它的input_types属性。
+  
+  - `input_types`_：设置这个PyDataProvider2返回什么样的数据。本例根据网络配置中 ``data_layer`` 的名字，显式指定返回的是一个28*28维的稠密浮点数向量和一个[0-9]的10维整数标签。
+
+    ..  literalinclude:: src/mnist_config.py
+         :lines: 9-10
+
+  - 注意：如果用户不显示指定返回数据的对应关系，那么PaddlePaddle会根据layer的声明顺序，来确定对应关系。但这个关系可能不正确，所以推荐使用显式指定的方式来设置input_types。
+- 最后，实现数据输入函数（如本例的 ``process`` 函数）。
+
+  - 该函数的功能是：打开文本文件，读取每一行，将行中的数据转换成与input_types一致的格式，然后返回给PaddlePaddle进程。注意，
+    
+    - 返回的顺序需要和input_types中定义的顺序一致。
+    - 返回时，必须使用Python关键词 ``yield`` ，相关概念是 ``generator`` 。
+    - 一次yield调用，返回一条完整的样本。如果想为一个数据文件返回多条样本，只需要在函数中调用多次yield即可（本例中使用for循环进行多次调用）。
+  
+  - 该函数具有两个参数：
+  
+    - settings：在本例中没有使用，具体可以参考 `init_hook`_ 中的说明。
+    - filename：为 ``train.list`` 或 ``test.list`` 中的一行，即若干数据文件路径的某一个。
+
+网络配置中的调用
+++++++++++++++++
+
+在网络配置里，只需要一行代码就可以调用这个PyDataProvider2，如，
+
+..  literalinclude:: src/mnist_config.py
+     :lines: 1-7
+
+训练数据是 ``train.list`` ，没有测试数据，调用的PyDataProvider2是 ``mnist_provider`` 模块中的 ``process`` 函数。
+
+小结
++++++
+
+至此，简单的PyDataProvider2样例就说明完毕了。对用户来说，仅需要知道如何从 **一个文件** 中读取 **一条样本** ，就可以将数据传送给PaddlePaddle了。而PaddlePaddle则会帮用户做以下工作：
+
+* 将数据组合成Batch进行训练
+* 对训练数据进行Shuffle
+* 多线程的数据读取
+* 缓存训练数据到内存(可选)
+* CPU->GPU双缓存
+
+是不是很简单呢？
+
+时序模型的使用场景
+------------------
+样例数据
+++++++++
+
+时序模型是指数据的某一维度是一个序列形式，即包含时间步信息。所谓时间步信息，不一定和时间有关系，只是说明数据的顺序是重要的。例如，文本信息就是一个序列数据。
+
+本例采用英文情感分类的数据，即将一段英文文本数据，分类成正面情绪和负面情绪两类(用0和1表示)。样例数据 ``sentimental_train.txt`` 如下：
+
+..  literalinclude:: src/sentimental_train.txt
+
+dataprovider的使用
+++++++++++++++++++
+
+相对MNIST而言，这个dataprovider较复杂，主要原因是增加了初始化机制 `init_hook`_。本例的 ``on_init`` 函数就是根据该机制配置的，它会在dataprovider创建的时候执行。
+
+- 其中 ``input_types`` 和在 `@provider`_ 中配置的效果一致。本例中的输入特征是词ID的序列，因此使用 ``integer_value_sequence`` 类型来设置。
+- 将 ``dictionary`` 存入settings对象，在 ``process`` 函数中使用。 dictionary是从网络配置中传入的dict对象，即一个将单词字符串映射到单词ID的字典。
+
+..  literalinclude:: src/sentimental_provider.py
+
+网络配置中的调用
+++++++++++++++++
+
+调用这个PyDataProvider2的方法，基本上和MNIST样例一致，除了
+
+* 在配置中需要读取外部字典。
+* 在声明DataProvider的时候传入dictionary作为参数。
+
+..  literalinclude:: src/sentimental_config.py
+     :emphasize-lines: 12-14
+
+参考(Reference)
+---------------
+
+@provider
++++++++++
+
+``@provider`` 是一个Python的 `Decorator`_ ，可以将某一个函数标记成一个PyDataProvider2。如果不了解 `Decorator`_ 是什么也没关系，只需知道这是一个标记属性的方法就可以了。它包含的属性参数如下:
+
+*  input_types：数据输入格式。具体的格式说明，请参考 `input_types`_ 。
+*  should_shuffle：是不是要对数据做Shuffle。训练时默认shuffle，测试时默认不shuffle。
+*  min_pool_size：设置内存中最小暂存的数据条数，也是PaddlePaddle所能够保证的shuffle粒度。如果为-1，则会预先读取全部数据到内存中。
+*  pool_size： 设置内存中暂存的数据条数。如果为-1（默认），则不在乎内存暂存多少条数据。如果设置，则推荐大于训练时batch size的值，并且在内存足够的情况下越大越好。
+*  can_over_batch_size：是否允许暂存略微多余pool_size的数据。由于这样做可以避免很多死锁问题，一般推荐设置成True。
+*  calc_batch_size：可以传入一个函数，用于自定义每条数据的batch size（默认为1）。
+*  cache： 数据缓存的策略，具体请参考 `cache`_ 。
+*  init_hook：初始化时调用的函数，具体请参考 `init_hook`_ 。
+*  check：如果为true，会根据input_types检查数据的合法性。
+*  check_fail_continue：如果为true，那么当check出数据不合法时，会扔到这条数据，继续训练或预测。（对check=false的情况，没有作用）
+
+input_types
++++++++++++
+
+PaddlePaddle的数据包括四种主要类型，和三种序列模式。
+
+四种数据类型：
+
+* dense_vector：稠密的浮点数向量。
+* sparse_binary_vector：稀疏的01向量，即大部分值为0，但有值的地方必须为1。
+* sparse_float_vector：稀疏的向量，即大部分值为0，但有值的部分可以是任何浮点数。
+* integer：整数标签。
+
+三种序列模式：
+
+* SequenceType.NO_SEQUENCE：不是一条序列
+* SequenceType.SEQUENCE：是一条时间序列
+* SequenceType.SUB_SEQUENCE： 是一条时间序列，且序列的每一个元素还是一个时间序列。
+
+不同的数据类型和序列模式返回的格式不同，列表如下：
+
++----------------------+---------------------+-----------------------------------+------------------------------------------------+
+|                      | NO_SEQUENCE         | SEQUENCE                          |  SUB_SEQUENCE                                  |
++======================+=====================+===================================+================================================+
+| dense_vector         | [f, f, ...]         | [[f, ...], [f, ...], ...]         | [[[f, ...], ...], [[f, ...], ...],...]         |
++----------------------+---------------------+-----------------------------------+------------------------------------------------+
+| sparse_binary_vector | [i, i, ...]         | [[i, ...], [i, ...], ...]         | [[[i, ...], ...], [[i, ...], ...],...]         |
++----------------------+---------------------+-----------------------------------+------------------------------------------------+
+| sparse_float_vector  | [(i,f), (i,f), ...] | [[(i,f), ...], [(i,f), ...], ...] | [[[(i,f), ...], ...], [[(i,f), ...], ...],...] |
++----------------------+---------------------+-----------------------------------+------------------------------------------------+
+| integer_value        |  i                  | [i, i, ...]                       | [[i, ...], [i, ...], ...]                      |
++----------------------+---------------------+-----------------------------------+------------------------------------------------+
+
+其中，f代表一个浮点数，i代表一个整数。
+
+注意：对sparse_binary_vector和sparse_float_vector，PaddlePaddle存的是有值位置的索引。例如，
+
+- 对一个5维非序列的稀疏01向量 ``[0, 1, 1, 0, 0]`` ，类型是sparse_binary_vector，返回的是 ``[1, 2]`` 。
+- 对一个5维非序列的稀疏浮点向量 ``[0, 0.5, 0.7, 0, 0]`` ，类型是sparse_float_vector，返回的是 ``[(1, 0.5), (2, 0.7)]`` 。
+
+init_hook
++++++++++
+
+init_hook可以传入一个函数。该函数在初始化的时候会被调用，其参数如下:
+
+* 第一个参数是settings对象，它和数据传入函数的第一个参数（如本例中 ``process`` 函数的 ``settings`` 参数）必须一致。该对象具有以下两个属性：
+    * settings.input_types：数据输入格式，具体请参考 `input_types`_ 。
+    * settings.logger：一个logging对象。
+* 其他参数使用 ``kwargs`` （key word arguments）传入，包括以下两种：
+    * PaddlePaddle定义的参数: 1）is_train：bool型参数，表示用于训练或预测；2）file_list：所有文件列表。
+    * 用户定义的参数：使用args在网络配置中设置。
+
+注意：PaddlePaddle保留添加参数的权力，因此init_hook尽量使用 ``**kwargs`` 来接受不使用的函数以保证兼容性。
+
+cache
++++++
+
+PyDataProvider2提供了两种简单的Cache策略：
+
+* CacheType.NO_CACHE：不缓存任何数据，每次都会从python端读取数据
+* CacheType.CACHE_PASS_IN_MEM：第一个pass会从python端读取数据，剩下的pass会直接从内存里
+  读取数据。 
+
+
+注意事项
+--------
+
+可能的内存泄露问题
+++++++++++++++++++
+
+PaddlePaddle将train.list中的每一行都传递给process函数，从而生成多个generator。当训练数据非常多时，就会生成非常多的generator。
+
+虽然每个generator在没有调用的时候，是几乎不占内存的；但当调用过一次后，generator便会存下当前的上下文(Context)，而这个Context可能会非常大。并且，generator至少需要调用两次才会知道是否停止。所以，即使process函数里面只有一个yield，也需要两次随机选择到相同generator的时候，才会释放该段内存。
+
+..  code-block:: python
+
+    def func():
+        yield 0
+
+    f = func()  # 创建generator
+    tmp = next(f)  # 调用一次，返回0
+    tmp = next(f)  # 调用第二次的时候，才会Stop Iteration
+
+由于顺序调用这些generator不会出现上述问题，因此有两种解决方案：
+
+1. **最佳推荐**：将样本的地址放入另一个文本文件，train.list写入那个文本文件的地址。即不要将每一个样本都放入train.list。
+2. 在generator的上下文中尽量留下非常少的变量引用，例如
+
+..  code-block:: python
+
+    def real_process(fn):
+        # ... read from fn
+        return result   # 当函数返回的时候，python可以解除掉内部变量的引用。
+
+    def process(fn):
+        yield real_process(fn)
+
+注意：这个问题是PyDataProvider读数据时候的逻辑问题，很难整体修正。
+
+内存不够用的情况
+++++++++++++++++
+
+PyDataProvider2会尽可能多的使用内存。因此，对于内存较小的机器，推荐使用 ``pool_size`` 变量来设置内存中暂存的数据条。具体请参考 `@provider`_ 中的说明。
+
diff --git a/doc/api/data_provider/pydataprovider2_en.rst b/doc/api/data_provider/pydataprovider2_en.rst
index b42cbca576e4b5d67d50d0156939a01faae4533d..30357be32538db4423ad0eaf899138256c84edc7 100644
--- a/doc/api/data_provider/pydataprovider2_en.rst
+++ b/doc/api/data_provider/pydataprovider2_en.rst
@@ -1,5 +1,7 @@
+..  _api_pydataprovider2:
+
 PyDataProvider2
-=================
+===============
 
 We highly recommand users to use PyDataProvider2 to provide training or testing
 data to PaddlePaddle. The user only needs to focus on how to read a single
@@ -22,18 +24,18 @@ of 28 x 28 pixels.
 
 A small part of the original data as an example is shown as below:
 
-.. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_train.txt
+.. literalinclude:: src/mnist_train.txt
 
 Each line of the data contains two parts, separated by :code:`;`. The first part is
 label of an image. The second part contains 28x28 pixel float values.
 
 Just write path of the above data into train.list. It looks like this:
 
-.. literalinclude:: ../../../doc_cn/ui/data_provider/train.list
+.. literalinclude:: src/train.list
 
 The corresponding dataprovider is shown as below:
 
-.. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_provider.py
+.. literalinclude:: src/mnist_provider.dict.py
 
 The first line imports PyDataProvider2 package.
 The main function is the process function, that has two parameters.
@@ -72,7 +74,7 @@ sample by using keywords :code:`yield`.
 Only a few lines of codes need to be added into the training configuration file,
 you can take this as an example.
 
-.. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_config.py
+.. literalinclude:: src/mnist_config.py
 
 Here we specify training data by :code:`train.list`, and no testing data is specified.
 The method which actually provide data is :code:`process`.
@@ -81,7 +83,7 @@ User also can use another style to provide data, which defines the
 :code:`data_layer`'s name explicitly when `yield`. For example,
 the :code:`dataprovider` is shown as below.
 
-.. literalinclude:: ../../../doc_cn/ui/data_provider/mnist_provider.dict.py
+.. literalinclude:: src/mnist_provider.dict.py
    :linenos:
 
 If user did't give the :code:`data_layer`'s name, PaddlePaddle will use
@@ -102,6 +104,8 @@ And PaddlePadle will do all of the rest things\:
 
 Is this cool?
 
+..  _api_pydataprovider2_sequential_model:
+
 DataProvider for the sequential model
 -------------------------------------
 A sequence model takes sequences as its input. A sequence is made up of several
@@ -117,11 +121,11 @@ negative sentiment (marked by 0 and 1 respectively).
 
 A small part of the original data as an example can be found in the path below:
 
-.. literalinclude:: ../../../doc_cn/ui/data_provider/sentimental_train.txt
+.. literalinclude:: src/sentimental_train.txt
 
 The corresponding data provider can be found in the path below:
 
-.. literalinclude:: ../../../doc_cn/ui/data_provider/sentimental_provider.py
+.. literalinclude:: src/sentimental_provider.py
 
 This data provider for sequential model is a little more complex than that
 for MINST dataset.
@@ -139,7 +143,7 @@ initialized. The :code:`on_init` function has the following parameters:
 To pass these parameters into DataProvider, the following lines should be added
 into trainer configuration file.
 
-.. literalinclude:: ../../../doc_cn/ui/data_provider/sentimental_config.py
+.. literalinclude:: src/sentimental_config.py
 
 The definition is basically same as MNIST example, except:
 * Load dictionary in this configuration
diff --git a/doc_cn/ui/data_provider/mnist_config.py b/doc/api/data_provider/src/mnist_config.py
similarity index 100%
rename from doc_cn/ui/data_provider/mnist_config.py
rename to doc/api/data_provider/src/mnist_config.py
diff --git a/doc_cn/ui/data_provider/mnist_provider.dict.py b/doc/api/data_provider/src/mnist_provider.dict.py
similarity index 100%
rename from doc_cn/ui/data_provider/mnist_provider.dict.py
rename to doc/api/data_provider/src/mnist_provider.dict.py
diff --git a/doc_cn/ui/data_provider/mnist_train.txt b/doc/api/data_provider/src/mnist_train.txt
similarity index 100%
rename from doc_cn/ui/data_provider/mnist_train.txt
rename to doc/api/data_provider/src/mnist_train.txt
diff --git a/doc_cn/ui/data_provider/sentimental_config.py b/doc/api/data_provider/src/sentimental_config.py
similarity index 100%
rename from doc_cn/ui/data_provider/sentimental_config.py
rename to doc/api/data_provider/src/sentimental_config.py
diff --git a/doc_cn/ui/data_provider/sentimental_provider.py b/doc/api/data_provider/src/sentimental_provider.py
similarity index 100%
rename from doc_cn/ui/data_provider/sentimental_provider.py
rename to doc/api/data_provider/src/sentimental_provider.py
diff --git a/doc_cn/ui/data_provider/sentimental_train.txt b/doc/api/data_provider/src/sentimental_train.txt
similarity index 100%
rename from doc_cn/ui/data_provider/sentimental_train.txt
rename to doc/api/data_provider/src/sentimental_train.txt
diff --git a/doc_cn/ui/data_provider/train.list b/doc/api/data_provider/src/train.list
similarity index 100%
rename from doc_cn/ui/data_provider/train.list
rename to doc/api/data_provider/src/train.list
diff --git a/doc/api/index_cn.rst b/doc/api/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..3718cd73a2003b8ef6c406a9bd51dc68e76402dc
--- /dev/null
+++ b/doc/api/index_cn.rst
@@ -0,0 +1,37 @@
+API中文手册
+============
+
+DataProvider API
+----------------
+
+..  toctree::
+    :maxdepth: 1
+
+    data_provider/dataprovider_cn.rst
+    data_provider/pydataprovider2_cn.rst
+
+..  _api_trainer_config:
+
+Model Config API
+----------------
+
+..  toctree::
+    :maxdepth: 1
+
+    trainer_config_helpers/optimizers.rst
+    trainer_config_helpers/data_sources.rst
+    trainer_config_helpers/layers.rst
+    trainer_config_helpers/activations.rst 
+    trainer_config_helpers/poolings.rst
+    trainer_config_helpers/networks.rst
+    trainer_config_helpers/evaluators.rst
+    trainer_config_helpers/attrs.rst
+
+
+Applications API
+----------------
+
+..  toctree::
+    :maxdepth: 1
+
+    predict/swig_py_paddle_cn.rst
diff --git a/doc/api/index_en.rst b/doc/api/index_en.rst
index 9930f93e10e64d1e2306d34bb32aedc858bfcb68..10c297a71d6988c002de868e804ed9ee2345fbd7 100644
--- a/doc/api/index_en.rst
+++ b/doc/api/index_en.rst
@@ -1,35 +1,37 @@
 API
-====
+===
 
 DataProvider API
 ----------------
 
 ..  toctree::
-  :maxdepth: 1
+    :maxdepth: 1
 
-  data_provider/index_en.rst
-  data_provider/pydataprovider2_en.rst
+    data_provider/dataprovider_en.rst
+    data_provider/pydataprovider2_en.rst
+
+..  _api_trainer_config:
 
 Model Config API
 ----------------
 
 ..  toctree::
-  :maxdepth: 1
+    :maxdepth: 1
 
-  trainer_config_helpers/optimizers.rst
-  trainer_config_helpers/data_sources.rst
-  trainer_config_helpers/layers.rst
-  trainer_config_helpers/activations.rst 
-  trainer_config_helpers/poolings.rst
-  trainer_config_helpers/networks.rst
-  trainer_config_helpers/evaluators.rst
-  trainer_config_helpers/attrs.rst
+    trainer_config_helpers/optimizers.rst
+    trainer_config_helpers/data_sources.rst
+    trainer_config_helpers/layers.rst
+    trainer_config_helpers/activations.rst 
+    trainer_config_helpers/poolings.rst
+    trainer_config_helpers/networks.rst
+    trainer_config_helpers/evaluators.rst
+    trainer_config_helpers/attrs.rst
 
 
 Applications API
 ----------------
 
 ..  toctree::
-  :maxdepth: 1
+    :maxdepth: 1
 
-  predict/swig_py_paddle_en.rst
+    predict/swig_py_paddle_en.rst
diff --git a/doc/api/predict/predict_sample.py b/doc/api/predict/src/predict_sample.py
similarity index 100%
rename from doc/api/predict/predict_sample.py
rename to doc/api/predict/src/predict_sample.py
diff --git a/doc_cn/ui/predict/swig_py_paddle.rst b/doc/api/predict/swig_py_paddle_cn.rst
similarity index 97%
rename from doc_cn/ui/predict/swig_py_paddle.rst
rename to doc/api/predict/swig_py_paddle_cn.rst
index 05f25345c5246687363dee1931310120b5723d0b..15e35353bb25e7906191e47eae49c824b521c7fd 100644
--- a/doc_cn/ui/predict/swig_py_paddle.rst
+++ b/doc/api/predict/swig_py_paddle_cn.rst
@@ -34,7 +34,7 @@ PaddlePaddle使用swig对常用的预测接口进行了封装，通过编译会
 
 如下是一段使用mnist model来实现手写识别的预测代码。完整的代码见 ``src_root/doc/ui/predict/predict_sample.py`` 。mnist model可以通过 ``src_root\demo\mnist`` 目录下的demo训练出来。
 
-..  literalinclude:: ../../../doc/ui/predict/predict_sample.py
+..  literalinclude:: src/predict_sample.py
     :language: python
     :lines: 15-18,121-136
 
diff --git a/doc/api/predict/swig_py_paddle_en.rst b/doc/api/predict/swig_py_paddle_en.rst
index 9845cd1607b425dc0a4ddc665aab40d96fa2fbe4..1c628e6971fa5643e6a9ca629488049957686193 100644
--- a/doc/api/predict/swig_py_paddle_en.rst
+++ b/doc/api/predict/swig_py_paddle_en.rst
@@ -13,7 +13,7 @@ Here is a sample python script that shows the typical prediction process for the
 MNIST classification problem. A complete sample code could be found at
 :code:`src_root/doc/ui/predict/predict_sample.py`.
 
-..  literalinclude:: ./predict_sample.py
+..  literalinclude:: src/predict_sample.py
     :language: python
     :lines: 15-18,90-100,101-104
 
@@ -23,7 +23,7 @@ python's :code:`help()` function. Let's walk through the above python script:
 
 * At the beginning, use :code:`swig_paddle.initPaddle()` to initialize
   PaddlePaddle with command line arguments, for more about command line arguments
-  see `Command Line Arguments <../cmd_argument/detail_introduction.html>`_.
+  see :ref:`cmd_detail_introduction` .
 * Parse the configuration file that is used in training with :code:`parse_config()`.
   Because data to predict with always have no label, and output of prediction work
   normally is the output layer rather than the cost layer, so you should modify
@@ -36,7 +36,7 @@ python's :code:`help()` function. Let's walk through the above python script:
     - Note: As swig_paddle can only accept C++ matrices, we offer a utility
       class DataProviderConverter that can accept the same input data with
       PyDataProvider2, for more information please refer to document
-      of `PyDataProvider2 <../data_provider/pydataprovider2.html>`_.
+      of :ref:`api_pydataprovider2` .
 * Do the prediction with :code:`forwardTest()`, which takes the converted
   input data and outputs the activations of the output layer.
 
diff --git a/doc/api/trainer_config_helpers/data_sources.rst b/doc/api/trainer_config_helpers/data_sources.rst
index 44ea59df43762508e86c7b867fcf136d84c8351e..b9dd4dda01ae59d1260356aff50ddf298d02c87f 100644
--- a/doc/api/trainer_config_helpers/data_sources.rst
+++ b/doc/api/trainer_config_helpers/data_sources.rst
@@ -1,3 +1,5 @@
+..  _api_trainer_config_helpers_data_sources:
+
 DataSources
 ===========
 
diff --git a/doc/api/trainer_config_helpers/layers.rst b/doc/api/trainer_config_helpers/layers.rst
index b487b739a719e9f7118efcc143301da36f7a978e..52a6cfb120504d57617f0d777b5ca49cd7d269d7 100644
--- a/doc/api/trainer_config_helpers/layers.rst
+++ b/doc/api/trainer_config_helpers/layers.rst
@@ -1,3 +1,5 @@
+..  _api_trainer_config_helpers_layers:
+
 ======
 Layers
 ======
@@ -20,6 +22,8 @@ LayerOutput
 Data layer
 ===========
 
+..  _api_trainer_config_helpers_layers_data_layer:
+
 data_layer
 ----------
 ..  automodule:: paddle.trainer_config_helpers.layers
@@ -29,6 +33,8 @@ data_layer
 Fully Connected Layers
 ======================
 
+..  _api_trainer_config_helpers_layers_fc_layer:
+
 fc_layer
 --------
 ..  automodule:: paddle.trainer_config_helpers.layers
@@ -68,6 +74,8 @@ img_conv_layer
     :members: img_conv_layer
     :noindex:
 
+..  _api_trainer_config_helpers_layers_context_projection:
+
 context_projection 
 ------------------
 ..  automodule:: paddle.trainer_config_helpers.layers
@@ -185,6 +193,8 @@ mixed_layer
     :members: mixed_layer
     :noindex:
 
+..  _api_trainer_config_helpers_layers_embedding_layer:
+
 embedding_layer
 ---------------
 ..  automodule:: paddle.trainer_config_helpers.layers
@@ -237,6 +247,8 @@ trans_full_matrix_projection
 Aggregate Layers
 ================
 
+..  _api_trainer_config_helpers_layers_pooling_layer:
+
 pooling_layer
 -------------
 ..  automodule:: paddle.trainer_config_helpers.layers
@@ -333,6 +345,8 @@ tensor_layer
     :members: tensor_layer
     :noindex:
 
+..  _api_trainer_config_helpers_layers_cos_sim:
+
 cos_sim
 -------
 ..  automodule:: paddle.trainer_config_helpers.layers
diff --git a/doc/api/trainer_config_helpers/networks.rst b/doc/api/trainer_config_helpers/networks.rst
index 29c52c5ce3078f1755162dbbdd65a059d8ba9fa4..e13c368051abe3c50036c3baab988f170df4c641 100644
--- a/doc/api/trainer_config_helpers/networks.rst
+++ b/doc/api/trainer_config_helpers/networks.rst
@@ -13,6 +13,8 @@ sequence_conv_pool
     :members: sequence_conv_pool
     :noindex:
 
+..  _api_trainer_config_helpers_network_text_conv_pool:
+
 text_conv_pool
 --------------
 ..  automodule:: paddle.trainer_config_helpers.networks
diff --git a/doc_cn/faq/index.rst b/doc/faq/index_cn.rst
similarity index 91%
rename from doc_cn/faq/index.rst
rename to doc/faq/index_cn.rst
index df8f1308cbc4d93cfeab4d921dcbbf5155eb4cc1..f2f114065c4109cd0b36752da622ea01ce822ceb 100644
--- a/doc_cn/faq/index.rst
+++ b/doc/faq/index_cn.rst
@@ -1,5 +1,5 @@
 ####################
-PaddlePaddle常见问题
+FAQ
 ####################
 
 ..  contents::
@@ -33,10 +33,9 @@ PyDataProvider使用的是异步加载，同时在内存里直接随即选取数
 个内存池实际上决定了shuffle的粒度。所以，如果将这个内存池减小，又要保证数据是随机的，
 那么最好将数据文件在每次读取之前做一次shuffle。可能的代码为
 
-..  literalinclude:: reduce_min_pool_size.py
+..  literalinclude:: src/reduce_min_pool_size.py
 
-这样做可以极大的减少内存占用，并且可能会加速训练过程，详细文档参考 `这里
-<../ui/data_provider/pydataprovider2.html#provider>`_ 。
+这样做可以极大的减少内存占用，并且可能会加速训练过程，详细文档参考 `这里 <../ui/data_provider/pydataprovider2.html#provider>`_ 。
 
 神经元激活内存
 ++++++++++++++
@@ -76,7 +75,7 @@ PaddlePaddle支持非常多的优化算法(Optimizer)，不同的优化算法需
 使用 :code:`pydataprovider`时，可以减少缓存池的大小，同时设置内存缓存功能，即可以极大的加速数据载入流程。
 :code:`DataProvider` 缓存池的减小，和之前减小通过减小缓存池来减小内存占用的原理一致。
 
-..  literalinclude:: reduce_min_pool_size.py
+..  literalinclude:: src/reduce_min_pool_size.py
 
 同时 :code:`@provider` 接口有一个 :code:`cache` 参数来控制缓存方法，将其设置成 :code:`CacheType.CACHE_PASS_IN_MEM` 的话，会将第一个 :code:`pass` (过完所有训练数据即为一个pass)生成的数据缓存在内存里，在之后的 :code:`pass` 中，不会再从 :code:`python` 端读取数据，而是直接从内存的缓存里读取数据。这也会极大减少数据读入的耗时。
 
@@ -90,11 +89,11 @@ PaddlePaddle支持Sparse的训练，sparse训练需要训练特征是 :code:`spa
 
 使用一个词前两个词和后两个词，来预测这个中间的词。这个任务的DataProvider为\:
 
-..  literalinclude:: word2vec_dataprovider.py
+..  literalinclude:: src/word2vec_dataprovider.py
 
 这个任务的配置为\:
 
-..  literalinclude:: word2vec_config.py
+..  literalinclude:: src/word2vec_config.py
 
 更多关于sparse训练的内容请参考 `sparse训练的文档 <TBD>`_
 
@@ -114,7 +113,7 @@ PaddlePaddle支持Sparse的训练，sparse训练需要训练特征是 :code:`spa
   * 具体的多机训练方法参考  `多机训练文档 <../ui/data_provider/pydataprovider2.html#provider>`_ 。
 
 
-3. 遇到“非法指令”或者是“illegal instruction” 
+3. 遇到“非法指令”或者是“illegal instruction”
 --------------------------------------------
 
 PaddlePaddle使用avx SIMD指令提高cpu执行效率，因此错误的使用二进制发行版可能会导致这种错误，请选择正确的版本。
@@ -141,7 +140,7 @@ PaddlePaddle使用avx SIMD指令提高cpu执行效率，因此错误的使用二
 
 ..  code-block:: python
 
-    hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0), 
+    hidden = fc_layer(input=ipt, param_attr=ParamAttr(initial_max=1.0, initial_min=-1.0),
                       bias_attr=ParamAttr(initial_mean=1.0, initial_std=0.0))
 
 上述代码将bias全部初始化为1.0, 同时将参数初始化为 :code:`[1.0, -1.0]` 的均匀分布。
@@ -157,8 +156,8 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
 
 这里 :code:`hidden_a` 和 :code:`hidden_b` 使用了同样的parameter和bias。并且softmax层的两个输入也使用了同样的参数 :code:`softmax_param`。
 
-7. *-cp27mu-linux_x86_64.whl is not a supported wheel on this platform.
------------------------------------------------------------------------
+7. \*-cp27mu-linux_x86_64.whl is not a supported wheel on this platform.
+------------------------------------------------------------------------
 
 出现这个问题的主要原因是，系统编译wheel包的时候，使用的 :code:`wheel` 包是最新的，
 而系统中的 :code:`pip` 包比较老。具体的解决方法是，更新 :code:`pip` 包并重新编译PaddlePaddle。
@@ -191,14 +190,14 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
     41 - test_config_parser (Failed)
     42 - test_swig_api (Failed)
     43 - layers_test (Failed)
-    
+
 并且查询PaddlePaddle单元测试的日志，提示：
 
 ..  code-block:: bash
-    
+
     paddle package is already in your PYTHONPATH. But unittest need a clean environment.
     Please uninstall paddle package before start unittest. Try to 'pip uninstall paddle'.
-    
+
 解决办法是：
 
 * 卸载PaddlePaddle包 :code:`pip uninstall paddle`, 清理掉老旧的PaddlePaddle安装包，使得单元测试有一个干净的环境。如果PaddlePaddle包已经在python的site-packages里面，单元测试会引用site-packages里面的python包，而不是源码目录里 :code:`/python` 目录下的python包。同时，即便设置 :code:`PYTHONPATH` 到 :code:`/python` 也没用，因为python的搜索路径是优先已经安装的python包。
@@ -220,18 +219,18 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
 
 
 10. CMake源码编译, 找到的PythonLibs和PythonInterp版本不一致
-----------------------------------------------------------
+----------------------------------------------------------------
 
 这是目前CMake寻找Python的逻辑存在缺陷，如果系统安装了多个Python版本，CMake找到的Python库和Python解释器版本可能有不一致现象，导致编译PaddlePaddle失败。正确的解决方法是，
 用户强制指定特定的Python版本，具体操作如下：
 
     ..  code-block:: bash
-        
+
         cmake .. -DPYTHON_EXECUTABLE=<exc_path> -DPYTHON_LIBRARY=<lib_path>  -DPYTHON_INCLUDE_DIR=<inc_path>
 
 用户需要指定本机上Python的路径：``<exc_path>``, ``<lib_path>``, ``<inc_path>``
 
-10. A protocol message was rejected because it was too big
+10. A protocol message was rejected because it was too big
 ----------------------------------------------------------
 
 如果在训练NLP相关模型时，出现以下错误：
@@ -239,7 +238,7 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
 ..  code-block:: bash
 
     [libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes).  To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.
-    F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr) 
+    F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr)
 
 可能的原因是：传给dataprovider的某一个args过大，一般是由于直接传递大字典导致的。错误的define_py_data_sources2类似：
 
@@ -285,3 +284,22 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID，相同名字
 ..      code-block:: bash
 
         paddle train --use_gpu=true --trainer_count=2 --gpu_id=2
+
+12. 编译源码提示warp-ctc/include/ctc.h 找不到的情况
+---------------------------------------------------
+
+目前Paddle使用\ :code:`git submodule`\ 来引用一些第三方模块。简单的\
+:code:`git clone`\ 命令不能得到第三方模块的代码。需要使用\:
+
+..  code-block:: bash
+
+    git clone --recursive https://github.com/PaddlePaddle/Paddle.git
+
+来获取所有源码。对于已经clone的git版本库，可以在Paddle的源码目录中执行\:
+
+..  code-block:: bash
+
+    git submodule init
+    git submodule update
+
+来获得所有第三方模块。
\ No newline at end of file
diff --git a/doc_cn/faq/reduce_min_pool_size.py b/doc/faq/src/reduce_min_pool_size.py
similarity index 100%
rename from doc_cn/faq/reduce_min_pool_size.py
rename to doc/faq/src/reduce_min_pool_size.py
diff --git a/doc_cn/faq/word2vec_config.py b/doc/faq/src/word2vec_config.py
similarity index 100%
rename from doc_cn/faq/word2vec_config.py
rename to doc/faq/src/word2vec_config.py
diff --git a/doc_cn/faq/word2vec_dataprovider.py b/doc/faq/src/word2vec_dataprovider.py
similarity index 100%
rename from doc_cn/faq/word2vec_dataprovider.py
rename to doc/faq/src/word2vec_dataprovider.py
diff --git a/doc_cn/introduction/index.rst b/doc/getstarted/basic_usage/index_cn.rst
similarity index 80%
rename from doc_cn/introduction/index.rst
rename to doc/getstarted/basic_usage/index_cn.rst
index c996f5f4acd07011c98c3e1086080e85ed7dd1b4..d01cdaaeb75ec7d02480eb9162cabaad2a947db9 100644
--- a/doc_cn/introduction/index.rst
+++ b/doc/getstarted/basic_usage/index_cn.rst
@@ -1,16 +1,16 @@
-简介
-====
+经典的线性回归任务
+==================
 
 PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍将向你展示如何利用PaddlePaddle来解决一个经典的线性回归问题。
 
-1. 一个经典的任务
------------------
+任务简介
+--------
 
 我们展示如何用PaddlePaddle解决 `单变量的线性回归 <https://www.baidu.com/s?wd=单变量线性回归>`_ 问题。线性回归的输入是一批点 `(x, y)` ，其中 `y = wx + b + ε`， 而 ε 是一个符合高斯分布的随机变量。线性回归的输出是从这批点估计出来的参数 `w` 和 `b` 。
 
 一个例子是房产估值。我们假设房产的价格（y）是其大小（x）的一个线性函数，那么我们可以通过收集市场上房子的大小和价格，用来估计线性函数的参数w 和 b。
 
-2. 准备数据
+准备数据
 -----------
 
 假设变量 `x` 和 `y` 的真实关系为： `y = 2x + 0.3 + ε`，这里展示如何使用观测数据来拟合这一线性关系。首先，Python代码将随机产生2000个观测点，作为线性回归的输入。下面脚本符合PaddlePaddle期待的读取数据的Python程序的模式。
@@ -28,7 +28,7 @@ PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍
             x = random.random()
             yield [x], [2*x+0.3]
 
-3. 训练模型
+训练模型
 -----------
 
 为了还原 `y = 2x + 0.3`，我们先从一条随机的直线 `y' = wx + b` 开始，然后利用观测数据调整 `w` 和 `b` 使得 `y'` 和 `y` 的差距不断减小，最终趋于接近。这个过程就是模型的训练过程，而 `w` 和 `b` 就是模型的参数，即我们的训练目标。
@@ -58,6 +58,7 @@ PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍
     cost = regression_cost(input= ȳ, label=y)
     outputs(cost)
 
+
 这段简短的配置展示了PaddlePaddle的基本用法：
 
 - 第一部分定义了数据输入。一般情况下，PaddlePaddle先从一个文件列表里获得数据文件地址，然后交给用户自定义的函数（例如上面的 `process`函数）进行读入和预处理从而得到真实输入。本文中由于输入数据是随机生成的不需要读输入文件，所以放一个空列表（`empty.list`）即可。
@@ -65,10 +66,10 @@ PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍
 - 第二部分主要是选择学习算法，它定义了模型参数改变的规则。PaddlePaddle提供了很多优秀的学习算法，这里使用一个基于momentum的随机梯度下降(SGD)算法，该算法每批量(batch)读取12个采样数据进行随机梯度计算来更新更新。
 
 - 最后一部分是神经网络的配置。由于PaddlePaddle已经实现了丰富的网络层，所以很多时候你需要做的只是定义正确的网络层并把它们连接起来。这里使用了三种网络单元：
-	
-	- **数据层**：数据层 `data_layer` 是神经网络的入口，它读入数据并将它们传输到接下来的网络层。这里数据层有两个，分别对应于变量 `x` 和 `y`。
-	- **全连接层**：全连接层 `fc_layer` 是基础的计算单元，这里利用它建模变量之间的线性关系。计算单元是神经网络的核心，PaddlePaddle支持大量的计算单元和任意深度的网络连接，从而可以拟合任意的函数来学习复杂的数据关系。
-	- **回归误差代价层**：回归误差代价层 `regression_cost` 是众多误差代价函数层的一种，它们在训练过程作为网络的出口，用来计算模型的误差，是模型参数优化的目标函数。
+    
+    - **数据层**：数据层 `data_layer` 是神经网络的入口，它读入数据并将它们传输到接下来的网络层。这里数据层有两个，分别对应于变量 `x` 和 `y`。
+    - **全连接层**：全连接层 `fc_layer` 是基础的计算单元，这里利用它建模变量之间的线性关系。计算单元是神经网络的核心，PaddlePaddle支持大量的计算单元和任意深度的网络连接，从而可以拟合任意的函数来学习复杂的数据关系。
+    - **回归误差代价层**：回归误差代价层 `regression_cost` 是众多误差代价函数层的一种，它们在训练过程作为网络的出口，用来计算模型的误差，是模型参数优化的目标函数。
 
 定义了网络结构并保存为 `trainer_config.py` 之后，运行以下训练命令：
 
@@ -78,7 +79,7 @@ PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍
 
 PaddlePaddle将在观测数据集上迭代训练30轮，并将每轮的模型结果存放在 `./output` 路径下。从输出日志可以看到，随着轮数增加误差代价函数的输出在不断的减小，这意味着模型在训练数据上不断的改进，直到逼近真实解：` y = 2x + 0.3 `
 
-4. 模型检验
+模型检验
 -----------
 
 训练完成后，我们希望能够检验模型的好坏。一种常用的做法是用学习的模型对另外一组测试数据进行预测，评价预测的效果。在这个例子中，由于已经知道了真实答案，我们可以直接观察模型的参数是否符合预期来进行检验。
@@ -99,16 +100,9 @@ PaddlePaddle将每个模型参数作为一个numpy数组单独存为一个文件
     # w=1.999743, b=0.300137
 
 .. image:: ./parameters.png
-	 :align: center
-	 :scale: 80 %
+     :align: center
+     :scale: 80 %
 
 从图中可以看到，虽然 `w` 和 `b` 都使用随机值初始化，但在起初的几轮训练中它们都在快速逼近真实值，并且后续仍在不断改进，使得最终得到的模型几乎与真实模型一致。
 
 这样，我们用PaddlePaddle解决了单变量线性回归问题， 包括数据输入、模型训练和最后的结果验证。
-
-5. 推荐后续阅读
----------------
-
-- `安装/编译 <../build_and_install/index.html>`_ ：PaddlePaddle的安装与编译文档。
-- `快速入门 <../demo/quick_start/index.html>`_ ：使用商品评论分类任务，系统性的介绍如何一步步改进，最终得到产品级的深度模型。
-- `示例 <../demo/index.html>`_ ：各种实用案例，涵盖图像、文本、推荐等多个领域。
\ No newline at end of file
diff --git a/doc/getstarted/basic_usage/index_en.rst b/doc/getstarted/basic_usage/index_en.rst
index dca7a6b1f4f017b302148c611122806f112564a9..c10b897d4292d0c2b062b5c8e23466505afa408a 100644
--- a/doc/getstarted/basic_usage/index_en.rst
+++ b/doc/getstarted/basic_usage/index_en.rst
@@ -1,15 +1,15 @@
-Basic Usage
-=============
+Simple Linear Regression
+========================
 
 PaddlePaddle is a deep learning platform open-sourced by Baidu. With PaddlePaddle, you can easily train a classic neural network within a couple lines of configuration, or you can build sophisticated models that provide state-of-the-art performance on difficult learning tasks like sentiment analysis, machine translation, image caption and so on.
 
-1. A Classic Problem
----------------------
+Problem Background
+------------------
 
 Now, to give you a hint of what using PaddlePaddle looks like, let's start with a fundamental learning problem - `simple linear regression <https://en.wikipedia.org/wiki/Simple_linear_regression>`_: you have observed a set of two-dimensional data points of ``X`` and ``Y``, where ``X`` is an explanatory variable and ``Y`` is corresponding dependent variable, and you want to recover the underlying correlation between ``X`` and ``Y``. Linear regression can be used in many practical scenarios. For example, ``X`` can be a variable about house size, and ``Y`` a variable about house price. You can build a model that captures relationship between them by observing real estate markets.
 
-2. Prepare the Data
---------------------
+Prepare the Data
+-----------------
 
 Suppose the true relationship can be characterized as ``Y = 2X + 0.3``, let's see how to recover this pattern only from observed data. Here is a piece of python code that feeds synthetic data to PaddlePaddle. The code is pretty self-explanatory, the only extra thing you need to add for PaddlePaddle is a definition of input data types.
 
@@ -26,8 +26,8 @@ Suppose the true relationship can be characterized as ``Y = 2X + 0.3``, let's se
                 x = random.random()
                 yield [x], [2*x+0.3]
 
-3. Train a NeuralNetwork
--------------------------
+Train a NeuralNetwork
+----------------------
 
 To recover this relationship between ``X`` and ``Y``, we use a neural network with one layer of linear activation units and a square error cost layer. Don't worry if you are not familiar with these terminologies, it's just saying that we are starting from a random line ``Y' = wX + b`` , then we gradually adapt ``w`` and ``b`` to minimize the difference between ``Y'`` and ``Y``. Here is what it looks like in PaddlePaddle:
 
@@ -73,8 +73,8 @@ Now that everything is ready, you can train the network with a simple command li
 This means that PaddlePaddle will train this network on the synthectic dataset for 30 passes, and save all the models under path ``./output``. You will see from the messages printed out during training phase that the model cost is decreasing as time goes by, which indicates we are getting a closer guess.
 
 
-4. Evaluate the Model
------------------------
+Evaluate the Model
+-------------------
 
 Usually, a different dataset that left out during training phase should be used to evalute the models. However, we are lucky enough to know the real answer: ``w=2, b=0.3``, thus a better option is to check out model parameters directly.
 
@@ -99,11 +99,3 @@ In PaddlePaddle, training is just to get a collection of model parameters, which
 Although starts from a random guess, you can see that value of ``w`` changes quickly towards 2 and ``b`` changes quickly towards 0.3. In the end, the predicted line is almost identical with real answer.
 
 There, you have recovered the underlying pattern between ``X`` and ``Y`` only from observed data.
-
-
-5. Where to Go from Here
--------------------------
-
-- `Install and Build <../build_and_install/index.html>`_
-- `Tutorials <../demo/quick_start/index_en.html>`_
-- `Example and Demo <../demo/index.html>`_
diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md
index 150d7fc43720314462ac5c5b72f6a93b18e6d735..aaa07d49d3148266db27670a98c2b27db4dc0a8f 100644
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ b/doc/getstarted/build_and_install/build_from_source_en.md
@@ -14,6 +14,13 @@ cd paddle
 git submodule update --init --recursive
 ```
 
+If you already have a local PaddlePaddle repo and have not initialized the submodule, your local submodule folder will be empty. You can simply run the last line of the above codes in your PaddlePaddle home directory to initialize your submodule folder.
+
+If you have already initialized your submodule and you would like to sync with the upstream submodule repo, you can run the following command
+```
+git submodule update --remote
+```
+
 ## <span id="requirements">Requirements</span>
 
 To compile the source code, your computer must be equipped with the following dependencies.
@@ -42,10 +49,8 @@ PaddlePaddle supports some build options. To enable it, first you need to instal
 <tbody>
 <tr><td class="left">WITH_GPU</td><td class="left">Compile with GPU mode.</td></tr>
 <tr><td class="left">WITH_DOUBLE</td><td class="left">Compile with double precision floating-point, default: single precision.</td></tr>
-<tr><td class="left">WITH_GLOG</td><td class="left">Compile with glog. If not found, default: an internal log implementation.</td></tr>
-<tr><td class="left">WITH_GFLAGS</td><td class="left">Compile with gflags. If not found, default: an internal flag implementation.</td></tr>
 <tr><td class="left">WITH_TESTING</td><td class="left">Compile with gtest for PaddlePaddle's unit testing.</td></tr>
-<tr><td class="left">WITH_DOC</td><td class="left">	Compile to generate PaddlePaddle's docs, default: disabled (OFF).</td></tr>
+<tr><td class="left">WITH_DOC</td><td class="left">    Compile to generate PaddlePaddle's docs, default: disabled (OFF).</td></tr>
 <tr><td class="left">WITH_SWIG_PY</td><td class="left">Compile with python predict API, default: disabled (OFF).</td></tr>
 <tr><td class="left">WITH_STYLE_CHECK</td><td class="left">Compile with code style check, default: enabled (ON).</td></tr>
 </tbody>
@@ -79,7 +84,7 @@ As a simple example, consider the following:
 
     ```bash
     pip install 'sphinx>=1.4.0'
-    pip install sphinx_rtd_theme breathe recommonmark
+    pip install sphinx_rtd_theme recommonmark
 
     # install doxygen on Ubuntu
     sudo apt-get install doxygen 
diff --git a/doc_cn/build_and_install/cmake/compile_options.rst b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
similarity index 94%
rename from doc_cn/build_and_install/cmake/compile_options.rst
rename to doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
index f345ead2bf851bdad7be2fb8185d16fd2a318a66..3a52c8723bbccd70dd89e8913092d92813925f90 100644
--- a/doc_cn/build_and_install/cmake/compile_options.rst
+++ b/doc/getstarted/build_and_install/cmake/build_from_source_cn.rst
@@ -1,43 +1,43 @@
-PaddlePaddle的编译选项
-======================
-
-PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种BLAS库等。用户可在调用cmake的时候设置它们，详细的cmake使用方法可以参考 `官方文档 <https://cmake.org/cmake-tutorial>`_ 。
-
-Bool型的编译选项
-----------------
-用户可在cmake的命令行中，通过使用 ``-D`` 命令设置该类编译选项，例如
-
-..  code-block:: bash
-
-    cmake .. -DWITH_GPU=OFF
-
-..  csv-table:: Bool型的编译选项
-    :widths: 1, 7, 2
-    :file: compile_options.csv
-
-BLAS/CUDA/Cudnn的编译选项
---------------------------
-BLAS
-+++++
-
-PaddlePaddle支持以下任意一种BLAS库：`MKL <https://software.intel.com/en-us/intel-mkl>`_ ，`ATLAS <http://math-atlas.sourceforge.net/>`_ ，`OpenBlAS <http://www.openblas.net/>`_ 和 `REFERENCE BLAS <http://www.netlib.org/blas/>`_ 。
-
-..  csv-table:: BLAS路径相关的编译选项
-    :widths: 1, 2, 7
-    :file: cblas_settings.csv
-
-CUDA/Cudnn
-+++++++++++
-
-PaddlePaddle可以使用cudnn v2之后的任何一个版本来编译运行，但尽量请保持编译和运行使用的cudnn是同一个版本。 我们推荐使用最新版本的cudnn v5.1。
-
-编译选项的设置
-++++++++++++++
-
-PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。cmake编译时，首先在系统路径(/usr/lib\:/usr/local/lib)中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如 
-
-..  code-block:: bash
-
-    cmake .. -DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5
-
+PaddlePaddle的编译选项
+======================
+
+PaddlePaddle的编译选项，包括生成CPU/GPU二进制文件、链接何种BLAS库等。用户可在调用cmake的时候设置它们，详细的cmake使用方法可以参考 `官方文档 <https://cmake.org/cmake-tutorial>`_ 。
+
+Bool型的编译选项
+----------------
+用户可在cmake的命令行中，通过使用 ``-D`` 命令设置该类编译选项，例如
+
+..  code-block:: bash
+
+    cmake .. -DWITH_GPU=OFF
+
+..  csv-table:: Bool型的编译选项
+    :widths: 1, 7, 2
+    :file: compile_options.csv
+
+BLAS/CUDA/Cudnn的编译选项
+--------------------------
+BLAS
++++++
+
+PaddlePaddle支持以下任意一种BLAS库：`MKL <https://software.intel.com/en-us/intel-mkl>`_ ，`ATLAS <http://math-atlas.sourceforge.net/>`_ ，`OpenBlAS <http://www.openblas.net/>`_ 和 `REFERENCE BLAS <http://www.netlib.org/blas/>`_ 。
+
+..  csv-table:: BLAS路径相关的编译选项
+    :widths: 1, 2, 7
+    :file: cblas_settings.csv
+
+CUDA/Cudnn
++++++++++++
+
+PaddlePaddle可以使用cudnn v2之后的任何一个版本来编译运行，但尽量请保持编译和运行使用的cudnn是同一个版本。 我们推荐使用最新版本的cudnn v5.1。
+
+编译选项的设置
+++++++++++++++
+
+PaddePaddle通过编译时指定路径来实现引用各种BLAS/CUDA/Cudnn库。cmake编译时，首先在系统路径(/usr/lib\:/usr/local/lib)中搜索这几个库，同时也会读取相关路径变量来进行搜索。 通过使用 ``-D`` 命令可以设置，例如 
+
+..  code-block:: bash
+
+    cmake .. -DMKL_ROOT=/opt/mkl/ -DCUDNN_ROOT=/opt/cudnnv5
+
 注意：这几个编译选项的设置，只在第一次cmake的时候有效。如果之后想要重新设置，推荐清理整个编译目录（``rm -rf``）后，再指定。
\ No newline at end of file
diff --git a/doc_cn/build_and_install/cmake/cblas_settings.csv b/doc/getstarted/build_and_install/cmake/cblas_settings.csv
similarity index 100%
rename from doc_cn/build_and_install/cmake/cblas_settings.csv
rename to doc/getstarted/build_and_install/cmake/cblas_settings.csv
diff --git a/doc_cn/build_and_install/cmake/compile_options.csv b/doc/getstarted/build_and_install/cmake/compile_options.csv
similarity index 65%
rename from doc_cn/build_and_install/cmake/compile_options.csv
rename to doc/getstarted/build_and_install/cmake/compile_options.csv
index 12b45eebb2822d77447fa1bc754360605971dcab..463b825470579d0c3736a408b1e82dd33e6f8d42 100644
--- a/doc_cn/build_and_install/cmake/compile_options.csv
+++ b/doc/getstarted/build_and_install/cmake/compile_options.csv
@@ -1,14 +1,12 @@
-选项,说明,默认值
-WITH_GPU,是否支持GPU。,取决于是否寻找到CUDA工具链
-WITH_DOUBLE,是否使用双精度浮点数。,否
-WITH_DSO,是否运行时动态加载CUDA动态库，而非静态加载CUDA动态库。,是
-WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制文件,是
-WITH_PYTHON,是否内嵌PYTHON解释器。方便今后的嵌入式移植工作。,是
-WITH_STYLE_CHECK,是否编译时进行代码风格检查,是
-WITH_RDMA,是否开启RDMA,否
-WITH_GLOG,是否开启GLOG。如果不开启，则会使用一个简化版的日志，同时方便今后的嵌入式移植工作。,取决于是否寻找到GLOG
-WITH_GFLAGS,是否使用GFLAGS。如果不开启，则会使用一个简化版的命令行参数解析器，同时方便今后的嵌入式移植工作。,取决于是否寻找到GFLAGS
-WITH_TIMER,是否开启计时功能。如果开启会导致运行略慢，打印的日志变多，但是方便调试和测Benchmark,否
-WITH_TESTING,是否开启单元测试,取决于是否寻找到GTEST
-WITH_DOC,是否编译中英文文档,否
+选项,说明,默认值
+WITH_GPU,是否支持GPU。,取决于是否寻找到CUDA工具链
+WITH_DOUBLE,是否使用双精度浮点数。,否
+WITH_DSO,是否运行时动态加载CUDA动态库，而非静态加载CUDA动态库。,是
+WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制文件,是
+WITH_PYTHON,是否内嵌PYTHON解释器。方便今后的嵌入式移植工作。,是
+WITH_STYLE_CHECK,是否编译时进行代码风格检查,是
+WITH_RDMA,是否开启RDMA,否
+WITH_TIMER,是否开启计时功能。如果开启会导致运行略慢，打印的日志变多，但是方便调试和测Benchmark,否
+WITH_TESTING,是否开启单元测试,取决于是否寻找到GTEST
+WITH_DOC,是否编译中英文文档,否
 WITH_SWIG_PY,是否编译PYTHON的SWIG接口，该接口可用于预测和定制化训练,取决于是否寻找到SWIG
\ No newline at end of file
diff --git a/doc_cn/build_and_install/install/docker_install.rst b/doc/getstarted/build_and_install/docker_install_cn.rst
similarity index 93%
rename from doc_cn/build_and_install/install/docker_install.rst
rename to doc/getstarted/build_and_install/docker_install_cn.rst
index 40339659be406ec72da8ad89b6d5dd38d72bb5ae..35234e0eb3ece3cb20d62841c1d75e60b485b9ea 100644
--- a/doc_cn/build_and_install/install/docker_install.rst
+++ b/doc/getstarted/build_and_install/docker_install_cn.rst
@@ -111,7 +111,24 @@ cuda相关的Driver和设备映射进container中，脚本类似于
 
 简单的含有ssh的Dockerfile如下：
 
-..  literalinclude:: paddle_ssh.Dockerfile
+..  code-block:: bash
+
+    FROM paddledev/paddle:cpu-latest
+
+    MAINTAINER PaddlePaddle dev team <paddle-dev@baidu.com>
+
+    RUN apt-get update
+    RUN apt-get install -y openssh-server
+    RUN mkdir /var/run/sshd
+    RUN echo 'root:root' | chpasswd
+
+    RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
+    RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
+
+    EXPOSE 22
+
+    CMD    ["/usr/sbin/sshd", "-D"]
+
 
 使用该Dockerfile构建出镜像，然后运行这个container即可。相关命令为\:
 
diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst
index 1ab6fc6a728f68b16d798a577da2896481eb17d1..7633bf4d576ee6a3e75c2c493eb248f5d2636628 100644
--- a/doc/getstarted/build_and_install/docker_install_en.rst
+++ b/doc/getstarted/build_and_install/docker_install_en.rst
@@ -9,6 +9,91 @@ Please be aware that you will need to change `Dockers settings
 of your hardware resource on Mac OS X and Windows.
 
 
+Development Using Docker
+------------------------
+
+Developers can work on PaddlePaddle using Docker.  This allows
+developers to work on different platforms -- Linux, Mac OS X, and
+Windows -- in a consistent way.
+
+The general development workflow with Docker and Bazel is as follows:
+
+1. Get the source code of Paddle:
+
+   .. code-block:: bash
+
+      git clone --recursive https://github.com/PaddlePaddle/Paddle.git
+
+   
+   Here **git clone --recursive is required** as we have a submodule `warp-ctc <https://github.com/baidu-research/warp-ctc>`_.
+
+   If you have used :code:`git clone https://github.com/PaddlePaddle/Paddle` and find that the directory :code:`warp-ctc` is
+   empty, please use the following command to get the submodule.
+
+   .. code-block:: bash
+
+      git submodule update --init --recursive
+
+
+2. Build a development Docker image :code:`paddle:dev` from the source
+   code.  This image contains all the development tools and
+   dependencies of PaddlePaddle.
+
+
+   .. code-block:: bash
+
+      cd paddle
+      docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile .
+
+
+3. Run the image as a container and mounting local source code
+   directory into the container.  This allows us to change the code on
+   the host and build it within the container.
+
+   .. code-block:: bash
+
+      docker run       \
+       -d              \
+       --name paddle   \
+       -p 2022:22      \
+       -v $PWD:/paddle \
+       -v $HOME/.cache/bazel:/root/.cache/bazel \
+       paddle:dev
+
+   where :code:`-d` makes the container running in background,
+   :code:`--name paddle` allows us to run a nginx container to serve
+   documents in this container, :code:`-p 2022:22` allows us to SSH
+   into this container, :code:`-v $PWD:/paddle` shares the source code
+   on the host with the container, :code:`-v
+   $HOME/.cache/bazel:/root/.cache/bazel` shares Bazel cache on the
+   host with the container.
+
+4. SSH into the container:
+
+   .. code-block:: bash
+
+      ssh root@localhost -p 2022
+
+5. We can edit the source code in the container or on this host.  Then
+   we can build using cmake
+
+   .. code-block:: bash
+
+      cd /paddle # where paddle source code has been mounted into the container
+      mkdir -p build
+      cd build
+      cmake -DWITH_TESTING=ON ..
+      make -j `nproc`
+      CTEST_OUTPUT_ON_FAILURE=1 ctest
+
+   or Bazel in the container:
+
+   .. code-block:: bash
+
+      cd /paddle
+      bazel test ...
+
+
 CPU-only and GPU Images
 -----------------------
 
@@ -17,7 +102,7 @@ CPU-only one and a CUDA GPU one.  We do so by configuring
 `dockerhub.com <https://hub.docker.com/r/paddledev/paddle/>`_
 automatically runs the following commands:
 
-.. code-block:: base
+.. code-block:: bash
 
    docker build -t paddle:cpu -f paddle/scripts/docker/Dockerfile .
    docker build -t paddle:gpu -f paddle/scripts/docker/Dockerfile.gpu .
@@ -77,7 +162,7 @@ source code:
 .. code-block:: bash
 
    cd ~
-   git clone github.com/PaddlePaddle/Paddle
+   git clone https://github.com/PaddlePaddle/Paddle.git
    cd Paddle
    git submodule update --init --recursive
    docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
@@ -93,7 +178,7 @@ generated using `woboq code browser
 for users to browse and understand the C++ source code.
 
 As long as we give the Paddle Docker container a name, we can run an
-additional nginx Docker container to serve the volume from the Paddle
+additional Nginx Docker container to serve the volume from the Paddle
 container:
 
 .. code-block:: bash
diff --git a/doc_cn/build_and_install/index.rst b/doc/getstarted/build_and_install/index_cn.rst
similarity index 58%
rename from doc_cn/build_and_install/index.rst
rename to doc/getstarted/build_and_install/index_cn.rst
index 48163fb36e561fe5fd8f6907379687a8b5c97f68..3ffa8585041d3023161c2cada8a3dc149f740ba0 100644
--- a/doc_cn/build_and_install/index.rst
+++ b/doc/getstarted/build_and_install/index_cn.rst
@@ -1,5 +1,5 @@
 编译与安装
-========================
+==========
 
 安装
 ++++
@@ -9,8 +9,8 @@ PaddlePaddle提供数个预编译的二进制来进行安装，包括Docker镜
 .. toctree::
    :maxdepth: 1
    
-   install/docker_install.rst 
-   install/ubuntu_install.rst
+   docker_install_cn.rst 
+   ubuntu_install_cn.rst
 
 
 
@@ -19,9 +19,9 @@ PaddlePaddle提供数个预编译的二进制来进行安装，包括Docker镜
 
 ..  warning::
 
-	编译选项主要推荐高级用户查看，普通用户请走安装流程。
+    编译选项主要推荐高级用户查看，普通用户请走安装流程。
 
-.. toctree::
-   :maxdepth: 1
+..  toctree::
+    :maxdepth: 1
 
-   cmake/index.rst
+    cmake/build_from_source_cn.rst
diff --git a/doc_cn/build_and_install/install/ubuntu_install.rst b/doc/getstarted/build_and_install/ubuntu_install_cn.rst
similarity index 71%
rename from doc_cn/build_and_install/install/ubuntu_install.rst
rename to doc/getstarted/build_and_install/ubuntu_install_cn.rst
index 4500d6e0b03be9280e3e6c25cddbf7fb389671b8..d02d9c63bbfb50954d7b75f2c685ce167a3b7146 100644
--- a/doc_cn/build_and_install/install/ubuntu_install.rst
+++ b/doc/getstarted/build_and_install/ubuntu_install_cn.rst
@@ -38,7 +38,18 @@ PaddlePaddle提供了ubuntu 14.04 deb安装包。
 
 安装完成后，可以使用命令 :code:`paddle version` 查看安装后的paddle 版本:
 
-..  literalinclude:: paddle_version.txt
+..  code-block:: shell
+
+    PaddlePaddle 0.8.0b1, compiled with
+        with_avx: ON
+        with_gpu: OFF
+        with_double: OFF
+        with_python: ON
+        with_rdma: OFF
+        with_metric_learning:
+        with_timer: OFF
+        with_predict_sdk:
+
 
 可能遇到的问题
 --------------
@@ -48,9 +59,9 @@ libcudart.so/libcudnn.so找不到
 
 安装完成后，运行 :code:`paddle train` 报错\:
 
-.. 	code-block:: shell
+..  code-block:: shell
 
-	  0831 12:36:04.151525  1085 hl_dso_loader.cc:70] Check failed: nullptr != *dso_handle For Gpu version of PaddlePaddle, it couldn't find CUDA library: libcudart.so Please make sure you already specify its path.Note: for training data on Cpu using Gpu version of PaddlePaddle,you must specify libcudart.so via LD_LIBRARY_PATH.
+      0831 12:36:04.151525  1085 hl_dso_loader.cc:70] Check failed: nullptr != *dso_handle For Gpu version of PaddlePaddle, it couldn't find CUDA library: libcudart.so Please make sure you already specify its path.Note: for training data on Cpu using Gpu version of PaddlePaddle,you must specify libcudart.so via LD_LIBRARY_PATH.
 
 原因是未设置cuda运行时环境变量。 如果使用GPU版本的PaddlePaddle，请安装CUDA 7.5 和CUDNN 5到本地环境中，并设置：
 
diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c6a4d3121c5857cd434acecb389d68f4d4c7a532
--- /dev/null
+++ b/doc/getstarted/index_cn.rst
@@ -0,0 +1,8 @@
+新手入门
+============
+
+..  toctree::
+  :maxdepth: 2
+
+  build_and_install/index_cn.rst
+  basic_usage/index_cn.rst
diff --git a/doc/howto/deep_model/index_en.rst b/doc/howto/deep_model/index_en.rst
deleted file mode 100644
index 00a45641e6ad60a944c4334503e117cab1624896..0000000000000000000000000000000000000000
--- a/doc/howto/deep_model/index_en.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-How to Configure Deep Models
-============================
-
-..  toctree::
-  :maxdepth: 1
-
-  rnn/rnn_en.rst
diff --git a/doc_cn/algorithm/rnn/hierarchical-layer.rst b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
similarity index 100%
rename from doc_cn/algorithm/rnn/hierarchical-layer.rst
rename to doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
diff --git a/doc_cn/algorithm/rnn/hrnn_rnn_api_compare.rst b/doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst
similarity index 91%
rename from doc_cn/algorithm/rnn/hrnn_rnn_api_compare.rst
rename to doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst
index 9baa0b578041ab82331a94c2a9e4d081697a5fda..96e52b910a22576fd75c9d4e1bef6e2cf74bc84f 100644
--- a/doc_cn/algorithm/rnn/hrnn_rnn_api_compare.rst
+++ b/doc/howto/deep_model/rnn/hrnn_rnn_api_compare_cn.rst
@@ -24,18 +24,18 @@
 
 - 本例中的原始数据一共有10个样本。每个样本由两部分组成，一个label（此处都为2）和一个已经分词后的句子。这个数据也被单层RNN网络直接使用。
 
-..  literalinclude:: ../../../paddle/gserver/tests/Sequence/tour_train_wdseg
+..  literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg
     :language: text
 
 
 - 双层序列数据一共有4个样本。 每个样本间用空行分开，整体数据和原始数据完全一样。但于双层序列的LSTM来说，第一个样本同时encode两条数据成两个向量。这四条数据同时处理的句子数量为\ :code:`[2, 3, 2, 3]`\ 。
 
-..  literalinclude:: ../../../paddle/gserver/tests/Sequence/tour_train_wdseg.nest
+..  literalinclude:: ../../../../paddle/gserver/tests/Sequence/tour_train_wdseg.nest
     :language: text
 
 其次，对于两种不同的输入数据类型，不同DataProvider对比如下(`sequenceGen.py <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/sequenceGen.py>`_)\：
 
-..  literalinclude:: ../../../paddle/gserver/tests/sequenceGen.py
+..  literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py
     :language: python
     :lines: 21-39
     :linenos:
@@ -43,10 +43,11 @@
 - 这是普通的单层时间序列的DataProvider代码，其说明如下：
   
   * DataProvider共返回两个数据，分别是words和label。即上述代码中的第19行。
-  - words是原始数据中的每一句话，所对应的词表index数组。它是integer_value_sequence类型的，即整数数组。words即为这个数据中的单层时间序列。
-  - label是原始数据中对于每一句话的分类标签，它是integer_value类型的。
 
-..  literalinclude:: ../../../paddle/gserver/tests/sequenceGen.py
+    - words是原始数据中的每一句话，所对应的词表index数组。它是integer_value_sequence类型的，即整数数组。words即为这个数据中的单层时间序列。
+    - label是原始数据中对于每一句话的分类标签，它是integer_value类型的。
+
+..  literalinclude:: ../../../../paddle/gserver/tests/sequenceGen.py
     :language: python
     :lines: 42-71
     :linenos:
@@ -63,7 +64,7 @@
 
 首先，我们看一下单层RNN的配置。代码中9-15行(高亮部分)即为单层RNN序列的使用代码。这里使用了PaddlePaddle预定义好的RNN处理函数。在这个函数中，RNN对于每一个时间步通过了一个LSTM网络。
 
-..  literalinclude:: ../../../paddle/gserver/tests/sequence_layer_group.conf
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_layer_group.conf
     :language: python
     :lines: 38-63
     :linenos:
@@ -84,7 +85,7 @@
 
 * 至此，\ :code:`lstm_last`\ 便和单层RNN配置中的\ :code:`lstm_last`\ 具有相同的结果了。
 
-..  literalinclude:: ../../../paddle/gserver/tests/sequence_nest_layer_group.conf
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_layer_group.conf
     :language: python
     :lines: 38-64
     :linenos:
@@ -106,7 +107,7 @@
 
 - 单层RNN：过了一个很简单的recurrent_group。每一个时间步，当前的输入y和上一个时间步的输出rnn_state做了一个全链接。
 
-..  literalinclude:: ../../../paddle/gserver/tests/sequence_rnn.conf
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn.conf
     :language: python
     :lines: 36-48
 
@@ -115,7 +116,7 @@
   - 内层inner_step的recurrent_group和单层序列的几乎一样。除了boot_layer=outer_mem，表示将外层的outer_mem作为内层memory的初始状态。外层outer_step中，outer_mem是一个子句的最后一个向量，即整个双层group是将前一个子句的最后一个向量，作为下一个子句memory的初始状态。
   - 从输入数据上看，单双层序列的句子是一样的，只是双层序列将其又做了子序列划分。因此双层序列的配置中，必须将前一个子句的最后一个元素，作为boot_layer传给下一个子句的memory，才能保证和单层序列的配置中“每个时间步都用了上一个时间步的输出结果”一致。
 
-..  literalinclude:: ../../../paddle/gserver/tests/sequence_nest_rnn.conf
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn.conf
     :language: python
     :lines: 39-66
 
@@ -151,14 +152,14 @@
 
 * 单层RNN\:
 
-..  literalinclude:: ../../../paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_rnn_multi_unequalength_inputs.py
     :language: python
     :lines: 42-59
     :linenos:
 
 * 双层RNN\ \:
 
-..  literalinclude:: ../../../paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
+..  literalinclude:: ../../../../paddle/gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py
     :language: python
     :lines: 41-80
     :linenos:
@@ -181,11 +182,11 @@ Memory
 
 Memory是PaddlePaddle实现RNN时候使用的一个概念。RNN即时间递归神经网络，通常要求时间步之间具有一些依赖性，即当前时间步下的神经网络依赖前一个时间步神经网络中某一个神经元输出。如下图所示。
 
-..  graphviz:: glossary_rnn.dot
+..  graphviz:: src/glossary_rnn.dot
 
 上图中虚线的连接，即是跨越时间步的网络连接。PaddlePaddle在实现RNN的时候，将这种跨越时间步的连接用一个特殊的神经网络单元实现。这个神经网络单元就叫Memory。Memory可以缓存上一个时刻某一个神经元的输出，然后在下一个时间步输入给另一个神经元。使用Memory的RNN实现便如下图所示。
 
-..  graphviz:: glossary_rnn_with_memory.dot
+..  graphviz:: src/glossary_rnn_with_memory.dot
 
 使用这种方式，PaddlePaddle可以比较简单的判断哪些输出是应该跨越时间步的，哪些不是。
 
diff --git a/doc/howto/deep_model/rnn/index_cn.rst b/doc/howto/deep_model/rnn/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9e805ca85191b793c8798a239927a318c70b96f5
--- /dev/null
+++ b/doc/howto/deep_model/rnn/index_cn.rst
@@ -0,0 +1,9 @@
+RNN相关模型
+===========
+
+..  toctree::
+  :maxdepth: 1
+
+  recurrent_group_cn.md
+  hierarchical_layer_cn.rst
+  hrnn_rnn_api_compare_cn.rst
diff --git a/doc/howto/deep_model/rnn/index_en.rst b/doc/howto/deep_model/rnn/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..7adc79873d699fdfd5a85034bcef964dd1f19132
--- /dev/null
+++ b/doc/howto/deep_model/rnn/index_en.rst
@@ -0,0 +1,7 @@
+RNN Models
+==========
+
+..  toctree::
+  :maxdepth: 1
+
+  rnn_config_en.rst
diff --git a/doc_cn/algorithm/rnn/rnn-tutorial.md b/doc/howto/deep_model/rnn/recurrent_group_cn.md
similarity index 98%
rename from doc_cn/algorithm/rnn/rnn-tutorial.md
rename to doc/howto/deep_model/rnn/recurrent_group_cn.md
index 9e488b0d51956e86f9fb76f450fdb438f596e239..984fdcc505cdd073d0265c496cda5fb3553c22e4 100644
--- a/doc_cn/algorithm/rnn/rnn-tutorial.md
+++ b/doc/howto/deep_model/rnn/recurrent_group_cn.md
@@ -1,96 +1,96 @@
-# Recurrent Group教程
-
-## 概述
-
-序列数据是自然语言处理任务面对的一种主要输入数据类型。
-
-一句话是由词语构成的序列，多句话进一步构成了段落。因此，段落可以看作是一个嵌套的双层的序列，这个序列的每个元素又是一个序列。
-
-双层序列是PaddlePaddle支持的一种非常灵活的数据组织方式，帮助我们更好地描述段落、多轮对话等更为复杂的语言数据。基于双层序列输入，我们可以设计搭建一个灵活的、层次化的RNN，分别从词语和句子级别编码输入数据，同时也能够引入更加复杂的记忆机制，更好地完成一些复杂的语言理解任务。
-
-在PaddlePaddle中，`recurrent_group`是一种任意复杂的RNN单元，用户只需定义RNN在一个时间步内完成的计算，PaddlePaddle负责完成信息和误差在时间序列上的传播。
-
-更进一步，`recurrent_group`同样可以扩展到双层序列的处理上。通过两个嵌套的`recurrent_group`分别定义子句级别和词语级别上需要完成的运算，最终实现一个层次化的复杂RNN。
-
-目前，在PaddlePaddle中，能够对双向序列进行处理的有`recurrent_group`和部分Layer，具体可参考文档：<a href = "hierarchical-layer.html">支持双层序列作为输入的Layer</a>。
- 
-## 相关概念
-
-### 基本原理
-`recurrent_group` 是PaddlePaddle支持的一种任意复杂的RNN单元。使用者只需要关注于设计RNN在一个时间步之内完成的计算，PaddlePaddle负责完成信息和梯度在时间序列上的传播。
-
-PaddlePaddle中，`recurrent_group`的一个简单调用如下：
-
-``` python
-recurrent_group(step, input, reverse)
-```
-- step：一个可调用的函数，定义一个时间步之内RNN单元完成的计算
-- input：输入，必须是一个单层序列，或者一个双层序列
-- reverse：是否以逆序处理输入序列
- 
-使用`recurrent_group`的核心是设计step函数的计算逻辑。step函数内部可以自由组合PaddlePaddle支持的各种layer，完成任意的运算逻辑。`recurrent_group` 的输入（即input）会成为step函数的输入，由于step 函数只关注于RNN一个时间步之内的计算，在这里`recurrent_group`替我们完成了原始输入数据的拆分。
-
-### 输入
-`recurrent_group`处理的输入序列主要分为以下三种类型：
- 
-- **数据输入**：一个双层序列进入`recurrent_group`会被拆解为一个单层序列，一个单层序列进入`recurrent_group`会被拆解为非序列，然后交给step函数，这一过程对用户是完全透明的。可以有以下两种：1）通过data_layer拿到的用户输入；2）其它layer的输出。
-		
-- **只读Memory输入**：`StaticInput` 定义了一个只读的Memory，由`StaticInput`指定的输入不会被`recurrent_group`拆解，`recurrent_group` 循环展开的每个时间步总是能够引用所有输入，可以是一个非序列，或者一个单层序列。
-	  
-- **序列生成任务的输入**：`GeneratedInput`只用于在序列生成任务中指定输入数据。
-
-### 输入示例
-
-序列生成任务大多遵循encoder-decoer架构，encoder和decoder可以是能够处理序列的任意神经网络单元，而RNN是最流行的选择。
-
-给定encoder输出和当前词，decoder每次预测产生下一个最可能的词语。在这种结构中，decoder接受两个输入：
-    
-- 要生成的目标序列：是decoder的数据输入，也是decoder循环展开的依据，`recurrent_group`会对这类输入进行拆解。
-
-- encoder输出，可以是一个非序列，或者一个单层序列：是一个unbounded memory，decoder循环展开的每一个时间步会引用全部结果，不应该被拆解，这种类型的输入必须通过`StaticInput`指定。关于Unbounded Memory的更多讨论请参考论文 [Neural Turning Machine](https://arxiv.org/abs/1410.5401)。
-		
-在序列生成任务中，decoder RNN总是引用上一时刻预测出的词的词向量，作为当前时刻输入。`GeneratedInput`自动完成这一过程。
-		 
-### 输出
-`step`函数必须返回一个或多个Layer的输出，这个Layer的输出会作为整个`recurrent_group` 最终的输出结果。在输出的过程中，`recurrent_group` 会将每个时间步的输出拼接，这个过程对用户也是透明的。
-
-### memory
-memory只能在`recurrent_group`中定义和使用。memory不能独立存在，必须指向一个PaddlePaddle定义的Layer。引用memory得到这layer上一时刻输出，因此，可以将memory理解为一个时延操作。
-
-可以显示地指定一个layer的输出用于初始化memory。不指定时，memory默认初始化为0。
-
-## 双层RNN介绍
-`recurrent_group`帮助我们完成对输入序列的拆分，对输出的合并，以及计算逻辑在序列上的循环展开。
-
-利用这种特性，两个嵌套的`recurrent_group`能够处理双层序列，实现词语和句子两个级别的双层RNN结构。
-
-- 单层（word-level）RNN：每个状态（state）对应一个词（word）。
-- 双层（sequence-level）RNN：一个双层RNN由多个单层RNN组成，每个单层RNN（即双层RNN的每个状态）对应一个子句（subseq）。
-
-为了描述方便，下文以NLP任务为例，将含有子句（subseq）的段落定义为一个双层序列，将含有词语的句子定义为一个单层序列，那么0层序列即为一个词语。
-
-## 双层RNN的使用
-
-### 训练流程的使用方法
-使用 `recurrent_group`需要遵循以下约定：
- 
-- **单进单出**：输入和输出都是单层序列。
-  - 如果有多个输入，不同输入序列含有的词语数必须严格相等。
-  - 输出一个单层序列，输出序列的词语数和输入序列一致。
-  - memory：在step函数中定义 memory指向一个layer，通过引用memory得到这个layer上一个时刻输出，形成recurrent 连接。memory的is_seq参数必须为false。如果没有定义memory，每个时间步之内的运算是独立的。
-  - boot_layer：memory的初始状态，默认初始状为0，memory的is_seq参数必须为false。
- 
-- **双进双出**：输入和输出都是双层序列。
-  - 如果有多个输入序列，不同输入含有的子句（subseq）数必须严格相等，但子句含有的词语数可以不相等。
-  - 输出一个双层序列，子句（subseq）数、子句的单词数和指定的一个输入序列一致，默认为第一个输入。
-  - memory：在step函数中定义memory，指向一个layer，通过引用memory得到这个layer上一个时刻的输出，形成recurrent连接。定义在外层`recurrent_group` step函数中的memory，能够记录上一个subseq 的状态，可以是一个单层序列（只作为read-only memory），也可以是一个词语。如果没有定义memory，那么 subseq 之间的运算是独立的。
-  - boot_layer：memory 初始状态，可以是一个单层序列（只作为read-only memory）或一个向量。默认不设置，即初始状态为0。
-
-- **双进单出**：目前还未支持，会报错"In hierachical RNN, all out links should be from sequences now"。
- 
-
-### 生成流程的使用方法
-使用`beam_search`需要遵循以下约定：
-
-- 单层RNN：从一个word生成下一个word。
+# Recurrent Group教程
+
+## 概述
+
+序列数据是自然语言处理任务面对的一种主要输入数据类型。
+
+一句话是由词语构成的序列，多句话进一步构成了段落。因此，段落可以看作是一个嵌套的双层的序列，这个序列的每个元素又是一个序列。
+
+双层序列是PaddlePaddle支持的一种非常灵活的数据组织方式，帮助我们更好地描述段落、多轮对话等更为复杂的语言数据。基于双层序列输入，我们可以设计搭建一个灵活的、层次化的RNN，分别从词语和句子级别编码输入数据，同时也能够引入更加复杂的记忆机制，更好地完成一些复杂的语言理解任务。
+
+在PaddlePaddle中，`recurrent_group`是一种任意复杂的RNN单元，用户只需定义RNN在一个时间步内完成的计算，PaddlePaddle负责完成信息和误差在时间序列上的传播。
+
+更进一步，`recurrent_group`同样可以扩展到双层序列的处理上。通过两个嵌套的`recurrent_group`分别定义子句级别和词语级别上需要完成的运算，最终实现一个层次化的复杂RNN。
+
+目前，在PaddlePaddle中，能够对双向序列进行处理的有`recurrent_group`和部分Layer，具体可参考文档：<a href = "hierarchical-layer.html">支持双层序列作为输入的Layer</a>。
+ 
+## 相关概念
+
+### 基本原理
+`recurrent_group` 是PaddlePaddle支持的一种任意复杂的RNN单元。使用者只需要关注于设计RNN在一个时间步之内完成的计算，PaddlePaddle负责完成信息和梯度在时间序列上的传播。
+
+PaddlePaddle中，`recurrent_group`的一个简单调用如下：
+
+``` python
+recurrent_group(step, input, reverse)
+```
+- step：一个可调用的函数，定义一个时间步之内RNN单元完成的计算
+- input：输入，必须是一个单层序列，或者一个双层序列
+- reverse：是否以逆序处理输入序列
+ 
+使用`recurrent_group`的核心是设计step函数的计算逻辑。step函数内部可以自由组合PaddlePaddle支持的各种layer，完成任意的运算逻辑。`recurrent_group` 的输入（即input）会成为step函数的输入，由于step 函数只关注于RNN一个时间步之内的计算，在这里`recurrent_group`替我们完成了原始输入数据的拆分。
+
+### 输入
+`recurrent_group`处理的输入序列主要分为以下三种类型：
+ 
+- **数据输入**：一个双层序列进入`recurrent_group`会被拆解为一个单层序列，一个单层序列进入`recurrent_group`会被拆解为非序列，然后交给step函数，这一过程对用户是完全透明的。可以有以下两种：1）通过data_layer拿到的用户输入；2）其它layer的输出。
+		
+- **只读Memory输入**：`StaticInput` 定义了一个只读的Memory，由`StaticInput`指定的输入不会被`recurrent_group`拆解，`recurrent_group` 循环展开的每个时间步总是能够引用所有输入，可以是一个非序列，或者一个单层序列。
+	  
+- **序列生成任务的输入**：`GeneratedInput`只用于在序列生成任务中指定输入数据。
+
+### 输入示例
+
+序列生成任务大多遵循encoder-decoer架构，encoder和decoder可以是能够处理序列的任意神经网络单元，而RNN是最流行的选择。
+
+给定encoder输出和当前词，decoder每次预测产生下一个最可能的词语。在这种结构中，decoder接受两个输入：
+    
+- 要生成的目标序列：是decoder的数据输入，也是decoder循环展开的依据，`recurrent_group`会对这类输入进行拆解。
+
+- encoder输出，可以是一个非序列，或者一个单层序列：是一个unbounded memory，decoder循环展开的每一个时间步会引用全部结果，不应该被拆解，这种类型的输入必须通过`StaticInput`指定。关于Unbounded Memory的更多讨论请参考论文 [Neural Turning Machine](https://arxiv.org/abs/1410.5401)。
+		
+在序列生成任务中，decoder RNN总是引用上一时刻预测出的词的词向量，作为当前时刻输入。`GeneratedInput`自动完成这一过程。
+		 
+### 输出
+`step`函数必须返回一个或多个Layer的输出，这个Layer的输出会作为整个`recurrent_group` 最终的输出结果。在输出的过程中，`recurrent_group` 会将每个时间步的输出拼接，这个过程对用户也是透明的。
+
+### memory
+memory只能在`recurrent_group`中定义和使用。memory不能独立存在，必须指向一个PaddlePaddle定义的Layer。引用memory得到这layer上一时刻输出，因此，可以将memory理解为一个时延操作。
+
+可以显示地指定一个layer的输出用于初始化memory。不指定时，memory默认初始化为0。
+
+## 双层RNN介绍
+`recurrent_group`帮助我们完成对输入序列的拆分，对输出的合并，以及计算逻辑在序列上的循环展开。
+
+利用这种特性，两个嵌套的`recurrent_group`能够处理双层序列，实现词语和句子两个级别的双层RNN结构。
+
+- 单层（word-level）RNN：每个状态（state）对应一个词（word）。
+- 双层（sequence-level）RNN：一个双层RNN由多个单层RNN组成，每个单层RNN（即双层RNN的每个状态）对应一个子句（subseq）。
+
+为了描述方便，下文以NLP任务为例，将含有子句（subseq）的段落定义为一个双层序列，将含有词语的句子定义为一个单层序列，那么0层序列即为一个词语。
+
+## 双层RNN的使用
+
+### 训练流程的使用方法
+使用 `recurrent_group`需要遵循以下约定：
+ 
+- **单进单出**：输入和输出都是单层序列。
+  - 如果有多个输入，不同输入序列含有的词语数必须严格相等。
+  - 输出一个单层序列，输出序列的词语数和输入序列一致。
+  - memory：在step函数中定义 memory指向一个layer，通过引用memory得到这个layer上一个时刻输出，形成recurrent 连接。memory的is_seq参数必须为false。如果没有定义memory，每个时间步之内的运算是独立的。
+  - boot_layer：memory的初始状态，默认初始状为0，memory的is_seq参数必须为false。
+ 
+- **双进双出**：输入和输出都是双层序列。
+  - 如果有多个输入序列，不同输入含有的子句（subseq）数必须严格相等，但子句含有的词语数可以不相等。
+  - 输出一个双层序列，子句（subseq）数、子句的单词数和指定的一个输入序列一致，默认为第一个输入。
+  - memory：在step函数中定义memory，指向一个layer，通过引用memory得到这个layer上一个时刻的输出，形成recurrent连接。定义在外层`recurrent_group` step函数中的memory，能够记录上一个subseq 的状态，可以是一个单层序列（只作为read-only memory），也可以是一个词语。如果没有定义memory，那么 subseq 之间的运算是独立的。
+  - boot_layer：memory 初始状态，可以是一个单层序列（只作为read-only memory）或一个向量。默认不设置，即初始状态为0。
+
+- **双进单出**：目前还未支持，会报错"In hierachical RNN, all out links should be from sequences now"。
+ 
+
+### 生成流程的使用方法
+使用`beam_search`需要遵循以下约定：
+
+- 单层RNN：从一个word生成下一个word。
 - 双层RNN：即把单层RNN生成后的subseq给拼接成一个新的双层seq。从语义上看，也不存在一个subseq直接生成下一个subseq的情况。
diff --git a/doc/howto/deep_model/rnn/rnn_en.rst b/doc/howto/deep_model/rnn/rnn_config_en.rst
similarity index 96%
rename from doc/howto/deep_model/rnn/rnn_en.rst
rename to doc/howto/deep_model/rnn/rnn_config_en.rst
index da29b8efadd299fe4fc74a71392cbc9a56e32be3..73f5d5371fcd3ce95253cad47b0d8e738284441c 100644
--- a/doc/howto/deep_model/rnn/rnn_en.rst
+++ b/doc/howto/deep_model/rnn/rnn_config_en.rst
@@ -30,7 +30,7 @@ Then at the :code:`process` function, each :code:`yield` function will return th
     yield src_ids, trg_ids, trg_ids_next
 
 
-For more details description of how to write a data provider, please refer to `PyDataProvider2 <../../ui/data_provider/index.html>`_. The full data provider file is located at :code:`demo/seqToseq/dataprovider.py`.
+For more details description of how to write a data provider, please refer to :ref:`api_pydataprovider2` . The full data provider file is located at :code:`demo/seqToseq/dataprovider.py`.
 
 ===============================================
 Configure Recurrent Neural Network Architecture
@@ -42,8 +42,8 @@ Simple Gated Recurrent Neural Network
 
 Recurrent neural network process a sequence at each time step sequentially. An example of the architecture of LSTM is listed below.
 
-.. image:: ../../../tutorials/sentiment_analysis/bi_lstm.jpg
-	 :align: center
+.. image:: ../../../tutorials/sentiment_analysis/src/bi_lstm.jpg
+     :align: center
 
 Generally speaking, a recurrent network perform the following operations from :math:`t=1` to :math:`t=T`, or reversely from :math:`t=T` to :math:`t=1`.
 
@@ -102,11 +102,11 @@ Sequence to Sequence Model with Attention
 We will use the sequence to sequence model with attention as an example to demonstrate how you can configure complex recurrent neural network models. An illustration of the sequence to sequence model with attention is shown in the following figure.
 
 .. image:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png
- 	 :align: center
+      :align: center
 
 In this model, the source sequence :math:`S = \{s_1, \dots, s_T\}` is encoded with a bidirectional gated recurrent neural networks. The hidden states of the bidirectional gated recurrent neural network :math:`H_S = \{H_1, \dots, H_T\}` is called *encoder vector* The decoder is a gated recurrent neural network. When decoding each token :math:`y_t`, the gated recurrent neural network generates a set of weights :math:`W_S^t = \{W_1^t, \dots, W_T^t\}`, which are used to compute a weighted sum of the encoder vector. The weighted sum of the encoder vector is utilized to condition the generation of the token :math:`y_t`.
 
-The encoder part of the model is listed below. It calls :code:`grumemory` to represent gated recurrent neural network. It is the recommended way of using recurrent neural network if the network architecture is simple, because it is faster than :code:`recurrent_group`. We have implemented most of the commonly used recurrent neural network architectures, you can refer to `Layers <../../ui/api/trainer_config_helpers/layers_index.html>`_  for more details.
+The encoder part of the model is listed below. It calls :code:`grumemory` to represent gated recurrent neural network. It is the recommended way of using recurrent neural network if the network architecture is simple, because it is faster than :code:`recurrent_group`. We have implemented most of the commonly used recurrent neural network architectures, you can refer to :ref:`api_trainer_config_helpers_layers` for more details.
 
 We also project the encoder vector to :code:`decoder_size` dimensional space, get the first instance of the backward recurrent network, and project it to :code:`decoder_size` dimensional space:
 
@@ -246,6 +246,6 @@ The code is listed below:
     outputs(beam_gen)
 
 
-Notice that this generation technique is only useful for decoder like generation process. If you are working on sequence tagging tasks, please refer to `Semantic Role Labeling Demo <../../demo/semantic_role_labeling/index.html>`_ for more details.
+Notice that this generation technique is only useful for decoder like generation process. If you are working on sequence tagging tasks, please refer to :ref:`semantic_role_labeling` for more details.
 
 The full configuration file is located at :code:`demo/seqToseq/seqToseq_net.py`.
diff --git a/doc_cn/algorithm/rnn/glossary_rnn.dot b/doc/howto/deep_model/rnn/src/glossary_rnn.dot
similarity index 100%
rename from doc_cn/algorithm/rnn/glossary_rnn.dot
rename to doc/howto/deep_model/rnn/src/glossary_rnn.dot
diff --git a/doc_cn/algorithm/rnn/glossary_rnn_with_memory.dot b/doc/howto/deep_model/rnn/src/glossary_rnn_with_memory.dot
similarity index 100%
rename from doc_cn/algorithm/rnn/glossary_rnn_with_memory.dot
rename to doc/howto/deep_model/rnn/src/glossary_rnn_with_memory.dot
diff --git a/doc_cn/algorithm/rnn/simple_full_hierarchical_recurrent.dot b/doc/howto/deep_model/rnn/src/simple_full_hierarchical_recurrent.dot
similarity index 100%
rename from doc_cn/algorithm/rnn/simple_full_hierarchical_recurrent.dot
rename to doc/howto/deep_model/rnn/src/simple_full_hierarchical_recurrent.dot
diff --git a/doc_cn/algorithm/rnn/simple_full_recurrent.dot b/doc/howto/deep_model/rnn/src/simple_full_recurrent.dot
similarity index 100%
rename from doc_cn/algorithm/rnn/simple_full_recurrent.dot
rename to doc/howto/deep_model/rnn/src/simple_full_recurrent.dot
diff --git a/doc/howto/new_layer/FullyConnected.jpg b/doc/howto/dev/FullyConnected.jpg
similarity index 100%
rename from doc/howto/new_layer/FullyConnected.jpg
rename to doc/howto/dev/FullyConnected.jpg
diff --git a/doc/howto/dev/contribute_to_paddle_cn.md b/doc/howto/dev/contribute_to_paddle_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..e0a63f5a14c7b2e8953aa21739668ee2a9ebeff1
--- /dev/null
+++ b/doc/howto/dev/contribute_to_paddle_cn.md
@@ -0,0 +1,131 @@
+# 如何贡献代码
+
+我们真诚地感谢您的贡献，欢迎通过 GitHub 的 fork 和 pull request 流程来提交代码。
+ 
+## 代码要求
+- 你的代码必须完全遵守 [doxygen](http://www.stack.nl/~dimitri/doxygen/) 的样式。
+- 确保编译器选项 WITH\_STYLE\_CHECK 已打开，并且编译能通过代码样式检查。
+- 所有代码必须具有单元测试。
+- 通过所有单元测试。
+
+以下教程将指导您提交代码。
+ 
+## [Fork](https://help.github.com/articles/fork-a-repo/)
+ 
+跳转到[PaddlePaddle](https://github.com/PaddlePaddle/Paddle) GitHub首页，然后单击 `Fork` 按钮。
+
+## 克隆（Clone）
+
+Paddle 目前使用[git流分支模型](http://nvie.com/posts/a-successful-git-branching-model/)进行开发，测试，发行和维护。
+**develop** 是主分支，其他用户分支是特征分支（feature branches）。
+
+一旦你创建了一个fork，你可以使用你最喜欢的 git 客户端克隆你的仓库（repo）或只是直接在命令行输入：
+
+```shell
+# 克隆 fork 到本地
+git clone --branch develop https://github.com/USERNAME/Paddle.git
+```
+如果你的仓库不包含 **develop** 分支，你只需自己创建它。
+
+```shell
+git clone https://github.com/USERNAME/Paddle.git Paddle
+cd Paddle
+git checkout -b develop  # 创建 develop 分支
+git remote add upstream https://github.com/PaddlePaddle/Paddle.git  # 添加 upstream 到 baidu/Paddle
+git pull upstream develop  # 更新 upstream
+git submodule update --init --recursive
+```
+
+然后你可以通过做一个本地开发分支开始开发
+
+```shell
+git checkout -b MY_COOL_STUFF_BRANCH
+```
+
+## 使用 `pre-commit` 钩子
+
+Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理git预提交钩子。 它可以帮助我们格式化源代码（cpp，python），在提交前检查一些基本事宜（每个文件只有一个 EOL 
+，git 中不要添加大文件）。 `pre-commit`测试是 Travis-CI 中单元测试的一部分，不满足钩子
+的 PR 不能提交代码到 Paddle。
+
+你可以通过 `pip install pre-commit` 安装 [pre-commit](http://pre-commit.com/)，
+目前 Paddle 使用 `clang-format` 来调整C/C++源代码格式。请确保 clang-format 版本在3.8以上。
+
+然后只需在 Paddle clone 目录中运行 `pre-commit install` 。当你
+提交你的代码时，pre-commit 钩子会检查本地代码是否存在
+不适合提交的东西，等等。
+
+## 提交（Commit）
+
+提交你的代码：
+
+```shell
+# 显示工作树状态
+git status
+# 添加修改过的文件
+git add xx
+env EDITOR=vim git commit  # 你可以用 vim/nano/emacs 写下你的注释
+```
+提交信息的第一行是标题，其他行可以添加一些细节（如果有必要的话）。
+
+## 保持 Fork 状态最新
+
+在拉（pull）你的请求（request）之前，你应该从最新的 PaddlePaddle 同步代码。
+为此，你需要首先添加远程（remote）：
+
+```shell
+# 观察当前远程仓库配置
+git remote -v
+# 添加上游（upstream）仓库
+git remote add upstream https://github.com/PaddlePaddle/Paddle.git
+# 验证新的 upstream
+git remote -v
+```
+
+用最新的 upstream 更新你的 fork：
+
+```shell
+git pull --rebase upstream develop
+```
+如果本地没有提交，git 将简单地执行快进。但是，如果你一直在做一些改变（绝大多数情况下不应该），你可能要处理冲突。
+
+现在，你的本地主分支与上游修改的一致并是最新的。
+
+## 推送（Push）到 GitHub
+
+```shell
+# 在 GitHub 上 push 你的仓库
+git push -u origin MY_COOL_STUFF_BRANCH  # 创建远程分支 MY_COOL_STUFF_BRANCH 到 origin.
+```
+
+## 拉取请求（Pull Request）
+
+转到 GitHub上 你 fork 的页面，选择你的开发分支并单击 **pull request 按钮**。
+
+## 使用最新版本更新你的 pull 请求
+
+在代码审查（code review）期间，由于 baidu/Paddle 中新的提交导致你的 pull 请求可能会失效。如果没有冲突，GitHub允许自动更新。 你可以点击 pull request 页面中的“更新分支（Update Branch）”按钮。 但是如果存在代码冲突，你需要手动进行更新。你需要在本地仓库执行如下命令：
+
+```shell
+git checkout MY_COOL_STUFF_BRANCH
+git pull upstream develop
+# 你可能需要根据git提示解决冲突
+# 创建并测试你的代码
+git push origin MY_COOL_STUFF_BRANCH
+```
+现在你的 Pull Request 是最新的了。
+
+## 修改你的 pull request
+
+当根据审阅者的意见修改 pull 请求时，请使用“git commit”而不是“git commit --amend”来提交更改，以便审阅者可以看到新的请求和旧的请求之间的区别。
+
+可能的命令是
+
+```shell
+git checkout MY_COOL_STUFF_BRANCH
+git pull upstream develop   # 将本地更新到最新的代码库
+# 可能会发生一些冲突
+# 开始开发吧！
+env EDITOR=vim git commit  # 添加修改日志
+git push origin MY_COOL_STUFF_BRANCH
+```
diff --git a/doc/howto/contribute_to_paddle_en.md b/doc/howto/dev/contribute_to_paddle_en.md
similarity index 81%
rename from doc/howto/contribute_to_paddle_en.md
rename to doc/howto/dev/contribute_to_paddle_en.md
index 1decc91d62cc25c5b3157bdc6e0835421be23252..e578f6fce8b94180da7d5de041a0e17b1d59f6ea 100644
--- a/doc/howto/contribute_to_paddle_en.md
+++ b/doc/howto/dev/contribute_to_paddle_en.md
@@ -1,8 +1,8 @@
-# How to Contribute Code
+# Contribute Code
 
 We sincerely appreciate your contributions. You can use fork and pull request
-workflow to merge your code. 
- 
+workflow to merge your code.
+
 ## Code Requirements
 - Your code must be fully documented by
   [doxygen](http://www.stack.nl/~dimitri/doxygen/) style.
@@ -12,11 +12,11 @@ workflow to merge your code.
 - Pass all unit tests.
 
 The following tutorial guides you into submitting your contibution.
- 
+
 ## [Creating a Fork](https://help.github.com/articles/fork-a-repo/)
- 
+
 Just head over to the GitHub page and click the "Fork" button.
-It's just that simple. 
+It's just that simple.
 
 ## Clone
 
@@ -25,7 +25,7 @@ The **develop** is the main branch, and other user's branches are feature branch
 
 Once you've created a fork, you can use your favorite git client to clone your
 repo or just head straight to the command line:
- 
+
 ```shell
 # Clone your fork to your local machine
 git clone --branch develop https://github.com/USERNAME/Paddle.git
@@ -47,6 +47,22 @@ Then you can start to develop by making a local developement branch
 git checkout -b MY_COOL_STUFF_BRANCH
 ```
 
+## Using `pre-commit` hook
+
+Paddle developers use [pre-commit](http://pre-commit.com/) tool to manage git
+pre-commit hooks. It can help us format source codes (cpp, python), check some
+basic thing before commit (only one EOL for each file, do not add a huge file
+in git). `pre-commit` tests is a part of unit tests in Travis-CI now, every
+PR doesn't fit hook can not be merged into Paddle.
+
+To use [pre-commit](http://pre-commit.com/), you should install it by
+`pip install pre-commit`, and currently, Paddle uses `clang-format` to format
+c/cpp sources. Please make sure clang-format 3.8+ installed.
+
+Then just run `pre-commit install` in your Paddle clone directory. When you
+commit your code, the pre-commit hook will check the local code if there is
+anything not suitable to commit, and so on.
+
 ## Commit
 
 Commit your changes by following command lines:
@@ -83,7 +99,7 @@ git pull --rebase upstream develop
 
 If there are no unique commits locally, git will simply perform a fast-forward.
 However, if you have been making changes (in the vast majority of cases you
-probably shouldn't be), you may have to deal with conflicts. 
+probably shouldn't be), you may have to deal with conflicts.
 
 Now, your local master branch is up-to-date with everything modified upstream.
 
diff --git a/doc/howto/new_layer/index_en.rst b/doc/howto/dev/new_layer_en.rst
similarity index 99%
rename from doc/howto/new_layer/index_en.rst
rename to doc/howto/dev/new_layer_en.rst
index 922bda5b0d879b9041e3c0ca5d2518363a7cfa05..0513f068f39ad0d931b03d066a0083a1a8a33b79 100644
--- a/doc/howto/new_layer/index_en.rst
+++ b/doc/howto/dev/new_layer_en.rst
@@ -1,6 +1,6 @@
-=======================
-How to Write New Layers
-=======================
+================
+Write New Layers
+================
 
 This tutorial will guide you to write customized layers in PaddlePaddle. We will utilize fully connected layer as an example to guide you through the following steps for writing a new layer.
 
diff --git a/doc_cn/howto/how_to_write_docs/index.rst b/doc/howto/dev/write_docs_cn.rst
similarity index 90%
rename from doc_cn/howto/how_to_write_docs/index.rst
rename to doc/howto/dev/write_docs_cn.rst
index a1f983b3405fa40f436885e40fca2ebbb4695491..5051a892304fdc8b0f1a19a7d4560d5ee007c47d 100644
--- a/doc_cn/howto/how_to_write_docs/index.rst
+++ b/doc/howto/dev/write_docs_cn.rst
@@ -1,6 +1,6 @@
-###############################
-如何贡献/修改PaddlePaddle的文档
-###############################
+##################
+如何贡献/修改文档
+##################
 
 PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成，生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。
 
@@ -51,4 +51,4 @@ TBD
 
 
 ..	_cmake: https://cmake.org/
-..	_sphinx: http://www.sphinx-doc.org/en/1.4.8/
\ No newline at end of file
+..	_sphinx: http://www.sphinx-doc.org/en/1.4.8/
diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e03138723e4df951b5fb1bd28f98a33e679b454a
--- /dev/null
+++ b/doc/howto/index_cn.rst
@@ -0,0 +1,37 @@
+进阶指南
+========
+
+使用说明
+--------
+
+..  toctree::
+  :maxdepth: 1
+
+  usage/concepts/use_concepts_cn.rst
+  usage/cluster/k8s/k8s_cn.md
+  usage/cluster/k8s/k8s_distributed_cn.md
+
+开发标准
+--------
+
+..  toctree::
+  :maxdepth: 1
+
+  dev/write_docs_cn.rst
+  dev/contribute_to_paddle_cn.md
+
+模型配置
+--------
+
+..  toctree::
+  :maxdepth: 1
+
+  deep_model/rnn/index_cn.rst
+
+性能优化
+--------
+
+..  toctree::
+  :maxdepth: 1
+
+  optimization/gpu_profiling_cn.rst
diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst
index bd64c5b1fb1226b07a07094cfd60bce6fa4e7884..983dc743eb453a0210bc5fb3c7e4525fa838d428 100644
--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -7,9 +7,8 @@ Usage
 ..  toctree::
   :maxdepth: 1
 
-  cmd_parameter/index_en.md
-  deep_model/index_en.rst
-  cluster/cluster_train_en.md
+  usage/cmd_parameter/index_en.md
+  usage/cluster/cluster_train_en.md
 
 Development
 ------------
@@ -17,8 +16,16 @@ Development
 ..  toctree::
   :maxdepth: 1
 
-  new_layer/index_en.rst
-  contribute_to_paddle_en.md
+  dev/new_layer_en.rst
+  dev/contribute_to_paddle_en.md
+
+Configuration
+-------------
+
+..  toctree::
+  :maxdepth: 1
+
+  deep_model/rnn/index_en.rst
 
 Optimization
 -------------
@@ -26,4 +33,4 @@ Optimization
 ..  toctree::
   :maxdepth: 1
 
-  optimization/index_en.rst
+  optimization/gpu_profiling_en.rst
diff --git a/doc/howto/optimization/gpu_profiling_cn.rst b/doc/howto/optimization/gpu_profiling_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..e2b0b0396e0034b01ed2c5081effdd3bcabf31ae
--- /dev/null
+++ b/doc/howto/optimization/gpu_profiling_cn.rst
@@ -0,0 +1,242 @@
+==================
+GPU性能分析与调优
+==================
+
+..  contents::
+
+此教程将向您分步介绍如何使用内置的定时工具、 **nvprof** 或 **nvvp** 来运行性能分析和调优。
+
+- 什么是性能分析？
+- 为什么需要性能分析？
+- 如何进行性能分析？
+- 性能分析工具介绍
+- 详细教程
+- 性能分析小技巧
+
+什么是性能分析？
+================
+在软件工程的范畴里，性能分析（Profiling）是一个动态程序分析的术语，它可以指测量一个程序的空间（内存）复杂度或时间复杂度，
+也可以说是某些特定指令的使用情况，或者是函数调用的频率和耗时等。通常情况下，分析得到的信息用于协助进行程序的优化。
+
+简单来说，性能分析工具是用于给应用程序的性能做定量分析的。如果想很好的理解程序的行为，那程序分析工具是必不可少的利器。简单的性能分析，可以告诉您某个操作到底花了多长时间？而更深入的分析，甚至能解释为什么某个操作花了很长时间？
+
+为什么需要性能分析？
+============================
+训练好一个深层神经网络通常要耗费非常长的时间，所以性能也就逐步变成了深度学习领域最重要的指标。
+而优化性能的首要任务，是需要了解哪些步骤拖慢了整体。
+如果某一块根本就不怎么耗时，那也就不需要急着优化性能啦！
+
+如何进行性能分析？
+========================
+为了达到性能最优，您可以采用下面五个步骤：
+
+- 对代码进行性能分析
+- 找到运行慢的部分
+- 找到运行慢的原因
+- 修改成更快的版本
+- 再次对代码进行性能分析
+
+Usually, processor has two key performance limits include float point throughput and
+memory throughput. For GPU,  it also need more parallelism to fulfill its potential.
+This is why they can be so fast.
+
+通常情况下，处理器有两个关键性能限制：一个是浮点计算量，另一个是内存操作量。
+GPU则还需要高并行性，才能发挥其全部能力。这正是它们速度快的原因。
+
+性能分析工具介绍
+======================
+就通常的GPU性能分析来说，市面上已经有NVIDIA或第三方提供的众多工具。
+
+**nvprof** 是Nvidia性能分析工具， **nvvp** 则是带GUI的Nvidia可视化性能分析工具。
+在这个教程中，我们主要会介绍nvprof和nvvp。
+
+:code:`test_GpuProfiler` from :code:`paddle/math/tests` directory will be used to evaluate
+above profilers.
+
+:code:`paddle/math/test` 目录中的 :code:`test_GpuProfiler` 就是用于展示上述分析工具的用法。
+
+.. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
+   :language: c++
+   :lines: 137-151
+   :linenos:
+
+上述的代码片段包含了两种方法，您可以任意使用一个或两个来对感兴趣的代码段做性能分析。
+
+1. :code:`REGISTER_TIMER_INFO` 是一个内置的定时器封装，可以用来计算CPU函数或cuda内核的时间消耗。
+
+2. :code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid
+program crashes when CPU version of PaddlePaddle invokes them.
+
+3. :code:`REGISTER_GPU_PROFILER` 是一个封装对象，封装了 :code:`cudaProfilerStart` 和 :code:`cudaProfileStop` 两个操作；同时其内部实现可以避免纯CPU版本PaddlePaddle在执行本语句时发生崩溃。
+
+您会在接下来的部分中获得更多的细节介绍。
+
+详细教程
+============
+
+内置定时器
+------------
+
+如果想要启用PaddlePaddle的内置定时器，您首先需要在相关代码段中加入 :code:`REGISTER_TIMER_INFO`。
+接下来就可以使用 :code:`printStatus` 或者 :code:`printAllStatus` 函数来将信息输出到界面中。
+下面举个简单的例子：
+
+1. 加入 :code:`REGISTER_TIMER_INFO` 和 :code:`printAllStatus` 函数（如高亮部分）。
+
+    .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
+        :language: c++
+        :lines: 137-151
+        :emphasize-lines: 8-12,14
+        :linenos:
+
+2. cmake配置中将 **WITH_TIMER** 打开，重新编译PaddlePaddle。
+
+    .. code-block:: bash
+
+        cmake .. -DWITH_TIMER=ON
+        make
+
+3. 执行您的代码，并观察结果(如高亮部分）。
+
+    .. code-block:: bash
+        :emphasize-lines: 1,12-15
+
+        > ./paddle/math/tests/test_GpuProfiler
+        I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler
+        I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions
+        I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.
+        [==========] Running 1 test from 1 test case.
+        [----------] Global test environment set-up.
+        [----------] 1 test from Profiler
+        [ RUN      ] Profiler.BilinearFwdBwd
+        I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im
+        gSizeX = 64, imgSizeY = 64"
+        I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751
+        I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ======
+        I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd     total=136.141    avg=136.141    max=136.141    min=136.141   count=1
+        I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ======
+        I1117 11:13:42.981575 2522362816 Stat.cpp:154] --------------------------------------------------
+        [       OK ] Profiler.BilinearFwdBwd (136 ms)
+        [----------] 1 test from Profiler (136 ms total)
+
+        [----------] Global test environment tear-down
+        [==========] 1 test from 1 test case ran. (136 ms total)
+        [  PASSED  ] 1 test.
+
+nvprof 工具
+----------------
+
+要使用命令行分析工具 **nvprof**，您按如下步骤操作即可：
+
+1. 将 :code:`REGISTER_GPU_PROFILER` 函数加到代码中（参考强调部分）。
+
+    .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
+        :language: c++
+        :lines: 137-151
+        :emphasize-lines: 6-7
+        :linenos:
+
+2. cmake中将 **WITH_PROFILER** 配置打开，重新编译PaddlePaddle。
+
+    .. code-block:: bash
+
+        cmake .. -DWITH_PROFILER=ON
+        make
+
+3. 使用 **nvprof** 来分析执行文件。
+
+    .. code-block:: bash
+
+        nvprof  ./paddle/math/tests/test_GpuProfiler
+
+然后，您就能获得如下的分析结果：
+
+.. code-block:: bash
+
+    ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler
+    ==78544== Profiling result:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    27.60%  9.6305ms         5  1.9261ms  3.4560us  6.4035ms  [CUDA memcpy HtoD]
+    26.07%  9.0957ms         1  9.0957ms  9.0957ms  9.0957ms  KeBilinearInterpBw
+    23.78%  8.2977ms         1  8.2977ms  8.2977ms  8.2977ms  KeBilinearInterpFw
+    22.55%  7.8661ms         2  3.9330ms  1.5798ms  6.2863ms  [CUDA memcpy DtoH]
+
+    ==78544== API calls:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    46.85%  682.28ms         8  85.285ms  12.639us  682.03ms  cudaStreamCreateWithFlags
+    39.83%  580.00ms         4  145.00ms     302ns  550.27ms  cudaFree
+    9.82%   143.03ms         9  15.892ms  8.7090us  142.78ms  cudaStreamCreate
+    1.23%   17.983ms         7  2.5690ms  23.210us  6.4563ms  cudaMemcpy
+    1.23%   17.849ms         2  8.9247ms  8.4726ms  9.3768ms  cudaStreamSynchronize
+    0.66%   9.5969ms         7  1.3710ms  288.43us  2.4279ms  cudaHostAlloc
+    0.13%   1.9530ms        11  177.54us  7.6810us  591.06us  cudaMalloc
+    0.07%   1.0424ms         8  130.30us  1.6970us  453.72us  cudaGetDevice
+    0.04%   527.90us        40  13.197us     525ns  253.99us  cudaEventCreateWithFlags
+    0.03%   435.73us       348  1.2520us     124ns  42.704us  cuDeviceGetAttribute
+    0.03%   419.36us         1  419.36us  419.36us  419.36us  cudaGetDeviceCount
+    0.02%   260.75us         2  130.38us  129.32us  131.43us  cudaGetDeviceProperties
+    0.02%   222.32us         2  111.16us  106.94us  115.39us  cudaLaunch
+    0.01%   214.06us         4  53.514us  28.586us  77.655us  cuDeviceGetName
+    0.01%   115.45us         4  28.861us  9.8250us  44.526us  cuDeviceTotalMem
+    0.01%   83.988us         4  20.997us     578ns  77.760us  cudaSetDevice
+    0.00%   38.918us         1  38.918us  38.918us  38.918us  cudaEventCreate
+    0.00%   34.573us        31  1.1150us     279ns  12.784us  cudaDeviceGetAttribute
+    0.00%   17.767us         1  17.767us  17.767us  17.767us  cudaProfilerStart
+    0.00%   15.228us         2  7.6140us  3.5460us  11.682us  cudaConfigureCall
+    0.00%   14.536us         2  7.2680us  1.1490us  13.387us  cudaGetLastError
+    0.00%   8.6080us        26     331ns     173ns     783ns  cudaSetupArgument
+    0.00%   5.5470us         6     924ns     215ns  2.6780us  cuDeviceGet
+    0.00%   5.4090us         6     901ns     328ns  3.3320us  cuDeviceGetCount
+    0.00%   4.1770us         3  1.3920us  1.0630us  1.8300us  cuDriverGetVersion
+    0.00%   3.4650us         3  1.1550us  1.0810us  1.2680us  cuInit
+    0.00%      830ns         1     830ns     830ns     830ns  cudaRuntimeGetVersion
+
+
+nvvp 工具
+--------------
+
+如果想使用可视化的分析器 **nvvp**，您可以导入 :code:`nvprof -o ...` 的输出，或者从工具的界面里运行您的应用。
+
+**备注: nvvp 也支持CPU的性能分析** (需在nvvp界面中选上才能开启）
+
+..  image:: nvvp1.png
+    :align: center
+    :scale: 33%
+
+从内核函数的角度， **nvvp** 可以精确说明一个长耗时操作的具体原因。
+同时，如下图所示， **nvvp** 的内核block使用情况、寄存器使用情况和共享内存使用情况能让我们对GPU的整体使用有更好的理解。
+
+
+..  image:: nvvp2.png
+    :align: center
+    :scale: 33%
+
+而从应用的角度， **nvvp** 可以帮您提供一些定位性能瓶颈的建议。
+例如，下图中就展示了一些关于内存数据迁徙和计算资源利用率的建议，为您做性能调优提供了方向。
+
+..  image:: nvvp3.png
+    :align: center
+    :scale: 33%
+
+..  image:: nvvp4.png
+    :align: center
+    :scale: 33%
+
+性能分析小技巧
+==================
+
+- 开始阶段，从 **nvprof** 和 **nvvp** 的输出信息入手是个不错的选择。
+- 接下来可以考虑下时间线的分析。
+- 如果真想挖掘内核深处的某个秘密，您最好先确认：这一块的耗时比例真的太高，值得深入分析。
+- 可能的情况下，试着让输出的分析数据和理论值对应。
+
+    1) 例如，如果我知道内核花了10ms来移动1GB数据，那我会期望分析工具统计到速度是100GB/s。
+    2) 若有不一致之处，很有可能实际应用就是没有按照您的预期情况运行。
+- 了解您的硬件：如果您的GPU理论可以达到6 TFLOPs（6万亿次浮点运算每秒），而当前已经有5.5 TFLOPs了，那估计这里的潜力就没啥好挖的了……
+
+性能分析是性能优化的关键一步。有的时候简简单单的改变就能在性能上产生明显的优化效果！
+当然，具体情况因人而异。
+
+参考资料
+===========
+Jeremy Appleyard, `GPU Profiling for Deep Learning <http://www.robots.ox.ac.uk/~seminars/seminars/Extra/2015_10_08_JeremyAppleyard.pdf>`_, 2015
diff --git a/doc/howto/optimization/gpu_profiling_en.rst b/doc/howto/optimization/gpu_profiling_en.rst
index 667bf1364e7cd4c9098caba72a127228d78ca38b..ed208ceaf7af0c5aab88fd4fcb18fa96b8c9ff38 100644
--- a/doc/howto/optimization/gpu_profiling_en.rst
+++ b/doc/howto/optimization/gpu_profiling_en.rst
@@ -1,5 +1,8 @@
-Profiling on PaddlePaddle
-=========================
+====================
+Tune GPU Performance 
+====================
+
+..  contents::
 
 This tutorial will guide you step-by-step through how to conduct profiling and performance tuning using built-in timer, **nvprof** and **nvvp**.
 
@@ -49,11 +52,11 @@ For general GPU profiling, a bunch of tools are provided from both NVIDIA and th
 In this tutorial, we will focus on nvprof and nvvp.
 
 :code:`test_GpuProfiler` from :code:`paddle/math/tests` directory will be used to evaluate
-above profilers. 
+above profilers.
 
-.. literalinclude:: ../../paddle/math/tests/test_GpuProfiler.cpp
+.. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
    :language: c++
-   :lines: 111-124
+   :lines: 137-151
    :linenos:
 
 The above code snippet includes two methods, you can use any of them to profile the regions of interest.
@@ -77,10 +80,10 @@ As a simple example, consider the following:
 
 1. Add :code:`REGISTER_TIMER_INFO` and :code:`printAllStatus` functions (see the emphasize-lines).
 
-    .. literalinclude:: ../../paddle/math/tests/test_GpuProfiler.cpp
+    .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
         :language: c++
-        :lines: 111-124
-        :emphasize-lines: 8-10,13
+        :lines: 137-151
+        :emphasize-lines: 8-12,14
         :linenos:
 
 2. Configure cmake with **WITH_TIMER** and recompile PaddlePaddle.
@@ -90,31 +93,31 @@ As a simple example, consider the following:
         cmake .. -DWITH_TIMER=ON
         make
 
-3. Execute your code and observe the results (see the emphasize-lines). 
+3. Execute your code and observe the results (see the emphasize-lines).
 
     .. code-block:: bash
         :emphasize-lines: 1,12-15
 
-        > ./paddle/math/tests/test_GpuProfiler                                                                             
-        I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler                                             
-        I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions                                                                      
-        I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.                                                                   
-        [==========] Running 1 test from 1 test case.                                                                                                
-        [----------] Global test environment set-up.                                                                                                 
-        [----------] 1 test from Profiler                                                                                                            
-        [ RUN      ] Profiler.BilinearFwdBwd                                                                                                         
+        > ./paddle/math/tests/test_GpuProfiler
+        I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler
+        I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions
+        I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.
+        [==========] Running 1 test from 1 test case.
+        [----------] Global test environment set-up.
+        [----------] 1 test from Profiler
+        [ RUN      ] Profiler.BilinearFwdBwd
         I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im
-        gSizeX = 64, imgSizeY = 64"                                                                                                                  
-        I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751                                           
-        I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ======                                               
-        I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd     total=136.141    avg=136.141    max=136.141    min=136.141   count=1                                                                                                                                  
-        I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ======                                                          
-        I1117 11:13:42.981575 2522362816 Stat.cpp:154] --------------------------------------------------                                            
-        [       OK ] Profiler.BilinearFwdBwd (136 ms)                                                                                                
-        [----------] 1 test from Profiler (136 ms total)                                                                                             
-                                                                                                                                                    
-        [----------] Global test environment tear-down                                                                                               
-        [==========] 1 test from 1 test case ran. (136 ms total)                                                                                     
+        gSizeX = 64, imgSizeY = 64"
+        I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751
+        I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ======
+        I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd     total=136.141    avg=136.141    max=136.141    min=136.141   count=1
+        I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ======
+        I1117 11:13:42.981575 2522362816 Stat.cpp:154] --------------------------------------------------
+        [       OK ] Profiler.BilinearFwdBwd (136 ms)
+        [----------] 1 test from Profiler (136 ms total)
+
+        [----------] Global test environment tear-down
+        [==========] 1 test from 1 test case ran. (136 ms total)
         [  PASSED  ] 1 test.
 
 nvprof profiler
@@ -124,9 +127,9 @@ To use this command line profiler **nvprof**, you can simply issue the following
 
 1. Add :code:`REGISTER_GPU_PROFILER` function (see the emphasize-lines).
 
-    .. literalinclude:: ../../paddle/math/tests/test_GpuProfiler.cpp
+    .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
         :language: c++
-        :lines: 111-124
+        :lines: 137-151
         :emphasize-lines: 6-7
         :linenos:
 
@@ -147,42 +150,42 @@ Then, you can get the following profiling result:
 
 .. code-block:: bash
 
-    ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler                                                                                                      
-    ==78544== Profiling result:                                                                                                                                                
-    Time(%)     Time     Calls       Avg       Min       Max  Name                                                                                                            
-    27.60%  9.6305ms         5  1.9261ms  3.4560us  6.4035ms  [CUDA memcpy HtoD]                                                                                              
-    26.07%  9.0957ms         1  9.0957ms  9.0957ms  9.0957ms  KeBilinearInterpBw                                                                                            
-    23.78%  8.2977ms         1  8.2977ms  8.2977ms  8.2977ms  KeBilinearInterpFw                                                                                        
-    22.55%  7.8661ms         2  3.9330ms  1.5798ms  6.2863ms  [CUDA memcpy DtoH]                                                                                              
-                                                                                                                                                                            
-    ==78544== API calls:                                                                                                                                                       
-    Time(%)     Time     Calls       Avg       Min       Max  Name                                                                                                            
-    46.85%  682.28ms         8  85.285ms  12.639us  682.03ms  cudaStreamCreateWithFlags                                                                                       
-    39.83%  580.00ms         4  145.00ms     302ns  550.27ms  cudaFree                                                                                                        
-    9.82%   143.03ms         9  15.892ms  8.7090us  142.78ms  cudaStreamCreate                                                                                                
-    1.23%   17.983ms         7  2.5690ms  23.210us  6.4563ms  cudaMemcpy                                                                                                      
-    1.23%   17.849ms         2  8.9247ms  8.4726ms  9.3768ms  cudaStreamSynchronize                                                                                           
-    0.66%   9.5969ms         7  1.3710ms  288.43us  2.4279ms  cudaHostAlloc                                                                                                   
-    0.13%   1.9530ms        11  177.54us  7.6810us  591.06us  cudaMalloc                                                                                                      
-    0.07%   1.0424ms         8  130.30us  1.6970us  453.72us  cudaGetDevice                                                                                                   
-    0.04%   527.90us        40  13.197us     525ns  253.99us  cudaEventCreateWithFlags                                                                                        
-    0.03%   435.73us       348  1.2520us     124ns  42.704us  cuDeviceGetAttribute                                                                                            
-    0.03%   419.36us         1  419.36us  419.36us  419.36us  cudaGetDeviceCount                                                                                              
-    0.02%   260.75us         2  130.38us  129.32us  131.43us  cudaGetDeviceProperties                                                                                         
-    0.02%   222.32us         2  111.16us  106.94us  115.39us  cudaLaunch                                                                                                      
-    0.01%   214.06us         4  53.514us  28.586us  77.655us  cuDeviceGetName                                                                                                 
-    0.01%   115.45us         4  28.861us  9.8250us  44.526us  cuDeviceTotalMem                                                                                                
-    0.01%   83.988us         4  20.997us     578ns  77.760us  cudaSetDevice                                                                                                   
-    0.00%   38.918us         1  38.918us  38.918us  38.918us  cudaEventCreate                                                                                                 
-    0.00%   34.573us        31  1.1150us     279ns  12.784us  cudaDeviceGetAttribute                                                                                          
-    0.00%   17.767us         1  17.767us  17.767us  17.767us  cudaProfilerStart                                                                                               
-    0.00%   15.228us         2  7.6140us  3.5460us  11.682us  cudaConfigureCall                                                                                               
-    0.00%   14.536us         2  7.2680us  1.1490us  13.387us  cudaGetLastError                                                                                                
-    0.00%   8.6080us        26     331ns     173ns     783ns  cudaSetupArgument                                                                                               
-    0.00%   5.5470us         6     924ns     215ns  2.6780us  cuDeviceGet                                                                                                     
-    0.00%   5.4090us         6     901ns     328ns  3.3320us  cuDeviceGetCount                                                                                                
-    0.00%   4.1770us         3  1.3920us  1.0630us  1.8300us  cuDriverGetVersion                                                                                              
-    0.00%   3.4650us         3  1.1550us  1.0810us  1.2680us  cuInit                                                                                                          
+    ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler
+    ==78544== Profiling result:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    27.60%  9.6305ms         5  1.9261ms  3.4560us  6.4035ms  [CUDA memcpy HtoD]
+    26.07%  9.0957ms         1  9.0957ms  9.0957ms  9.0957ms  KeBilinearInterpBw
+    23.78%  8.2977ms         1  8.2977ms  8.2977ms  8.2977ms  KeBilinearInterpFw
+    22.55%  7.8661ms         2  3.9330ms  1.5798ms  6.2863ms  [CUDA memcpy DtoH]
+
+    ==78544== API calls:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    46.85%  682.28ms         8  85.285ms  12.639us  682.03ms  cudaStreamCreateWithFlags
+    39.83%  580.00ms         4  145.00ms     302ns  550.27ms  cudaFree
+    9.82%   143.03ms         9  15.892ms  8.7090us  142.78ms  cudaStreamCreate
+    1.23%   17.983ms         7  2.5690ms  23.210us  6.4563ms  cudaMemcpy
+    1.23%   17.849ms         2  8.9247ms  8.4726ms  9.3768ms  cudaStreamSynchronize
+    0.66%   9.5969ms         7  1.3710ms  288.43us  2.4279ms  cudaHostAlloc
+    0.13%   1.9530ms        11  177.54us  7.6810us  591.06us  cudaMalloc
+    0.07%   1.0424ms         8  130.30us  1.6970us  453.72us  cudaGetDevice
+    0.04%   527.90us        40  13.197us     525ns  253.99us  cudaEventCreateWithFlags
+    0.03%   435.73us       348  1.2520us     124ns  42.704us  cuDeviceGetAttribute
+    0.03%   419.36us         1  419.36us  419.36us  419.36us  cudaGetDeviceCount
+    0.02%   260.75us         2  130.38us  129.32us  131.43us  cudaGetDeviceProperties
+    0.02%   222.32us         2  111.16us  106.94us  115.39us  cudaLaunch
+    0.01%   214.06us         4  53.514us  28.586us  77.655us  cuDeviceGetName
+    0.01%   115.45us         4  28.861us  9.8250us  44.526us  cuDeviceTotalMem
+    0.01%   83.988us         4  20.997us     578ns  77.760us  cudaSetDevice
+    0.00%   38.918us         1  38.918us  38.918us  38.918us  cudaEventCreate
+    0.00%   34.573us        31  1.1150us     279ns  12.784us  cudaDeviceGetAttribute
+    0.00%   17.767us         1  17.767us  17.767us  17.767us  cudaProfilerStart
+    0.00%   15.228us         2  7.6140us  3.5460us  11.682us  cudaConfigureCall
+    0.00%   14.536us         2  7.2680us  1.1490us  13.387us  cudaGetLastError
+    0.00%   8.6080us        26     331ns     173ns     783ns  cudaSetupArgument
+    0.00%   5.5470us         6     924ns     215ns  2.6780us  cuDeviceGet
+    0.00%   5.4090us         6     901ns     328ns  3.3320us  cuDeviceGetCount
+    0.00%   4.1770us         3  1.3920us  1.0630us  1.8300us  cuDriverGetVersion
+    0.00%   3.4650us         3  1.1550us  1.0810us  1.2680us  cuInit
     0.00%      830ns         1     830ns     830ns     830ns  cudaRuntimeGetVersion
 
 
diff --git a/doc/howto/optimization/index_en.rst b/doc/howto/optimization/index_en.rst
deleted file mode 100644
index 1e2f16b5da7a7aa9e5075effea2d2a171a987e6c..0000000000000000000000000000000000000000
--- a/doc/howto/optimization/index_en.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-How to Tune GPU Performance
-===========================
-
-.. toctree::
-  :maxdepth: 3
-
-  gpu_profiling_en.rst
diff --git a/doc/howto/source/api.rst b/doc/howto/source/api.rst
deleted file mode 100644
index 30396c26b61827847cc5acc29cee1c3c8e7b226e..0000000000000000000000000000000000000000
--- a/doc/howto/source/api.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-API
-===
-
-.. doxygenfile:: paddle/api/PaddleAPI.h
-.. doxygenfile:: paddle/api/Internal.h
diff --git a/doc/howto/source/cuda/index.rst b/doc/howto/source/cuda/index.rst
deleted file mode 100644
index b0fed2e7f72c9a9671e56e114edfc88d72504dbe..0000000000000000000000000000000000000000
--- a/doc/howto/source/cuda/index.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-CUDA
-====
-
-.. toctree::
-  :maxdepth: 2
-
-  matrix.rst
-  nn.rst
-  utils.rst
diff --git a/doc/howto/source/cuda/matrix.rst b/doc/howto/source/cuda/matrix.rst
deleted file mode 100644
index b7699c83eda15d9003506f5fc57b51d52e7af823..0000000000000000000000000000000000000000
--- a/doc/howto/source/cuda/matrix.rst
+++ /dev/null
@@ -1,59 +0,0 @@
-Matrix
-======
-
-Base
-----
-
-hl_matrix.h
-```````````
-.. doxygenfile:: paddle/cuda/include/hl_matrix.h
-
-hl_matrix_base.h
-````````````````
-.. doxygenfile:: paddle/cuda/include/hl_matrix_base.cuh
-
-hl_matrix_apply.cuh
-```````````````````
-.. doxygenfile:: paddle/cuda/include/hl_matrix_apply.cuh
-
-hl_matrix_ops.cuh
-`````````````````
-.. doxygenfile:: paddle/cuda/include/hl_matrix_ops.cuh
-
-hl_matrix_type.cuh
-``````````````````
-.. doxygenfile:: paddle/cuda/include/hl_matrix_type.cuh
-
-hl_sse_matrix_kernel.cuh
-````````````````````````
-.. doxygenfile:: paddle/cuda/include/hl_sse_matrix_kernel.cuh
-
-Matrix Function 
----------------
-
-hl_batch_transpose.h
-````````````````````
-.. doxygenfile:: paddle/cuda/include/hl_batch_transpose.h
-
-hl_aggregate.h
-``````````````
-.. doxygenfile:: paddle/cuda/include/hl_aggregate.h
-
-hl_top_k.h
-``````````
-.. doxygenfile:: paddle/cuda/include/hl_top_k.h
-
-hl_table_apply.h
-````````````````
-.. doxygenfile:: paddle/cuda/include/hl_table_apply.h
-
-Sparse Matrix
--------------
-
-hl_sparse.h
-```````````
-.. doxygenfile:: paddle/cuda/include/hl_sparse.h
-
-hl_sparse.ph
-````````````
-.. doxygenfile:: paddle/cuda/include/hl_sparse.ph
diff --git a/doc/howto/source/cuda/nn.rst b/doc/howto/source/cuda/nn.rst
deleted file mode 100644
index 5577d01e72a5b22847bda40528c46a28cacc1490..0000000000000000000000000000000000000000
--- a/doc/howto/source/cuda/nn.rst
+++ /dev/null
@@ -1,39 +0,0 @@
-Neural Network
-==============
-
-Base
-----
-
-.. doxygenfile:: paddle/cuda/include/hl_gpu.h
-.. doxygenfile:: paddle/cuda/include/hl_functions.h
-.. doxygenfile:: paddle/cuda/include/hl_avx_functions.h
-.. doxygenfile:: paddle/cuda/include/hl_gpu_functions.cuh
-.. doxygenfile:: paddle/cuda/include/hl_activation_functions.h
-
-
-CNN Related APIs
-----------------
-.. doxygenfile:: paddle/cuda/include/hl_cnn.h
-.. doxygenfile:: paddle/cuda/include/hl_cuda_cudnn.h
-.. doxygenfile:: paddle/cuda/include/hl_cuda_cudnn.ph
-
-RNN Related APIs
-----------------
-
-.. doxygenfile:: paddle/cuda/include/hl_recurrent_apply.cuh
-.. doxygenfile:: paddle/cuda/include/hl_sequence.h
-
-LSTM Model
-``````````
-
-.. doxygenfile:: paddle/cuda/include/hl_lstm.h
-.. dpxygenfile:: paddle/cuda/include/hl_cpu_lstm.cuh
-.. doxygenfile:: paddle/cuda/include/hl_gpu_lstm.cuh
-.. doxygenfile:: paddle/cuda/include/hl_lstm_ops.cuh
-
-GRU Model
-`````````
-
-.. doxygenfile:: paddle/cuda/include/hl_gru_ops.cuh
-.. doxygenfile:: paddle/cuda/include/hl_cpu_gru.cuh
-.. doxygenfile:: paddle/cuda/include/hl_gpu_gru.cuh
diff --git a/doc/howto/source/cuda/utils.rst b/doc/howto/source/cuda/utils.rst
deleted file mode 100644
index 850e8bd1c6670947e2a5f1b6f9b0d5b252117cbf..0000000000000000000000000000000000000000
--- a/doc/howto/source/cuda/utils.rst
+++ /dev/null
@@ -1,37 +0,0 @@
-Utils
-=====
-
-Dynamic Link Libs
------------------
-.. doxygenfile:: paddle/cuda/include/hl_dso_loader.h
-
-GPU Resources
--------------
-
-hl_cuda.ph
-``````````
-.. doxygenfile:: paddle/cuda/include/hl_cuda.ph
-
-hl_cuda.h
-`````````
-.. doxygenfile:: paddle/cuda/include/hl_cuda.h
-
-HPPL Base
----------
-.. doxygenfile:: paddle/cuda/include/hl_base.h
-
-CUBLAS Wrapper
---------------
-.. doxygenfile:: paddle/cuda/include/hl_cuda_cublas.h
-
-Timer
------
-.. doxygenfile:: paddle/cuda/include/hl_time.h
-
-Thread Resource
----------------
-.. doxygenfile:: paddle/cuda/include/hl_thread.ph
-
-Device Function
----------------
-.. doxygenfile:: paddle/cuda/include/hl_device_functions.cuh
diff --git a/doc/howto/source/gserver/activations.rst b/doc/howto/source/gserver/activations.rst
deleted file mode 100644
index 55b9d3be383c07842d7066280cc0e174788db1fb..0000000000000000000000000000000000000000
--- a/doc/howto/source/gserver/activations.rst
+++ /dev/null
@@ -1,5 +0,0 @@
-Activations
-===========
-
-..  doxygenclass:: paddle::ActivationFunction
-    :members:
diff --git a/doc/howto/source/gserver/dataproviders.rst b/doc/howto/source/gserver/dataproviders.rst
deleted file mode 100644
index c30d9d6a36a6fbb664ae001274b6a7b0e721070f..0000000000000000000000000000000000000000
--- a/doc/howto/source/gserver/dataproviders.rst
+++ /dev/null
@@ -1,87 +0,0 @@
-==============
-Data Providers
-==============
-
-DataProviders
-=============
-
-Base
-----
-..  doxygenclass:: paddle::DataProvider
-    :members:
-
-DataProviderGroup
------------------
-..  doxygenclass:: paddle::DataProviderGroup
-    :members:
-
-MultiDataProvider
------------------
-..  doxygenclass:: paddle::MultiDataProvider
-    :members:
-
-PyDataProvider
-==============
-
-IFieldScanner
--------------
-..  doxygenclass:: paddle::IFieldScanner
-    :members:
-
-DenseScanner
--------------
-..  doxygenclass:: paddle::DenseScanner
-    :members:
-
-IndexScanner
--------------
-..  doxygenclass:: paddle::IndexScanner
-    :members:
-
-SparseNonValueScanner
----------------------
-..  doxygenclass:: paddle::SparseNonValueScanner
-    :members:
-
-SparseValueScanner
-------------------
-..  doxygenclass:: paddle::SparseValueScanner
-    :members:
-
-SequenceScanner
----------------
-..  doxygenclass:: paddle::SparseValueScanner
-    :members:
-
-IPyDataProviderCache
---------------------
-..  doxygenclass:: paddle::IPyDataProviderCache
-    :members:
-
-NoCacheStrategy
----------------
-..  doxygenclass:: paddle::NoCacheStrategy
-    :members:
-
-CacheOnePassInMemory
---------------------
-..  doxygenclass:: paddle::CacheOnePassInMemory
-    :members:
-
-IPyDataProvider
----------------
-..  doxygenclass:: paddle::PyDataProvider2
-    :members:
-
-ProtoDataProvider
-=================
-
-ProtoDataProvider
-----------------
-..  doxygenclass:: paddle::ProtoDataProvider
-    :members:
-
-ProtoSequenceDataProvider
--------------------------
-..  doxygenclass:: paddle::ProtoSequenceDataProvider
-    :members:
diff --git a/doc/howto/source/gserver/evaluators.rst b/doc/howto/source/gserver/evaluators.rst
deleted file mode 100644
index f5361f76cd2b1c9c004221c03ea05b2c1f3a652e..0000000000000000000000000000000000000000
--- a/doc/howto/source/gserver/evaluators.rst
+++ /dev/null
@@ -1,103 +0,0 @@
-==========
-Evaluators
-==========
-
-Base
-====
-
-..  doxygenclass:: paddle::Evaluator
-    :members:
-
-Sum
-===
-
-SumEvaluator
-------------
-..  doxygenclass:: paddle::SumEvaluator
-    :members:
-
-ColumnSumEvaluator
-------------------
-..  doxygenclass:: paddle::ColumnSumEvaluator
-    :members:
-
-Classification
-==============
-
-ClassificationErrorEvaluator
----------------------------
-..  doxygenclass:: paddle::ClassificationErrorEvaluator
-    :members:
-
-SequenceClassificationErrorEvaluator
-------------------------------------
-..  doxygenclass:: paddle::SequenceClassificationErrorEvaluator
-    :members:
-
-AucEvaluator
--------------
-..  doxygenclass:: paddle::AucEvaluator
-    :members:
-
-PrecisionRecallEvaluator
-------------------------
-..  doxygenclass:: paddle::PrecisionRecallEvaluator
-    :members:
-
-ChunkEvaluator
---------------
-..  doxygenclass:: paddle::ChunkEvaluator
-    :members:
-
-CTCEvaluator
-------------
-..  doxygenclass:: paddle::CTCErrorEvaluator
-    :members:
-
-
-Rank
-====
-
-PnpairEvaluator
--------------
-..  doxygenclass:: paddle::PnpairEvaluator
-    :members:
-
-AucEvaluator
--------------
-..  doxygenclass:: paddle::RankAucEvaluator
-    :members:
-
-
-Printer
-=======
-
-ValuePrinter
--------------
-..  doxygenclass:: paddle::ValuePrinter
-    :members:
-
-GradientPrinter
----------------
-..  doxygenclass:: paddle::GradientPrinter
-    :members:
-
-MaxIdPrinter
-------------
-..  doxygenclass:: paddle::MaxIdPrinter
-    :members:
-
-MaxFramePrinter
----------------
-..  doxygenclass:: paddle::MaxFramePrinter
-    :members:
-
-SequenceTextPrinter
-------------------
-..  doxygenclass:: paddle::SequenceTextPrinter
-    :members:
-
-ClassificationErrorPrinter
---------------------------
-..  doxygenclass:: paddle::ClassificationErrorPrinter
-    :members:
diff --git a/doc/howto/source/gserver/gradientmachines.rst b/doc/howto/source/gserver/gradientmachines.rst
deleted file mode 100644
index 04c8e91d0316a45ad10b0ed0513d3e8916b7c3d9..0000000000000000000000000000000000000000
--- a/doc/howto/source/gserver/gradientmachines.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-Gradient Machines
-=================
-
-GradientMachine
----------------
-..  doxygenclass:: paddle::GradientMachine
-    :members:
-
-GradientMachineMode
--------------------
-..  doxygenclass:: paddle::IGradientMachineMode
-    :members:
-
-MultiGradientMachine
---------------------
-..  doxygenclass:: paddle::MultiGradientMachine
-    :members:
-
-TrainerThread
-`````````````
-..  doxygenclass:: paddle::TrainerThread
-    :members:
-
-RecurrentGradientMachine
-------------------------
-..  doxygenclass:: paddle::RecurrentGradientMachine
-    :members:
diff --git a/doc/howto/source/gserver/index.rst b/doc/howto/source/gserver/index.rst
deleted file mode 100644
index 223b00b9a9dbf1db40ce702cf0e154e5e53a8644..0000000000000000000000000000000000000000
--- a/doc/howto/source/gserver/index.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-GServer
-=======
-
-.. toctree::
-  :maxdepth: 2
-
-  activations.rst
-  dataproviders.rst
-  evaluators.rst
-  gradientmachines.rst
-  layers.rst
-  neworks.rst
diff --git a/doc/howto/source/gserver/layers.rst b/doc/howto/source/gserver/layers.rst
deleted file mode 100644
index 191b2bdff26ed17437370a12036f9dbb174dae15..0000000000000000000000000000000000000000
--- a/doc/howto/source/gserver/layers.rst
+++ /dev/null
@@ -1,566 +0,0 @@
-======
-Layers
-======
-
-Base
-====
-
-Layer 
------
-..  doxygenclass:: paddle::Layer
-    :members:
-
-Projection
-----------
-..  doxygenclass:: paddle::Projection
-    :members:
-
-Operator
---------
-..  doxygenclass:: paddle::Operator
-    :members:
-    
-Data Layer
-==========
-
-..  doxygenclass:: paddle::DataLayer
-    :members:
-
-Fully Connected Layers
-======================
-
-FullyConnectedLayer
--------------------
-..  doxygenclass:: paddle::FullyConnectedLayer
-    :members:
-
-SelectiveFullyConnectedLayer
-----------------------------
-..  doxygenclass:: paddle::SelectiveFullyConnectedLayer
-    :members:
-
-Conv Layers
-===========
-
-ConvBaseLayer
--------------
-..  doxygenclass:: paddle::ConvBaseLayer
-    :members:
-
-ConvOperator
-------------
-..  doxygenclass:: paddle::ConvOperator
-    :members:
-
-ConvShiftLayer
---------------
-..  doxygenclass:: paddle::ConvShiftLayer
-    :members:
-
-CudnnConvLayer
---------------
-..  doxygenclass:: paddle::CudnnConvLayer
-    :members:
-
-ExpandConvBaseLayer
--------------------
-..  doxygenclass:: paddle::ExpandConvBaseLayer
-    :members:
-
-ExpandConvLayer
----------------
-..  doxygenclass:: paddle::ExpandConvLayer
-    :members:
-
-ContextProjection
------------------
-..  doxygenclass:: paddle::ContextProjection
-    :members:
-
-Pooling Layers
-==============
-
-PoolLayer
----------
-..  doxygenclass:: paddle::PoolLayer
-    :members:
-
-PoolProjectionLayer
--------------------
-..  doxygenclass:: paddle::PoolProjectionLayer
-    :members:
-
-CudnnPoolLayer
---------------
-..  doxygenclass:: paddle::CudnnPoolLayer
-    :members:
-
-SpatialPyramidPoolLayer
------------------------
-..  doxygenclass:: paddle::SpatialPyramidPoolLayer
-    :members:
-
-MaxOutLayer
------------
-..  doxygenclass:: paddle::MaxOutLayer
-    :members:
-
-Norm Layers
-===========
-
-NormLayer
----------
-..  doxygenclass:: paddle::NormLayer
-    :members:
-
-CMRProjectionNormLayer
-----------------------
-..  doxygenclass:: paddle::CMRProjectionNormLayer
-    :members:
-
-DataNormLayer
--------------
-..  doxygenclass:: paddle::DataNormLayer
-    :members:
-
-ResponseNormLayer
------------------
-..  doxygenclass:: paddle::ResponseNormLayer
-    :members:
-
-BatchNormBaseLayer
-------------------
-..  doxygenclass:: paddle::BatchNormBaseLayer
-    :members:
-
-BatchNormalizationLayer
------------------------
-..  doxygenclass:: paddle::BatchNormalizationLayer
-    :members:
-
-CudnnBatchNormLayer
------------------------
-..  doxygenclass:: paddle::CudnnBatchNormLayer
-    :members:
-
-SumToOneNormLayer
------------------
-..  doxygenclass:: paddle::SumToOneNormLayer
-    :members:
-
-Activation Layer
-================
-
-ParameterReluLayer
-------------------
-..  doxygenclass:: paddle::ParameterReluLayer
-    :members:
-
-Recurrent Layers
-================
-
-RecurrentLayer
---------------
-..  doxygenclass:: paddle::RecurrentLayer
-    :members:
-
-SequenceToBatch
----------------
-..  doxygenclass:: paddle::SequenceToBatch
-    :members:
-
-LSTM
-----
-LstmLayer
-`````````
-..  doxygenclass:: paddle::LstmLayer
-    :members:
-
-LstmStepLayer
-`````````````
-..  doxygenclass:: paddle::LstmStepLayer
-    :members:
-
-LstmCompute
-```````````
-..  doxygenclass:: paddle::LstmCompute
-    :members:
-
-MDLSTM
-------
-MDLstmLayer
-```````````
-..  doxygenclass:: paddle::MDLstmLayer
-    :members:
-
-CoordIterator
-`````````````
-..  doxygenclass:: paddle::CoordIterator
-    :members:
-
-GRU
----
-GatedRecurrentLayer
-```````````````````
-..  doxygenclass:: paddle::GatedRecurrentLayer
-    :members:
-
-GruStepLayer
-````````````
-..  doxygenclass:: paddle::GruStepLayer
-    :members:
-
-GruCompute
-``````````
-..  doxygenclass:: paddle::GruCompute
-    :members:
-
-Recurrent Layer Group
-=====================
-
-AgentLayer
-----------
-..  doxygenclass:: paddle::AgentLayer
-    :members:
-
-SequenceAgentLayer
-------------------
-..  doxygenclass:: paddle::SequenceAgentLayer
-    :members:
-
-GatherAgentLayer
-----------------
-..  doxygenclass:: paddle::GatherAgentLayer
-    :members:
-
-SequenceGatherAgentLayer
-------------------------
-..  doxygenclass:: paddle::SequenceGatherAgentLayer
-    :members:
-
-ScatterAgentLayer
------------------
-..  doxygenclass:: paddle::ScatterAgentLayer
-    :members:
-
-SequenceScatterAgentLayer
--------------------------
-..  doxygenclass:: paddle::SequenceScatterAgentLayer
-    :members:
-
-GetOutputLayer
---------------
-..  doxygenclass:: paddle::GetOutputLayer
-    :members:
-
-Mixed Layer
-===========
-..  doxygenclass:: paddle::MixedLayer
-    :members:
-
-DotMulProjection
-----------------
-..  doxygenclass:: paddle::DotMulProjection
-    :members:
-
-DotMulOperator
---------------
-..  doxygenclass:: paddle::DotMulOperator
-    :members:
-
-FullMatrixProjection
---------------------
-..  doxygenclass:: paddle::FullMatrixProjection
-    :members:
-
-IdentityProjection
-------------------
-..  doxygenclass:: paddle::IdentityProjection
-    :members:
-
-IdentityOffsetProjection
-------------------------
-..  doxygenclass:: paddle::IdentityOffsetProjection
-    :members:
-
-TableProjection
----------------
-..  doxygenclass:: paddle::TableProjection
-    :members:
-
-TransposedFullMatrixProjection
-------------------------------
-..  doxygenclass:: paddle::TransposedFullMatrixProjection
-    :members:
-
-Aggregate Layers
-================
-
-Aggregate
----------
-AverageLayer
-````````````
-..  doxygenclass:: paddle::AverageLayer
-    :members:
-
-MaxLayer
-````````
-..  doxygenclass:: paddle::MaxLayer
-    :members:
-
-SequenceLastInstanceLayer
-`````````````````````````
-..  doxygenclass:: paddle::SequenceLastInstanceLayer
-    :members:
-
-Concat
-------
-ConcatenateLayer
-````````````````
-..  doxygenclass:: paddle::ConcatenateLayer
-    :members:
-
-ConcatenateLayer2
-`````````````````
-..  doxygenclass:: paddle::ConcatenateLayer2
-    :members:
-
-SequenceConcatLayer
-```````````````````
-..  doxygenclass:: paddle::SequenceConcatLayer
-    :members:
-
-Subset
-------
-SubSequenceLayer
-````````````````
-..  doxygenclass:: paddle::SubSequenceLayer
-    :members:
-
-Reshaping Layers
-================
-
-BlockExpandLayer
-----------------
-..  doxygenclass:: paddle::BlockExpandLayer
-    :members:
-
-ExpandLayer
------------
-..  doxygenclass:: paddle::ExpandLayer
-    :members:
-
-FeatureMapExpandLayer
----------------------
-..  doxygenclass:: paddle::FeatureMapExpandLayer
-    :members:
-
-ResizeLayer
------------
-..  doxygenclass:: paddle::ResizeLayer
-    :members:
-
-SequenceReshapeLayer
---------------------
-..  doxygenclass:: paddle::SequenceReshapeLayer
-    :members:
-
-Math Layers
-===========
-
-AddtoLayer
-----------
-..  doxygenclass:: paddle::AddtoLayer
-    :members:
-
-ConvexCombinationLayer
-----------------------
-..  doxygenclass:: paddle::ConvexCombinationLayer
-    :members:
-
-InterpolationLayer
-------------------
-..  doxygenclass:: paddle::InterpolationLayer
-    :members:
-
-MultiplexLayer
---------------
-..  doxygenclass:: paddle::MultiplexLayer
-    :members:
-
-OuterProdLayer
---------------
-..  doxygenclass:: paddle::OuterProdLayer
-    :members:
-
-PowerLayer
-----------
-..  doxygenclass:: paddle::PowerLayer
-    :members:
-
-ScalingLayer
-------------
-..  doxygenclass:: paddle::ScalingLayer
-    :members:
-
-SlopeInterceptLayer
--------------------
-..  doxygenclass:: paddle::SlopeInterceptLayer
-    :members:
-
-TensorLayer
-------------
-..  doxygenclass:: paddle::TensorLayer
-    :members:
-
-TransLayer
-----------
-..  doxygenclass:: paddle::TransLayer
-    :members:
-
-Sampling Layers
-===============
-
-BilinearInterpLayer
--------------------
-..  doxygenclass:: paddle::BilinearInterpLayer
-    :members:
-
-MultinomialSampler
-------------------
-..  doxygenclass:: paddle::MultinomialSampler
-    :members:
-
-MaxIdLayer
-----------
-..  doxygenclass:: paddle::MaxIdLayer
-    :members:
-
-SamplingIdLayer
----------------
-..  doxygenclass:: paddle::SamplingIdLayer
-    :members:
-
-Cost Layers
-===========
-
-CostLayer
------------
-..  doxygenclass:: paddle::CostLayer
-    :members:
-
-HuberTwoClass
-`````````````
-..  doxygenclass:: paddle::HuberTwoClass
-    :members:
-
-LambdaCost
-```````````
-..  doxygenclass:: paddle::LambdaCost
-    :members:
-
-MultiBinaryLabelCrossEntropy
-````````````````````````````
-..  doxygenclass:: paddle::MultiBinaryLabelCrossEntropy
-    :members:
-
-MultiClassCrossEntropy
-```````````````````````
-..  doxygenclass:: paddle::MultiClassCrossEntropy
-    :members:
-
-MultiClassCrossEntropyWithSelfNorm
-``````````````````````````````````
-..  doxygenclass:: paddle::MultiClassCrossEntropyWithSelfNorm
-    :members:
-
-RankingCost
-```````````
-..  doxygenclass:: paddle::RankingCost
-    :members:
-
-SoftBinaryClassCrossEntropy
-```````````````````````````
-..  doxygenclass:: paddle::SoftBinaryClassCrossEntropy
-    :members:
-
-SumOfSquaresCostLayer
-`````````````````````
-..  doxygenclass:: paddle::SumOfSquaresCostLayer
-    :members:
-
-SumCostLayer
-`````````````````````
-..  doxygenclass:: paddle::SumCostLayer
-    :members:
-
-CosSimLayer
------------
-..  doxygenclass:: paddle::CosSimLayer
-    :members:
-
-CosSimVecMatLayer
------------------
-..  doxygenclass:: paddle::CosSimVecMatLayer
-    :members:
-
-CRFDecodingLayer
-----------------
-..  doxygenclass:: paddle::CRFDecodingLayer
-    :members:
-
-CRFLayer
---------
-..  doxygenclass:: paddle::CRFLayer
-    :members:
-
-CTCLayer
---------
-..  doxygenclass:: paddle::CTCLayer
-    :members:
-
-HierarchicalSigmoidLayer
-------------------------
-..  doxygenclass:: paddle::HierarchicalSigmoidLayer
-    :members:
-
-LinearChainCRF
---------------
-..  doxygenclass:: paddle::LinearChainCRF
-    :members:
-
-LinearChainCTC
---------------
-..  doxygenclass:: paddle::LinearChainCTC
-    :members:
-
-NCELayer
---------
-..  doxygenclass:: paddle::NCELayer
-    :members:
-
-Validation Layers
------------------
-
-ValidationLayer
-```````````````
-..  doxygenclass:: paddle::ValidationLayer
-    :members:
-
-AucValidation
-`````````````
-..  doxygenclass:: paddle::AucValidation
-    :members:
-
-PnpairValidation
-````````````````
-..  doxygenclass:: paddle::PnpairValidation
-    :members:
-
-Check Layers
-============
-
-EosIdCheckLayer
----------------
-..  doxygenclass:: paddle::EosIdCheckLayer
-    :members:
diff --git a/doc/howto/source/gserver/neworks.rst b/doc/howto/source/gserver/neworks.rst
deleted file mode 100644
index 73fb60d549cc88f61d2e2d18c9ec31c37cf4fa9a..0000000000000000000000000000000000000000
--- a/doc/howto/source/gserver/neworks.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-Networks
-========
-
-NeuralNetwork
--------------
-..  doxygenclass:: paddle::NeuralNetwork
-    :members:
-
-ParallelNeuralNetwork
----------------------
-..  doxygenclass:: paddle::ParallelNeuralNetwork
-    :members:
diff --git a/doc/howto/source/index.rst b/doc/howto/source/index.rst
deleted file mode 100644
index 36323c888ee65147f59f28160dc26ca29235ba63..0000000000000000000000000000000000000000
--- a/doc/howto/source/index.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-Source Code Documents
-=====================
-
-.. toctree::
-  :maxdepth: 1
-
-  gserver/index.rst
-  trainer.rst
-  parameter/index.rst
-  pserver/index.rst
-  api.rst
-  cuda/index.rst
-  math/index.rst
-  utils/index.rst
diff --git a/doc/howto/source/math/functions.rst b/doc/howto/source/math/functions.rst
deleted file mode 100644
index aef12e0f005226c6d40d74d0e858a11585339758..0000000000000000000000000000000000000000
--- a/doc/howto/source/math/functions.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-Functions
-=========
-
-MathFunctions
--------------
-.. doxygenfile:: paddle/math/MathFunctions.h
-
-SIMDFunctions
--------------
-.. doxygenfile:: paddle/math/SIMDFunctions.h
diff --git a/doc/howto/source/math/index.rst b/doc/howto/source/math/index.rst
deleted file mode 100644
index 2ec16f2b4450c870f9590aea4ad4ca7dc415b75d..0000000000000000000000000000000000000000
--- a/doc/howto/source/math/index.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-Math
-====
-
-.. toctree::
-  :maxdepth: 2
-
-  vector.rst
-  matrix.rst
-  functions.rst
-  utils.rst
diff --git a/doc/howto/source/math/matrix.rst b/doc/howto/source/math/matrix.rst
deleted file mode 100644
index 9bb20f618d229e1baea15e26378bf40d7c6e1783..0000000000000000000000000000000000000000
--- a/doc/howto/source/math/matrix.rst
+++ /dev/null
@@ -1,76 +0,0 @@
-Matrix
-======
-
-Base
-----
-
-BaseMatrix Template
-```````````````````
-..  doxygenclass:: paddle::BaseMatrixT
-    :members:
-
-Matrix
-``````
-..  doxygenclass:: paddle::Matrix
-    :members:
-
-MatrixOffset
-````````````
-..  doxygenclass:: paddle::MatrixOffset
-    :members:
-
-CpuMatrix
----------
-
-CpuMatrix
-`````````
-..  doxygenclass:: paddle::CpuMatrix
-    :members:
-
-SharedCpuMatrix
-```````````````
-..  doxygenclass:: paddle::SharedCpuMatrix
-    :members:
-
-GpuMatrix
----------
-..  doxygenclass:: paddle::GpuMatrix
-    :members:
-
-CpuSparseMatrix
----------------
-
-CpuSparseMatrix
-```````````````
-..  doxygenclass:: paddle::CpuSparseMatrix
-    :members:
-
-SparseRowCpuMatrix
-``````````````````
-..  doxygenclass:: paddle::SparseRowCpuMatrix
-    :members:
-
-SparseAutoGrowRowCpuMatrix
-``````````````````````````
-..  doxygenclass:: paddle::SparseAutoGrowRowCpuMatrix
-    :members:
-
-SparsePrefetchRowCpuMatrix
-``````````````````````````
-..  doxygenclass:: paddle::SparsePrefetchRowCpuMatrix
-    :members:
-
-SparseRowIdsCpuMatrix
-`````````````````````
-..  doxygenclass:: paddle::SparseRowIdsCpuMatrix
-    :members:
-
-CacheRowCpuMatrix
-`````````````````
-..  doxygenclass:: paddle::CacheRowCpuMatrix
-    :members:
-
-GpuSparseMatrix
----------------
-..  doxygenclass:: paddle::GpuSparseMatrix
-    :members:
diff --git a/doc/howto/source/math/utils.rst b/doc/howto/source/math/utils.rst
deleted file mode 100644
index 55d9961a390c205563a9ae4fbd87ac4ae90fc314..0000000000000000000000000000000000000000
--- a/doc/howto/source/math/utils.rst
+++ /dev/null
@@ -1,18 +0,0 @@
-Memory Manager
-==============
-
-Memory Handle
--------------
-.. doxygenfile:: paddle/math/MemoryHandle.h
-
-Allocator
----------
-.. doxygenfile:: paddle/math/Allocator.h
-
-PoolAllocator
-`````````````
-.. doxygenfile:: paddle/math/PoolAllocator.h
-
-Storage
--------
-.. doxygenfile:: paddle/math/Storage.h
diff --git a/doc/howto/source/math/vector.rst b/doc/howto/source/math/vector.rst
deleted file mode 100644
index 07f7062abaf4f30b8967b594f4e16ab881f5414f..0000000000000000000000000000000000000000
--- a/doc/howto/source/math/vector.rst
+++ /dev/null
@@ -1,37 +0,0 @@
-Vector
-======
-
-BaseVector
-``````````
-..  doxygenclass:: paddle::BaseVector
-    :members:
-
-Vector Template
-```````````````
-..  doxygenclass:: paddle::VectorT
-    :members:
-
-CpuVector Template
-``````````````````
-..  doxygenclass:: paddle::CpuVectorT
-    :members:
-
-GpuVector Template
-``````````````````
-..  doxygenclass:: paddle::GpuVectorT
-    :members:
-
-ParallelCpuVector Template
-``````````````````````````
-..  doxygenclass:: paddle::ParallelCpuVectorT
-    :members:
-
-ParallelGpuVector Template
-``````````````````````````
-..  doxygenclass:: paddle::ParallelGpuVectorT
-    :members:
-
-CpuGpuVector Template
-`````````````````````
-..  doxygenclass:: paddle::CpuGpuVectorT
-    :members:
diff --git a/doc/howto/source/parameter/index.rst b/doc/howto/source/parameter/index.rst
deleted file mode 100644
index 3bf6948dc3478574d8d125d8461235f8827e4e42..0000000000000000000000000000000000000000
--- a/doc/howto/source/parameter/index.rst
+++ /dev/null
@@ -1,9 +0,0 @@
-Parameter
-=========
-
-.. toctree::
-  :maxdepth: 2
-
-  parameter.rst
-  optimizer.rst
-  updater.rst
diff --git a/doc/howto/source/parameter/optimizer.rst b/doc/howto/source/parameter/optimizer.rst
deleted file mode 100644
index b5b8b850b349d547c9e5508d3ebec3d7e00ea310..0000000000000000000000000000000000000000
--- a/doc/howto/source/parameter/optimizer.rst
+++ /dev/null
@@ -1,22 +0,0 @@
-Optimizer
-=========
-
-ParameterOptimizer
-------------------
-.. doxygenfile:: paddle/parameter/ParameterOptimizer.h
-
-Regularizer
------------
-.. doxygenfile:: paddle/parameter/Regularizer.h
-
-FirstOrderOptimizer
--------------------
-.. doxygenfile:: paddle/parameter/FirstOrderOptimizer.h
-
-AverageOptimizer
-----------------
-.. doxygenfile:: paddle/parameter/AverageOptimizer.h
-
-OptimizerWithRegularizer
-------------------------
-.. doxygenfile:: paddle/parameter/OptimizerWithRegularizer.h
diff --git a/doc/howto/source/parameter/parameter.rst b/doc/howto/source/parameter/parameter.rst
deleted file mode 100644
index 2daa62d4e63b952cd93bba35ee32ce35ce768a0d..0000000000000000000000000000000000000000
--- a/doc/howto/source/parameter/parameter.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-Parameter
-=========
-
-Parameter
----------
-.. doxygenfile:: paddle/parameter/Argument.h
-.. doxygenfile:: paddle/parameter/Parameter.h
-.. doxygenfile:: paddle/parameter/ParallelParameter.h
-
-Weight
-------
-.. doxygenfile:: paddle/parameter/Weight.h
diff --git a/doc/howto/source/parameter/updater.rst b/doc/howto/source/parameter/updater.rst
deleted file mode 100644
index dfa22e8e7d1d6f0713974835de93194d2cc58e6f..0000000000000000000000000000000000000000
--- a/doc/howto/source/parameter/updater.rst
+++ /dev/null
@@ -1,14 +0,0 @@
-Updater
-=======
-
-Base
-----
-.. doxygenfile:: paddle/parameter/ParameterUpdaterBase.h
-
-Hook
-----
-.. doxygenfile:: paddle/parameter/ParameterUpdaterHook.h
-
-Functions
----------
-.. doxygenfile:: paddle/parameter/ParameterUpdateFunctions.h
diff --git a/doc/howto/source/pserver/client.rst b/doc/howto/source/pserver/client.rst
deleted file mode 100644
index e5bba0706a1d919104b85e23861ba490a2c828db..0000000000000000000000000000000000000000
--- a/doc/howto/source/pserver/client.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-Client
-======
-
-BaseClient
-----------
-..  doxygenclass:: paddle::BaseClient
-    :members:
-
-ParameterClient2
-----------------
-..  doxygenclass:: paddle::ParameterClient2
-    :members:
diff --git a/doc/howto/source/pserver/index.rst b/doc/howto/source/pserver/index.rst
deleted file mode 100644
index 0031e9476bd063511cc2f0a8c209f35627cf44ba..0000000000000000000000000000000000000000
--- a/doc/howto/source/pserver/index.rst
+++ /dev/null
@@ -1,10 +0,0 @@
-PServer
-=======
-
-.. toctree::
-  :maxdepth: 2
-
-  client.rst
-  network.rst
-  server.rst
-  utils.rst
diff --git a/doc/howto/source/pserver/network.rst b/doc/howto/source/pserver/network.rst
deleted file mode 100644
index 7004c9d91fa9f2af11e15791ef682c108761027e..0000000000000000000000000000000000000000
--- a/doc/howto/source/pserver/network.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-Network
-=======
-
-SocketServer
-------------
-..  doxygenclass:: paddle::SocketServer
-    :members:
-
-SocketWorker
-------------
-..  doxygenclass:: paddle::SocketWorker
-    :members:
-
-SocketClient
-------------
-..  doxygenclass:: paddle::SocketClient
-    :members:
-
-SocketChannel
--------------
-..  doxygenclass:: paddle::SocketChannel
-    :members:
-
-MessageReader
--------------
-..  doxygenclass:: paddle::MsgReader
-    :members:
diff --git a/doc/howto/source/pserver/server.rst b/doc/howto/source/pserver/server.rst
deleted file mode 100644
index 35301acf8ffe3d97e6124c37cf8fe1b43071e14e..0000000000000000000000000000000000000000
--- a/doc/howto/source/pserver/server.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-Server
-======
-
-ProtoServer
------------
-..  doxygenclass:: paddle::ProtoServer
-    :members:
-
-ParameterServer2
-----------------
-..  doxygenclass:: paddle::ParameterServer2
-    :members:
diff --git a/doc/howto/source/trainer.rst b/doc/howto/source/trainer.rst
deleted file mode 100644
index 85f1feb4fc941f94e65a6b1d037445d2367f65ec..0000000000000000000000000000000000000000
--- a/doc/howto/source/trainer.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-Trainer
-=======
-
-TrainerStats
-------------
-
-..  doxygenclass:: paddle::TrainerStats
-    :members:
-
-RemoteParameterUpdater
------------------------
-
-..  doxygenclass:: paddle::RemoteParameterUpdater
-    :members:
-
-ConcurrentRemoteParameterUpdater
---------------------------------
-
-..  doxygenclass:: paddle::ConcurrentRemoteParameterUpdater
-    :members:
-
-SparseRemoteParameterUpdater
-----------------------------
-
-..  doxygenclass:: paddle::SparseRemoteParameterUpdater
-    :members:
-
-SparseRemoteParameterUpdaterComposite
--------------------------------------
-
-..  doxygenclass:: paddle::SparseRemoteParameterUpdaterComposite
-    :members:
diff --git a/doc/howto/source/utils/customStackTrace.rst b/doc/howto/source/utils/customStackTrace.rst
deleted file mode 100644
index cdc8930739eb4b4d6308ff1fbce170d2977d42e8..0000000000000000000000000000000000000000
--- a/doc/howto/source/utils/customStackTrace.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-CustomStackTrace
-================
-..  doxygenclass:: paddle::CustomStackTrace
-    :members:
diff --git a/doc/howto/source/utils/enum.rst b/doc/howto/source/utils/enum.rst
deleted file mode 100644
index e0da75afe164f9dab59b862faa7230fc57423e50..0000000000000000000000000000000000000000
--- a/doc/howto/source/utils/enum.rst
+++ /dev/null
@@ -1,3 +0,0 @@
-Enumeration wrapper
-===================
-..  doxygennamespace:: paddle::enumeration_wrapper
diff --git a/doc/howto/source/utils/index.rst b/doc/howto/source/utils/index.rst
deleted file mode 100644
index 7ddc47d1726f7627852be922d2b769d0752aa799..0000000000000000000000000000000000000000
--- a/doc/howto/source/utils/index.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-Utils
-=====
-
-.. toctree::
-  :maxdepth: 2
-
-  lock.rst
-  queue.rst
-  thread.rst
-  customStackTrace.rst
-  enum.rst
diff --git a/doc/howto/source/utils/lock.rst b/doc/howto/source/utils/lock.rst
deleted file mode 100644
index f011acb9431f0f3dc3b2ba27fcfe71fe6eb07ae9..0000000000000000000000000000000000000000
--- a/doc/howto/source/utils/lock.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-Lock
-====
-
-RWLock
-------
-..  doxygenclass:: paddle::RWLock
-    :members:
-
-ReadLockGuard
--------------
-..  doxygenclass:: paddle::ReadLockGuard
-    :members:
-
-SpinLock
---------
-..  doxygenclass:: paddle::SpinLock
-    :members:
-
-Semaphore
----------
-..  doxygenclass:: paddle::Semaphore
-    :members:
-
-ThreadBarrier
--------------
-..  doxygenclass:: paddle::ThreadBarrier
-    :members:
-
-LockedCondition
----------------
-..  doxygenclass:: paddle::LockedCondition
-    :members:
diff --git a/doc/howto/source/utils/queue.rst b/doc/howto/source/utils/queue.rst
deleted file mode 100644
index 98192648e2d61e622c2337d10ba024dd676ee685..0000000000000000000000000000000000000000
--- a/doc/howto/source/utils/queue.rst
+++ /dev/null
@@ -1,12 +0,0 @@
-Queue
-=====
-
-Queue
------
-..  doxygenclass:: paddle::Queue
-    :members:
-
-BlockingQueue 
--------------
-..  doxygenclass:: paddle::BlockingQueue 
-    :members:
diff --git a/doc/howto/source/utils/thread.rst b/doc/howto/source/utils/thread.rst
deleted file mode 100644
index 23d379a9894e5fc22bc6795a480a53d768e608e6..0000000000000000000000000000000000000000
--- a/doc/howto/source/utils/thread.rst
+++ /dev/null
@@ -1,27 +0,0 @@
-Thread
-======
-
-Thread 
-------
-..  doxygenclass:: paddle::Thread
-    :members:
-
-ThreadWorker
-------------
-..  doxygenclass:: paddle::ThreadWorker
-    :members:
-
-SyncThreadPool 
---------------
-..  doxygenclass:: paddle::SyncThreadPool 
-    :members:
-    
-MultiThreadWorker 
------------------
-..  doxygenclass:: paddle::MultiThreadWorker 
-    :members:
-
-AsyncThreadPool 
----------------
-..  doxygenclass:: paddle::AsyncThreadPool
-    :members:
diff --git a/doc/howto/cluster/cluster_train_en.md b/doc/howto/usage/cluster/cluster_train_en.md
similarity index 99%
rename from doc/howto/cluster/cluster_train_en.md
rename to doc/howto/usage/cluster/cluster_train_en.md
index 1de34a6a99440bf45af8b1fec2c7a2361865fed3..2fd24e532e3b8cb7572e1d4c2e5acbb5d57bc567 100644
--- a/doc/howto/cluster/cluster_train_en.md
+++ b/doc/howto/usage/cluster/cluster_train_en.md
@@ -1,4 +1,4 @@
-# How to Run Distributed Training
+# Run Distributed Training
 
 In this article, we explain how to run distributed Paddle training jobs on clusters.  We will create the distributed version of the single-process training example, [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation).
 
diff --git a/doc_cn/cluster/k8s/Dockerfile b/doc/howto/usage/cluster/k8s/Dockerfile
similarity index 100%
rename from doc_cn/cluster/k8s/Dockerfile
rename to doc/howto/usage/cluster/k8s/Dockerfile
diff --git a/doc_cn/cluster/k8s/job.yaml b/doc/howto/usage/cluster/k8s/job.yaml
similarity index 99%
rename from doc_cn/cluster/k8s/job.yaml
rename to doc/howto/usage/cluster/k8s/job.yaml
index 1e0ac464b2ec71e98c28f090124690b01b0755ce..488aad0bede4f940b25c7be04259f209c3de9f52 100644
--- a/doc_cn/cluster/k8s/job.yaml
+++ b/doc/howto/usage/cluster/k8s/job.yaml
@@ -40,4 +40,4 @@ spec:
         - name: jobpath
           mountPath: /home/jobpath       
       restartPolicy: Never
-    
\ No newline at end of file
+    
diff --git a/doc_cn/cluster/k8s/k8s-paddle-arch.png b/doc/howto/usage/cluster/k8s/k8s-paddle-arch.png
similarity index 100%
rename from doc_cn/cluster/k8s/k8s-paddle-arch.png
rename to doc/howto/usage/cluster/k8s/k8s-paddle-arch.png
diff --git a/doc_cn/build_and_install/paddle_on_kubernetes.md b/doc/howto/usage/cluster/k8s/k8s_cn.md
similarity index 99%
rename from doc_cn/build_and_install/paddle_on_kubernetes.md
rename to doc/howto/usage/cluster/k8s/k8s_cn.md
index f8c9f19a9fef50c03f6ffee639a580adbf29844a..2575701053ca12cc3af45682af6cd682a88bb987 100644
--- a/doc_cn/build_and_install/paddle_on_kubernetes.md
+++ b/doc/howto/usage/cluster/k8s/k8s_cn.md
@@ -1,4 +1,4 @@
-# Paddle On Kubernetes：单机训练
+# Kubernetes 单机训练
 
 在这篇文档里，我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的Paddle训练作业。在下一篇中，我们将介绍如何启动分布式训练作业。
 
diff --git a/doc_cn/cluster/k8s/distributed_training_on_kubernetes.md b/doc/howto/usage/cluster/k8s/k8s_distributed_cn.md
similarity index 99%
rename from doc_cn/cluster/k8s/distributed_training_on_kubernetes.md
rename to doc/howto/usage/cluster/k8s/k8s_distributed_cn.md
index d9ed431ec0566cf90f11ebaeec56560ff69e71fe..d4d01f2759bd89a3448ed12ee7fd24a091217e47 100644
--- a/doc_cn/cluster/k8s/distributed_training_on_kubernetes.md
+++ b/doc/howto/usage/cluster/k8s/k8s_distributed_cn.md
@@ -1,5 +1,4 @@
-
-# PaddlePaddle on Kubernetes：分布式训练
+# Kubernetes 分布式训练
 
 前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里，我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练，文章 [Cluster Training](https://github.com/baidu/Paddle/blob/develop/doc/cluster/opensource/cluster_train.md)介绍了一种通过SSH远程分发任务，进行分布式训练的方法，与此不同的是，本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群，进行分布式训练的方案。
 
@@ -306,4 +305,4 @@ I1116 09:10:18.019069    50 ParameterClient2.cpp:122] pserver 2 192.168.223.143:
 I1116 09:10:18.019492    50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:7165
 I1116 09:10:18.019716    50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164
 I1116 09:10:18.019836    50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165
-```
\ No newline at end of file
+```
diff --git a/doc_cn/cluster/k8s/start.sh b/doc/howto/usage/cluster/k8s/start.sh
similarity index 100%
rename from doc_cn/cluster/k8s/start.sh
rename to doc/howto/usage/cluster/k8s/start.sh
diff --git a/doc_cn/cluster/k8s/start_paddle.py b/doc/howto/usage/cluster/k8s/start_paddle.py
similarity index 97%
rename from doc_cn/cluster/k8s/start_paddle.py
rename to doc/howto/usage/cluster/k8s/start_paddle.py
index 6a461614101aa74f3badf67e65c0d6fcb985ee9b..df00d82919faa2acecc79c28e3d773ba3de9672a 100755
--- a/doc_cn/cluster/k8s/start_paddle.py
+++ b/doc/howto/usage/cluster/k8s/start_paddle.py
@@ -19,7 +19,6 @@ import socket
 import os
 import argparse
 
-
 # configuration for cluster
 API = "/api/v1/namespaces/"
 JOBSELECTOR = "labelSelector=job-name="
@@ -145,8 +144,8 @@ def startPaddle(idMap={}, train_args_dict=None):
 
 
 if __name__ == '__main__':
-    parser = argparse.ArgumentParser(prog="start_paddle.py",
-                                     description='simple tool for k8s')
+    parser = argparse.ArgumentParser(
+        prog="start_paddle.py", description='simple tool for k8s')
     args, train_args_list = parser.parse_known_args()
     train_args = refine_unknown_args(train_args_list)
     train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
diff --git a/doc/howto/cmd_parameter/arguments_en.md b/doc/howto/usage/cmd_parameter/arguments_en.md
similarity index 100%
rename from doc/howto/cmd_parameter/arguments_en.md
rename to doc/howto/usage/cmd_parameter/arguments_en.md
diff --git a/doc/howto/cmd_parameter/detail_introduction_en.md b/doc/howto/usage/cmd_parameter/detail_introduction_en.md
similarity index 99%
rename from doc/howto/cmd_parameter/detail_introduction_en.md
rename to doc/howto/usage/cmd_parameter/detail_introduction_en.md
index 510396b629e398cef2ccda2f1cec474160693219..27b2faf1d8a9367ff9498a76d363791ab7fbe61c 100644
--- a/doc/howto/cmd_parameter/detail_introduction_en.md
+++ b/doc/howto/usage/cmd_parameter/detail_introduction_en.md
@@ -1,3 +1,7 @@
+```eval_rst
+..  _cmd_detail_introduction:
+```
+
 # Detail Description
 
 ## Common
diff --git a/doc/howto/cmd_parameter/index_en.md b/doc/howto/usage/cmd_parameter/index_en.md
similarity index 62%
rename from doc/howto/cmd_parameter/index_en.md
rename to doc/howto/usage/cmd_parameter/index_en.md
index bd16affdd8ceaf72250acb7c1411c315334f07ba..2a96e7e976c43fd69befccd78753cee431ef61bc 100644
--- a/doc/howto/cmd_parameter/index_en.md
+++ b/doc/howto/usage/cmd_parameter/index_en.md
@@ -1,4 +1,7 @@
-# How to Set Command-line Parameters
+```eval_rst
+..  _cmd_line_index:
+```
+# Set Command-line Parameters
 
 * [Use Case](use_case_en.md)
 * [Arguments](arguments_en.md)
diff --git a/doc/howto/cmd_parameter/use_case_en.md b/doc/howto/usage/cmd_parameter/use_case_en.md
similarity index 100%
rename from doc/howto/cmd_parameter/use_case_en.md
rename to doc/howto/usage/cmd_parameter/use_case_en.md
diff --git a/doc_cn/concepts/pserver_topology.dot b/doc/howto/usage/concepts/src/pserver_topology.dot
similarity index 100%
rename from doc_cn/concepts/pserver_topology.dot
rename to doc/howto/usage/concepts/src/pserver_topology.dot
diff --git a/doc_cn/concepts/trainer_config.py b/doc/howto/usage/concepts/src/trainer_config.py
similarity index 100%
rename from doc_cn/concepts/trainer_config.py
rename to doc/howto/usage/concepts/src/trainer_config.py
diff --git a/doc_cn/concepts/use_concepts.rst b/doc/howto/usage/concepts/use_concepts_cn.rst
similarity index 89%
rename from doc_cn/concepts/use_concepts.rst
rename to doc/howto/usage/concepts/use_concepts_cn.rst
index 2d27e29fac37d54e4a31540cf75361464f51b193..77ba76441910b0696188ad6fa577b92c47129499 100644
--- a/doc_cn/concepts/use_concepts.rst
+++ b/doc/howto/usage/concepts/use_concepts_cn.rst
@@ -1,6 +1,6 @@
-#########################
-PaddlePaddle 基本使用概念
-#########################
+############
+基本使用概念
+############
 
 PaddlePaddle是一个深度学习框架，支持单机模式和多机模式。
 
@@ -8,29 +8,29 @@ PaddlePaddle是一个深度学习框架，支持单机模式和多机模式。
 
 本文首先介绍trainer进程中的一些使用概念，然后介绍pserver进程中概念。
 
-..	contents::
+..    contents::
 
 系统框图
 ========
 
 下图描述了用户使用框图，PaddlePaddle的trainer进程里内嵌了Python解释器，trainer进程可以利用这个解释器执行Python脚本，Python脚本里定义了模型配置、训练算法、以及数据读取函数。其中，数据读取程序往往定义在一个单独Python脚本文件里，被称为数据提供器（DataProvider），通常是一个Python函数。模型配置、训练算法通常定义在另一单独Python文件中, 称为训练配置文件。下面将分别介绍这两部分。
 
-..	graphviz:: 
-
-	digraph pp_process {
-		rankdir=LR;
-		config_file [label="用户神经网络配置"];
-		subgraph cluster_pp {
-			style=filled;
-			color=lightgrey;
-			node [style=filled, color=white, shape=box];
-			label = "PaddlePaddle C++";
-			py [label="Python解释器"];
-		}
-		data_provider [label="用户数据解析"];
-		config_file -> py;
-		py -> data_provider [dir="back"];
-	}
+..    graphviz:: 
+
+    digraph pp_process {
+        rankdir=LR;
+        config_file [label="用户神经网络配置"];
+        subgraph cluster_pp {
+            style=filled;
+            color=lightgrey;
+            node [style=filled, color=white, shape=box];
+            label = "PaddlePaddle C++";
+            py [label="Python解释器"];
+        }
+        data_provider [label="用户数据解析"];
+        config_file -> py;
+        py -> data_provider [dir="back"];
+    }
 
 数据提供器
 ==========
@@ -47,7 +47,7 @@ DataProvider是PaddlePaddle系统的数据提供器，将用户的原始数据
 
 一个简单的训练配置文件为：
 
-..  literalinclude:: trainer_config.py
+..  literalinclude:: src/trainer_config.py
     :linenos:
 
 文件开头 ``from paddle.trainer_config_helpers import *`` ，是因为PaddlePaddle配置文件与C++模块通信的最基础协议是protobuf，为了避免用户直接写复杂的protobuf string，我们为用户定以Python接口来配置网络，该Python代码可以生成protobuf包，这就是`trainer_config_helpers`_的作用。因此，在文件的开始，需要import这些函数。 这个包里面包含了模型配置需要的各个模块。
@@ -100,11 +100,11 @@ DataProvider是PaddlePaddle系统的数据提供器，将用户的原始数据
 
 例如，和 ``fc_layer`` 同样功能的 ``mixed_layer`` 是:
 
-..	code-block:: python
+..    code-block:: python
    
-   	data = data_layer(name='data', size=200)
-   	with mixed_layer(size=200) as out:
-   		out += full_matrix_projection(input=data)
+       data = data_layer(name='data', size=200)
+       with mixed_layer(size=200) as out:
+           out += full_matrix_projection(input=data)
 
 PaddlePaddle 可以使用 ``mixed layer`` 配置出非常复杂的网络，甚至可以直接配置一个完整的LSTM。用户可以参考 `mixed_layer`_ 的相关文档进行配置。
 
@@ -114,13 +114,13 @@ PaddlePaddle 可以使用 ``mixed layer`` 配置出非常复杂的网络，甚
 
 PaddlePaddle多机采用经典的 Parameter Server 架构对多个节点的 trainer 进行同步。多机训练的经典拓扑结构如下\:
 
-..	graphviz:: pserver_topology.dot
+..    graphviz:: src/pserver_topology.dot
 
 图中每个灰色方块是一台机器，在每个机器中，先使用命令 ``paddle pserver`` 启动一个pserver进程，并指定端口号，可能的参数是\:
 
-..	code-block:: bash
+..    code-block:: bash
 
-	paddle pserver --port=5000 --num_gradient_servers=4 --tcp_rdma='tcp' --nics='eth0'
+    paddle pserver --port=5000 --num_gradient_servers=4 --tcp_rdma='tcp' --nics='eth0'
 
 * ``--port=5000`` : 指定 pserver 进程端口是 5000 。
 * ``--gradient_servers=4`` : 有四个训练进程(PaddlePaddle 将 trainer 也称作 GradientServer ，因为其为负责提供Gradient) 。
@@ -128,9 +128,9 @@ PaddlePaddle多机采用经典的 Parameter Server 架构对多个节点的 trai
 
 启动之后 pserver 进程之后，需要启动 trainer 训练进程，在各个机器上运行如下命令\:
 
-..	code-block:: bash
+..    code-block:: bash
 
-	paddle train --port=5000 --pservers=192.168.100.101,192.168.100.102,192.168.100.103,192.168.100.104 --config=...
+    paddle train --port=5000 --pservers=192.168.100.101,192.168.100.102,192.168.100.103,192.168.100.104 --config=...
 
 对于简单的多机协同训练使用上述方式即可。另外，pserver/train 通常在高级情况下，还需要设置下面两个参数\：
 
diff --git a/doc/index_cn.rst b/doc/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..460fedb5658a8ea9bbe8b602ee2b5df66502fa62
--- /dev/null
+++ b/doc/index_cn.rst
@@ -0,0 +1,11 @@
+PaddlePaddle 文档
+======================
+
+..  toctree::
+  :maxdepth: 1
+
+  getstarted/index_cn.rst
+  tutorials/index_cn.md
+  howto/index_cn.rst
+  api/index_cn.rst
+  faq/index_cn.rst
diff --git a/doc/index.rst b/doc/index_en.rst
similarity index 88%
rename from doc/index.rst
rename to doc/index_en.rst
index c107239438b038fb6a4a6123e9b61f424b60142f..1d9cca7de720ebc23fe816f32d158930d91c07e7 100644
--- a/doc/index.rst
+++ b/doc/index_en.rst
@@ -8,4 +8,5 @@ PaddlePaddle Documentation
   tutorials/index_en.md
   howto/index_en.rst
   api/index_en.rst
-  about/index_en.rst 
+  about/index_en.rst
+ 
\ No newline at end of file
diff --git a/doc_cn/conf.py.in b/doc/templates/conf.py.cn.in
similarity index 98%
rename from doc_cn/conf.py.in
rename to doc/templates/conf.py.cn.in
index 4f3afb814f1e779a711e3535da1f8853aa0d97c6..418d718fbd9c61bff3acb9c2dab0638c0b650bab 100644
--- a/doc_cn/conf.py.in
+++ b/doc/templates/conf.py.cn.in
@@ -62,7 +62,7 @@ source_suffix = ['.rst', '.md', '.Rmd']
 source_encoding = 'utf-8'
 
 # The master toctree document.
-master_doc = 'index'
+master_doc = 'index_cn'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -79,7 +79,7 @@ language = 'zh_CN'
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
-exclude_patterns = ['_build']
+exclude_patterns = ['_build', '**/*_en*', '*_en*']
 
 # The reST default role (used for this markup: `text`) to use for all
 # documents.
diff --git a/doc/conf.py.in b/doc/templates/conf.py.en.in
similarity index 97%
rename from doc/conf.py.in
rename to doc/templates/conf.py.en.in
index 5fb307e3a9b572f14789dec3707611f336a5d44f..e96c25cb75bee20d2e2949423d80ddab1d3450a1 100644
--- a/doc/conf.py.in
+++ b/doc/templates/conf.py.en.in
@@ -63,7 +63,7 @@ source_suffix = ['.rst', '.md', '.Rmd']
 source_encoding = 'utf-8'
 
 # The master toctree document.
-master_doc = 'index'
+master_doc = 'index_en'
 
 # The language for content autogenerated by Sphinx. Refer to documentation
 # for a list of supported languages.
@@ -80,7 +80,7 @@ language = None
 
 # List of patterns, relative to source directory, that match files and
 # directories to ignore when looking for source files.
-exclude_patterns = ['_build']
+exclude_patterns = ['_build', '**/*_cn*', '*_cn*']
 
 # The reST default role (used for this markup: `text`) to use for all
 # documents.
@@ -144,5 +144,6 @@ def setup(app):
     # no c++ API for now
     app.add_config_value('recommonmark_config', {
             'url_resolver': lambda url: github_doc_root + url,
+        'enable_eval_rst': True,
             }, True)
     app.add_transform(AutoStructify)
diff --git a/doc/tutorials/embedding_model/index_en.md b/doc/tutorials/embedding_model/index_en.md
index 06f3ff1f009e470cdb9687658613a76acbb79751..d793a50f488e464bcd90a2fb506a8dcc3c760433 100644
--- a/doc/tutorials/embedding_model/index_en.md
+++ b/doc/tutorials/embedding_model/index_en.md
@@ -93,7 +93,7 @@ where `train.sh` is almost the same as `demo/seqToseq/translation/train.sh`, the
 - `--init_model_path`: path of the initialization model, here is `data/paraphrase_model`
 - `--load_missing_parameter_strategy`: operations when model file is missing, here use a normal distibution to initialize the other parameters except for the embedding layer
 
-For users who want to understand the dataset format, model architecture and training procedure in detail, please refer to [Text generation Tutorial](../text_generation/text_generation.md).
+For users who want to understand the dataset format, model architecture and training procedure in detail, please refer to [Text generation Tutorial](../text_generation/index_en.md).
 
 ## Optional Function ##
 ###  Embedding Parameters Observation
diff --git a/doc/tutorials/image_classification/src/cifar.png b/doc/tutorials/image_classification/src/cifar.png
new file mode 100644
index 0000000000000000000000000000000000000000..f54a0c58837cb3385b32dc57d02cec92666ef0f1
Binary files /dev/null and b/doc/tutorials/image_classification/src/cifar.png differ
diff --git a/doc/tutorials/image_classification/src/image_classification.png b/doc/tutorials/image_classification/src/image_classification.png
new file mode 100644
index 0000000000000000000000000000000000000000..14f255805081c1b4fab27eaf336fd389fa93ca19
Binary files /dev/null and b/doc/tutorials/image_classification/src/image_classification.png differ
diff --git a/doc/tutorials/image_classification/src/lenet.png b/doc/tutorials/image_classification/src/lenet.png
new file mode 100644
index 0000000000000000000000000000000000000000..1e6f2b32bad797f3fccb929c72a121fc935b0cbb
Binary files /dev/null and b/doc/tutorials/image_classification/src/lenet.png differ
diff --git a/doc/tutorials/image_classification/src/plot.png b/doc/tutorials/image_classification/src/plot.png
new file mode 100644
index 0000000000000000000000000000000000000000..a31f99791c670e18bb8c62b7604ec8cb0284ffb4
Binary files /dev/null and b/doc/tutorials/image_classification/src/plot.png differ
diff --git a/doc/tutorials/imagenet_model/resnet_model_cn.md b/doc/tutorials/imagenet_model/resnet_model_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..82ec9d70b345c11aba3aa86f8206eedc8072bb88
--- /dev/null
+++ b/doc/tutorials/imagenet_model/resnet_model_cn.md
@@ -0,0 +1,284 @@
+# Model Zoo - ImageNet #
+
+[ImageNet](http://www.image-net.org/) 是通用物体分类领域一个众所周知的数据库。本教程提供了一个用于ImageNet上的卷积分类网络模型。
+
+## ResNet 介绍
+
+论文 [Deep Residual Learning for Image Recognition](http://arxiv.org/abs/1512.03385) 中提出的ResNet网络结构在2015年ImageNet大规模视觉识别竞赛(ILSVRC 2015)的分类任务中赢得了第一名。他们提出残差学习的框架来简化网络的训练，所构建网络结构的的深度比之前使用的网络有大幅度的提高。下图展示的是基于残差的连接方式。左图构造网络模块的方式被用于34层的网络中，而右图的瓶颈连接模块用于50层，101层和152层的网络结构中。
+
+<center>![resnet_block](./resnet_block.jpg)</center>
+<center>图 1. ResNet 网络模块</center>
+
+本教程中我们给出了三个ResNet模型，这些模型都是由原作者提供的模型<https://github.com/KaimingHe/deep-residual-networks>转换过来的。我们使用PaddlePaddle在ILSVRC的验证集共50,000幅图像上测试了模型的分类错误率，其中输入图像的颜色通道顺序为**BGR**，保持宽高比缩放到短边为256，只截取中心方形的图像区域。分类错误率和模型大小由下表给出。
+<center>
+<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
+<colgroup>
+<col  class="left" />
+<col  class="left" />
+<col  class="left" />
+</colgroup>
+<thead>
+<tr>
+<th scope="col" class="left">ResNet</th>
+<th scope="col" class="left">Top-1</th>
+<th scope="col" class="left">Model Size</th>
+</tr>
+</thead>
+
+<tbody>
+<tr>
+<td class="left">ResNet-50</td>
+<td class="left">24.9%</td>
+<td class="left">99M</td>
+</tr>
+<tr>
+<td class="left">ResNet-101</td>
+<td class="left">23.7%</td>
+<td class="left">173M</td>
+</tr>
+<tr>
+<td class="left">ResNet-152</td>
+<td class="left">23.2%</td>
+<td class="left">234M</td>
+</tr>
+</tbody>
+
+</table></center>
+<br>
+
+## ResNet 模型
+
+50层，101层和152层的网络配置文件可参照```demo/model_zoo/resnet/resnet.py```。你也可以通过在命令行参数中增加一个参数如```--config_args=layer_num=50```来指定网络层的数目。
+
+### 网络可视化
+
+你可以通过执行下面的命令来得到ResNet网络的结构可视化图。该脚本会生成一个dot文件，然后可以转换为图片。需要安装graphviz来转换dot文件为图片。
+
+```
+cd demo/model_zoo/resnet
+./net_diagram.sh
+```
+
+### 模型下载
+
+```
+cd demo/model_zoo/resnet
+./get_model.sh
+```
+你可以执行上述命令来下载所有的模型和均值文件，如果下载成功，这些文件将会被保存在```demo/model_zoo/resnet/model```路径下。
+
+```
+mean_meta_224  resnet_101  resnet_152  resnet_50
+```
+   * resnet_50: 50层网络模型。
+   * resnet_101: 101层网络模型。
+   * resnet_152: 152层网络模型。
+   * mean\_meta\_224: 均值图像文件，图像大小为3 x 224 x 224，颜色通道顺序为**BGR**。你也可以使用这三个值: 103.939, 116.779, 123.68。
+
+### 参数信息
+
+* **卷积层权重**
+
+  由于每个卷积层后面连接的是batch normalization层，因此该层中没有偏置(bias)参数，并且只有一个权重。
+  形状: `(Co, ky, kx, Ci)`
+   * Co: 输出特征图的通道数目
+   * ky: 滤波器核在垂直方向上的尺寸
+   * kx: 滤波器核在水平方向上的尺寸
+   * Ci: 输入特征图的通道数目
+
+  二维矩阵: (Co * ky * kx, Ci), 行优先次序存储。
+
+* **全连接层权重**
+
+  二维矩阵: (输入层尺寸, 本层尺寸), 行优先次序存储。
+
+* **[Batch Normalization](<http://arxiv.org/abs/1502.03167>) 层权重**
+
+本层有四个参数，实际上只有.w0和.wbias是需要学习的参数，另外两个分别是滑动均值和方差。在测试阶段它们将会被加载到模型中。下表展示了batch normalization层的参数。
+<center>
+<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
+<colgroup>
+<col  class="left" />
+<col  class="left" />
+<col  class="left" />
+</colgroup>
+<thead>
+<tr>
+<th scope="col" class="left">参数名</th>
+<th scope="col" class="left">尺寸</th>
+<th scope="col" class="left">含义</th>
+</tr>
+</thead>
+
+<tbody>
+<tr>
+<td class="left">_res2_1_branch1_bn.w0</td>
+<td class="left">256</td>
+<td class="left">gamma, 缩放参数</td>
+</tr>
+<tr>
+<td class="left">_res2_1_branch1_bn.w1</td>
+<td class="left">256</td>
+<td class="left">特征图均值</td>
+</tr>
+<tr>
+<td class="left">_res2_1_branch1_bn.w2</td>
+<td class="left">256</td>
+<td class="left">特征图方差</td>
+</tr>
+<tr>
+<td class="left">_res2_1_branch1_bn.wbias</td>
+<td class="left">256</td>
+<td class="left">beta, 偏置参数</td>
+</tr>
+</tbody>
+
+</table></center>
+<br>
+
+### 参数读取
+
+使用者可以使用下面的Python脚本来读取参数值:
+
+```
+import sys
+import numpy as np
+
+def load(file_name):
+    with open(file_name, 'rb') as f:
+        f.read(16) # skip header for float type.
+        return np.fromfile(f, dtype=np.float32)
+
+if __name__=='__main__':
+    weight = load(sys.argv[1])
+```
+
+或者直接使用下面的shell命令:
+
+```
+od -j 16 -f _res2_1_branch1_bn.w0
+```
+
+## 特征提取
+
+我们提供了C++和Python接口来提取特征。下面的例子使用了`demo/model_zoo/resnet/example`中的数据，详细地展示了整个特征提取的过程。
+
+### C++接口
+
+首先，在配置文件中的`define_py_data_sources2`里指定图像数据列表，具体请参照示例`demo/model_zoo/resnet/resnet.py`。
+
+```
+    train_list = 'train.list' if not is_test else None
+    # mean.meta is mean file of ImageNet dataset.
+    # mean.meta size : 3 x 224 x 224.
+    # If you use three mean value, set like:
+    # "mean_value:103.939,116.779,123.68;"
+    args={
+        'mean_meta': "model/mean_meta_224/mean.meta",
+        'image_size': 224, 'crop_size': 224,
+        'color': True,'swap_channel:': [2, 1, 0]}
+    define_py_data_sources2(train_list,
+                           'example/test.list',
+                           module="example.image_list_provider",
+                           obj="processData",
+                           args=args)
+```
+
+第二步，在`resnet.py`文件中指定要提取特征的网络层的名字。例如，
+
+```
+Outputs("res5_3_branch2c_conv", "res5_3_branch2c_bn")
+```
+
+第三步，在`extract_fea_c++.sh`文件中指定模型路径和输出的目录，然后执行下面的命令。
+
+```
+cd demo/model_zoo/resnet
+./extract_fea_c++.sh
+```
+
+如果执行成功，特征将会存到`fea_output/rank-00000`文件中，如下所示。同时你可以使用`load_feature.py`文件中的`load_feature_c`接口来加载该文件。
+
+```
+-0.115318 -0.108358 ... -0.087884;-1.27664 ... -1.11516 -2.59123;
+-0.126383 -0.116248 ... -0.00534909;-1.42593 ... -1.04501 -1.40769;
+```
+
+* 每行存储的是一个样本的特征。其中，第一行存的是图像`example/dog.jpg`的特征，第二行存的是图像`example/cat.jpg`的特征。
+* 不同层的特征由分号`;`隔开，并且它们的顺序与`Outputs()`中指定的层顺序一致。这里，左边是`res5_3_branch2c_conv`层的特征，右边是`res5_3_branch2c_bn`层特征。
+
+### Python接口
+
+示例`demo/model_zoo/resnet/classify.py`中展示了如何使用Python来提取特征。下面的例子同样使用了`./example/test.list`中的数据。执行的命令如下：
+
+```
+cd demo/model_zoo/resnet
+./extract_fea_py.sh
+```
+
+extract_fea_py.sh:
+
+```
+python classify.py \
+     --job=extract \
+     --conf=resnet.py\
+     --use_gpu=1 \
+     --mean=model/mean_meta_224/mean.meta \
+     --model=model/resnet_50 \
+     --data=./example/test.list \
+     --output_layer="res5_3_branch2c_conv,res5_3_branch2c_bn" \
+     --output_dir=features
+
+```
+* \--job=extract:              指定工作模式来提取特征。
+* \--conf=resnet.py:           网络配置文件。
+* \--use_gpu=1:                指定是否使用GPU。
+* \--model=model/resnet_50:    模型路径。
+* \--data=./example/test.list: 数据列表。
+* \--output_layer="xxx,xxx":   指定提取特征的层。
+* \--output_dir=features:      输出目录。
+
+如果运行成功，你将会看到特征存储在`features/batch_0`文件中，该文件是由cPickle产生的。你可以使用`load_feature.py`中的`load_feature_py`接口来打开该文件，它将返回如下的字典：
+
+```
+{
+'cat.jpg': {'res5_3_branch2c_conv': array([[-0.12638293, -0.116248  , -0.11883899, ..., -0.00895038, 0.01994277, -0.00534909]], dtype=float32), 'res5_3_branch2c_bn': array([[-1.42593431, -1.28918779, -1.32414699, ..., -1.45933616, -1.04501402, -1.40769434]], dtype=float32)},
+'dog.jpg': {'res5_3_branch2c_conv': array([[-0.11531784, -0.10835785, -0.08809858, ...,0.0055237, 0.01505112, -0.08788397]], dtype=float32), 'res5_3_branch2c_bn': array([[-1.27663755, -1.18272924, -0.90937918, ..., -1.25178063, -1.11515927, -2.59122872]], dtype=float32)}
+}
+```
+
+仔细观察，这些特征值与上述使用C++接口提取的结果是一致的。
+
+## 预测
+
+`classify.py`文件也可以用于对样本进行预测。我们提供了一个示例脚本`predict.sh`，它使用50层的ResNet模型来对`example/test.list`中的数据进行预测。
+
+```
+cd demo/model_zoo/resnet
+./predict.sh
+```
+
+predict.sh调用了`classify.py`:
+
+```
+python classify.py \
+     --job=predict \
+     --conf=resnet.py\
+     --multi_crop \
+     --model=model/resnet_50 \
+     --use_gpu=1 \
+     --data=./example/test.list
+```
+* \--job=extract:              指定工作模型进行预测。
+* \--conf=resnet.py:           网络配置文件。network configure.
+* \--multi_crop:               使用10个裁剪图像块，预测概率取平均。
+* \--use_gpu=1:                指定是否使用GPU。
+* \--model=model/resnet_50:    模型路径。
+* \--data=./example/test.list: 数据列表。
+
+如果运行成功，你将会看到如下结果，其中156和285是这些图像的分类标签。
+
+```
+Label of example/dog.jpg is: 156
+Label of example/cat.jpg is: 282
+```
diff --git a/doc/tutorials/imagenet_model/resnet_model_en.md b/doc/tutorials/imagenet_model/resnet_model_en.md
index 5403ab9f17d2399fee878d0f3c512cb166aba06f..478ad06193b14ba7fe02238df621db1f7b0804d4 100644
--- a/doc/tutorials/imagenet_model/resnet_model_en.md
+++ b/doc/tutorials/imagenet_model/resnet_model_en.md
@@ -52,7 +52,7 @@ See ```demo/model_zoo/resnet/resnet.py```. This config contains network of 50, 1
 
 ### Network Visualization
 
-You can get a diagram of ResNet network by running the following commands. The script generates dot file and then converts dot file to PNG file, which uses installed draw_dot tool in our server. If you can not access the server, just install graphviz to convert dot file.
+You can get a diagram of ResNet network by running the following commands. The script generates dot file and then converts dot file to PNG file, which needs to install graphviz to convert.
 
 ```
 cd demo/model_zoo/resnet
@@ -138,7 +138,7 @@ There are four parameters in this layer. In fact, only .w0 and .wbias are the le
 
 ### Parameter Observation
 
-Users who want to observe the parameters can use python to read:
+Users who want to observe the parameters can use Python to read:
 
 ```
 import sys
@@ -209,7 +209,7 @@ If successful, features are saved in `fea_output/rank-00000` as follows. And you
 
 ### Python Interface
 
-`demo/model_zoo/resnet/classify.py` is an example to show how to use python to extract features. Following example still uses data of `./example/test.list`. Command is as follows:
+`demo/model_zoo/resnet/classify.py` is an example to show how to use Python to extract features. Following example still uses data of `./example/test.list`. Command is as follows:
 
 ```
 cd demo/model_zoo/resnet
@@ -238,8 +238,6 @@ python classify.py \
 * \--output_layer="xxx,xxx":   specify layers to extract features.
 * \--output_dir=features:      output diretcoty.
 
-Note, since the convolution layer in these ResNet models is suitable for the cudnn implementation which only support GPU. It not support CPU mode because of compatibility issue and we will fix later.
-
 If run successfully, you will see features saved in `features/batch_0`, this file is produced with cPickle. You can use `load_feature_py` interface in `load_feature.py` to open the file, and it returns a dictionary as follows:
 
 ```
diff --git a/doc/tutorials/index_cn.md b/doc/tutorials/index_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..adc75978a7820b7f9c9239ea8a727aa3d587cab0
--- /dev/null
+++ b/doc/tutorials/index_cn.md
@@ -0,0 +1,24 @@
+# 完整教程
+
+## 快速入门
+
+使用商品评论分类任务，系统性的介绍如何一步步改进，最终得到产品级的深度模型。
+
+* [阅读教程](quick_start/index_cn.rst)
+
+## 图像
+
+* TBD
+
+## 自然语言处理
+
+* [情感分类](sentiment_analysis/index_cn.md)
+* [语义角色标注](semantic_role_labeling/index_cn.md)
+
+## 个性化推荐
+
+* TBD
+
+## 常用模型
+
+* TBD
diff --git a/doc/tutorials/index_en.md b/doc/tutorials/index_en.md
index 97de356665d23543ddc241552c6e3c896a78db86..63b2091c245eedf61a31da620e5804daf765cc42 100644
--- a/doc/tutorials/index_en.md
+++ b/doc/tutorials/index_en.md
@@ -1,7 +1,9 @@
 # TUTORIALS
-There are serveral examples and demos here.
+There are several examples and demos here.
 
-## [Quick Start](quick_start/index_en.md)
+## Quick Start
+
+* [Quick Start](quick_start/index_en.md)
 
 ## Image
 
@@ -15,7 +17,6 @@ There are serveral examples and demos here.
 
 ## Recommendation
 
-* [MovieLens Dataset](rec/ml_dataset_en.md)
 * [MovieLens Regression](rec/ml_regression_en.rst)
 
 ## Model Zoo
diff --git a/doc_cn/demo/quick_start/index.rst b/doc/tutorials/quick_start/index_cn.rst
similarity index 86%
rename from doc_cn/demo/quick_start/index.rst
rename to doc/tutorials/quick_start/index_cn.rst
index 0536936dc47689d3ff285b919586a10128a0c745..936f16118a439b310794157191bb6d82d8fa6d42 100644
--- a/doc_cn/demo/quick_start/index.rst
+++ b/doc/tutorials/quick_start/index_cn.rst
@@ -1,5 +1,6 @@
-PaddlePaddle快速入门教程
-========================
+=============
+快速入门教程
+=============
 
 我们将以 `文本分类问题 <https://en.wikipedia.org/wiki/Document_classification>`_ 为例,
 介绍PaddlePaddle的基本使用方法。
@@ -21,7 +22,7 @@ PaddlePaddle快速入门教程
 
 使用PaddlePaddle, 每一个任务流程都可以被划分为如下五个步骤。
 
-    ..  image:: Pipeline.jpg
+    ..  image:: src/Pipeline_cn.jpg
         :align: center
         :scale: 80%
 
@@ -99,7 +100,7 @@ Python脚本读取数据
 
 本小节我们将介绍模型网络结构。
 
-    ..  image:: PipelineNetwork.jpg
+    ..  image:: src/PipelineNetwork_cn.jpg
         :align: center
         :scale: 80%
 
@@ -112,7 +113,7 @@ Python脚本读取数据
 
 具体流程如下:
 
-    ..  image:: NetLR.jpg
+    ..  image:: src/NetLR_cn.jpg
         :align: center
         :scale: 80%
 
@@ -147,9 +148,9 @@ Python脚本读取数据
 **效果总结**：我们将在后面介绍训练和预测流程的脚本。在此为方便对比不同网络结构，我们总结了各个网络的复杂度和效果。
 
     =====================  ===============================  =================
-    网络名称	                    参数数量                    错误率
+    网络名称                        参数数量                    错误率
     =====================  ===============================  =================
-    逻辑回归	                  252 KB                       8.652 %
+    逻辑回归                      252 KB                       8.652 %
     =====================  ===============================  =================
 
 词向量模型
@@ -176,7 +177,7 @@ embedding模型需要稍微改变提供数据的Python脚本，即 ``dataprovide
 
 该模型依然使用逻辑回归分类网络的框架， 只是将句子用连续向量表示替换为用稀疏向量表示， 即对第三步进行替换。句子表示的计算更新为两步：
 
-..  image:: NetContinuous.jpg
+..  image:: src/NetContinuous_cn.jpg
     :align: center
     :scale: 80%
 
@@ -197,9 +198,9 @@ embedding模型需要稍微改变提供数据的Python脚本，即 ``dataprovide
 **效果总结：**
 
     =====================  ===============================  ==================
-    网络名称	                    参数数量                    错误率
+    网络名称                        参数数量                    错误率
     =====================  ===============================  ==================
-    词向量模型	                  15 MB                       8.484 %
+    词向量模型                      15 MB                       8.484 %
     =====================  ===============================  ==================
 
 卷积模型
@@ -207,7 +208,7 @@ embedding模型需要稍微改变提供数据的Python脚本，即 ``dataprovide
 
 卷积网络是一种特殊的从词向量表示到句子表示的方法， 也就是将词向量模型进一步演化为三个新步骤。
 
-..  image:: NetConv.jpg
+..  image:: src/NetConv_cn.jpg
     :align: center
     :scale: 80%
 
@@ -230,15 +231,15 @@ embedding模型需要稍微改变提供数据的Python脚本，即 ``dataprovide
 **效果总结：**
 
     =====================  ===============================  ========================
-    网络名称	                    参数数量                    错误率
+    网络名称                        参数数量                    错误率
     =====================  ===============================  ========================
-    卷积模型	                  16 MB                       5.628 %
+    卷积模型                      16 MB                       5.628 %
     =====================  ===============================  ========================
 
 时序模型
 ----------
 
-..  image:: NetRNN.jpg
+..  image:: src/NetRNN_cn.jpg
     :align: center
     :scale: 80%
 
@@ -260,9 +261,9 @@ embedding模型需要稍微改变提供数据的Python脚本，即 ``dataprovide
 本次试验，我们采用单层LSTM模型，并使用了Dropout，**效果总结：**
 
     =====================  ===============================  =========================
-    网络名称	                    参数数量                    错误率
+    网络名称                        参数数量                    错误率
     =====================  ===============================  =========================
-    时序模型	                  16 MB                       4.812 %
+    时序模型                      16 MB                       4.812 %
     =====================  ===============================  =========================
 
 优化算法
@@ -284,7 +285,7 @@ Momentum, RMSProp，AdaDelta，AdaGrad，ADAM，Adamax等，这里采用Adam优
 
 在数据加载和网络配置完成之后， 我们就可以训练模型了。
 
-..  image:: PipelineTrain.jpg
+..  image:: src/PipelineTrain_cn.jpg
     :align: center
     :scale: 80%
 
@@ -294,7 +295,7 @@ Momentum, RMSProp，AdaDelta，AdaGrad，ADAM，Adamax等，这里采用Adam优
 
         ./train.sh
 
-``train.sh``中包含了训练模型的基本命令。训练时所需设置的主要参数如下：
+``train.sh`` 中包含了训练模型的基本命令。训练时所需设置的主要参数如下：
 
     .. code-block:: bash
 
@@ -312,7 +313,7 @@ Momentum, RMSProp，AdaDelta，AdaGrad，ADAM，Adamax等，这里采用Adam优
 
 当模型训练好了之后，我们就可以进行预测了。
 
-..  image:: PipelineTest.jpg
+..  image:: src/PipelineTest_cn.jpg
     :align: center
     :scale: 80%
 
@@ -348,12 +349,12 @@ Momentum, RMSProp，AdaDelta，AdaGrad，ADAM，Adamax等，这里采用Adam优
 对于Amazon-Elec测试集(25k), 如下表格，展示了上述网络模型的训练效果:
 
     =====================  ===============================  =============  ==================================
-    网络名称	                   参数数量                    错误率          配置文件
+    网络名称                       参数数量                    错误率          配置文件
     =====================  ===============================  =============  ==================================
-    逻辑回归模型	                  252 KB                     8.652%          trainer_config.lr.py
-    词向量模型      	               15 MB                      8.484%         trainer_config.emb.py
+    逻辑回归模型                      252 KB                     8.652%          trainer_config.lr.py
+    词向量模型                         15 MB                      8.484%         trainer_config.emb.py
     卷积模型                        16 MB                     5.628%          trainer_config.cnn.py
-    时序模型 	                    16 MB                     4.812%          trainer_config.lstm.py
+    时序模型                         16 MB                     4.812%          trainer_config.lstm.py
     =====================  ===============================  =============  ==================================
 
 
@@ -384,12 +385,12 @@ Momentum, RMSProp，AdaDelta，AdaGrad，ADAM，Adamax等，这里采用Adam优
 模型训练会看到类似上面这样的日志信息，详细的参数解释，请参考如下表格：
 
     ===========================================  ==============================================================
-    名称	                                         解释
+    名称                                             解释
     ===========================================  ==============================================================
-    Batch=20	                                  表示过了20个batch
-    samples=2560	                              表示过了2560个样本
-    AvgCost	                                      每个pass的第0个batch到当前batch所有样本的平均cost
-    CurrentCost	                                  当前log_period个batch所有样本的平均cost
-    Eval: classification_error_evaluator	      每个pass的第0个batch到当前batch所有样本的平均分类错误率
-    CurrentEval: classification_error_evaluator	  当前log_period个batch所有样本的平均分类错误率
+    Batch=20                                      表示过了20个batch
+    samples=2560                                  表示过了2560个样本
+    AvgCost                                          每个pass的第0个batch到当前batch所有样本的平均cost
+    CurrentCost                                      当前log_period个batch所有样本的平均cost
+    Eval: classification_error_evaluator          每个pass的第0个batch到当前batch所有样本的平均分类错误率
+    CurrentEval: classification_error_evaluator      当前log_period个batch所有样本的平均分类错误率
     ===========================================  ==============================================================
diff --git a/doc/tutorials/quick_start/index_en.md b/doc/tutorials/quick_start/index_en.md
index ec548b5393d7b210d6409328c00917aeb679a451..4e765b23037d8b4b717d12437f839cc488badf5b 100644
--- a/doc/tutorials/quick_start/index_en.md
+++ b/doc/tutorials/quick_start/index_en.md
@@ -12,7 +12,7 @@ This tutorial will teach the basics of deep learning (DL), including how to impl
 
 To get started, please install PaddlePaddle on your computer. Throughout this tutorial, you will learn by implementing different DL models for text classification.
 
-To install PaddlePaddle, please follow the instructions here: <a href = "../../build/index.html" >Build and Install</a>.
+To install PaddlePaddle, please follow the instructions here: <a href = "../../getstarted/build_and_install/index_en.html" >Build and Install</a>.
 
 ## Overview
 For the first step, you will use PaddlePaddle to build a **text classification** system. For example, suppose you run an e-commence  website, and you want to analyze the sentiment of user reviews to evaluate product quality.
@@ -32,7 +32,7 @@ The monitor breaks down two months after purchase.
 the classifier should output “negative“.
 
 To build your text classification system, your code will need to perform five steps:
-<center> ![](./Pipeline_en.jpg) </center>
+<center> ![](./src/Pipeline_en.jpg) </center>
 
   - Preprocess data into a standardized format.
   - Provide data to the learning model.
@@ -156,18 +156,18 @@ define_py_data_sources2(train_list='data/train.list',
                         obj="process",
                         args={"dictionary": word_dict})
 ```
-You can refer to the following link for more detailed examples and data formats: <a href = "../../ui/data_provider/pydataprovider2.html">PyDataProvider2</a>.
+You can refer to the following link for more detailed examples and data formats: <a href = "../../api/data_provider/pydataprovider2_en.html">PyDataProvider2</a>.
 
 ## Network Architecture
 You will describe four kinds of network architectures in this section.
-<center> ![](./PipelineNetwork_en.jpg) </center>
+<center> ![](./src/PipelineNetwork_en.jpg) </center>
 
 First, you will build a logistic regression model. Later, you will also get chance to build other more powerful network architectures.
-For more detailed documentation, you could refer to: <a href = "../../ui/api/trainer_config_helpers/layers_index.html">Layer documentation</a>。All configuration files are in `demo/quick_start` directory.
+For more detailed documentation, you could refer to: <a href = "../../api/trainer_config_helpers/layers.html">layer documentation</a>. All configuration files are in `demo/quick_start` directory.
 
 ### Logistic Regression
 The architecture is illustrated in the following picture:
-<center> ![](./NetLR_en.png) </center>
+<center> ![](./src/NetLR_en.png) </center>
 
 - You need define the data for text features. The size of the data layer is the number of words in the dictionary.
 
@@ -182,10 +182,10 @@ label = data_layer(name="label", size=label_dim)
 ```
 
 - It uses logistic regression model to classify the vector, and it will output the classification error during training.
-	- Each layer has an *input* argument that specifies its input layer. Some layers can have multiple input layers. You can use a list of the input layers as input in that case.
-	- *size* for each layer means the number of neurons of the layer.
-	- *act_type* means activation function applied to the output of each neuron independently.
-	- Some layers can have additional special inputs. For example, `classification_cost` needs ground truth label as input to compute classification loss and error.
+    - Each layer has an *input* argument that specifies its input layer. Some layers can have multiple input layers. You can use a list of the input layers as input in that case.
+    - *size* for each layer means the number of neurons of the layer.
+    - *act_type* means activation function applied to the output of each neuron independently.
+    - Some layers can have additional special inputs. For example, `classification_cost` needs ground truth label as input to compute classification loss and error.
 ```python
 # Define a fully connected layer with logistic activation (also called softmax activation).
 output = fc_layer(input=word,
@@ -240,7 +240,7 @@ def process(settings, file_name):
 ```
 
 This model is very similar to the framework of logistic regression, but it uses word embedding vectors instead of a sparse vectors to represent words.
-<center> ![](./NetContinuous_en.png) </center>
+<center> ![](./src/NetContinuous_en.png) </center>
 
 - It can look up the dense word embedding vector in the dictionary  (its words embedding vector is `word_dim`). The input is a sequence of N words, the output is N word_dim dimensional vectors.
 
@@ -283,7 +283,7 @@ The performance is summarized in the following table:
 
 ### Convolutional Neural Network Model
 Convolutional neural network converts a sequence of word embeddings into a sentence representation using temporal convolutions. You will transform the fully connected layer of the word embedding model to 3 new sub-steps.
-<center> ![](./NetConv_en.png) </center>
+<center> ![](./src/NetConv_en.png) </center>
 
 
 Text convolution has 3 steps:
@@ -295,8 +295,8 @@ Text convolution has 3 steps:
 # context_len means convolution kernel size.
 # context_start means the start of the convolution. It can be negative. In that case, zero padding is applied.
 text_conv = sequence_conv_pool(input=emb,
-	                           context_start=k,
-	                           context_len=2 * k + 1)
+                               context_start=k,
+                               context_len=2 * k + 1)
 ```
 
 The performance is summarized in the following table：
@@ -324,7 +324,7 @@ The performance is summarized in the following table：
 <br>
 
 ### Recurrent Model
-<center> ![](./NetRNN_en.png) </center>
+<center> ![](./src/NetRNN_en.png) </center>
 
 You can use Recurrent neural network as our time sequence model, including simple RNN model, GRU model, and LSTM model。
 
@@ -366,7 +366,7 @@ You can use single layer LSTM model with Dropout for our text classification pro
 <br>
 
 ## Optimization Algorithm
-<a href = "../../ui/api/trainer_config_helpers/optimizers.html">Optimization algorithms</a> include Momentum, RMSProp, AdaDelta, AdaGrad, Adam, and Adamax. You can use Adam optimization method here, with L2 regularization and gradient clipping, because Adam has been proved to work very well for training recurrent neural network.
+<a href = "../../api/trainer_config_helpers/optimizers.html">Optimization algorithms</a> include Momentum, RMSProp, AdaDelta, AdaGrad, Adam, and Adamax. You can use Adam optimization method here, with L2 regularization and gradient clipping, because Adam has been proved to work very well for training recurrent neural network.
 
 ```python
 settings(batch_size=128,
@@ -378,7 +378,7 @@ settings(batch_size=128,
 
 ## Training Model
 After completing data preparation and network architecture specification, you will run the training script.
-<center> ![](./PipelineTrain_en.png) </center>
+<center> ![](./src/PipelineTrain_en.png) </center>
 
 Training script: our training script is in `train.sh` file. The training arguments are listed below:
 
@@ -391,10 +391,11 @@ paddle train \
 --use_gpu=false
 ```
 
-If you want to install the remote training platform, which enables distributed training on clusters, follow the instructions here: <a href = "../../cluster/index.html">Platform</a> documentation. We do not provide examples on how to train on clusters. Please refer to other demos or platform training documentation for mode details on training on clusters.
+We do not provide examples on how to train on clusters here. If you want to train on clusters, please follow the <a href = "../../howto/cluster/cluster_train_en.html">distributed training</a> documentation or other demos for more details.
+
 ## Inference
 You can use the trained model to perform prediction on the dataset with no labels. You can also evaluate the model on dataset with labels to obtain its test accuracy.
-<center> ![](./PipelineTest_en.png) </center>
+<center> ![](./src/PipelineTest_en.png) </center>
 
 The test script is listed below. PaddlePaddle can evaluate a model on the data with labels specified in `test.list`.
 
@@ -406,7 +407,7 @@ paddle train \
 --init_model_path=./output/pass-0000x
 ```
 
-We will give an example of performing prediction using Recurrent model on a dataset with no labels. You can refer to: <a href = "../../ui/predict/swig_py_paddle_en.html">Python Prediction API</a> tutorial，or other <a href = "../../demo/index.html">demo</a> for the prediction process using Python. You can also use the following script for inference or evaluation.
+We will give an example of performing prediction using Recurrent model on a dataset with no labels. You can refer to <a href = "../../api/predict/swig_py_paddle_en.html">Python Prediction API</a> tutorial，or other <a href = "../../tutorials/index_en.html">demo</a> for the prediction process using Python. You can also use the following script for inference or evaluation.
 
 inference script (predict.sh)：
 
@@ -508,7 +509,7 @@ The scripts of data downloading, network configurations, and training scrips are
 * \--config_args：Other configuration arguments.
 * \--init_model_path：The path of the initial model parameter.
 
-By default, the trainer will save model every pass. You can also specify `saving_period_by_batches` to set the frequency of batch saving. You can use `show_parameter_stats_period` to print the statistics of the parameters, which are very useful for tuning parameters. Other command line arguments can be found in <a href = "../../ui/index.html#command-line-argument">command line argument documentation</a>。
+By default, the trainer will save model every pass. You can also specify `saving_period_by_batches` to set the frequency of batch saving. You can use `show_parameter_stats_period` to print the statistics of the parameters, which are very useful for tuning parameters. Other command line arguments can be found in <a href = "../../howto/cmd_parameter/index_en.html">command line argument documentation</a>。
 
 ### Log
 
diff --git a/doc_cn/demo/quick_start/NetContinuous.jpg b/doc/tutorials/quick_start/src/NetContinuous_cn.jpg
similarity index 100%
rename from doc_cn/demo/quick_start/NetContinuous.jpg
rename to doc/tutorials/quick_start/src/NetContinuous_cn.jpg
diff --git a/doc/tutorials/quick_start/NetContinuous_en.png b/doc/tutorials/quick_start/src/NetContinuous_en.png
similarity index 100%
rename from doc/tutorials/quick_start/NetContinuous_en.png
rename to doc/tutorials/quick_start/src/NetContinuous_en.png
diff --git a/doc_cn/demo/quick_start/NetConv.jpg b/doc/tutorials/quick_start/src/NetConv_cn.jpg
similarity index 100%
rename from doc_cn/demo/quick_start/NetConv.jpg
rename to doc/tutorials/quick_start/src/NetConv_cn.jpg
diff --git a/doc/tutorials/quick_start/NetConv_en.png b/doc/tutorials/quick_start/src/NetConv_en.png
similarity index 100%
rename from doc/tutorials/quick_start/NetConv_en.png
rename to doc/tutorials/quick_start/src/NetConv_en.png
diff --git a/doc_cn/demo/quick_start/NetLR.jpg b/doc/tutorials/quick_start/src/NetLR_cn.jpg
similarity index 100%
rename from doc_cn/demo/quick_start/NetLR.jpg
rename to doc/tutorials/quick_start/src/NetLR_cn.jpg
diff --git a/doc/tutorials/quick_start/NetLR_en.png b/doc/tutorials/quick_start/src/NetLR_en.png
similarity index 100%
rename from doc/tutorials/quick_start/NetLR_en.png
rename to doc/tutorials/quick_start/src/NetLR_en.png
diff --git a/doc_cn/demo/quick_start/NetRNN.jpg b/doc/tutorials/quick_start/src/NetRNN_cn.jpg
similarity index 100%
rename from doc_cn/demo/quick_start/NetRNN.jpg
rename to doc/tutorials/quick_start/src/NetRNN_cn.jpg
diff --git a/doc/tutorials/quick_start/NetRNN_en.png b/doc/tutorials/quick_start/src/NetRNN_en.png
similarity index 100%
rename from doc/tutorials/quick_start/NetRNN_en.png
rename to doc/tutorials/quick_start/src/NetRNN_en.png
diff --git a/doc_cn/demo/quick_start/PipelineNetwork.jpg b/doc/tutorials/quick_start/src/PipelineNetwork_cn.jpg
similarity index 100%
rename from doc_cn/demo/quick_start/PipelineNetwork.jpg
rename to doc/tutorials/quick_start/src/PipelineNetwork_cn.jpg
diff --git a/doc/tutorials/quick_start/PipelineNetwork_en.jpg b/doc/tutorials/quick_start/src/PipelineNetwork_en.jpg
similarity index 100%
rename from doc/tutorials/quick_start/PipelineNetwork_en.jpg
rename to doc/tutorials/quick_start/src/PipelineNetwork_en.jpg
diff --git a/doc_cn/demo/quick_start/PipelineTest.jpg b/doc/tutorials/quick_start/src/PipelineTest_cn.jpg
similarity index 100%
rename from doc_cn/demo/quick_start/PipelineTest.jpg
rename to doc/tutorials/quick_start/src/PipelineTest_cn.jpg
diff --git a/doc/tutorials/quick_start/PipelineTest_en.png b/doc/tutorials/quick_start/src/PipelineTest_en.png
similarity index 100%
rename from doc/tutorials/quick_start/PipelineTest_en.png
rename to doc/tutorials/quick_start/src/PipelineTest_en.png
diff --git a/doc_cn/demo/quick_start/PipelineTrain.jpg b/doc/tutorials/quick_start/src/PipelineTrain_cn.jpg
similarity index 100%
rename from doc_cn/demo/quick_start/PipelineTrain.jpg
rename to doc/tutorials/quick_start/src/PipelineTrain_cn.jpg
diff --git a/doc/tutorials/quick_start/PipelineTrain_en.png b/doc/tutorials/quick_start/src/PipelineTrain_en.png
similarity index 100%
rename from doc/tutorials/quick_start/PipelineTrain_en.png
rename to doc/tutorials/quick_start/src/PipelineTrain_en.png
diff --git a/doc_cn/demo/quick_start/Pipeline.jpg b/doc/tutorials/quick_start/src/Pipeline_cn.jpg
similarity index 100%
rename from doc_cn/demo/quick_start/Pipeline.jpg
rename to doc/tutorials/quick_start/src/Pipeline_cn.jpg
diff --git a/doc/tutorials/quick_start/Pipeline_en.jpg b/doc/tutorials/quick_start/src/Pipeline_en.jpg
similarity index 100%
rename from doc/tutorials/quick_start/Pipeline_en.jpg
rename to doc/tutorials/quick_start/src/Pipeline_en.jpg
diff --git a/doc/tutorials/rec/ml_dataset_cn.md b/doc/tutorials/rec/ml_dataset_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..2207a776f0774e72aba15169e59258dd04583637
--- /dev/null
+++ b/doc/tutorials/rec/ml_dataset_cn.md
@@ -0,0 +1,105 @@
+```eval_rst
+.. _demo_ml_dataset:
+
+```
+
+# MovieLens数据集
+
+[MovieLens 数据集](http://grouplens.org/datasets/movielens/)由GroupLens Research实验室搜集整理。
+该数据集包含一些用户信息、电影信息以及电影评分\[1-5\]。根据数据量规模，该数据及有很多不同的版本。
+我们用[MovieLens 百万数据集](http://files.grouplens.org/datasets/movielens/ml-1m.zip)作为示例数据
+集，其中包含6,000位用户对4,000部电影的1,000,000条评价。该数据集于2003年2月发布。
+
+## 数据集特征
+
+在[ml-1m 数据集](http://files.grouplens.org/datasets/movielens/ml-1m.zip)中有许多的特征。在[ml-1m 数据集]
+(http://files.grouplens.org/datasets/movielens/ml-1m.zip)中的这些数据文件(含有".dat"的后缀)实际上是CSV文件，
+分隔符为"::"。以下我们翻译数据集网站中README文件的描述:
+
+### 评分文件描述(ratings.dat)
+
+
+所有的评分数据都包含在"ratings.dat"文件中，遵循如下的格式:
+
+用户ID::电影ID::评分::时间戳
+
+- 用户ID范围从1到6040
+- 电影ID范围从1到3952
+- 评分被调整为5星的规模(只允许整数的星级)
+- 时间戳表示为从1970-01-01(UTC)来的秒数，与time(2)的返回值一致
+- 每位用户至少有20条评分
+
+### 用户文件描述(users.dat)
+
+所有的用户信息都包含在"users.dat"文件中，遵循如下的格式:
+
+用户ID::性别::年龄::职业::邮编
+
+所有的人口统计学信息由用户自愿提供，没有进行正确性的检查。只有含有人
+口统计学信息的用户才被包含在数据集中。
+
+- 性别，用"M"表示男性，"F"表示女性
+- 年龄从下列列表范围中选取:
+
+	*   1:	"18岁以下"
+	*  18:	"18-24岁"
+	*  25:	"25-34岁"
+	*  35:	"35-44岁"
+	*  45:	"45-49岁"
+	*  50:	"50-55岁"
+	*  56:	"56+"
+
+- 职业从下面所列中选择:
+
+	*   0:  "其他"或不确定
+	*   1:  "学术/教育工作者"
+	*   2:  "艺术家"
+	*   3:  "文书工作/管理员"
+	*   4:  "大学生/研究生"
+	*   5:  "客户服务"
+	*   6:  "医生/医疗保健"
+	*   7:  "行政工作/管理人员"
+	*   8:  "农民"
+	*   9:  "操持家务者"
+	*  10:  "高中毕业生"
+	*  11:  "律师"
+	*  12:  "程序员"
+	*  13:  "退休人员"
+	*  14:  "销售/市场"
+	*  15:  "科学家"
+	*  16:  "自由职业者"
+	*  17:  "技术员/工程师"
+	*  18:  "推销员/手工艺者"
+	*  19:  "无业人士"
+	*  20:  "作家"
+
+### 电影文件描述(movies.dat)
+
+所有的电影信息都包含在"movies.dat"文件中，遵循如下的格式:
+
+电影ID::电影名称::电影类型
+
+- 电影名称（包括发行时间）与IMDB网站提供的一致
+- 电影类型如符合多种用管道符号|分割，选自下列类型:
+
+	*	动作片
+	*	冒险片
+	*	动画片
+	*	儿童片
+	*	喜剧片
+	*	犯罪片
+	*	纪录片
+	*	戏剧
+	*	奇幻片
+	*	黑色电影
+	*	恐怖片
+	*	音乐剧
+	*	悬疑片
+	*	浪漫片
+	*	科幻片
+	*	惊险电影
+	*	战争片
+	*	西部片
+
+- 由于意外的副本记录和测试记录，有些电影ID可能与实际电影不相符合
+- 电影大部分是手工输入数据，因此可能会有一些错误和不一致发生
diff --git a/doc/tutorials/rec/ml_dataset_en.md b/doc/tutorials/rec/ml_dataset_en.md
index c93a4585e4027b1912da8a77c2562d1ee69c5366..25dea5c4afbf1ce1c1ac6195cbd245b116459e2e 100644
--- a/doc/tutorials/rec/ml_dataset_en.md
+++ b/doc/tutorials/rec/ml_dataset_en.md
@@ -1,3 +1,7 @@
+```eval_rst
+..  _demo_ml_dataset:
+```
+
 # MovieLens Dataset
 
 The [MovieLens Dataset](http://grouplens.org/datasets/movielens/) was collected by GroupLens Research.
diff --git a/doc/tutorials/rec/ml_regression_cn.rst b/doc/tutorials/rec/ml_regression_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..a084e4790c78018a2cb02392e22cf59a4b94aeeb
--- /dev/null
+++ b/doc/tutorials/rec/ml_regression_cn.rst
@@ -0,0 +1,349 @@
+MovieLens数据集评分回归模型
+=========================
+
+这里我们在MovieLens数据集描述一种 **余弦相似度回归** 任务。
+该示例将展示paddle如何进行词向量嵌入，处理相似度回归，针对文本
+的单词级别的卷积神经网络，以及paddle如何处理多种类型的输入。
+需要注意的是，该模型网络只是用于进行demo展示paddle如何工作，而
+没有进行结构的微调。
+
+
+**我们非常欢迎您用PADDLEPADDLE构建更好的示例，如果您有好的建议来
+让这个示例变得更好，希望能让我们知晓。**
+
+数据准备
+```````
+下载并解压数据集
+''''''''''''''
+这里我们使用 :ref:`demo_ml_dataset` 。
+要下载和解压数据集，只需要简单的运行下面的命令即可。
+
+.. code-block:: bash
+
+	cd demo/recommendation/data
+	./ml_data.sh
+
+:code:`demo/recommendation/data/ml-1m` 的目录结构为:
+
+.. code-block:: text
+
+	+--ml-1m
+		+--- movies.dat 	# 电影特征
+		+--- ratings.dat 	# 评分
+		+--- users.dat 		# 用户特征
+		+--- README 		# 数据集描述
+
+字段配置文件
+''''''''''
+**字段配置文件** 用来具体说明数据集的字段和文件格式，
+例如，说明每个特征文件具体字段是 **什么** 类型。
+
+ml-1m的字段配置文件在目录 :code:`demo/recommendation/data/config.json` 中。
+其具体说明了字段类型和文件名称:
+
+1) 用户文件中有四种类型的字段\: 编号，性别，年龄和职业；
+
+2) 文件名称为"users.dat"，文件的分隔符为"::"。
+
+.. include:: ../../../demo/recommendation/data/config.json
+   :code: json
+   :literal:
+
+准备数据
+```````
+你需要安装python的第三方库。
+**强烈推荐使用VIRTUALENV来创造一个干净的python环境。**
+
+.. code-block:: bash
+
+	pip install -r requirements.txt
+
+预处理数据一般的命令为:
+
+.. code-block:: bash
+
+	cd demo/recommendation
+	./preprocess.sh
+
+下面介绍预处理过程具体的步骤。
+
+提取电影或用户的特征并生成python对象
+''''''''''''''''''''''''''''''''
+
+在movielens 1m数据集中，电影和用户有许多的特征。
+评分文件的每一行仅仅提供电影或用户的编号来代表相应的电影或用户。
+我们首先处理电影或用户的特征文件，然后用pickle命令将特征( **Meta** )对象存储为文件。
+
+Meta配置文件
+...........
+
+**Meta配置文件** 用来具体描述 **如何** 解析数据集中的每一个字段。
+该文件可以从字段配置文件生成，或是手动编辑生成。文件的格式可以
+为json或yaml格式。解析器能通过文件的扩展名自动识别文件的格式。
+
+要将字段配置文件转化为meta配置文件，只需要运行：
+
+.. code-block:: bash
+
+	cd demo/recommendation/data
+	python config_generator.py config.json > meta_config.json
+
+生成的meta配置文件如下所示：
+
+.. include:: ../../../demo/recommendation/data/meta_config.json
+	:code: json
+	:literal:
+
+在meta文件中有两种特征\: 电影和用户。
+
+* 在电影文件movies.dat中
+	* 我们仅用"::"来分隔每一行
+	* pos 0 代表编号
+	* pos 1 特征：
+		* name是电影名
+		* 利用正则表达式来解析该特征
+		* 基于字母的词嵌入特征
+		* 是序列
+	* pos 2 特征：
+		* name是体裁
+		* type是one hot稠密向量
+		* dictionary由解析自动生成，每一个key由'|'分隔
+* 在用户文件users.dat中
+	* 我们仅用"::"来分隔每一行
+	* pos 0 代表编号
+	* pos 1 特征：
+		* name是性别
+		* 简单的基于字母的词嵌入
+	* pos 2 特征：
+		* name是年龄
+		* 是整个的词嵌入
+		* 嵌入编号会根据单词排序
+	* pos 3 特征：
+		* name是职业
+		* 简单的整个词嵌入
+
+
+Meta文件
+''''''''
+
+有了meta配置文件之后，我们可以生成 **Meta文件** ，该文件是python的pickle对象，
+存储着电影或用户信息。可以运行下面的命令来生成。
+
+.. code-block:: bash
+
+	python meta_generator.py ml-1m meta.bin --config=meta_config.json
+
+meta文件 :code:`meta.bin` 的结构如下：
+
+.. code-block:: text
+
+    +--+ movie
+    |      +--+ __meta__
+    |      |       +--+ raw_meta  # 每个特征的meta配置。列表
+    |      |       |       +
+    |      |       |       |     # 编号字段，我们用编号作为key 
+    |      |       |       +--+ {'count': 3883, 'max': 3952, 'is_key': True, 'type': 'id', 'min': 1}
+    |      |       |       |
+    |      |       |       |     # 电影名字段，嵌入特征字典
+    |      |       |       +--+ {'dict': [ ... ], 'type': 'embedding', 'name': 'title', 'seq': 'sequence'}
+    |      |       |       |
+    |      |       |       |     # 体裁字段，体裁字典
+    |      |       |       +--+ {'dict': [ ... ], 'type': 'one_hot_dense', 'name': 'genres'}
+    |      |       |
+    |      |       +--+ feature_map [1, 2] # a list for raw_meta index for feature field.
+    |      |                               # it means there are 2 features for each key.
+    |      |                               #    * 0 offset of feature is raw_meta[1], Title.
+    |      |                               #    * 1 offset of feature is raw_meta[2], Genres.
+    |      |
+    |      +--+ 1 # 电影1的特征
+    |      |    +
+    |      |    +---+ [[...], [...]] # title ids, genres dense vector
+    |      |
+    |      +--+ 2
+    |      |
+    |      +--+ ...
+    |
+    +--- user
+           +--+ __meta__
+           |       +
+           |       +--+ raw_meta
+           |       |       +
+           |       |       +--+ id field as user
+           |       |       |
+           |       |       +--+ {'dict': ['F', 'M'], 'type': 'embedding', 'name': 'gender', 'seq': 'no_sequence'}
+           |       |       |
+           |       |       +--+ {'dict': ['1', '18', '25', '35', '45', '50', '56'], 'type': 'embedding', 'name': 'age', 'seq': 'no_sequence'}
+           |       |       |
+           |       |       +--+ {'dict': [...], 'type': 'embedding', 'name': 'occupation', 'seq': 'no_sequence'}
+           |       |
+           |       +--+ feature_map [1, 2, 3]
+           |
+           +--+ 1 # 用户1的特征
+           |
+           +--+ 2
+           +--+ ...
+
+
+分割训练/测试文件
+'''''''''''''''
+
+我们将 :code:`ml-1m/ratings.dat` 文件分割为训练和测试文件。分割文件的方法是：对于每位用户，我们将评分分成两部分。
+这样的话每位用户在测试文件中将与训练文件含有同样的信息。
+
+用 :code:`separate.py` 来分离训练和测试文件。
+
+.. code-block:: bash
+
+	python split.py ml-1m/ratings.dat --delimiter="::" --test_ratio=0.1
+
+这样就会生成两个文件：:code:`ml-1m/ratings.dat.train` 和 :code:`ml-1m/ratings.data.test` 。
+将他们移动到目录 :code:`data` ，然后进行随机打乱，再为paddle的训练过程提供文件列表。
+
+..  code-block:: bash
+
+    shuf ml-1m/ratings.dat.train > ratings.dat.train
+    cp ml-1m/ratings.dat.test .
+    echo "./data/ratings.dat.train" > train.list
+    echo "./data/ratings.dat.test" > test.list
+
+
+神经网络结构配置
+``````````````
+
+训练器配置文件
+''''''''''''
+
+网络结构如下图所示：
+
+..  image:: rec_regression_network.png
+    :align: center
+    :alt: rec_regression_network
+
+该示例的神经网络配置文件 :code:`trainer_config.py` 如下所示：
+
+..  literalinclude:: ../../../demo/recommendation/trainer_config.py
+    :language: python
+    :lines: 15-
+
+在文件 :code:`trainer_config.py` 中，我们仅仅是将每个特征种类映射到一个特征向量中，以下
+展示了如何将每个特征映射到一个向量。
+
+* :code:`id` \: 仅仅是简单的嵌入，然后添加一个全连接层。
+* :code:`embedding` \:
+    - 如果是序列，则先做嵌入，然后再做一次文本卷积网络操作，
+      然后得到平均采样的结果。
+    - 如果不是序列，则先做嵌入，然后添加一个全连接层。
+* :code:`one_host_dense` \:
+    - 仅仅是两个全连接层。
+
+然后我们利用多输入的:code:`fc_layer` 全连接层将电影的每个特征结合成一个电影特征，
+并且对用户的特征做同样的操作，也得到一个用户特征。然后我们求这两个特征的余弦相似度。
+
+在这些网络中，我们用以下的一些:ref:`api_trainer_config` 中的接口。
+
+*  数据层， :ref:`api_trainer_config_helpers_layers_data_layer`
+*  全连接层， :ref:`api_trainer_config_helpers_layers_fc_layer`
+*  嵌入层， :ref:`api_trainer_config_helpers_layers_embedding_layer`
+*  文本投影层， :ref:`api_trainer_config_helpers_layers_context_projection`
+*  采样层， :ref:`api_trainer_config_helpers_layers_pooling_layer`
+*  余弦相似度层， :ref:`api_trainer_config_helpers_layers_cos_sim`
+*  文本卷积采样层， :ref:`api_trainer_config_helpers_network_text_conv_pool`
+*  声明Python数据源， :ref:`api_trainer_config_helpers_data_sources` 
+
+数据提供脚本
+'''''''''''
+
+..  literalinclude:: ../../../demo/recommendation/dataprovider.py
+    :language: python
+    :lines: 15-
+
+数据提供脚本仅仅是读取meta.bin和评分文件，生成训练需要的样本。
+在脚本 :code:`dataprovider.py` 中，我们需要设置：
+
+* obj.slots\: 特征的类型和维度。
+* use_seq\: :code:`dataprovider.py` 中的数据是否为序列模式。
+* process\: 返回数据的每一条样本给 :code:`paddle` 。
+
+数据提供脚本的细节文档可以参考 :ref:`api_pydataprovider` 。
+
+训练
+````
+
+准备好数据，配置了网络，编写好数据提供脚本后，现在我们可以开始paddle训练了。
+
+代码 :code:`run.sh` 如下：
+
+..  literalinclude:: ../../../demo/recommendation/run.sh
+    :language: bash
+    :lines: 16-
+
+该脚本仅仅是开始一个paddle训练过程，将日志写入文件 :code:`log.txt` ，然后
+打印在屏幕上。
+
+脚本 :code:`run.sh` 中的每一行命令，请参考页面 :ref:`cmd_line_index` 。
+这些参数的简短介绍如下：
+
+*  config\: 告诉paddle哪个文件是神经网络的配置文件。
+*  save_dir\: 告诉paddle将模型保存在: code:`./output` 中。
+*  use_gpu\: 是否使用GPU，默认为不使用。
+*  trainer_count\: 一台机器上面的线程数量。
+*  test_all_data_in_one_period\: 每一个测试周期测试一次所有数据。否则，
+   每个测试周期测试: code:`batch_size` 批次的数据。
+*  log_period\: 在训练了: code:`log_period` 批次后打印日志。
+*  dot_period\: 在每训练: code:`dot_period` 个批次后打印一个 :code:`.` 。
+*  num_passes\: 训练至多: code:`num_passes` 轮。
+
+如果训练过程启动成功的话，输出应该类似如下：
+
+..  code-block:: text
+
+    I0601 08:07:22.832059 10549 TrainerInternal.cpp:157]  Batch=100 samples=160000 AvgCost=4.13494 CurrentCost=4.13494 Eval:  CurrentEval:
+
+    I0601 08:07:50.672627 10549 TrainerInternal.cpp:157]  Batch=200 samples=320000 AvgCost=3.80957 CurrentCost=3.48421 Eval:  CurrentEval:
+
+    I0601 08:08:18.877369 10549 TrainerInternal.cpp:157]  Batch=300 samples=480000 AvgCost=3.68145 CurrentCost=3.42519 Eval:  CurrentEval:
+
+    I0601 08:08:46.863963 10549 TrainerInternal.cpp:157]  Batch=400 samples=640000 AvgCost=3.6007 CurrentCost=3.35847 Eval:  CurrentEval:
+
+    I0601 08:09:15.413025 10549 TrainerInternal.cpp:157]  Batch=500 samples=800000 AvgCost=3.54811 CurrentCost=3.33773 Eval:  CurrentEval:
+    I0601 08:09:36.058670 10549 TrainerInternal.cpp:181]  Pass=0 Batch=565 samples=902826 AvgCost=3.52368 Eval:
+    I0601 08:09:46.215489 10549 Tester.cpp:101]  Test samples=97383 cost=3.32155 Eval:
+    I0601 08:09:46.215966 10549 GradientMachine.cpp:132] Saving parameters to ./output/model/pass-00000
+    I0601 08:09:46.233397 10549 ParamUtil.cpp:99] save dir ./output/model/pass-00000
+    I0601 08:09:46.233438 10549 Util.cpp:209] copy trainer_config.py to ./output/model/pass-00000
+    I0601 08:09:46.233541 10549 ParamUtil.cpp:147] fileName trainer_config.py
+
+模型被保存在 :code:`output/` 目录中。你可以在任何时候用 :code:`Ctrl-C` 来停止训练。
+
+模型评估和预测
+````````````
+
+在训练了几个轮次以后，你可以对模型进行评估，得到最好轮次下的模型。运行下面命令即可：
+
+.. code-block:: bash
+
+    ./evaluate.sh 
+
+你将看到如下的信息：
+
+.. code-block:: text
+
+    Best pass is 00009,  error is 3.06949, which means predict get error as 0.875998002281
+    evaluating from pass output/pass-00009
+
+然后，你可以预测任何用户对于任何一部电影的评价，运行下面命令即可：
+
+..  code-block:: bash
+
+    python prediction.py 'output/pass-00009/'
+
+预测程序将读取用户的输入，然后输出预测分数。用户预测的命令行界面如下：
+
+..  code-block:: text
+
+    Input movie_id: 9
+    Input user_id: 4
+    Prediction Score is 2.56
+    Input movie_id: 8
+    Input user_id: 2
+    Prediction Score is 3.13
diff --git a/doc/tutorials/rec/ml_regression_en.rst b/doc/tutorials/rec/ml_regression_en.rst
index 0c14e4f5bb7f815a06c0c756b1a6e6ef9099fd66..993b9a516f134ff8b59e8755b721f76c8f32f0fd 100644
--- a/doc/tutorials/rec/ml_regression_en.rst
+++ b/doc/tutorials/rec/ml_regression_en.rst
@@ -16,7 +16,7 @@ Data Preparation
 ````````````````
 Download and extract dataset
 ''''''''''''''''''''''''''''
-We use `movielens 1m dataset <ml_dataset.html>`_ here. 
+We use :ref:`demo_ml_dataset` here. 
 To download and unzip the dataset, simply run the following commands.
 
 ..  code-block:: bash
@@ -36,7 +36,7 @@ And the directory structure of :code:`demo/recommendation/data/ml-1m` is:
 
 Field config file
 '''''''''''''''''
-**Field config file** is used to specific the fields dataset and file format,
+**Field config file** is used to specify the fields of the dataset and the file format,
 i.e, specific **WHAT** type it is in each feature file.
 
 The field config file of ml-1m shows in :code:`demo/recommendation/data/config.json`.
@@ -188,7 +188,7 @@ Split Training/Testing files
 We split :code:`ml-1m/ratings.dat` into a training and testing file. The way to split file is for each user, we split the
 rating by two parts. So each user in testing file will have some rating information in training file.
 
-Use separate.py to separate the training and testing file.
+Use :code:`separate.py` to separate the training and testing file.
 
 ..  code-block:: bash
 
@@ -217,7 +217,7 @@ The network structure shows below.
     :align: center
     :alt: rec_regression_network
 
-The demo's neural network config file "trainer_config.py" show as below.
+The demo's neural network config file :code:`trainer_config.py` show as below.
 
 ..  literalinclude:: ../../../demo/recommendation/trainer_config.py
     :language: python
@@ -239,26 +239,16 @@ Then we combine each features of movie into one movie feature by a
 get one user feature. Then we calculate the cosine similarity of these two
 features.
 
-In these network, we use several api in `trainer_config_helpers
-<../../ui/api/trainer_config_helpers/index.html>`_. There are
-
-*  Data Layer, `data_layer 
-   <../../ui/api/trainer_config_helpers/layers.html#id1>`_
-*  Fully Connected Layer, `fc_layer
-   <../../ui/api/trainer_config_helpers/layers.html#fc-layer>`_
-*  Embedding Layer, `embedding_layer
-   <../../ui/api/trainer_config_helpers/layers.html#embedding-layer>`_
-*  Context Projection Layer, `context_projection
-   <../../ui/api/trainer_config_helpers/layers.html#context-projection>`_
-*  Pooling Layer, `pooling_layer
-   <../../ui/api/trainer_config_helpers/layers.html#pooling-layer>`_
-*  Cosine Similarity Layer, `cos_sim
-   <../../ui/api/trainer_config_helpers/layers.html#cos-sim>`_
-*  Text Convolution Pooling Layer, `text_conv_pool
-   <../../ui/api/trainer_config_helpers/networks.html
-   #trainer_config_helpers.networks.text_conv_pool>`_
-*  Declare Python Data Sources, `define_py_data_sources2
-   <../../ui/api/trainer_config_helpers/data_sources.html>`_
+In these networks, we use several APIs in :ref:`api_trainer_config` . There are
+
+*  Data Layer, :ref:`api_trainer_config_helpers_layers_data_layer`
+*  Fully Connected Layer, :ref:`api_trainer_config_helpers_layers_fc_layer`
+*  Embedding Layer, :ref:`api_trainer_config_helpers_layers_embedding_layer`
+*  Context Projection Layer, :ref:`api_trainer_config_helpers_layers_context_projection`
+*  Pooling Layer, :ref:`api_trainer_config_helpers_layers_pooling_layer`
+*  Cosine Similarity Layer, :ref:`api_trainer_config_helpers_layers_cos_sim`
+*  Text Convolution Pooling Layer, :ref:`api_trainer_config_helpers_network_text_conv_pool`
+*  Declare Python Data Sources :ref:`api_trainer_config_helpers_data_sources`.
 
 Data Provider
 '''''''''''''
@@ -274,27 +264,26 @@ In this :code:`dataprovider.py`, we should set\:
 * use_seq\: Whether this :code:`dataprovider.py` in sequence mode or not.
 * process\: Return each sample of data to :code:`paddle`.
 
-The data provider details document see `there <../../ui/data_provider/pydataprovider2.html>`_.
+The data provider details document see :ref:`api_pydataprovider2`.
 
 Train
 `````
 
 After prepare data, config network, writting data provider, now we can run paddle training.
 
-The run.sh is shown as follow:
+The :code:`run.sh` is shown as follow:
 
 ..  literalinclude:: ../../../demo/recommendation/run.sh
     :language: bash
     :lines: 16-
 
-It just start a paddle training process, write the log to `log.txt`,
+It just start a paddle training process, write the log to :code:`log.txt`,
 then print it on screen.
 
-Each command line argument in :code:`run.sh`, please refer to the `command line
-arguments <../../ui/index.html#command-line-argument>`_ page. The short description of these arguments is shown as follow.
+Each command line argument in :code:`run.sh`, please refer to the :ref:`cmd_line_index` page. The short description of these arguments is shown as follow.
 
 *  config\: Tell paddle which file is neural network configuration.
-*  save_dir\: Tell paddle save model into './output'
+*  save_dir\: Tell paddle save model into :code:`./output`.
 *  use_gpu\: Use gpu or not. Default is false.
 *  trainer_count\: The compute thread in one machine.
 *  test_all_data_in_one_period\: Test All Data during one test period. Otherwise,
diff --git a/doc/tutorials/semantic_role_labeling/index_cn.md b/doc/tutorials/semantic_role_labeling/index_cn.md
index c7e0a78f5071ed0d1702036f4ee0af3881096c68..f6061766c038a7bb6e4ae376685a10cd5669d2ed 100644
--- a/doc/tutorials/semantic_role_labeling/index_cn.md
+++ b/doc/tutorials/semantic_role_labeling/index_cn.md
@@ -149,7 +149,7 @@ paddle train \
 
 训练后，模型将保存在目录`output`中。 我们的训练曲线如下：
 <center>
-![pic](./curve.jpg)
+![pic](./src/curve.jpg)
 </center>
 
 ### 测试
diff --git a/doc/tutorials/semantic_role_labeling/index_en.md b/doc/tutorials/semantic_role_labeling/index_en.md
index f5bdf64487aa189cefcd55d633cc6638912b9e31..92d7c634832119c718711a57c16f69492d405f28 100644
--- a/doc/tutorials/semantic_role_labeling/index_en.md
+++ b/doc/tutorials/semantic_role_labeling/index_en.md
@@ -1,3 +1,7 @@
+```eval_rst
+..  _semantic_role_labeling:
+```
+
 # Semantic Role labeling Tutorial #
 
 Semantic role labeling (SRL) is a form of shallow semantic parsing whose goal is to discover the predicate-argument structure of each predicate in a given input sentence. SRL is useful as an intermediate step in a wide range of natural language processing tasks, such as information extraction. automatic document categorization and question answering.  An instance is as following [1]:
@@ -41,13 +45,13 @@ Unlike Bidirectional-LSTM that used in Sentiment Analysis demo,  the DB-LSTM ado
 
 The following figure shows a temporal expanded 2-layer DB-LSTM network.
 <center>
-![pic](./network_arch.png)
+![pic](./src/network_arch.png)
 </center>
 
 ### Features
 Two input features play an essential role in this pipeline: predicate (pred) and argument (argu). Two other features: predicate context (ctx-p) and region mark (mr) are also adopted. Because a single predicate word can not exactly describe the predicate information, especially when the same words appear more than one times in a sentence. With the predicate context, the ambiguity can be largely eliminated. Similarly, we use region mark m<sub>r</sub> = 1 to denote the argument position if it locates in the predicate context region, or m<sub>r</sub> = 0 if does not. These four simple features are all we need for our SRL system. Features of one sample with context size set to 1 is showed as following[2]:
 <center>
-![pic](./feature.jpg)
+![pic](./src/feature.jpg)
 </center>
 
 In this sample, the coresponding labelled sentence is:
@@ -148,7 +152,7 @@ paddle train \
 
 After training, the models  will be saved in directory `output`. Our training curve is as following:
 <center>
-![pic](./curve.jpg)
+![pic](./src/curve.jpg)
 </center>
 
 ### Run testing
diff --git a/doc/tutorials/semantic_role_labeling/semantic_role_labeling_cn.md b/doc/tutorials/semantic_role_labeling/semantic_role_labeling_cn.md
deleted file mode 100644
index f3c855a9fd72b894ab69050b08c750fe9e4aa1a2..0000000000000000000000000000000000000000
--- a/doc/tutorials/semantic_role_labeling/semantic_role_labeling_cn.md
+++ /dev/null
@@ -1,201 +0,0 @@
-# 语义角色标注教程 #
-
-语义角色标注（Semantic role labeling, SRL）是浅语义解析的一种形式，其目的是在给定的输入句子中发现每个谓词的谓词参数结构。 SRL作为很多自然语言处理任务中的中间步骤是很有用的，如信息提取、文档自动分类和问答。 实例如下 [1]:
-
- [ <sub>A0</sub> 他 ] [ <sub>AM-MOD</sub> 将 ][ <sub>AM-NEG</sub> 不会 ] [ <sub>V</sub> 接受] [ <sub>A1</sub> 任何东西 ] 从 [<sub>A2</sub> 那些他写的东西中 ]。
-
-- V: 动词
-- A0: 接受者
-- A1: 接受的东西
-- A2: 从……接受
-- A3: 属性
-- AM-MOD: 情态动词 
-- AM-NEG: 否定
-
-给定动词“接受”，句子中的大部分将会扮演某些语义角色。这里，标签方案来自 Penn Proposition Bank。
-
-到目前为止，大多数成功的SRL系统是建立在某种形式的解析结果之上的，其中在语法结构上使用了预先定义的特征模板。 本教程将介绍使用深度双向长短期记忆（DB-LSTM）模型[2]的端到端系统来解决SRL任务，这在很大程度上优于先前的最先进的系统。 这个系统将SRL任务视为序列标记问题。
-
-## 数据描述
-相关论文[2]采用 CoNLL-2005＆2012 共享任务中设置的数据进行训练和测试。根据数据许可证，演示采用 CoNLL-2005 的测试数据集，可以在网站上找到。
-
-用户只需执行以下命令就可以下载并处理原始数据：
-
-```bash
-cd data
-./get_data.sh
-```
-`data `目录会出现如下几个新的文件：
-```bash
-conll05st-release：the test data set of CoNll-2005 shared task 
-test.wsj.words：the Wall Street Journal data sentences
-test.wsj.props:  the propositional arguments
-feature: the extracted features from data set
-```
-
-## 训练
-### DB-LSTM
-请参阅情绪分析的演示以了解有关长期短期记忆单元的更多信息。
-
-与在 Sentiment Analysis 演示中使用的 Bidirectional-LSTM 不同，DB-LSTM 采用另一种方法来堆叠LSTM层。首先，标准LSTM以正向处理该序列。该 LSTM 层的输入和输出作为下一个 LSTM 层的输入，并被反向处理。这两个标准 LSTM 层组成一对 LSTM。然后我们堆叠一对对的 LSTM 层后得到深度 LSTM 模型。
-
-下图展示了时间扩展的2层 DB-LSTM 网络。
-<center>
-![pic](./network_arch.png)
-</center>
-
-### 特征
-两个输入特性在这个管道中起着至关重要的作用：predicate（pred）和argument（arguments）。 还采用了两个其他特征：谓词上下文（ctx-p）和区域标记（mr）。 因为单个谓词不能精确地描述谓词信息，特别是当相同的词在句子中出现多于一次时。 使用谓词上下文，可以在很大程度上消除歧义。类似地，如果它位于谓词上下文区域中，则使用区域标记 m<sub>r</sub> = 1 来表示参数位置，反之则 m<sub>r</sub> = 0。这四个简单的特征是我们的SRL系统所需要的。上下文大小设置为1的一个样本的特征如下[2]所示：
-<center>
-![pic](./feature.jpg)
-</center>
-
-在这个示例中，相应的标记句子是：
-
-[ <sub>A1</sub> A record date ] has [ <sub>AM-NEG</sub> n't ] been [ <sub>V</sub> set ] . 
-
-在演示中, 我们采用上面的特征模板, 包括：  `argument`, `predicate`, `ctx-p (p=-1,0,1)`, `mark` 并使用 `B/I/O` 方案来标记每个参数。这些特征和标签存储在 `feature` 文件中, 用`\t`分割。
-
-### 数据提供
-
-`dataprovider.py` 是一个包装数据的 Python 文件。 函数 `hook()` 定义了网络的数据槽。六个特征和标签都是索引槽。
-```
-def hook(settings, word_dict, label_dict, **kwargs):
-    settings.word_dict = word_dict
-    settings.label_dict = label_dict
-    #all inputs are integral and sequential type
-    settings.slots = [
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(predicate_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(len(word_dict)),
-        integer_value_sequence(2),
-        integer_value_sequence(len(label_dict))]
-```
-相应的数据迭代器如下：
-```
-@provider(init_hook=hook, should_shuffle=True, calc_batch_size=get_batch_size,
-          can_over_batch_size=False, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, file_name):
-    with open(file_name, 'r') as fdata:
-        for line in fdata:
-            sentence, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2,  mark, label = \
-                line.strip().split('\t')
-
-            words = sentence.split()
-            sen_len = len(words)
-            word_slot = [settings.word_dict.get(w, UNK_IDX) for w in words]
-
-            predicate_slot = [settings.predicate_dict.get(predicate)] * sen_len
-            ctx_n2_slot = [settings.word_dict.get(ctx_n2, UNK_IDX)] * sen_len
-            ctx_n1_slot = [settings.word_dict.get(ctx_n1, UNK_IDX)] * sen_len
-            ctx_0_slot = [settings.word_dict.get(ctx_0, UNK_IDX)] * sen_len
-            ctx_p1_slot = [settings.word_dict.get(ctx_p1, UNK_IDX)] * sen_len
-            ctx_p2_slot = [settings.word_dict.get(ctx_p2, UNK_IDX)] * sen_len
-
-            marks = mark.split()
-            mark_slot = [int(w) for w in marks]
-
-            label_list = label.split()
-            label_slot = [settings.label_dict.get(w) for w in label_list]
-            yield word_slot, predicate_slot, ctx_n2_slot, ctx_n1_slot, \
-                  ctx_0_slot, ctx_p1_slot, ctx_p2_slot, mark_slot, label_slot
-```
-函数 `process` 产出有8个特征和标签的9个表。
-
-### 神经网络配置
-
-`db_lstm.py` 是在训练过程中加载字典并定义数据提供程序模块和网络架构的神经网络配置文件。
-
-九个 `data_layer` 从数据提供程序加载实例。八个特征分别转换为嵌入，并由`mixed_layer`混合。 深度双向LSTM层提取softmax层的特征。目标函数是标签的交叉熵。
-
-### 训练 
-训练的脚本是 `train.sh`，用户只需执行:
-```bash
-  ./train.sh
-```
-`train.sh` 中的内容：
-```
-paddle train \
-  --config=./db_lstm.py \
-  --use_gpu=0 \
-  --log_period=5000 \
-  --trainer_count=1 \
-  --show_parameter_stats_period=5000 \
-  --save_dir=./output \
-  --num_passes=10000 \
-  --average_test_period=10000000 \
-  --init_model_path=./data \
-  --load_missing_parameter_strategy=rand \
-  --test_all_data_in_one_period=1 \
-2>&1 | tee 'train.log'
-```
-
--  \--config=./db_lstm.py : 网络配置文件
--  \--use_gpu=false: 使用 CPU 训练（如果已安装 PaddlePaddle GPU版本并想使用 GPU 训练可以设置为true，目前 crf_layer 不支持 GPU）
--  \--log_period=500: 每20批(batch)输出日志
--  \--trainer_count=1: 设置线程数（或 GPU 数）
--  \--show_parameter_stats_period=5000: 每100批显示参数统计
--  \--save_dir=./output: 模型输出路径
--  \--num_passes=10000: 设置通过数，一次通过意味着PaddlePaddle训练数据集中的所有样本一次
--  \--average_test_period=10000000:  每个 average_test_period 批次对平均参数进行测试
--  \--init_model_path=./data: 参数初始化路径
--  \--load_missing_parameter_strategy=rand: 随机初始不存在的参数
--  \--test_all_data_in_one_period=1: 在一个周期内测试所有数据
-
-
-训练后，模型将保存在目录`output`中。 我们的训练曲线如下：
-<center>
-![pic](./curve.jpg)
-</center>
-
-### 测试
-测试脚本是 `test.sh`, 执行:
-```bash
-  ./test.sh
-```
-`tesh.sh` 的主要部分：
-```
-paddle train \
-  --config=./db_lstm.py \
-  --model_list=$model_list \
-  --job=test \
-  --config_args=is_test=1 \
-```
-
-  - \--config=./db_lstm.py: 网络配置文件
-  - \--model_list=$model_list.list: 模型列表文件
-  - \--job=test: 指示测试任务
-  - \--config_args=is_test=1: 指示测试任务的标记
-  - \--test_all_data_in_one_period=1: 在一个周期内测试所有数据
-  
-
-### 预测
-预测脚本是 `predict.sh`，用户只需执行：
-```bash
-  ./predict.sh
-  
-```
-在`predict.sh`中，用户应该提供网络配置文件，模型路径，标签文件，字典文件，特征文件。
-```
-python predict.py 
-     -c $config_file \
-     -w $best_model_path \
-     -l $label_file \
-     -p $predicate_dict_file  \
-     -d $dict_file \
-     -i $input_file \
-     -o $output_file
-```
-
-`predict.py` 是主要的可执行python脚本，其中包括函数：加载模型，加载数据，数据预测。网络模型将输出标签的概率分布。 在演示中，我们使用最大概率的标签作为结果。用户还可以根据概率分布矩阵实现集束搜索或维特比解码。
-
-预测后，结果保存在 `predict.res` 中。
-
-## 引用
-[1] Martha Palmer, Dan Gildea, and Paul Kingsbury. The Proposition Bank: An Annotated Corpus of Semantic Roles , Computational Linguistics, 31(1), 2005. 
-
-[2] Zhou, Jie, and Wei Xu. "End-to-end learning of semantic role labeling using recurrent neural networks." Proceedings of the Annual Meeting of the Association for Computational Linguistics. 2015.
diff --git a/doc/tutorials/semantic_role_labeling/curve.jpg b/doc/tutorials/semantic_role_labeling/src/curve.jpg
similarity index 100%
rename from doc/tutorials/semantic_role_labeling/curve.jpg
rename to doc/tutorials/semantic_role_labeling/src/curve.jpg
diff --git a/doc/tutorials/semantic_role_labeling/src/feature.jpg b/doc/tutorials/semantic_role_labeling/src/feature.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..0e3310e4ace5613917e7779d3198ccbb3cdc5ada
Binary files /dev/null and b/doc/tutorials/semantic_role_labeling/src/feature.jpg differ
diff --git a/doc/tutorials/semantic_role_labeling/src/network_arch.png b/doc/tutorials/semantic_role_labeling/src/network_arch.png
new file mode 100644
index 0000000000000000000000000000000000000000..4ae7864212f2a0a38102ee7ff600527ea99fec82
Binary files /dev/null and b/doc/tutorials/semantic_role_labeling/src/network_arch.png differ
diff --git a/doc_cn/demo/sentiment_analysis/sentiment_analysis.md b/doc/tutorials/sentiment_analysis/index_cn.md
similarity index 93%
rename from doc_cn/demo/sentiment_analysis/sentiment_analysis.md
rename to doc/tutorials/sentiment_analysis/index_cn.md
index b70f2d59675615c26b29932cdf99d728bb206148..1323ec1a6abb2e7b5eeb2fbfff9cce5fe78a2c06 100644
--- a/doc_cn/demo/sentiment_analysis/sentiment_analysis.md
+++ b/doc/tutorials/sentiment_analysis/index_cn.md
@@ -1,324 +1,325 @@
-# 情感分析教程
-
-情感分析有许多应用场景。 一个基本的应用场景是区分给定文本的褒贬两极性，给定的文本可以是一个文档、句子、或者是一个小的文本片段。 一个简单的例子如：把用户在购物网站、旅游网站、团购网站（亚马逊、天猫、淘宝等）上发表的评论分成正面评论和负面评论两类。
-
-情感分析也常用于基于大量评论和个人博客来监控社会媒体。 例如，研究人员分析了几个关于消费者信心和政治观点的调查，结果发现它们与同时期的Twitter消息中的情绪词频率相关 [1]。 另一个例子是通过分析每日Twitter博客的文本内容来预测股票变动 [2]。
-
-另一方面，抓取产品的用户评论并分析他们的情感，有助于理解用户对不同公司，不同产品，甚至不同竞争对手产品的偏好。
-
-本教程将指导您完成长期短期记忆（LSTM）网络的训练过程，以分类来自[大型电影评论数据集](http://ai.stanford.edu/~amaas/data/sentiment/)（有时称为[互联网电影数据库 (IMDB)](http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf)）的句子的情感 。 此数据集包含电影评论及其相关联的类别标签，即正面和负面。
-
-## 数椐准备
-
-### IMDB 数椐介绍
-
-训练模型之前, 我们需要预处理数椐并构建一个字典。 首先, 你可以使用下面的脚本下载 IMDB 数椐集和[Moses](http://www.statmt.org/moses/)工具, 这是一个基于统计的机器翻译系统. 我们提供了一个数据预处理脚本，它不仅能够处理IMDB数据，还能处理其他用户自定义的数据。 为了使用提前编写的脚本，需要将标记的训练和测试样本移动到另一个路径，这已经在`get_imdb.sh`中完成。
-
-```
-cd demo/sentiment/data
-./get_imdb.sh
-```
-如果数椐获取成功，你将在目录```./demo/sentiment/data```中看到下面的文件：
-
-```
-aclImdb  get_imdb.sh  imdb  mosesdecoder-master
-```
-
-* aclImdb: 从外部网站上下载的原始数椐集。
-* imdb: 仅包含训练和测试数椐集。
-* mosesdecoder-master: Moses 工具。
-
-IMDB数据集包含25,000个已标注过的高极性电影评论用于训练，25,000个用于测试。负面的评论的得分小于等于4,正面的评论的得大于等于7，总评分10分。 运行完脚本 `./get_imdb.sh`后, 我们可以看到在目录 `aclImdb`中的数椐集的结构如下：
-
-```
-imdbEr.txt  imdb.vocab  README  test  train
-```
-* train: 训练数椐集。
-* test : 测试数椐集。
-* imdb.vocab: 字典文件。
-* imdbEr.txt: 字典imdb.vocab中每个切分单词的预期评级。
-* README: 数椐说明文档。
-
-测试集和训练集目录包含下面的文件:
-
-```
-labeledBow.feat  neg  pos  unsup  unsupBow.feat  urls_neg.txt  urls_pos.txt  urls_unsup.txt
-```
-
-* pos: 正面评价样本，包含12,500个txt文件，每个文件是一个电影评论。
-* neg: 负面评价样本，包含12,500个txt文件，每个文件是一个电影评论。
-* unsup: 未标记的评价样本，包含50,000个txt文件。
-* urls_xx.txt: 每个评论的网址。
-* xxBow.feat: 用于统计词频的Bow模型特征。
-
-### IMDB 数椐准备
-
-在这个例子中，我们只使用已经标注过的训练集和测试集，且默认在训练集上构建字典，而不使用IMDB数椐集中的imdb.vocab做为字典。训练集已经做了随机打乱排序而测试集没有。 Moses 工具中的脚本`tokenizer.perl` 用于切分单单词和标点符号。执行下面的命令就可以预处理数椐。
-
-```
-cd demo/sentiment/
-./preprocess.sh
-```
-preprocess.sh:
-
-```
-data_dir="./data/imdb"
-python preprocess.py -i data_dir
-```
-
-* data_dir: 输入数椐所在目录。
-* preprocess.py: 预处理脚本。
-
-运行成功后目录`demo/sentiment/data/pre-imdb` 结构如下:
-
-```
-dict.txt  labels.list  test.list  test_part_000  train.list  train_part_000
-```
-* test\_part\_000 and train\_part\_000: 所有标记的测试集和训练集， 训练集已经随机打乱。
-* train.list and test.list: 训练集和测试集文件列表。
-* dict.txt: 利用训练集生成的字典。
-* labels.txt: neg  0, pos 1, 含义：标签0表示负面的评论，标签1表示正面的评论。
-
-### 用户自定义数椐预处理
-
-如果你执行其它的用情感分析来分类文本的任务，可以按如下的结构来准备数椐. 我们提供了脚本来构建字典和预处理数椐。所以你只用按下面的结构来组织数椐就行了。
-
-```
-dataset
-|----train
-|    |----class1
-|    |    |----text_files
-|    |----class2
-|    |    |----text_files
-|    |    ...
-|----test
-|    |----class1
-|    |    |----text_files
-|    |----class2
-|    |    |----text_files
-|    |    ...
-```
-* dataset: 一级目录。
-* train, test: 二级目录。
-* class1,class2,...: 三级目录。
-* text_files: 文本格式的实例文件。
-
-所有同目录下的文本实例文件都是同级别的。 每个文本文件包含一个或者多个实例，每一行表示一个实例。 为了充分的随机打乱训练集, 在预处理含有多行数椐的文本文件时参数设置稍有不同, 执行`preprocess.sh`脚本时需要加上`-m True`参数。 tokenizer.perl 默认用来切分单记和标点符号，如果你不需要这个操作，在运行`preprocess.sh`时加上`-t False`参数即可。
-
-## 训练模型
-
-在这步任务中,我们使用了循环神经网络（RNN）的 LSTM 架构来训练情感分析模型。 引入LSTM模型主要是为了克服消失梯度的问题。 LSTM网络类似于具有隐藏层的标准循环神经网络, 但是隐藏层中的每个普通节点被一个记忆单元替换。 每个记忆单元包含四个主要的元素: 输入门, 具有自循环连接的神经元，忘记门和输出门。 更多的细节可以在文献中找到[4]。 LSTM架构的最大优点是它可以在长时间间隔内记忆信息，而没有短时记忆的损失。在有新的单词来临的每一个时间步骤内，存储在记忆单元区块的历史信息被更新用来迭代的学习单词以合理的序列程现。
-
-<center>![LSTM](../../../doc/demo/sentiment_analysis/lstm.png)</center>
-<center>图表 1. LSTM [3]</center>
-
-情感分析是自然语言理解中最典型的问题之一。 它的目的是预测在一个序列中表达的情感态度。 通常, ，仅仅是一些关键词，如形容词和副词，在预测序列或段落的情感中起主要作用。然而有些评论上下文非常长，例如 IMDB的数椐集。 我们只所以使用LSTM来执行这个任务是因为其改进的设计并且具有门机制。 首先，它能够从词级到具有可变上下文长度的上下文级别来总结表示。 第二，它可以在句子级别利用可扩展的上下文, 而大多数方法只是利用n-gram级别的知识。第三，它直接学习段落表示，而不是组合上下文级别信息。
-
-在本演示中，我们提供两个网络，即双向LSTM和三层堆叠LSTM。
-
-#### 双向LSTM
-
-图2是双向LSTM网络，后面连全连接层和softmax层。
-
-<center>![BiLSTM](../../../doc/demo/sentiment_analysis/bi_lstm.jpg)</center>
-<center>图 2. Bidirectional-LSTM </center>
-
-#### Stacked-LSTM
-图3是三层LSTM结构。图的底部是word embedding(对文档处理后形成的单词向量)。 接下来，连接三个LSTM隐藏层，并且第二个是反向LSTM。然后提取隐藏LSTM层的所有时间步长的最大词向量作为整个序列的表示。 最后，使用具有softmax激活的全连接前馈层来执行分类任务。 更多内容可查看参考文献 [5]。
-
-<center>![StackedLSTM](../../../doc/demo/sentiment_analysis/stacked_lstm.jpg)</center>
-<center>图 3. Stacked-LSTM for sentiment analysis </center>
-
-**配置**
-
-进入`demo/sentiment` 目录 , `trainer_config.py` 是一个配置文件的例子, 其中包含算法和网络配置。第一行从`sentiment_net.py`中导出预定义的网络。
-
-trainer_config.py:
-
-```python
-from sentiment_net import *
-
-data_dir  = "./data/pre-imdb"
-# whether this config is used for test
-is_test = get_config_arg('is_test', bool, False)
-# whether this config is used for prediction
-is_predict = get_config_arg('is_predict', bool, False)
-dict_dim, class_dim = sentiment_data(data_dir, is_test, is_predict)
-
-################## Algorithm Config #####################
-
-settings(
-  batch_size=128,
-  learning_rate=2e-3,
-  learning_method=AdamOptimizer(),
-  regularization=L2Regularization(8e-4),
-  gradient_clipping_threshold=25
-)
-
-#################### Network Config ######################
-stacked_lstm_net(dict_dim, class_dim=class_dim,
-                 stacked_num=3, is_predict=is_predict)
-#bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict)
-```
-
-* **数椐定义**:
-   * get\_config\_arg(): 获取通过 `--config_args=xx` 设置的命令行参数。
-   * 定义训练数椐和测试数椐提供者, 这里使用了PaddlePaddle的Python接口来加载数椐。想了解更多细节可以参考PyDataProvider部分的文档
-
-* **算法配置**:
-   * 使用随机梯度下降（sgd）算法。
-   * 使用 adam 优化。
-   * 设置batch size大小为128。
-   * 设置平均sgd窗口。
-   * 设置全局学习率。
-* **网络配置**:
-   * dict_dim: 获取字典维度。
-   * class_dim: 设置类别数，IMDB有两个标签，即正面评价标签和负面评价标签。
-   * `stacked_lstm_net`: 预定义网络如图3所示，默认情况下使用此网络
-   * `bidirectional_lstm_net`: 预定义网络，如图2所示。
-
-**训练**
-
-首先安装PaddlePaddle。 然后使用下面的脚本 `train.sh` 来开启本地的训练。
-
-```
-cd demo/sentiment/
-./train.sh
-```
-
-train.sh:
-
-```
-config=trainer_config.py
-output=./model_output
-paddle train --config=$config \
-             --save_dir=$output \
-             --job=train \
-             --use_gpu=false \
-             --trainer_count=4 \
-             --num_passes=10 \
-             --log_period=20 \
-             --dot_period=20 \
-             --show_parameter_stats_period=100 \
-             --test_all_data_in_one_period=1 \
-             2>&1 | tee 'train.log'
-```
-
-* \--config=$config: 设置网络配置。
-* \--save\_dir=$output: 设置输出路径以保存训练完成的模型。
-* \--job=train: 设置工作模式为训练。
-* \--use\_gpu=false: 使用CPU训练，如果你安装GPU版本的PaddlePaddle，并想使用GPU来训练设置为true。
-* \--trainer\_count=4:设置线程数（或GPU个数）。
-* \--num\_passes=15: 设置pass，PaddlePaddle中的一个pass意味着对数据集中的所有样本进行一次训练。
-* \--log\_period=20: 每20个batch打印一次日志。
-* \--show\_parameter\_stats\_period=100: 每100个batch打印一次统计信息。
-* \--test\_all_data\_in\_one\_period=1: 每次测试都测试所有数据。
-
-如果运行成功，输出日志保存在路径 `demo/sentiment/train.log`中，模型保存在目录`demo/sentiment/model_output/`中。  输出日志说明如下：
-
-```
-Batch=20 samples=2560 AvgCost=0.681644 CurrentCost=0.681644 Eval: classification_error_evaluator=0.36875  CurrentEval: classification_error_evaluator=0.36875
-...
-Pass=0 Batch=196 samples=25000 AvgCost=0.418964 Eval: classification_error_evaluator=0.1922
-Test samples=24999 cost=0.39297 Eval: classification_error_evaluator=0.149406
-```
-- Batch=xx: 表示训练了xx个Batch。
-- samples=xx: 表示训练了xx个样本。。
-- AvgCost=xx: 从第0个batch到当前batch的平均损失。
-- CurrentCost=xx: 最新log_period个batch处理的当前损失。
-- Eval: classification\_error\_evaluator=xx: 表示第0个batch到当前batch的分类错误。
-- CurrentEval: classification\_error\_evaluator: 最新log_period个batch的分类错误。
-- Pass=0: 通过所有训练集一次称为一遍。 0表示第一次经过训练集。
-
-默认情况下，我们使用`stacked_lstm_net`网络，当传递相同的样本数时，它的收敛速度比`bidirectional_lstm_net`快。如果要使用双向LSTM，只需删除最后一行中的注释并把“stacked_lstm_net”注释掉。
-
-## 测试模型
-
-测试模型是指使用训练出的模型评估已标记的验证集。
-
-```
-cd demo/sentiment
-./test.sh
-```
-
-test.sh:
-
-```bash
-function get_best_pass() {
-  cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | \
-  sed  -r 'N;s/Test.* error=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \
-  sort | head -n 1
-}
-
-log=train.log
-LOG=`get_best_pass $log`
-LOG=(${LOG})
-evaluate_pass="model_output/pass-${LOG[1]}"
-
-echo 'evaluating from pass '$evaluate_pass
-
-model_list=./model.list
-touch $model_list | echo $evaluate_pass > $model_list
-net_conf=trainer_config.py
-paddle train --config=$net_conf \
-             --model_list=$model_list \
-             --job=test \
-             --use_gpu=false \
-             --trainer_count=4 \
-             --config_args=is_test=1 \
-             2>&1 | tee 'test.log'
-```
-
-函数`get_best_pass`依据分类错误率获得最佳模型进行测试。 在本示例中，我们默认使用IMDB的测试数据集作为验证。 与训练不同，它需要在这里指定`--job = test`和模型路径，即`--model_list = $model_list`。如果运行成功，日志将保存在“demo / sentiment / test.log”的路径中。例如，在我们的测试中，最好的模型是`model_output / pass-00002`，分类误差是0.115645，如下：
-
-```
-Pass=0 samples=24999 AvgCost=0.280471 Eval: classification_error_evaluator=0.115645
-```
-
-## 预测
-
-`predict.py`脚本提供了一个预测接口。在使用它之前请安装PaddlePaddle的python api。 预测IMDB的未标记评论的一个实例如下：
-
-```
-cd demo/sentiment
-./predict.sh
-```
-predict.sh:
-
-```
-#Note the default model is pass-00002, you shold make sure the model path
-#exists or change the mode path.
-model=model_output/pass-00002/
-config=trainer_config.py
-label=data/pre-imdb/labels.list
-python predict.py \
-     -n $config\
-     -w $model \
-     -b $label \
-     -d data/pre-imdb/dict.txt \
-     -i data/aclImdb/test/pos/10007_10.txt
-```
-
-* `predict.py`: 预测接口脚本。
-*  -n $config : 设置网络配置。
-*  -w $model: 设置模型路径。
-*  -b $label: 设置标签类别字典，这个字典是整数标签和字符串标签的一个对应。
-*  -d data/pre-imdb/dict.txt: 设置字典文件。
-*  -i data/aclImdb/test/pos/10014_7.txt: 设置一个要预测的示例文件。
-
-注意应该确保默认模型路径`model_output / pass-00002`存在或更改为其它模型路径。
-
-本示例的预测结果：
-
-```
-Loading parameters from model_output/pass-00002/
-./data/aclImdb/test/pos/10014_7.txt: predicting label is pos
-```
-我们真诚地感谢您的关注，并欢迎您来参与贡献。
-
-## 参考文档
-[1] Brendan O'Connor, Ramnath Balasubramanyan, Bryan R. Routledge, and Noah A. Smith. 2010. [From Tweets to Polls: Linking Text Sentiment to Public Opinion Time Series](http://homes.cs.washington.edu/~nasmith/papers/oconnor+balasubramanyan+routledge+smith.icwsm10.pdf). In ICWSM-2010. <br>
-[2] Johan Bollen, Huina Mao, Xiaojun Zeng. 2011. [Twitter mood predicts the stock market](http://arxiv.org/abs/1010.3003), Journal of Computational Science.<br>
-[3] Alex Graves, Marcus Liwicki, Santiago Fernan- dez, Roman Bertolami, Horst Bunke, and Ju ̈rgen Schmidhuber. 2009. [A novel connectionist system for unconstrained handwriting recognition. IEEE Transactions on Pattern Analysis and Machine In- telligence](http://www.cs.toronto.edu/~graves/tpami_2009.pdf), 31(5):855–868.<br>
-[4] Zachary C. Lipton, [A Critical Review of Recurrent Neural Networks for Sequence Learning](http://arxiv.org/abs/1506.00019v1), arXiv:1506.00019. <br>
-[5] Jie Zhou and Wei Xu; [End-to-end Learning of Semantic Role Labeling Using Recurrent Neural Networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf); ACL-IJCNLP 2015. <br>
+# 情感分析教程
+
+情感分析有许多应用场景。 一个基本的应用场景是区分给定文本的褒贬两极性，给定的文本可以是一个文档、句子、或者是一个小的文本片段。 一个简单的例子如：把用户在购物网站、旅游网站、团购网站（亚马逊、天猫、淘宝等）上发表的评论分成正面评论和负面评论两类。
+
+情感分析也常用于基于大量评论和个人博客来监控社会媒体。 例如，研究人员分析了几个关于消费者信心和政治观点的调查，结果发现它们与同时期的Twitter消息中的情绪词频率相关 [1]。 另一个例子是通过分析每日Twitter博客的文本内容来预测股票变动 [2]。
+
+另一方面，抓取产品的用户评论并分析他们的情感，有助于理解用户对不同公司，不同产品，甚至不同竞争对手产品的偏好。
+
+本教程将指导您完成长期短期记忆（LSTM）网络的训练过程，以分类来自[大型电影评论数据集](http://ai.stanford.edu/~amaas/data/sentiment/)（有时称为[互联网电影数据库 (IMDB)](http://ai.stanford.edu/~amaas/papers/wvSent_acl2011.pdf)）的句子的情感 。 此数据集包含电影评论及其相关联的类别标签，即正面和负面。
+
+## 数椐准备
+
+### IMDB 数椐介绍
+
+训练模型之前, 我们需要预处理数椐并构建一个字典。 首先, 你可以使用下面的脚本下载 IMDB 数椐集和[Moses](http://www.statmt.org/moses/)工具, 这是一个基于统计的机器翻译系统. 我们提供了一个数据预处理脚本，它不仅能够处理IMDB数据，还能处理其他用户自定义的数据。 为了使用提前编写的脚本，需要将标记的训练和测试样本移动到另一个路径，这已经在`get_imdb.sh`中完成。
+
+```
+cd demo/sentiment/data
+./get_imdb.sh
+```
+如果数椐获取成功，你将在目录```./demo/sentiment/data```中看到下面的文件：
+
+```
+aclImdb  get_imdb.sh  imdb  mosesdecoder-master
+```
+
+* aclImdb: 从外部网站上下载的原始数椐集。
+* imdb: 仅包含训练和测试数椐集。
+* mosesdecoder-master: Moses 工具。
+
+IMDB数据集包含25,000个已标注过的高极性电影评论用于训练，25,000个用于测试。负面的评论的得分小于等于4,正面的评论的得大于等于7，总评分10分。 运行完脚本 `./get_imdb.sh`后, 我们可以看到在目录 `aclImdb`中的数椐集的结构如下：
+
+```
+imdbEr.txt  imdb.vocab  README  test  train
+```
+* train: 训练数椐集。
+* test : 测试数椐集。
+* imdb.vocab: 字典文件。
+* imdbEr.txt: 字典imdb.vocab中每个切分单词的预期评级。
+* README: 数椐说明文档。
+
+测试集和训练集目录包含下面的文件:
+
+```
+labeledBow.feat  neg  pos  unsup  unsupBow.feat  urls_neg.txt  urls_pos.txt  urls_unsup.txt
+```
+
+* pos: 正面评价样本，包含12,500个txt文件，每个文件是一个电影评论。
+* neg: 负面评价样本，包含12,500个txt文件，每个文件是一个电影评论。
+* unsup: 未标记的评价样本，包含50,000个txt文件。
+* urls_xx.txt: 每个评论的网址。
+* xxBow.feat: 用于统计词频的Bow模型特征。
+
+### IMDB 数椐准备
+
+在这个例子中，我们只使用已经标注过的训练集和测试集，且默认在训练集上构建字典，而不使用IMDB数椐集中的imdb.vocab做为字典。训练集已经做了随机打乱排序而测试集没有。 Moses 工具中的脚本`tokenizer.perl` 用于切分单单词和标点符号。执行下面的命令就可以预处理数椐。
+
+```
+cd demo/sentiment/
+./preprocess.sh
+```
+preprocess.sh:
+
+```
+data_dir="./data/imdb"
+python preprocess.py -i data_dir
+```
+
+* data_dir: 输入数椐所在目录。
+* preprocess.py: 预处理脚本。
+
+运行成功后目录`demo/sentiment/data/pre-imdb` 结构如下:
+
+```
+dict.txt  labels.list  test.list  test_part_000  train.list  train_part_000
+```
+* test\_part\_000 and train\_part\_000: 所有标记的测试集和训练集， 训练集已经随机打乱。
+* train.list and test.list: 训练集和测试集文件列表。
+* dict.txt: 利用训练集生成的字典。
+* labels.txt: neg  0, pos 1, 含义：标签0表示负面的评论，标签1表示正面的评论。
+
+### 用户自定义数椐预处理
+
+如果你执行其它的用情感分析来分类文本的任务，可以按如下的结构来准备数椐. 我们提供了脚本来构建字典和预处理数椐。所以你只用按下面的结构来组织数椐就行了。
+
+```
+dataset
+|----train
+|    |----class1
+|    |    |----text_files
+|    |----class2
+|    |    |----text_files
+|    |    ...
+|----test
+|    |----class1
+|    |    |----text_files
+|    |----class2
+|    |    |----text_files
+|    |    ...
+```
+* dataset: 一级目录。
+* train, test: 二级目录。
+* class1,class2,...: 三级目录。
+* text_files: 文本格式的实例文件。
+
+所有同目录下的文本实例文件都是同级别的。 每个文本文件包含一个或者多个实例，每一行表示一个实例。 为了充分的随机打乱训练集, 在预处理含有多行数椐的文本文件时参数设置稍有不同, 执行`preprocess.sh`脚本时需要加上`-m True`参数。 tokenizer.perl 默认用来切分单记和标点符号，如果你不需要这个操作，在运行`preprocess.sh`时加上`-t False`参数即可。
+
+## 训练模型
+
+在这步任务中,我们使用了循环神经网络（RNN）的 LSTM 架构来训练情感分析模型。 引入LSTM模型主要是为了克服消失梯度的问题。 LSTM网络类似于具有隐藏层的标准循环神经网络, 但是隐藏层中的每个普通节点被一个记忆单元替换。 每个记忆单元包含四个主要的元素: 输入门, 具有自循环连接的神经元，忘记门和输出门。 更多的细节可以在文献中找到[4]。 LSTM架构的最大优点是它可以在长时间间隔内记忆信息，而没有短时记忆的损失。在有新的单词来临的每一个时间步骤内，存储在记忆单元区块的历史信息被更新用来迭代的学习单词以合理的序列程现。
+
+<center>![LSTM](src/lstm.png)</center>
+<center>图表 1. LSTM [3]</center>
+
+情感分析是自然语言理解中最典型的问题之一。 它的目的是预测在一个序列中表达的情感态度。 通常, ，仅仅是一些关键词，如形容词和副词，在预测序列或段落的情感中起主要作用。然而有些评论上下文非常长，例如 IMDB的数椐集。 我们只所以使用LSTM来执行这个任务是因为其改进的设计并且具有门机制。 首先，它能够从词级到具有可变上下文长度的上下文级别来总结表示。 第二，它可以在句子级别利用可扩展的上下文, 而大多数方法只是利用n-gram级别的知识。第三，它直接学习段落表示，而不是组合上下文级别信息。
+
+在本演示中，我们提供两个网络，即双向LSTM和三层堆叠LSTM。
+
+#### 双向LSTM
+
+图2是双向LSTM网络，后面连全连接层和softmax层。
+
+<center>![BiLSTM](src/bi_lstm.jpg)</center>
+<center>图 2. Bidirectional-LSTM </center>
+
+#### Stacked-LSTM
+图3是三层LSTM结构。图的底部是word embedding(对文档处理后形成的单词向量)。 接下来，连接三个LSTM隐藏层，并且第二个是反向LSTM。然后提取隐藏LSTM层的所有时间步长的最大词向量作为整个序列的表示。 最后，使用具有softmax激活的全连接前馈层来执行分类任务。 更多内容可查看参考文献 [5]。
+
+<center>![StackedLSTM](src/stacked_lstm.jpg)</center>
+<center>图 3. Stacked-LSTM for sentiment analysis </center>
+
+**配置**
+
+进入`demo/sentiment` 目录 , `trainer_config.py` 是一个配置文件的例子, 其中包含算法和网络配置。第一行从`sentiment_net.py`中导出预定义的网络。
+
+trainer_config.py:
+
+```python
+from sentiment_net import *
+
+data_dir  = "./data/pre-imdb"
+# whether this config is used for test
+is_test = get_config_arg('is_test', bool, False)
+# whether this config is used for prediction
+is_predict = get_config_arg('is_predict', bool, False)
+dict_dim, class_dim = sentiment_data(data_dir, is_test, is_predict)
+
+################## Algorithm Config #####################
+
+settings(
+  batch_size=128,
+  learning_rate=2e-3,
+  learning_method=AdamOptimizer(),
+  regularization=L2Regularization(8e-4),
+  gradient_clipping_threshold=25
+)
+
+#################### Network Config ######################
+stacked_lstm_net(dict_dim, class_dim=class_dim,
+                 stacked_num=3, is_predict=is_predict)
+#bidirectional_lstm_net(dict_dim, class_dim=class_dim, is_predict=is_predict)
+```
+
+* **数椐定义**:
+   * get\_config\_arg(): 获取通过 `--config_args=xx` 设置的命令行参数。
+   * 定义训练数椐和测试数椐提供者, 这里使用了PaddlePaddle的Python接口来加载数椐。想了解更多细节可以参考PyDataProvider部分的文档
+
+* **算法配置**:
+   * 使用随机梯度下降（sgd）算法。
+   * 使用 adam 优化。
+   * 设置batch size大小为128。
+   * 设置平均sgd窗口。
+   * 设置全局学习率。
+* **网络配置**:
+   * dict_dim: 获取字典维度。
+   * class_dim: 设置类别数，IMDB有两个标签，即正面评价标签和负面评价标签。
+   * `stacked_lstm_net`: 预定义网络如图3所示，默认情况下使用此网络
+   * `bidirectional_lstm_net`: 预定义网络，如图2所示。
+
+**训练**
+
+首先安装PaddlePaddle。 然后使用下面的脚本 `train.sh` 来开启本地的训练。
+
+```
+cd demo/sentiment/
+./train.sh
+```
+
+train.sh:
+
+```
+config=trainer_config.py
+output=./model_output
+paddle train --config=$config \
+             --save_dir=$output \
+             --job=train \
+             --use_gpu=false \
+             --trainer_count=4 \
+             --num_passes=10 \
+             --log_period=20 \
+             --dot_period=20 \
+             --show_parameter_stats_period=100 \
+             --test_all_data_in_one_period=1 \
+             2>&1 | tee 'train.log'
+```
+
+* \--config=$config: 设置网络配置。
+* \--save\_dir=$output: 设置输出路径以保存训练完成的模型。
+* \--job=train: 设置工作模式为训练。
+* \--use\_gpu=false: 使用CPU训练，如果你安装GPU版本的PaddlePaddle，并想使用GPU来训练设置为true。
+* \--trainer\_count=4:设置线程数（或GPU个数）。
+* \--num\_passes=15: 设置pass，PaddlePaddle中的一个pass意味着对数据集中的所有样本进行一次训练。
+* \--log\_period=20: 每20个batch打印一次日志。
+* \--show\_parameter\_stats\_period=100: 每100个batch打印一次统计信息。
+* \--test\_all_data\_in\_one\_period=1: 每次测试都测试所有数据。
+
+如果运行成功，输出日志保存在路径 `demo/sentiment/train.log`中，模型保存在目录`demo/sentiment/model_output/`中。  输出日志说明如下：
+
+```
+Batch=20 samples=2560 AvgCost=0.681644 CurrentCost=0.681644 Eval: classification_error_evaluator=0.36875  CurrentEval: classification_error_evaluator=0.36875
+...
+Pass=0 Batch=196 samples=25000 AvgCost=0.418964 Eval: classification_error_evaluator=0.1922
+Test samples=24999 cost=0.39297 Eval: classification_error_evaluator=0.149406
+```
+- Batch=xx: 表示训练了xx个Batch。
+- samples=xx: 表示训练了xx个样本。。
+- AvgCost=xx: 从第0个batch到当前batch的平均损失。
+- CurrentCost=xx: 最新log_period个batch处理的当前损失。
+- Eval: classification\_error\_evaluator=xx: 表示第0个batch到当前batch的分类错误。
+- CurrentEval: classification\_error\_evaluator: 最新log_period个batch的分类错误。
+- Pass=0: 通过所有训练集一次称为一遍。 0表示第一次经过训练集。
+
+默认情况下，我们使用`stacked_lstm_net`网络，当传递相同的样本数时，它的收敛速度比`bidirectional_lstm_net`快。如果要使用双向LSTM，只需删除最后一行中的注释并把“stacked_lstm_net”注释掉。
+
+## 测试模型
+
+测试模型是指使用训练出的模型评估已标记的验证集。
+
+```
+cd demo/sentiment
+./test.sh
+```
+
+test.sh:
+
+```bash
+function get_best_pass() {
+  cat $1  | grep -Pzo 'Test .*\n.*pass-.*' | \
+  sed  -r 'N;s/Test.* error=([0-9]+\.[0-9]+).*\n.*pass-([0-9]+)/\1 \2/g' | \
+  sort | head -n 1
+}
+
+log=train.log
+LOG=`get_best_pass $log`
+LOG=(${LOG})
+evaluate_pass="model_output/pass-${LOG[1]}"
+
+echo 'evaluating from pass '$evaluate_pass
+
+model_list=./model.list
+touch $model_list | echo $evaluate_pass > $model_list
+net_conf=trainer_config.py
+paddle train --config=$net_conf \
+             --model_list=$model_list \
+             --job=test \
+             --use_gpu=false \
+             --trainer_count=4 \
+             --config_args=is_test=1 \
+             2>&1 | tee 'test.log'
+```
+
+函数`get_best_pass`依据分类错误率获得最佳模型进行测试。 在本示例中，我们默认使用IMDB的测试数据集作为验证。 与训练不同，它需要在这里指定`--job = test`和模型路径，即`--model_list = $model_list`。如果运行成功，日志将保存在“demo / sentiment / test.log”的路径中。例如，在我们的测试中，最好的模型是`model_output / pass-00002`，分类误差是0.115645，如下：
+
+```
+Pass=0 samples=24999 AvgCost=0.280471 Eval: classification_error_evaluator=0.115645
+```
+
+## 预测
+
+`predict.py`脚本提供了一个预测接口。在使用它之前请安装PaddlePaddle的python api。 预测IMDB的未标记评论的一个实例如下：
+
+```
+cd demo/sentiment
+./predict.sh
+```
+predict.sh:
+
+```
+#Note the default model is pass-00002, you shold make sure the model path
+#exists or change the mode path.
+model=model_output/pass-00002/
+config=trainer_config.py
+label=data/pre-imdb/labels.list
+cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \
+     --tconf=$config\
+     --model=$model \
+     --label=$label \
+     --dict=./data/pre-imdb/dict.txt \
+     --batch_size=1
+```
+
+* `cat ./data/aclImdb/test/pos/10007_10.txt` : 输入预测样本。
+* `predict.py` : 预测接口脚本。
+* `--tconf=$config` : 设置网络配置。
+* `--model=$model` : 设置模型路径。
+* `--label=$label` : 设置标签类别字典，这个字典是整数标签和字符串标签的一个对应。
+* `--dict=data/pre-imdb/dict.txt` : 设置字典文件。
+* `--batch_size=1` : 设置batch size。
+
+注意应该确保默认模型路径`model_output / pass-00002`存在或更改为其它模型路径。
+
+本示例的预测结果：
+
+```
+Loading parameters from model_output/pass-00002/
+./data/aclImdb/test/pos/10014_7.txt: predicting label is pos
+```
+我们真诚地感谢您的关注，并欢迎您来参与贡献。
+
+## 参考文档
+[1] Brendan O'Connor, Ramnath Balasubramanyan, Bryan R. Routledge, and Noah A. Smith. 2010. [From Tweets to Polls: Linking Text Sentiment to Public Opinion Time Series](http://homes.cs.washington.edu/~nasmith/papers/oconnor+balasubramanyan+routledge+smith.icwsm10.pdf). In ICWSM-2010. <br>
+[2] Johan Bollen, Huina Mao, Xiaojun Zeng. 2011. [Twitter mood predicts the stock market](http://arxiv.org/abs/1010.3003), Journal of Computational Science.<br>
+[3] Alex Graves, Marcus Liwicki, Santiago Fernan- dez, Roman Bertolami, Horst Bunke, and Ju ̈rgen Schmidhuber. 2009. [A novel connectionist system for unconstrained handwriting recognition. IEEE Transactions on Pattern Analysis and Machine In- telligence](http://www.cs.toronto.edu/~graves/tpami_2009.pdf), 31(5):855–868.<br>
+[4] Zachary C. Lipton, [A Critical Review of Recurrent Neural Networks for Sequence Learning](http://arxiv.org/abs/1506.00019v1), arXiv:1506.00019. <br>
+[5] Jie Zhou and Wei Xu; [End-to-end Learning of Semantic Role Labeling Using Recurrent Neural Networks](http://www.aclweb.org/anthology/P/P15/P15-1109.pdf); ACL-IJCNLP 2015. <br>
diff --git a/doc/tutorials/sentiment_analysis/index_en.md b/doc/tutorials/sentiment_analysis/index_en.md
index c53952c544de9fa88a6318432e34b0d05b149445..bb7681db44ca6f286ad6935ddfecb9becb429192 100644
--- a/doc/tutorials/sentiment_analysis/index_en.md
+++ b/doc/tutorials/sentiment_analysis/index_en.md
@@ -293,20 +293,21 @@ predict.sh:
 model=model_output/pass-00002/
 config=trainer_config.py
 label=data/pre-imdb/labels.list
-python predict.py \
-     -n $config\
-     -w $model \
-     -b $label \
-     -d data/pre-imdb/dict.txt \
-     -i data/aclImdb/test/pos/10007_10.txt
-```
-
-* `predict.py`: predicting interface.
-*  -n $config : set network configure.
-*  -w $model: set model path.
-*  -b $label: set dictionary about corresponding relation between integer label and string label.
-*  -d data/pre-imdb/dict.txt: set dictionary.
-*  -i data/aclImdb/test/pos/10014_7.txt: set one example file to predict.
+cat ./data/aclImdb/test/pos/10007_10.txt | python predict.py \
+     --tconf=$config\
+     --model=$model \
+     --label=$label \
+     --dict=./data/pre-imdb/dict.txt \
+     --batch_size=1
+```
+
+* `cat ./data/aclImdb/test/pos/10007_10.txt` : the input sample.
+* `predict.py` : predicting interface.
+* `--tconf=$config` : set network configure.
+* ` --model=$model` : set model path.
+* `--label=$label` : set dictionary about corresponding relation between integer label and string label.
+* `--dict=data/pre-imdb/dict.txt` : set dictionary.
+* `--batch_size=1` : set batch size.
 
 Note you should make sure the default model path `model_output/pass-00002`
 exists or change the model path.
diff --git a/doc/tutorials/sentiment_analysis/src/bi_lstm.jpg b/doc/tutorials/sentiment_analysis/src/bi_lstm.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..adec1606d64d6e35ffe7e62abfa9a09309b05c84
Binary files /dev/null and b/doc/tutorials/sentiment_analysis/src/bi_lstm.jpg differ
diff --git a/doc/tutorials/sentiment_analysis/src/lstm.png b/doc/tutorials/sentiment_analysis/src/lstm.png
new file mode 100644
index 0000000000000000000000000000000000000000..aaf1fc690da2ffb8418cde5ed81848ddb5263030
Binary files /dev/null and b/doc/tutorials/sentiment_analysis/src/lstm.png differ
diff --git a/doc/tutorials/sentiment_analysis/src/stacked_lstm.jpg b/doc/tutorials/sentiment_analysis/src/stacked_lstm.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..4239055050966e0095e188a8c81d860711bce29d
Binary files /dev/null and b/doc/tutorials/sentiment_analysis/src/stacked_lstm.jpg differ
diff --git a/doc_cn/CMakeLists.txt b/doc_cn/CMakeLists.txt
deleted file mode 100644
index 314b34525ca1d328f4e3b9814ee26deed39d89fd..0000000000000000000000000000000000000000
--- a/doc_cn/CMakeLists.txt
+++ /dev/null
@@ -1,31 +0,0 @@
-if(NOT DEFINED SPHINX_THEME)
-    set(SPHINX_THEME default)
-endif()
-
-if(NOT DEFINED SPHINX_THEME_DIR)
-    set(SPHINX_THEME_DIR)
-endif()
-
-# configured documentation tools and intermediate build results
-set(BINARY_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/_build")
-
-# Sphinx cache with pickled ReST documents
-set(SPHINX_CACHE_DIR "${CMAKE_CURRENT_BINARY_DIR}/_doctrees")
-
-# HTML output directory
-set(SPHINX_HTML_DIR "${CMAKE_CURRENT_BINARY_DIR}/html")
-
-configure_file(
-    "${CMAKE_CURRENT_SOURCE_DIR}/conf.py.in"
-    "${BINARY_BUILD_DIR}/conf.py"
-    @ONLY)
-
-sphinx_add_target(paddle_docs_cn
-                  html
-                  ${BINARY_BUILD_DIR}
-                  ${SPHINX_CACHE_DIR}
-                  ${CMAKE_CURRENT_SOURCE_DIR}
-                  ${SPHINX_HTML_DIR})
-
-add_dependencies(paddle_docs_cn
-  gen_proto_py)
diff --git a/doc_cn/algorithm/rnn/hrnn_demo.rst b/doc_cn/algorithm/rnn/hrnn_demo.rst
deleted file mode 100644
index 96396ff105d134920396ded9ad8f00494357a37c..0000000000000000000000000000000000000000
--- a/doc_cn/algorithm/rnn/hrnn_demo.rst
+++ /dev/null
@@ -1,7 +0,0 @@
-..	_algo_hrnn_demo:
-
-#################
-双层RNN的使用示例
-#################
-
-TBD
\ No newline at end of file
diff --git a/doc_cn/build_and_install/cmake/index.rst b/doc_cn/build_and_install/cmake/index.rst
deleted file mode 100644
index e2a12c500177ea5b075416380796ab82e1217f60..0000000000000000000000000000000000000000
--- a/doc_cn/build_and_install/cmake/index.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-使用cmake编译PaddlePaddle
-=========================
-
-..  toctree::
-    
-    install_deps.rst
-    compile_options.rst
-    make_and_install.rst
diff --git a/doc_cn/build_and_install/cmake/install_deps.rst b/doc_cn/build_and_install/cmake/install_deps.rst
deleted file mode 100644
index 7fa4665a954bd41e74145c4a1b00734c3ac41d83..0000000000000000000000000000000000000000
--- a/doc_cn/build_and_install/cmake/install_deps.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-安装编译PaddlePaddle需要的依赖
-==============================
-
-参见 `安装编译依赖 <../../../doc/build/build_from_source.html#install-dependencies>`_
diff --git a/doc_cn/build_and_install/cmake/make_and_install.rst b/doc_cn/build_and_install/cmake/make_and_install.rst
deleted file mode 100644
index 212b9c9352b01db5215221a6c2faafe0d679d962..0000000000000000000000000000000000000000
--- a/doc_cn/build_and_install/cmake/make_and_install.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-make和make install
-==================
-
-参见 `make和make install <../../../doc/build/build_from_source.html#build-and-install>`_
diff --git a/doc_cn/build_and_install/install/paddle_ssh.Dockerfile b/doc_cn/build_and_install/install/paddle_ssh.Dockerfile
deleted file mode 100644
index 7cb947bddf4593259cb69f525b44015836291605..0000000000000000000000000000000000000000
--- a/doc_cn/build_and_install/install/paddle_ssh.Dockerfile
+++ /dev/null
@@ -1,15 +0,0 @@
-FROM paddledev/paddle:cpu-latest
-
-MAINTAINER PaddlePaddle dev team <paddle-dev@baidu.com>
-
-RUN apt-get update
-RUN apt-get install -y openssh-server
-RUN mkdir /var/run/sshd
-RUN echo 'root:root' | chpasswd
-
-RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
-RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
-
-EXPOSE 22
-
-CMD    ["/usr/sbin/sshd", "-D"]
diff --git a/doc_cn/build_and_install/install/paddle_version.txt b/doc_cn/build_and_install/install/paddle_version.txt
deleted file mode 100644
index a80873303fd0d05d963482629000d76260185ef6..0000000000000000000000000000000000000000
--- a/doc_cn/build_and_install/install/paddle_version.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-PaddlePaddle 0.8.0b1, compiled with
-    with_avx: ON
-    with_gpu: OFF
-    with_double: OFF
-    with_python: ON
-    with_rdma: OFF
-    with_glog: ON
-    with_gflags: ON
-    with_metric_learning:
-    with_timer: OFF
-    with_predict_sdk:
diff --git a/doc_cn/cluster/index.rst b/doc_cn/cluster/index.rst
deleted file mode 100644
index 25313a9635bbf567a1aedfac3c379802d601d283..0000000000000000000000000000000000000000
--- a/doc_cn/cluster/index.rst
+++ /dev/null
@@ -1,11 +0,0 @@
-集群训练
-========
-
-* `集群训练 <../../doc/cluster/index.html>`_
-
-.. toctree::
-    :maxdepth: 2
-    :glob:
-
-    集群训练(对内) <internal/index.md>
-
diff --git a/doc_cn/concepts/nn.rst b/doc_cn/concepts/nn.rst
deleted file mode 100644
index f4d2cf490d14761f4b9f6a308180c5e8015cbecb..0000000000000000000000000000000000000000
--- a/doc_cn/concepts/nn.rst
+++ /dev/null
@@ -1,3 +0,0 @@
-TBD
-
-目前正在书写中。敬请期待。
\ No newline at end of file
diff --git a/doc_cn/concepts/program_concepts.rst b/doc_cn/concepts/program_concepts.rst
deleted file mode 100644
index af5bbdac260afce0a032461ab913d05bc2f55929..0000000000000000000000000000000000000000
--- a/doc_cn/concepts/program_concepts.rst
+++ /dev/null
@@ -1,4 +0,0 @@
-TBD
-###
-
-目前正在书写中。敬请期待。
\ No newline at end of file
diff --git a/doc_cn/demo/index.rst b/doc_cn/demo/index.rst
deleted file mode 100644
index e15e839f93d4ac0d455e49fd8b1cde8bf60a29ac..0000000000000000000000000000000000000000
--- a/doc_cn/demo/index.rst
+++ /dev/null
@@ -1,26 +0,0 @@
-使用示例
-========
-
-图像
-''''
-
-* `图像分类 <../../doc/demo/image_classification/index.html>`_
-
-自然语言处理
-''''''''''''
-
-* `情感分析 <sentiment_analysis/index.html>`_
-* `文本生成 <../../doc/demo/text_generation/index.html>`_
-* `词性标注 <../../doc/demo/semantic_role_labeling/index.html>`_
-
-推荐
-''''
-
-* `MovieLens数据集 <../../doc/demo/rec/ml_dataset.html>`_
-* `MovieLens评分回归 <../../doc/demo/rec/ml_regression.html>`_
-
-常用模型
-''''''''
-
-* `ImageNet: ResNet <../../doc/demo/imagenet_model/resnet_model.html>`_
-* `Embedding: Chinese Word <../../doc/demo/embedding_model/index.html>`_
diff --git a/doc_cn/demo/quick_start/index.md b/doc_cn/demo/quick_start/index.md
deleted file mode 100644
index 4a6e07ee1ffd94cf8f781af307b53a96a78e6b93..0000000000000000000000000000000000000000
--- a/doc_cn/demo/quick_start/index.md
+++ /dev/null
@@ -1,543 +0,0 @@
-# PaddlePaddle快速入门教程
-
-我们以文本分类问题作为背景，介绍PaddlePaddle使用流程和常用的网络基础单元的配置方法。
-
-## 安装(Install)
-
-首先请参考<a href = "../../build_and_install/index.html">安装教程</a>安装PaddlePaddle。
-
-## 使用概述(Overview)
-
-**文本分类问题**：对于给定的一条文本， 我们从提前给定的类别集合中选择其所属类
-别。比如通过用户对电子商务网站评论，评估产品的质量：
-
-- 这个显示器很棒！ （好评）
-- 用了两个月之后这个显示器屏幕碎了。（差评）
-
-每一个任务流程都可以分为如下5个基础部分。
-<center> ![](./Pipeline.jpg) </center>
-
-1. 数据格式准备
-    - 每行保存一条样本，类别Id 和文本信息用Tab间隔， 文本中的单词用空格分隔（如果不切词，则字与字之间用空格分隔），例如：```类别Id ‘\t’ 这 个 显 示 器 很 棒 ！```
-2. 数据向模型传送
-    - PaddlePaddle可以读取Python写的传输数据脚本，所有字符都将转换为连续整数表示的Id传给模型
-3. 网络结构（由易到难展示4种不同的网络配置）
-    - 逻辑回归模型
-    - 词向量模型
-    - 卷积模型
-    - 时序模型
-    - 优化算法
-4. 训练模型
-5. 预测
-
-## 数据格式准备(Data Preparation)
-在本问题中，我们使用[Amazon电子产品评论数据](http://jmcauley.ucsd.edu/data/amazon/)，
-将评论分为好评(正样本)和差评(负样本)两类。[源码](https://github.com/PaddlePaddle/Paddle)的`demo/quick_start`里提供了下载已经预处理数据的脚本（如果想从最原始的数据处理，可以使用脚本 `./demo/quick_start/data/proc_from_raw_data/get_data.sh`）。
-
-```bash
-cd demo/quick_start
-./data/get_data.sh
-```
-
-## 数据向模型传送(Transfer Data to Model)
-
-### Python数据加载脚本(Data Provider Script)
-
-下面dataprovider_bow.py文件给出了完整例子，主要包括两部分：
-
-* initalizer： 定义文本信息、类别Id的数据类型。
-* process： yield文本信息和类别Id，和initalizer里定义顺序一致。
-
-```python
-from paddle.trainer.PyDataProvider2 import *
-
-# id of the word not in dictionary
-UNK_IDX = 0
-
-# initializer is called by the framework during initialization.
-# It allows the user to describe the data types and setup the
-# necessary data structure for later use.
-# `settings` is an object. initializer need to properly fill settings.input_types.
-# initializer can also store other data structures needed to be used at process().
-# In this example, dictionary is stored in settings.
-# `dictionay` and `kwargs` are arguments passed from trainer_config.lr.py
-def initializer(settings, dictionary, **kwargs):
-    # Put the word dictionary into settings
-    settings.word_dict = dictionary
-
-    # setting.input_types specifies what the data types the data provider
-    # generates.
-    settings.input_types = [
-        # The first input is a sparse_binary_vector,
-        # which means each dimension of the vector is either 0 or 1. It is the
-        # bag-of-words (BOW) representation of the texts.
-        sparse_binary_vector(len(dictionary)),
-        # The second input is an integer. It represents the category id of the
-        # sample. 2 means there are two labels in the dataset.
-        # (1 for positive and 0 for negative)
-        integer_value(2)]
-
-# Delaring a data provider. It has an initializer 'data_initialzer'.
-# It will cache the generated data of the first pass in memory, so that
-# during later pass, no on-the-fly data generation will be needed.
-# `setting` is the same object used by initializer()
-# `file_name` is the name of a file listed train_list or test_list file given
-# to define_py_data_sources2(). See trainer_config.lr.py.
-@provider(init_hook=initializer, cache=CacheType.CACHE_PASS_IN_MEM)
-def process(settings, file_name):
-    # Open the input data file.
-    with open(file_name, 'r') as f:
-        # Read each line.
-        for line in f:
-            # Each line contains the label and text of the comment, separated by \t.
-            label, comment = line.strip().split('\t')
-
-            # Split the words into a list.
-            words = comment.split()
-
-            # convert the words into a list of ids by looking them up in word_dict.
-            word_vector = [settings.word_dict.get(w, UNK_IDX) for w in words]
-
-            # Return the features for the current comment. The first is a list
-            # of ids representing a 0-1 binary sparse vector of the text,
-            # the second is the integer id of the label.
-            yield word_vector, int(label)
-```
-
-### 配置中的数据加载定义(Data Provider in Configure)
-
-在模型配置中利用`define_py_data_sources2`加载数据：
-
-```python
-from paddle.trainer_config_helpers import *
-
-file = "data/dict.txt"
-word_dict = dict()
-with open(dict_file, 'r') as f:
-    for i, line in enumerate(f):
-        w = line.strip().split()[0]
-        word_dict[w] = i
-# define the data sources for the model.
-# We need to use different process for training and prediction.
-# For training, the input data includes both word IDs and labels.
-# For prediction, the input data only includs word Ids.
-define_py_data_sources2(train_list='data/train.list',
-                        test_list='data/test.list',
-                        module="dataprovider_bow",
-                        obj="process",
-                        args={"dictionary": word_dict})
-```
-* data/train.list,data/test.list: 指定训练、测试数据
-* module="dataprovider": 数据处理Python文件名
-* obj="process": 指定生成数据的函数
-* args={"dictionary": word_dict}: 额外的参数，这里指定词典
-
-更详细数据格式和用例请参考<a href = "../../ui/data_provider/pydataprovider2.html">
-PyDataProvider2</a>。
-
-## 网络结构(Network Architecture)
-本节我们将专注于网络结构的介绍。
-<center> ![](./PipelineNetwork.jpg) </center>
-
-我们将以基本的逻辑回归网络作为起点，并逐渐展示更加深入的功能。更详细的网络配置
-连接请参考<a href = "../../../doc/layer.html">Layer文档</a>。
-所有配置在[源码](https://github.com/PaddlePaddle/Paddle)`demo/quick_start`目录，首先列举逻辑回归网络。
-
-### 逻辑回归模型(Logistic Regression)
-
-流程如下：
-<center> ![](./NetLR.jpg) </center>
-
-- 获取利用one-hot vector表示的每个单词，维度是词典大小
-
-```python
-word = data_layer(name="word",  size=word_dim)
-```
-
-- 获取该条样本类别Id，维度是类别个数。
-
-```python
-label = data_layer(name="label", size=label_dim)
-```
-
-- 利用逻辑回归模型对该向量进行分类，同时会计算分类准确率
-
-```python
-# Define a fully connected layer with logistic activation (also called softmax activation).
-output = fc_layer(input=word,
-                  size=label_dim,
-                  act_type=SoftmaxActivation())
-# Define cross-entropy classification loss and error.
-classification_cost(input=output, label=label)
-```
-
- - input: 除过data层，每个层都有一个或多个input,多个input以list方式输入
- - size: 该层神经元个数
- - act_type: 激活函数类型
-
-效果总结：我们将在后面介绍训练和预测的流程的脚本。在此为方便对比不同网络结构，
-我们随时总结了各个网络的复杂度和效果。
-
-<html>
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-
-<thead>
-<th scope="col" class="left">网络名称</th>
-<th scope="col" class="left">参数数量</th>
-<th scope="col" class="left">错误率</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">逻辑回归</td>
-<td class="left">252 KB</td>
-<td class="left">8.652%</td>
-</tr>
-
-</tbody>
-</table></center>
-</html>
-<br>
-
-### 词向量模型(Word Vector)
-
-embedding模型需要稍微改变数据提供的脚本，即`dataprovider_emb.py`，词向量模型、
-卷积模型、时序模型均使用该脚本。其中文本输入类型定义为整数时序类型integer_value_sequence。
-
-```
-def initializer(settings, dictionary, **kwargs):
-    settings.word_dict = dictionary
-    settings.input_types = [
-        # Define the type of the first input as sequence of integer.
-        # The value of the integers range from 0 to len(dictrionary)-1
-        integer_value_sequence(len(dictionary)),
-        # Define the second input for label id
-        integer_value(2)]
-
-@provider(init_hook=initializer)
-def process(settings, file_name):
-    ...
-    # omitted, it is same as the data provider for LR model
-```
-
-该模型依然是使用逻辑回归分类网络的框架， 只是将句子利用连续向量表示替换稀疏
-向量表示， 即对第3步进行替换。句子表示的计算更新为2步：
-<center> ![](./NetContinuous.jpg) </center>
-
-- 利用单词Id查找对应的该单词的连续表示向量(维度为word_dim)， 输入N个单词，输出为N个word_dim维度向量
-
-```python
-emb = embedding_layer(input=word, size=word_dim)
-```
-
-- 将该句话包含的所有单词向量求平均得到句子的表示
-
-```python
-avg = pooling_layer(input=emb, pooling_type=AvgPooling())
-```
-
-其它部分和逻辑回归网络结构一致。
-效果总结：
-
-<html>
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-
-<thead>
-<th scope="col" class="left">网络名称</th>
-<th scope="col" class="left">参数数量</th>
-<th scope="col" class="left">错误率</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">词向量模型</td>
-<td class="left">15 MB</td>
-<td class="left">8.484%</td>
-</tr>
-
-</tbody>
-</table>
-</html></center>
-<br>
-
-### 卷积模型(Convolution)
-卷积网络是一种特殊的从词向量表示到句子表示的方法， 也就是将词向量模型额步
-骤3-2进行进一步演化， 变为3个新的子步骤。
-<center> ![](./NetConv.jpg) </center>
-
-文本卷积分为三个步骤：
-1. 获取每个单词左右各k个近邻， 拼接成一个新的向量表示；
-2. 对该表示进行非线性变换 （例如Sigmoid变换）, 成为维度为hidden_dim的新的向量；
-3. 在每个维度上取出在该句话新的向量集合上该维度的最大值作为最后的句子表示向量。 这3个子步骤可配置为:
-
-```python
-text_conv = sequence_conv_pool(input=emb,
-	                           context_start=k,
-	                           context_len=2 * k + 1)
-```
-
-效果总结：
-
-<html>
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-
-<thead>
-<th scope="col" class="left">网络名称</th>
-<th scope="col" class="left">参数数量</th>
-<th scope="col" class="left">错误率</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">卷积模型</td>
-<td class="left">16 MB</td>
-<td class="left">5.628%</td>
-</tr>
-
-</tbody>
-</table></center>
-<br>
-
-### 时序模型(Time Sequence)
-<center> ![](./NetRNN.jpg) </center>
-
-时序模型即为RNN模型, 包括简单的RNN模型、GRU模型、LSTM模型等。
-
-- GRU模型配置：
-
-```python
-gru = simple_gru(input=emb, size=gru_size)
-```
-
-- LSTM模型配置：
-
-```python
-lstm = simple_lstm(input=emb, size=lstm_size)
-```
-
-针对本问题，我们采用单层LSTM模型，并使用了Dropout，效果总结：
-
-<html>
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-
-<thead>
-<th scope="col" class="left">网络名称</th>
-<th scope="col" class="left">参数数量</th>
-<th scope="col" class="left">错误率</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">时序模型</td>
-<td class="left">16 MB</td>
-<td class="left">4.812%</td>
-</tr>
-
-</tbody>
-</table></center>
-</html>
-<br>
-
-## 优化算法(Optimization Algorithm)
-<a href = "../../../doc/ui/trainer_config_helpers_api.html#module-paddle.trainer_config_helpers.optimizers">优化算法</a>包括
-Momentum, RMSProp，AdaDelta，AdaGrad，ADAM，Adamax等，这里采用Adam优化方法，加了L2正则和梯度截断。
-
-```python
-settings(batch_size=128,
-         learning_rate=2e-3,
-         learning_method=AdamOptimizer(),
-         regularization=L2Regularization(8e-4),
-         gradient_clipping_threshold=25)
-```
-
-## 训练模型(Training Model)
-在完成了数据和网络结构搭建之后， 我们进入到训练部分。
-<center> ![](./PipelineTrain.jpg) </center>
-
-训练脚本：我们将训练的命令行保存在了 `train.sh`文件中。训练时所需设置的主要参数如下：
-
-```bash
-paddle train \
---config=trainer_config.py \
---log_period=20 \
---save_dir=./output \
---num_passes=15 \
---use_gpu=false
-```
-这里没有介绍多机分布式训练，可以参考<a href = "../../cluster/index.html">分布式训练</a>的demo学习如何进行多机训练。
-
-## 预测(Prediction)
-可以使用训练好的模型评估带有label的验证集，也可以预测没有label的测试集。
-<center> ![](./PipelineTest.jpg) </center>
-
-测试脚本如下，将会测试配置文件中test.list指定的数据。
-
-```bash
-paddle train \
---use_gpu=false \
---job=test \
---init_model_path=./output/pass-0000x
-```
-
-可以参考<a href = "../../ui/predict/swig_py_paddle.html">Python API预测</a>
-教程，或其他<a href = "../../demo/index.html">demo</a>的Python预测过程。也可以通过如下方式预测。
-
-预测脚本(`predict.sh`)：
-
-```bash
-model="output/pass-00003"
-paddle train \
-    --config=trainer_config.lstm.py \
-    --use_gpu=false \
-    --job=test \
-    --init_model_path=$model \
-    --config_args=is_predict=1 \
-    --predict_output_dir=. \
-
-mv rank-00000 result.txt
-```
-这里以`output/pass-00003`为例进行预测，用户可以根据训练log选择test结果最好的模型来预测。与训练网络配置不同的是：无需label相关的层，指定outputs输出概率层(softmax输出)，
-指定batch_size=1，数据传输无需label数据，预测数据指定test_list的位置。
-
-预测结果以文本的形式保存在`result.txt`中，一行为一个样本，格式如下：
-
-```
-预测ID;ID为0的概率 ID为1的概率
-预测ID;ID为0的概率 ID为1的概率
-```
-
-```
-is_predict = get_config_arg('is_predict', bool, False)
-trn = 'data/train.list' if not is_predict else None
-tst = 'data/test.list' if not is_predict else 'data/pred.list'
-obj = 'process' if not is_predict else 'process_pre'
-batch_size = 128 if not is_predict else 1
-if is_predict:
-    maxid = maxid_layer(output)
-    outputs([maxid,output])
-else:
-    label = data_layer(name="label", size=2)
-    cls = classification_cost(input=output, label=label)
-    outputs(cls)
-```
-
-## 总体效果总结(Summary)
-这些流程中的数据下载、网络配置、训练脚本在`/demo/quick_start`目录，我们在此总
-结上述网络结构在Amazon-Elec测试集(25k)上的效果:
-
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-
-<thead>
-<th scope="col" class="left">网络名称</th>
-<th scope="col" class="left">参数数量</th>
-<th scope="col" class="left">错误率</th>
-<th scope="col" class="left">配置文件</th>
-</tr>
-</thead>
-
-<tbody>
-<tr>
-<td class="left">逻辑回归模型</td>
-<td class="left"> 252KB </td>
-<td class="left">8.652%</td>
-<td class="left">trainer_config.lr.py</td>
-</tr>
-
-<tr>
-<td class="left">词向量模型</td>
-<td class="left"> 15MB </td>
-<td class="left"> 8.484%</td>
-<td class="left">trainer_config.emb.py</td>
-</tr>
-
-<tr>
-<td class="left">卷积模型</td>
-<td class="left"> 16MB </td>
-<td class="left"> 5.628%</td>
-<td class="left">trainer_config.cnn.py</td>
-</tr>
-
-<tr>
-<td class="left">时序模型</td>
-<td class="left"> 16MB </td>
-<td class="left"> 4.812%</td>
-<td class="left">trainer_config.lstm.py</td>
-</tr>
-
-</tbody>
-</table>
-</center>
-<br>
-
-## 附录(Appendix)
-### 命令行参数(Command Line Argument)
-
-* \--config：网络配置
-* \--save_dir：模型存储路径
-* \--log_period：每隔多少batch打印一次日志
-* \--num_passes：训练轮次，一个pass表示过一遍所有训练样本
-* \--config_args：命令指定的参数会传入网络配置中。
-* \--init_model_path：指定初始化模型路径，可用在测试或训练时指定初始化模型。
-
-默认一个pass保存一次模型，也可以通过saving_period_by_batches设置每隔多少batch保存一次模型。
-可以通过show_parameter_stats_period设置打印参数信息等。
-其他参数请参考<a href = "../../ui/index.html#command-line-argument">令行参数文档</a>。
-
-### 输出日志(Log)
-
-```
-TrainerInternal.cpp:160]  Batch=20 samples=2560 AvgCost=0.628761 CurrentCost=0.628761 Eval: classification_error_evaluator=0.304297  CurrentEval: classification_error_evaluator=0.304297
-```
-模型训练会看到这样的日志，详细的参数解释如下面表格：
-<center>
-<table border="2" cellspacing="0" cellpadding="6" rules="all" frame="border">
-
-<thead>
-<th scope="col" class="left">名称</th>
-<th scope="col" class="left">解释</th>
-</tr>
-</thead>
-
-<tr>
-<td class="left">Batch=20</td>
-<td class="left"> 表示过了20个batch </td>
-</tr>
-
-<tr>
-<td class="left">samples=2560</td>
-<td class="left"> 表示过了2560个样本 </td>
-</tr>
-
-<tr>
-<td class="left">AvgCost</td>
-<td class="left"> 每个pass的第0个batch到当前batch所有样本的平均cost </td>
-</tr>
-
-<tr>
-<td class="left">CurrentCost</td>
-<td class="left"> 当前log_period个batch所有样本的平均cost </td>
-</tr>
-
-<tr>
-<td class="left">Eval: classification_error_evaluator</td>
-<td class="left"> 每个pass的第0个batch到当前batch所有样本的平均分类错误率 </td>
-</tr>
-
-<tr>
-<td class="left">CurrentEval: classification_error_evaluator</td>
-<td class="left"> 当前log_period个batch所有样本的平均分类错误率 </td>
-</tr>
-
-</tbody>
-</table>
-</center>
-<br>
diff --git a/doc_cn/demo/sentiment_analysis/index.rst b/doc_cn/demo/sentiment_analysis/index.rst
deleted file mode 100644
index 82400b2459ebcaf89ff5e884edfe721b9ec01d7f..0000000000000000000000000000000000000000
--- a/doc_cn/demo/sentiment_analysis/index.rst
+++ /dev/null
@@ -1,8 +0,0 @@
-情感分析教程
-===========================
-
-.. toctree::
-    :maxdepth: 3
-    :glob:
-
-    Training Locally <sentiment_analysis.md>
\ No newline at end of file
diff --git a/doc_cn/howto/build_docker_image.rst b/doc_cn/howto/build_docker_image.rst
deleted file mode 100644
index 46ba07d9ad7c1e1843cd953fa5c5fe1dedf6cdf1..0000000000000000000000000000000000000000
--- a/doc_cn/howto/build_docker_image.rst
+++ /dev/null
@@ -1,35 +0,0 @@
-构建PaddlePaddle的Docker Image
-==============================
-PaddlePaddle的Docker Image构建源码放置在 ``${源码根目录}/paddle/scripts/docker/`` 目录下。该目录有三类文件：
-
-- Dockerfile：Docker Image的描述文件，包括构建步骤、各种参数和维护人员等。
-  
-  - 一共维护了12个Dockerfile，Dockerfile.m4是它们的模板。
-  - PaddlePaddle中所有的Image都基于ubuntu 14.04。
-
-- build.sh：Docker Image的构建脚本，使用方式见下一小节。
-- generate.sh：通过Dockerfile.m4模板生成不同的Dockerfile。
-
-使用脚本构建Docker Image
-------------------------
-
-进入源码目录，执行 ``docker build`` 命令，即可在本地编译出PaddlePaddle的镜像。简单的使用样例为
-
-..  code-block:: bash
-
-    cd ${源码根目录}/paddle/scripts/docker/
-    docker build --build-arg LOWEST_DL_SPEED=50K \
-                 --build-arg WITH_GPU=ON \
-                 --tag  paddle_gpu:latest .
-
-其中，``--build-arg`` 传入的配置参数包括:
-
-- LOWEST\_DL\_SPEED\: 在多线程下载过程中，设置下载线程的最低速度。
-
-  - 默认单位是Bytes，但可以传入10K、10M、或10G等这样的单位。
-  - 如果小于这个速度，那么这个线程将会关闭。当所有的线程都关闭了，那么下载进程将会重启。
--  WITH\_GPU\: ON or OFF，是否开启GPU功能。注意，
-  - **编译** PaddlePaddle的GPU版本 **不一定** 要在具有GPU的机器上进行。
-  - **运行** PaddlePaddle的GPU版本 **一定** 要在具有GPU的机器上运行。
-
-注意：所有Image的构建在Docker 1.12版本测试通过, 低于1.12的版本并没有测试。原因是旧版本可能缺乏 ``--build-arg`` 参数，从而不能在运行编译命令的时候接受参数。
diff --git a/doc_cn/index.rst b/doc_cn/index.rst
deleted file mode 100644
index 88a9f79fd23c97785a054af2aa9ee53f8578ef63..0000000000000000000000000000000000000000
--- a/doc_cn/index.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-PaddlePaddle文档
-================
-
-使用指南
---------
-* `介绍 <introduction/index.html>`_
-* `快速入门 <demo/quick_start/index.html>`_
-* `基本使用概念 <concepts/use_concepts.html>`_
-* `编译与安装 <build_and_install/index.html>`_
-* `用户接口 <ui/index.html>`_
-* `使用示例 <demo/index.html>`_
-* `模型配置 <../doc/ui/api/trainer_config_helpers/index.html>`_
-* `集群训练 <cluster/index.html>`_
-
-开发指南
---------
-* `新写Layer <../doc/dev/new_layer/index.html>`_
-* `如何贡献文档 <howto/how_to_write_docs/index.html>`_
-* `如何构建Docker Image <howto/build_docker_image.html>`_
-
-算法教程
---------
-
-* `Recurrent Group教程 <algorithm/rnn/rnn-tutorial.html>`_
-* `单层RNN示例 <../doc/algorithm/rnn/rnn.html>`_
-* :ref:`algo_hrnn_rnn_api_compare`
-* `支持双层序列作为输入的Layer <algorithm/rnn/hierarchical-layer.html>`_
-
-常见问题
---------
-
-* `常见问题 <faq/index.html>`_
diff --git a/doc_cn/introduction/parameters.png b/doc_cn/introduction/parameters.png
deleted file mode 100644
index 2ec67480951e21f0400bce1c34b3108dcd65c18c..0000000000000000000000000000000000000000
Binary files a/doc_cn/introduction/parameters.png and /dev/null differ
diff --git a/doc_cn/ui/cmd/index.rst b/doc_cn/ui/cmd/index.rst
deleted file mode 100644
index 31a8b8a79f4a87101bd6030eb4e779fd11d65811..0000000000000000000000000000000000000000
--- a/doc_cn/ui/cmd/index.rst
+++ /dev/null
@@ -1,20 +0,0 @@
-命令
-====
-
-安装好PaddlePaddle后，在命令行直接敲击 ``paddle`` 或 ``paddle --help`` 会显示如下一些命令。
-
-* ``train`` Start a paddle_trainer
-    启动一个PaddlePaddle训练进程。 ``paddle train`` 可以通过命令行参数 ``-local=true`` 启动一个单机的训练进程；也可以和 ``paddle pserver`` 一起使用启动多机的分布式训练进程。
-* ``pserver`` Start a paddle_pserver_main
-    在多机分布式训练下启动PaddlePaddle的parameter server进程。
-* ``version`` Print paddle version
-    用于打印当前PaddlePaddle的版本和编译选项相关信息。常见的输出格式如下：1）第一行说明了PaddlePaddle的版本信息；2）第二行开始说明了一些主要的编译选项，具体意义可以参考 `编译参数选项文件 <../../build_and_install/cmake/compile_options.html>`_ 。
-
-    ..  literalinclude:: paddle_version.txt
-
-* ``merge_model`` Start a paddle_merge_model
-    用于将PaddlePaddle的模型参数文件和模型配置文件打包成一个文件，方便做部署分发。
-* ``dump_config`` Dump the trainer config as proto string
-    用于将PaddlePaddle的模型配置文件以proto string的格式打印出来。
-* ``make_diagram``
-    使用graphviz对PaddlePaddle的模型配置文件进行绘制。
\ No newline at end of file
diff --git a/doc_cn/ui/cmd/paddle_version.txt b/doc_cn/ui/cmd/paddle_version.txt
deleted file mode 100644
index 33e2e4de7c24afd481eb6ca7eabed4924863d2b7..0000000000000000000000000000000000000000
--- a/doc_cn/ui/cmd/paddle_version.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-PaddlePaddle 0.8.0b, compiled with
-    with_avx: ON
-    with_gpu: ON
-    with_double: OFF
-    with_python: ON
-    with_rdma: OFF
-    with_glog: ON
-    with_gflags: ON
-    with_metric_learning: OFF
-    with_timer: OFF
-    with_predict_sdk: OFF
diff --git a/doc_cn/ui/index.rst b/doc_cn/ui/index.rst
deleted file mode 100644
index ff36c9adb690f4126cf6ee332a9f0b09648270bd..0000000000000000000000000000000000000000
--- a/doc_cn/ui/index.rst
+++ /dev/null
@@ -1,32 +0,0 @@
-########
-用户接口
-########
-
-数据提供
-========
-
-..  toctree::
-    :maxdepth: 1
-
-    data_provider/dataprovider.rst
-    data_provider/pydataprovider2.rst
-
-命令及命令行参数
-================
-
-..  toctree::
-    :maxdepth: 1
-
-    cmd/index.rst
-
-* `参数用例 <../../doc/ui/cmd_argument/use_case.html>`_
-* `参数分类 <../../doc/ui/cmd_argument/argument_outline.html>`_
-* `参数描述 <../../doc/ui/cmd_argument/detail_introduction.html>`_
-
-预测
-=======
-
-..  toctree::
-    :maxdepth: 1
-
-    predict/swig_py_paddle.rst
diff --git a/doc_theme/static/js/paddle_doc_init.js b/doc_theme/static/js/paddle_doc_init.js
index 5c815a8d3a3dab9bdbce544ff3bb49be40ad8934..153ce30745a0a21097fb385f2d66f12e6c8d5be5 100644
--- a/doc_theme/static/js/paddle_doc_init.js
+++ b/doc_theme/static/js/paddle_doc_init.js
@@ -28,4 +28,4 @@ $(document).ready(function(){
     $('.doc-menu-vertical').find('li.current').last().addClass('active');
 
     $('.doc-menu-vertical').perfectScrollbar();
-});
\ No newline at end of file
+});
diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt
index 9b2d122a09adabd766014a9d21a167eec5b2de32..6ad1d79e59b11b2c1f7aacf22d13347b3fd8e0e2 100644
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -17,22 +17,18 @@ add_library(paddle_api STATIC
         ${API_SOURCES})
 add_dependencies(paddle_api gen_proto_cpp)
 
+list(LENGTH "${GFLAGS_LIBRARIES}" GFLAGS_LIBRARIES_LENGTH)
 
-if(WITH_GFLAGS)
-  list(LENGTH "${GFLAGS_LIBRARIES}" GFLAGS_LIBRARIES_LENGTH)
-
-  if(${GFLAGS_LIBRARIES_LENGTH} EQUAL 0 AND TARGET "${GFLAGS_LIBRARIES}")
-    # Because gflags compiled by cmake, so it is imported by cmake target,
-    # not a real library path. Get the real library path here.
-    message(STATUS "GFLAGS Libraries is ${GFLAGS_LIBRARIES}")
-    get_target_property(GFLAGS_LOCATION ${GFLAGS_LIBRARIES} LOCATION)
-    message(STATUS "GFLAGS Target location is ${GFLAGS_LOCATION}")
-  else()
-    set(GFLAGS_LOCATION ${GFLAGS_LIBRARIES})
-  endif()
+if(${GFLAGS_LIBRARIES_LENGTH} EQUAL 0 AND TARGET "${GFLAGS_LIBRARIES}")
+# Because gflags compiled by cmake, so it is imported by cmake target,
+# not a real library path. Get the real library path here.
+message(STATUS "GFLAGS Libraries is ${GFLAGS_LIBRARIES}")
+get_target_property(GFLAGS_LOCATION ${GFLAGS_LIBRARIES} LOCATION)
+message(STATUS "GFLAGS Target location is ${GFLAGS_LOCATION}")
+else()
+set(GFLAGS_LOCATION ${GFLAGS_LIBRARIES})
 endif()
 
-
 configure_file(
     paddle_api_config.py.in
     ${PROJ_ROOT}/paddle/api/paddle_api_config.py
@@ -57,7 +53,7 @@ add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/dist/.timestamp
             paddle_trainer
             paddle_api
             paddle_cuda
-	    ${PY_PADDLE_PYTHON_FILES}
+        ${PY_PADDLE_PYTHON_FILES}
 )
 
 install(DIRECTORY ${PROJ_ROOT}/paddle/dist/
diff --git a/paddle/api/GradientMachine.cpp b/paddle/api/GradientMachine.cpp
index c1b546dbcb4dc6581bbcfe6a821ab15d0e048ea1..297eaa19bb9981c7f07c90763d76494b7910af93 100644
--- a/paddle/api/GradientMachine.cpp
+++ b/paddle/api/GradientMachine.cpp
@@ -15,8 +15,8 @@ limitations under the License. */
 #include "PaddleAPI.h"
 #include "PaddleAPIPrivate.h"
 
-#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
 #include "Internal.h"
+#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
 
 std::vector<int> GradientMachine::defaultParamTypes = {
     PARAMETER_VALUE, PARAMETER_GRADIENT, PARAMETER_MOMENTUM};
diff --git a/paddle/api/Internal.h b/paddle/api/Internal.h
index 4a07880d80440526002f31b1fccff4f7c25ea182..d48dd3a04c14f559e3c8ceb67226ddb36272e444 100644
--- a/paddle/api/Internal.h
+++ b/paddle/api/Internal.h
@@ -16,14 +16,13 @@ limitations under the License. */
 
 #include "PaddleAPI.h"
 
-#include <vector>
 #include <algorithm>
+#include <vector>
 
 template <typename T1, typename T2>
 void staticCastVector(std::vector<T2>* dest, const std::vector<T1>& src) {
   dest->resize(src.size());
-  std::transform(src.begin(),
-                 src.end(),
-                 dest->begin(),
-                 [](T1 t) { return static_cast<T2>(t); });
+  std::transform(src.begin(), src.end(), dest->begin(), [](T1 t) {
+    return static_cast<T2>(t);
+  });
 }
diff --git a/paddle/api/Matrix.cpp b/paddle/api/Matrix.cpp
index d4c00e7093d1ed62b37ff2ce05e44fc9bdbc204a..7c375e5cfb91fc5824f823346af6f80c90b36821 100644
--- a/paddle/api/Matrix.cpp
+++ b/paddle/api/Matrix.cpp
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "PaddleAPI.h"
 #include "paddle/math/Matrix.h"
-#include "paddle/math/SparseMatrix.h"
-#include "paddle/math/CpuSparseMatrix.h"
-#include <iostream>
 #include <cstring>
+#include <iostream>
+#include "PaddleAPI.h"
+#include "paddle/math/CpuSparseMatrix.h"
+#include "paddle/math/SparseMatrix.h"
 
 struct MatrixPrivate {
   std::shared_ptr<paddle::Matrix> mat;
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
index f3c80e3b06ebd824f44ebec49158bd06e25b1a1c..84a66719c33678fc4aeb038bb81a6b7c5d0c93fb 100644
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -16,8 +16,8 @@ limitations under the License. */
 
 #include <stddef.h>
 #include <stdint.h>
-#include <string>
 #include <stdexcept>
+#include <string>
 #include <vector>
 #include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/TypeDefs.h"
diff --git a/paddle/api/Parameter.cpp b/paddle/api/Parameter.cpp
index 742ad0679cf090b826405db1d2b24de206ed8b32..4eed00a84a695f2c48ff93b33419ae2b3dd03768 100644
--- a/paddle/api/Parameter.cpp
+++ b/paddle/api/Parameter.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "PaddleAPI.h"
 #include "paddle/parameter/Parameter.h"
+#include "PaddleAPI.h"
 
 struct ParameterPrivate {
   std::shared_ptr<paddle::Parameter> sharedPtr;
diff --git a/paddle/api/ParameterOptimizer.cpp b/paddle/api/ParameterOptimizer.cpp
index 606dccd5ac4a4e12a7fe414627e53540f594184a..21b851dd5e26c4752888067b20d0b1e16a4ab52d 100644
--- a/paddle/api/ParameterOptimizer.cpp
+++ b/paddle/api/ParameterOptimizer.cpp
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "PaddleAPI.h"
-#include "PaddleAPIPrivate.h"
 #include "paddle/parameter/ParameterOptimizer.h"
-#include "Internal.h"
 #include <algorithm>
+#include "Internal.h"
+#include "PaddleAPI.h"
+#include "PaddleAPIPrivate.h"
 
 struct ParameterOptimizerPrivate {
   std::unique_ptr<paddle::ParameterOptimizer> optimizer;
@@ -36,16 +36,13 @@ struct ParameterTraverseCallbackPrivate {
              size_t sparseId) {
     std::vector<paddle::VectorPtr> real_vecs;
     real_vecs.resize(vecs.size());
-    std::transform(vecs.begin(),
-                   vecs.end(),
-                   real_vecs.begin(),
-                   [](Vector* v) {
-                     if (v) {
-                       return *(paddle::VectorPtr*)(v->getSharedPtr());
-                     } else {
-                       return paddle::VectorPtr();
-                     }
-                   });
+    std::transform(vecs.begin(), vecs.end(), real_vecs.begin(), [](Vector* v) {
+      if (v) {
+        return *(paddle::VectorPtr*)(v->getSharedPtr());
+      } else {
+        return paddle::VectorPtr();
+      }
+    });
 
     paddle::ParameterConfig& real_conf =
         *(paddle::ParameterConfig*)(const_cast<ParameterConfig&>(conf)
diff --git a/paddle/api/SequenceGenerator.cpp b/paddle/api/SequenceGenerator.cpp
index 5c65b34f2393dd0d41fcf5293f5a4ed8a402beb6..8428edc60df6219fd1d3aebf74b0911a79d370cb 100644
--- a/paddle/api/SequenceGenerator.cpp
+++ b/paddle/api/SequenceGenerator.cpp
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <algorithm>
+#include <iterator>
+#include <sstream>
+#include <vector>
 #include "PaddleAPI.h"
 #include "paddle/gserver/gradientmachines/GradientMachine.h"
 #include "paddle/parameter/Argument.h"
 #include "paddle/utils/Flags.h"
-#include <vector>
-#include <sstream>
-#include <algorithm>
-#include <iterator>
 
 // used to represent partial sequence
 struct Path {
diff --git a/paddle/api/Trainer.cpp b/paddle/api/Trainer.cpp
index 9aeb874bdcee8101d255b8d0fbc80b82647f80f1..d83dc380beeec3747451a483f4811eb833e8c226 100644
--- a/paddle/api/Trainer.cpp
+++ b/paddle/api/Trainer.cpp
@@ -16,20 +16,20 @@ limitations under the License. */
 #include "PaddleAPIPrivate.h"
 
 #include <stdlib.h>
-#include <memory>
 #include <atomic>
+#include <memory>
 
+#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
 #include "paddle/trainer/ParamUtil.h"
 #include "paddle/trainer/Trainer.h"
-#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
 #include "paddle/trainer/TrainerInternal.h"
 #include "paddle/utils/Flags.h"
 
 using paddle::real;
 
-P_DECLARE_string(config);
-P_DECLARE_string(init_model_path);
-P_DECLARE_int32(start_pass);
+DECLARE_string(config);
+DECLARE_string(init_model_path);
+DECLARE_int32(start_pass);
 
 struct TrainerPrivate : public paddle::Trainer {
   bool _trainOneBatch(size_t batchSize);
diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp
index 0c9c048099771653c56d922ef106b23881e965f3..c3f739568f50b6ee8b0894d06a4d7f91c7816879 100644
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
@@ -14,16 +14,16 @@ limitations under the License. */
 
 #include "PaddleAPI.h"
 
-#include "paddle/utils/Util.h"
-#include "paddle/utils/PythonUtil.h"
-#include "paddle/utils/Flags.h"
-#include "paddle/utils/Excepts.h"
 #include "paddle/parameter/Parameter.h"
+#include "paddle/utils/Excepts.h"
+#include "paddle/utils/Flags.h"
+#include "paddle/utils/PythonUtil.h"
+#include "paddle/utils/Util.h"
 
 #include <fenv.h>
+#include <algorithm>
 #include <iostream>
 #include <iterator>
-#include <algorithm>
 
 void initPaddle(int argc, char** argv) {
   paddle::initMain(argc, argv);
diff --git a/paddle/api/Vector.cpp b/paddle/api/Vector.cpp
index 4f3ab7de60d28415368500597ced7a11afbfa30c..874f2fd044e9e86b44f8ca69f08bdfd3287d4749 100644
--- a/paddle/api/Vector.cpp
+++ b/paddle/api/Vector.cpp
@@ -282,7 +282,7 @@ FloatArray Vector::getData() const {
 }
 
 void Vector::copyFrom(Vector* src) throw(RangeError) {
-  if (src->m->vec->getSize() !=  m->vec->getSize()) {
+  if (src->m->vec->getSize() != m->vec->getSize()) {
     throw RangeError();
   }
   m->vec->copyFrom(*src->m->vec);
diff --git a/paddle/api/paddle_api_config.py.in b/paddle/api/paddle_api_config.py.in
index a2352250c31efa7ee3c4c8338d95dce5a5b9a511..23542b952b7699d66cf64b47d0354e9078ae06d9 100644
--- a/paddle/api/paddle_api_config.py.in
+++ b/paddle/api/paddle_api_config.py.in
@@ -8,9 +8,7 @@ CMAKE_DL_LIBS="@CMAKE_DL_LIBS@"
 
 WITH_PYTHON="@WITH_PYTHON@"
 PYTHON_LIBRARIES="@PYTHON_LIBRARIES@"
-WITH_GLOG="@WITH_GLOG@"
 LIBGLOG_LIBRARY="@LIBGLOG_LIBRARY@"
-WITH_GFLAGS="@WITH_GFLAGS@"
 GFLAGS_LIBRARIES="@GFLAGS_LIBRARIES@"
 GFLAGS_LOCATION="@GFLAGS_LOCATION@"
 CBLAS_LIBRARIES="@CBLAS_LIBS@"
diff --git a/paddle/api/paddle_ld_flags.py b/paddle/api/paddle_ld_flags.py
index 85cc54700faceb5a514cebe665a2da5ed2f7aa3c..51d7dfee58b786512201577872559ae510051ba9 100644
--- a/paddle/api/paddle_ld_flags.py
+++ b/paddle/api/paddle_ld_flags.py
@@ -47,10 +47,8 @@ try:
             self.with_python = PaddleLDFlag.cmake_bool(WITH_PYTHON)
             self.python_libs = PYTHON_LIBRARIES
 
-            self.with_glog = PaddleLDFlag.cmake_bool(WITH_GLOG)
             self.glog_libs = LIBGLOG_LIBRARY
 
-            self.with_gflags = PaddleLDFlag.cmake_bool(WITH_GFLAGS)
             self.with_coverage = PaddleLDFlag.cmake_bool(WITH_COVERALLS)
             self.gflags_libs = GFLAGS_LIBRARIES
             self.gflags_location = GFLAGS_LOCATION
@@ -88,6 +86,8 @@ try:
                 "-lpaddle_cuda",
                 "-lpaddle_api",
                 self.normalize_flag(self.protolib),
+                self.normalize_flag(self.glog_libs),
+                self.normalize_flag(self.gflags_libs),
                 self.normalize_flag(self.zlib),
                 self.normalize_flag(self.thread),
                 self.normalize_flag(self.dl_libs),
@@ -96,10 +96,6 @@ try:
 
             if self.with_python:
                 libs.append(self.normalize_flag(self.python_libs))
-            if self.with_glog:
-                libs.append(self.normalize_flag(self.glog_libs))
-            if self.with_gflags:
-                libs.append(self.normalize_flag(self.gflags_libs))
             if self.with_gpu:
                 libs.append(self.normalize_flag(self.curt))
             if self.with_coverage:
diff --git a/paddle/api/test/testMatrix.py b/paddle/api/test/testMatrix.py
index f76f84d2e12af7802532b014d3983fe017fbe2b1..37666bdccc9aedfe8f8079124129aad2ade53a43 100644
--- a/paddle/api/test/testMatrix.py
+++ b/paddle/api/test/testMatrix.py
@@ -100,11 +100,12 @@ class TestMatrix(unittest.TestCase):
 
             for a, e in zip(gpu_m.getData(), [1.0, 3.23, 3.0, 4.0, 5.0, 6.0]):
                 self.assertAlmostEqual(a, e)
-    
+
     def test_numpy(self):
         numpy_mat = np.matrix([[1, 2], [3, 4], [5, 6]], dtype="float32")
         m = swig_paddle.Matrix.createDenseFromNumpy(numpy_mat)
-        self.assertEqual((int(m.getHeight()), int(m.getWidth())), numpy_mat.shape)
+        self.assertEqual((int(m.getHeight()), int(m.getWidth())),
+                         numpy_mat.shape)
         self.assertEqual(m.isGpu(), swig_paddle.isUsingGpu())
         for a, e in zip(m.getData(), [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]):
             self.assertAlmostEqual(a, e)
diff --git a/paddle/api/test/testVector.py b/paddle/api/test/testVector.py
index 525ed97eddbc51188f8c4a6d5c5c1c13ce08bac2..1ab095c1d3d0d2c84d2d2f95a03f172b901de209 100644
--- a/paddle/api/test/testVector.py
+++ b/paddle/api/test/testVector.py
@@ -26,17 +26,17 @@ class TestIVector(unittest.TestCase):
             self.assertEqual(m[i], 0)
             m[i] = i
             self.assertEqual(m[i], i)
-        
+
         m = swig_paddle.IVector.createZero(10)
         self.assertEqual(m.isGpu(), swig_paddle.isUsingGpu())
-        self.assertEqual(m.getData(), [0]*10)
+        self.assertEqual(m.getData(), [0] * 10)
 
     def test_create(self):
         m = swig_paddle.IVector.create(range(10), False)
         self.assertIsNotNone(m)
         for i in xrange(10):
             self.assertEqual(m[i], i)
-        
+
         m = swig_paddle.IVector.create(range(10))
         self.assertEqual(m.isGpu(), swig_paddle.isUsingGpu())
         self.assertEqual(m.getData(), range(10))
@@ -69,7 +69,7 @@ class TestIVector(unittest.TestCase):
             expect_vec = range(0, 10)
             expect_vec[4] = 7
             self.assertEqual(vec.getData(), expect_vec)
-    
+
     def test_numpy(self):
         vec = np.array([1, 3, 4, 65, 78, 1, 4], dtype="int32")
         iv = swig_paddle.IVector.createVectorFromNumpy(vec)
@@ -85,10 +85,10 @@ class TestVector(unittest.TestCase):
             self.assertTrue(util.doubleEqual(v[i], 0))
             v[i] = i
             self.assertTrue(util.doubleEqual(v[i], i))
-        
+
         v = swig_paddle.Vector.createZero(10)
         self.assertEqual(v.isGpu(), swig_paddle.isUsingGpu())
-        self.assertEqual(v.getData(), [0]*10)
+        self.assertEqual(v.getData(), [0] * 10)
 
     def testCreate(self):
         v = swig_paddle.Vector.create([x / 100.0 for x in xrange(100)], False)
@@ -96,14 +96,13 @@ class TestVector(unittest.TestCase):
         for i in xrange(len(v)):
             self.assertTrue(util.doubleEqual(v[i], i / 100.0))
         self.assertEqual(100, len(v))
-        
+
         v = swig_paddle.Vector.create([x / 100.0 for x in xrange(100)])
         self.assertEqual(v.isGpu(), swig_paddle.isUsingGpu())
         self.assertEqual(100, len(v))
         vdata = v.getData()
         for i in xrange(len(v)):
             self.assertTrue(util.doubleEqual(vdata[i], i / 100.0))
-        
 
     def testCpuNumpy(self):
         numpy_arr = np.array([1.2, 2.3, 3.4, 4.5], dtype="float32")
@@ -128,7 +127,7 @@ class TestVector(unittest.TestCase):
 
         for i in xrange(1, len(numpy_3)):
             util.doubleEqual(numpy_3[i], vec[i])
-    
+
     def testNumpy(self):
         numpy_arr = np.array([1.2, 2.3, 3.4, 4.5], dtype="float32")
         vec = swig_paddle.Vector.createVectorFromNumpy(numpy_arr)
@@ -136,7 +135,6 @@ class TestVector(unittest.TestCase):
         vecData = vec.getData()
         for n, v in zip(numpy_arr, vecData):
             self.assertTrue(util.doubleEqual(n, v))
-        
 
     def testCopyFromNumpy(self):
         vec = swig_paddle.Vector.createZero(1, False)
diff --git a/paddle/cuda/include/hl_base.h b/paddle/cuda/include/hl_base.h
index 0b9dfc6117685b48102a0681b38f25493259d624..84c5f2d5c91feb7896643d2c5f60a279ebe944e7 100644
--- a/paddle/cuda/include/hl_base.h
+++ b/paddle/cuda/include/hl_base.h
@@ -223,9 +223,9 @@ typedef struct {
 
 #ifdef __NVCC__
 
-#include "paddle/utils/Logging.h"
-#include "hl_cuda.h"
 #include "cuda_runtime.h"
+#include "hl_cuda.h"
+#include "paddle/utils/Logging.h"
 
 extern __thread bool g_sync_flag;
 extern __thread cudaStream_t default_stream;
diff --git a/paddle/cuda/include/hl_dso_loader.h b/paddle/cuda/include/hl_dso_loader.h
index 9ddf0e61ee5ecb49e02ac7f6f35e4961cb2119f1..20c13f21e61a92b0635b686f6f724ae2b44518cc 100644
--- a/paddle/cuda/include/hl_dso_loader.h
+++ b/paddle/cuda/include/hl_dso_loader.h
@@ -16,8 +16,8 @@ limitations under the License. */
 #define HL_DSO_LOADER_H_
 
 #include <dlfcn.h>
-#include <string>
 #include <memory>
+#include <string>
 #include "hl_base.h"
 
 /**
diff --git a/paddle/cuda/include/hl_gpu.h b/paddle/cuda/include/hl_gpu.h
index aad0450c8c9b0ce7ed647962fdf94985c2f4a6fc..ede2670882ee2b93f610a2261a4ecc1784bc2d0c 100644
--- a/paddle/cuda/include/hl_gpu.h
+++ b/paddle/cuda/include/hl_gpu.h
@@ -15,28 +15,28 @@ limitations under the License. */
 #ifndef HL_GPU_H_
 #define HL_GPU_H_
 
+#include "hl_aggregate.h"
 #include "hl_base.h"
+#include "hl_cnn.h"
 #include "hl_cuda.h"
 #include "hl_cuda_cublas.h"
 #include "hl_cuda_cudnn.h"
-#include "hl_matrix.h"
-#include "hl_aggregate.h"
-#include "hl_cnn.h"
-#include "hl_sparse.h"
 #include "hl_lstm.h"
+#include "hl_matrix.h"
 #include "hl_sequence.h"
+#include "hl_sparse.h"
 #include "hl_warpctc_wrap.h"
 
 #ifdef HPPL_STUB_FUNC
-#include "stub/hl_cuda_stub.h"
-#include "stub/hl_cuda_cublas_stub.h"
-#include "stub/hl_cuda_cudnn_stub.h"
-#include "stub/hl_matrix_stub.h"
 #include "stub/hl_aggregate_stub.h"
 #include "stub/hl_cnn_stub.h"
-#include "stub/hl_sparse_stub.h"
+#include "stub/hl_cuda_cublas_stub.h"
+#include "stub/hl_cuda_cudnn_stub.h"
+#include "stub/hl_cuda_stub.h"
 #include "stub/hl_lstm_stub.h"
+#include "stub/hl_matrix_stub.h"
 #include "stub/hl_sequence_stub.h"
+#include "stub/hl_sparse_stub.h"
 #endif
 
 #endif /* HL_GPU_H_ */
diff --git a/paddle/cuda/include/hl_time.h b/paddle/cuda/include/hl_time.h
index f214b055f98de8eae76554bb4ec1deb868903750..f63f02582060156562061f73c429fc7bbd878d2c 100644
--- a/paddle/cuda/include/hl_time.h
+++ b/paddle/cuda/include/hl_time.h
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #ifndef HL_TIME_H_
 #define HL_TIME_H_
-
+#include <cstdint>
 /**
  * @brief   High resolution timer.
  *
diff --git a/paddle/cuda/src/hl_cuda_cublas.cc b/paddle/cuda/src/hl_cuda_cublas.cc
index 7cede8c63c8a6503b3cdb73f9cb6d01cba23af7a..182e8ab218cce18448f8a08f5c1a1dab7e38f2b6 100644
--- a/paddle/cuda/src/hl_cuda_cublas.cc
+++ b/paddle/cuda/src/hl_cuda_cublas.cc
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "hl_cuda_cublas.h"
 #include <sys/time.h>
 #include <mutex>
 #include "hl_cuda.h"
-#include "hl_cuda_cublas.h"
-#include "hl_thread.ph"
 #include "hl_dso_loader.h"
+#include "hl_thread.ph"
 #include "paddle/utils/Logging.h"
 
 namespace dynload {
diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc
index 9c9b8906c2b3137be6fbbe79a2cbc126f9b8e6f7..8cddf10d40c6277c6bb29a4fe11e5845a2770213 100644
--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@@ -12,19 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "hl_cuda_cudnn.h"
 #include <cudnn.h>
 #include <mutex>
-#include "hl_cuda_cudnn.h"
 #include "hl_cuda_cudnn.ph"
-#include "hl_thread.ph"
 #include "hl_dso_loader.h"
-#include "paddle/utils/Logging.h"
+#include "hl_thread.ph"
 #include "paddle/utils/CommandLineParser.h"
+#include "paddle/utils/Logging.h"
 
-P_DEFINE_int32(cudnn_conv_workspace_limit_in_mb,
-               4096,
-               "Specify cuDNN max workspace limit, in units MB, "
-               "4096MB=4GB by default.");
+DEFINE_int32(cudnn_conv_workspace_limit_in_mb,
+             4096,
+             "Specify cuDNN max workspace limit, in units MB, "
+             "4096MB=4GB by default.");
 
 namespace dynload {
 
diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc
index d1814482929768ea6626459ca51af5ad527e7b43..a71eecba2736234dafaf6b67e5efac5358a30871 100644
--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -12,6 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+// clang-format off
+// Because clang-format 4.X and clang-format 3.8+ format
+// following lines in different. So disable clang-format.
+#include "hl_cuda.h"
 #include <cuda_profiler_api.h>
 #include <string.h>
 #include <sys/syscall.h>
@@ -23,6 +27,7 @@ limitations under the License. */
 #include "hl_dso_loader.h"
 #include "hl_thread.ph"
 #include "paddle/utils/Logging.h"
+// clang-format on
 
 namespace dynload {
 
diff --git a/paddle/cuda/src/hl_cudart_wrap.cc b/paddle/cuda/src/hl_cudart_wrap.cc
index a3ac750b530eb10f3889a3ab3cdef7330037acc1..ecc03a729dde2f2b4f8f004234a47d9272997a50 100644
--- a/paddle/cuda/src/hl_cudart_wrap.cc
+++ b/paddle/cuda/src/hl_cudart_wrap.cc
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #ifdef PADDLE_USE_DSO
 
-#include <mutex>
 #include <cuda_runtime.h>
+#include <mutex>
 #include "hl_dso_loader.h"
 
 /**
diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/cuda/src/hl_dso_loader.cc
index f509b8924319dcaa48adaf1de32fec03e45d61c5..54c7620fc081f681d9d33bcd711008fa5029df05 100644
--- a/paddle/cuda/src/hl_dso_loader.cc
+++ b/paddle/cuda/src/hl_dso_loader.cc
@@ -16,21 +16,21 @@ limitations under the License. */
 #include "paddle/utils/CommandLineParser.h"
 #include "paddle/utils/Logging.h"
 
-P_DEFINE_string(cudnn_dir,
-                "",
-                "Specify path for loading libcudnn.so. For instance, "
-                "/usr/local/cudnn/lib. If empty [default], dlopen "
-                "will search cudnn from LD_LIBRARY_PATH");
-
-P_DEFINE_string(cuda_dir,
-                "",
-                "Specify path for loading cuda library, such as libcublas, "
-                "libcurand. For instance, /usr/local/cuda/lib64. (Note: "
-                "libcudart can not be specified by cuda_dir, since some "
-                "build-in function in cudart already ran before main entry). "
-                "If default, dlopen will search cuda from LD_LIBRARY_PATH");
-
-P_DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
+DEFINE_string(cudnn_dir,
+              "",
+              "Specify path for loading libcudnn.so. For instance, "
+              "/usr/local/cudnn/lib. If empty [default], dlopen "
+              "will search cudnn from LD_LIBRARY_PATH");
+
+DEFINE_string(cuda_dir,
+              "",
+              "Specify path for loading cuda library, such as libcublas, "
+              "libcurand. For instance, /usr/local/cuda/lib64. (Note: "
+              "libcudart can not be specified by cuda_dir, since some "
+              "build-in function in cudart already ran before main entry). "
+              "If default, dlopen will search cuda from LD_LIBRARY_PATH");
+
+DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
 
 static inline std::string join(const std::string& part1,
                                const std::string& part2) {
diff --git a/paddle/cuda/src/hl_time.cc b/paddle/cuda/src/hl_time.cc
index 300506589967bb257b6d2ea1ca39a6dfd592d98d..7e5d7e8aaecbcdc61c1e5b5006a2958d4dc84460 100644
--- a/paddle/cuda/src/hl_time.cc
+++ b/paddle/cuda/src/hl_time.cc
@@ -12,10 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <chrono>
+#include "hl_time.h"
 #include <stdlib.h>
+#include <chrono>
+#include <cstdint>
 #include <iostream>
-#include "hl_time.h"
 
 using std::chrono::high_resolution_clock;
 
diff --git a/paddle/cuda/src/hl_warpctc_wrap.cc b/paddle/cuda/src/hl_warpctc_wrap.cc
index 619b90120f6c86f966154a9e6902db8469500629..9ae8bc0f220e143a5c59d8c3ead012a20369e7b9 100644
--- a/paddle/cuda/src/hl_warpctc_wrap.cc
+++ b/paddle/cuda/src/hl_warpctc_wrap.cc
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <mutex>
 #include "hl_warpctc_wrap.h"
+#include <mutex>
 #include "hl_dso_loader.h"
 #include "paddle/utils/Logging.h"
 
diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index f1d09c568db875d847564380179a8ccc6d0d3049..f8c4bcac2f8eb41400659dc24ba81768e7ae3640 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -15,13 +15,13 @@ limitations under the License. */
 #include "ActivationFunction.h"
 
 #include <algorithm>
-#include <memory>
 #include <iostream>
-#include <type_traits>
+#include <memory>
 #include <string>
 #include <thread>
-#include "paddle/utils/ClassRegistrar.h"
+#include <type_traits>
 #include "paddle/parameter/Argument.h"
+#include "paddle/utils/ClassRegistrar.h"
 
 #include "paddle/utils/Logging.h"
 
diff --git a/paddle/gserver/dataproviders/DataProvider.cpp b/paddle/gserver/dataproviders/DataProvider.cpp
index 55ca62543aa33cf40d1f69d0fa1d6348ccdf1251..0478256f9cd81f4a99eb0cbcbd1a5a21de5cf14b 100644
--- a/paddle/gserver/dataproviders/DataProvider.cpp
+++ b/paddle/gserver/dataproviders/DataProvider.cpp
@@ -14,12 +14,12 @@ limitations under the License. */
 
 #include "DataProvider.h"
 
-#include "paddle/utils/Util.h"
-#include "paddle/utils/StringUtil.h"
-#include "paddle/utils/Logging.h"
-#include <algorithm>
 #include <unistd.h>
+#include <algorithm>
 #include "ProtoDataProvider.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/StringUtil.h"
+#include "paddle/utils/Util.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h
index 5b854936c6c34926b789436efe58f193aff5cb9d..9b7f7e36cedaa230ae0694d87cc033bd6fa6e652 100644
--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
@@ -14,28 +14,28 @@ limitations under the License. */
 
 #pragma once
 
-#include <vector>
-#include <memory>
-#include <mutex>
-#include <iostream>
-#include <fstream>
 #include <stdint.h>
-#include <string.h>
-#include <stdlib.h>
 #include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <fstream>
+#include <iostream>
+#include <memory>
+#include <mutex>
+#include <vector>
 
+#include "DataConfig.pb.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/SparseMatrix.h"
+#include "paddle/math/Vector.h"
+#include "paddle/parameter/Argument.h"
+#include "paddle/utils/ClassRegistrar.h"
+#include "paddle/utils/Locks.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Queue.h"
-#include "paddle/utils/Locks.h"
 #include "paddle/utils/ThreadLocal.h"
 #include "paddle/utils/TypeDefs.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/SparseMatrix.h"
 #include "paddle/utils/Util.h"
-#include "paddle/math/Vector.h"
-#include "DataConfig.pb.h"
-#include "paddle/utils/ClassRegistrar.h"
-#include "paddle/parameter/Argument.h"
 
 namespace paddle {
 /**
diff --git a/paddle/gserver/dataproviders/MultiDataProvider.cpp b/paddle/gserver/dataproviders/MultiDataProvider.cpp
index e1fc4c93656bdeafc8d96d7a822104787e084cdf..46fe053768e480c5f69f597c49f363cb966a4168 100644
--- a/paddle/gserver/dataproviders/MultiDataProvider.cpp
+++ b/paddle/gserver/dataproviders/MultiDataProvider.cpp
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Util.h"
 #include "MultiDataProvider.h"
-#include "paddle/utils/Logging.h"
 #include <algorithm>
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Util.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.cpp b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
index 6a0cb5ef63bc7bf4232ed56ebca775790b89cd31..c6f5cab1915b7f41d505c37a7fef762a392bad7f 100644
--- a/paddle/gserver/dataproviders/ProtoDataProvider.cpp
+++ b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
@@ -13,18 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "ProtoDataProvider.h"
-#include "paddle/utils/Util.h"
-#include "paddle/utils/StringUtil.h"
 #include <algorithm>
 #include <fstream>
 #include <istream>
+#include "paddle/utils/StringUtil.h"
+#include "paddle/utils/Util.h"
 
-#include "paddle/utils/Logging.h"
 #include "DataProviderGroup.h"
+#include "paddle/utils/Logging.h"
 
-P_DEFINE_double(memory_threshold_on_load_data,
-                1.0,
-                "stop loading data when memory is not sufficient");
+DEFINE_double(memory_threshold_on_load_data,
+              1.0,
+              "stop loading data when memory is not sufficient");
 
 namespace paddle {
 
@@ -562,16 +562,16 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
         auto mat = cpuArguments[slot].value;
         mat->resize(size, dim);
         if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<GpuSparseMatrix>(mat)
-              ->copyFrom(dataPos.data(),
-                         slots_[slot].indices.data(),
-                         slots_[slot].sparseNonValueData.data(),
-                         HPPL_STREAM_1);
+          std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
+              dataPos.data(),
+              slots_[slot].indices.data(),
+              slots_[slot].sparseNonValueData.data(),
+              HPPL_STREAM_1);
         } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
-              ->copyFrom(dataPos.data(),
-                         slots_[slot].indices.data(),
-                         slots_[slot].sparseNonValueData.data());
+          std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
+              dataPos.data(),
+              slots_[slot].indices.data(),
+              slots_[slot].sparseNonValueData.data());
         } else {
           LOG(FATAL) << "Not Supported";
         }
@@ -598,16 +598,16 @@ int64_t ProtoDataProvider::getNextBatchInternal(int64_t size,
         auto mat = cpuArguments[slot].value;
         mat->resize(size, dim);
         if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<GpuSparseMatrix>(mat)
-              ->copyFrom(dataPos.data(),
-                         slots_[slot].indices.data(),
-                         slots_[slot].sparseFloatValueData.data(),
-                         HPPL_STREAM_1);
+          std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
+              dataPos.data(),
+              slots_[slot].indices.data(),
+              slots_[slot].sparseFloatValueData.data(),
+              HPPL_STREAM_1);
         } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
-          std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
-              ->copyFrom(dataPos.data(),
-                         slots_[slot].indices.data(),
-                         slots_[slot].sparseFloatValueData.data());
+          std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
+              dataPos.data(),
+              slots_[slot].indices.data(),
+              slots_[slot].sparseFloatValueData.data());
         } else {
           LOG(FATAL) << "Not Supported";
         }
diff --git a/paddle/gserver/dataproviders/ProtoDataProvider.h b/paddle/gserver/dataproviders/ProtoDataProvider.h
index 9ec5cb97c02d80b40371409c00e2487dceb3757c..7dd45e062248f20d24c633dd4e1c8b7eebcbfa1b 100644
--- a/paddle/gserver/dataproviders/ProtoDataProvider.h
+++ b/paddle/gserver/dataproviders/ProtoDataProvider.h
@@ -16,8 +16,8 @@ limitations under the License. */
 
 #include <vector>
 
-#include "paddle/utils/Stat.h"
 #include "DataFormat.pb.h"
+#include "paddle/utils/Stat.h"
 
 #include "DataProvider.h"
 #include "ProtoReader.h"
diff --git a/paddle/gserver/dataproviders/ProtoReader.h b/paddle/gserver/dataproviders/ProtoReader.h
index 6708e7cde7b5db5e739cc4bbf9bc04a124fe9703..4e6f58a5292bec276994fde0764278d12d7ae9d5 100644
--- a/paddle/gserver/dataproviders/ProtoReader.h
+++ b/paddle/gserver/dataproviders/ProtoReader.h
@@ -16,10 +16,10 @@ limitations under the License. */
 
 #include <memory>
 
-#include <google/protobuf/message_lite.h>
 #include <google/protobuf/io/coded_stream.h>
-#include <google/protobuf/io/zero_copy_stream_impl.h>
 #include <google/protobuf/io/gzip_stream.h>
+#include <google/protobuf/io/zero_copy_stream_impl.h>
+#include <google/protobuf/message_lite.h>
 
 namespace paddle {
 
diff --git a/paddle/gserver/dataproviders/PyDataProvider.cpp b/paddle/gserver/dataproviders/PyDataProvider.cpp
index f5dcbfcf3464a027a3a8f2a67e66037a4495848c..5bdd55309c8bf8d5dcf84f5dcef2c5c85249a668 100644
--- a/paddle/gserver/dataproviders/PyDataProvider.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider.cpp
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "PyDataProvider.h"
-#include "paddle/utils/PythonUtil.h"
 #include <fenv.h>
-#include "paddle/utils/Util.h"
 #include "paddle/utils/Excepts.h"
+#include "paddle/utils/PythonUtil.h"
+#include "paddle/utils/Util.h"
 
 namespace paddle {
 
@@ -316,16 +316,16 @@ void PyDataProvider::handleSparseNonValueSlot(
   auto mat = cpuArguments[slotIndex].value;
   mat->resize(slot.sampleNum, dim, slot.sampleNum, NO_VALUE, SPARSE_CSR);
   if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-    std::dynamic_pointer_cast<GpuSparseMatrix>(mat)
-        ->copyFrom(slot.sampleSequenceIdVec.data(),
-                   slot.indices.data(),
-                   slot.sparseNonValueData.data(),
-                   HPPL_STREAM_1);
+    std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
+        slot.sampleSequenceIdVec.data(),
+        slot.indices.data(),
+        slot.sparseNonValueData.data(),
+        HPPL_STREAM_1);
   } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
-    std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
-        ->copyFrom(slot.sampleSequenceIdVec.data(),
-                   slot.indices.data(),
-                   slot.sparseNonValueData.data());
+    std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
+        slot.sampleSequenceIdVec.data(),
+        slot.indices.data(),
+        slot.sparseNonValueData.data());
   } else {
     LOG(FATAL) << "Not Supported";
   }
@@ -347,16 +347,16 @@ void PyDataProvider::handleSparseValueSlot(
   auto mat = cpuArguments[slotIndex].value;
   mat->resize(slot.sampleNum, dim, slot.sampleNum, FLOAT_VALUE, SPARSE_CSR);
   if (std::dynamic_pointer_cast<GpuSparseMatrix>(mat)) {
-    std::dynamic_pointer_cast<GpuSparseMatrix>(mat)
-        ->copyFrom(slot.sampleSequenceIdVec.data(),
-                   slot.indices.data(),
-                   slot.sparseFloatValueData.data(),
-                   HPPL_STREAM_DEFAULT);
+    std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
+        slot.sampleSequenceIdVec.data(),
+        slot.indices.data(),
+        slot.sparseFloatValueData.data(),
+        HPPL_STREAM_DEFAULT);
   } else if (std::dynamic_pointer_cast<CpuSparseMatrix>(mat)) {
-    std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
-        ->copyFrom(slot.sampleSequenceIdVec.data(),
-                   slot.indices.data(),
-                   slot.sparseFloatValueData.data());
+    std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
+        slot.sampleSequenceIdVec.data(),
+        slot.indices.data(),
+        slot.sparseFloatValueData.data());
   } else {
     LOG(FATAL) << "Not Supported";
   }
diff --git a/paddle/gserver/dataproviders/PyDataProvider2.cpp b/paddle/gserver/dataproviders/PyDataProvider2.cpp
index 8b04a03f6d26df5eee44fe112bea7bb53f7ef5a7..460efc5adc6f017e91dc9daff6ab32312e4460c1 100644
--- a/paddle/gserver/dataproviders/PyDataProvider2.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider2.cpp
@@ -15,18 +15,18 @@ limitations under the License. */
 #ifndef PADDLE_NO_PYTHON
 
 #include <Python.h>
+#include <numpy/numpyconfig.h>
 #include <stdio.h>
 #include <stdlib.h>
-#include <unordered_set>
 #include <list>
-#include <numpy/numpyconfig.h>
+#include <unordered_set>
 #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION
 #include <numpy/ndarrayobject.h>
 
 #include "DataProvider.h"
 
-#include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Locks.h"
+#include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
@@ -400,10 +400,9 @@ private:
 
       if (this->loadThread_) {  // wait poolActualSize < poolSize;
         std::unique_lock<std::mutex> l(mtx_);
-        pushCV_.wait(l,
-                     [this, additionalBatchSize] {
-                       return this->poolActualSize_ < poolSize_;
-                     });
+        pushCV_.wait(l, [this, additionalBatchSize] {
+          return this->poolActualSize_ < poolSize_;
+        });
       }
 
       {
@@ -529,12 +528,10 @@ public:
                         // but, loading from cache, cache object should ensure
                         // data pool ready.
       std::unique_lock<std::mutex> l(mtx_);
-      pullCV_.wait(l,
-                   [this, &size] {
-                     return this->poolActualSize_ >=
-                                std::max(size, this->minPoolSize_) ||
-                            callingContexts_.empty();
-                   });
+      pullCV_.wait(l, [this, &size] {
+        return this->poolActualSize_ >= std::max(size, this->minPoolSize_) ||
+               callingContexts_.empty();
+      });
 
       if (unittest::OnPoolFilled) {
         (*unittest::OnPoolFilled)(this->poolActualSize_);
diff --git a/paddle/gserver/evaluators/Evaluator.cpp b/paddle/gserver/evaluators/Evaluator.cpp
index aa6dc7cb86cbbda6bac8823614901a0c2d175278..2f9928191170aa6cf25417362cb360b5e2865b69 100644
--- a/paddle/gserver/evaluators/Evaluator.cpp
+++ b/paddle/gserver/evaluators/Evaluator.cpp
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Stat.h"
 #include "paddle/gserver/evaluators/Evaluator.h"
+#include "paddle/utils/Stat.h"
 
 #include "paddle/gserver/gradientmachines/NeuralNetwork.h"
 
-P_DECLARE_int32(trainer_id);
+DECLARE_int32(trainer_id);
 
 namespace paddle {
 
@@ -842,9 +842,9 @@ void PnpairEvaluator::calc(std::vector<PredictionResult>& predictArray) {
   auto start = predictArray.begin();
   while (start != predictArray.end()) {
     auto end = std::find_if(
-        start + 1,
-        predictArray.end(),
-        [=](const PredictionResult& x) { return x.queryid != start->queryid; });
+        start + 1, predictArray.end(), [=](const PredictionResult& x) {
+          return x.queryid != start->queryid;
+        });
     CHECK(end != start);
     stat(start - predictArray.begin(),
          end - predictArray.begin(),
diff --git a/paddle/gserver/evaluators/Evaluator.h b/paddle/gserver/evaluators/Evaluator.h
index a26c650c388d826d635fb1b98ac4da28a8bbb148..5770847309670ef1856cfb9255fa847c24513b56 100644
--- a/paddle/gserver/evaluators/Evaluator.h
+++ b/paddle/gserver/evaluators/Evaluator.h
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/pserver/ParameterClient2.h"
-#include "paddle/utils/ClassRegistrar.h"
+#include <fstream>
 #include "ModelConfig.pb.h"
 #include "paddle/parameter/Argument.h"
-#include <fstream>
+#include "paddle/pserver/ParameterClient2.h"
+#include "paddle/utils/ClassRegistrar.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/gradientmachines/GradientMachine.cpp b/paddle/gserver/gradientmachines/GradientMachine.cpp
index 6adee05dbee1fa9db9ea98fb27fb5e8a4e8ef328..36ca05b919b136c162105cf4f1fb7705ae7ca7f3 100644
--- a/paddle/gserver/gradientmachines/GradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/GradientMachine.cpp
@@ -14,16 +14,16 @@ limitations under the License. */
 
 #include "GradientMachine.h"
 
-#include "paddle/utils/Logging.h"
 #include <fstream>
+#include "paddle/utils/Logging.h"
 
-#include "hl_gpu.h"
-#include "NeuralNetwork.h"
-#include "ParallelNeuralNetwork.h"
+#include "GradientMachineMode.h"
 #include "MultiGradientMachine.h"
-#include "NeuralNetwork.h"
 #include "MultiNetwork.h"
-#include "GradientMachineMode.h"
+#include "NeuralNetwork.h"
+#include "NeuralNetwork.h"
+#include "ParallelNeuralNetwork.h"
+#include "hl_gpu.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/gradientmachines/GradientMachine.h b/paddle/gserver/gradientmachines/GradientMachine.h
index f3e44a9e3962c9d54cd1f9e2710c84f3f476e7ca..579eca71d4cdd2545a3a8be1c7f1dacfdd5ef66b 100644
--- a/paddle/gserver/gradientmachines/GradientMachine.h
+++ b/paddle/gserver/gradientmachines/GradientMachine.h
@@ -17,15 +17,15 @@ limitations under the License. */
 #include <iostream>
 #include <vector>
 
-#include "paddle/math/Matrix.h"
-#include "paddle/parameter/Parameter.h"
-#include "paddle/parameter/ParameterUpdaterBase.h"
-#include "paddle/utils/Thread.h"
-#include "TrainerConfig.pb.h"
 #include "ModelConfig.pb.h"
+#include "TrainerConfig.pb.h"
 #include "paddle/gserver/dataproviders/DataProvider.h"
 #include "paddle/gserver/evaluators/Evaluator.h"
 #include "paddle/gserver/layers/Layer.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/parameter/Parameter.h"
+#include "paddle/parameter/ParameterUpdaterBase.h"
+#include "paddle/utils/Thread.h"
 
 namespace paddle {
 /**
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
index a7324f55451e696176a040b470c2d3bdf9eaa392..88c098b3559d8d2918309aa48329af067f79bdd5 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
@@ -21,11 +21,11 @@ limitations under the License. */
 #include "NeuralNetwork.h"
 #include "ParallelNeuralNetwork.h"
 
-P_DEFINE_bool(allow_only_one_model_on_one_gpu,
-              true,
-              "If true, do not allow multiple models on one GPU device");
+DEFINE_bool(allow_only_one_model_on_one_gpu,
+            true,
+            "If true, do not allow multiple models on one GPU device");
 #ifdef PADDLE_METRIC_LEARNING
-P_DECLARE_bool(external);
+DECLARE_bool(external);
 #endif
 
 namespace paddle {
diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.h b/paddle/gserver/gradientmachines/MultiGradientMachine.h
index fe6d96e8ea3eff56f27da412d3a538730ccebbf1..5f9855c4be869aa73aaebfc2e75ee51f050f2722 100644
--- a/paddle/gserver/gradientmachines/MultiGradientMachine.h
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.h
@@ -18,9 +18,9 @@ limitations under the License. */
 
 #include "GradientMachine.h"
 
-#include "paddle/utils/Queue.h"
-#include "paddle/utils/Locks.h"
 #include "hl_gpu.h"
+#include "paddle/utils/Locks.h"
+#include "paddle/utils/Queue.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/gradientmachines/MultiNetwork.cpp b/paddle/gserver/gradientmachines/MultiNetwork.cpp
index 61af82fcb7e85a24f9b1311ca0b8168470c5ad8a..6eb3d8db962161ed4123b4ef4a4bb42147bfdf19 100644
--- a/paddle/gserver/gradientmachines/MultiNetwork.cpp
+++ b/paddle/gserver/gradientmachines/MultiNetwork.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <algorithm>
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"
-#include <algorithm>
 
 #include "MultiNetwork.h"
 
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
index dbcb97b42baa796dbd7017834867454f769cd3f2..ee36a87b9d848edcc37f89221141de3f939e1110 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -14,15 +14,15 @@ limitations under the License. */
 
 #include "paddle/utils/Util.h"
 
-#include "paddle/utils/Logging.h"
 #include "paddle/utils/CustomStackTrace.h"
+#include "paddle/utils/Logging.h"
 
-#include "paddle/utils/Stat.h"
-#include "hl_gpu.h"
+#include "MultiNetwork.h"
 #include "NeuralNetwork.h"
 #include "RecurrentGradientMachine.h"
-#include "MultiNetwork.h"
+#include "hl_gpu.h"
 #include "paddle/gserver/layers/AgentLayer.h"
+#include "paddle/utils/Stat.h"
 
 namespace paddle {
 void parameterInitNN(int paramId,
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h
index fd885b436a710d7910586f48a26faebded3a6fd1..384ca88f47ffb20ca7d16a276a190b063158d273 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.h
@@ -14,18 +14,18 @@ limitations under the License. */
 
 #pragma once
 
-#include <memory>
-#include <map>
 #include <functional>
+#include <map>
+#include <memory>
 
-#include "paddle/utils/ClassRegistrar.h"
-#include "paddle/parameter/Parameter.h"
 #include "ModelConfig.pb.h"
+#include "paddle/gserver/dataproviders/DataProvider.h"
 #include "paddle/gserver/gradientmachines/GradientMachine.h"
 #include "paddle/gserver/layers/CostLayer.h"
 #include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/gserver/dataproviders/DataProvider.h"
 #include "paddle/gserver/layers/Layer.h"
+#include "paddle/parameter/Parameter.h"
+#include "paddle/utils/ClassRegistrar.h"
 
 namespace paddle {
 /*
@@ -57,14 +57,13 @@ void parameterInitNN(int paramId,
 
 class NeuralNetwork : public GradientMachine {
 public:
-  virtual void init(
-      const ModelConfig& config,
-      ParamInitCallback callback = nullptr,
-      const std::vector<ParameterType>&
-          parameterTypes = std::vector<ParameterType>{PARAMETER_VALUE,
-                                                      PARAMETER_GRADIENT,
-                                                      PARAMETER_MOMENTUM},
-      bool useGpu = FLAGS_use_gpu);
+  virtual void init(const ModelConfig& config,
+                    ParamInitCallback callback = nullptr,
+                    const std::vector<ParameterType>& parameterTypes =
+                        std::vector<ParameterType>{PARAMETER_VALUE,
+                                                   PARAMETER_GRADIENT,
+                                                   PARAMETER_MOMENTUM},
+                    bool useGpu = FLAGS_use_gpu);
 
   /**
    * Connect two submodels and
diff --git a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
index 934a7cfc7b5f21e101542016b74cb3e4e3e24e2d..8f445b1ded3eb8960dc06512dd3f80b00d284acc 100644
--- a/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/ParallelNeuralNetwork.h
@@ -37,14 +37,13 @@ public:
                         NeuralNetwork *rootNetwork = nullptr)
       : NeuralNetwork(subModelName, rootNetwork) {}
 
-  virtual void init(
-      const ModelConfig &config,
-      ParamInitCallback callback = nullptr,
-      const std::vector<ParameterType>
-          &parameterTypes = std::vector<ParameterType>{PARAMETER_VALUE,
-                                                       PARAMETER_GRADIENT,
-                                                       PARAMETER_MOMENTUM},
-      bool useGpu = FLAGS_use_gpu);
+  virtual void init(const ModelConfig &config,
+                    ParamInitCallback callback = nullptr,
+                    const std::vector<ParameterType> &parameterTypes =
+                        std::vector<ParameterType>{PARAMETER_VALUE,
+                                                   PARAMETER_GRADIENT,
+                                                   PARAMETER_MOMENTUM},
+                    bool useGpu = FLAGS_use_gpu);
 
   virtual void forward(const std::vector<Argument> &inArgs,
                        std::vector<Argument> *outArgs,
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
index 4fb1a44ab7b278a59fcec45e8ddeac094e12e44e..8f68b3d66bd263b8df34801878efee3e2de2622d 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -12,19 +12,19 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Stat.h"
-#include "paddle/utils/Util.h"
-#include "paddle/utils/Flags.h"
+#include "RecurrentGradientMachine.h"
+#include <dlfcn.h>
 #include <algorithm>
+#include <cmath>
 #include <functional>
-#include <dlfcn.h>
 #include <limits>
-#include <cmath>
-#include "RecurrentGradientMachine.h"
 #include "NeuralNetwork.h"
 #include "paddle/gserver/layers/AgentLayer.h"
+#include "paddle/utils/Flags.h"
+#include "paddle/utils/Stat.h"
+#include "paddle/utils/Util.h"
 
-P_DEFINE_string(diy_beam_search_prob_so, "", "the diy beam search cost so");
+DEFINE_string(diy_beam_search_prob_so, "", "the diy beam search cost so");
 
 static const char* DIY_CALC_PROB_SYMBOL_NAME = "calc_prob";
 static const char* DIY_START_CALC_PROB_SYMBOL_NAME = "start_calc_prob";
@@ -78,20 +78,22 @@ static inline SymbolType loadDiySymbol(const char* symbolName) {
   return reinterpret_cast<SymbolType>(sym);
 }
 
-static InitFunction __init__diy_prob_method([] {
-  std::string soName = FLAGS_diy_beam_search_prob_so;
-  if (!soName.empty()) {
-    gDiyProbHandle = dlopen(soName.c_str(), RTLD_LAZY);
-    CHECK(gDiyProbHandle) << "Cannot Open DIY Prob So " << soName;
-    atexit(exit_diy_prob);
-    gDiyProbMethod =
-        loadDiySymbol<decltype(gDiyProbMethod)>(DIY_CALC_PROB_SYMBOL_NAME);
-    gDiyProbStart =
-        loadDiySymbol<decltype(gDiyProbStart)>(DIY_START_CALC_PROB_SYMBOL_NAME);
-    gDiyProbStop =
-        loadDiySymbol<decltype(gDiyProbStop)>(DIY_FINISH_CALC_PROB_SYMBOL_NAME);
-  }
-}, std::numeric_limits<int>::max());
+static InitFunction __init__diy_prob_method(
+    [] {
+      std::string soName = FLAGS_diy_beam_search_prob_so;
+      if (!soName.empty()) {
+        gDiyProbHandle = dlopen(soName.c_str(), RTLD_LAZY);
+        CHECK(gDiyProbHandle) << "Cannot Open DIY Prob So " << soName;
+        atexit(exit_diy_prob);
+        gDiyProbMethod =
+            loadDiySymbol<decltype(gDiyProbMethod)>(DIY_CALC_PROB_SYMBOL_NAME);
+        gDiyProbStart = loadDiySymbol<decltype(gDiyProbStart)>(
+            DIY_START_CALC_PROB_SYMBOL_NAME);
+        gDiyProbStop = loadDiySymbol<decltype(gDiyProbStop)>(
+            DIY_FINISH_CALC_PROB_SYMBOL_NAME);
+      }
+    },
+    std::numeric_limits<int>::max());
 
 class BeamSearchControlCallbacks {
 public:
@@ -1281,10 +1283,9 @@ void RecurrentGradientMachine::beamSearch(size_t batchSize) {
       std::vector<std::vector<int>*> prefixes;
       prefixes.resize(paths.size());
       std::transform(
-          paths.begin(),
-          paths.end(),
-          prefixes.begin(),
-          [](const Path& p) { return const_cast<std::vector<int>*>(&p.ids); });
+          paths.begin(), paths.end(), prefixes.begin(), [](const Path& p) {
+            return const_cast<std::vector<int>*>(&p.ids);
+          });
       beamSearchCtrlCallbacks_->beamSearchCandidateAdjust(
           prefixes, frames_[machineCur].get(), i);
     }
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
index 369c8c3d988ca0cac4147220231ac66bf3538776..db7d8aff6d3150dd272a924c20e16bfe28d11442 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include <functional>
 #include "GradientMachine.h"
 #include "NeuralNetwork.h"
-#include <functional>
 
 #include "paddle/utils/Locks.h"
 
diff --git a/paddle/gserver/layers/BatchNormBaseLayer.cpp b/paddle/gserver/layers/BatchNormBaseLayer.cpp
index 51463f111890ca0608dab760b6f320578cef15b3..1ceaaaa206ee3cbc5421238574c7f310011ccaa5 100644
--- a/paddle/gserver/layers/BatchNormBaseLayer.cpp
+++ b/paddle/gserver/layers/BatchNormBaseLayer.cpp
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Stat.h"
-#include "Layer.h"
 #include "BatchNormBaseLayer.h"
 #include "BatchNormalizationLayer.h"
+#include "Layer.h"
+#include "paddle/utils/Stat.h"
 #ifndef PADDLE_ONLY_CPU
 #include "CudnnBatchNormLayer.h"
 #endif
diff --git a/paddle/gserver/layers/BatchNormBaseLayer.h b/paddle/gserver/layers/BatchNormBaseLayer.h
index f5a555a6d040e687d36ff45c9b0825649dd72131..75bda95de1472b08538b48072ddf9ea607b83299 100644
--- a/paddle/gserver/layers/BatchNormBaseLayer.h
+++ b/paddle/gserver/layers/BatchNormBaseLayer.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/utils/Stat.h"
 #include "Layer.h"
+#include "paddle/utils/Stat.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/BatchNormalizationLayer.h b/paddle/gserver/layers/BatchNormalizationLayer.h
index 56be4735683df933251944ac1a6b3246269692f8..052c2077322be59f9d41966c1c8b6ab20c8f85bb 100644
--- a/paddle/gserver/layers/BatchNormalizationLayer.h
+++ b/paddle/gserver/layers/BatchNormalizationLayer.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "Layer.h"
 #include "BatchNormBaseLayer.h"
+#include "Layer.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/ConcatenateLayer.cpp b/paddle/gserver/layers/ConcatenateLayer.cpp
index f6b3d86b8ccef6d0c1f9eb4c50369b65b078278d..d19adace7d58af16736fc2b6e536f5fd69a19863 100644
--- a/paddle/gserver/layers/ConcatenateLayer.cpp
+++ b/paddle/gserver/layers/ConcatenateLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Stat.h"
 #include "Layer.h"
 #include "Projection.h"
+#include "paddle/utils/Stat.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/ContextProjection.cpp b/paddle/gserver/layers/ContextProjection.cpp
index 6080aa51b9955d4577c3732e1d6fad1e94a41f79..7ac56e3a2ab2a2a7f2219b8bfd34c16a84c427c0 100644
--- a/paddle/gserver/layers/ContextProjection.cpp
+++ b/paddle/gserver/layers/ContextProjection.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Stat.h"
 #include "ContextProjection.h"
+#include "paddle/utils/Stat.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/ConvBaseLayer.cpp b/paddle/gserver/layers/ConvBaseLayer.cpp
index 473ca24a94c9560243a6bf96b8b85efcd54acb84..7b234dc2a6663dc677affcae7dc6306c104c1250 100644
--- a/paddle/gserver/layers/ConvBaseLayer.cpp
+++ b/paddle/gserver/layers/ConvBaseLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "ConvBaseLayer.h"
 #include "paddle/math/MathUtils.h"
+#include "paddle/utils/Logging.h"
 namespace paddle {
 
 bool ConvBaseLayer::init(const LayerMap& layerMap,
diff --git a/paddle/gserver/layers/ConvOperator.cpp b/paddle/gserver/layers/ConvOperator.cpp
index 3ede98ba4b91f52238dc4f9740788c53cfe7c21e..f943410dee0dc2f3d356c9d7d8f61398fe2871c8 100644
--- a/paddle/gserver/layers/ConvOperator.cpp
+++ b/paddle/gserver/layers/ConvOperator.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/math/Matrix.h"
-#include "paddle/math/MathUtils.h"
 #include "Operator.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/ConvProjection.cpp b/paddle/gserver/layers/ConvProjection.cpp
index e72dc37ec8023e3f275af5097e253266959d9ea0..e1c4b91ace21522a3bc640dfc4eaa1a42668ed02 100644
--- a/paddle/gserver/layers/ConvProjection.cpp
+++ b/paddle/gserver/layers/ConvProjection.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Stat.h"
 #include "ConvProjection.h"
+#include "paddle/utils/Stat.h"
 
 namespace paddle {
 
@@ -130,6 +130,10 @@ void ConvProjection::reshapeTensorDesc(int batchSize) {
 void ConvProjection::reshape(int batchSize) {
   size_t width = calOutputSize();
   CHECK_EQ(width, out_->value->getWidth());
+  CHECK_EQ(channels_ * imageH_ * imageW_, in_->value->getWidth())
+      << "Wrong input size for convolution"
+      << " channels=" << channels_ << " imageH=" << imageH_
+      << " imageW=" << imageW_ << " inputSize=" << in_->value->getWidth();
 
   isSelectAlgo_ = (batchSize == batchNum_);
   batchNum_ = batchSize;
diff --git a/paddle/gserver/layers/ConvShiftLayer.cpp b/paddle/gserver/layers/ConvShiftLayer.cpp
index 527d885d865290e9a7cf44c701d4b4f4450adfa4..9bfb1ab7a47b11a6793159aefcb4f9fa12b81a6b 100644
--- a/paddle/gserver/layers/ConvShiftLayer.cpp
+++ b/paddle/gserver/layers/ConvShiftLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/ConvexCombinationLayer.cpp b/paddle/gserver/layers/ConvexCombinationLayer.cpp
index 57ff95fe37e2a19bc0c1bd7835d5fe932a01f797..3f4d77a2fe069f239db8cd099dd0d472d6ce3ccc 100644
--- a/paddle/gserver/layers/ConvexCombinationLayer.cpp
+++ b/paddle/gserver/layers/ConvexCombinationLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/CosSimVecMatLayer.cpp b/paddle/gserver/layers/CosSimVecMatLayer.cpp
index e8a7f671ee70414cb02d31e428660cba589e3a2b..ad490b0b8c4656c1eabf519233f2386b4b6e9417 100644
--- a/paddle/gserver/layers/CosSimVecMatLayer.cpp
+++ b/paddle/gserver/layers/CosSimVecMatLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp
index 90cd473c424a9cedf0a2c154165a4a287f980972..7e9519f6b3af50bf47b660b285c3593087f80271 100644
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <memory>
+#include "CostLayer.h"
 #include <algorithm>
-#include "paddle/utils/Logging.h"
 #include <cmath>
-#include "CostLayer.h"
+#include <memory>
+#include "paddle/utils/Logging.h"
 
 #include "paddle/math/SparseMatrix.h"
 
diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.cpp b/paddle/gserver/layers/CudnnBatchNormLayer.cpp
index d44c217105afb6cc1e2bba5cef904ae5277bffea..09dac05a7ad7a80bd6b9e12e8f7f060310d516c8 100644
--- a/paddle/gserver/layers/CudnnBatchNormLayer.cpp
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Stat.h"
-#include "Layer.h"
 #include "CudnnBatchNormLayer.h"
+#include "Layer.h"
+#include "paddle/utils/Stat.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/CudnnBatchNormLayer.h b/paddle/gserver/layers/CudnnBatchNormLayer.h
index a52a683e15481f4a72037872d47dcfa5a852fc6a..b1e7d2082f1443313bfc858a17adfd737ecff98f 100644
--- a/paddle/gserver/layers/CudnnBatchNormLayer.h
+++ b/paddle/gserver/layers/CudnnBatchNormLayer.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/utils/Stat.h"
-#include "Layer.h"
 #include "BatchNormBaseLayer.h"
+#include "Layer.h"
+#include "paddle/utils/Stat.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/CudnnConvLayer.cpp b/paddle/gserver/layers/CudnnConvLayer.cpp
index 6e28d5eb4294e24bd330023ec0700d0ed1dd4007..978c2c1479c64ab2cdebaaff7394059b3d033ab6 100644
--- a/paddle/gserver/layers/CudnnConvLayer.cpp
+++ b/paddle/gserver/layers/CudnnConvLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "CudnnConvLayer.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
-#include "CudnnConvLayer.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/CudnnConvLayer.h b/paddle/gserver/layers/CudnnConvLayer.h
index 6317fab6f89266ffeef9dd332fad140166a96c0b..b869c695bd753076c6501a1253fcad22139ccadf 100644
--- a/paddle/gserver/layers/CudnnConvLayer.h
+++ b/paddle/gserver/layers/CudnnConvLayer.h
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
 #include "ConvBaseLayer.h"
-#include "paddle/math/Matrix.h"
 #include "Projection.h"
-#include <vector>
+#include "paddle/math/Matrix.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/CudnnPoolLayer.cpp b/paddle/gserver/layers/CudnnPoolLayer.cpp
index d0e71c63457ef476ad9428e7011b9338f6ee7b37..4adb2d4709e585a6fec052435c33714d6e3a3f0e 100644
--- a/paddle/gserver/layers/CudnnPoolLayer.cpp
+++ b/paddle/gserver/layers/CudnnPoolLayer.cpp
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "CudnnPoolLayer.h"
+#include "paddle/math/Matrix.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
-#include "paddle/math/Matrix.h"
-#include "CudnnPoolLayer.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/DataLayer.cpp b/paddle/gserver/layers/DataLayer.cpp
index 66f0606a38cf597c2697ef1d9e9419ea0e94ef5a..3551df4e172f0237685127b0b3869554d9c5f97d 100644
--- a/paddle/gserver/layers/DataLayer.cpp
+++ b/paddle/gserver/layers/DataLayer.cpp
@@ -54,7 +54,7 @@ void DataLayer::copyDataToOutput(Argument& output) {
     output.setFrameWidth(config_.width());
   } else {
     output.setFrameHeight(data_.getFrameHeight());
-    output.setFrameHeight(data_.getFrameHeight());
+    output.setFrameWidth(data_.getFrameWidth());
   }
   output.cpuSequenceDims = data_.cpuSequenceDims;
   output.sequenceStartPositions = data_.sequenceStartPositions;
diff --git a/paddle/gserver/layers/EosIdCheckLayer.cpp b/paddle/gserver/layers/EosIdCheckLayer.cpp
index dc3c6e6b644e14cff4fda5793c1fbe871c20c113..fa53e2e4cfc8a220eeb2a637d7fe759f1744f9d5 100644
--- a/paddle/gserver/layers/EosIdCheckLayer.cpp
+++ b/paddle/gserver/layers/EosIdCheckLayer.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
+#include "paddle/utils/Logging.h"
 
 namespace paddle {
 /**
diff --git a/paddle/gserver/layers/ExpandConvBaseLayer.h b/paddle/gserver/layers/ExpandConvBaseLayer.h
index e14f6e6f4460f65a33503aa5f812dc504338b6b4..8445642217cf3e83441ddd9beec80f99faf946bc 100644
--- a/paddle/gserver/layers/ExpandConvBaseLayer.h
+++ b/paddle/gserver/layers/ExpandConvBaseLayer.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
 #include "ConvBaseLayer.h"
 #include "paddle/math/Matrix.h"
-#include <vector>
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/gserver/layers/ExpandConvLayer.cpp
index dcc78399602264ead8e32ad50dde9bbbeff0606b..f9267b81a7d4264f5f43552e3d54a45e4b212e00 100644
--- a/paddle/gserver/layers/ExpandConvLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "ExpandConvLayer.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
-#include "ExpandConvLayer.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/ExpandConvLayer.h b/paddle/gserver/layers/ExpandConvLayer.h
index 6f8504b50a935faadd4ed567e13fc5d858f8e7ab..de81a017e1bac38a5717e8c83a028f5408c0e084 100644
--- a/paddle/gserver/layers/ExpandConvLayer.h
+++ b/paddle/gserver/layers/ExpandConvLayer.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/math/Matrix.h"
 #include <vector>
 #include "ExpandConvBaseLayer.h"
+#include "paddle/math/Matrix.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/ExpandConvTransLayer.cpp b/paddle/gserver/layers/ExpandConvTransLayer.cpp
index cd4965c3c59e7c73cfd2141611ceef5088c89fad..520586b13889790c94a3e29902a4ea0ee55e8555 100644
--- a/paddle/gserver/layers/ExpandConvTransLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvTransLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "ExpandConvTransLayer.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
-#include "ExpandConvTransLayer.h"
 
 /* The implementation of the convTransLayer is basically a swap of forward and
  * backward of the original convLayer.
diff --git a/paddle/gserver/layers/ExpandConvTransLayer.h b/paddle/gserver/layers/ExpandConvTransLayer.h
index fa9d7fb481cb5328fb9f2d23003672dbe0ff3af6..4a527d67995e255c65fea1f310551f8de5630030 100644
--- a/paddle/gserver/layers/ExpandConvTransLayer.h
+++ b/paddle/gserver/layers/ExpandConvTransLayer.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/math/Matrix.h"
 #include <vector>
 #include "ExpandConvBaseLayer.h"
+#include "paddle/math/Matrix.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/FullyConnectedLayer.cpp b/paddle/gserver/layers/FullyConnectedLayer.cpp
index d2a028dd8060a6865f25ae636f7074ff933abe6a..89afe33c36697f8d57885043ed68cdf26576e358 100644
--- a/paddle/gserver/layers/FullyConnectedLayer.cpp
+++ b/paddle/gserver/layers/FullyConnectedLayer.cpp
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "FullyConnectedLayer.h"
+#include <algorithm>
+#include <vector>
+#include "paddle/math/SparseMatrix.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
-#include "paddle/math/SparseMatrix.h"
-#include <vector>
-#include <algorithm>
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/GatedRecurrentLayer.cpp b/paddle/gserver/layers/GatedRecurrentLayer.cpp
index 01b210ba70ae6273e47bb5e603afef7e97881f39..930d9a056164e7c677adb53b7b67901364da1309 100644
--- a/paddle/gserver/layers/GatedRecurrentLayer.cpp
+++ b/paddle/gserver/layers/GatedRecurrentLayer.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "Layer.h"
 #include "GatedRecurrentLayer.h"
+#include "Layer.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
@@ -386,8 +386,9 @@ void GatedRecurrentLayer::backwardBatch(int batchSize, MatrixPtr inputGrad) {
       {
         batchSize = outputGradTmp->getHeight();
         gruValue.prevOutValue =
-            (n == 0 ? nullptr : (batchValue_->getBatchValue(n - 1, batchSize))
-                                    ->getData());
+            (n == 0
+                 ? nullptr
+                 : (batchValue_->getBatchValue(n - 1, batchSize))->getData());
         gruGrad.prevOutGrad =
             (n == 0 ? nullptr
                     : (batchGrad_->getBatchValue(n - 1, batchSize))->getData());
diff --git a/paddle/gserver/layers/GatedRecurrentLayer.h b/paddle/gserver/layers/GatedRecurrentLayer.h
index e099b4d18b17ab628730157885dfd45da2cc2f8d..25770ce57fbaa4d16c9454d824800f2f0c7f957d 100644
--- a/paddle/gserver/layers/GatedRecurrentLayer.h
+++ b/paddle/gserver/layers/GatedRecurrentLayer.h
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/math/Matrix.h"
-#include "SequenceToBatch.h"
 #include "GruCompute.h"
 #include "Layer.h"
+#include "SequenceToBatch.h"
+#include "paddle/math/Matrix.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/GruCompute.cpp b/paddle/gserver/layers/GruCompute.cpp
index 7d4e8001a88c6df1ceef1b876bc09f35f1c3bb61..06907768e98f4bad952706cffbbd65d1f86cc6df 100644
--- a/paddle/gserver/layers/GruCompute.cpp
+++ b/paddle/gserver/layers/GruCompute.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Util.h"
 #include "GruCompute.h"
 #include "hl_recurrent_apply.cuh"
+#include "paddle/utils/Util.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/GruCompute.h b/paddle/gserver/layers/GruCompute.h
index 2a5da72068ea1638488cbc3b47351abd782f9e53..42c0019319ac9f20f9c3349fb2429c30f03d682b 100644
--- a/paddle/gserver/layers/GruCompute.h
+++ b/paddle/gserver/layers/GruCompute.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/utils/TypeDefs.h"
 #include "ModelConfig.pb.h"
 #include "hl_gpu.h"
+#include "paddle/utils/TypeDefs.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/GruStepLayer.cpp b/paddle/gserver/layers/GruStepLayer.cpp
index c48b5e40e67bfc4d803bb2d6f1e3276b1d0086a4..4a1006aa941f396c233a0cecfc38228f1f9fafe1 100644
--- a/paddle/gserver/layers/GruStepLayer.cpp
+++ b/paddle/gserver/layers/GruStepLayer.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "Layer.h"
 #include "GruCompute.h"
+#include "Layer.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/IdentityProjection.cpp b/paddle/gserver/layers/IdentityProjection.cpp
index 8660631b5aa10742acdefe17b287fe18d17dc167..f1d41a33d40f120d5de8b2bfe9cf3271eefa08be 100644
--- a/paddle/gserver/layers/IdentityProjection.cpp
+++ b/paddle/gserver/layers/IdentityProjection.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Stat.h"
 #include "Projection.h"
+#include "paddle/utils/Stat.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/InterpolationLayer.cpp b/paddle/gserver/layers/InterpolationLayer.cpp
index 94d4614b21b7ea6b2923eda926a3596127cc281c..44fe1fb1fea4203a4a1cac67c581b13adda65966 100644
--- a/paddle/gserver/layers/InterpolationLayer.cpp
+++ b/paddle/gserver/layers/InterpolationLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp
index 3c539f30768a1cafd3c777567a82d165efef7ea5..c47943f81c01589eada4b825d54be5c69314b6fa 100644
--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -14,15 +14,15 @@ limitations under the License. */
 
 #include "paddle/utils/Util.h"
 
-#include "paddle/utils/Logging.h"
 #include "paddle/math/SparseMatrix.h"
+#include "paddle/utils/Logging.h"
 
 #include "AddtoLayer.h"
+#include "CRFLayer.h"
 #include "CosSimLayer.h"
 #include "CostLayer.h"
-#include "ExpandConvLayer.h"
-#include "CRFLayer.h"
 #include "DataLayer.h"
+#include "ExpandConvLayer.h"
 #include "FullyConnectedLayer.h"
 #include "HierarchicalSigmoidLayer.h"
 #include "MaxLayer.h"
@@ -33,7 +33,7 @@ limitations under the License. */
 #include "TransLayer.h"
 #include "ValidationLayer.h"
 
-P_DEFINE_bool(log_error_clipping, false, "enable log error clipping or not");
+DEFINE_bool(log_error_clipping, false, "enable log error clipping or not");
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h
index 6609e16c4cf21cd4eba3114d3ca178a6acd14a29..172e558b82945296ef8a50d464c03efbfd597e0d 100644
--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
@@ -14,18 +14,18 @@ limitations under the License. */
 
 #pragma once
 
-#include <memory>
-#include <functional>
 #include <paddle/parameter/Argument.h>
-#include "paddle/utils/ClassRegistrar.h"
+#include <functional>
+#include <memory>
+#include "ModelConfig.pb.h"
 #include "paddle/math/CpuSparseMatrix.h"
 #include "paddle/parameter/Parameter.h"
+#include "paddle/utils/ClassRegistrar.h"
 #include "paddle/utils/Util.h"
-#include "ModelConfig.pb.h"
 
-#include "paddle/gserver/activations/ActivationFunction.h"
 #include <paddle/parameter/ParallelParameter.h>
 #include <paddle/parameter/Weight.h>
+#include "paddle/gserver/activations/ActivationFunction.h"
 
 /// Macro for registering a layer type.
 /// Example: REGISTER_LAYER(crf_error, CRFDecodingErrorLayer);
diff --git a/paddle/gserver/layers/LinearChainCRF.cpp b/paddle/gserver/layers/LinearChainCRF.cpp
index c6414c822eb5ec0743d4cef58b2959253e658f37..af550c7a0154802a93bacccab500695bdad36542 100644
--- a/paddle/gserver/layers/LinearChainCRF.cpp
+++ b/paddle/gserver/layers/LinearChainCRF.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <algorithm>
 #include "LinearChainCRF.h"
+#include <algorithm>
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/LinearChainCTC.cpp b/paddle/gserver/layers/LinearChainCTC.cpp
index 60e814fc3074ab91fa61013f66edf367861187e5..cb2b249110dbd736a46a713480eca12e59cb391b 100644
--- a/paddle/gserver/layers/LinearChainCTC.cpp
+++ b/paddle/gserver/layers/LinearChainCTC.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <math.h>
 #include "LinearChainCTC.h"
+#include <math.h>
 #include <limits>
 
 namespace paddle {
diff --git a/paddle/gserver/layers/LstmCompute.cpp b/paddle/gserver/layers/LstmCompute.cpp
index 18f79969588e41604ec5af9cecdf1bbe9152a25f..4c4297096423762355a5ee028cac252432cc1956 100644
--- a/paddle/gserver/layers/LstmCompute.cpp
+++ b/paddle/gserver/layers/LstmCompute.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Util.h"
-#include "hl_recurrent_apply.cuh"
 #include "LstmCompute.h"
+#include "hl_recurrent_apply.cuh"
+#include "paddle/utils/Util.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/LstmCompute.h b/paddle/gserver/layers/LstmCompute.h
index 9b7aee19dd2a0f439186603b96b9d32e1ec7c9b7..140a4c6ecf5cfaf1045cec3ca2db5d4f2e54aca4 100644
--- a/paddle/gserver/layers/LstmCompute.h
+++ b/paddle/gserver/layers/LstmCompute.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/utils/TypeDefs.h"
 #include "ModelConfig.pb.h"
 #include "hl_gpu.h"
+#include "paddle/utils/TypeDefs.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/LstmLayer.cpp b/paddle/gserver/layers/LstmLayer.cpp
index 975edcfe7fe400110f7abb99902f087b483fa746..2543d1b49a801943819e05bc52e53eaeafae1edf 100644
--- a/paddle/gserver/layers/LstmLayer.cpp
+++ b/paddle/gserver/layers/LstmLayer.cpp
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "LstmLayer.h"
-#include "paddle/math/Matrix.h"
 #include "paddle/math/BaseMatrix.h"
+#include "paddle/math/Matrix.h"
 #include "paddle/utils/Stat.h"
 
-P_DECLARE_bool(prev_batch_state);
+DECLARE_bool(prev_batch_state);
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/LstmLayer.h b/paddle/gserver/layers/LstmLayer.h
index 16c62aa88d8e9170d23c87013751fabd244cf590..f49df2c412f05f74da455d41cdf7c9bd4b9ec2e2 100644
--- a/paddle/gserver/layers/LstmLayer.h
+++ b/paddle/gserver/layers/LstmLayer.h
@@ -15,10 +15,10 @@ limitations under the License. */
 #pragma once
 
 #include "Layer.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/BaseMatrix.h"
-#include "SequenceToBatch.h"
 #include "LstmCompute.h"
+#include "SequenceToBatch.h"
+#include "paddle/math/BaseMatrix.h"
+#include "paddle/math/Matrix.h"
 namespace paddle {
 
 /**
diff --git a/paddle/gserver/layers/MDLstmLayer.cpp b/paddle/gserver/layers/MDLstmLayer.cpp
index 9d3797d16f9514c9ecd27cef58b7567658c9db4a..1243c12889542103f65b427da8f549e852773c5c 100644
--- a/paddle/gserver/layers/MDLstmLayer.cpp
+++ b/paddle/gserver/layers/MDLstmLayer.cpp
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "LstmLayer.h"
-#include "paddle/math/Matrix.h"
 #include "paddle/math/BaseMatrix.h"
+#include "paddle/math/Matrix.h"
 
 namespace paddle {
 
@@ -318,7 +318,7 @@ void MDLstmLayer::forward(PassType passType) {
   CHECK_EQ(starts[numSequences], batchSize);
 
   int* dimsData = input.cpuSequenceDims->getData();
-  CHECK_EQ(int(input.cpuSequenceDims->getSize()), numDims_ * numSequences);
+  CHECK_EQ(int(input.cpuSequenceDims->getSize()), numDims_* numSequences);
 
   for (int i = 0; i < numSequences; i++) {
     std::vector<int> dims;
diff --git a/paddle/gserver/layers/MaxOutLayer.cpp b/paddle/gserver/layers/MaxOutLayer.cpp
index 4fb99ce2a2941094e93e27e24b8930bb1c1611a8..3a86a95321d8843338267df374dae169271410f5 100644
--- a/paddle/gserver/layers/MaxOutLayer.cpp
+++ b/paddle/gserver/layers/MaxOutLayer.cpp
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "MaxOutLayer.h"
-#include "hl_gpu.h"
 #include "hl_cnn.h"
+#include "hl_gpu.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/MixedLayer.cpp b/paddle/gserver/layers/MixedLayer.cpp
index 490b217347f22a4840e44c89e4c84147358504ae..2525b1984b80a4200923c007d3021d468745133e 100644
--- a/paddle/gserver/layers/MixedLayer.cpp
+++ b/paddle/gserver/layers/MixedLayer.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Stat.h"
 #include "MixedLayer.h"
+#include "paddle/utils/Stat.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/MixedLayer.h b/paddle/gserver/layers/MixedLayer.h
index d73ba6b7a1f9a2894066bbb68f934ccb777995b1..9655a152c7bc96fb3941fcbd9db4ff71a59e4ebe 100644
--- a/paddle/gserver/layers/MixedLayer.h
+++ b/paddle/gserver/layers/MixedLayer.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 
 #include "Layer.h"
-#include "Projection.h"
 #include "Operator.h"
+#include "Projection.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/MultinomialSampler.h b/paddle/gserver/layers/MultinomialSampler.h
index 6e50f8738e316a0bbc3dac1b715af1ec1288145f..677b047029305549084770bdb5eadfeaafbfac8a 100644
--- a/paddle/gserver/layers/MultinomialSampler.h
+++ b/paddle/gserver/layers/MultinomialSampler.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include <memory>
 #include <random>
-
 #include "paddle/utils/TypeDefs.h"
 
 namespace paddle {
@@ -32,6 +32,17 @@ class MultinomialSampler {
 public:
   MultinomialSampler(const real* prob, int size);
 
+  //! protobuf always using double.
+  static MultinomialSampler* create(const double* prob, int size) {
+#ifdef PADDLE_TYPE_DOUBLE
+    return new MultinomialSampler(prob, size);
+#else
+    std::unique_ptr<real[]> tmp(new real[size]);
+    std::copy(prob, prob + size, tmp.get());
+    return new MultinomialSampler(tmp.get(), size);
+#endif
+  }
+
   /**
    * @brief Generate a random sample.
    * @param g is a random number engine. See <random>.
diff --git a/paddle/gserver/layers/MultiplexLayer.cpp b/paddle/gserver/layers/MultiplexLayer.cpp
index dc4a1ec321635634682684958a336d740820d75c..d09720c5255747df11d4d7367f67a245e63e6846 100644
--- a/paddle/gserver/layers/MultiplexLayer.cpp
+++ b/paddle/gserver/layers/MultiplexLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/NCELayer.cpp b/paddle/gserver/layers/NCELayer.cpp
index 540db46545ef03010d6138954070283ec32e5577..5ab765247f63dfe6e6651ca4d27dc7183a9f33e1 100644
--- a/paddle/gserver/layers/NCELayer.cpp
+++ b/paddle/gserver/layers/NCELayer.cpp
@@ -99,8 +99,8 @@ public:
 
     if (config_.neg_sampling_dist_size()) {
       CHECK_EQ(numClasses_, config_.neg_sampling_dist_size());
-      sampler_.reset(new MultinomialSampler(config_.neg_sampling_dist().data(),
-                                            numClasses_));
+      sampler_.reset(MultinomialSampler::create(
+          config_.neg_sampling_dist().data(), numClasses_));
     }
 
     return true;
diff --git a/paddle/gserver/layers/NormLayer.cpp b/paddle/gserver/layers/NormLayer.cpp
index b8682a14228538da662c1a5f4e5d3d020866dc80..3db0af2515ee9f64aa6c0b0a441e88562d9e398e 100644
--- a/paddle/gserver/layers/NormLayer.cpp
+++ b/paddle/gserver/layers/NormLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "NormLayer.h"
 #include "NormProjectionLayer.h"
+#include "paddle/utils/Logging.h"
 namespace paddle {
 
 REGISTER_LAYER_CREATE_FUNC(norm, &NormLayer::create);
diff --git a/paddle/gserver/layers/NormLayer.h b/paddle/gserver/layers/NormLayer.h
index aedbb95b4fbb0366ce05b840372d736f6cea4b1a..86255b231b1eee578e81f31d76fd66bb845b10b7 100644
--- a/paddle/gserver/layers/NormLayer.h
+++ b/paddle/gserver/layers/NormLayer.h
@@ -16,8 +16,8 @@ limitations under the License. */
 
 #include <vector>
 #include "Layer.h"
-#include "paddle/math/Matrix.h"
 #include "NormLayer.h"
+#include "paddle/math/Matrix.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/NormProjectionLayer.cpp b/paddle/gserver/layers/NormProjectionLayer.cpp
index ea301292e0dcc71f2368550d41d43f0989ddc439..934fc31e0acf96263654f4d74a1a4394578986cc 100644
--- a/paddle/gserver/layers/NormProjectionLayer.cpp
+++ b/paddle/gserver/layers/NormProjectionLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "NormProjectionLayer.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
-#include "NormProjectionLayer.h"
 
 namespace paddle {
 size_t CMRProjectionNormLayer::getSize() {
diff --git a/paddle/gserver/layers/NormProjectionLayer.h b/paddle/gserver/layers/NormProjectionLayer.h
index 0db8e2551f06d935289ae3a3631c087161d8dab1..4f7b638334afe3832e03537486f3ffc4dbbdcd9d 100644
--- a/paddle/gserver/layers/NormProjectionLayer.h
+++ b/paddle/gserver/layers/NormProjectionLayer.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
 #include "NormLayer.h"
 #include "paddle/math/Matrix.h"
-#include <vector>
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/Operator.h b/paddle/gserver/layers/Operator.h
index b0586b59e916bf85d804d7ed719775f2d2a95433..6fd331382f243039fa38b2762b2d5edede60d868 100644
--- a/paddle/gserver/layers/Operator.h
+++ b/paddle/gserver/layers/Operator.h
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/parameter/Parameter.h"
 #include "ModelConfig.pb.h"
+#include "paddle/parameter/Parameter.h"
 
-#include "paddle/parameter/Argument.h"
 #include "Layer.h"
+#include "paddle/parameter/Argument.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/OuterProdLayer.cpp b/paddle/gserver/layers/OuterProdLayer.cpp
index 42587dcce54fb766c8ec35d0b8b536bb6d829f78..cf9a008318e9d8dd50d1f401576082c07680f6c4 100644
--- a/paddle/gserver/layers/OuterProdLayer.cpp
+++ b/paddle/gserver/layers/OuterProdLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/PoolLayer.cpp b/paddle/gserver/layers/PoolLayer.cpp
index 36e396487ef7d03157b3a5733dd20b470f27f7e3..96d5c54accc047b685502a178de2d290f3158731 100644
--- a/paddle/gserver/layers/PoolLayer.cpp
+++ b/paddle/gserver/layers/PoolLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "PoolLayer.h"
 #include "PoolProjectionLayer.h"
+#include "paddle/utils/Logging.h"
 #ifndef PADDLE_ONLY_CPU
 #include "CudnnPoolLayer.h"
 #endif
diff --git a/paddle/gserver/layers/PoolLayer.h b/paddle/gserver/layers/PoolLayer.h
index c05d7a364d15e4f569499774878976e0c394215b..318b89d7c2bce896d183eba8c48c230d962918a5 100644
--- a/paddle/gserver/layers/PoolLayer.h
+++ b/paddle/gserver/layers/PoolLayer.h
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
 #include "Layer.h"
-#include "paddle/math/Matrix.h"
 #include "paddle/math/MathUtils.h"
-#include <vector>
+#include "paddle/math/Matrix.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/PoolProjectionLayer.cpp b/paddle/gserver/layers/PoolProjectionLayer.cpp
index 392c548d45554ea7c119ff94973dfd30deae8392..ed5011ab8990620acb12f3ca6c488ce403336d45 100644
--- a/paddle/gserver/layers/PoolProjectionLayer.cpp
+++ b/paddle/gserver/layers/PoolProjectionLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "PoolProjectionLayer.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
-#include "PoolProjectionLayer.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/PowerLayer.cpp b/paddle/gserver/layers/PowerLayer.cpp
index eb692492709bee82303eb890294fac03a69781d0..64fecab5b08354ceea8b290b78eede72d24a98a2 100644
--- a/paddle/gserver/layers/PowerLayer.cpp
+++ b/paddle/gserver/layers/PowerLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/RecurrentLayer.cpp b/paddle/gserver/layers/RecurrentLayer.cpp
index 0832eeaa104c590a6c01202c6b84cfb22f3f1691..85812c9d660e07e915012a7337e621c10a6597ca 100644
--- a/paddle/gserver/layers/RecurrentLayer.cpp
+++ b/paddle/gserver/layers/RecurrentLayer.cpp
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "Layer.h"
-#include "paddle/utils/Stat.h"
 #include "SequenceToBatch.h"
 #include "paddle/utils/CommandLineParser.h"
+#include "paddle/utils/Stat.h"
 
-P_DEFINE_bool(rnn_use_batch, false, "Using the batch method for calculation.");
+DEFINE_bool(rnn_use_batch, false, "Using the batch method for calculation.");
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/RecurrentLayerGroup.cpp b/paddle/gserver/layers/RecurrentLayerGroup.cpp
index 5cb42206238c9a28f4bf958f367a8b2101935429..af8dd61d84e2e53ca26dc054d0516e62ab7aa216 100644
--- a/paddle/gserver/layers/RecurrentLayerGroup.cpp
+++ b/paddle/gserver/layers/RecurrentLayerGroup.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/gserver/layers/Layer.h"
 #include <functional>
+#include "paddle/gserver/layers/Layer.h"
 
 #include "paddle/gserver/gradientmachines/RecurrentGradientMachine.h"
 #include "paddle/utils/Stat.h"
diff --git a/paddle/gserver/layers/ResizeLayer.cpp b/paddle/gserver/layers/ResizeLayer.cpp
index e79732155abe53ffec43ef7ce73f43cef4ad4094..7fcb3adea01b9d16394ee90b751b10902dc3a190 100644
--- a/paddle/gserver/layers/ResizeLayer.cpp
+++ b/paddle/gserver/layers/ResizeLayer.cpp
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "Layer.h"
-#include "paddle/math/Matrix.h"
 #include "paddle/math/BaseMatrix.h"
+#include "paddle/math/Matrix.h"
 
 namespace paddle {
 /**
diff --git a/paddle/gserver/layers/ScalingLayer.cpp b/paddle/gserver/layers/ScalingLayer.cpp
index 013bff6b986b32a9e9eedeb3780efe8ae16d178d..7f0084be6b57f5ce8245609e64c744c1a049a925 100644
--- a/paddle/gserver/layers/ScalingLayer.cpp
+++ b/paddle/gserver/layers/ScalingLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp b/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp
index 75d9fa8a97959976cd51a59ea6797a9144ae198b..9200a01eee3be8ab61b6181ec337b2c3c70c5966 100644
--- a/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp
+++ b/paddle/gserver/layers/SelectiveFullyConnectedLayer.cpp
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "SelectiveFullyConnectedLayer.h"
+#include <algorithm>
+#include <vector>
+#include "paddle/math/SparseMatrix.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
-#include "paddle/math/SparseMatrix.h"
-#include <vector>
-#include <algorithm>
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/SequenceConcatLayer.cpp b/paddle/gserver/layers/SequenceConcatLayer.cpp
index d3e0e16e9692ac8aefe96b02125363f4a4ad7dc5..069bc26e602ff7d925b4115d12388b6716676b29 100644
--- a/paddle/gserver/layers/SequenceConcatLayer.cpp
+++ b/paddle/gserver/layers/SequenceConcatLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/SequencePoolLayer.cpp b/paddle/gserver/layers/SequencePoolLayer.cpp
index 856c889e3befe6915f202358d36a81c6985e9dc5..35260ca912d5d0e00213ffb7074bd8963da265da 100644
--- a/paddle/gserver/layers/SequencePoolLayer.cpp
+++ b/paddle/gserver/layers/SequencePoolLayer.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "SequencePoolLayer.h"
+#include "paddle/utils/Logging.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/SequenceReshapeLayer.cpp b/paddle/gserver/layers/SequenceReshapeLayer.cpp
index 4b90424215ceaffba318665fdcc3a2329f0c75cd..23924b0490851ad3c3c74d77e7abd8b0af8fc234 100644
--- a/paddle/gserver/layers/SequenceReshapeLayer.cpp
+++ b/paddle/gserver/layers/SequenceReshapeLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/SequenceToBatch.cpp b/paddle/gserver/layers/SequenceToBatch.cpp
index c12ed821975b466b955412ef560ff886419c2f5e..5fa7b6f4881b9582b540a5b1bfe849220cc2a4ea 100644
--- a/paddle/gserver/layers/SequenceToBatch.cpp
+++ b/paddle/gserver/layers/SequenceToBatch.cpp
@@ -12,11 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <vector>
-#include <algorithm>
 #include "SequenceToBatch.h"
-#include <iostream>
 #include <string.h>
+#include <algorithm>
+#include <iostream>
+#include <vector>
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/SequenceToBatch.h b/paddle/gserver/layers/SequenceToBatch.h
index fe9b34b224b8d17d454788bc07048f5a9ca2b3e8..17e735a135cba8b43caf0ed9e06bb53903b5cd6a 100644
--- a/paddle/gserver/layers/SequenceToBatch.h
+++ b/paddle/gserver/layers/SequenceToBatch.h
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/math/Vector.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/SlopeInterceptLayer.cpp b/paddle/gserver/layers/SlopeInterceptLayer.cpp
index 5c00e54f8c1e3f2103a04a7b719bec44ebbbdc13..b678f414b6d76fa26818cb379fb0f0fb8fc7ec09 100644
--- a/paddle/gserver/layers/SlopeInterceptLayer.cpp
+++ b/paddle/gserver/layers/SlopeInterceptLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/SubSequenceLayer.cpp b/paddle/gserver/layers/SubSequenceLayer.cpp
index 8b3545639193fe575db17078a1bfda0817db8a74..c52fbee26232ad6eb09f84315a57c73e6aa02eb0 100644
--- a/paddle/gserver/layers/SubSequenceLayer.cpp
+++ b/paddle/gserver/layers/SubSequenceLayer.cpp
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/Vector.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/SumToOneNormLayer.cpp b/paddle/gserver/layers/SumToOneNormLayer.cpp
index e6759171cbda2a4d18c2c13f9b05a7fd1e9ac126..aa99b49380d3682ccf3d89220c0c68f22e458271 100644
--- a/paddle/gserver/layers/SumToOneNormLayer.cpp
+++ b/paddle/gserver/layers/SumToOneNormLayer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
 namespace paddle {
diff --git a/paddle/gserver/layers/TransLayer.cpp b/paddle/gserver/layers/TransLayer.cpp
index 5cbaaf8f0863f9f36693e6ad6ec4ab639716e574..d1fa90f38415c53bd1c56df4a6c4be0508004bc6 100644
--- a/paddle/gserver/layers/TransLayer.cpp
+++ b/paddle/gserver/layers/TransLayer.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "TransLayer.h"
+#include "paddle/utils/Logging.h"
 namespace paddle {
 
 REGISTER_LAYER(trans, TransLayer);
diff --git a/paddle/gserver/layers/TransLayer.h b/paddle/gserver/layers/TransLayer.h
index 8189700759090c1080c4676023c434158bcb10c9..b43fa1ebfb003226daed724b4ede3006545e8b07 100644
--- a/paddle/gserver/layers/TransLayer.h
+++ b/paddle/gserver/layers/TransLayer.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include <vector>
 #include "Layer.h"
 #include "paddle/math/Matrix.h"
-#include <vector>
 
 namespace paddle {
 /**
diff --git a/paddle/gserver/layers/TransposedFullMatrixProjection.cpp b/paddle/gserver/layers/TransposedFullMatrixProjection.cpp
index 8282584ab4bb744fe2279ac38e49f1c7eadd29ef..3f7ff0488207564e3ebbd5a467f42b46af3b31ff 100644
--- a/paddle/gserver/layers/TransposedFullMatrixProjection.cpp
+++ b/paddle/gserver/layers/TransposedFullMatrixProjection.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Stat.h"
 #include "Projection.h"
+#include "paddle/utils/Stat.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/ValidationLayer.cpp b/paddle/gserver/layers/ValidationLayer.cpp
index f029ea4c51257b4bba7e545f723f0f3a2042f22a..5127bcaba336b72dc76c832892e057724aeb3471 100644
--- a/paddle/gserver/layers/ValidationLayer.cpp
+++ b/paddle/gserver/layers/ValidationLayer.cpp
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <memory>
 #include <algorithm>
 #include <fstream>
+#include <memory>
 
-#include "paddle/utils/Logging.h"
 #include "ValidationLayer.h"
+#include "paddle/utils/Logging.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/ValidationLayer.h b/paddle/gserver/layers/ValidationLayer.h
index f9c61503aaa0963166a45f107cd5b801e97d4d84..4c1de7b3b7d6975c2693eb065f7d3e19cc51a95c 100644
--- a/paddle/gserver/layers/ValidationLayer.h
+++ b/paddle/gserver/layers/ValidationLayer.h
@@ -15,10 +15,10 @@ limitations under the License. */
 #pragma once
 #include <memory>
 
-#include "paddle/gserver/evaluators/Evaluator.h"
 #include "Layer.h"
+#include "paddle/gserver/evaluators/Evaluator.h"
 
-P_DECLARE_int32(trainer_id);
+DECLARE_int32(trainer_id);
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/WarpCTCLayer.cpp b/paddle/gserver/layers/WarpCTCLayer.cpp
index 23ca5257b6d2c8c6e88ad4bb92de6dc3f18fafe3..94e926a8d8f678c91b5c0614a78ba829869ec150 100644
--- a/paddle/gserver/layers/WarpCTCLayer.cpp
+++ b/paddle/gserver/layers/WarpCTCLayer.cpp
@@ -31,7 +31,6 @@ bool WarpCTCLayer::init(const LayerMap& layerMap,
   CHECK_EQ(numClasses_, inputLayers_[0]->getSize());
 
   blank_ = config_.blank();
-  CHECK_GE(blank_, 0UL);
   CHECK_LT(blank_, numClasses_);
 
   normByTimes_ = config_.norm_by_times();
diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp
index dffc24936faa2d855ae2eb762efbcc1c0f545943..1d5e7de1ba624d98c953efe1cdd2318548c4e914 100644
--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
@@ -14,7 +14,7 @@ limitations under the License. */
 
 #include "LayerGradUtil.h"
 
-P_DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(thread_local_rand_use_global_seed);
 
 namespace paddle {
 real getCostSum(LayerPtr& testLayer, MatrixPtr weights) {
diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h
index 2b8f334f19391111eb321d71e520e046b12013ca..62ac2d160fd916c5bb114341a442eac7df114c99 100644
--- a/paddle/gserver/tests/LayerGradUtil.h
+++ b/paddle/gserver/tests/LayerGradUtil.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/trainer/Trainer.h"
-#include "paddle/gserver/layers/DataLayer.h"
 #include "ModelConfig.pb.h"
+#include "paddle/gserver/layers/DataLayer.h"
+#include "paddle/trainer/Trainer.h"
 
 #include "TestUtil.h"
 using namespace std;  // NOLINT
diff --git a/paddle/gserver/tests/TestUtil.cpp b/paddle/gserver/tests/TestUtil.cpp
index dc007116977b48a22bc4d96b3f6c05c45ea9e837..e07c60861a4a6567fd1e28559b9806cb623a3bdf 100644
--- a/paddle/gserver/tests/TestUtil.cpp
+++ b/paddle/gserver/tests/TestUtil.cpp
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #include "TestUtil.h"
 
-#include "paddle/utils/CommandLineParser.h"
 #include "paddle/math/SparseMatrix.h"
+#include "paddle/utils/CommandLineParser.h"
 
-P_DEFINE_int32(fixed_seq_length, 0, "Produce some sequence of fixed length");
+DEFINE_int32(fixed_seq_length, 0, "Produce some sequence of fixed length");
 
 namespace paddle {
 
@@ -63,8 +63,8 @@ MatrixPtr makeRandomSparseMatrix(size_t height,
       std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
           ids.data(), indices.data(), data.data(), HPPL_STREAM_DEFAULT);
     } else {
-      std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
-          ->copyFrom(ids.data(), indices.data(), data.data());
+      std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
+          ids.data(), indices.data(), data.data());
     }
     return mat;
   } else {
@@ -80,8 +80,8 @@ MatrixPtr makeRandomSparseMatrix(size_t height,
       std::dynamic_pointer_cast<GpuSparseMatrix>(mat)->copyFrom(
           ids.data(), indices.data(), data.data(), HPPL_STREAM_DEFAULT);
     } else {
-      std::dynamic_pointer_cast<CpuSparseMatrix>(mat)
-          ->copyFrom(ids.data(), indices.data(), data.data());
+      std::dynamic_pointer_cast<CpuSparseMatrix>(mat)->copyFrom(
+          ids.data(), indices.data(), data.data());
     }
     return mat;
   }
diff --git a/paddle/gserver/tests/test_ActivationGrad.cpp b/paddle/gserver/tests/test_ActivationGrad.cpp
index 0181d62519212b8978fae96901067429504ac361..7d7e68da5c5a9dbcba024002a988f26f7613b724 100644
--- a/paddle/gserver/tests/test_ActivationGrad.cpp
+++ b/paddle/gserver/tests/test_ActivationGrad.cpp
@@ -13,20 +13,20 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <vector>
 #include <string>
-#include "paddle/gserver/layers/DataLayer.h"
+#include <vector>
 #include "ModelConfig.pb.h"
+#include "paddle/gserver/layers/DataLayer.h"
 #include "paddle/trainer/Trainer.h"
 
-#include "TestUtil.h"
 #include "LayerGradUtil.h"
+#include "TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(use_gpu);
+DECLARE_bool(thread_local_rand_use_global_seed);
 
 void testActivation(const string& act) {
   LOG(INFO) << "test activation: " << act;
diff --git a/paddle/gserver/tests/test_BatchNorm.cpp b/paddle/gserver/tests/test_BatchNorm.cpp
index 8575999aba17e5fea1051f36b40ff91821319aac..7f5fcb670b70aed9f0a04180d344556a0390122f 100644
--- a/paddle/gserver/tests/test_BatchNorm.cpp
+++ b/paddle/gserver/tests/test_BatchNorm.cpp
@@ -13,102 +13,109 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <vector>
 #include <string>
-#include "paddle/gserver/layers/DataLayer.h"
+#include <vector>
 #include "ModelConfig.pb.h"
+#include "paddle/gserver/layers/DataLayer.h"
+#include "paddle/gserver/layers/ExpandConvTransLayer.h"
 #include "paddle/trainer/Trainer.h"
 #include "paddle/utils/GlobalConstants.h"
-#include "paddle/gserver/layers/ExpandConvTransLayer.h"
 
-#include "TestUtil.h"
 #include "LayerGradUtil.h"
+#include "TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_double(checkgrad_eps);
-P_DECLARE_bool(thread_local_rand_use_global_seed);
-P_DECLARE_bool(prev_batch_state);
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(prev_batch_state);
 
 // Test that the batchNormLayer can be followed by a ConvLayer
 TEST(Layer, batchNorm) {
-    FLAGS_use_gpu = false;
-    TestConfig configBN;
-    const int CHANNELS = 6272;
-    const int IMG_SIZE = 1;
-    configBN.layerConfig.set_type("batch_norm");
-    configBN.layerConfig.set_name("bn");
-    configBN.layerConfig.set_size(CHANNELS * IMG_SIZE * IMG_SIZE);
-    configBN.layerConfig.set_active_type("relu");
-    configBN.biasSize = CHANNELS;
-    configBN.inputDefs.push_back({INPUT_DATA, "layer_0",
+  FLAGS_use_gpu = false;
+  TestConfig configBN;
+  const int CHANNELS = 6272;
+  const int IMG_SIZE = 1;
+  configBN.layerConfig.set_type("batch_norm");
+  configBN.layerConfig.set_name("bn");
+  configBN.layerConfig.set_size(CHANNELS * IMG_SIZE * IMG_SIZE);
+  configBN.layerConfig.set_active_type("relu");
+  configBN.biasSize = CHANNELS;
+  configBN.inputDefs.push_back({INPUT_DATA,
+                                "layer_0",
                                 /* dim= */ IMG_SIZE * IMG_SIZE * CHANNELS,
                                 /* paraSize= */ CHANNELS});
 
-    configBN.inputDefs.push_back({INPUT_DATA, "layer_1_running_mean",
-                                    1, CHANNELS});
-    configBN.inputDefs.back().isStatic = true;
-    configBN.inputDefs.push_back({INPUT_DATA, "layer_2_running_var",
-                                    1, CHANNELS});
-    configBN.inputDefs.back().isStatic = true;
-
-    LayerInputConfig* input = configBN.layerConfig.add_inputs();
-    configBN.layerConfig.add_inputs();
-    configBN.layerConfig.add_inputs();
-
-    ImageConfig* img_conf = input->mutable_image_conf();
-    img_conf->set_channels(CHANNELS);
-    img_conf->set_img_size(IMG_SIZE);
-
-    // Setting up conv-layer config
-    TestConfig config;
-    config.biasSize = 64;
-    config.layerConfig.set_type("exconv");
-    config.layerConfig.set_num_filters(64);
-    config.layerConfig.set_partial_sum(1);
-    config.layerConfig.set_shared_biases(true);
-
-    config.inputDefs.push_back({INPUT_DATA, "bn", 6272, 204800});
-    input = config.layerConfig.add_inputs();
-    ConvConfig* conv = input->mutable_conv_conf();
-    conv->set_filter_size(5);
-    conv->set_filter_size_y(5);
-    conv->set_channels(128);
-    conv->set_padding(1);
-    conv->set_padding_y(1);
-    conv->set_stride(2);
-    conv->set_stride_y(2);
-    conv->set_groups(1);
-    conv->set_filter_channels(conv->channels() / conv->groups());
-    conv->set_img_size(7);
-    conv->set_output_x(3);
-    config.layerConfig.set_size(conv->output_x() * conv->output_x() *
-                                config.layerConfig.num_filters());
-    config.layerConfig.set_name("conv");
-
-    // data layer initialize
-    std::vector<DataLayerPtr> dataLayers;
-    LayerMap layerMap;
-    vector<Argument> datas;
-    initDataLayer(configBN, &dataLayers, &datas, &layerMap, "batch_norm",
-                  100, false, false);
-    // test layer initialize
-    std::vector<ParameterPtr> parameters;
-    LayerPtr bnLayer;
-    initTestLayer(configBN, &layerMap, &parameters, &bnLayer);
-
-    std::vector<ParameterPtr> parameters2;
-    LayerPtr convLayer;
-    initTestLayer(config, &layerMap, &parameters2, &convLayer);
-
-    bnLayer->forward(PASS_GC);
-    convLayer->forward(PASS_GC);
-
-    CHECK_EQ(convLayer->getOutputValue()->getHeight(), 100);
-    CHECK_EQ(convLayer->getOutputValue()->getWidth(), 576);
+  configBN.inputDefs.push_back(
+      {INPUT_DATA, "layer_1_running_mean", 1, CHANNELS});
+  configBN.inputDefs.back().isStatic = true;
+  configBN.inputDefs.push_back(
+      {INPUT_DATA, "layer_2_running_var", 1, CHANNELS});
+  configBN.inputDefs.back().isStatic = true;
+
+  LayerInputConfig* input = configBN.layerConfig.add_inputs();
+  configBN.layerConfig.add_inputs();
+  configBN.layerConfig.add_inputs();
+
+  ImageConfig* img_conf = input->mutable_image_conf();
+  img_conf->set_channels(CHANNELS);
+  img_conf->set_img_size(IMG_SIZE);
+
+  // Setting up conv-layer config
+  TestConfig config;
+  config.biasSize = 64;
+  config.layerConfig.set_type("exconv");
+  config.layerConfig.set_num_filters(64);
+  config.layerConfig.set_partial_sum(1);
+  config.layerConfig.set_shared_biases(true);
+
+  config.inputDefs.push_back({INPUT_DATA, "bn", 6272, 204800});
+  input = config.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+  conv->set_filter_size(5);
+  conv->set_filter_size_y(5);
+  conv->set_channels(128);
+  conv->set_padding(1);
+  conv->set_padding_y(1);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+  conv->set_groups(1);
+  conv->set_filter_channels(conv->channels() / conv->groups());
+  conv->set_img_size(7);
+  conv->set_output_x(3);
+  config.layerConfig.set_size(conv->output_x() * conv->output_x() *
+                              config.layerConfig.num_filters());
+  config.layerConfig.set_name("conv");
+
+  // data layer initialize
+  std::vector<DataLayerPtr> dataLayers;
+  LayerMap layerMap;
+  vector<Argument> datas;
+  initDataLayer(configBN,
+                &dataLayers,
+                &datas,
+                &layerMap,
+                "batch_norm",
+                100,
+                false,
+                false);
+  // test layer initialize
+  std::vector<ParameterPtr> parameters;
+  LayerPtr bnLayer;
+  initTestLayer(configBN, &layerMap, &parameters, &bnLayer);
+
+  std::vector<ParameterPtr> parameters2;
+  LayerPtr convLayer;
+  initTestLayer(config, &layerMap, &parameters2, &convLayer);
+
+  bnLayer->forward(PASS_GC);
+  convLayer->forward(PASS_GC);
+
+  CHECK_EQ(convLayer->getOutputValue()->getHeight(), 100);
+  CHECK_EQ(convLayer->getOutputValue()->getWidth(), 576);
 }
 
 int main(int argc, char** argv) {
diff --git a/paddle/gserver/tests/test_ConvTrans.cpp b/paddle/gserver/tests/test_ConvTrans.cpp
index 3af3f08f40166de2aa5b23fb0b5abd3e3fcbf4b3..99202c2d5702a9569c3a9a92897a8a0e38b8e2a6 100644
--- a/paddle/gserver/tests/test_ConvTrans.cpp
+++ b/paddle/gserver/tests/test_ConvTrans.cpp
@@ -13,26 +13,26 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <vector>
 #include <string>
-#include "paddle/gserver/layers/DataLayer.h"
+#include <vector>
 #include "ModelConfig.pb.h"
-#include "paddle/trainer/Trainer.h"
-#include "paddle/utils/GlobalConstants.h"
+#include "paddle/gserver/layers/DataLayer.h"
 #include "paddle/gserver/layers/ExpandConvTransLayer.h"
 #include "paddle/math/MathUtils.h"
+#include "paddle/trainer/Trainer.h"
+#include "paddle/utils/GlobalConstants.h"
 
-#include "TestUtil.h"
 #include "LayerGradUtil.h"
+#include "TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_double(checkgrad_eps);
-P_DECLARE_bool(thread_local_rand_use_global_seed);
-P_DECLARE_bool(prev_batch_state);
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(prev_batch_state);
 
 // Test that the convTrans forward is the same as conv backward
 TEST(Layer, convTransLayerFwd) {
diff --git a/paddle/gserver/tests/test_ConvUnify.cpp b/paddle/gserver/tests/test_ConvUnify.cpp
index d59acf96acc84b4cf4982656dadd27a89fac78b8..2ab18f886848d198b9063c7559790497ce131efe 100644
--- a/paddle/gserver/tests/test_ConvUnify.cpp
+++ b/paddle/gserver/tests/test_ConvUnify.cpp
@@ -13,33 +13,40 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <vector>
 #include <string>
-#include "paddle/gserver/layers/DataLayer.h"
+#include <vector>
 #include "ModelConfig.pb.h"
-#include "paddle/trainer/Trainer.h"
-#include "paddle/utils/GlobalConstants.h"
+#include "paddle/gserver/layers/DataLayer.h"
 #include "paddle/gserver/layers/ExpandConvTransLayer.h"
 #include "paddle/math/MathUtils.h"
+#include "paddle/trainer/Trainer.h"
+#include "paddle/utils/GlobalConstants.h"
 
-#include "TestUtil.h"
 #include "LayerGradUtil.h"
+#include "TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_double(checkgrad_eps);
-P_DECLARE_bool(thread_local_rand_use_global_seed);
-P_DECLARE_bool(prev_batch_state);
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(prev_batch_state);
 
 // Do one forward pass of convTrans layer and check to see if its output
 // matches the given result
-MatrixPtr doOneConvTest(size_t imgSize, size_t output_x, size_t stride,
-                    size_t padding, size_t filter_size, size_t channel,
-                    size_t numfilters, size_t groups, MatrixPtr& inputData,
-                    real* param, bool useGpu) {
+MatrixPtr doOneConvTest(size_t imgSize,
+                        size_t output_x,
+                        size_t stride,
+                        size_t padding,
+                        size_t filter_size,
+                        size_t channel,
+                        size_t numfilters,
+                        size_t groups,
+                        MatrixPtr& inputData,
+                        real* param,
+                        bool useGpu) {
   TestConfig config;
   config.biasSize = numfilters;
   if (useGpu) {
@@ -51,11 +58,10 @@ MatrixPtr doOneConvTest(size_t imgSize, size_t output_x, size_t stride,
   config.layerConfig.set_partial_sum(1);
   config.layerConfig.set_shared_biases(true);
 
-  size_t weightSize = channel* filter_size * filter_size *
-      config.layerConfig.num_filters() / groups;
-  config.inputDefs.push_back({INPUT_DATA, "layer_0",
-                              imgSize * imgSize * channel,
-                              weightSize});
+  size_t weightSize = channel * filter_size * filter_size *
+                      config.layerConfig.num_filters() / groups;
+  config.inputDefs.push_back(
+      {INPUT_DATA, "layer_0", imgSize * imgSize * channel, weightSize});
   LayerInputConfig* input = config.layerConfig.add_inputs();
   ConvConfig* conv = input->mutable_conv_conf();
   conv->set_filter_size(filter_size);
@@ -66,7 +72,7 @@ MatrixPtr doOneConvTest(size_t imgSize, size_t output_x, size_t stride,
   conv->set_stride(stride);
   conv->set_stride_y(stride);
   conv->set_groups(groups);
-  conv->set_filter_channels(channel/groups);
+  conv->set_filter_channels(channel / groups);
   conv->set_img_size(imgSize);
   conv->set_output_x(output_x);
 
@@ -77,8 +83,8 @@ MatrixPtr doOneConvTest(size_t imgSize, size_t output_x, size_t stride,
   std::vector<DataLayerPtr> dataLayers;
   LayerMap layerMap;
   vector<Argument> datas;
-  initDataLayer(config, &dataLayers, &datas, &layerMap, "conv",
-                1, false, useGpu);
+  initDataLayer(
+      config, &dataLayers, &datas, &layerMap, "conv", 1, false, useGpu);
   dataLayers[0]->getOutputValue()->zeroMem();
   dataLayers[0]->getOutputValue()->copyFrom(*inputData);
 
@@ -88,106 +94,124 @@ MatrixPtr doOneConvTest(size_t imgSize, size_t output_x, size_t stride,
   initTestLayer(config, &layerMap, &parameters, &convLayer);
   convLayer->getBiasParameter()->zeroMem();
   convLayer->getParameters()[0]->zeroMem();
-  convLayer->getParameters()[0]->getBuf(PARAMETER_VALUE)->copyFrom(param,
-      weightSize);
+  convLayer->getParameters()[0]
+      ->getBuf(PARAMETER_VALUE)
+      ->copyFrom(param, weightSize);
   convLayer->forward(PASS_GC);
 
   return convLayer->getOutputValue();
 }
 
 TEST(Layer, convParaUnified) {
-  #ifndef PADDLE_ONLY_CPU
-    MatrixPtr input, resultCpu, resultGpu;
-    input = Matrix::create(1, 4 * 4, false, false);
-    float inputData[] = {1, 2, 3, 4,
-                         5, 6, 7, 8,
-                         9, 10, 11, 12,
-                         13, 14, 15, 16};
-    float param[] = {1, 2, 3, 4, 5, 6, 7, 8, 9,
-                     9, 8, 7, 6, 5, 4, 3, 2, 1};
-
-    input->setData(inputData);
-
-    resultCpu = doOneConvTest(/* imgSize */ 4,
-                   /* output_x */ 2,
-                   /* stride */ 1,
-                   /* padding */ 0,
-                   /* filter_size */ 3,
-                   /*channel*/ 1,
-                   /*numfilters*/ 2,
-                   /*groups*/ 1,
-                   input, param, false);
-
-    resultGpu = doOneConvTest(/* imgSize */ 4,
-                       /* output_x */ 2,
-                       /* stride */ 1,
-                       /* padding */ 0,
-                       /* filter_size */ 3,
-                       /*channel*/ 1,
-                       /*numfilters*/ 2,
-                       /*groups*/ 1,
-                       input, param, true);
-    checkMatrixEqual(resultCpu, resultGpu);
-
-    input = Matrix::create(1, 3 * 3 * 2, false, false);
-    float inputData2[] = {1, 2, 3,
-                          4, 5, 6,
-                          7, 8, 9,
-
-                          10, 11, 12,
-                          13, 14, 15,
-                          16, 17, 18};
-    float param2[] = {1, 2, 3, 4, 5, 6, 7, 8,
-                      8, 7, 6, 5, 4, 3, 2, 1};
-
-    input->setData(inputData2);
-
-    resultCpu = doOneConvTest(/* imgSize */ 3,
-                   /* output_x */ 2,
-                   /* stride */ 1,
-                   /* padding */ 0,
-                   /* filter_size */ 2,
-                   /*channel*/ 2,
-                   /*numfilters*/ 2,
-                   /*groups*/ 1,
-                   input, param2, false);
-
-    resultGpu = doOneConvTest(/* imgSize */ 3,
-                       /* output_x */ 2,
-                       /* stride */ 1,
-                       /* padding */ 0,
-                       /* filter_size */ 2,
-                       /*channel*/ 2,
-                       /*numfilters*/ 2,
-                       /*groups*/ 1,
-                       input, param2, true);
-    checkMatrixEqual(resultCpu, resultGpu);
-
-
-    float param3[] = {1, 2, 3, 4,
-                      4, 3, 2, 1};
-
-    resultCpu = doOneConvTest(/* imgSize */ 3,
-                   /* output_x */ 2,
-                   /* stride */ 1,
-                   /* padding */ 0,
-                   /* filter_size */ 2,
-                   /*channel*/ 2,
-                   /*numfilters*/ 2,
-                   /*groups*/ 2,
-                   input, param3, false);
-
-    resultGpu = doOneConvTest(/* imgSize */ 3,
-                       /* output_x */ 2,
-                       /* stride */ 1,
-                       /* padding */ 0,
-                       /* filter_size */ 2,
-                       /*channel*/ 2,
-                       /*numfilters*/ 2,
-                       /*groups*/ 2,
-                       input, param3, true);
-    checkMatrixEqual(resultCpu, resultGpu);
-  #endif
+#ifndef PADDLE_ONLY_CPU
+  MatrixPtr input, resultCpu, resultGpu;
+  input = Matrix::create(1, 4 * 4, false, false);
+  float inputData[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
+  float param[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 7, 6, 5, 4, 3, 2, 1};
+
+  input->setData(inputData);
+
+  resultCpu = doOneConvTest(/* imgSize */ 4,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 3,
+                            /*channel*/ 1,
+                            /*numfilters*/ 2,
+                            /*groups*/ 1,
+                            input,
+                            param,
+                            false);
+
+  resultGpu = doOneConvTest(/* imgSize */ 4,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 3,
+                            /*channel*/ 1,
+                            /*numfilters*/ 2,
+                            /*groups*/ 1,
+                            input,
+                            param,
+                            true);
+  checkMatrixEqual(resultCpu, resultGpu);
+
+  input = Matrix::create(1, 3 * 3 * 2, false, false);
+  float inputData2[] = {1,
+                        2,
+                        3,
+                        4,
+                        5,
+                        6,
+                        7,
+                        8,
+                        9,
+
+                        10,
+                        11,
+                        12,
+                        13,
+                        14,
+                        15,
+                        16,
+                        17,
+                        18};
+  float param2[] = {1, 2, 3, 4, 5, 6, 7, 8, 8, 7, 6, 5, 4, 3, 2, 1};
+
+  input->setData(inputData2);
+
+  resultCpu = doOneConvTest(/* imgSize */ 3,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 2,
+                            /*channel*/ 2,
+                            /*numfilters*/ 2,
+                            /*groups*/ 1,
+                            input,
+                            param2,
+                            false);
+
+  resultGpu = doOneConvTest(/* imgSize */ 3,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 2,
+                            /*channel*/ 2,
+                            /*numfilters*/ 2,
+                            /*groups*/ 1,
+                            input,
+                            param2,
+                            true);
+  checkMatrixEqual(resultCpu, resultGpu);
+
+  float param3[] = {1, 2, 3, 4, 4, 3, 2, 1};
+
+  resultCpu = doOneConvTest(/* imgSize */ 3,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 2,
+                            /*channel*/ 2,
+                            /*numfilters*/ 2,
+                            /*groups*/ 2,
+                            input,
+                            param3,
+                            false);
+
+  resultGpu = doOneConvTest(/* imgSize */ 3,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 2,
+                            /*channel*/ 2,
+                            /*numfilters*/ 2,
+                            /*groups*/ 2,
+                            input,
+                            param3,
+                            true);
+  checkMatrixEqual(resultCpu, resultGpu);
+#endif
 }
 
 int main(int argc, char** argv) {
diff --git a/paddle/gserver/tests/test_Evaluator.cpp b/paddle/gserver/tests/test_Evaluator.cpp
index 2c20f3a52f8f10f91d7ac7dcdd21dc3f93d9e227..e07066dad84aa6326c2447fc5ee80fa496735fbf 100644
--- a/paddle/gserver/tests/test_Evaluator.cpp
+++ b/paddle/gserver/tests/test_Evaluator.cpp
@@ -15,15 +15,15 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <vector>
 #include "ModelConfig.pb.h"
-#include "paddle/trainer/Trainer.h"
 #include "TestUtil.h"
+#include "paddle/trainer/Trainer.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
 
 enum InputType {
   INPUT_DATA,         // dense vector
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 7983d9fe64c61648a2939ddc610a0f819e338577..8a8d094ed357a6565dd9827c4bb10b76db6a146a 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -17,8 +17,8 @@ limitations under the License. */
 #include <vector>
 #include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/trainer/Trainer.h"
 #include "paddle/math/MathUtils.h"
+#include "paddle/trainer/Trainer.h"
 
 #include "LayerGradUtil.h"
 #include "TestUtil.h"
@@ -26,11 +26,11 @@ limitations under the License. */
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_double(checkgrad_eps);
-P_DECLARE_bool(thread_local_rand_use_global_seed);
-P_DECLARE_bool(prev_batch_state);
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(prev_batch_state);
 
 TEST(Operator, dot_mul) {
   TestConfig config;
diff --git a/paddle/gserver/tests/test_MultinomialSampler.cpp b/paddle/gserver/tests/test_MultinomialSampler.cpp
index fc164da8eab993dd8659b7f9cf34d2ef654c37de..eadf40ade091ae8b3e19d7dc6c999288e8e88c1b 100644
--- a/paddle/gserver/tests/test_MultinomialSampler.cpp
+++ b/paddle/gserver/tests/test_MultinomialSampler.cpp
@@ -20,8 +20,8 @@ limitations under the License. */
 #undef PADDLE_DISABLE_TIMER
 #include "paddle/utils/Stat.h"
 
-#include "paddle/utils/Util.h"
 #include "paddle/gserver/layers/MultinomialSampler.h"
+#include "paddle/utils/Util.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/gserver/tests/test_NetworkCompare.cpp b/paddle/gserver/tests/test_NetworkCompare.cpp
index ff6b5ab0d040bc09f03613f05123c46e720e4681..fc60228f816e0cea30ef764c59a8c7875ed4a0e8 100644
--- a/paddle/gserver/tests/test_NetworkCompare.cpp
+++ b/paddle/gserver/tests/test_NetworkCompare.cpp
@@ -13,22 +13,22 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #undef PADDLE_DISABLE_TIMER
+#include <gtest/gtest.h>
 #include <paddle/utils/PythonUtil.h>
-#include <cstdlib>
 #include <algorithm>
-#include <gtest/gtest.h>
+#include <cstdlib>
 
+#include "TestUtil.h"
 #include "paddle/trainer/Trainer.h"
 #include "paddle/utils/Stat.h"
-#include "TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_int32(gpu_id);
-P_DECLARE_double(checkgrad_eps);
-P_DEFINE_bool(use_label, true, "input label or sequence label");
-P_DEFINE_bool(static_para, false, "static parameter");
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DEFINE_bool(use_label, true, "input label or sequence label");
+DEFINE_bool(static_para, false, "static parameter");
 
 struct DataIn {
   std::vector<Argument> inArgs;
@@ -267,8 +267,8 @@ TEST(Compare, img_conv2) {
 }
 #endif
 
-P_DEFINE_string(config_file_a, "", "config of one network to compare");
-P_DEFINE_string(config_file_b, "", "config of another network to compare");
+DEFINE_string(config_file_a, "", "config of one network to compare");
+DEFINE_string(config_file_b, "", "config of another network to compare");
 TEST(Compare, network) {
   if (FLAGS_config_file_a != "" && FLAGS_config_file_b != "") {
     compareNetwork(FLAGS_config_file_a, FLAGS_config_file_b);
diff --git a/paddle/gserver/tests/test_ProtoDataProvider.cpp b/paddle/gserver/tests/test_ProtoDataProvider.cpp
index d5b8017cd117ab24b5f987cf34c9cbb87925007b..d421b6e2f2536e266883508ff29cbec731c9d7e3 100644
--- a/paddle/gserver/tests/test_ProtoDataProvider.cpp
+++ b/paddle/gserver/tests/test_ProtoDataProvider.cpp
@@ -17,8 +17,8 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 
-#include "paddle/utils/Util.h"
 #include "paddle/gserver/dataproviders/ProtoDataProvider.h"
+#include "paddle/utils/Util.h"
 
 #include "TestUtil.h"
 
diff --git a/paddle/gserver/tests/test_PyDataProvider2.cpp b/paddle/gserver/tests/test_PyDataProvider2.cpp
index 436318d35634f4ba46781a125ace110551029439..5f8bc5ecd0f77efc6dcda0330f124ca6cab7f277 100644
--- a/paddle/gserver/tests/test_PyDataProvider2.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider2.cpp
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Util.h"
 
-P_DEFINE_string(train_list, "unittest.list", "file list for unittest");
+DEFINE_string(train_list, "unittest.list", "file list for unittest");
 
 namespace paddle {
 namespace unittest {
diff --git a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
index a351667d8b18b734022820a777c551bb11a243bf..b47279b77a6b3aea1313628d79cfd27efa35c361 100644
--- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
+++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
@@ -20,7 +20,7 @@ limitations under the License. */
 #include <paddle/utils/Util.h>
 #include <paddle/utils/Version.h>
 
-P_DECLARE_int32(seed);
+DECLARE_int32(seed);
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
@@ -127,7 +127,7 @@ TEST(RecurrentGradientMachine, HasSubSequence) {
   }
 }
 
-TEST(RecurrentGradientMachine, rnn) {
+TEST(RecurrentGradientMachine, DISABLED_rnn) {
   for (bool useGpu : {false, true}) {
     test("gserver/tests/sequence_rnn.conf",
          "gserver/tests/sequence_nest_rnn.conf",
@@ -136,7 +136,7 @@ TEST(RecurrentGradientMachine, rnn) {
   }
 }
 
-TEST(RecurrentGradientMachine, rnn_multi_input) {
+TEST(RecurrentGradientMachine, DISABLED_rnn_multi_input) {
   for (bool useGpu : {false, true}) {
     test("gserver/tests/sequence_rnn_multi_input.conf",
          "gserver/tests/sequence_nest_rnn_multi_input.conf",
@@ -145,7 +145,7 @@ TEST(RecurrentGradientMachine, rnn_multi_input) {
   }
 }
 
-TEST(RecurrentGradientMachine, rnn_multi_unequalength_input) {
+TEST(RecurrentGradientMachine, DISABLED_rnn_multi_unequalength_input) {
   for (bool useGpu : {false, true}) {
     test("gserver/tests/sequence_rnn_multi_unequalength_inputs.py",
          "gserver/tests/sequence_nest_rnn_multi_unequalength_inputs.py",
@@ -155,13 +155,14 @@ TEST(RecurrentGradientMachine, rnn_multi_unequalength_input) {
 }
 
 int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+
   if (paddle::version::isWithPyDataProvider()) {
     if (!paddle::version::isWithGpu()) {
       FLAGS_use_gpu = false;
     }
     initMain(argc, argv);
     initPython(argc, argv);
-    testing::InitGoogleTest(&argc, argv);
     return RUN_ALL_TESTS();
   } else {
     return 0;
diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp
index 3f26b710e92b78f295402f1ba53e01242ab0a486..f91c788863b6963df92b735dbfef2bacee1fff45 100644
--- a/paddle/gserver/tests/test_RecurrentLayer.cpp
+++ b/paddle/gserver/tests/test_RecurrentLayer.cpp
@@ -13,19 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include <vector>
 #include <paddle/utils/Version.h>
+#include <vector>
+#include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
 #include "paddle/gserver/layers/Layer.h"
-#include "ModelConfig.pb.h"
 
 #include "TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
-P_DECLARE_bool(use_gpu);
-P_DECLARE_bool(rnn_use_batch);
-P_DECLARE_int32(fixed_seq_length);
+DECLARE_bool(use_gpu);
+DECLARE_bool(rnn_use_batch);
+DECLARE_int32(fixed_seq_length);
 
 void checkError(const Matrix& matrix1, const Matrix& matrix2) {
   CHECK(matrix1.getHeight() == matrix2.getHeight());
@@ -220,8 +220,8 @@ TEST(Layer, RecurrentLayer) {
 }
 
 #define protected public
-#include "paddle/gserver/layers/LstmLayer.h"
 #include "paddle/gserver/layers/GatedRecurrentLayer.h"
+#include "paddle/gserver/layers/LstmLayer.h"
 template <class T>
 class TestRecurrentLayer {
 public:
diff --git a/paddle/gserver/tests/test_SelectiveFCLayer.cpp b/paddle/gserver/tests/test_SelectiveFCLayer.cpp
index c588f69446e86965d6bd3a4c47cf0da1337d4a5a..ab23d00a2cb6077147f5b89664a8e2437b4cd63b 100644
--- a/paddle/gserver/tests/test_SelectiveFCLayer.cpp
+++ b/paddle/gserver/tests/test_SelectiveFCLayer.cpp
@@ -12,28 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <gtest/gtest.h>
+#include <math.h>
 #include <paddle/utils/PythonUtil.h>
+#include <algorithm>
 #include <cstdlib>
 #include <ctime>
-#include <math.h>
-#include <gtest/gtest.h>
-#include <algorithm>
+#include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/DataLayer.h"
-#include "paddle/gserver/layers/Layer.h"
 #include "paddle/gserver/layers/FullyConnectedLayer.h"
+#include "paddle/gserver/layers/Layer.h"
 #include "paddle/gserver/layers/SelectiveFullyConnectedLayer.h"
-#include "ModelConfig.pb.h"
 #include "paddle/math/CpuSparseMatrix.h"
 #include "paddle/trainer/Trainer.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_int32(num_passes);
-P_DECLARE_string(config);
-P_DECLARE_string(init_model_path);
-P_DECLARE_string(config_args);
+DECLARE_bool(use_gpu);
+DECLARE_int32(num_passes);
+DECLARE_string(config);
+DECLARE_string(init_model_path);
+DECLARE_string(config_args);
 
 size_t fcLayerWidth = 1024;
 
diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp
index e526a27906aee1990fa3e9da85cf9258af26776b..0a4a814d5247410248f7418e1ef2c79a2da42507 100644
--- a/paddle/gserver/tests/test_WarpCTCLayer.cpp
+++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp
@@ -14,18 +14,18 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 #include <paddle/utils/Version.h>
-#include "paddle/gserver/layers/Layer.h"
-#include "paddle/gserver/layers/DataLayer.h"
+#include "ModelConfig.pb.h"
 #include "paddle/gserver/layers/CTCLayer.h"
+#include "paddle/gserver/layers/DataLayer.h"
+#include "paddle/gserver/layers/Layer.h"
 #include "paddle/gserver/layers/WarpCTCLayer.h"
-#include "ModelConfig.pb.h"
 
 #include "TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_bool(use_gpu);
+DECLARE_bool(use_gpu);
 
 const real* getData(const Matrix& matrix) {
   if (matrix.useGpu()) {
diff --git a/paddle/math/Allocator.h b/paddle/math/Allocator.h
index 4d0a1506bee3d905cd2c352e0d5395feab6e1212..666a8b8368e3e2ebc522902c176d7491d2920d2a 100644
--- a/paddle/math/Allocator.h
+++ b/paddle/math/Allocator.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include <mutex>
 #include <stdlib.h>
+#include <mutex>
 #include "hl_gpu.h"
 #include "paddle/utils/Logging.h"
 
diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h
index 368557bb26b3b437600d658e3f88ddfede973dca..2933c20fbad930248c41969d88d45cf397b9dcf8 100644
--- a/paddle/math/BaseMatrix.h
+++ b/paddle/math/BaseMatrix.h
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include <cstddef>
 #include <stdint.h>
-#include "paddle/utils/TypeDefs.h"
+#include <cstddef>
 #include "TensorExpression.h"
+#include "paddle/utils/TypeDefs.h"
 
 namespace paddle {
 
diff --git a/paddle/math/CpuSparseMatrix.cpp b/paddle/math/CpuSparseMatrix.cpp
index 324c7ec0ca8c69618acc7be652f497291b1af39b..b5d5b6ef615829fc1e24ccd417e2f0b3312f072d 100644
--- a/paddle/math/CpuSparseMatrix.cpp
+++ b/paddle/math/CpuSparseMatrix.cpp
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "hl_gpu.h"
 #include "CpuSparseMatrix.h"
 #include "SparseMatrix.h"
+#include "float.h"
+#include "hl_gpu.h"
 #include "paddle/math/MathUtils.h"
 #include "paddle/utils/Util.h"
-#include "float.h"
 
 namespace paddle {
 
@@ -656,9 +656,9 @@ void CpuSparseMatrix::trimFrom(const CpuSparseMatrix& src) {
   if (format_ == SPARSE_CSR) {
     int* srcCols = src.getCols();
     size_t numLessWidth =
-        std::count_if(srcCols,
-                      srcCols + src.getElementCnt(),
-                      [this](size_t n) { return n < this->width_; });
+        std::count_if(srcCols, srcCols + src.getElementCnt(), [this](size_t n) {
+          return n < this->width_;
+        });
     resize(height_, width_, numLessWidth, valueType_, format_);
     rows_[0] = 0;
     size_t index = 0;
diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp
index 037525b40233848e6f1d82cc32dbc2209fe52fc0..d7aa1184872d5a6129becca1f6e282776c9dbe15 100644
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "MathFunctions.h"
-#include "hl_matrix_ops.cuh"
 #include "hl_matrix_apply.cuh"
+#include "hl_matrix_ops.cuh"
 
 namespace paddle {
 
diff --git a/paddle/math/MathUtils.cpp b/paddle/math/MathUtils.cpp
index 1fb7655c5a04b82db72aefb8b3147020de8f4630..5bbc3e4e3725f186373072440a93f967178e0b27 100644
--- a/paddle/math/MathUtils.cpp
+++ b/paddle/math/MathUtils.cpp
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #include "MathUtils.h"
 #include <algorithm>
-#include "paddle/utils/Logging.h"
 #include "Vector.h"
+#include "paddle/utils/Logging.h"
 
 namespace paddle {
 
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index 395143a4b1af84882f51b40a494b64c7260542df..5685cb7bcbbb6b90687790953d676e3792f36f36 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -14,20 +14,20 @@ limitations under the License. */
 
 #pragma once
 
+#include <stdint.h>
 #include <memory>
 #include <thread>
-#include <stdint.h>
 
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/ThreadLocal.h"
 
 #include <hl_gpu.h>
 
+#include "BaseMatrix.h"
 #include "MemoryHandle.h"
-#include "paddle/utils/TypeDefs.h"
 #include "Vector.h"
 #include "paddle/utils/ThreadLocal.h"
-#include "BaseMatrix.h"
+#include "paddle/utils/TypeDefs.h"
 
 namespace paddle {
 
@@ -408,7 +408,7 @@ public:
     LOG(FATAL) << "Not implemented";
   }
 
-  virtual void addBias(Matrix& b, real scale, bool sharedBias) {
+  void addBias(Matrix& b, real scale, bool sharedBias) {
     if (!sharedBias) {
       addBias(b, scale);
     } else {
@@ -425,7 +425,7 @@ public:
     LOG(FATAL) << "Not implemented";
   }
 
-  virtual void collectBias(Matrix& a, real scale, bool sharedBias) {
+  void collectBias(Matrix& a, real scale, bool sharedBias) {
     if (!sharedBias) {
       collectBias(a, scale);
     } else {
diff --git a/paddle/math/MatrixBitCode.cpp b/paddle/math/MatrixBitCode.cpp
index 6390d4b6a5284dee15e65d3347b7d2c5b2b0a163..cea912d3ca02715c203814d13529aadfd9d3b7fb 100644
--- a/paddle/math/MatrixBitCode.cpp
+++ b/paddle/math/MatrixBitCode.cpp
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Util.h"
 #include "Matrix.h"
 #include "hl_gpu.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Util.h"
 
 namespace paddle {
 
diff --git a/paddle/math/MemoryHandle.cpp b/paddle/math/MemoryHandle.cpp
index 4c4a827b23aa2c1c7cc8be1e69660bf3b780816f..84afb5944c3ea4aa3b8f44646b23d18b2903281b 100644
--- a/paddle/math/MemoryHandle.cpp
+++ b/paddle/math/MemoryHandle.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <cmath>
 #include "MemoryHandle.h"
+#include <cmath>
 #include "Storage.h"
 
 namespace paddle {
diff --git a/paddle/math/PoolAllocator.h b/paddle/math/PoolAllocator.h
index 1544cb2cfca830c608d78c0f93032416820d604c..c06efa9ac77a5659b242d039c38455e2ee9b0db6 100644
--- a/paddle/math/PoolAllocator.h
+++ b/paddle/math/PoolAllocator.h
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #pragma once
 
+#include <map>
 #include <memory>
 #include <mutex>
-#include <vector>
 #include <unordered_map>
-#include <map>
+#include <vector>
 #include "Allocator.h"
 
 namespace paddle {
diff --git a/paddle/math/SparseMatrix.cpp b/paddle/math/SparseMatrix.cpp
index d2779cc9f51a8440598d6d31b428f240363d7b26..9154503c2132a740aaa42f90eb7061156403ac00 100644
--- a/paddle/math/SparseMatrix.cpp
+++ b/paddle/math/SparseMatrix.cpp
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "SparseMatrix.h"
 #include <algorithm>
+#include <iostream>
 #include <vector>
 #include "hl_gpu.h"
-#include "SparseMatrix.h"
-#include "paddle/utils/Util.h"
 #include "hl_top_k.h"
-#include <iostream>
+#include "paddle/utils/Util.h"
 
 namespace paddle {
 
@@ -537,11 +537,9 @@ void GpuSparseMatrix::transpose(MatrixPtr matTrans, bool memAlloc) {
     dataVec.emplace_back(
         rows.getData()[i], cols_full.getData()[i], value.getData()[i]);
   }
-  std::sort(dataVec.begin(),
-            dataVec.end(),
-            [](Element a, Element b) {
-              return a.row < b.row || (a.row == b.row && a.col < b.col);
-            });
+  std::sort(dataVec.begin(), dataVec.end(), [](Element a, Element b) {
+    return a.row < b.row || (a.row == b.row && a.col < b.col);
+  });
 
   /*get sorted data, row index, and col index, put them in the right place*/
   cols.resize(height_ + 1);
diff --git a/paddle/math/SparseMatrix.h b/paddle/math/SparseMatrix.h
index f8d9ffc29fb721d5af2df3cc3f133419636d8fef..bd96a3301ded2fd89bd31b94f42b0cb4718cbcb7 100644
--- a/paddle/math/SparseMatrix.h
+++ b/paddle/math/SparseMatrix.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 #include <cstddef>
-#include "Matrix.h"
 #include "CpuSparseMatrix.h"
+#include "Matrix.h"
 
 namespace paddle {
 
diff --git a/paddle/math/SparseRowMatrix.cpp b/paddle/math/SparseRowMatrix.cpp
index 3091743123af03561f91dfb8b03e65087310ce64..b61c6b2d49ccead5e9cfdf595a8bebae0e5b87b5 100644
--- a/paddle/math/SparseRowMatrix.cpp
+++ b/paddle/math/SparseRowMatrix.cpp
@@ -24,9 +24,9 @@ limitations under the License. */
 #include "paddle/utils/Thread.h"
 #include "paddle/utils/Util.h"
 
-P_DEFINE_bool(allow_inefficient_sparse_update,
-              false,
-              "Whether to allow inefficient sparse update");
+DEFINE_bool(allow_inefficient_sparse_update,
+            false,
+            "Whether to allow inefficient sparse update");
 
 namespace paddle {
 
diff --git a/paddle/math/SparseRowMatrix.h b/paddle/math/SparseRowMatrix.h
index 2fee1b39fe34af534ce1a739da0f35de6826581b..9364feb4a1462a5a9d16ca0f69213ba32ad97d21 100644
--- a/paddle/math/SparseRowMatrix.h
+++ b/paddle/math/SparseRowMatrix.h
@@ -14,13 +14,13 @@ limitations under the License. */
 
 #pragma once
 
-#include <algorithm>
 #include <string.h>
-#include "paddle/utils/CommandLineParser.h"
+#include <algorithm>
 #include "Matrix.h"
+#include "paddle/utils/CommandLineParser.h"
 #include "paddle/utils/Util.h"
 
-P_DECLARE_bool(allow_inefficient_sparse_update);
+DECLARE_bool(allow_inefficient_sparse_update);
 
 namespace paddle {
 
diff --git a/paddle/math/Storage.cpp b/paddle/math/Storage.cpp
index 0170b4efb80284ba930dbbd9fb0bf0ef9b6e2d6c..56e5442394b04230c22d668aa734dc0fa44004c2 100644
--- a/paddle/math/Storage.cpp
+++ b/paddle/math/Storage.cpp
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Util.h"
-#include "Allocator.h"
 #include "Storage.h"
+#include "Allocator.h"
+#include "paddle/utils/Util.h"
 
-P_DEFINE_int32(pool_limit_size,
-               536870912,
-               "maximum memory size managed by a memory pool, default is 512M");
+DEFINE_int32(pool_limit_size,
+             536870912,
+             "maximum memory size managed by a memory pool, default is 512M");
 
 namespace paddle {
 
diff --git a/paddle/math/Storage.h b/paddle/math/Storage.h
index 36583201827fd53dd26d40dd15d1f03e6b0498fa..06a66b5f14643153f82a1596096fc28d3e47e3fd 100644
--- a/paddle/math/Storage.h
+++ b/paddle/math/Storage.h
@@ -16,8 +16,8 @@ limitations under the License. */
 
 #include <mutex>
 #include <vector>
-#include "paddle/utils/Locks.h"
 #include "PoolAllocator.h"
+#include "paddle/utils/Locks.h"
 
 namespace paddle {
 
diff --git a/paddle/math/TensorEvaluate.h b/paddle/math/TensorEvaluate.h
index 346ed7ab13a82599e9c5e06723dcaed82659aafd..9de2099b850d1723fe085eeed97c5b141629eec1 100644
--- a/paddle/math/TensorEvaluate.h
+++ b/paddle/math/TensorEvaluate.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 
 #include <algorithm>
-#include "paddle/utils/Logging.h"
 #include "hl_base.h"
+#include "paddle/utils/Logging.h"
 
 namespace paddle {
 
diff --git a/paddle/math/TensorExpression.h b/paddle/math/TensorExpression.h
index 7f28ad83bb078ed257aaa0e22bca85597c9ca486..9bd789e8c511f33d8415e421281e99eb10fc63fe 100644
--- a/paddle/math/TensorExpression.h
+++ b/paddle/math/TensorExpression.h
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include <cstddef>
 #include <stdint.h>
-#include "paddle/utils/TypeDefs.h"
-#include "paddle/utils/Logging.h"
+#include <cstddef>
 #include "hl_tensor_ops.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/TypeDefs.h"
 
 namespace paddle {
 
diff --git a/paddle/math/TrainingAlgorithmOp.h b/paddle/math/TrainingAlgorithmOp.h
index 2dc56f69e5cf1a31038ca737ffe6a8e43c32fa9e..881a8d72d888083ad87a536c127009d68c51076e 100644
--- a/paddle/math/TrainingAlgorithmOp.h
+++ b/paddle/math/TrainingAlgorithmOp.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/utils/Logging.h"
 #include "BaseMatrix.h"
+#include "paddle/utils/Logging.h"
 
 namespace paddle {
 
diff --git a/paddle/math/Vector.cpp b/paddle/math/Vector.cpp
index 484f4c925255c524fbde3d513098ca14c870a12b..eaa1cdce305c2f9d7a517e9e8c8606dc1f70780b 100644
--- a/paddle/math/Vector.cpp
+++ b/paddle/math/Vector.cpp
@@ -12,17 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Util.h"
 #include "Vector.h"
+#include "paddle/utils/Util.h"
 
 #include <memory>
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/ThreadLocal.h"
-#include "paddle/utils/Thread.h"
-#include "paddle/utils/Flags.h"
 #include "Matrix.h"
 #include "hl_gpu.h"
 #include "hl_table_apply.h"
+#include "paddle/utils/Flags.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Thread.h"
+#include "paddle/utils/ThreadLocal.h"
 
 namespace paddle {
 
@@ -754,8 +754,7 @@ void ParallelCpuVectorT<real>::exec(SyncThreadPool::JobFunc func) {
 }
 
 template <class T>
-CpuGpuVectorT<T>::CpuGpuVectorT(size_t size, bool useGpu)
-    : sync_(nullptr) {
+CpuGpuVectorT<T>::CpuGpuVectorT(size_t size, bool useGpu) : sync_(nullptr) {
   if (!useGpu) {
     cpuVectorT_ = std::make_shared<CpuVectorT<T>>(size);
   } else {
diff --git a/paddle/math/Vector.h b/paddle/math/Vector.h
index 535580ac3739b99bee420dd0c834807e6f7b8b35..8a24103bd4107035c8068c24ec3be6ec06957112 100644
--- a/paddle/math/Vector.h
+++ b/paddle/math/Vector.h
@@ -14,15 +14,15 @@ limitations under the License. */
 
 #pragma once
 
-#include <memory>
 #include <cmath>
+#include <memory>
 
 #include <hl_gpu.h>
 
-#include "MemoryHandle.h"
-#include "paddle/utils/TypeDefs.h"
 #include "BaseMatrix.h"
+#include "MemoryHandle.h"
 #include "paddle/utils/Thread.h"
+#include "paddle/utils/TypeDefs.h"
 
 namespace paddle {
 
diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt
index fe5177291c21c3505c3694201b36b54397150ccf..a3ea078509704f305672d0b02d272de0f6c97f51 100644
--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -16,12 +16,10 @@ add_simple_unittest(test_CpuGpuVector)
 add_simple_unittest(test_Allocator)
 
 if(WITH_GPU)
-    if(COMPILER_SUPPORT_CXX11)
-    	CUDA_ADD_EXECUTABLE(test_Tensor test_Tensor.cu)
-		link_paddle_test(test_Tensor)
-        CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu)
-        link_paddle_test(test_lazyAssign)
-    endif()
+    CUDA_ADD_EXECUTABLE(test_Tensor test_Tensor.cu)
+    link_paddle_test(test_Tensor)
+    CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu)
+    link_paddle_test(test_lazyAssign)
 else()
     compile_cu_as_cpp(test_Tensor.cu)
     add_unittest(test_Tensor test_Tensor.cu)
diff --git a/paddle/math/tests/OriginalOptimizerApi.h b/paddle/math/tests/OriginalOptimizerApi.h
index ddcdd6bb5122a2316d3b29bc23b054c9ba2a1882..0188372771d97942a0761c673d40d040528ff59a 100644
--- a/paddle/math/tests/OriginalOptimizerApi.h
+++ b/paddle/math/tests/OriginalOptimizerApi.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/utils/GlobalConstants.h"
 #include "paddle/math/Vector.h"
+#include "paddle/utils/GlobalConstants.h"
 
 using namespace paddle;  // NOLINT
 
diff --git a/paddle/math/tests/TestUtils.h b/paddle/math/tests/TestUtils.h
index 5f9fab7245004f8449c3865b76f5f29f7f47d646..c3020961880484a7944f8cc61377a4f08122e403 100644
--- a/paddle/math/tests/TestUtils.h
+++ b/paddle/math/tests/TestUtils.h
@@ -40,9 +40,9 @@ limitations under the License. */
 */
 
 #include <gtest/gtest.h>
+#include "TensorCheck.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"
-#include "TensorCheck.h"
 
 namespace autotest {
 
diff --git a/paddle/math/tests/test_Allocator.cpp b/paddle/math/tests/test_Allocator.cpp
index 440fcda0fe1fd8ef2f395148eb430058c80986df..33e0952efedddec16acf6153209e14f18fd48134 100644
--- a/paddle/math/tests/test_Allocator.cpp
+++ b/paddle/math/tests/test_Allocator.cpp
@@ -13,11 +13,11 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include "paddle/utils/Util.h"
 #include "paddle/utils/Logging.h"
+#include "paddle/utils/Util.h"
 #define private public
-#include "paddle/math/MemoryHandle.h"
 #include "paddle/math/Allocator.h"
+#include "paddle/math/MemoryHandle.h"
 #include "paddle/math/PoolAllocator.h"
 
 using namespace paddle;  // NOLINT
diff --git a/paddle/math/tests/test_BaseMatrix.cpp b/paddle/math/tests/test_BaseMatrix.cpp
index a4683918cade77c1e99461a668fb76b6c5eac726..cc7c1e7eb2734605cb278a4b97cab22bdba1594e 100644
--- a/paddle/math/tests/test_BaseMatrix.cpp
+++ b/paddle/math/tests/test_BaseMatrix.cpp
@@ -20,8 +20,8 @@ limitations under the License. */
  */
 
 #include <gtest/gtest.h>
-#include "paddle/math/BaseMatrix.h"
 #include "TestUtils.h"
+#include "paddle/math/BaseMatrix.h"
 
 using paddle::BaseMatrix;
 using paddle::Matrix;
diff --git a/paddle/math/tests/test_CpuGpuVector.cpp b/paddle/math/tests/test_CpuGpuVector.cpp
index c671735875b671c3959e28ec54b78d7f4ba3ea41..624fa20ca58bca3f16fa567487bbaa5d9656e1b1 100644
--- a/paddle/math/tests/test_CpuGpuVector.cpp
+++ b/paddle/math/tests/test_CpuGpuVector.cpp
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #ifndef PADDLE_ONLY_CPU
 
-#include "paddle/utils/Util.h"
+#include <gtest/gtest.h>
 #include "paddle/math/Vector.h"
+#include "paddle/utils/Util.h"
 #include "test_matrixUtil.h"
-#include <gtest/gtest.h>
 
 using namespace paddle;  // NOLINT
 
diff --git a/paddle/math/tests/test_ExecViaCpu.cpp b/paddle/math/tests/test_ExecViaCpu.cpp
index b328ebf554cb2903f48e07754d296c3f8e96221e..27216ddb58eccd7fd52e121e795baf463ea69f51 100644
--- a/paddle/math/tests/test_ExecViaCpu.cpp
+++ b/paddle/math/tests/test_ExecViaCpu.cpp
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <paddle/utils/PythonUtil.h>
 #include <gtest/gtest.h>
-#include <vector>
+#include <paddle/utils/PythonUtil.h>
 #include <paddle/utils/Util.h>
+#include <vector>
 #include "paddle/math/SparseMatrix.h"
 
 using namespace paddle;  // NOLINT
diff --git a/paddle/math/tests/test_GpuProfiler.cpp b/paddle/math/tests/test_GpuProfiler.cpp
index e5fd6f4523e0bbbf54f48949ca88235e07122fcb..d490078d909e7940e83a6f461f9386eeda02f53c 100644
--- a/paddle/math/tests/test_GpuProfiler.cpp
+++ b/paddle/math/tests/test_GpuProfiler.cpp
@@ -14,12 +14,12 @@ limitations under the License. */
 
 #ifndef PADDLE_ONLY_CPU
 
-#include "paddle/utils/Util.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/SparseMatrix.h"
 #include <gtest/gtest.h>
 #include "paddle/gserver/tests/TestUtil.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/SparseMatrix.h"
 #include "paddle/utils/Stat.h"
+#include "paddle/utils/Util.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
@@ -52,7 +52,9 @@ void MatrixCheckErr(const Matrix& matrix1, const Matrix& matrix2) {
   EXPECT_EQ(count, 0) << "There are " << count << " different element.";
 }
 
-void testBilinearFwdBwd(int numSamples, int imgSizeH, int imgSizeW,
+void testBilinearFwdBwd(int numSamples,
+                        int imgSizeH,
+                        int imgSizeW,
                         int channels) {
   int inWidth = imgSizeH * imgSizeW * channels;
   int outWidth = 2 * imgSizeH * 2 * imgSizeW * channels;
@@ -73,10 +75,22 @@ void testBilinearFwdBwd(int numSamples, int imgSizeH, int imgSizeW,
   {
     // nvprof: GPU Proflier
     REGISTER_GPU_PROFILER("testBilinearFwdBwd");
-    target->bilinearForward(*input, imgSizeH, imgSizeW,
-        2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
-    targetGpu->bilinearForward(*inputGpu, imgSizeH, imgSizeW,
-        2 * imgSizeH, 2 * imgSizeW, channels, ratioH, ratioW);
+    target->bilinearForward(*input,
+                            imgSizeH,
+                            imgSizeW,
+                            2 * imgSizeH,
+                            2 * imgSizeW,
+                            channels,
+                            ratioH,
+                            ratioW);
+    targetGpu->bilinearForward(*inputGpu,
+                               imgSizeH,
+                               imgSizeW,
+                               2 * imgSizeH,
+                               2 * imgSizeW,
+                               channels,
+                               ratioH,
+                               ratioW);
   }
 
   // check
@@ -88,8 +102,8 @@ void testBilinearFwdBwd(int numSamples, int imgSizeH, int imgSizeW,
   MatrixPtr inputGpuGrad = GpuMatrix::create(numSamples, inWidth, false, true);
 
   MatrixPtr targetGrad = CpuMatrix::create(numSamples, outWidth, false, false);
-  MatrixPtr targetGpuGrad = GpuMatrix::create(numSamples, outWidth, false,
-                                              true);
+  MatrixPtr targetGpuGrad =
+      GpuMatrix::create(numSamples, outWidth, false, true);
   MatrixPtr targetCheckGrad =
       CpuMatrix::create(numSamples, inWidth, false, false);
 
@@ -98,10 +112,22 @@ void testBilinearFwdBwd(int numSamples, int imgSizeH, int imgSizeW,
   inputGpuGrad->copyFrom(*inputGrad);
   targetGpuGrad->copyFrom(*targetGrad);
 
-  inputGrad->bilinearBackward(*targetGrad, 2 * imgSizeH, 2 * imgSizeW,
-      imgSizeH, imgSizeW, channels, ratioH, ratioW);
-  inputGpuGrad->bilinearBackward(*targetGpuGrad, 2 * imgSizeH, 2 * imgSizeW,
-      imgSizeH, imgSizeW, channels, ratioH, ratioW);
+  inputGrad->bilinearBackward(*targetGrad,
+                              2 * imgSizeH,
+                              2 * imgSizeW,
+                              imgSizeH,
+                              imgSizeW,
+                              channels,
+                              ratioH,
+                              ratioW);
+  inputGpuGrad->bilinearBackward(*targetGpuGrad,
+                                 2 * imgSizeH,
+                                 2 * imgSizeW,
+                                 imgSizeH,
+                                 imgSizeW,
+                                 channels,
+                                 ratioH,
+                                 ratioW);
 
   // check
   targetCheckGrad->copyFrom(*inputGpuGrad);
@@ -116,8 +142,9 @@ TEST(Profiler, testBilinearFwdBwd) {
     // nvprof: GPU Proflier
     REGISTER_GPU_PROFILER("testBilinearFwdBwd");
     // Paddle built-in timer
-    REGISTER_TIMER_INFO("testBilinearFwdBwd",
-      "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
+    REGISTER_TIMER_INFO(
+        "testBilinearFwdBwd",
+        "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
     testBilinearFwdBwd(numSamples, imgSize, imgSize, channels);
   }
   globalStat.printAllStatus();
@@ -128,8 +155,9 @@ int main(int argc, char** argv) {
   initMain(argc, argv);
 
   // nvprof: GPU Proflier
-  REGISTER_GPU_PROFILER("RecursiveProfilingTest",
-    "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
+  REGISTER_GPU_PROFILER(
+      "RecursiveProfilingTest",
+      "numSamples = 10, channels = 16, imgSizeX = 64, imgSizeY = 64");
 
   return RUN_ALL_TESTS();
 }
diff --git a/paddle/math/tests/test_SIMDFunctions.cpp b/paddle/math/tests/test_SIMDFunctions.cpp
index 2c54121d996ce1778a060552bf5cf33aaae93a64..f62843310d886ba7d449e793066b19a7cc7bd5a9 100644
--- a/paddle/math/tests/test_SIMDFunctions.cpp
+++ b/paddle/math/tests/test_SIMDFunctions.cpp
@@ -17,10 +17,10 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 
-#include <random>
-#include <functional>
 #include <algorithm>
+#include <functional>
 #include <memory>
+#include <random>
 
 #include <stdlib.h>
 #include <time.h>
diff --git a/paddle/math/tests/test_TrainingAlgorithm.cpp b/paddle/math/tests/test_TrainingAlgorithm.cpp
index 93a930cc2f0f90887c649cf4411adbab2f99ed38..2c458cba9ca11e9af8a98b88a6392978c2a9be77 100644
--- a/paddle/math/tests/test_TrainingAlgorithm.cpp
+++ b/paddle/math/tests/test_TrainingAlgorithm.cpp
@@ -13,18 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include "paddle/utils/Util.h"
-#include "paddle/math/TrainingAlgorithmOp.h"
 #include "OriginalOptimizerApi.h"
-#include "TensorCheck.h"
 #include "PerfUtils.h"
+#include "TensorCheck.h"
+#include "paddle/math/TrainingAlgorithmOp.h"
+#include "paddle/utils/Util.h"
 
 using namespace paddle;  // NOLINT
 
 #ifndef PADDLE_TYPE_DOUBLE
-P_DEFINE_double(max_diff, 1e-5, "max diff allowed");
+DEFINE_double(max_diff, 1e-5, "max diff allowed");
 #else
-P_DEFINE_double(max_diff, 1e-13, "max diff allowed");
+DEFINE_double(max_diff, 1e-13, "max diff allowed");
 #endif
 
 class SetMaxDiff {
diff --git a/paddle/math/tests/test_batchTranspose.cpp b/paddle/math/tests/test_batchTranspose.cpp
index 88631c62b893d54a3b1ca16317fbf42faa332be7..9925e24dc14294ec70806ffd9cc496ea01beaa43 100644
--- a/paddle/math/tests/test_batchTranspose.cpp
+++ b/paddle/math/tests/test_batchTranspose.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "test_matrixUtil.h"
 #include "hl_batch_transpose.h"
+#include "test_matrixUtil.h"
 
 using namespace paddle;  // NOLINT
 
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index 713792d82b3c569d26375780cc19fa0bd6cca391..62de5b25e4cc803d9ccc605fba29a1d29a3ea69c 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -16,13 +16,13 @@ limitations under the License. */
 /// This unittest checks GpuMatrix/CpuMatrix get same result, so disable when
 /// only cpu version.
 
-#include "paddle/utils/Util.h"
-#include "paddle/math/Matrix.h"
-#include "paddle/math/SparseMatrix.h"
 #include <gtest/gtest.h>
+#include "TensorCheck.h"
 #include "paddle/gserver/tests/TestUtil.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/SparseMatrix.h"
 #include "paddle/utils/Stat.h"
-#include "TensorCheck.h"
+#include "paddle/utils/Util.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/math/tests/test_perturbation.cpp b/paddle/math/tests/test_perturbation.cpp
index eaf4dfea664d0e405f39fbc021bcae98d6e3ca83..60ebae015381a3901c14d0cd4c1225e54ac5726f 100644
--- a/paddle/math/tests/test_perturbation.cpp
+++ b/paddle/math/tests/test_perturbation.cpp
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #ifndef PADDLE_ONLY_CPU
 
-#include <cmath>
+#include <cuda_runtime.h>
 #include <gtest/gtest.h>
+#include <cmath>
 #include <vector>
-#include <cuda_runtime.h>
 #include "hl_cuda.h"
 #include "hl_perturbation_util.cuh"
 
diff --git a/paddle/math/tests/test_sparseMatrixCompare.cpp b/paddle/math/tests/test_sparseMatrixCompare.cpp
index eff2c502bb88e454a57d3ae534ca60caccf93d4a..6f6de238bacaade85d728b7d773145326229015a 100644
--- a/paddle/math/tests/test_sparseMatrixCompare.cpp
+++ b/paddle/math/tests/test_sparseMatrixCompare.cpp
@@ -17,10 +17,10 @@ limitations under the License. */
 //  so disable when
 /// only cpu version.
 
-#include "paddle/utils/Util.h"
+#include <gtest/gtest.h>
 #include "paddle/math/Matrix.h"
+#include "paddle/utils/Util.h"
 #include "test_matrixUtil.h"
-#include <gtest/gtest.h>
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp
index b632a11bbdac07a9ce45f38aebe076f72a7b0870..65d01a15718ae2bebd4869eff0e5407524bc0e7c 100644
--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -245,6 +245,8 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src,
                                     bool useGpu,
                                     hl_stream_t stream) {
   dataId = src.dataId;
+  frameWidth = src.frameWidth;
+  frameHeight = src.frameHeight;
 
   if (!src.sequenceStartPositions) {
     // non-sequence input, copy samples directly
@@ -551,11 +553,10 @@ void Argument::getSeqInfo(std::vector<SeqInfo>* seqInfo) const {
     }
     seqInfo->push_back(info);
   }
-  std::sort(seqInfo->begin(),
-            seqInfo->end(),
-            [](const SeqInfo& a, const SeqInfo& b) {
-              return a.topLevelLength > b.topLevelLength;
-            });
+  std::sort(
+      seqInfo->begin(), seqInfo->end(), [](const SeqInfo& a, const SeqInfo& b) {
+        return a.topLevelLength > b.topLevelLength;
+      });
 }
 
 void Argument::checkSubset() const {
diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h
index 69d57a28c058173494d54c22a3eda33f6e339db9..afd2de0202bf0f14ec3d4c5b856455a3488e41f6 100644
--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -18,9 +18,9 @@ limitations under the License. */
 
 #include "paddle/math/Matrix.h"
 #include "paddle/math/Vector.h"
+#include "paddle/parameter/Parameter.h"
 #include "paddle/utils/Locks.h"
 #include "paddle/utils/Util.h"
-#include "paddle/parameter/Parameter.h"
 
 namespace paddle {
 
diff --git a/paddle/parameter/FirstOrderOptimizer.cpp b/paddle/parameter/FirstOrderOptimizer.cpp
index 17268d37150c10dc4524184c05fa1f935f31847d..dbb738e98b5874f5bb33026ad585a6c3ef327d1d 100644
--- a/paddle/parameter/FirstOrderOptimizer.cpp
+++ b/paddle/parameter/FirstOrderOptimizer.cpp
@@ -12,14 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Util.h"
-#include "paddle/utils/Flags.h"
-#include "paddle/math/TrainingAlgorithmOp.h"
 #include "FirstOrderOptimizer.h"
+#include "paddle/math/TrainingAlgorithmOp.h"
+#include "paddle/utils/Flags.h"
+#include "paddle/utils/Util.h"
 
 #include <cmath>
 
-P_DEFINE_bool(log_clipping, false, "enable log clipping or not");
+DEFINE_bool(log_clipping, false, "enable log clipping or not");
 
 namespace paddle {
 
diff --git a/paddle/parameter/ParallelParameter.cpp b/paddle/parameter/ParallelParameter.cpp
index b3182306a4115a5d5d9ad32f57437c66da8a95d9..cea77e5b1787c25ecb9ccd42e948bf90973fd4cb 100644
--- a/paddle/parameter/ParallelParameter.cpp
+++ b/paddle/parameter/ParallelParameter.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include <fstream>
+#include "paddle/utils/Logging.h"
 
 #include "ParallelParameter.h"
 
diff --git a/paddle/parameter/ParallelParameter.h b/paddle/parameter/ParallelParameter.h
index b0fe82d3c40ef1c670642d405da46b1ac51223b5..417e386dc74d308a6c0aefa2640f0f37de8dbf1f 100644
--- a/paddle/parameter/ParallelParameter.h
+++ b/paddle/parameter/ParallelParameter.h
@@ -16,19 +16,19 @@ limitations under the License. */
 
 #include <stdint.h>
 
+#include <sys/time.h>
+#include <unistd.h>
 #include <iostream>
 #include <string>
 #include <vector>
-#include <sys/time.h>
-#include <unistd.h>
 
 #include "hl_gpu.h"
-#include "paddle/utils/Flags.h"
-#include "paddle/utils/Locks.h"
+#include "paddle/math/Vector.h"
 #include "paddle/parameter/Parameter.h"
 #include "paddle/parameter/ParameterUpdateFunctions.h"
+#include "paddle/utils/Flags.h"
+#include "paddle/utils/Locks.h"
 #include "paddle/utils/TypeDefs.h"
-#include "paddle/math/Vector.h"
 
 #include "ParameterConfig.pb.h"
 
diff --git a/paddle/parameter/Parameter.cpp b/paddle/parameter/Parameter.cpp
index 3b06650e0ca0b36fb7e764e7ef924b5592242518..1673fc6e533e416dfe4db557a1a8968667d1bfff 100644
--- a/paddle/parameter/Parameter.cpp
+++ b/paddle/parameter/Parameter.cpp
@@ -12,25 +12,25 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "Parameter.h"
 #include <fstream>
-#include "paddle/math/MathUtils.h"
 #include "AverageOptimizer.h"
 #include "FirstOrderOptimizer.h"
-#include "Parameter.h"
-#include "paddle/utils/Logging.h"
 #include "OptimizerFunctions.h"
 #include "OptimizerWithRegularizer.h"
 #include "ParameterUpdateFunctions.h"
-#include "paddle/math/SparseRowMatrix.h"
-#include "paddle/math/CpuSparseMatrix.h"
 #include "hl_gpu.h"
+#include "paddle/math/CpuSparseMatrix.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/SparseRowMatrix.h"
 #include "paddle/utils/CommandLineParser.h"
+#include "paddle/utils/Logging.h"
 
-P_DEFINE_int32(enable_grad_share,
-               (100 * 1024 * 1024),
-               "threshold for enable gradient parameter share for batch "
-               "multi-cpu training");
-P_DEFINE_int32(
+DEFINE_int32(enable_grad_share,
+             (100 * 1024 * 1024),
+             "threshold for enable gradient parameter share for batch "
+             "multi-cpu training");
+DEFINE_int32(
     grad_share_block_num,
     64,
     "block number of gradient parameter share for batch multi-cpu training");
diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h
index 6b0600517a95bd82a113a4b6133254a6515559c6..532c6770e596c33dfe7fd42f32157b2c6c19e18e 100644
--- a/paddle/parameter/Parameter.h
+++ b/paddle/parameter/Parameter.h
@@ -23,14 +23,14 @@ limitations under the License. */
 #include "ParameterConfig.pb.h"
 #include "TrainerConfig.pb.h"
 
+#include "ParameterUpdaterHook.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
+#include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/Locks.h"
+#include "paddle/utils/ThreadLocal.h"
 #include "paddle/utils/TypeDefs.h"
-#include "paddle/math/Vector.h"
-#include "paddle/math/Matrix.h"
 #include "paddle/utils/Util.h"
-#include "paddle/utils/ThreadLocal.h"
-#include "ParameterUpdaterHook.h"
-#include "paddle/utils/GlobalConstants.h"
 
 namespace paddle {
 
diff --git a/paddle/parameter/ParameterUpdateFunctions.h b/paddle/parameter/ParameterUpdateFunctions.h
index 7374843d80de7c2c93d3310b07f09ce1620910d2..2d277e47e7eafc118fa37343e93e8a331a260aa9 100644
--- a/paddle/parameter/ParameterUpdateFunctions.h
+++ b/paddle/parameter/ParameterUpdateFunctions.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/utils/TypeDefs.h"
 #include "paddle/math/Vector.h"
+#include "paddle/utils/TypeDefs.h"
 
 namespace paddle {
 
diff --git a/paddle/parameter/ParameterUpdaterBase.cpp b/paddle/parameter/ParameterUpdaterBase.cpp
index b938270ce1884674b513b326192adb58af5ec9f4..49e2ae2b393f4a5e6c0986bc5e645011f5a3eca1 100644
--- a/paddle/parameter/ParameterUpdaterBase.cpp
+++ b/paddle/parameter/ParameterUpdaterBase.cpp
@@ -12,10 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <fstream>
-#include "paddle/utils/Logging.h"
 #include "ParameterUpdaterBase.h"
+#include <fstream>
 #include "hl_gpu.h"
+#include "paddle/utils/Logging.h"
 
 namespace paddle {
 
diff --git a/paddle/parameter/ParameterUpdaterHook.cpp b/paddle/parameter/ParameterUpdaterHook.cpp
index 466560c4376444f96927211d330f900cd4d66287..f826e8448c666bb3305c150f2bd95aade23223fb 100644
--- a/paddle/parameter/ParameterUpdaterHook.cpp
+++ b/paddle/parameter/ParameterUpdaterHook.cpp
@@ -14,16 +14,16 @@ limitations under the License. */
 
 #include "ParameterUpdaterHook.h"
 
+#include <atomic>
 #include <fstream>
-#include <unordered_map>
 #include <mutex>
-#include <atomic>
 #include <thread>
+#include <unordered_map>
 
 #include "paddle/math/Vector.h"
 #include "paddle/parameter/Parameter.h"
-#include "paddle/utils/Util.h"
 #include "paddle/utils/Flags.h"
+#include "paddle/utils/Util.h"
 
 namespace paddle {
 
@@ -156,7 +156,8 @@ private:
 
 static WeakKVCache<std::pair<std::string, int>,
                    IParameterUpdaterHook,
-                   StringIntPairHasher> g_hookCache_;
+                   StringIntPairHasher>
+    g_hookCache_;
 
 /**
  * ParameterUpdaterHook actually factory method.
diff --git a/paddle/parameter/Regularizer.cpp b/paddle/parameter/Regularizer.cpp
index 4420ee00311d57ae0832529d16645351ccb852b0..8511900150363a2247d508833eeb42b2d87beec1 100644
--- a/paddle/parameter/Regularizer.cpp
+++ b/paddle/parameter/Regularizer.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Util.h"
-#include "paddle/utils/Flags.h"
 #include "Regularizer.h"
+#include "paddle/utils/Flags.h"
+#include "paddle/utils/Util.h"
 
 namespace paddle {
 
diff --git a/paddle/parameter/Weight.cpp b/paddle/parameter/Weight.cpp
index f366a2b53f4a527fbb7ca9e0ecfe7ff3084b289e..3738a58d7f84081db9b6179cef9361322553a627 100644
--- a/paddle/parameter/Weight.cpp
+++ b/paddle/parameter/Weight.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Logging.h"
 #include "Weight.h"
+#include "paddle/utils/Logging.h"
 
 namespace paddle {
 
diff --git a/paddle/parameter/tests/test_common.cpp b/paddle/parameter/tests/test_common.cpp
index 4e4d0ccfa26a06e1516a03b5dbd61fad64513691..aa57a6346917b259dbb89f6ad2340fb8db28f3e3 100644
--- a/paddle/parameter/tests/test_common.cpp
+++ b/paddle/parameter/tests/test_common.cpp
@@ -12,12 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <stdlib.h>
 #include <paddle/utils/Util.h>
+#include <stdlib.h>
 
 #include <gtest/gtest.h>
-#include <paddle/utils/Flags.h>
 #include <paddle/parameter/ParameterUpdateFunctions.h>
+#include <paddle/utils/Flags.h>
 #include <paddle/utils/Stat.h>
 #include <paddle/utils/Thread.h>
 
diff --git a/paddle/pserver/BaseClient.cpp b/paddle/pserver/BaseClient.cpp
index 62fafc18918c631e8b13dd4bf49e4db1cf203f96..b4ac7a2506921b2409baaff077cc3541f3dc8d73 100644
--- a/paddle/pserver/BaseClient.cpp
+++ b/paddle/pserver/BaseClient.cpp
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <vector>
+#include "BaseClient.h"
 #include <string.h>
-#include "paddle/utils/Stat.h"
+#include <vector>
 #include "paddle/utils/CommandLineParser.h"
-#include "BaseClient.h"
+#include "paddle/utils/Stat.h"
 
-P_DECLARE_string(pservers);
+DECLARE_string(pservers);
 
 namespace paddle {
 
diff --git a/paddle/pserver/BaseClient.h b/paddle/pserver/BaseClient.h
index 5924f80684ee031c403e72ad4f1c7d84011b431c..262afafbe2d61305a158d945fac2d3b265012cbd 100644
--- a/paddle/pserver/BaseClient.h
+++ b/paddle/pserver/BaseClient.h
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/pserver/ProtoServer.h"
+#include "ParameterService.pb.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/pserver/ProtoServer.h"
 #include "paddle/utils/Queue.h"
 #include "paddle/utils/TypeDefs.h"
-#include "ParameterService.pb.h"
 
 namespace paddle {
 
diff --git a/paddle/pserver/LightNetwork.cpp b/paddle/pserver/LightNetwork.cpp
index 9a398d4f455516f630040831623bec3fdc0a4bae..cbc105e651faa0f283b3becb10449f4e1bc78b38 100644
--- a/paddle/pserver/LightNetwork.cpp
+++ b/paddle/pserver/LightNetwork.cpp
@@ -12,42 +12,42 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <sys/types.h>
-#include <sys/socket.h>
+#include <fcntl.h>
 #include <netdb.h>
 #include <netinet/in.h>
 #include <netinet/tcp.h>
-#include <fcntl.h>
+#include <sys/socket.h>
+#include <sys/types.h>
 
 #include <arpa/inet.h>
-#include <sys/ioctl.h>
 #include <net/if.h>
 #include <net/if_arp.h>
+#include <sys/ioctl.h>
 #include <sstream>
 
 #include "LightNetwork.h"
-#include "paddle/utils/Util.h"
-#include "paddle/utils/StringUtil.h"
 #include "RDMANetwork.h"
+#include "paddle/utils/StringUtil.h"
+#include "paddle/utils/Util.h"
 
 /// quick ack can reduce the latency of small message
-P_DEFINE_bool(small_messages,
-              false,
-              "if message size is small, recommend set it True to enable quick "
-              "ack and no delay");
+DEFINE_bool(small_messages,
+            false,
+            "if message size is small, recommend set it True to enable quick "
+            "ack and no delay");
 
 /// reasonable sock_send_buf_size can control the traffic injected into switch
 /// network. Injecting too many data into traffic could cause packets loss which
 /// cause long latency and degrade the efficiency of communication.
-P_DEFINE_int32(sock_send_buf_size,
-               1024 * 1024 * 40,
-               "restrict sock send buff size, can reduce network congestion if "
-               "set carefully");
+DEFINE_int32(sock_send_buf_size,
+             1024 * 1024 * 40,
+             "restrict sock send buff size, can reduce network congestion if "
+             "set carefully");
 
 /// reasonable size can hold bursted packets and reduce packets loss
-P_DEFINE_int32(sock_recv_buf_size,
-               1024 * 1024 * 40,
-               "restrict sock recv buff size");
+DEFINE_int32(sock_recv_buf_size,
+             1024 * 1024 * 40,
+             "restrict sock recv buff size");
 
 namespace paddle {
 
diff --git a/paddle/pserver/LightNetwork.h b/paddle/pserver/LightNetwork.h
index 7aff007a2704570c627672f14dc64a677262bb02..c4a06deb940e8f39af2fcb6de54de1b6cb2d1483 100644
--- a/paddle/pserver/LightNetwork.h
+++ b/paddle/pserver/LightNetwork.h
@@ -16,10 +16,10 @@ limitations under the License. */
 
 #include "SocketChannel.h"
 
+#include <atomic>
 #include <memory>
 #include <thread>
 #include <vector>
-#include <atomic>
 
 #include "paddle/utils/Thread.h"
 
diff --git a/paddle/pserver/ParameterClient2.cpp b/paddle/pserver/ParameterClient2.cpp
index 31418822b3f83aed471cdb970dbdbcaf928e7f15..a97859f83fe6495b298e920346c964ef2a9b146c 100644
--- a/paddle/pserver/ParameterClient2.cpp
+++ b/paddle/pserver/ParameterClient2.cpp
@@ -15,34 +15,27 @@ limitations under the License. */
 #include <unistd.h>
 
 #include "ParameterClient2.h"
-#include "paddle/utils/StringUtil.h"
+#include "paddle/math/SparseRowMatrix.h"
 #include "paddle/utils/Flags.h"
 #include "paddle/utils/Stat.h"
-#include "paddle/math/SparseRowMatrix.h"
+#include "paddle/utils/StringUtil.h"
 
-P_DEFINE_string(pservers, "127.0.0.1", "Comma separated addresses of pservers");
-P_DEFINE_int32(parallel_thread_num, 1, "Thread number for parameter send");
+DEFINE_string(pservers, "127.0.0.1", "Comma separated addresses of pservers");
+DEFINE_int32(parallel_thread_num, 1, "Thread number for parameter send");
 
 namespace paddle {
 
-template <class T>
-void copyToRepeatedField(google::protobuf::RepeatedField<T>* dest,
-                         const T* src,
+template <typename T1, typename T2>
+void copyToRepeatedField(google::protobuf::RepeatedField<T1>* dest,
+                         const T2* src,
                          size_t size) {
   dest->Clear();
   dest->Reserve(size);
-
   for (size_t i = 0; i < size; ++i) {
     dest->AddAlreadyReserved(src[i]);
   }
 }
 
-template <class T>
-void copyToRepeatedField(const std::vector<T>& src,
-                         google::protobuf::RepeatedField<T>* dest) {
-  copyToRepeatedField(dest, &src[0], src.size());
-}
-
 ParameterClient2::ParameterClient2(bool separate, int port, int numPorts)
     : BaseClient(separate, numPorts), port_(port) {
 #ifndef PADDLE_DISABLE_TIMER
@@ -618,6 +611,8 @@ void PreparedOperations::addOperationHelper(Operation* op, CpuMatrixPtr mat) {
       pmat.mutable_values(), mat->getData(), pmat.num_cols() * pmat.num_rows());
 }
 
+static inline real addTwo(real a, double b) { return a + b; }
+
 void ParameterClient2::doOperation(PreparedOperations& ops,
                                    bool waitForGradient,
                                    bool sendBackGradient,
@@ -682,8 +677,11 @@ void ParameterClient2::doOperation(PreparedOperations& ops,
         CpuVectorPtr rvec = resultVectors[i];
         if (!rvec) continue;
         CHECK_EQ(rvec->getSize(), (size_t)vec.dim());
-        CpuVector avec(rvec->getSize(), const_cast<real*>(vec.values().data()));
-        rvec->add(avec);
+        std::transform(rvec->getData(),
+                       rvec->getData() + rvec->getSize(),
+                       vec.values().data(),
+                       rvec->getData(),
+                       addTwo);
       }
 
       CHECK_EQ(resultMatrices.size(), (size_t)result.matrices_size());
@@ -693,11 +691,12 @@ void ParameterClient2::doOperation(PreparedOperations& ops,
         if (!rmat) continue;
         CHECK_EQ(rmat->getHeight(), (size_t)mat.num_rows());
         CHECK_EQ(rmat->getWidth(), (size_t)mat.num_cols());
-        CpuMatrixPtr amat =
-            std::make_shared<CpuMatrix>(const_cast<real*>(mat.values().data()),
-                                        rmat->getHeight(),
-                                        rmat->getWidth());
-        rmat->add(*amat);
+
+        std::transform(rmat->getData(),
+                       rmat->getData() + rmat->getElementCnt(),
+                       mat.values().data(),
+                       rmat->getData(),
+                       addTwo);
       }
     }
   }
diff --git a/paddle/pserver/ParameterClient2.h b/paddle/pserver/ParameterClient2.h
index 0f180722e329ca62cc2598a4080fe8f173e87398..eed71ccb43b0fec76a74a7f00662c32c97c26ff4 100644
--- a/paddle/pserver/ParameterClient2.h
+++ b/paddle/pserver/ParameterClient2.h
@@ -16,25 +16,25 @@ limitations under the License. */
 
 #include <atomic>
 #include <mutex>
-#include <vector>
 #include <unordered_map>
+#include <vector>
 
-#include "paddle/utils/Locks.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
 #include "paddle/parameter/Parameter.h"
+#include "paddle/pserver/BaseClient.h"
+#include "paddle/utils/Flags.h"
+#include "paddle/utils/Locks.h"
 #include "paddle/utils/Queue.h"
 #include "paddle/utils/TypeDefs.h"
 #include "paddle/utils/Util.h"
-#include "paddle/math/Vector.h"
-#include "paddle/utils/Flags.h"
-#include "paddle/pserver/BaseClient.h"
 
 #include "ParameterService.pb.h"
 
-#include "SparseParameterDistribution.h"
 #include "ProtoServer.h"
+#include "SparseParameterDistribution.h"
 
-P_DECLARE_int32(parallel_thread_num);
+DECLARE_int32(parallel_thread_num);
 
 namespace paddle {
 
diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp
index ac70efc64f99f3bfef5a6d12dc7ff560bd10d25d..856fa0ad1ab30e3fc554ac96dd3bed71b1548579 100644
--- a/paddle/pserver/ParameterServer2.cpp
+++ b/paddle/pserver/ParameterServer2.cpp
@@ -21,20 +21,20 @@ limitations under the License. */
 
 #include "paddle/parameter/AverageOptimizer.h"
 #include "paddle/parameter/FirstOrderOptimizer.h"
-#include "paddle/utils/Flags.h"
 #include "paddle/parameter/OptimizerFunctions.h"
 #include "paddle/parameter/OptimizerWithRegularizer.h"
-#include "paddle/parameter/ParameterUpdateFunctions.h"
 #include "paddle/parameter/ParameterOptimizer.h"
+#include "paddle/parameter/ParameterUpdateFunctions.h"
 #include "paddle/parameter/Regularizer.h"
-#include "paddle/utils/Stat.h"
+#include "paddle/utils/Flags.h"
 #include "paddle/utils/GlobalConstants.h"
+#include "paddle/utils/Stat.h"
 
-P_DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec");
-P_DEFINE_double(async_lagged_ratio_min,
-                1.0,
-                "control config_.async_lagged_grad_discard_ratio() min value");
-P_DEFINE_double(
+DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec");
+DEFINE_double(async_lagged_ratio_min,
+              1.0,
+              "control config_.async_lagged_grad_discard_ratio() min value");
+DEFINE_double(
     async_lagged_ratio_default,
     1.5,
     "if async_lagged_grad_discard_ratio is not set in trainer_config.conf"
diff --git a/paddle/pserver/ParameterServer2.h b/paddle/pserver/ParameterServer2.h
index 47122f363218e5956e1617df1765942644923ae7..b0cf22e1fb158e76fcee1ce6ef1f375995803ce6 100644
--- a/paddle/pserver/ParameterServer2.h
+++ b/paddle/pserver/ParameterServer2.h
@@ -15,30 +15,30 @@ limitations under the License. */
 #pragma once
 
 #include <atomic>
+#include <limits>
 #include <mutex>
 #include <string>
-#include <vector>
-#include <unordered_map>
 #include <type_traits>
-#include <limits>
+#include <unordered_map>
+#include <vector>
 
 #include <stddef.h>
 #include <stdlib.h>
 
-#include "paddle/utils/Locks.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
 #include "paddle/parameter/Parameter.h"
 #include "paddle/parameter/ParameterOptimizer.h"
+#include "paddle/utils/Locks.h"
+#include "paddle/utils/Stat.h"
 #include "paddle/utils/ThreadLocal.h"
 #include "paddle/utils/TypeDefs.h"
-#include "paddle/math/Vector.h"
-#include "paddle/utils/Stat.h"
 
 #include "ParameterService.pb.h"
 
 #include "ProtoServer.h"
 
-P_DECLARE_int32(port);
+DECLARE_int32(port);
 
 namespace paddle {
 
diff --git a/paddle/pserver/ParameterServer2Main.cpp b/paddle/pserver/ParameterServer2Main.cpp
index 1ba9b48c2382d97aeabcb33aee3b60720bab62d6..ffc521f2c143d95ff07c3825e0a746cb31743d9b 100644
--- a/paddle/pserver/ParameterServer2Main.cpp
+++ b/paddle/pserver/ParameterServer2Main.cpp
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Util.h"
-#include "paddle/utils/StringUtil.h"
 #include <fstream>
+#include "paddle/utils/StringUtil.h"
+#include "paddle/utils/Util.h"
 
-#include "paddle/utils/Flags.h"
 #include "ParameterServer2.h"
 #include "RDMANetwork.h"
+#include "paddle/utils/Flags.h"
 
 using namespace paddle;  // NOLINT
 
diff --git a/paddle/pserver/ProtoServer.h b/paddle/pserver/ProtoServer.h
index 97b7bf167d5c6564aa01da18a46fe4bfe1c9a9b5..3acdcc27dab532f964dc97636be020138180e780 100644
--- a/paddle/pserver/ProtoServer.h
+++ b/paddle/pserver/ProtoServer.h
@@ -100,7 +100,8 @@ protected:
                              ResponseCallback callback);
 
   typedef std::function<void(std::unique_ptr<MsgReader> msgReader,
-                             ResponseCallback callback)> ServiceFunction;
+                             ResponseCallback callback)>
+      ServiceFunction;
 
   /**
    * @brief register one RPC function in function mapping
diff --git a/paddle/pserver/SocketChannel.cpp b/paddle/pserver/SocketChannel.cpp
index f3e74257f6f8f8d83aac1b6f196f69ca7ff5c9fd..05998891649cee30e23e556d9311c3a383f43e10 100644
--- a/paddle/pserver/SocketChannel.cpp
+++ b/paddle/pserver/SocketChannel.cpp
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #include "SocketChannel.h"
 
-#include <stdio.h>
-#include <sys/types.h>
-#include <sys/socket.h>
 #include <netdb.h>
 #include <netinet/in.h>
+#include <stdio.h>
+#include <sys/socket.h>
+#include <sys/types.h>
 #include <unistd.h>
 #include "RDMANetwork.h"
 
diff --git a/paddle/pserver/SparseParameterDistribution.cpp b/paddle/pserver/SparseParameterDistribution.cpp
index 0068f85b52be75fd2f958ad7a1e2ae76949b1fee..6dd725db30cd6d50539d1b2b30ab9e42a081c7b3 100644
--- a/paddle/pserver/SparseParameterDistribution.cpp
+++ b/paddle/pserver/SparseParameterDistribution.cpp
@@ -20,26 +20,26 @@ limitations under the License. */
 
 #include "SparseParameterDistribution.h"
 
-P_DEFINE_bool(check_sparse_distribution_in_pserver,
-              false,
-              "check whether sparse parameter exhibts balanced distribution at "
-              "all pservers");
-P_DEFINE_bool(show_check_sparse_distribution_log,
-              false,
-              "show logs details for sparse parameter distribution in pserver");
-P_DEFINE_int32(check_sparse_distribution_batches,
-               100,
-               "run sparse parameter distribution check for N batches");
-P_DEFINE_double(
+DEFINE_bool(check_sparse_distribution_in_pserver,
+            false,
+            "check whether sparse parameter exhibts balanced distribution at "
+            "all pservers");
+DEFINE_bool(show_check_sparse_distribution_log,
+            false,
+            "show logs details for sparse parameter distribution in pserver");
+DEFINE_int32(check_sparse_distribution_batches,
+             100,
+             "run sparse parameter distribution check for N batches");
+DEFINE_double(
     check_sparse_distribution_ratio,
     0.6,
     "if parameters dispatched to different pservers exhibit unbalanced "
     " distribution for check_sparse_distribution_ratio * "
     " check_sparse_distribution_batches times, crash program");
-P_DEFINE_double(check_sparse_distribution_unbalance_degree,
-                2.0,
-                "the ratio of maximum data size and minimun data size for "
-                "different pserver");
+DEFINE_double(check_sparse_distribution_unbalance_degree,
+              2.0,
+              "the ratio of maximum data size and minimun data size for "
+              "different pserver");
 
 namespace paddle {
 
diff --git a/paddle/pserver/SparseParameterDistribution.h b/paddle/pserver/SparseParameterDistribution.h
index dc63b065a7e91ece23a9ffbdbeea8683c7341e25..24b14106cf64060afa61ecede9e981301ea5634a 100644
--- a/paddle/pserver/SparseParameterDistribution.h
+++ b/paddle/pserver/SparseParameterDistribution.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 #include <unistd.h>
 
-#include "paddle/utils/Logging.h"
 #include <atomic>
+#include "paddle/utils/Logging.h"
 
 namespace paddle {
 
diff --git a/paddle/pserver/test/SocketTest.cpp b/paddle/pserver/test/SocketTest.cpp
index 528f5e381eed0679388389633fa7e86822514811..066a6c02939695e7050a7693365d7c449f70e723 100644
--- a/paddle/pserver/test/SocketTest.cpp
+++ b/paddle/pserver/test/SocketTest.cpp
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #include "paddle/utils/Util.h"
 
-#include <stdio.h>
-#include <sys/types.h>
-#include <sys/socket.h>
 #include <netdb.h>
 #include <netinet/in.h>
+#include <stdio.h>
+#include <sys/socket.h>
+#include <sys/types.h>
 
 #include <thread>
 
@@ -195,9 +195,9 @@ SocketClient::SocketClient(const std::string& serverAddr, int serverPort) {
   channel_.reset(new SocketChannel(sockfd));
 }
 
-P_DEFINE_string(server_addr, "127.0.0.1", "Server address");
-P_DEFINE_int64(dim, 10000000, "Data size");
-P_DEFINE_int32(loop_time, 100000, "test loop time");
+DEFINE_string(server_addr, "127.0.0.1", "Server address");
+DEFINE_int64(dim, 10000000, "Data size");
+DEFINE_int32(loop_time, 100000, "test loop time");
 
 using namespace paddle;  // NOLINT
 
diff --git a/paddle/pserver/test/test_ParameterServer2.cpp b/paddle/pserver/test/test_ParameterServer2.cpp
index 493b6d060ca14a970548c6f200615d707be92d37..8e7231a9e1aee7b61f8dfa42f1367b79fee81a2b 100644
--- a/paddle/pserver/test/test_ParameterServer2.cpp
+++ b/paddle/pserver/test/test_ParameterServer2.cpp
@@ -12,18 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <gtest/gtest.h>
 #include <paddle/pserver/ParameterClient2.h>
 #include <paddle/pserver/ParameterServer2.h>
-#include <gtest/gtest.h>
 #include <paddle/utils/Flags.h>
 #include <paddle/utils/Util.h>
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_int32(num_gradient_servers);
-P_DEFINE_string(server_addr, "127.0.0.1", "assign server address");
-P_DEFINE_int32(server_cpu, 0, "assign server cpu");
+DECLARE_int32(num_gradient_servers);
+DEFINE_string(server_addr, "127.0.0.1", "assign server address");
+DEFINE_int32(server_cpu, 0, "assign server cpu");
 
 class ParameterServer2Tester : public ParameterServer2 {
 public:
diff --git a/paddle/pserver/test/test_ProtoServer.cpp b/paddle/pserver/test/test_ProtoServer.cpp
index cfed0d30d3c2109d8ab5ca15e5855a3551b9ef87..9f86ee80f4e5cc99ea3597b3ed37a387578f032a 100644
--- a/paddle/pserver/test/test_ProtoServer.cpp
+++ b/paddle/pserver/test/test_ProtoServer.cpp
@@ -16,15 +16,15 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 
-#include "paddle/utils/Stat.h"
+#include "ParameterService.pb.h"
 #include "paddle/math/Vector.h"
 #include "paddle/pserver/ProtoServer.h"
-#include "ParameterService.pb.h"
+#include "paddle/utils/Stat.h"
 
-P_DEFINE_string(server_addr, "127.0.0.1", "Server address");
-P_DEFINE_int64(dim, 50000000, "Data size");
-P_DEFINE_bool(test_proto_server, true, "whether to test ProtoServer");
-P_DEFINE_bool(benchmark, false, "Do benchmark. Skip some tests");
+DEFINE_string(server_addr, "127.0.0.1", "Server address");
+DEFINE_int64(dim, 50000000, "Data size");
+DEFINE_bool(test_proto_server, true, "whether to test ProtoServer");
+DEFINE_bool(benchmark, false, "Do benchmark. Skip some tests");
 
 using namespace paddle;  // NOLINT
 
diff --git a/paddle/py_paddle/util.py b/paddle/py_paddle/util.py
index d6bbf9a5a924db5e03b5acdb8ca2fe627ce5dd2c..ce105d249aaf3e838443d3e0cf5996fe8c783a22 100644
--- a/paddle/py_paddle/util.py
+++ b/paddle/py_paddle/util.py
@@ -559,10 +559,10 @@ def __monkey_patch_trainer__():
 
 
 def monkeypatches():
-    patches = [__monkeypatch_init_paddle__,
-               __monkeypatch_gradient_machine__,
-               __monkey_patch_protobuf_objects__,
-               __monkey_patch_parameter__,
-               __monkey_patch_trainer__]
+    patches = [
+        __monkeypatch_init_paddle__, __monkeypatch_gradient_machine__,
+        __monkey_patch_protobuf_objects__, __monkey_patch_parameter__,
+        __monkey_patch_trainer__
+    ]
     for patch in patches:
         patch()
diff --git a/paddle/scripts/docker/Dockerfile b/paddle/scripts/docker/Dockerfile
index edb84712d8a238ad84feb424246c6b44937601bf..207f97c4a69e6681702d3fe73475885d9b867ce9 100644
--- a/paddle/scripts/docker/Dockerfile
+++ b/paddle/scripts/docker/Dockerfile
@@ -4,14 +4,16 @@ MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 ARG DEBIAN_FRONTEND=noninteractive
 RUN apt-get update \
     && apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
-    libgoogle-glog-dev libgflags-dev libatlas-dev libatlas3-base g++ m4 python-pip \
+    libgoogle-glog-dev libgflags-dev libgtest-dev \
+    libatlas-dev libatlas3-base g++ m4 python-pip \
     python-protobuf python-numpy python-dev swig openssh-server \
     wget unzip python-matplotlib tar xz-utils bzip2 gzip coreutils \
     sed grep graphviz libjpeg-dev zlib1g-dev doxygen \
     clang-3.8 llvm-3.8 libclang-3.8-dev \
     && apt-get clean -y
+RUN cd /usr/src/gtest && cmake . && make && cp *.a /usr/lib
 RUN pip install -U BeautifulSoup docopt PyYAML pillow \
-    sphinx sphinx_rtd_theme breathe recommonmark
+    sphinx sphinx_rtd_theme recommonmark
 
 # cmake tends to hide and blur the dependencies between code modules, as
 # noted here https://github.com/PaddlePaddle/Paddle/issues/763. We are
diff --git a/paddle/scripts/docker/Dockerfile.gpu b/paddle/scripts/docker/Dockerfile.gpu
index 5d175e15a79f033e05c2e5c43350b2fde26e7107..33f6adfea2a602c53beb4685e0bf3f87452e2d53 100644
--- a/paddle/scripts/docker/Dockerfile.gpu
+++ b/paddle/scripts/docker/Dockerfile.gpu
@@ -4,14 +4,16 @@ MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
 ARG DEBIAN_FRONTEND=noninteractive
 RUN apt-get update \
     && apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
-    libgoogle-glog-dev libgflags-dev libatlas-dev libatlas3-base g++ m4 python-pip \
+    libgoogle-glog-dev libgflags-dev libgtest-dev \
+    libatlas-dev libatlas3-base g++ m4 python-pip \
     python-protobuf python-numpy python-dev swig openssh-server \
     wget unzip python-matplotlib tar xz-utils bzip2 gzip coreutils \
     sed grep graphviz libjpeg-dev zlib1g-dev doxygen \
     clang-3.8 llvm-3.8 libclang-3.8-dev \
     && apt-get clean -y
+RUN cd /usr/src/gtest && cmake . && make && cp *.a /usr/lib
 RUN pip install -U BeautifulSoup docopt PyYAML pillow \
-    sphinx sphinx_rtd_theme breathe recommonmark
+    sphinx sphinx_rtd_theme recommonmark
 
 # cmake tends to hide and blur the dependencies between code modules, as
 # noted here https://github.com/PaddlePaddle/Paddle/issues/763. We are
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index ace2c0dee972e338001a0e5a4045c32e64ff157e..283fd34a6d8a2268f3800ec69920e128ac75e7dc 100644
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -21,8 +21,6 @@ function version(){
         echo "    with_double: @WITH_DOUBLE@"
         echo "    with_python: @WITH_PYTHON@"
         echo "    with_rdma: @WITH_RDMA@"
-        echo "    with_glog: @WITH_GLOG@"
-        echo "    with_gflags: @WITH_GFLAGS@"
         echo "    with_metric_learning: @WITH_METRIC@"
         echo "    with_timer: @WITH_TIMER@"
         echo "    with_predict_sdk: @WITH_PREDICT_SDK@"
diff --git a/paddle/scripts/tools/build_docs/Dockerfile b/paddle/scripts/tools/build_docs/Dockerfile
index 506b13210ba1ee7277e2671870d79750cf63e900..78dc756bd1175019d90fc852635497fea1eb55e2 100644
--- a/paddle/scripts/tools/build_docs/Dockerfile
+++ b/paddle/scripts/tools/build_docs/Dockerfile
@@ -3,5 +3,5 @@ COPY build.sh /
 RUN pip install sphinx &&\
     pip install sphinx_rtd_theme &&\
     apt install -y doxygen graphviz &&\
-    pip install breathe recommonmark numpy protobuf==2.6.1
+    pip install recommonmark numpy protobuf==2.6.1
 CMD /build.sh
diff --git a/paddle/scripts/travis/before_install.linux.sh b/paddle/scripts/travis/before_install.linux.sh
index ec2ac1f2240765d1c453ce50ec44286a551a37ba..9620bff6bcf77c6e87f149e8e33408170dd8e507 100755
--- a/paddle/scripts/travis/before_install.linux.sh
+++ b/paddle/scripts/travis/before_install.linux.sh
@@ -1,5 +1,16 @@
 #!/bin/bash
 set -e
+pip install protobuf
+cd /tmp
+wget https://github.com/google/protobuf/archive/v3.0.2.tar.gz -O protobuf.tar.gz
+tar xf protobuf.tar.gz
+cd protobuf*
+./autogen.sh
+./configure --prefix=/usr/
+make -j 2 install
+cd ..
+rm -rf protobuf*
+
 pushd /usr/src/gtest
 cmake .
 make
diff --git a/paddle/scripts/travis/before_install.osx.sh b/paddle/scripts/travis/before_install.osx.sh
index f438e69b822aa45448e1d303bf27af482a3d88d8..bd88ed39132f19ca7cfc4f0dd6acdbc6b83e94ab 100755
--- a/paddle/scripts/travis/before_install.osx.sh
+++ b/paddle/scripts/travis/before_install.osx.sh
@@ -2,9 +2,8 @@
 brew update
 brew tap homebrew/science
 brew install python
-sudo pip install --upgrade protobuf==2.6.0
-brew install homebrew/versions/protobuf260 --without-python
-brew install cmake python glog gflags openblas wget md5sha1sum
+sudo pip install --upgrade protobuf
+brew install cmake python glog gflags openblas wget md5sha1sum protobuf
 
 wget https://github.com/google/googletest/archive/release-1.8.0.tar.gz -O gtest.tar.gz
 tar xf gtest.tar.gz
diff --git a/paddle/scripts/travis/docs.sh b/paddle/scripts/travis/docs.sh
index c2a4809d75b97a9d8d8b83cf197e90bd62b48603..0bbb76a8a3caa27da0911af0fe87df7fbff617b4 100755
--- a/paddle/scripts/travis/docs.sh
+++ b/paddle/scripts/travis/docs.sh
@@ -47,17 +47,20 @@ if [ $? -eq 0 ]; then
 fi
 set -e
 
-# Commit
-git add .
-git config user.name "Travis CI"
-git config user.email "paddle-dev@baidu.com"
-git commit -m "Deploy to GitHub Pages: ${SHA}"
-
-# Set ssh private key
-openssl aes-256-cbc -K $SSL_KEY -iv $SSL_IV -in ../../paddle/scripts/travis/deploy_key.enc -out deploy_key -d
-chmod 600 deploy_key
-eval `ssh-agent -s`
-ssh-add deploy_key
-
-# Push
-git push $SSH_REPO $TARGET_BRANCH
+if [ -n $SSL_KEY ]; then  # Only push updated docs for github.com/PaddlePaddle/Paddle.
+  # Commit
+  git add .
+  git config user.name "Travis CI"
+  git config user.email "paddle-dev@baidu.com"
+  git commit -m "Deploy to GitHub Pages: ${SHA}"
+
+  # Set ssh private key
+  openssl aes-256-cbc -K $SSL_KEY -iv $SSL_IV -in ../../paddle/scripts/travis/deploy_key.enc -out deploy_key -d
+  chmod 600 deploy_key
+  eval `ssh-agent -s`
+  ssh-add deploy_key
+
+  # Push
+  git push $SSH_REPO $TARGET_BRANCH
+
+fi
diff --git a/paddle/scripts/travis/main.sh b/paddle/scripts/travis/main.sh
index c49d4546c24ac9304cd6f3c5940ed3d1d32ebb3d..13f2552d29db38041a73edca0acd202945c67484 100755
--- a/paddle/scripts/travis/main.sh
+++ b/paddle/scripts/travis/main.sh
@@ -5,6 +5,8 @@ if [ ${JOB} == "BUILD_AND_TEST" ]; then
   ./build_and_test.sh
 elif [ ${JOB} == "DOCS" ]; then
   ./docs.sh
+elif [ ${JOB} == "PRE_COMMIT" ]; then
+  ./precommit.sh
 else
   echo Unknown job ${JOB}
   exit 1
diff --git a/paddle/scripts/travis/precommit.sh b/paddle/scripts/travis/precommit.sh
new file mode 100755
index 0000000000000000000000000000000000000000..7a59b1131d0a410be9c5cef08e3cc11633d2ba67
--- /dev/null
+++ b/paddle/scripts/travis/precommit.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+function abort(){
+    echo "Your commit not fit PaddlePaddle code style" 1>&2
+    echo "Please use pre-commit scripts to auto-format your code" 1>&2
+    exit 1
+}
+
+trap 'abort' 0
+set -e
+source common.sh
+cd ..
+export PATH=/usr/bin:$PATH
+pre-commit install
+clang-format --version
+
+if ! pre-commit run -a ; then
+  git diff  --exit-code
+fi
+
+trap : 0
diff --git a/paddle/trainer/MergeModel.cpp b/paddle/trainer/MergeModel.cpp
index 8cb2873feb13e0ce420bc5133f14ecde6c3bfe76..91d89b61a32259b8bbe70fda2579f87ec6b9af00 100644
--- a/paddle/trainer/MergeModel.cpp
+++ b/paddle/trainer/MergeModel.cpp
@@ -14,13 +14,13 @@ limitations under the License. */
 
 #include <memory>
 
-#include "paddle/utils/PythonUtil.h"
-#include "paddle/pserver/ParameterServer2.h"
 #include "ParamUtil.h"
 #include "Trainer.h"
+#include "paddle/pserver/ParameterServer2.h"
+#include "paddle/utils/PythonUtil.h"
 
-P_DEFINE_string(model_dir, "", "Directory for separated model files");
-P_DEFINE_string(model_file, "", "File for merged model file");
+DEFINE_string(model_dir, "", "Directory for separated model files");
+DEFINE_string(model_file, "", "File for merged model file");
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/trainer/ParamUtil.cpp b/paddle/trainer/ParamUtil.cpp
index 200417ebfc51a2b07e785f994ce7e92d58ab01d3..ffbca42e106591ddeb2cefcfafbeb408c544371b 100644
--- a/paddle/trainer/ParamUtil.cpp
+++ b/paddle/trainer/ParamUtil.cpp
@@ -17,22 +17,22 @@ limitations under the License. */
 #include <fenv.h>
 #include <stdio.h>
 
-#include <iostream>
 #include <iomanip>
-#include <sstream>
+#include <iostream>
 #include <limits>
+#include <sstream>
 
 #include <google/protobuf/text_format.h>
 #include <paddle/utils/Version.h>
 
+#include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"
-#include "paddle/utils/GlobalConstants.h"
 
+#include "TesterConfig.h"
 #include "paddle/gserver/gradientmachines/NeuralNetwork.h"
 #include "paddle/gserver/layers/ValidationLayer.h"
-#include "TesterConfig.h"
 
 namespace paddle {
 
diff --git a/paddle/trainer/ParamUtil.h b/paddle/trainer/ParamUtil.h
index 8fa6fda75ca21c26e7de38fc3614a4794fb19a74..2e05595848760c9abd7d916003656c8103151abf 100644
--- a/paddle/trainer/ParamUtil.h
+++ b/paddle/trainer/ParamUtil.h
@@ -22,11 +22,11 @@ limitations under the License. */
 #include "paddle/gserver/dataproviders/DataProvider.h"
 #include "paddle/gserver/gradientmachines/GradientMachine.h"
 
+#include <stdlib.h>
+#include <fstream>
+#include "ParameterUpdater.h"
 #include "TrainerConfig.pb.h"
 #include "TrainerConfigHelper.h"
-#include "ParameterUpdater.h"
-#include <fstream>
-#include <stdlib.h>
 
 namespace paddle {
 
diff --git a/paddle/trainer/ParameterUpdater.h b/paddle/trainer/ParameterUpdater.h
index 81ac374425b4f9181b474c05fe21a6288af84456..e52b5cd318b4d647a4bd126adf2ecfaba08d8363 100644
--- a/paddle/trainer/ParameterUpdater.h
+++ b/paddle/trainer/ParameterUpdater.h
@@ -24,8 +24,8 @@ limitations under the License. */
 #include "paddle/parameter/Parameter.h"
 #include "paddle/parameter/ParameterUpdaterBase.h"
 
-#include "paddle/gserver/layers/Layer.h"
 #include "TrainerConfig.pb.h"
+#include "paddle/gserver/layers/Layer.h"
 
 #include <memory>
 #include <vector>
diff --git a/paddle/trainer/RemoteParameterUpdater.cpp b/paddle/trainer/RemoteParameterUpdater.cpp
index 702ea07f8ad34d64c4b6ecf932528b20696819bb..974e78fa17d6564414962475f81497491bbb0482 100644
--- a/paddle/trainer/RemoteParameterUpdater.cpp
+++ b/paddle/trainer/RemoteParameterUpdater.cpp
@@ -14,11 +14,11 @@ limitations under the License. */
 
 #include "RemoteParameterUpdater.h"
 #include "Trainer.h"
-#include "paddle/utils/Stat.h"
 #include "paddle/utils/GlobalConstants.h"
+#include "paddle/utils/Stat.h"
 
-P_DECLARE_int32(trainer_id);
-P_DECLARE_string(save_dir);
+DECLARE_int32(trainer_id);
+DECLARE_string(save_dir);
 
 namespace paddle {
 
diff --git a/paddle/trainer/RemoteParameterUpdater.h b/paddle/trainer/RemoteParameterUpdater.h
index 46ce4be1460380a6485b0ef98af49fc4abb7209c..66055c778e439a1edf7d1b6dd2e13b945fa73323 100644
--- a/paddle/trainer/RemoteParameterUpdater.h
+++ b/paddle/trainer/RemoteParameterUpdater.h
@@ -14,12 +14,12 @@ limitations under the License. */
 
 #pragma once
 
-#include <thread>
 #include <functional>
-#include "paddle/pserver/ParameterClient2.h"
+#include <thread>
 #include "ParameterUpdater.h"
-#include "paddle/utils/Util.h"
+#include "paddle/pserver/ParameterClient2.h"
 #include "paddle/utils/Queue.h"
+#include "paddle/utils/Util.h"
 
 namespace paddle {
 
diff --git a/paddle/trainer/Tester.cpp b/paddle/trainer/Tester.cpp
index 97d1b53934b65da05689126292d0e4deb99f9b4f..24fac3e5a8141cbec912d276833ec491385b97ab 100644
--- a/paddle/trainer/Tester.cpp
+++ b/paddle/trainer/Tester.cpp
@@ -46,6 +46,12 @@ Tester::Tester(const std::shared_ptr<TrainerConfigHelper>& config,
       gradientMachine_(gradientMachine),
       parameterUpdater_(parameterUpdater),
       testDataProvider_(testDataProvider) {
+  if (config_->getOptConfig().use_sparse_remote_updater()) {
+    LOG(FATAL) << "It's prohibited to set sparse_remote_update "
+               << "when doing train and test jobs in the same "
+               << "process. You could run paddle --job=test in "
+               << "a separate process.";
+  }
   testEvaluator_.reset(gradientMachine_->makeEvaluator());
   if (intconfig_->distributeTest) {
     testParameterClient_.reset(new ParameterClient2(true));
diff --git a/paddle/trainer/Tester.h b/paddle/trainer/Tester.h
index ae7e0e93bff9c9c21f991ac6136a18ffd88fb176..e892744db278586f2fd5b3cb527aa7c17752c477 100644
--- a/paddle/trainer/Tester.h
+++ b/paddle/trainer/Tester.h
@@ -24,12 +24,12 @@ limitations under the License. */
 
 #include "TrainerConfig.pb.h"
 
-#include "ParameterUpdater.h"
+#include <stdlib.h>
+#include <fstream>
 #include "ParamUtil.h"
+#include "ParameterUpdater.h"
 #include "TesterConfig.h"
 #include "TrainerInternalConfig.h"
-#include <fstream>
-#include <stdlib.h>
 
 namespace paddle {
 
diff --git a/paddle/trainer/TesterConfig.h b/paddle/trainer/TesterConfig.h
index 9ff145a8a18b3e0704028a0c7ff94e31e2a216bf..68d4c931ff2df8e24acaa9fe6b35bfd613197c72 100644
--- a/paddle/trainer/TesterConfig.h
+++ b/paddle/trainer/TesterConfig.h
@@ -23,9 +23,9 @@ limitations under the License. */
 
 #include "TrainerConfig.pb.h"
 
-#include "ParameterUpdater.h"
-#include <fstream>
 #include <stdlib.h>
+#include <fstream>
+#include "ParameterUpdater.h"
 
 namespace paddle {
 
diff --git a/paddle/trainer/ThreadParameterUpdater.cpp b/paddle/trainer/ThreadParameterUpdater.cpp
index bee7f061fed3a01e8292137272c3288334ef70c2..9caa92a4d7557c0c8633d881820862bbbd5df87e 100644
--- a/paddle/trainer/ThreadParameterUpdater.cpp
+++ b/paddle/trainer/ThreadParameterUpdater.cpp
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/math/SparseRowMatrix.h"
 #include "paddle/utils/Thread.h"
 
-P_DECLARE_int32(trainer_count);
+DECLARE_int32(trainer_count);
 
 namespace paddle {
 
diff --git a/paddle/trainer/ThreadParameterUpdater.h b/paddle/trainer/ThreadParameterUpdater.h
index 492692dbe5b2209c7fc1eecc54f5bebea8d457a8..d01ac689f97f360b64d4e63032a804f1f24c83e2 100644
--- a/paddle/trainer/ThreadParameterUpdater.h
+++ b/paddle/trainer/ThreadParameterUpdater.h
@@ -14,13 +14,13 @@ limitations under the License. */
 
 #pragma once
 
-#include "paddle/utils/Util.h"
 #include "paddle/parameter/AverageOptimizer.h"
 #include "paddle/parameter/FirstOrderOptimizer.h"
 #include "paddle/parameter/OptimizerFunctions.h"
 #include "paddle/parameter/OptimizerWithRegularizer.h"
 #include "paddle/parameter/Parameter.h"
 #include "paddle/parameter/Regularizer.h"
+#include "paddle/utils/Util.h"
 
 #include <memory>
 #include <vector>
diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp
index 85610ec04e3f64dc83575426922ac936a604b3a7..1eec2c432d235ef484b688db08aae8a39f878a85 100644
--- a/paddle/trainer/Trainer.cpp
+++ b/paddle/trainer/Trainer.cpp
@@ -38,60 +38,56 @@ limitations under the License. */
 #include "paddle/gserver/gradientmachines/NeuralNetwork.h"
 #include "paddle/gserver/layers/ValidationLayer.h"
 
-P_DEFINE_string(config, "", "Trainer config file");
-
-P_DEFINE_int32(test_period,
-               0,
-               "if equal 0, do test on all test data at the end of "
-               "each pass. While if equal non-zero, do test on all test "
-               "data every test_period batches");
-P_DEFINE_bool(test_all_data_in_one_period,
-              false,
-              "This option was deprecated, since we will always do "
-              "test on all test set ");
-
-P_DEFINE_bool(local, true, "Train in local mode or not");
-
-P_DEFINE_int32(average_test_period,
-               0,
-               "Do test on average parameter every so"
-               " many batches. MUST be devided by FLAGS_log_period."
-               " Default 0 means do not test average parameter");
-
-P_DEFINE_int32(saving_period, 1, "Save parameteres every so many passes");
-P_DEFINE_int64(saving_period_by_batches,
-               0,
-               "Save parameters every so many batches in one pass");
-P_DEFINE_string(save_dir, "", "Directory for saving model parameter");
-P_DEFINE_int32(start_pass,
-               0,
-               "Start training from this pass. "
-               "Will load parameter from the previous pass");
-P_DEFINE_int32(test_pass,
-               -1,
-               "Will load parameter start from this pass to test");
-P_DEFINE_int32(test_wait, 0, "Waiting for pass parameter if not exist");
-P_DEFINE_bool(with_cost, true, "enable cost layer or not");
-P_DEFINE_bool(distribute_test, false, "test in distribute mode");
-
-P_DEFINE_int32(num_passes, 100, "train for so many passes");
-
-P_DEFINE_string(config_args,
-                "",
-                "arguments passed to config file."
-                "Format: key1=value1,key2=value2");
-
-P_DEFINE_bool(save_only_one,
-              false,
-              "Save only parameters in last pass, remove previous.");
-
-P_DEFINE_string(feat_file, "", "File name of extracted feature.");
-P_DEFINE_string(predict_output_dir,
-                "",
-                "Directory that saves the predicted results of output layers");
-P_DEFINE_string(model_list,
-                "",
-                "File that saves the model list when evaluation");
+DEFINE_string(config, "", "Trainer config file");
+
+DEFINE_int32(test_period,
+             0,
+             "if equal 0, do test on all test data at the end of "
+             "each pass. While if equal non-zero, do test on all test "
+             "data every test_period batches");
+DEFINE_bool(test_all_data_in_one_period,
+            false,
+            "This option was deprecated, since we will always do "
+            "test on all test set ");
+
+DEFINE_bool(local, true, "Train in local mode or not");
+
+DEFINE_int32(average_test_period,
+             0,
+             "Do test on average parameter every so"
+             " many batches. MUST be devided by FLAGS_log_period."
+             " Default 0 means do not test average parameter");
+
+DEFINE_int32(saving_period, 1, "Save parameteres every so many passes");
+DEFINE_int64(saving_period_by_batches,
+             0,
+             "Save parameters every so many batches in one pass");
+DEFINE_string(save_dir, "", "Directory for saving model parameter");
+DEFINE_int32(start_pass,
+             0,
+             "Start training from this pass. "
+             "Will load parameter from the previous pass");
+DEFINE_int32(test_pass, -1, "Will load parameter start from this pass to test");
+DEFINE_int32(test_wait, 0, "Waiting for pass parameter if not exist");
+DEFINE_bool(with_cost, true, "enable cost layer or not");
+DEFINE_bool(distribute_test, false, "test in distribute mode");
+
+DEFINE_int32(num_passes, 100, "train for so many passes");
+
+DEFINE_string(config_args,
+              "",
+              "arguments passed to config file."
+              "Format: key1=value1,key2=value2");
+
+DEFINE_bool(save_only_one,
+            false,
+            "Save only parameters in last pass, remove previous.");
+
+DEFINE_string(feat_file, "", "File name of extracted feature.");
+DEFINE_string(predict_output_dir,
+              "",
+              "Directory that saves the predicted results of output layers");
+DEFINE_string(model_list, "", "File that saves the model list when evaluation");
 
 namespace paddle {
 
diff --git a/paddle/trainer/Trainer.h b/paddle/trainer/Trainer.h
index f50b56143d314a2ad9493409e4d3674868520d53..7cbf18ace7a5fed053653c73e62d36c388b15123 100644
--- a/paddle/trainer/Trainer.h
+++ b/paddle/trainer/Trainer.h
@@ -22,19 +22,19 @@ limitations under the License. */
 #include "paddle/gserver/dataproviders/DataProvider.h"
 #include "paddle/gserver/gradientmachines/GradientMachine.h"
 
-#include "TrainerConfigHelper.h"
+#include <stdlib.h>
+#include <fstream>
+#include "ParamUtil.h"
 #include "ParameterUpdater.h"
-#include "TrainerInternal.h"
 #include "Tester.h"
-#include "ParamUtil.h"
-#include <fstream>
-#include <stdlib.h>
+#include "TrainerConfigHelper.h"
+#include "TrainerInternal.h"
 
 #ifdef PADDLE_METRIC_LEARNING
 #include "paddle/internals/metric_learning/MetricTrainer.h"
 #endif
 
-P_DECLARE_int32(num_passes);
+DECLARE_int32(num_passes);
 
 namespace paddle {
 
diff --git a/paddle/trainer/TrainerBenchmark.cpp b/paddle/trainer/TrainerBenchmark.cpp
index 5c3177c8083e11ebf6b6854f86dbee8299d7e3b1..173653c81688fe4606731c68ea1854268b3f4590 100644
--- a/paddle/trainer/TrainerBenchmark.cpp
+++ b/paddle/trainer/TrainerBenchmark.cpp
@@ -18,9 +18,9 @@ limitations under the License. */
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"
 
-P_DECLARE_int32(test_period);
+DECLARE_int32(test_period);
 
-P_DEFINE_bool(feed_data, false, "Wether to read data from DataProvider.");
+DEFINE_bool(feed_data, false, "Wether to read data from DataProvider.");
 
 namespace paddle {
 
diff --git a/paddle/trainer/TrainerConfigHelper.cpp b/paddle/trainer/TrainerConfigHelper.cpp
index 2017a08d20d494cbce8c3beba564ed07c1d7cc73..60ac8459a12db801321da4a9d9c1d48ac8bd6d16 100644
--- a/paddle/trainer/TrainerConfigHelper.cpp
+++ b/paddle/trainer/TrainerConfigHelper.cpp
@@ -18,16 +18,16 @@ limitations under the License. */
 #include "paddle/utils/Flags.h"
 #include "paddle/utils/PythonUtil.h"
 
-P_DECLARE_string(config);
-P_DECLARE_string(init_model_path);
-P_DECLARE_int32(start_pass);
-P_DECLARE_string(save_dir);
-P_DECLARE_int32(trainer_id);
-P_DECLARE_bool(local);
-P_DECLARE_bool(with_cost);
-P_DECLARE_bool(with_gpu);
-P_DECLARE_bool(parallel_nn);
-P_DECLARE_string(config_args);
+DECLARE_string(config);
+DECLARE_string(init_model_path);
+DECLARE_int32(start_pass);
+DECLARE_string(save_dir);
+DECLARE_int32(trainer_id);
+DECLARE_bool(local);
+DECLARE_bool(with_cost);
+DECLARE_bool(with_gpu);
+DECLARE_bool(parallel_nn);
+DECLARE_string(config_args);
 
 const char *kConfigParserModuleName = "paddle.trainer.config_parser";
 const char *kConfigParserFuncName = "parse_config_and_serialize";
diff --git a/paddle/trainer/TrainerConfigHelper.h b/paddle/trainer/TrainerConfigHelper.h
index 2c5c492ce872bd76fec431b6c511caaf021cc0c2..f1366cc041b0d983e65a1bf5b02ec2128324c5a8 100644
--- a/paddle/trainer/TrainerConfigHelper.h
+++ b/paddle/trainer/TrainerConfigHelper.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
-#include <memory>
 #include <paddle/utils/Logging.h>
 #include <paddle/utils/Util.h>
+#include <memory>
 
 namespace paddle {
 
diff --git a/paddle/trainer/TrainerInternal.cpp b/paddle/trainer/TrainerInternal.cpp
index 1b49d4aa28b3166787e5c3f029e47dd97a9f1aef..f3b465b444167d4624a5e99c30e1257eda53ca2c 100644
--- a/paddle/trainer/TrainerInternal.cpp
+++ b/paddle/trainer/TrainerInternal.cpp
@@ -17,22 +17,22 @@ limitations under the License. */
 #include <fenv.h>
 #include <stdio.h>
 
-#include <iostream>
 #include <iomanip>
-#include <sstream>
+#include <iostream>
 #include <limits>
+#include <sstream>
 
 #include <google/protobuf/text_format.h>
 
+#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
+#include "paddle/gserver/layers/ValidationLayer.h"
+#include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"
-#include "paddle/utils/GlobalConstants.h"
-#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
-#include "paddle/gserver/layers/ValidationLayer.h"
 
-#include "ThreadParameterUpdater.h"
 #include "RemoteParameterUpdater.h"
+#include "ThreadParameterUpdater.h"
 
 namespace paddle {
 
diff --git a/paddle/trainer/TrainerInternal.h b/paddle/trainer/TrainerInternal.h
index b67711a7219ae476dc32b35c04e0c977c9791072..7018faab24744f7a087a53130acc56ec6314101e 100644
--- a/paddle/trainer/TrainerInternal.h
+++ b/paddle/trainer/TrainerInternal.h
@@ -17,15 +17,15 @@ limitations under the License. */
 #include "paddle/utils/Util.h"
 
 #include <stdio.h>
-#include <fstream>
 #include <stdlib.h>
+#include <fstream>
 
-#include "hl_gpu.h"
-#include "paddle/gserver/gradientmachines/GradientMachine.h"
-#include "TrainerConfig.pb.h"
 #include "ParameterUpdater.h"
+#include "TrainerConfig.pb.h"
 #include "TrainerConfigHelper.h"
 #include "TrainerInternalConfig.h"
+#include "hl_gpu.h"
+#include "paddle/gserver/gradientmachines/GradientMachine.h"
 
 namespace paddle {
 
diff --git a/paddle/trainer/TrainerInternalConfig.cpp b/paddle/trainer/TrainerInternalConfig.cpp
index a017cdec9d06a51ddf0925280f3b60cc2dc1c17a..039fcdb524527d5e8bfa829fc403b6f2fa789991 100644
--- a/paddle/trainer/TrainerInternalConfig.cpp
+++ b/paddle/trainer/TrainerInternalConfig.cpp
@@ -14,17 +14,17 @@ limitations under the License. */
 
 #include "TrainerInternalConfig.h"
 
-P_DEFINE_int32(show_parameter_stats_period,
-               0,
-               "Whether to show parameter stats during training");
+DEFINE_int32(show_parameter_stats_period,
+             0,
+             "Whether to show parameter stats during training");
 
-P_DEFINE_int32(dot_period, 1, "Print '.' every so many batches");
+DEFINE_int32(dot_period, 1, "Print '.' every so many batches");
 
-P_DEFINE_bool(use_old_updater, false, "Use the old RemoteParameterUpdater");
+DEFINE_bool(use_old_updater, false, "Use the old RemoteParameterUpdater");
 
-P_DECLARE_int32(num_passes);
+DECLARE_int32(num_passes);
 
-P_DECLARE_bool(local);
+DECLARE_bool(local);
 
 namespace paddle {
 
diff --git a/paddle/trainer/TrainerInternalConfig.h b/paddle/trainer/TrainerInternalConfig.h
index fd6fdf45e60eb6d182edeecd7404b8bf3f79d5ba..b47692720efc2ed4f2db84f61ca81fcb52d234c0 100644
--- a/paddle/trainer/TrainerInternalConfig.h
+++ b/paddle/trainer/TrainerInternalConfig.h
@@ -23,10 +23,10 @@ limitations under the License. */
 
 #include "TrainerConfig.pb.h"
 
-#include "ParameterUpdater.h"
+#include <stdlib.h>
 #include <fstream>
 #include <sstream>
-#include <stdlib.h>
+#include "ParameterUpdater.h"
 
 namespace paddle {
 /**
diff --git a/paddle/trainer/TrainerMain.cpp b/paddle/trainer/TrainerMain.cpp
index 7a18f9836c8f6860b331cd49da5221cb135e6840..947f9cadcc983d58ce31ef462e51dc42e41eaf1b 100644
--- a/paddle/trainer/TrainerMain.cpp
+++ b/paddle/trainer/TrainerMain.cpp
@@ -13,30 +13,29 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <fenv.h>
+#include "paddle/pserver/ParameterServer2.h"
+#include "paddle/utils/Excepts.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/StringUtil.h"
-#include "paddle/utils/Excepts.h"
-#include "paddle/pserver/ParameterServer2.h"
 
 #include "ParamUtil.h"
 #include "Trainer.h"
 #include "paddle/pserver/RDMANetwork.h"
 
-P_DEFINE_bool(start_pserver, false, "Whether to start pserver");
-P_DECLARE_int32(gpu_id);
-P_DEFINE_string(job, "train", "one of (train, test, checkgrad)");
-P_DECLARE_int32(start_pass);
-P_DECLARE_string(config);
-P_DECLARE_string(init_model_path);
-P_DECLARE_string(rdma_tcp);
+DEFINE_bool(start_pserver, false, "Whether to start pserver");
+DECLARE_int32(gpu_id);
+DEFINE_string(job, "train", "one of (train, test, checkgrad)");
+DECLARE_int32(start_pass);
+DECLARE_string(config);
+DECLARE_string(init_model_path);
+DECLARE_string(rdma_tcp);
 
 using namespace paddle;  // NOLINT
 
 int main(int argc, char** argv) {
-// write logs instantly (never buffer log messages)
-#ifdef PADDLE_USE_GLOG
+  // write logs instantly (never buffer log messages)
   FLAGS_logbuflevel = -1;
-#endif
+
   initMain(argc, argv);
   initPython(argc, argv);
 
diff --git a/paddle/trainer/tests/picojson.h b/paddle/trainer/tests/picojson.h
index cb657d219e55c1e349ffb77a88945085b4149c78..23bfa164080a6ea392bb6ee15e7e2bec25257ce9 100644
--- a/paddle/trainer/tests/picojson.h
+++ b/paddle/trainer/tests/picojson.h
@@ -30,10 +30,10 @@
 #define picojson_h
 
 #include <algorithm>
+#include <cstddef>
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
-#include <cstddef>
 #include <iostream>
 #include <iterator>
 #include <limits>
diff --git a/paddle/trainer/tests/test_Compare.cpp b/paddle/trainer/tests/test_Compare.cpp
index 07a47b2990ce0a95070321ef652d8e90bed26f69..72fc76bea35e433eeb08ba625b4bf6afdda491fb 100644
--- a/paddle/trainer/tests/test_Compare.cpp
+++ b/paddle/trainer/tests/test_Compare.cpp
@@ -16,18 +16,18 @@ limitations under the License. */
 
 #include "paddle/trainer/Trainer.h"
 
-#include <cstdlib>
 #include <gtest/gtest.h>
+#include <cstdlib>
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
 static const string& configFile = "trainer/tests/sample_trainer_config.conf";
 
-P_DECLARE_int32(gpu_id);
-P_DECLARE_bool(use_gpu);
-P_DECLARE_string(config);
-P_DECLARE_string(config_args);
+DECLARE_int32(gpu_id);
+DECLARE_bool(use_gpu);
+DECLARE_string(config);
+DECLARE_string(config_args);
 
 struct comData {
   vector<Argument> outArgs;
diff --git a/paddle/trainer/tests/test_CompareSparse.cpp b/paddle/trainer/tests/test_CompareSparse.cpp
index 3fea3a3c24303b84f78f4029b0ed8e42e419c442..a7000eb77e1bbeab4f6e38c0322f82bde7164080 100644
--- a/paddle/trainer/tests/test_CompareSparse.cpp
+++ b/paddle/trainer/tests/test_CompareSparse.cpp
@@ -25,22 +25,22 @@ using namespace std;     // NOLINT
 static const string& configFile1 =
     "trainer/tests/sample_trainer_config_qb_rnn.conf";
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_string(config);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_int32(seed);
-P_DECLARE_int32(num_passes);
-P_DECLARE_int32(saving_period);
-
-P_DECLARE_int32(num_gradient_servers);
-P_DECLARE_int32(port);
-P_DECLARE_bool(local);
-P_DECLARE_bool(use_old_updater);
-P_DECLARE_bool(parallel_nn);
-P_DECLARE_string(config_args);
-P_DEFINE_double(max_diff_ratio,
-                0.0f,
-                "max diff ratio allowed for parameters value");
+DECLARE_bool(use_gpu);
+DECLARE_string(config);
+DECLARE_int32(gpu_id);
+DECLARE_int32(seed);
+DECLARE_int32(num_passes);
+DECLARE_int32(saving_period);
+
+DECLARE_int32(num_gradient_servers);
+DECLARE_int32(port);
+DECLARE_bool(local);
+DECLARE_bool(use_old_updater);
+DECLARE_bool(parallel_nn);
+DECLARE_string(config_args);
+DEFINE_double(max_diff_ratio,
+              0.0f,
+              "max diff ratio allowed for parameters value");
 
 int gNumDevices = 0;
 
diff --git a/paddle/trainer/tests/test_CompareTwoNets.cpp b/paddle/trainer/tests/test_CompareTwoNets.cpp
index 7e5449dcba66645329a5bd1b9dad73cf10cc0a5a..80c61e259e71dd31d7637072248b22a2910c532e 100644
--- a/paddle/trainer/tests/test_CompareTwoNets.cpp
+++ b/paddle/trainer/tests/test_CompareTwoNets.cpp
@@ -12,35 +12,35 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <gtest/gtest.h>
 #include <paddle/utils/PythonUtil.h>
-#include <cstdlib>
 #include <algorithm>
-#include <gtest/gtest.h>
+#include <cstdlib>
 
 #include "paddle/trainer/Trainer.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_int32(gpu_id);
+DECLARE_int32(gpu_id);
 
-P_DECLARE_bool(local);
-P_DECLARE_bool(use_gpu);
+DECLARE_bool(local);
+DECLARE_bool(use_gpu);
 
-P_DECLARE_string(config);
-P_DECLARE_string(nics);
+DECLARE_string(config);
+DECLARE_string(nics);
 
-P_DEFINE_string(config_file_a, "", "config of one network to compare");
-P_DEFINE_string(config_file_b, "", "config of another network to compare");
-P_DEFINE_bool(need_high_accuracy,
-              false,
-              "whether need to run in double accuracy");
-P_DEFINE_double(
+DEFINE_string(config_file_a, "", "config of one network to compare");
+DEFINE_string(config_file_b, "", "config of another network to compare");
+DEFINE_bool(need_high_accuracy,
+            false,
+            "whether need to run in double accuracy");
+DEFINE_double(
     max_diff_ratio,
     0.0f,
     "max diff ratio allowed for outputs and parameters (value/gradient)");
-P_DECLARE_bool(thread_local_rand_use_global_seed);
-P_DECLARE_int32(seed);
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_int32(seed);
 
 struct ComData {
   vector<Argument> outArgs;
diff --git a/paddle/trainer/tests/test_CompareTwoOpts.cpp b/paddle/trainer/tests/test_CompareTwoOpts.cpp
index 4d051b537cd6e00229b8c4db2472dc91607fd971..383505f8131264844069d6f0fa13f4e0ac1f97af 100644
--- a/paddle/trainer/tests/test_CompareTwoOpts.cpp
+++ b/paddle/trainer/tests/test_CompareTwoOpts.cpp
@@ -12,30 +12,30 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include <gtest/gtest.h>
 #include <paddle/utils/PythonUtil.h>
-#include <cstdlib>
 #include <algorithm>
-#include <gtest/gtest.h>
+#include <cstdlib>
 
 #include "paddle/trainer/Trainer.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
 
-P_DECLARE_int32(gpu_id);
+DECLARE_int32(gpu_id);
 
-P_DECLARE_bool(local);
-P_DECLARE_bool(use_gpu);
+DECLARE_bool(local);
+DECLARE_bool(use_gpu);
 
-P_DECLARE_string(config);
-P_DECLARE_string(nics);
+DECLARE_string(config);
+DECLARE_string(nics);
 
-P_DEFINE_string(config_file_a, "", "config of one network to compare");
-P_DEFINE_string(config_file_b, "", "config of another network to compare");
-P_DEFINE_bool(need_high_accuracy,
-              true,
-              "whether need to run in double accuracy (recommended)");
-P_DEFINE_double(
+DEFINE_string(config_file_a, "", "config of one network to compare");
+DEFINE_string(config_file_b, "", "config of another network to compare");
+DEFINE_bool(need_high_accuracy,
+            true,
+            "whether need to run in double accuracy (recommended)");
+DEFINE_double(
     max_diff_ratio,
     0.0f,
     "max diff ratio allowed for outputs and parameters (value/gradient)");
diff --git a/paddle/trainer/tests/test_Prediction.cpp b/paddle/trainer/tests/test_Prediction.cpp
index 322121a579440fcf164c042b3265e5d2878e3732..0c79404eee1c0902c5c8e8eefd139da3da584636 100644
--- a/paddle/trainer/tests/test_Prediction.cpp
+++ b/paddle/trainer/tests/test_Prediction.cpp
@@ -18,11 +18,11 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 
-P_DECLARE_string(config);
-P_DECLARE_string(config_args);
-P_DEFINE_string(merger,
-                "./paddle_merge_model",
-                "path to paddle_merge_model binary");
+DECLARE_string(config);
+DECLARE_string(config_args);
+DEFINE_string(merger,
+              "./paddle_merge_model",
+              "path to paddle_merge_model binary");
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp b/paddle/trainer/tests/test_PyDataProviderWrapper.cpp
index 5c5c6d534692b1ce18b4bc5251707e1b4b39ae85..66ec65e340a435a7260028611828fb28845e0728 100644
--- a/paddle/trainer/tests/test_PyDataProviderWrapper.cpp
+++ b/paddle/trainer/tests/test_PyDataProviderWrapper.cpp
@@ -13,16 +13,16 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #ifndef PADDLE_NO_PYTHON
+#include <DataConfig.pb.h>
 #include <gtest/gtest.h>
-#include <paddle/utils/PythonUtil.h>
 #include <paddle/gserver/dataproviders/DataProvider.h>
-#include <DataConfig.pb.h>
 #include <paddle/math/Matrix.h>
 #include <paddle/parameter/Argument.h>
+#include <paddle/utils/PythonUtil.h>
+#include <fstream>
+#include <typeinfo>
 #include <unordered_map>
 #include <unordered_set>
-#include <typeinfo>
-#include <fstream>
 #include "picojson.h"
 
 void checkEqual(const paddle::Argument& expect, const paddle::Argument& actual);
diff --git a/paddle/trainer/tests/test_Trainer.cpp b/paddle/trainer/tests/test_Trainer.cpp
index 0fede59f8d8d62edcdb2d030952d0e738452160a..371282dd6bb9a995bc6ae8b2a5bd708f831d7e33 100644
--- a/paddle/trainer/tests/test_Trainer.cpp
+++ b/paddle/trainer/tests/test_Trainer.cpp
@@ -28,10 +28,10 @@ static const string& configFile3 = "trainer/tests/chunking.conf";
 static const string& configFile4 =
     "trainer/tests/sample_trainer_config_parallel.conf";
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_string(config);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_bool(allow_only_one_model_on_one_gpu);
+DECLARE_bool(use_gpu);
+DECLARE_string(config);
+DECLARE_int32(gpu_id);
+DECLARE_bool(allow_only_one_model_on_one_gpu);
 
 void checkGradientTest(const string& configFile,
                        bool useGpu,
diff --git a/paddle/trainer/tests/test_TrainerOnePass.cpp b/paddle/trainer/tests/test_TrainerOnePass.cpp
index 1d9dce1b0e044c9445c5b559b9273a24c8fd8785..ee21008aec56da289dab88f72f57a1703e392fad 100644
--- a/paddle/trainer/tests/test_TrainerOnePass.cpp
+++ b/paddle/trainer/tests/test_TrainerOnePass.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <paddle/utils/PythonUtil.h>
 #include <paddle/utils/GlobalConstants.h>
+#include <paddle/utils/PythonUtil.h>
 #include "paddle/trainer/Trainer.h"
 #include "paddle/trainer/TrainerInternal.h"
 
@@ -27,12 +27,12 @@ static const string& configFile1 = "trainer/tests/sample_trainer_config.conf";
 static const string& configFile2 =
     "trainer/tests/sample_trainer_config_parallel.conf";
 
-P_DECLARE_bool(use_gpu);
-P_DECLARE_string(config);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_int32(seed);
-P_DECLARE_int32(num_passes);
-P_DECLARE_int32(saving_period);
+DECLARE_bool(use_gpu);
+DECLARE_string(config);
+DECLARE_int32(gpu_id);
+DECLARE_int32(seed);
+DECLARE_int32(num_passes);
+DECLARE_int32(saving_period);
 
 class TrainerForTest : public paddle::Trainer {
 public:
@@ -122,10 +122,10 @@ TEST(average_window_cpu, gpu4) {
 #endif
 
 // 3. test trainer + pserver.
-P_DECLARE_int32(num_gradient_servers);
-P_DECLARE_int32(port);
-P_DECLARE_bool(local);
-P_DECLARE_bool(use_old_updater);
+DECLARE_int32(num_gradient_servers);
+DECLARE_int32(port);
+DECLARE_bool(local);
+DECLARE_bool(use_old_updater);
 
 double checkRemoteParameterUpdater(TrainerForTest& trainer) {
   auto gradientMachine = trainer.getGradientMachine();
diff --git a/paddle/trainer/tests/test_recurrent_machine_generation.cpp b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
index b52acc2ca7c658700356e6038754e604df0cf7cd..03446b3b2f6d5ff42fbf0d735a24d88bd0429747 100644
--- a/paddle/trainer/tests/test_recurrent_machine_generation.cpp
+++ b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #include <fstream>
 
-#include <paddle/utils/PythonUtil.h>
 #include <paddle/trainer/Trainer.h>
+#include <paddle/utils/PythonUtil.h>
 
 #include <gtest/gtest.h>
 
@@ -30,7 +30,7 @@ static string modelDir = "trainer/tests/rnn_gen_test_model_dir/t1";  // NOLINT
 static string expectFile =                                           // NOLINT
     "trainer/tests/rnn_gen_test_model_dir/r1.test";                  // NOLINT
 
-P_DECLARE_string(config_args);
+DECLARE_string(config_args);
 
 vector<float> readRetFile(const string& fname) {
   ifstream inFile(fname);
diff --git a/paddle/utils/BarrierStat.cpp b/paddle/utils/BarrierStat.cpp
index 5040deefd074cbec86e86405f3466656355d1da1..a6dbdcae3f32c894d35e8114488d4a3264c6c5f2 100644
--- a/paddle/utils/BarrierStat.cpp
+++ b/paddle/utils/BarrierStat.cpp
@@ -12,23 +12,23 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/utils/BarrierStat.h"
+#include <string.h>
 #include <sys/types.h>
-#include <iomanip>
 #include <algorithm>
-#include <string.h>
-#include "paddle/utils/Stat.h"
-#include "paddle/utils/BarrierStat.h"
+#include <iomanip>
 #include "paddle/utils/Flags.h"
+#include "paddle/utils/Stat.h"
 
-P_DEFINE_bool(log_barrier_abstract,
-              true,
-              "if true, show abstract of barrier performance");
-P_DEFINE_int32(log_barrier_lowest_nodes,
-               5,
-               "how many lowest node will be logged");
-P_DEFINE_bool(log_barrier_show_log,
-              false,  // for performance tuning insight
-              "if true, always show barrier abstract even with little gap");
+DEFINE_bool(log_barrier_abstract,
+            true,
+            "if true, show abstract of barrier performance");
+DEFINE_int32(log_barrier_lowest_nodes,
+             5,
+             "how many lowest node will be logged");
+DEFINE_bool(log_barrier_show_log,
+            false,  // for performance tuning insight
+            "if true, always show barrier abstract even with little gap");
 
 namespace paddle {
 
diff --git a/paddle/utils/BarrierStat.h b/paddle/utils/BarrierStat.h
index 3c5c0885d6ef71d2eac92cc1928ffe54dea73c96..a9c925eff66838d58d540d7be5476e6207a30bec 100644
--- a/paddle/utils/BarrierStat.h
+++ b/paddle/utils/BarrierStat.h
@@ -15,18 +15,17 @@ limitations under the License. */
 #pragma once
 
 #include <stdint.h>
-#include <string>
 #include <sys/time.h>
-#include <memory>
 #include <iostream>
+#include <list>
+#include <memory>
 #include <mutex>
+#include <string>
 #include <unordered_map>
-#include <list>
 
-#include "Logging.h"
 #include "Locks.h"
+#include "Logging.h"
 #include "ThreadLocal.h"
-#include "Stat.h"
 
 namespace paddle {
 
diff --git a/paddle/utils/CommandLineParser.cpp b/paddle/utils/CommandLineParser.cpp
index 14f83241c58c7ca0f7253c4020e3c9cd7bbf11de..63f16bc54c575a0d5ae02141be3c467ee784b095 100644
--- a/paddle/utils/CommandLineParser.cpp
+++ b/paddle/utils/CommandLineParser.cpp
@@ -13,223 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "CommandLineParser.h"
-#ifndef PADDLE_USE_GFLAGS
-#include "paddle/utils/StringUtil.h"
-#include <algorithm>
-#include <iostream>
-#include <iomanip>
-#include <stdlib.h>
-#include <string>
-#include <vector>
-#include <utility>
-#include <tuple>
 
-namespace paddle {
-
-static constexpr int kStatusOK = 0;
-static constexpr int kStatusInvalid = 1;
-static constexpr int kStatusNotFound = 2;
-
-/**
- * \brief: Convert a string to any type value.
- *
- * \note: It will specialize by type T that is supported.
- */
-template <typename T>
-bool StringToValue(const std::string& content, T* value) {
-  bool ok;
-  *value = str::toWithStatus<T>(content, &ok);
-  return ok;
-}
-
-template <>
-bool StringToValue<bool>(const std::string& content, bool* value) {
-  std::string tmp = content;
-
-  std::transform(tmp.begin(),
-                 tmp.end(),
-                 tmp.begin(),
-                 [](char in) -> char {
-                   if (in <= 'Z' && in >= 'A') {
-                     return in - ('Z' - 'z');
-                   } else {
-                     return in;
-                   }
-                 });  // tolower.
-
-  if (tmp == "true" || tmp == "1") {
-    *value = true;
-    return true;
-  } else if (tmp == "false" || tmp == "0") {
-    *value = false;
-    return true;
-  } else {
-    return false;
-  }
-}
-
-template <>
-bool StringToValue<std::string>(const std::string& content,
-                                std::string* value) {
-  *value = content;
-  return true;
-}
-
-/**
- * \brief Parse argument "--blah=blah".
- *
- * \param argument: The command line argument string, such as "--blah=blah"
- * \param [out] extraInfo: The details error message for parse argument.
- * \return: kStatusOK, kStatusInvalid, kStatusNotFound
- */
-template <typename T>
-int ParseArgument(const std::string& argument, std::string* extraInfo) {
-  for (auto& command :
-       flags_internal::CommandLineFlagRegistry<T>::Instance()->commands) {
-    std::string& name = command.name;
-    T* value = command.value;
-
-    std::string prefix = "--";
-    prefix += name;
-    prefix += "=";
-    std::string content;
-    if (str::startsWith(argument, prefix)) {
-      content = argument.substr(prefix.size(), argument.size() - prefix.size());
-    } else {
-      prefix = "-";
-      prefix += name;
-      prefix += "=";
-      if (str::startsWith(argument, prefix)) {
-        content =
-            argument.substr(prefix.size(), argument.size() - prefix.size());
-      }
-    }
-
-    if (!content.empty()) {
-      if (StringToValue(content, value)) {
-        return kStatusOK;
-      } else {
-        *extraInfo = name;
-        return kStatusInvalid;
-      }
-    }
-  }
-  return kStatusNotFound;
-}
-
-/**
- * @brief ParseBoolArgumentExtra
- * parse '--flag_name', '-flag_name' as true; '--noflag_name', '-noflag_name' as
- * false
- */
-static int ParseBoolArgumentExtra(const std::string& argument,
-                                  std::string* extraInfo) {
-  (void)(extraInfo);  // unused extraInfo, just make api same.
-
-  //! @warning: The order and content of prefixes is DESIGNED for parsing
-  //! command line. The length of prefixes are 1, 2, 3, 4. The parse logic takes
-  //! use of this fact. DO NOT CHANGE IT without reading how to parse command
-  //! below.
-  static const std::vector<std::pair<const char*, bool>> prefixes = {
-      {"-", true}, {"--", true}, {"-no", false}, {"--no", false}};
-
-  for (flags_internal::CommandLineFlagRegistry<bool>::Command& command :
-       flags_internal::CommandLineFlagRegistry<bool>::Instance()->commands) {
-    if (argument.size() > command.name.size()) {
-      //! Use the length of prefix is 1, 2, 3, 4.
-      size_t diff = argument.size() - command.name.size() - 1UL;
-      if (diff < prefixes.size()) {
-        const std::string& prefix = std::get<0>(prefixes[diff]);
-        if (argument == prefix + command.name) {
-          *command.value = std::get<1>(prefixes[diff]);
-          return kStatusOK;
-        }
-      }
-    }
-  }
-  return kStatusNotFound;
-}
-
-/**
- * \brief: Print command line arguments' usage with type T.
- */
-template <typename T>
-static void PrintTypeUsage() {
-  for (auto& command :
-       flags_internal::CommandLineFlagRegistry<T>::Instance()->commands) {
-    std::string& name = command.name;
-    name = "--" + name;  // Program will exit, so modify name is safe.
-    std::string& desc = command.text;
-    T& defaultValue = command.defaultValue;
-    std::cerr << std::setw(20) << name << ": " << desc
-              << "[default:" << defaultValue << "]." << std::endl;
-  }
-}
-
-template <typename... TS>
-static void PrintTypeUsages() {
-  int unused[] = {0, (PrintTypeUsage<TS>(), 0)...};
-  (void)(unused);
-}
-/**
- * \brief: Print all usage, and exit(1)
- */
-static void PrintUsageAndExit(const char* argv0) {
-  std::cerr << "Program " << argv0 << " Flags: " << std::endl;
-  PrintTypeUsages<bool, int32_t, std::string, double, int64_t, uint64_t>();
-  exit(1);
-}
-
-/**
- * \brief: Print the error flags, usage, and exit.
- */
-static void PrintParseError(const std::string& name,
-                            const char* actualInput,
-                            const char* arg0) {
-  std::cerr << "Parse command flag " << name << " error! User input is "
-            << actualInput << std::endl;
-  PrintUsageAndExit(arg0);
-}
-
-void ParseCommandLineFlags(int* argc, char** argv, bool withHelp) {
-  int unused_argc = 1;
-  std::string extra;
-  for (int i = 1; i < *argc; ++i) {
-    std::string arg = argv[i];
-    int s = kStatusInvalid;
-#define ParseArgumentWithType(type)           \
-  s = ParseArgument<type>(arg, &extra);       \
-  if (s == kStatusOK) {                       \
-    continue;                                 \
-  } else if (s == kStatusInvalid) {           \
-    PrintParseError(extra, argv[i], argv[0]); \
-  }
-
-    ParseArgumentWithType(bool);  // NOLINT
-    ParseArgumentWithType(int32_t);
-    ParseArgumentWithType(double);  // NOLINT
-    ParseArgumentWithType(int64_t);
-    ParseArgumentWithType(uint64_t);
-    ParseArgumentWithType(std::string);
-
-#undef ParseArgumentWithType
-    s = ParseBoolArgumentExtra(arg, &extra);
-    if (s == kStatusOK) {
-      continue;
-    }
-
-    if (withHelp && (arg == "--help" || arg == "-h")) {
-      PrintUsageAndExit(argv[0]);
-    }
-
-    // NOT Found for all flags.
-    std::swap(argv[unused_argc++], argv[i]);
-  }
-  *argc = unused_argc;
-}
-
-}  // namespace paddle
-#else
 namespace paddle {
 #ifndef GFLAGS_NS
 #define GFLAGS_NS google
@@ -246,4 +30,3 @@ void ParseCommandLineFlags(int* argc, char** argv, bool withHelp) {
 }
 
 }  // namespace paddle
-#endif
diff --git a/paddle/utils/CommandLineParser.h b/paddle/utils/CommandLineParser.h
index 3d25bc3b0b50a210b7de179427a15350fa3ac981..4e89f90bb910cee1adc7fb8dace81ff58435351f 100644
--- a/paddle/utils/CommandLineParser.h
+++ b/paddle/utils/CommandLineParser.h
@@ -13,167 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#ifndef PADDLE_USE_GFLAGS
-#include "DisableCopy.h"
-#include <string>
-#include <vector>
-#include <stdint.h>
 
-namespace paddle {
-
-namespace flags_internal {
-
-/**
- * Command line flag registry for special type T. It will store all command
- * arguments settings. such as name, default value.
- */
-template <typename T>
-struct CommandLineFlagRegistry {
-  /**
-   * The factory method of CommandLineFlagRegistry
-   *
-   * \return: The singleton instance of CommandLineFlagRegistry.
-   */
-  static CommandLineFlagRegistry* Instance() {
-    static CommandLineFlagRegistry instance_;
-    return &instance_;
-  }
-
-  struct Command {
-    /// name of argument.
-    std::string name;
-    /// address of actual variable. such as FLAGS_xxx.
-    T* value;
-    /// usage text.
-    std::string text;
-    /// default value of this command.
-    T defaultValue;
-  };
-
-  /// the command line arguments of type T.
-  std::vector<Command> commands;
-
-  DISABLE_COPY(CommandLineFlagRegistry);
-
-private:
-  inline CommandLineFlagRegistry() {}
-};
-
-/**
- *Helper class to register command line flag.
- */
-template <typename T>
-struct CommandLineFlagRegister {
-  /**
-   * \brief: Register a command line argument
-   *
-   * \param [in] name: The command line name.
-   * \param [inout] val: The command line argument instance, FLAGS_xxx.
-   * \param [in] desc: The command line helper message.
-   */
-  CommandLineFlagRegister(const std::string& name,
-                          T* val,
-                          const std::string desc) {
-    CommandLineFlagRegistry<T>::Instance()->commands.push_back(
-        {name, val, desc, *val});
-  }
-};
-
-/**
- * \brief: Define a command line arguments.
- *
- * \param type: The variable type, such as int, double, etc.
- * \param name: The variable name. The command line argument is '--name', the
- *variable
- *is 'FLAGS_name'
- * \param default_value: The default value of command line argument.
- * \param text: The description in command line argument.
- */
-#define PADDLE_DEFINE_variable(type, name, default_value, text) \
-  type FLAGS_##name = default_value;                            \
-  namespace paddle_flags_internal {                             \
-  paddle::flags_internal::CommandLineFlagRegister<type>         \
-      flags_internal_var_##name(#name, &FLAGS_##name, text);    \
-  }  // namespace paddle_flags_internal
-
-/**
- * Declare a variable to use.
- */
-#define PADDLE_DECLARE_variable(type, name) extern type FLAGS_##name;
-
-// DEFINE macro for each types.
-#define P_DEFINE_int32(name, default_value, text) \
-  PADDLE_DEFINE_variable(int32_t, name, default_value, text)
-
-#define P_DEFINE_bool(name, default_value, text) \
-  PADDLE_DEFINE_variable(bool, name, default_value, text)
-
-#define P_DEFINE_string(name, default_value, text) \
-  PADDLE_DEFINE_variable(std::string, name, default_value, text)
-
-#define P_DEFINE_double(name, default_value, text) \
-  PADDLE_DEFINE_variable(double, name, default_value, text)
-
-#define P_DEFINE_int64(name, default_value, text) \
-  PADDLE_DEFINE_variable(int64_t, name, default_value, text)
-
-#define P_DEFINE_uint64(name, default_value, text) \
-  PADDLE_DEFINE_variable(uint64_t, name, default_value, text)
-
-// Declare macro for each types.
-#define P_DECLARE_int32(name) PADDLE_DECLARE_variable(int32_t, name)
-#define P_DECLARE_bool(name) PADDLE_DECLARE_variable(bool, name)
-#define P_DECLARE_string(name) PADDLE_DECLARE_variable(std::string, name)
-#define P_DECLARE_double(name) PADDLE_DECLARE_variable(double, name)
-#define P_DECLARE_int64(name) PADDLE_DECLARE_variable(int64_t, name)
-#define P_DECLARE_uint64(name) PADDLE_DECLARE_variable(uint64_t, name)
-}  // namespace flags_internal
-
-/**
- * \brief Parse command line flags. If parse error, just failed and exit 1.
- *
- * \param [inout] argc: The command argument count. This method will modify
- *argc, and left unused arguments.
- * \param [inout] argv: The command argument values. This method will modify
- *argv, and left unused arguments.
- * \param [in] withHelp: True will parse '-h' and '--help' to print usage.
- *
- * \note: The Command line flags format basically as follow:
- *
- *  * If the type of flag is not bool, then the follow format of command line
- *    will be parsed:
- *    * --flag_name=value
- *    * -flag_name=value
- *
- *  * If the flag is bool, then:
- *    * --flag_name=value, -flag_name=value will be parsed.
- *       * if value.tolower() == "true"| "1" will be treated as true.
- *       * else if value.tolower() == "false" | "0" will be treated as false.
- *    * --flag_name will be parsed as true.
- *    * --noflag_name will be parsed as false.
- */
-void ParseCommandLineFlags(int* argc, char** argv, bool withHelp = true);
-
-}  // namespace paddle
-
-#else  // if use gflags.
 #include <gflags/gflags.h>
 
-#define P_DEFINE_int32 DEFINE_int32
-#define P_DEFINE_bool DEFINE_bool
-#define P_DEFINE_string DEFINE_string
-#define P_DEFINE_double DEFINE_double
-#define P_DEFINE_int64 DEFINE_int64
-#define P_DEFINE_uint64 DEFINE_uint64
-#define P_DECLARE_int32 DECLARE_int32
-#define P_DECLARE_bool DECLARE_bool
-#define P_DECLARE_string DECLARE_string
-#define P_DECLARE_double DECLARE_double
-#define P_DECLARE_int64 DECLARE_int64
-#define P_DECLARE_uint64 DECLARE_uint64
 namespace paddle {
 void ParseCommandLineFlags(int* argc, char** argv, bool withHelp = true);
 
 }  // namespace paddle
-
-#endif
diff --git a/paddle/utils/CpuId.cpp b/paddle/utils/CpuId.cpp
index 734b2e09246a56358160f9d1d090e10266bee2fa..8eefdd2980e7f56a836df6fd2ff8c31b81a55555 100644
--- a/paddle/utils/CpuId.cpp
+++ b/paddle/utils/CpuId.cpp
@@ -14,44 +14,48 @@ limitations under the License. */
 
 #ifdef _WIN32
 
+#include <intrin.h>
+
 /// for MSVC
-#define CPUID(info, x)  __cpuidex(info, x, 0)
+#define CPUID(info, x) __cpuidex(info, x, 0)
 
 #else
 
 #include <cpuid.h>
 
 /// for GCC/Clang
-#define CPUID(info, x)  __cpuid_count(x, 0, info[0], info[1], info[2], info[3])
+#define CPUID(info, x) __cpuid_count(x, 0, info[0], info[1], info[2], info[3])
 
 #endif
 
 namespace paddle {
 
 SIMDFlags::SIMDFlags() {
-    unsigned int cpuInfo[4];
-    // CPUID: https://en.wikipedia.org/wiki/CPUID
-    CPUID(cpuInfo, 0x00000001);
-    simd_flags_ |= cpuInfo[3] & (1 << 25) ? SIMD_SSE   : SIMD_NONE;
-    simd_flags_ |= cpuInfo[3] & (1 << 26) ? SIMD_SSE2  : SIMD_NONE;
-    simd_flags_ |= cpuInfo[2] & (1 << 0)  ? SIMD_SSE3  : SIMD_NONE;
-    simd_flags_ |= cpuInfo[2] & (1 << 9)  ? SIMD_SSSE3 : SIMD_NONE;
-    simd_flags_ |= cpuInfo[2] & (1 << 19) ? SIMD_SSE41 : SIMD_NONE;
-    simd_flags_ |= cpuInfo[2] & (1 << 20) ? SIMD_SSE42 : SIMD_NONE;
-    simd_flags_ |= cpuInfo[2] & (1 << 12) ? SIMD_FMA3  : SIMD_NONE;
-    simd_flags_ |= cpuInfo[2] & (1 << 28) ? SIMD_AVX   : SIMD_NONE;
-
-    CPUID(cpuInfo, 0x00000007);
-    simd_flags_ |= cpuInfo[1] & (1 << 5)  ? SIMD_AVX2  : SIMD_NONE;
-    simd_flags_ |= cpuInfo[1] & (1 << 16) ? SIMD_AVX512: SIMD_NONE;
-
-    CPUID(cpuInfo, 0x80000001);
-    simd_flags_ |= cpuInfo[2] & (1 << 16) ? SIMD_FMA4  : SIMD_NONE;
+  unsigned int cpuInfo[4];
+  // CPUID: https://en.wikipedia.org/wiki/CPUID
+  // clang-format off
+  CPUID(cpuInfo, 0x00000001);
+  simd_flags_ |= cpuInfo[3] & (1 << 25) ? SIMD_SSE   : SIMD_NONE;
+  simd_flags_ |= cpuInfo[3] & (1 << 26) ? SIMD_SSE2  : SIMD_NONE;
+  simd_flags_ |= cpuInfo[2] & (1 <<  0) ? SIMD_SSE3  : SIMD_NONE;
+  simd_flags_ |= cpuInfo[2] & (1 <<  9) ? SIMD_SSSE3 : SIMD_NONE;
+  simd_flags_ |= cpuInfo[2] & (1 << 19) ? SIMD_SSE41 : SIMD_NONE;
+  simd_flags_ |= cpuInfo[2] & (1 << 20) ? SIMD_SSE42 : SIMD_NONE;
+  simd_flags_ |= cpuInfo[2] & (1 << 12) ? SIMD_FMA3  : SIMD_NONE;
+  simd_flags_ |= cpuInfo[2] & (1 << 28) ? SIMD_AVX   : SIMD_NONE;
+
+  CPUID(cpuInfo, 0x00000007);
+  simd_flags_ |= cpuInfo[1] & (1 <<  5) ? SIMD_AVX2  : SIMD_NONE;
+  simd_flags_ |= cpuInfo[1] & (1 << 16) ? SIMD_AVX512: SIMD_NONE;
+
+  CPUID(cpuInfo, 0x80000001);
+  simd_flags_ |= cpuInfo[2] & (1 << 16) ? SIMD_FMA4  : SIMD_NONE;
+  // clang-fotmat on
 }
 
-SIMDFlags* SIMDFlags::instance() {
-    static SIMDFlags instance;
-    return &instance;
+SIMDFlags const* SIMDFlags::instance() {
+  static SIMDFlags instance;
+  return &instance;
 }
 
-}   // namespace paddle
+}  // namespace paddle
diff --git a/paddle/utils/CpuId.h b/paddle/utils/CpuId.h
index d15e58d1dddde3263826d22c0a26915a1f09ca71..7a354da75851ed7cca4e85e77714624634951f00 100644
--- a/paddle/utils/CpuId.h
+++ b/paddle/utils/CpuId.h
@@ -11,61 +11,90 @@ limitations under the License. */
 
 #pragma once
 
-#include <iostream>
 #include "DisableCopy.h"
 
 namespace paddle {
 
+// clang-format off
+enum simd_t {
+  SIMD_NONE   = 0,          ///< None
+  SIMD_SSE    = 1 << 0,     ///< SSE
+  SIMD_SSE2   = 1 << 1,     ///< SSE 2
+  SIMD_SSE3   = 1 << 2,     ///< SSE 3
+  SIMD_SSSE3  = 1 << 3,     ///< SSSE 3
+  SIMD_SSE41  = 1 << 4,     ///< SSE 4.1
+  SIMD_SSE42  = 1 << 5,     ///< SSE 4.2
+  SIMD_FMA3   = 1 << 6,     ///< FMA 3
+  SIMD_FMA4   = 1 << 7,     ///< FMA 4
+  SIMD_AVX    = 1 << 8,     ///< AVX
+  SIMD_AVX2   = 1 << 9,     ///< AVX 2
+  SIMD_AVX512 = 1 << 10,    ///< AVX 512
+};
+// clang-format on
+
 class SIMDFlags final {
 public:
-    DISABLE_COPY(SIMDFlags);
+  DISABLE_COPY(SIMDFlags);
 
-    SIMDFlags();
+  SIMDFlags();
 
-    static SIMDFlags* instance();
+  static SIMDFlags const* instance();
 
-    inline bool isSSE()   const { return simd_flags_ & SIMD_SSE;   }
-    inline bool isSSE2()  const { return simd_flags_ & SIMD_SSE2;  }
-    inline bool isSSE3()  const { return simd_flags_ & SIMD_SSE3;  }
-    inline bool isSSSE3() const { return simd_flags_ & SIMD_SSSE3; }
-    inline bool isSSE41() const { return simd_flags_ & SIMD_SSE41; }
-    inline bool isSSE42() const { return simd_flags_ & SIMD_SSE42; }
-    inline bool isFMA3()  const { return simd_flags_ & SIMD_FMA3;  }
-    inline bool isFMA4()  const { return simd_flags_ & SIMD_FMA4;  }
-    inline bool isAVX()   const { return simd_flags_ & SIMD_AVX;   }
-    inline bool isAVX2()  const { return simd_flags_ & SIMD_AVX2;  }
-    inline bool isAVX512()const { return simd_flags_ & SIMD_AVX512;}
+  inline bool check(int flags) const {
+    return !((simd_flags_ & flags) ^ flags);
+  }
 
 private:
-    enum simd_t {
-        SIMD_NONE     = 0,        ///< None
-        SIMD_SSE      = 1 << 0,   ///< SSE
-        SIMD_SSE2     = 1 << 1,   ///< SSE 2
-        SIMD_SSE3     = 1 << 2,   ///< SSE 3
-        SIMD_SSSE3    = 1 << 3,   ///< SSSE 3
-        SIMD_SSE41    = 1 << 4,   ///< SSE 4.1
-        SIMD_SSE42    = 1 << 5,   ///< SSE 4.2
-        SIMD_FMA3     = 1 << 6,   ///< FMA 3
-        SIMD_FMA4     = 1 << 7,   ///< FMA 4
-        SIMD_AVX      = 1 << 8,   ///< AVX
-        SIMD_AVX2     = 1 << 9,   ///< AVX 2
-        SIMD_AVX512   = 1 << 10,  ///< AVX 512
-    };
-
-    /// simd flags
-    int simd_flags_ = SIMD_NONE;
+  int simd_flags_ = SIMD_NONE;
 };
 
-#define HAS_SSE      SIMDFlags::instance()->isSSE()
-#define HAS_SSE2     SIMDFlags::instance()->isSSE2()
-#define HAS_SSE3     SIMDFlags::instance()->isSSE3()
-#define HAS_SSSE3    SIMDFlags::instance()->isSSSE3()
-#define HAS_SSE41    SIMDFlags::instance()->isSSE41()
-#define HAS_SSE42    SIMDFlags::instance()->isSSE42()
-#define HAS_FMA3     SIMDFlags::instance()->isFMA3()
-#define HAS_FMA4     SIMDFlags::instance()->isFMA4()
-#define HAS_AVX      SIMDFlags::instance()->isAVX()
-#define HAS_AVX2     SIMDFlags::instance()->isAVX2()
-#define HAS_AVX512   SIMDFlags::instance()->isAVX512()
+/**
+ * @brief   Check SIMD flags at runtime.
+ *
+ * For example.
+ * @code{.cpp}
+ *
+ * if (HAS_SIMD(SIMD_AVX2 | SIMD_FMA4)) {
+ *      avx2_fm4_stub();
+ * } else if (HAS_SIMD(SIMD_AVX)) {
+ *      avx_stub();
+ * }
+ *
+ * @endcode
+ */
+#define HAS_SIMD(__flags) SIMDFlags::instance()->check(__flags)
+
+/**
+ * @brief   Check SIMD flags at runtime.
+ *
+ * 1. Check all SIMD flags at runtime:
+ *
+ * @code{.cpp}
+ * if (HAS_AVX && HAS_AVX2) {
+ *      avx2_stub();
+ * }
+ * @endcod
+ *
+ * 2. Check one SIMD flag at runtime:
+ *
+ * @code{.cpp}
+ * if (HAS_SSE41 || HAS_SSE42) {
+ *      sse4_stub();
+ * }
+ * @endcode
+ */
+// clang-format off
+#define HAS_SSE     HAS_SIMD(SIMD_SSE)
+#define HAS_SSE2    HAS_SIMD(SIMD_SSE2)
+#define HAS_SSE3    HAS_SIMD(SIMD_SSE3)
+#define HAS_SSSE3   HAS_SIMD(SIMD_SSSE3)
+#define HAS_SSE41   HAS_SIMD(SIMD_SSE41)
+#define HAS_SSE42   HAS_SIMD(SIMD_SSE42)
+#define HAS_FMA3    HAS_SIMD(SIMD_FMA3)
+#define HAS_FMA4    HAS_SIMD(SIMD_FMA4)
+#define HAS_AVX     HAS_SIMD(SIMD_AVX)
+#define HAS_AVX2    HAS_SIMD(SIMD_AVX2)
+#define HAS_AVX512  HAS_SIMD(SIMD_AVX512)
+// clang-format on
 
-}   // namespace paddle
+}  // namespace paddle
diff --git a/paddle/utils/CustomStackTrace.cpp b/paddle/utils/CustomStackTrace.cpp
index 730788cb9893b93208ed6d55dbcd2231ee8495e1..66b38218a7c7ec146f366ded516ebe22d012e47f 100644
--- a/paddle/utils/CustomStackTrace.cpp
+++ b/paddle/utils/CustomStackTrace.cpp
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "CustomStackTrace.h"
-#include "CommandLineParser.h"
 #include <iostream>
+#include "CommandLineParser.h"
 
-P_DEFINE_bool(
+DEFINE_bool(
     layer_stack_error_only_current_thread,
     true,
     "Dump current thread or whole process layer stack when signal error "
diff --git a/paddle/utils/CustomStackTrace.h b/paddle/utils/CustomStackTrace.h
index 5686f3c84ce72622fbf2f894409965f2a0cab103..6992e856223494d6575ef3261d82cbdf4e375885 100644
--- a/paddle/utils/CustomStackTrace.h
+++ b/paddle/utils/CustomStackTrace.h
@@ -14,10 +14,10 @@ limitations under the License. */
 
 #pragma once
 
+#include <functional>
 #include <stack>
 #include <thread>
 #include <unordered_map>
-#include <functional>
 
 #include "ThreadLocal.h"
 
@@ -96,7 +96,8 @@ public:
    */
   typedef std::function<void(const std::thread::id& /*threadId*/,
                              bool* /*isPushing*/,
-                             const T& /*item*/)> DumpCallback;
+                             const T& /*item*/)>
+      DumpCallback;
 
   /**
    * Dump all thread stack, and all stack will be cleared.
diff --git a/paddle/utils/Flags.cpp b/paddle/utils/Flags.cpp
index 1c9e602f45a818824a34aca23ef8f52a5e14cd17..59d6cbdc513660b87cb013d8aa92c5c8f9289ecb 100644
--- a/paddle/utils/Flags.cpp
+++ b/paddle/utils/Flags.cpp
@@ -15,65 +15,61 @@ limitations under the License. */
 #include "Flags.h"
 
 #ifdef PADDLE_ONLY_CPU
-P_DEFINE_bool(use_gpu, false, "Only support CPU training");
+DEFINE_bool(use_gpu, false, "Only support CPU training");
 #else
-P_DEFINE_bool(use_gpu, true, "Whether to use GPU for training");
+DEFINE_bool(use_gpu, true, "Whether to use GPU for training");
 #endif
 
-P_DEFINE_bool(
-    parallel_nn,
-    false,
-    "Whether to use multi-threads to calculate one neural network."
-    "If it was set false, use gpu_id specify which gpu core to use"
-    "(the device property in the trainer config file will be ingored)."
-    "If it was set true, the gpu core is specified by the trainer"
-    "  config file(gpu_id will be ignored).");
-P_DEFINE_int32(trainer_count, 1, "Defined how many trainers to train");
-P_DEFINE_int32(gpu_id, 0, "Which gpu core to use");
-P_DEFINE_int32(port, 20134, "Listening port for pserver");
-P_DEFINE_int32(data_server_port, 21134, "Listening port for dserver");
-P_DEFINE_int32(ports_num,
-               1,
-               "The ports number for parameter send,"
-               " increment based on default port number");
-P_DEFINE_int32(ports_num_for_sparse,
-               0,
-               "The ports number for parameter send,"
-               " increment based on default (port + ports_num)");
-P_DEFINE_string(nics, "xgbe0,xgbe1", "network device name for pservers");
-P_DEFINE_string(rdma_tcp, "tcp", "use rdma or tcp rdma transport protocol");
-P_DEFINE_int32(
-    trainer_id,
-    0,
-    "For distributed training, each trainer must be given an unique id"
-    " ranging from 0 to num_trainers-1. Trainer 0 is the master"
-    " trainer");
-P_DEFINE_int32(num_gradient_servers, 1, "number of gradient servers");
-P_DEFINE_string(comment, "", "A string for commenting this training task");
-P_DEFINE_string(load_missing_parameter_strategy,
-                "fail",
-                "which operation to take on load model fails. support "
-                "fail/rand/zero only.");
-P_DEFINE_int32(log_period, 100, "Log progress every so many batches");
-P_DEFINE_int32(log_period_server,
-               500,
-               "Log progress every so many batches at pserver end");
-P_DEFINE_double(checkgrad_eps, 1e-5, "parameter change size for checkgrad");
-P_DEFINE_int32(enable_parallel_vector,
-               0,
-               "threshold for enable parallel vector");
-P_DEFINE_bool(loadsave_parameters_in_pserver,
-              false,
-              "load and save parameters in pserver. "
-              "only work while parameter set sparse_remote_update.");
-P_DEFINE_int32(beam_size,
-               1,
-               "Beam size used in generating most probable output sequences.");
+DEFINE_bool(parallel_nn,
+            false,
+            "Whether to use multi-threads to calculate one neural network."
+            "If it was set false, use gpu_id specify which gpu core to use"
+            "(the device property in the trainer config file will be ingored)."
+            "If it was set true, the gpu core is specified by the trainer"
+            "  config file(gpu_id will be ignored).");
+DEFINE_int32(trainer_count, 1, "Defined how many trainers to train");
+DEFINE_int32(gpu_id, 0, "Which gpu core to use");
+DEFINE_int32(port, 20134, "Listening port for pserver");
+DEFINE_int32(data_server_port, 21134, "Listening port for dserver");
+DEFINE_int32(ports_num,
+             1,
+             "The ports number for parameter send,"
+             " increment based on default port number");
+DEFINE_int32(ports_num_for_sparse,
+             0,
+             "The ports number for parameter send,"
+             " increment based on default (port + ports_num)");
+DEFINE_string(nics, "xgbe0,xgbe1", "network device name for pservers");
+DEFINE_string(rdma_tcp, "tcp", "use rdma or tcp rdma transport protocol");
+DEFINE_int32(trainer_id,
+             0,
+             "For distributed training, each trainer must be given an unique id"
+             " ranging from 0 to num_trainers-1. Trainer 0 is the master"
+             " trainer");
+DEFINE_int32(num_gradient_servers, 1, "number of gradient servers");
+DEFINE_string(comment, "", "A string for commenting this training task");
+DEFINE_string(load_missing_parameter_strategy,
+              "fail",
+              "which operation to take on load model fails. support "
+              "fail/rand/zero only.");
+DEFINE_int32(log_period, 100, "Log progress every so many batches");
+DEFINE_int32(log_period_server,
+             500,
+             "Log progress every so many batches at pserver end");
+DEFINE_double(checkgrad_eps, 1e-5, "parameter change size for checkgrad");
+DEFINE_int32(enable_parallel_vector, 0, "threshold for enable parallel vector");
+DEFINE_bool(loadsave_parameters_in_pserver,
+            false,
+            "load and save parameters in pserver. "
+            "only work while parameter set sparse_remote_update.");
+DEFINE_int32(beam_size,
+             1,
+             "Beam size used in generating most probable output sequences.");
 
-P_DEFINE_bool(show_layer_stat, false, "show the statistics of each layer");
-P_DEFINE_string(predict_file, "", "File name for saving predict result");
-P_DEFINE_bool(prev_batch_state, false, "batch is continue with next batch");
-P_DEFINE_string(init_model_path,
-                "",
-                "Path of the initial model parameters."
-                "If it was set, start_pass will be ignored.");
+DEFINE_bool(show_layer_stat, false, "show the statistics of each layer");
+DEFINE_string(predict_file, "", "File name for saving predict result");
+DEFINE_bool(prev_batch_state, false, "batch is continue with next batch");
+DEFINE_string(init_model_path,
+              "",
+              "Path of the initial model parameters."
+              "If it was set, start_pass will be ignored.");
diff --git a/paddle/utils/Flags.h b/paddle/utils/Flags.h
index 922533d63e7f0c28a1dcec6b4d9f453f1794abb5..2ebbcb24eb061531d0807756528d7bf16e6aa124 100644
--- a/paddle/utils/Flags.h
+++ b/paddle/utils/Flags.h
@@ -16,28 +16,28 @@ limitations under the License. */
 
 #include "CommandLineParser.h"
 
-P_DECLARE_bool(parallel_nn);
-P_DECLARE_int32(async_count);
-P_DECLARE_int32(port);
-P_DECLARE_int32(data_server_port);
-P_DECLARE_bool(use_gpu);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_int32(trainer_count);
-P_DECLARE_int32(ports_num);
-P_DECLARE_int32(ports_num_for_sparse);
-P_DECLARE_string(nics);
-P_DECLARE_string(rdma_tcp);
-P_DECLARE_int32(trainer_id);
-P_DECLARE_int32(num_gradient_servers);
-P_DECLARE_string(comment);
-P_DECLARE_string(load_missing_parameter_strategy);
-P_DECLARE_int32(log_period);
-P_DECLARE_int32(log_period_server);
-P_DECLARE_double(checkgrad_eps);
-P_DECLARE_int32(enable_parallel_vector);
-P_DECLARE_bool(loadsave_parameters_in_pserver);
-P_DECLARE_int32(beam_size);
-P_DECLARE_bool(show_layer_stat);
-P_DECLARE_string(predict_file);
-P_DECLARE_bool(prev_batch_state);
-P_DECLARE_string(init_model_path);
+DECLARE_bool(parallel_nn);
+DECLARE_int32(async_count);
+DECLARE_int32(port);
+DECLARE_int32(data_server_port);
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_int32(trainer_count);
+DECLARE_int32(ports_num);
+DECLARE_int32(ports_num_for_sparse);
+DECLARE_string(nics);
+DECLARE_string(rdma_tcp);
+DECLARE_int32(trainer_id);
+DECLARE_int32(num_gradient_servers);
+DECLARE_string(comment);
+DECLARE_string(load_missing_parameter_strategy);
+DECLARE_int32(log_period);
+DECLARE_int32(log_period_server);
+DECLARE_double(checkgrad_eps);
+DECLARE_int32(enable_parallel_vector);
+DECLARE_bool(loadsave_parameters_in_pserver);
+DECLARE_int32(beam_size);
+DECLARE_bool(show_layer_stat);
+DECLARE_string(predict_file);
+DECLARE_bool(prev_batch_state);
+DECLARE_string(init_model_path);
diff --git a/paddle/utils/Logging.cpp b/paddle/utils/Logging.cpp
index 3c31633e58a2b958c804c0c6934830d8e3a928e4..5a1c6ecb2219f7983609c27f3215c7fc1e9e9ef2 100644
--- a/paddle/utils/Logging.cpp
+++ b/paddle/utils/Logging.cpp
@@ -18,175 +18,9 @@ limitations under the License. */
  */
 
 #include "Logging.h"
-#ifndef PADDLE_USE_GLOG
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <vector>
-#include <thread>
-#include <mutex>
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
 
 namespace paddle {
 
-namespace internal {
-
-std::string join(const std::string& part1, const std::string& part2) {
-  const char sep = '/';
-  if (!part2.empty() && part2.front() == sep) {
-    return part2;
-  }
-  std::string ret;
-  ret.reserve(part1.size() + part2.size() + 1);
-  ret = part1;
-  if (!ret.empty() && ret.back() != sep) {
-    ret += sep;
-  }
-  ret += part2;
-  return ret;
-}
-
-static inline bool env2bool(const char* envName, bool defaultValue = false) {
-  char* envValue = getenv(envName);
-  if (envValue == nullptr) {
-    return defaultValue;
-  } else {
-    return memchr("tTyY1\0", envValue[0], 6) != nullptr;
-  }
-}
-
-static inline int env2int(const char* envName, int defaultValue = 0) {
-  char* envValue = getenv(envName);
-  if (envValue == nullptr) {
-    return defaultValue;
-  } else {
-    int retValue = defaultValue;
-    try {
-      retValue = std::stoi(envValue);
-    } catch (...) {
-      // pass
-    }
-    return retValue;
-  }
-}
-
-static inline int env2index(const char* envName,
-                            const std::vector<std::string>& options,
-                            int defaultValue) {
-  char* envValue = getenv(envName);
-  if (envValue == nullptr) {
-    return defaultValue;
-  } else {
-    for (size_t i = 0; i < options.size(); ++i) {
-      if (options[i] == envValue) {
-        return static_cast<int>(i);
-      }
-    }
-    return defaultValue;
-  }
-}
-
-static bool gLogToStderr = env2bool("PLOG_LOGTOSTDERR", true);
-static const std::vector<std::string> gLevelName = {
-    "INFO", "WARNING", "ERROR", "FATAL"};
-static int gMinLogLevel =
-    env2int("PLOG_MINLOGLEVEL", env2index("PLOG_MINLOGLEVEL", gLevelName, 0));
-
-static std::vector<std::vector<int>> gLogFds;
-static std::vector<int> gLogFileFds;
-static bool gLogInited = false;
-static void freeLogFileFds() {
-  for (auto fd : gLogFileFds) {
-    close(fd);
-  }
-}
-
-static void initializeLogFds(char* argv0) {
-  gLogFds.resize(NUM_SEVERITIES);
-
-  for (int i = gMinLogLevel; i < NUM_SEVERITIES && gLogToStderr;
-       ++i) {  // Add stderr
-    std::vector<int>& fds = gLogFds[i];
-    fds.push_back(STDERR_FILENO);
-  }
-
-  char* logDir = getenv("PLOG_LOGDIR");
-
-  for (int i = gMinLogLevel; i < NUM_SEVERITIES && logDir != nullptr; ++i) {
-    std::string filename =
-        join(logDir, std::string(argv0) + "." + gLevelName[i]);
-    int fd = open(filename.c_str(), O_CREAT | O_WRONLY, 0644);
-    if (fd == -1) {
-      fprintf(stderr, "Open log file error!");
-      exit(1);
-    }
-    gLogFileFds.push_back(fd);
-
-    std::vector<int>& curFds = gLogFds[i];
-    curFds.insert(curFds.end(), gLogFileFds.begin(), gLogFileFds.end());
-  }
-
-  atexit(freeLogFileFds);
-  gLogInited = true;
-}
-
-static void (*gFailureFunctionPtr)() ATTR_NORETURN = abort;
-
-LogMessage::LogMessage(const char* fname, int line, int severity)
-    : fname_(fname), line_(line), severity_(severity) {}
-
-LogMessage::~LogMessage() { this->generateLogMessage(); }
-
-void LogMessage::generateLogMessage() {
-  if (!gLogInited) {
-    fprintf(stderr,
-            "%c %s:%d] %s\n",
-            "IWEF"[severity_],
-            fname_,
-            line_,
-            str().c_str());
-  } else {
-    for (auto& fd : gLogFds[this->severity_]) {
-      dprintf(fd,
-              "%c %s:%d] %s\n",
-              "IWEF"[severity_],
-              fname_,
-              line_,
-              str().c_str());
-    }
-  }
-}
-
-LogMessageFatal::LogMessageFatal(const char* file, int line)
-    : LogMessage(file, line, FATAL) {}
-
-LogMessageFatal::~LogMessageFatal() {
-  generateLogMessage();
-  gFailureFunctionPtr();
-}
-}  // namespace internal
-
-void initializeLogging(int argc, char** argv) {
-  internal::initializeLogFds(argv[0]);
-}
-
-namespace logging {
-void setMinLogLevel(int level) { paddle::internal::gMinLogLevel = level; }
-
-void installFailureFunction(void (*callback)() ATTR_NORETURN) {
-  paddle::internal::gFailureFunctionPtr = callback;
-}
-
-}  // namespace logging
-
-}  // namespace paddle
-
-#else
-namespace paddle {
 void initializeLogging(int argc, char** argv) {
   (void)(argc);
   if (!getenv("GLOG_logtostderr")) {
@@ -197,13 +31,16 @@ void initializeLogging(int argc, char** argv) {
 }
 
 namespace logging {
+
 void setMinLogLevel(int level) { FLAGS_minloglevel = level; }
+
 void installFailureFunction(void (*callback)()) {
   google::InstallFailureFunction(callback);
 }
+
 void installFailureWriter(void (*callback)(const char*, int)) {
   google::InstallFailureWriter(callback);
 }
+
 }  // namespace logging
 }  // namespace paddle
-#endif
diff --git a/paddle/utils/Logging.h b/paddle/utils/Logging.h
index c91ca9fecc5e74334f419a2e9631f9556945923c..d9e551f0891fa0808b8699aea94a0d2ab4f81cb3 100644
--- a/paddle/utils/Logging.h
+++ b/paddle/utils/Logging.h
@@ -18,179 +18,25 @@ limitations under the License. */
  */
 
 #pragma once
-#include <sstream>
 #include <memory>
+#include <sstream>
 #include <string>
 
-#ifndef PADDLE_USE_GLOG
-#include "CompilerMacros.h"
-
-//! TODO(yuyang18): Move this utility macro into some global header.
-#define PP_CAT(a, b) PP_CAT_I(a, b)
-#define PP_CAT_I(a, b) PP_CAT_II(~, a##b)
-#define PP_CAT_II(p, res) res
-
-/**
- * Generate Unique Variable Name, Usefully in macro.
- * @SEE
- * http://stackoverflow.com/questions/1082192/how-to-generate-random-variable-names-in-c-using-macros
- */
-#define UNIQUE_NAME(base) PP_CAT(base, __LINE__)
-
+#include <glog/logging.h>
 namespace paddle {
 
-//! Log levels.
-const int INFO = 0;
-const int WARNING = 1;
-const int ERROR = 2;
-const int FATAL = 3;
-const int NUM_SEVERITIES = 4;
-
-namespace internal {
-
-class LogMessage : public std::basic_ostringstream<char> {
-public:
-  LogMessage(const char* fname, int line, int severity);
-  ~LogMessage();
-
-protected:
-  /**
-   * @brief Print log message to stderr, files, etc.
-   */
-  void generateLogMessage();
-
-private:
-  const char* fname_;
-  int line_;
-  int severity_;
-};
-
-// LogMessageFatal ensures the process will exit in failure after
-// logging this message.
-class LogMessageFatal : public LogMessage {
-public:
-  LogMessageFatal(const char* file, int line) __attribute__((cold));
-  ~LogMessageFatal() __attribute__((noreturn));
-};
-
-#define _P_LOG_INFO \
-  ::paddle::internal::LogMessage(__FILE__, __LINE__, paddle::INFO)
-#define _P_LOG_WARNING \
-  ::paddle::internal::LogMessage(__FILE__, __LINE__, paddle::WARNING)
-#define _P_LOG_ERROR \
-  ::paddle::internal::LogMessage(__FILE__, __LINE__, paddle::ERROR)
-#define _P_LOG_FATAL ::paddle::internal::LogMessageFatal(__FILE__, __LINE__)
-
-#define P_LOG(severity) _P_LOG_##severity
-
-#define P_LOG_FIRST_N(severity, n)                                       \
-  static int UNIQUE_NAME(LOG_OCCURRENCES) = 0;                           \
-  if (UNIQUE_NAME(LOG_OCCURRENCES) <= n) ++UNIQUE_NAME(LOG_OCCURRENCES); \
-  if (UNIQUE_NAME(LOG_OCCURRENCES) <= n) P_LOG(severity)
-
-#define P_LOG_IF_EVERY_N(severity, condition, n)                              \
-  static int UNIQUE_NAME(LOG_OCCURRENCES) = 0;                                \
-  if (condition && ((UNIQUE_NAME(LOG_OCCURRENCES) =                           \
-                         (UNIQUE_NAME(LOG_OCCURRENCES) + 1) % n) == (1 % n))) \
-  P_LOG(severity)
-
-#define P_LOG_EVERY_N(severity, n) P_LOG_IF_EVERY_N(severity, true, n)
-
-// TODO(jeff): Define a proper implementation of VLOG_IS_ON
-#define P_VLOG_IS_ON(lvl) ((lvl) <= 0)
-
-#define P_LOG_IF(severity, condition) \
-  if (condition) P_LOG(severity)
-
-#define P_VLOG(lvl) P_LOG_IF(INFO, P_VLOG_IS_ON(lvl))
-
-#define P_VLOG_IF(lvl, cond) P_LOG_IF(INFO, P_VLOG_IS_ON(lvl) && cond)
-
-#define P_VLOG_EVERY_N(lvl, n) P_LOG_IF_EVERY_N(INFO, P_VLOG_IS_ON(lvl), n)
-
-#define PREDICT_FALSE(x) (__builtin_expect(x, 0))
-#define PREDICT_TRUE(x) (__builtin_expect(!!(x), 1))
-
-// CHECK dies with a fatal error if condition is not true.  It is *not*
-// controlled by NDEBUG, so the check will be executed regardless of
-// compilation mode.  Therefore, it is safe to do things like:
-//    CHECK(fp->Write(x) == 4)
-#define P_CHECK(condition)         \
-  if (PREDICT_FALSE(!(condition))) \
-  P_LOG(FATAL) << "Check failed: " #condition " "
-
-#define P_CHECK_EQ(val1, val2) P_CHECK((val1) == (val2))
-#define P_CHECK_NE(val1, val2) P_CHECK((val1) != (val2))
-#define P_CHECK_LE(val1, val2) P_CHECK((val1) <= (val2))
-#define P_CHECK_LT(val1, val2) P_CHECK((val1) < (val2))
-#define P_CHECK_GE(val1, val2) P_CHECK((val1) >= (val2))
-#define P_CHECK_GT(val1, val2) P_CHECK((val1) > (val2))
-#define P_CHECK_NOTNULL(val) P_CHECK((val) != NULL)
-
-//! GLOG compatible APIs
-//! NOTE: only implement Paddle actually used APIs.
-#define LOG(x) P_LOG(x)
-#define VLOG(x) P_VLOG(x)
-#define DLOG(x) P_VLOG(5)
-#define CHECK(x) P_CHECK(x)
-#define PCHECK(x) P_CHECK(x)
-#define CHECK_EQ(val1, val2) P_CHECK((val1) == (val2))
-#define CHECK_NE(val1, val2) P_CHECK((val1) != (val2))
-#define CHECK_LE(val1, val2) P_CHECK((val1) <= (val2))
-#define CHECK_LT(val1, val2) P_CHECK((val1) < (val2))
-#define CHECK_GE(val1, val2) P_CHECK((val1) >= (val2))
-#define CHECK_GT(val1, val2) P_CHECK((val1) > (val2))
-#define CHECK_NOTNULL(val) P_CHECK((val) != NULL)
-#define VLOG_IS_ON(x) P_VLOG_IS_ON(x)
-#define LOG_FIRST_N(severity, n) P_LOG_FIRST_N(severity, n)
-#define LOG_IF(severity, condition) P_LOG_IF(severity, condition)
-#define VLOG_EVERY_N(lvl, n) P_VLOG_EVERY_N(lvl, n)
-#define VLOG_IF(lvl, cond) P_VLOG_IF(lvl, cond)
-#define LOG_EVERY_N(severity, n) P_LOG_EVERY_N(severity, n)
-}  //  namespace internal
-
-/**
- * @brief initialize logging
- * @note: Current implement of logging is lack of:
- *          PrintCallStack when fatal.
- *          VLOG_IS_ON
- *        But it is portable to multi-platform, and simple enough to modify.
- */
 void initializeLogging(int argc, char** argv);
-namespace logging {
-/**
- * @brief Set Min Log Level. if Log.level < minLogLevel, then will not print log
- *        to stream
- * @param level. Any integer is OK, but only 0 <= x <= NUM_SEVERITIES is useful.
- */
-void setMinLogLevel(int level);
 
-/**
- * @brief Install Log(Fatal) failure function. Default is abort();
- * @param callback: The failure function.
- */
-void installFailureFunction(void (*callback)() ATTR_NORETURN);
-
-/**
- * @brief installFailureWriter
- * @note: not implemented currently.
- */
-inline void installFailureWriter(void (*callback)(const char*, int)) {
-  (void)(callback);  // unused callback.
-}
-}  //  namespace logging
-}  //  namespace paddle
-#else
-#include <glog/logging.h>
-namespace paddle {
-void initializeLogging(int argc, char** argv);
 namespace logging {
+
 void setMinLogLevel(int level);
+
 void installFailureFunction(void (*callback)());
+
 void installFailureWriter(void (*callback)(const char*, int));
-}  //  namespace logging
-}
-#endif  // PADDLE_USE_GLOG
+
+}  // namespace logging
+}  // namespace paddle
 
 #ifndef NDEBUG
 #define DEBUG_LEVEL 5
diff --git a/paddle/utils/PythonUtil.cpp b/paddle/utils/PythonUtil.cpp
index a9c6a20997e78c51b71707f6290ca52dbe3614db..7faeff55c28b9065179ad27b3b604a9f411249e5 100644
--- a/paddle/utils/PythonUtil.cpp
+++ b/paddle/utils/PythonUtil.cpp
@@ -13,15 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "PythonUtil.h"
-#include <sstream>
 #include <signal.h>
+#include <sstream>
 
 namespace paddle {
 
 #ifdef PADDLE_NO_PYTHON
 
-P_DEFINE_string(python_path, "", "python path");
-P_DEFINE_string(python_bin, "python2.7", "python bin");
+DEFINE_string(python_path, "", "python path");
+DEFINE_string(python_bin, "python2.7", "python bin");
 
 constexpr int kExecuteCMDBufLength = 204800;
 
diff --git a/paddle/utils/PythonUtil.h b/paddle/utils/PythonUtil.h
index 2cbc2fdd37cb46375ae37a4aa149f30440e38f0a..daebaffc855518425ae43942c22ec150d2e327f0 100644
--- a/paddle/utils/PythonUtil.h
+++ b/paddle/utils/PythonUtil.h
@@ -13,6 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
+// clang-format off
+#include "paddle/utils/Util.h"
 
 #ifndef PADDLE_NO_PYTHON
 // must include the following two blocks, otherwise,
@@ -33,13 +35,12 @@ limitations under the License. */
 #endif
 #include <Python.h>
 #include <frameobject.h>
-
 #endif
 
-#include "paddle/utils/Util.h"
 #include <stdarg.h>
-#include <mutex>
 #include <map>
+#include <mutex>
+// clang-format on
 
 namespace paddle {
 
diff --git a/paddle/utils/Queue.h b/paddle/utils/Queue.h
index 37748345a4b1036ca80c378368c1e858a001583d..f054738f87c02d2d749eec8d6c7bb55b506a6d91 100644
--- a/paddle/utils/Queue.h
+++ b/paddle/utils/Queue.h
@@ -142,9 +142,9 @@ public:
    */
   bool waitNotEmptyFor(int seconds) {
     std::unique_lock<std::mutex> lock(queueLock_);
-    return queueCV_.wait_for(lock,
-                             std::chrono::seconds(seconds),
-                             [this] { return numElements_ != 0; });
+    return queueCV_.wait_for(lock, std::chrono::seconds(seconds), [this] {
+      return numElements_ != 0;
+    });
   }
 
 private:
diff --git a/paddle/utils/Stat.cpp b/paddle/utils/Stat.cpp
index 01ea535cfd429daf3bc2e5906161fab42f8cd767..44acee249554e41f715314a3cd7eef29e3e6c5b0 100644
--- a/paddle/utils/Stat.cpp
+++ b/paddle/utils/Stat.cpp
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "Stat.h"
-#include "Util.h"
-#include <iomanip>
 #include <algorithm>
+#include <iomanip>
+#include "Util.h"
 
 namespace paddle {
 
@@ -207,10 +207,9 @@ static unsigned g_profileCount = 0;
 static std::recursive_mutex g_profileMutex;
 
 GpuProfiler::GpuProfiler(std::string statName, std::string info)
-  : guard_(g_profileMutex)  {
+    : guard_(g_profileMutex) {
   if (++g_profileCount == 1) {
-    LOG(INFO) << "Enable GPU Profiler Stat: ["
-              << statName << "] " << info;
+    LOG(INFO) << "Enable GPU Profiler Stat: [" << statName << "] " << info;
     hl_profiler_start();
   }
 }
diff --git a/paddle/utils/StringUtil.h b/paddle/utils/StringUtil.h
index 8a63ca23b4322673fe1102b819a7ce4765fd73a9..0b4f4c9113ae9d714b634b67931e51b408bbe777 100644
--- a/paddle/utils/StringUtil.h
+++ b/paddle/utils/StringUtil.h
@@ -14,9 +14,9 @@ limitations under the License. */
 
 #pragma once
 
+#include <sstream>
 #include <string>
 #include <vector>
-#include <sstream>
 #include "Logging.h"
 
 namespace paddle {
diff --git a/paddle/utils/Thread.h b/paddle/utils/Thread.h
index 435dff2f668e3efcfadee439045ad359f775b84f..ef36a8c5b2b0e95d759da8a781d781b71d067b7a 100644
--- a/paddle/utils/Thread.h
+++ b/paddle/utils/Thread.h
@@ -13,9 +13,9 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "Util.h"
-#include "Logging.h"
 #include <thread>
+#include "Logging.h"
+#include "Util.h"
 
 #include "Queue.h"
 #include "ThreadLocal.h"
diff --git a/paddle/utils/ThreadLocal.cpp b/paddle/utils/ThreadLocal.cpp
index c9b32784d9baba9e6c9275c75bf339cd2039e0af..75ccbd28cf21b7fafb43a072503dff14a29fec8a 100644
--- a/paddle/utils/ThreadLocal.cpp
+++ b/paddle/utils/ThreadLocal.cpp
@@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "Util.h"
 #include "ThreadLocal.h"
 #include "CommandLineParser.h"
+#include "Util.h"
 
-P_DEFINE_bool(thread_local_rand_use_global_seed,
-              false,
-              "Whether to use global seed in thread local rand.");
+DEFINE_bool(thread_local_rand_use_global_seed,
+            false,
+            "Whether to use global seed in thread local rand.");
 
 namespace paddle {
 
diff --git a/paddle/utils/ThreadLocal.h b/paddle/utils/ThreadLocal.h
index b6e31bd05bb82cbe9aa0156799137772b5115d2b..a4987c9ec261a2ee57e62d1640e2a21c7f804c99 100644
--- a/paddle/utils/ThreadLocal.h
+++ b/paddle/utils/ThreadLocal.h
@@ -15,14 +15,14 @@ limitations under the License. */
 #pragma once
 
 #include <pthread.h>
-#include <sys/types.h>
 #include <sys/syscall.h>
+#include <sys/types.h>
 #include <unistd.h>
 #include <map>
 #include <mutex>
 #include <random>
-#include "Util.h"
 #include "Logging.h"
+#include "Util.h"
 
 namespace paddle {
 
diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp
index f48726bff068080ce7f83d8dfdc67fcd73b4c669..7c0d66c488f5064641c53ea7995a75c330a3e49d 100644
--- a/paddle/utils/Util.cpp
+++ b/paddle/utils/Util.cpp
@@ -15,11 +15,11 @@ limitations under the License. */
 #include "Util.h"
 
 #include <dirent.h>
+#include <pmmintrin.h>
 #include <signal.h>
 #include <sys/stat.h>
 #include <sys/types.h>
 #include <xmmintrin.h>
-#include <pmmintrin.h>
 
 #include <fstream>
 #include <mutex>
@@ -28,12 +28,12 @@ limitations under the License. */
 
 #include "CommandLineParser.h"
 #include "CustomStackTrace.h"
+#include "StringUtil.h"
 #include "Thread.h"
 #include "ThreadLocal.h"
 #include "Version.h"
-#include "StringUtil.h"
 
-P_DEFINE_int32(seed, 1, "random number seed. 0 for srand(time)");
+DEFINE_int32(seed, 1, "random number seed. 0 for srand(time)");
 
 #ifdef WITH_GOOGLE_PERFTOOLS
 /*
@@ -52,10 +52,8 @@ P_DEFINE_int32(seed, 1, "random number seed. 0 for srand(time)");
 
 #include <gperftools/profiler.h>
 
-P_DEFINE_int32(profile_signal, 12, "signal for switch google profiler");
-P_DEFINE_string(profile_data_file,
-                "gperf.prof",
-                "file for storing profile data");
+DEFINE_int32(profile_signal, 12, "signal for switch google profiler");
+DEFINE_string(profile_data_file, "gperf.prof", "file for storing profile data");
 
 static void profilerSwitch(int signalNumber) {
   bool static started = false;
@@ -126,25 +124,23 @@ void registerInitFunction(std::function<void()> func, int priority) {
 }
 
 void runInitFunctions() {
-  std::call_once(
-      g_onceFlag,
-      []() {
-        LOG(INFO) << "Calling runInitFunctions";
-        if (g_initFuncs) {
-          std::sort(g_initFuncs->begin(),
-                    g_initFuncs->end(),
-                    [](const PriorityFuncPair& x, const PriorityFuncPair& y) {
-                      return x.first > y.first;
-                    });
-          for (auto& f : *g_initFuncs) {
-            f.second();
-          }
-          delete g_initFuncs;
-          g_initFuncs = nullptr;
-        }
-        g_initialized = true;
-        LOG(INFO) << "Call runInitFunctions done.";
-      });
+  std::call_once(g_onceFlag, []() {
+    LOG(INFO) << "Calling runInitFunctions";
+    if (g_initFuncs) {
+      std::sort(g_initFuncs->begin(),
+                g_initFuncs->end(),
+                [](const PriorityFuncPair& x, const PriorityFuncPair& y) {
+                  return x.first > y.first;
+                });
+      for (auto& f : *g_initFuncs) {
+        f.second();
+      }
+      delete g_initFuncs;
+      g_initFuncs = nullptr;
+    }
+    g_initialized = true;
+    LOG(INFO) << "Call runInitFunctions done.";
+  });
 }
 
 void initMain(int argc, char** argv) {
diff --git a/paddle/utils/Util.h b/paddle/utils/Util.h
index ff67439da6d80d0c40043e0a7fea0cdf0a19acc9..24ddde28e7e9f44c32d70e1b9621954ee77b2883 100644
--- a/paddle/utils/Util.h
+++ b/paddle/utils/Util.h
@@ -14,25 +14,25 @@ limitations under the License. */
 
 #pragma once
 
+#include <sys/syscall.h>  // for syscall()
+#include <sys/types.h>
 #include <algorithm>
 #include <cmath>
-#include <string>
-#include <vector>
+#include <functional>
 #include <memory>
+#include <mutex>
+#include <string>
 #include <thread>
 #include <unordered_map>
-#include <mutex>
-#include <functional>
-#include <sys/syscall.h>  // for syscall()
-#include <sys/types.h>
+#include <vector>
 
 #include "CommandLineParser.h"
+#include "DisableCopy.h"
 #include "Logging.h"
 #include "TrainerConfig.pb.h"
-#include "DisableCopy.h"
 
-#include "TypeDefs.h"
 #include "Flags.h"
+#include "TypeDefs.h"
 #include "hl_gpu.h"
 
 /**
diff --git a/paddle/utils/Version.cpp b/paddle/utils/Version.cpp
index 086515791d8870f2c5ee3ab2dcccfcbc178c4b61..731c30842118bce59ce45297d9c8f47fa0a69d69 100644
--- a/paddle/utils/Version.cpp
+++ b/paddle/utils/Version.cpp
@@ -14,17 +14,12 @@ limitations under the License. */
 
 #include "Version.h"
 
-#include "Flags.h"
-#include "Util.h"
 #include <iomanip>
 #include <numeric>
-//! TODO(yuyang18) in gflags, version has another define. Use another flag
-//! instead.
-#ifndef PADDLE_USE_GFLAGS
-P_DEFINE_bool(version, false, "print version");
-#else
-P_DECLARE_bool(version);
-#endif
+#include "Flags.h"
+#include "Util.h"
+
+DECLARE_bool(version);
 
 namespace paddle {
 namespace version {
@@ -33,7 +28,8 @@ void printVersion(std::ostream& os) {
 #ifndef PADDLE_VERSION
 #define PADDLE_VERSION "unknown"
 #endif
-// converts macro to string https://gcc.gnu.org/onlinedocs/cpp/Stringification.html
+// converts macro to string
+// https://gcc.gnu.org/onlinedocs/cpp/Stringification.html
 #define xstr(s) str(s)
 #define str(s) #s
 
diff --git a/paddle/utils/Version.h b/paddle/utils/Version.h
index ac04963c2ce2cb2083431a56b2b3c2301e568e1a..d1a07d9485076e5382d47f7408fcbf032166b1ed 100644
--- a/paddle/utils/Version.h
+++ b/paddle/utils/Version.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 #include <stddef.h>
-#include "TypeDefs.h"
 #include <iostream>
+#include "TypeDefs.h"
 
 namespace paddle {
 
diff --git a/paddle/utils/arch/osx/Locks.cpp b/paddle/utils/arch/osx/Locks.cpp
index 85902264314c6a4dc6f2c4ddb86bf4923627ee56..e03992363fd6051a1970664d63406b2e7a47fce3 100644
--- a/paddle/utils/arch/osx/Locks.cpp
+++ b/paddle/utils/arch/osx/Locks.cpp
@@ -13,10 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/utils/Locks.h"
-#include "paddle/utils/Logging.h"
 #include <dispatch/dispatch.h>
-#include <atomic>
 #include <libkern/OSAtomic.h>
+#include <atomic>
+#include "paddle/utils/Logging.h"
 
 namespace paddle {
 
diff --git a/paddle/utils/tests/CMakeLists.txt b/paddle/utils/tests/CMakeLists.txt
index 298ede5cd6451c9b03219dff72f6e81c374f8ef1..26fafbd1ab3f2967b765b8bcb973fb745c0e6422 100644
--- a/paddle/utils/tests/CMakeLists.txt
+++ b/paddle/utils/tests/CMakeLists.txt
@@ -1,5 +1,3 @@
-add_simple_unittest(test_CommandLineParser)
-add_simple_unittest(test_Logging)
 add_simple_unittest(test_Thread)
 add_simple_unittest(test_StringUtils)
 add_simple_unittest(test_CustomStackTrace)
diff --git a/paddle/utils/tests/test_CommandLineParser.cpp b/paddle/utils/tests/test_CommandLineParser.cpp
deleted file mode 100644
index 9a1d2391a8b47814c772bdf86e57ea440c11713b..0000000000000000000000000000000000000000
--- a/paddle/utils/tests/test_CommandLineParser.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#ifndef PADDLE_USE_GFLAGS
-//! Test Command Line Parser for paddle internal implement.
-
-#include <paddle/utils/CommandLineParser.h>
-#include <gtest/gtest.h>
-
-P_DEFINE_int32(i1, 1, "test int flag 1");
-P_DEFINE_int32(i2, 2, "test int flag 2");
-
-P_DEFINE_string(str1, "1", "test str flag 1");
-P_DEFINE_string(str2, "2", "test str flag 2");
-
-P_DEFINE_bool(b1, true, "test bool flag 1");
-P_DEFINE_bool(b2, false, "test bool flag 2");
-
-P_DEFINE_double(d1, 0.1, "test double flag 1");
-P_DEFINE_double(d2, -42.3, "test double flag 2");
-
-P_DEFINE_int64(l1, 1, "test int64 flag 1");
-P_DEFINE_int64(l2, 2, "test int64 flag 2");
-
-P_DEFINE_uint64(ul1, 32, "test uint64 flag 1");
-P_DEFINE_uint64(ul2, 33, "test uint64 flag 2");
-
-constexpr double EPSILON = 1e-5;
-
-#define cc(x) const_cast<char*>((x))
-
-TEST(CommandLineParser, defaultValue) {
-  char* argv[] = {cc("test_program"), cc("--unused_flag=134")};
-  int argc = sizeof(argv) / sizeof(char*);
-
-  paddle::ParseCommandLineFlags(&argc, argv);
-
-  // Check Default Value
-  ASSERT_EQ(argc, 2);
-  ASSERT_EQ(FLAGS_i1, 1);
-  ASSERT_EQ(FLAGS_i2, 2);
-  ASSERT_EQ(FLAGS_str1, "1");
-  ASSERT_EQ(FLAGS_str2, "2");
-  ASSERT_EQ(FLAGS_b1, true);
-  ASSERT_EQ(FLAGS_b2, false);
-  ASSERT_NEAR(FLAGS_d1, 0.1, EPSILON);
-  ASSERT_NEAR(FLAGS_d2, -42.3, EPSILON);
-  ASSERT_EQ(FLAGS_i1, 1);
-  ASSERT_EQ(FLAGS_i2, 2);
-  ASSERT_EQ(FLAGS_ul1, 32UL);
-  ASSERT_EQ(FLAGS_ul2, 33UL);
-}
-
-TEST(CommandLineParser, normal) {
-  char* argv[] = {cc("test_program"),
-                  cc("--i2=32"),
-                  cc("--str1=abc"),
-                  cc("--b2=1"),
-                  cc("-b1=False"),
-                  cc("--d2=.34"),
-                  cc("--d1=0"),
-                  cc("--l1=-12345678901234"),
-                  cc("-ul2=3212")};
-  int argc = sizeof(argv) / sizeof(char*);
-  paddle::ParseCommandLineFlags(&argc, argv);
-  ASSERT_EQ(argc, 1);
-  ASSERT_EQ(FLAGS_i2, 32);
-  ASSERT_EQ(FLAGS_str1, "abc");
-  ASSERT_EQ(FLAGS_b2, true);
-  ASSERT_EQ(FLAGS_b1, false);
-  ASSERT_NEAR(FLAGS_d2, 0.34, EPSILON);
-  ASSERT_NEAR(FLAGS_d1, 0.0, EPSILON);
-  ASSERT_EQ(FLAGS_l1, -12345678901234);
-  ASSERT_EQ(FLAGS_ul2, 3212UL);
-}
-
-TEST(CommandLineParser, printHelp) {
-  char* argv[] = {cc("test_program"), cc("--help")};
-  int argc = sizeof(argv) / sizeof(char*);
-
-  // Will Print Usage
-  ASSERT_DEATH(paddle::ParseCommandLineFlags(&argc, argv), ".*test_program.*");
-}
-
-TEST(CommandLineParser, parseError) {
-  char* argv[] = {cc("test_program"), cc("--i1=abc")};
-
-  int argc = sizeof(argv) / sizeof(char*);
-  ASSERT_DEATH(
-      paddle::ParseCommandLineFlags(&argc, argv),
-      "Parse command flag i1 error! User input is --i1=abc.*test_program.*");
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
-
-#else
-
-int main(int argc, char** argv) { return 0; }
-
-#endif
diff --git a/paddle/utils/tests/test_CustomStackTrace.cpp b/paddle/utils/tests/test_CustomStackTrace.cpp
index 512330b49e9d31ef5a9334b2371d2fe65e9b6fb1..2ce199837601755ac018889c07c223ad34c4a45b 100644
--- a/paddle/utils/tests/test_CustomStackTrace.cpp
+++ b/paddle/utils/tests/test_CustomStackTrace.cpp
@@ -15,12 +15,12 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <chrono>
 
-#include "paddle/utils/CustomStackTrace.h"
 #include "paddle/utils/CommandLineParser.h"
-#include "paddle/utils/Util.h"
+#include "paddle/utils/CustomStackTrace.h"
 #include "paddle/utils/Locks.h"
+#include "paddle/utils/Util.h"
 
-P_DEFINE_int32(test_thread_num, 10, "testing thread number");
+DEFINE_int32(test_thread_num, 10, "testing thread number");
 
 void testNormalImpl(
     const std::function<void(paddle::CustomStackTrace<std::string>&,
diff --git a/paddle/utils/tests/test_CustomStackTracePrint.cpp b/paddle/utils/tests/test_CustomStackTracePrint.cpp
index 60ba210b700581cd239945e2431431a4977a9376..611b16aa7116d03ee51ba0095d043b78df1742ba 100644
--- a/paddle/utils/tests/test_CustomStackTracePrint.cpp
+++ b/paddle/utils/tests/test_CustomStackTracePrint.cpp
@@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/utils/Util.h"
 #include "paddle/utils/CustomStackTrace.h"
+#include "paddle/utils/Util.h"
 
 int main(int argc, char** argv) {
   paddle::initMain(argc, argv);
diff --git a/paddle/utils/tests/test_Logging.cpp b/paddle/utils/tests/test_Logging.cpp
deleted file mode 100644
index 667864aa758373caa82a8c66048709d228783029..0000000000000000000000000000000000000000
--- a/paddle/utils/tests/test_Logging.cpp
+++ /dev/null
@@ -1,162 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-/*
- * Basically from tensorflow/core/platform/default/logging.cc
- * Used in embedded system where there is no glogs.
- */
-
-#include <gtest/gtest.h>
-#include <fstream>
-#include <stdlib.h>
-#include <dirent.h>
-#include "paddle/utils/Logging.h"
-#include "paddle/utils/Util.h"
-#ifndef PADDLE_USE_GLOG
-TEST(Logging, BasicalLog) {
-  auto pinfo = [] {
-    P_LOG(INFO) << "INFO";
-    exit(1);
-  };
-  ASSERT_DEATH(pinfo(), "I .*test_Logging.cpp:[0-9]+] INFO");
-
-  auto pwarn = [] {
-    P_LOG(WARNING) << "WARN";
-    exit(1);
-  };
-  ASSERT_DEATH(pwarn(), "W .*test_Logging.cpp:[0-9]+] WARN");
-
-  auto perr = [] {
-    P_LOG(ERROR) << "ERROR";
-    exit(1);
-  };
-  ASSERT_DEATH(perr(), "E .*test_Logging.cpp:[0-9]+] ERROR");
-
-  auto pfatal = [] { P_LOG(FATAL) << "FATAL"; };
-  ASSERT_DEATH(pfatal(), "F .*test_Logging.cpp:[0-9]+] FATAL");
-}
-
-TEST(Logging, Check) {
-  int a = 1;
-  int b = 2;
-  P_CHECK(a != b);
-
-  auto pcheckDown = [&] { P_CHECK(a == b); };
-  ASSERT_DEATH(pcheckDown(),
-               "F .*test_Logging.cpp:[0-9]+] Check failed: a == b ");
-
-  P_CHECK_LE(a, b);
-  P_CHECK_LT(a, b);
-  double t = 1.2;
-  P_CHECK_LE(a, t);
-  double* ptr = nullptr;
-
-  auto pcheckDown2 = [&] { P_CHECK_NOTNULL(ptr); };
-  ASSERT_DEATH(pcheckDown2(), "F");
-}
-
-#define cc(x) const_cast<char*>(x)
-
-TEST(Logging, LogToStderr) {
-  auto logToStderrCallback = [] {
-    setenv("PLOG_LOGTOSTDERR", "0", true);
-    char* argv[] = {cc("test")};
-    paddle::initializeLogging(1, argv);
-    P_LOG(INFO) << "This output will not print to std error";
-    exit(1);
-  };
-
-  ASSERT_DEATH(logToStderrCallback(), "");
-}
-
-constexpr char kLogDirName[] = "./test_log_dir";
-const std::vector<std::string> kLevels = {"INFO", "WARNING", "ERROR", "FATAL"};
-
-TEST(Logging, LogToDir) {
-  ASSERT_EQ(0, mkdir(kLogDirName, 0777));
-  auto logToDirCallback = [] {
-    setenv("PLOG_LOGTOSTDERR", "0", true);
-    setenv("PLOG_LOGDIR", kLogDirName, true);
-    char* argv[] = {cc("test")};
-    paddle::initializeLogging(1, argv);
-
-    P_LOG(INFO) << "INFO";
-    P_LOG(WARNING) << "WARNING";
-    P_LOG(ERROR) << "ERROR";
-    P_LOG(FATAL) << "FATAL";
-  };
-  ASSERT_DEATH(logToDirCallback(), "");
-
-  // There 4 file in logdir
-  auto dir = opendir(kLogDirName);
-  size_t fileCount = 0;
-  std::vector<std::string> filenames;
-  for (auto dirContent = readdir(dir); dirContent != nullptr;
-       dirContent = readdir(dir)) {
-    std::string filename(dirContent->d_name);
-    if (filename == "." || filename == "..") {
-      continue;
-    } else {
-      ++fileCount;
-      for (size_t i = 0; i < kLevels.size(); ++i) {
-        const std::string& curLevel = kLevels[i];
-        if (filename.size() > curLevel.length()) {
-          size_t diff = filename.size() - curLevel.length();
-          size_t j = 0;
-          for (; j < curLevel.length(); ++j) {
-            if (filename[j + diff] != curLevel[j]) {
-              // File Suffix Not Same, then break.
-              break;
-            }
-          }
-          if (j == curLevel.length()) {  // Same suffix.
-            std::ifstream fin;
-            auto fn = paddle::path::join(kLogDirName, filename);
-            fin.open(fn);
-            filenames.push_back(fn);
-            ASSERT_TRUE(fin.is_open());
-            size_t lineCounter = 0;
-            for (std::string line; std::getline(fin, line); ++lineCounter) {
-              // Do Nothing, Just calc lineCounter.
-            }
-
-            // For example.
-            // The info channel will have all log which level >= INFO
-            // So the info file's lineCounter should == 4.
-            ASSERT_EQ(kLevels.size() - i, lineCounter);
-            fin.close();
-          }
-        }
-      }
-    }
-  }
-  closedir(dir);
-  ASSERT_EQ(4UL, fileCount);  // 4 levels.
-  // Clean Unittest.
-  for (std::string& fn : filenames) {
-    ASSERT_EQ(remove(fn.c_str()), 0);
-  }
-  ASSERT_EQ(rmdir(kLogDirName), 0);
-}
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
-
-#else
-
-int main(int, char**) { return 0; }
-
-#endif
diff --git a/paddle/utils/tests/test_SIMDFlags.cpp b/paddle/utils/tests/test_SIMDFlags.cpp
index a544901aa388a0d0a59ed0116ae62b8387a8829b..42edede209ad957c13c1cec8e6bb20bd0fe9d28b 100644
--- a/paddle/utils/tests/test_SIMDFlags.cpp
+++ b/paddle/utils/tests/test_SIMDFlags.cpp
@@ -9,44 +9,43 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-
 #include <gtest/gtest.h>
 
 #include "paddle/utils/CpuId.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Util.h"
 
-using namespace paddle; // NOLINT
+using namespace paddle;  // NOLINT
 
 TEST(SIMDFlags, gccTest) {
 #if (defined(__GNUC__) || defined(__GNUG__)) && !(defined(__clang__))
-    CHECK(!__builtin_cpu_supports("sse")   != HAS_SSE);
-    CHECK(!__builtin_cpu_supports("sse2")  != HAS_SSE2);
-    CHECK(!__builtin_cpu_supports("sse3")  != HAS_SSE3);
-    CHECK(!__builtin_cpu_supports("ssse3") != HAS_SSSE3);
-    CHECK(!__builtin_cpu_supports("sse4.1")!= HAS_SSE41);
-    CHECK(!__builtin_cpu_supports("sse4.2")!= HAS_SSE42);
-    CHECK(!__builtin_cpu_supports("avx")   != HAS_AVX);
-    CHECK(!__builtin_cpu_supports("avx2")  != HAS_AVX2);
+  // clang-format off
+  CHECK(!__builtin_cpu_supports("sse")    != HAS_SSE);
+  CHECK(!__builtin_cpu_supports("sse2")   != HAS_SSE2);
+  CHECK(!__builtin_cpu_supports("sse3")   != HAS_SSE3);
+  CHECK(!__builtin_cpu_supports("ssse3")  != HAS_SSSE3);
+  CHECK(!__builtin_cpu_supports("sse4.1") != HAS_SSE41);
+  CHECK(!__builtin_cpu_supports("sse4.2") != HAS_SSE42);
+  CHECK(!__builtin_cpu_supports("avx")    != HAS_AVX);
+  CHECK(!__builtin_cpu_supports("avx2")   != HAS_AVX2);
+// clang-format on
 #endif
 }
 
 TEST(SIMDFlags, normalPrint) {
-    auto simd = SIMDFlags::instance();
-    LOG(INFO) << "Has SSE2:    " << std::boolalpha << simd->isSSE2();
-    LOG(INFO) << "Has SSE3:    " << std::boolalpha << simd->isSSE3();
-    LOG(INFO) << "Has SSSE3:   " << std::boolalpha << simd->isSSSE3();
-    LOG(INFO) << "Has SSE4.1:  " << std::boolalpha << simd->isSSE41();
-    LOG(INFO) << "Has SSE4.2:  " << std::boolalpha << simd->isSSE42();
-    LOG(INFO) << "Has FMA3:    " << std::boolalpha << simd->isFMA3();
-    LOG(INFO) << "Has FMA4:    " << std::boolalpha << simd->isFMA4();
-    LOG(INFO) << "Has AVX:     " << std::boolalpha << simd->isAVX();
-    LOG(INFO) << "Has AVX2:    " << std::boolalpha << simd->isAVX2();
-    LOG(INFO) << "Has AVX512:  " << std::boolalpha << simd->isAVX512();
+  LOG(INFO) << "Has SSE:     " << std::boolalpha << HAS_SSE;
+  LOG(INFO) << "Has SSE2:    " << std::boolalpha << HAS_SSE2;
+  LOG(INFO) << "Has SSE3:    " << std::boolalpha << HAS_SSE3;
+  LOG(INFO) << "Has SSSE3:   " << std::boolalpha << HAS_SSSE3;
+  LOG(INFO) << "Has SSE4:    " << std::boolalpha << HAS_SSE41 || HAS_SSE42;
+  LOG(INFO) << "Has FMA3:    " << std::boolalpha << HAS_FMA3;
+  LOG(INFO) << "Has FMA4:    " << std::boolalpha << HAS_FMA4;
+  LOG(INFO) << "Has AVX:     " << std::boolalpha << HAS_AVX;
+  LOG(INFO) << "Has AVX2:    " << std::boolalpha << HAS_AVX2;
+  LOG(INFO) << "Has AVX512:  " << std::boolalpha << HAS_AVX512;
 }
 
 int main(int argc, char** argv) {
   testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
   return RUN_ALL_TESTS();
 }
diff --git a/paddle/utils/tests/test_SpinLock.cpp b/paddle/utils/tests/test_SpinLock.cpp
index 9c7ad05b0b6f7e6cf98717a2b317e7d242c22d00..8351e7e3acd1afe1c6507ffced32f27ce065e5ce 100644
--- a/paddle/utils/tests/test_SpinLock.cpp
+++ b/paddle/utils/tests/test_SpinLock.cpp
@@ -14,12 +14,12 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 #include <vector>
-#include "paddle/utils/Logging.h"
 #include "paddle/utils/CommandLineParser.h"
-#include "paddle/utils/Util.h"
 #include "paddle/utils/Locks.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Util.h"
 
-P_DEFINE_int32(test_thread_num, 100, "testing thread number");
+DEFINE_int32(test_thread_num, 100, "testing thread number");
 
 void testNormalImpl(
     size_t thread_num,
diff --git a/paddle/utils/tests/test_Thread.cpp b/paddle/utils/tests/test_Thread.cpp
index b069be1d7a28847a4aaefbdcef985f6b9100f8ab..2f5c5bbce07f39b799b928fd231bb4db1d2b3e05 100644
--- a/paddle/utils/tests/test_Thread.cpp
+++ b/paddle/utils/tests/test_Thread.cpp
@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <atomic>
-#include <paddle/utils/Thread.h>
 #include <gtest/gtest.h>
+#include <paddle/utils/Thread.h>
+#include <atomic>
 
 using paddle::AsyncThreadPool;  // NOLINT
 
@@ -52,17 +52,13 @@ TEST(AsyncThreadPool, multiThreadAddBatchJob) {
   int counter = 0;
   const int numMonitors = 300;
   const int numSlaves = 300;
-  std::vector<AsyncThreadPool::JobFunc> moniterJobs(
-      numMonitors,
-      [&] {
-        std::vector<AsyncThreadPool::JobFunc> slaveJobs(
-            numSlaves,
-            [mut, &counter] {
-              std::lock_guard<std::mutex> lk(*mut);
-              counter++;
-            });
-        levelTwoPool.addBatchJobs(slaveJobs);
-      });
+  std::vector<AsyncThreadPool::JobFunc> moniterJobs(numMonitors, [&] {
+    std::vector<AsyncThreadPool::JobFunc> slaveJobs(numSlaves, [mut, &counter] {
+      std::lock_guard<std::mutex> lk(*mut);
+      counter++;
+    });
+    levelTwoPool.addBatchJobs(slaveJobs);
+  });
   levelOnePool.addBatchJobs(moniterJobs);
   ASSERT_EQ(counter, numMonitors * numSlaves);
 }
diff --git a/paddle/utils/tests/test_ThreadBarrier.cpp b/paddle/utils/tests/test_ThreadBarrier.cpp
index 997a393683cac7ad8e9aeceef9a74aba6c6fdf6b..60c2214ffd1066ed4f7b95cd63dfe6a24fe66d67 100644
--- a/paddle/utils/tests/test_ThreadBarrier.cpp
+++ b/paddle/utils/tests/test_ThreadBarrier.cpp
@@ -15,12 +15,12 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <set>
 #include <vector>
-#include "paddle/utils/Logging.h"
 #include "paddle/utils/CommandLineParser.h"
-#include "paddle/utils/Util.h"
 #include "paddle/utils/Locks.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Util.h"
 
-P_DEFINE_int32(test_thread_num, 100, "testing thread number");
+DEFINE_int32(test_thread_num, 100, "testing thread number");
 
 void testNormalImpl(
     size_t thread_num,
diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt
index d7f523bc8d9bce00ba72c41284d2b3eb3cde6529..2c40070eca44d8656d7ce82157a1b840092b9965 100644
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@@ -6,25 +6,6 @@ set(proto_filenames
     ParameterService.proto
     TrainerConfig.proto)
 
-set(real_proto_files)
-
-# TODO(yuyang18): Some internal proto will also be depended on.
-#                 Find a way to automatically calculate all depends.
-foreach(filename ${proto_filenames})
-    set(PROTOBUF_3_FLAGS "")
-    if (PROTOBUF_3)
-        set(PROTOBUF_3_FLAGS "-Dproto3")
-    endif()
-    add_custom_command(OUTPUT ${filename}
-	COMMAND ${M4_EXECUTABLE} -Dreal=${ACCURACY} ${PROTOBUF_3_FLAGS} -I '${INTERNAL_PROTO_PATH}'
-              ${PROJ_ROOT}/proto/${filename}.m4 > ${filename}
-        DEPENDS ${PROJ_ROOT}/proto/${filename}.m4
-        COMMENT "Generate ${filename}")
-endforeach()
-
-add_custom_target(proto_accuracy ALL
-                    DEPENDS ${proto_filenames})
-
 set(PROTO_GEN)
 set(PROTO_GEN_PY)
 
@@ -39,9 +20,8 @@ foreach(filename ${proto_filenames})
     add_custom_command(OUTPUT ${CUR_PROTO_GEN}
         COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} 
                   --cpp_out ${CMAKE_CURRENT_BINARY_DIR}
-                  --proto_path ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/${filename}
-        DEPENDS proto_accuracy
-                ${PROJ_ROOT}/proto/${filename}.m4)
+		  --proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
+        DEPENDS ${filename})
 
     set(CUR_PROTO_GEN_PY
         ${PROJ_ROOT}/paddle/python/paddle/proto/${base_filename}_pb2.py)
@@ -50,9 +30,8 @@ foreach(filename ${proto_filenames})
         ${PROTO_GEN_PY})
     add_custom_command(OUTPUT ${CUR_PROTO_GEN_PY}
         COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${PROJ_ROOT}/python/paddle/proto
-                  --proto_path ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_BINARY_DIR}/${filename}
-        DEPENDS proto_accuracy
-                ${PROJ_ROOT}/proto/${filename}.m4)
+	--proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
+        DEPENDS ${filename})
 endforeach()
 
 include_directories(${CMAKE_CURRENT_BINARY_DIR}/proto)
@@ -61,5 +40,4 @@ add_custom_target(gen_proto_cpp ALL DEPENDS ${PROTO_GEN})
 add_custom_target(gen_proto_py ALL DEPENDS ${PROTO_GEN_PY})
 add_library(paddle_proto STATIC
     ${PROTO_GEN})
-add_dependencies(paddle_proto proto_accuracy)
 target_include_directories(paddle_proto PUBLIC ${CMAKE_CURRENT_BINARY_DIR})
diff --git a/proto/DataConfig.proto.m4 b/proto/DataConfig.proto
similarity index 93%
rename from proto/DataConfig.proto.m4
rename to proto/DataConfig.proto
index 1f8e3f4f3e523447b69bfd2dbce9c99dc22571d1..e895c184d9f95dba1449e6467a2566712837600b 100644
--- a/proto/DataConfig.proto.m4
+++ b/proto/DataConfig.proto
@@ -11,11 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-ifdef(`proto3', `syntax = "proto2";')
+syntax = "proto2";
 
 package paddle;
 
-sinclude(`DataConfigExt.proto.m4')
+
 message FileGroupConf {
   optional uint32 queue_capacity = 1 [default = 1];
   // how many files to load for a load file thread
@@ -26,7 +26,7 @@ message FileGroupConf {
 };
 
 message DataConfig {
-sinclude(`DataConfigInter.proto.m4')
+
   required string type = 1;
 
   // name of a text file which contains a list of file names at each line
@@ -51,11 +51,11 @@ sinclude(`DataConfigInter.proto.m4')
 
   /// Note the field number 17, 18 and 19 have been deprecated.
 
-  // a list of values which will be used to create additional one dimensional real
+  // a list of values which will be used to create additional one dimensional float
   // values slots. These one dimensional slots can be used as the weight input
   // for cost layers.
   // Currently this is only supported by ProtoDataProvider.
-  repeated real constant_slots = 20;
+  repeated double constant_slots = 20;
 
   // for PyDataProvider.
   // Specify the load data script module name, object name and user args
@@ -80,6 +80,6 @@ sinclude(`DataConfigInter.proto.m4')
   optional bool is_main_data = 26 [default = true];
 
   // the usage ratio of instances. Setting to 1.0 means the use of all instances.
-  optional real usage_ratio = 27 [default = 1.0];
+  optional double usage_ratio = 27 [default = 1.0];
 };
 
diff --git a/proto/DataFormat.proto.m4 b/proto/DataFormat.proto
similarity index 98%
rename from proto/DataFormat.proto.m4
rename to proto/DataFormat.proto
index 54e9fd008e485d24c21c58d543be6b311378905b..19b1499b0281a1b92028cc8944c27ee4d56b8dd2 100644
--- a/proto/DataFormat.proto.m4
+++ b/proto/DataFormat.proto
@@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-ifdef(`proto3', `syntax = "proto2";')
+syntax = "proto2";
 
 package paddle;
 
diff --git a/proto/ModelConfig.proto.m4 b/proto/ModelConfig.proto
similarity index 94%
rename from proto/ModelConfig.proto.m4
rename to proto/ModelConfig.proto
index ccad69a3c2209542d2be855ddf3f75def9e8d729..552af71e76e5adf27f35bb5ad6fd8a69c71df0f1 100644
--- a/proto/ModelConfig.proto.m4
+++ b/proto/ModelConfig.proto
@@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-ifdef(`proto3', `syntax = "proto2";')
+syntax = "proto2";
 
 import "ParameterConfig.proto";
 
@@ -20,7 +20,7 @@ package paddle;
 /**
  * Various structs for the configuration of a neural network
  */
-sinclude(`ModelConfigExt.proto.m4')
+
 
 message ExternalConfig {
   repeated string layer_names = 1;
@@ -146,8 +146,8 @@ message NormConfig {
 
   // the parameters for normalization
   // u = u / (1+scale*sum(u^2 in window))^pow
-  required real scale = 4;
-  required real pow = 5;
+  required double scale = 4;
+  required double pow = 5;
 
   // The size of output feature map.
   required uint32 output_x = 6;
@@ -223,7 +223,7 @@ message OperatorConfig {
   required uint64 output_size = 4;
 
   // For DotMulOperator
-  optional real dotmul_scale = 5 [default = 1.0];
+  optional double dotmul_scale = 5 [default = 1.0];
 
   // For ConvOperator
   optional ConvConfig conv_conf = 6;
@@ -245,7 +245,7 @@ message ImageConfig {
 
   // The size of input feature map.
   required uint32 img_size = 8;
-  required uint32 img_size_y = 9;
+  optional uint32 img_size_y = 9;
 }
 
 message LayerInputConfig {
@@ -266,7 +266,7 @@ message LayerInputConfig {
 }
 
 message LayerConfig {
-sinclude(`ModelConfigLayer.proto.m4')
+
   required string name = 1;
   required string type = 2;
   optional uint64 size = 3;
@@ -293,7 +293,7 @@ sinclude(`ModelConfigLayer.proto.m4')
   optional uint32 partial_sum = 9;
 
   // for dropout
-  optional real drop_rate = 10;
+  optional double drop_rate = 10;
 
   // for HierarchicalSoftmaxLayer and NCELayer
   // the number of classes
@@ -317,17 +317,17 @@ sinclude(`ModelConfigLayer.proto.m4')
   // For NCELayer
   // The distribution for generating the random negative labels.
   // A uniform distribution will be used if not provided
-  repeated real neg_sampling_dist = 17 [packed = true];
+  repeated double neg_sampling_dist = 17 [packed = true];
 
   // For MaxLayer
   // default: output VALUE of MaxLayer. set this flag to true for output INDEX
-  // INDEX will be put in Argument::value as real values.
+  // INDEX will be put in Argument::value as double values.
   optional bool output_max_index = 19 [default = false];
 
   /// The filed number 20 have been deprecated.
 
   // For self-normalized estimation
-  optional real softmax_selfnorm_alpha = 21 [default = 0.1];
+  optional double softmax_selfnorm_alpha = 21 [default = 0.1];
 
   /// The filed numbers 22 and 23 have been deprecated.
 
@@ -338,14 +338,14 @@ sinclude(`ModelConfigLayer.proto.m4')
   optional bool norm_by_times = 25;
 
   // for CostLayers
-  optional real coeff = 26 [default = 1.0];
+  optional double coeff = 26 [default = 1.0];
 
   // for AverageLayer
   // can be set to: 'average', 'sum' or 'squarerootn'
   optional string average_strategy = 27;
 
   // for error clipping
-  optional real error_clipping_threshold = 28 [default = 0.0];
+  optional double error_clipping_threshold = 28 [default = 0.0];
 
   // for operators used by mixed layer
   repeated OperatorConfig operator_confs = 29;
@@ -355,11 +355,11 @@ sinclude(`ModelConfigLayer.proto.m4')
   optional int32 max_sort_size = 31;
 
   // for SlopeInterceptLayer
-  optional real slope = 32;
-  optional real intercept = 33;
+  optional double slope = 32;
+  optional double intercept = 33;
 
   // for CosSimVecMatLayer and CosSimLayer
-  optional real cos_scale = 34;
+  optional double cos_scale = 34;
 
   // for DataNormLayer
   // can be set to: 'z-score', 'min-max' or 'decimal-scaling'
@@ -394,7 +394,7 @@ sinclude(`ModelConfigLayer.proto.m4')
   // if number of the selected columns is less than
   // sample number * selective_fc output size * selective_fc_mull_mull_ratio
   // sparse multiplication is used, otherwise, using full multiplication.
-  optional real selective_fc_full_mul_ratio = 44 [default = 0.02];
+  optional double selective_fc_full_mul_ratio = 44 [default = 0.02];
 
   // to indicate how many threads selective_fc use to to accelate
   // the plain_mul period
@@ -406,7 +406,7 @@ sinclude(`ModelConfigLayer.proto.m4')
   optional bool use_global_stats = 46;
 
   // use to compute moving mean and variance.
-  optional real moving_average_fraction = 47 [default = 0.9];
+  optional double moving_average_fraction = 47 [default = 0.9];
 
   // bias size
   optional uint32 bias_size = 48 [default = 0];
@@ -438,7 +438,7 @@ message EvaluatorConfig {
 
   // Used by PrecisionRecallEvaluator and ClassificationErrorEvaluator
   // For multi binary labels: true if output > classification_threshold
-  optional real classification_threshold = 6 [default = 0.5];
+  optional double classification_threshold = 6 [default = 0.5];
   // The positive label. -1 means average precision and recall
   optional int32 positive_label = 7 [default = -1];
 
diff --git a/proto/ParameterConfig.proto.m4 b/proto/ParameterConfig.proto
similarity index 87%
rename from proto/ParameterConfig.proto.m4
rename to proto/ParameterConfig.proto
index b5c0fea6c373307dc0af2e29c0f1ff5362823411..cbcd0af598df22c36c66767fdeb7add2aa49e87d 100644
--- a/proto/ParameterConfig.proto.m4
+++ b/proto/ParameterConfig.proto
@@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-ifdef(`proto3', `syntax = "proto2";')
+syntax = "proto2";
 
 package paddle;
 
@@ -32,14 +32,14 @@ message ParameterUpdaterHookConfig {
 message ParameterConfig {
   required string name = 1;
   required uint64 size = 2;
-  optional real learning_rate = 3 [default = 1.0];
-  optional real momentum = 4 [default = 0.0];
-  optional real initial_mean = 5 [default = 0.0];
-  optional real initial_std = 6 [default = 0.01];
+  optional double learning_rate = 3 [default = 1.0];
+  optional double momentum = 4 [default = 0.0];
+  optional double initial_mean = 5 [default = 0.0];
+  optional double initial_std = 6 [default = 0.01];
   // use L2-regularization if decay_rate set and decay_rate_l1 not set
-  optional real decay_rate = 7 [default = 0.0];
+  optional double decay_rate = 7 [default = 0.0];
   // use L1-regularization if decay_rate_l1 set
-  optional real decay_rate_l1 = 8 [default = 0.0];
+  optional double decay_rate_l1 = 8 [default = 0.0];
   // dims of Parameter, e.g. dims[0] as height, dims[1] as width..
   repeated uint64 dims = 9;
   // the gpu device which the parameter in.
@@ -60,7 +60,7 @@ message ParameterConfig {
   // sparse remote update or not
   optional bool sparse_remote_update = 16 [default = false];
   // gradient clipping threshold, no clipping by default
-  optional real gradient_clipping_threshold = 17 [default = 0.0];
+  optional double gradient_clipping_threshold = 17 [default = 0.0];
   // static parameters are fixed when training
   optional bool is_static = 18 [default = false];
   // para_id should NOT be set by config_parser. It is for
diff --git a/proto/ParameterService.proto.m4 b/proto/ParameterService.proto
similarity index 97%
rename from proto/ParameterService.proto.m4
rename to proto/ParameterService.proto
index 25b0991583ec128aeeca1ca775a574f81500d6e5..c1c04d8cc5bdedd09173d5dfa10b82c7ee7ed6a4 100644
--- a/proto/ParameterService.proto.m4
+++ b/proto/ParameterService.proto
@@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-ifdef(`proto3', `syntax = "proto2";')
+syntax = "proto2";
 
 import "ParameterConfig.proto";
 import "TrainerConfig.proto";
@@ -73,7 +73,7 @@ message SendParameterRequest {
   optional int64 num_samples = 4;
 
   // cost will be used to calculate global objective value
-  optional real cost = 5;
+  optional double cost = 5;
 
   required BatchStatus batch_status = 6;
 
@@ -245,13 +245,13 @@ enum MatrixVectorOperation {
 
 message ProtoVector {
   required int64 dim = 1;
-  repeated real values = 2 [packed = true];
+  repeated double values = 2 [packed = true];
 }
 
 message ProtoMatrix {
   required int64 num_rows = 1;
   required int64 num_cols = 2;
-  repeated real values = 3 [packed = true];
+  repeated double values = 3 [packed = true];
 }
 
 message Operation {
@@ -263,7 +263,7 @@ message Operation {
   // matrix handles created on the pserver
   repeated int64 pmatrices = 3;       // A, B, C
 
-  repeated real scalars = 4;  	      // a, b, c
+  repeated double scalars = 4;  	      // a, b, c
   repeated ProtoVector vectors = 5;   // x, y, z
   repeated ProtoMatrix matrices = 6;  // X, Y, Z
 }
@@ -272,7 +272,7 @@ message OperationResult {
   // error message. Empty if success
   optional string return_message = 1;
 //
-  repeated real scalars = 2;  // d, e, f
+  repeated double scalars = 2;  // d, e, f
   repeated ProtoVector vectors = 3;  // p, q, r
   repeated ProtoMatrix matrices = 4;  // P, Q, R
 }
diff --git a/proto/TrainerConfig.proto.m4 b/proto/TrainerConfig.proto
similarity index 87%
rename from proto/TrainerConfig.proto.m4
rename to proto/TrainerConfig.proto
index 4684203b03e3297f60629ff6929729c3daffd8c6..a334e07b6282a6ff9867482e0c3a299df2a78d1d 100644
--- a/proto/TrainerConfig.proto.m4
+++ b/proto/TrainerConfig.proto
@@ -11,7 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-ifdef(`proto3', `syntax = "proto2";')
+syntax = "proto2";
 
 import "DataConfig.proto";
 import "ModelConfig.proto";
@@ -24,9 +24,9 @@ message OptimizationConfig {
   optional int32 num_batches_per_send_parameter = 5 [default = 1];
   optional int32 num_batches_per_get_parameter = 6 [default = 1];
 
-  required real learning_rate = 7;
-  optional real learning_rate_decay_a = 8 [default = 0];
-  optional real learning_rate_decay_b = 9 [default = 0];
+  required double learning_rate = 7;
+  optional double learning_rate_decay_a = 8 [default = 0];
+  optional double learning_rate_decay_b = 9 [default = 0];
   optional string learning_rate_schedule = 27 [default = "constant"];
   // learning rate will be scaled according to learning_rate_schedule
   // 1), constant:
@@ -49,14 +49,14 @@ message OptimizationConfig {
 
   // owlqn related
   // L1-regularization
-  optional real l1weight = 10 [default = 0.1];
+  optional double l1weight = 10 [default = 0.1];
   // L2-regularization
-  optional real l2weight = 11 [default = 0];
+  optional double l2weight = 11 [default = 0];
   // "c1" in wolfe condition: if (newobj <= oldobj + c1 * origDirDeriv * step)
   // then accept the step
-  optional real c1 = 12 [default = 0.0001];
+  optional double c1 = 12 [default = 0.0001];
   // multiply the step with "backoff", when wolfe condition doesn't satisfy
-  optional real backoff = 13 [default = 0.5];
+  optional double backoff = 13 [default = 0.5];
   // how many "s"s and "y"s are kept in owlqn
   optional int32 owlqn_steps = 14 [default = 10];
   // accept the step if encountered "max_backoff" times of "reduce the step"
@@ -82,15 +82,15 @@ message OptimizationConfig {
   // default learning method("momentum") use global decayed learning rate with momentum.
   // "adagrad", "adadelta" and "rmsprop" can set momentum too.
   optional string learning_method = 23 [default = "momentum"];
-  optional real ada_epsilon = 24 [default = 1e-6];
-  optional real ada_rou = 26 [default = 0.95];
+  optional double ada_epsilon = 24 [default = 1e-6];
+  optional double ada_rou = 26 [default = 0.95];
 
   // Force to do average in cpu in order to save gpu memory usage
   optional bool do_average_in_cpu = 25 [default = false];
 
   // delta add rate in pserver, used while num_batches_per_send_parameter>1
   // will be divided by #machines automatically.
-  optional real delta_add_rate = 28 [default = 1.0];
+  optional double delta_add_rate = 28 [default = 1.0];
 
   // We split a large size into smaller mini-batches, whose sizes are
   // determined by mini_batch_size. It only takes effect when there is
@@ -108,14 +108,14 @@ message OptimizationConfig {
 
   // shrink sparse parameter value
   // only works if parameter is remote sparse update and has L1 decay rate
-  optional real shrink_parameter_value = 32 [default = 0];
+  optional double shrink_parameter_value = 32 [default = 0];
 
   ////////////////////////////
   // Options Adam Optimizer //
   ////////////////////////////
-  optional real adam_beta1 = 33 [default = 0.9];
-  optional real adam_beta2 = 34 [default = 0.999];
-  optional real adam_epsilon = 35 [default = 1e-8];
+  optional double adam_beta1 = 33 [default = 0.9];
+  optional double adam_beta2 = 34 [default = 0.999];
+  optional double adam_epsilon = 35 [default = 1e-8];
 
   // arguments for learning rate scheduler
   // Format: num1:rate1,num2:rate2,...,numK:rateK
@@ -127,7 +127,7 @@ message OptimizationConfig {
   // for async sgd gradient commit control.
   // when async_lagged_grad_discard_ratio * num_gradient_servers commit passed,
   // current async gradient will be discard silently.
-  optional real async_lagged_grad_discard_ratio = 37 [default = 1.5];
+  optional double async_lagged_grad_discard_ratio = 37 [default = 1.5];
 };
 
 message TrainerConfig {
diff --git a/python/paddle/trainer/PyDataProvider2.py b/python/paddle/trainer/PyDataProvider2.py
index 6e8cce1cce700f3a46571451da17b781e838c3b8..6618153df30250652f1721d2fb0bb75ecbb8a04a 100644
--- a/python/paddle/trainer/PyDataProvider2.py
+++ b/python/paddle/trainer/PyDataProvider2.py
@@ -203,6 +203,26 @@ class CheckWrapper(object):
             callback(each)
 
 
+class CheckInputTypeWrapper(object):
+    def __init__(self, generator, input_types, logger):
+        self.generator = generator
+        self.input_types = input_types
+        self.logger = logger
+
+    def __call__(self, obj, filename):
+        for items in self.generator(obj, filename):
+            try:
+                # dict type is required for input_types when item is dict type 
+                assert (isinstance(items, dict) and \
+                        not isinstance(self.input_types, dict))==False
+                yield items
+            except AssertionError as e:
+                self.logger.error(
+                    "%s type is required for input type but got %s" %
+                    (repr(type(items)), repr(type(self.input_types))))
+                raise
+
+
 def provider(input_types=None,
              should_shuffle=None,
              pool_size=-1,
@@ -355,6 +375,9 @@ def provider(input_types=None,
                 if use_dynamic_order:
                     self.generator = InputOrderWrapper(self.generator,
                                                        self.input_order)
+                else:
+                    self.generator = CheckInputTypeWrapper(
+                        self.generator, self.slots, self.logger)
                 if self.check:
                     self.generator = CheckWrapper(self.generator, self.slots,
                                                   check_fail_continue,
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index fd7fb40822cdfc84e5a6b62559ab18fba7b2824c..5b7f4d85e2c3343013938e38492be8985a8cd11f 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -141,9 +141,9 @@ def init_config_environment(
         g_add_submodel_suffix=False,
 
         # Whether current layer needs to pass the image height and width.
-        # Default value is true, but if it encounters recurrent_layer_group, 
-        # it will be false. The reason is that image is converted to be sequence, 
-        # image height will be sequence length, and image width will be feature 
+        # Default value is true, but if it encounters recurrent_layer_group,
+        # it will be false. The reason is that image is converted to be sequence,
+        # image height will be sequence length, and image width will be feature
         # length of each timestep.
         g_pass_height_width=True, ):
 
@@ -1067,7 +1067,7 @@ def cnn_output_size(img_size, filter_size, padding, stride, caffe_mode):
         return 1 + int(math.ceil(output))
 
 
-#calcualte image_size based on output_size for de-convolution (ConvTransLayer). 
+#calcualte image_size based on output_size for de-convolution (ConvTransLayer).
 #It is the reverse function of cnn_output_size
 def cnn_image_size(output_size, filter_size, padding, stride, caffe_mode):
     img_size = (output_size - 1) * stride + filter_size - 2 * padding
@@ -3365,12 +3365,26 @@ def my_fatal(s):
     raise Exception()
 
 
+_parse_config_hooks = set()
+
+
+def register_parse_config_hook(f):
+    """
+    Register a hook function for parse_config. parse_config will invoke the hook
+    at the beginning of parse. This make it possible to reset global state for
+    for constructing the model.
+    """
+    _parse_config_hooks.add(f)
+
+
 def parse_config(config_file, config_arg_str):
     '''
     @param config_arg_str: a string of the form var1=val1,var2=val2. It will be
     passed to config script as a dictionary CONFIG_ARGS
     '''
     init_config_environment()
+    for hook in _parse_config_hooks:
+        hook()
 
     config_args = {}
 
diff --git a/python/paddle/trainer_config_helpers/__init__.py b/python/paddle/trainer_config_helpers/__init__.py
index 3ac14549340bb61809a434053a8f04071a767b14..a2335768b92b66e3bfc0fb0d2562fb24ad291258 100644
--- a/python/paddle/trainer_config_helpers/__init__.py
+++ b/python/paddle/trainer_config_helpers/__init__.py
@@ -22,4 +22,4 @@ from optimizers import *
 from attrs import *
 
 # This will enable operator overload for LayerOutput
-import math
+import math as layer_math
diff --git a/python/paddle/trainer_config_helpers/data_sources.py b/python/paddle/trainer_config_helpers/data_sources.py
index b6ecd42857852c1f9f12db8370183d7799761a4f..d7cb95c477133823f9147e2085c9c609916f16e8 100644
--- a/python/paddle/trainer_config_helpers/data_sources.py
+++ b/python/paddle/trainer_config_helpers/data_sources.py
@@ -69,7 +69,7 @@ def define_py_data_source(file_list,
     """
     if isinstance(file_list, list):
         file_list_name = 'train.list'
-        if isinstance(cls, TestData):
+        if cls == TestData:
             file_list_name = 'test.list'
         with open(file_list_name, 'w') as f:
             f.writelines(file_list)
@@ -186,8 +186,7 @@ def define_py_data_sources2(train_list, test_list, module, obj, args=None):
                                 obj="process", 
                                 args={"dictionary": dict_name})
 
-    The related data provider can refer to 
-    `here <../../data_provider/pydataprovider2.html#dataprovider-for-the-sequential-model>`__.
+    The related data provider can refer to :ref:`api_pydataprovider2_sequential_model` .
 
     :param train_list: Train list name.
     :type train_list: basestring
diff --git a/python/paddle/trainer_config_helpers/default_decorators.py b/python/paddle/trainer_config_helpers/default_decorators.py
index 1caad193496cc62bca881d23d1d6ff2bebcc8f98..ad3efcbf369411b9c42b2a32ed05b04f86bf7de6 100644
--- a/python/paddle/trainer_config_helpers/default_decorators.py
+++ b/python/paddle/trainer_config_helpers/default_decorators.py
@@ -78,6 +78,20 @@ class DefaultNameFactory(object):
         """
         pass
 
+    def reset(self):
+        self.__counter__ = 0
+
+
+_name_factories = []
+
+
+def reset_hook():
+    for factory in _name_factories:
+        factory.reset()
+
+
+register_parse_config_hook(reset_hook)
+
 
 def wrap_name_default(name_prefix=None):
     """
@@ -95,7 +109,9 @@ def wrap_name_default(name_prefix=None):
     :return: a decorator to set default name
     :rtype: callable
     """
-    return wrap_param_default(["name"], DefaultNameFactory(name_prefix))
+    factory = DefaultNameFactory(name_prefix)
+    _name_factories.append(factory)
+    return wrap_param_default(["name"], factory)
 
 
 def wrap_param_attr_default(param_names=None, default_factory=None):
diff --git a/python/paddle/trainer_config_helpers/evaluators.py b/python/paddle/trainer_config_helpers/evaluators.py
index 0ee116d8c47c2f2d351af89583e2edbc22f9df55..3e0e88972c58e8c853e79e21f839943ae4b027d6 100644
--- a/python/paddle/trainer_config_helpers/evaluators.py
+++ b/python/paddle/trainer_config_helpers/evaluators.py
@@ -327,9 +327,10 @@ def ctc_error_evaluator(
 @wrap_name_default()
 def chunk_evaluator(
         input,
-        name=None,
-        chunk_scheme=None,
-        num_chunk_types=None, ):
+        label,
+        chunk_scheme,
+        num_chunk_types,
+        name=None, ):
     """
     Chunk evaluator is used to evaluate segment labelling accuracy for a
     sequence. It calculates the chunk detection F1 score.
@@ -363,22 +364,24 @@ def chunk_evaluator(
 
     .. code-block:: python
 
-       eval = chunk_evaluator(input)
+       eval = chunk_evaluator(input, label, chunk_scheme, num_chunk_types)
 
     :param input: The input layers.
     :type input: LayerOutput
-    :param name: The Evaluator name, it is not necessary.
-    :type name: basename|None
+    :param label: An input layer containing the ground truth label.
+    :type label: LayerOutput
     :param chunk_scheme: The labelling schemes support 4 types. It is one of
-                         "IOB", "IOE", "IOBES", "plain".This Evaluator must
-                         contain this chunk_scheme.
+                         "IOB", "IOE", "IOBES", "plain". It is required.
     :type chunk_scheme: basestring
     :param num_chunk_types: number of chunk types other than "other"
+    :param name: The Evaluator name, it is optional.
+    :type name: basename|None
     """
     evaluator_base(
         name=name,
         type="chunk",
         input=input,
+        label=label,
         chunk_scheme=chunk_scheme,
         num_chunk_types=num_chunk_types)
 
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 8dd6b7b7d28f841d7d7657b8ef3c25188c2f086e..c10fa671bdbf7c0a5e34c183533e04bea76029d4 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -970,7 +970,7 @@ def pooling_layer(input,
     :param layer_attr: The Extra Attributes for layer, such as dropout.
     :type layer_attr: ExtraLayerAttribute|None
     :return: LayerOutput object.
-    :rtype: LayerType
+    :rtype: LayerOutput
     """
     extra_dict = dict()
     # noinspection PyUnresolvedReferences
diff --git a/python/paddle/trainer_config_helpers/optimizers.py b/python/paddle/trainer_config_helpers/optimizers.py
index d95b2cfe464bbe6c9bbd216ab4f066545c970cdc..a53ebe160be3b5d6d115e3e15d059d3d87e80942 100644
--- a/python/paddle/trainer_config_helpers/optimizers.py
+++ b/python/paddle/trainer_config_helpers/optimizers.py
@@ -361,9 +361,6 @@ def settings(batch_size,
              learning_rate_decay_b=0.,
              learning_rate_schedule='poly',
              learning_rate_args='',
-             average_window=0,
-             do_average_in_cpu=False,
-             max_average_window=None,
              learning_method=None,
              regularization=None,
              is_async=False,
@@ -411,8 +408,7 @@ def settings(batch_size,
 
     args = [
         'batch_size', 'learning_rate', 'learning_rate_decay_a',
-        'learning_rate_decay_b', 'learning_rate_schedule', 'learning_rate_args',
-        'average_window', 'do_average_in_cpu', 'max_average_window'
+        'learning_rate_decay_b', 'learning_rate_schedule', 'learning_rate_args'
     ]
     kwargs = dict()
     kwargs['algorithm'] = algorithm
diff --git a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
index 6180b2efbcad87e511a4b981d533f204f45fb5dc..d1a9843d326669711bf3d0769df1b804cfcfa673 100644
--- a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
+++ b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
@@ -4,6 +4,11 @@ add_test(NAME layers_test
         python ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/layers_test.py
     WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
 
+add_test(NAME test_reset_hook
+  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+        python ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
+    WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
+
 if (PROTOBUF_3)
   add_paddle_exe(protobuf_equal
     ProtobufEqualMain.cpp)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
index e55f9bd3884a907dcc17a882e3c1dfd71fef79bb..a54af94ce3db4ed300dee697b30516c3b6448d7c 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
@@ -11,10 +11,12 @@ for conf in ${configs[*]}
 do
     echo "Generating " $conf
     python -m paddle.utils.dump_config $conf.py > $protostr/$conf.protostr.unittest
+    cat ${conf}.py |python test_config_parser_for_non_file_config.py > $protostr/$conf.protostr.non_file_config.unittest
 done
 
 for conf in ${whole_configs[*]}
 do
     echo "Generating " $conf
     python -m paddle.utils.dump_config $conf.py "" --whole > $protostr/$conf.protostr.unittest
+    cat ${conf}.py |python test_config_parser_for_non_file_config.py --whole > $protostr/$conf.protostr.non_file_config.unittest
 done
diff --git a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
index c4c6d4020f59c9d39c0cfc1f075c16ac16ac33db..3331c10d6497f58eb135208bd7abe48aacfb10ae 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/math_ops.py
@@ -1,15 +1,14 @@
 from paddle.trainer_config_helpers import *
-from paddle.trainer_config_helpers import math
 
 settings(batch_size=1000, learning_rate=1e-5)
 
 x = data_layer(name='data', size=100)
-x = math.exp(x)
-x = math.log(x)
-x = math.abs(x)
-x = math.sigmoid(x)
-x = math.square(x)
-x = math.square(x)
+x = layer_math.exp(x)
+x = layer_math.log(x)
+x = layer_math.abs(x)
+x = layer_math.sigmoid(x)
+x = layer_math.square(x)
+x = layer_math.square(x)
 y = 1 + x
 y = y + 1
 y = x + y
diff --git a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
index 73f8b333b236a8850e4c2dfa8fc75addeb143e9d..e984ee70625456241b3cfe6202fdadaa3807d33c 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
@@ -17,6 +17,7 @@ if [ -z $1 ]; then
       base_protostr=$protostr/$file
       new_protostr=$protostr/$file.unittest
       diff $base_protostr $new_protostr -u
+      diff $protostr/$file $protostr/$file.non_file_config.unittest -u
   done
 else
   for file in ${configs[*]}
@@ -24,6 +25,9 @@ else
     if ! $1 $protostr/$file.protostr $protostr/$file.protostr.unittest; then
       diff $protostr/$file.protostr $protostr/$file.protostr.unittest -u
     fi
+    if ! $1 $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest; then
+      diff $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest -u
+    fi
   done
 
   for file in ${whole_configs[*]}
@@ -31,5 +35,8 @@ else
     if ! $1 $protostr/$file.protostr $protostr/$file.protostr.unittest --whole; then
       diff $protostr/$file.protostr $protostr/$file.protostr.unittest -u
     fi
+    if ! $1 $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest --whole; then
+      diff $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest -u
+    fi
   done
 fi
diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_config_parser_for_non_file_config.py b/python/paddle/trainer_config_helpers/tests/configs/test_config_parser_for_non_file_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..9b791a0222dd60e9ae2fca8b2798cddd13ed1d1c
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_config_parser_for_non_file_config.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import sys
+import re
+import getopt
+
+
+def main(print_whole_config, globals, locals):
+    '''
+     this test will all test_config.py
+  '''
+    cmdstr = """from paddle.trainer.config_parser import parse_config\n"""
+    importstr = ""
+    functionstr = ""
+
+    for line in sys.stdin:
+        if re.match("^import", line) or re.match("^from.*import", line):
+            importstr = importstr + line
+        else:
+            functionstr = functionstr + "  " + line
+
+    cmdstr = cmdstr + importstr + """def configs():\n""" + functionstr
+    #cmdstr = cmdstr + """def configs():\n""" + importstr + functionstr
+    if print_whole_config:
+        cmdstr = cmdstr + """print parse_config(configs, "")"""
+    else:
+        cmdstr = cmdstr + """print parse_config(configs, "").model_config"""
+
+    exec (cmdstr, globals, locals)
+
+
+if __name__ == '__main__':
+    whole = False
+    opts, args = getopt.getopt(sys.argv[1:], "", ["whole"])
+    for op, value in opts:
+        if op == "--whole":
+            whole = True
+    main(whole, globals(), locals())
diff --git a/python/paddle/trainer_config_helpers/tests/test_reset_hook.py b/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
new file mode 100644
index 0000000000000000000000000000000000000000..0423babdb720191d8e9dfc67f1af3be339dbe27d
--- /dev/null
+++ b/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
@@ -0,0 +1,28 @@
+# Copyright PaddlePaddle contributors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+from paddle.trainer.config_parser import parse_config
+
+
+class TestParse(unittest.TestCase):
+    def test_parse(self):
+        a = parse_config('trainer_config_helpers/tests/layers_test_config.py',
+                         '')
+        b = parse_config('trainer_config_helpers/tests/layers_test_config.py',
+                         '')
+        self.assertEqual(a, b)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/third_party/gflags.BUILD b/third_party/gflags.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..85e8bd0bd74942102e5e9a9f817dc49383a745e7
--- /dev/null
+++ b/third_party/gflags.BUILD
@@ -0,0 +1,12 @@
+# Bazel (http://bazel.io/) BUILD file for gflags.
+#
+# See INSTALL.md for instructions for adding gflags to a Bazel workspace.
+
+licenses(["notice"])
+
+exports_files(["src/gflags_complections.sh", "COPYING.txt"])
+
+load(":bazel/gflags.bzl", "gflags_sources", "gflags_library")
+(hdrs, srcs) = gflags_sources(namespace=["google", "gflags"])
+gflags_library(hdrs=hdrs, srcs=srcs, threads=0)
+gflags_library(hdrs=hdrs, srcs=srcs, threads=1)
diff --git a/third_party/gflags_test/BUILD b/third_party/gflags_test/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..b50615203ba17c74a4c7611b685f3d3210389bbf
--- /dev/null
+++ b/third_party/gflags_test/BUILD
@@ -0,0 +1,10 @@
+licenses(["notice"])  # Apache 2.0
+
+cc_test(
+    name="gflags_test",
+    srcs=["gflags_test.cc"],
+    copts=["-Iexternal/gtest/include"],
+    deps=[
+        "@gtest//:gtest",
+        "@gflags//:gflags",
+    ], )
diff --git a/third_party/gflags_test/gflags_test.cc b/third_party/gflags_test/gflags_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..53286e7e5be062cf66b37d07047b173ea831e6c4
--- /dev/null
+++ b/third_party/gflags_test/gflags_test.cc
@@ -0,0 +1,33 @@
+#include <iostream>
+#include <string>
+
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+
+DEFINE_bool(verbose, false, "Display program name before message");
+DEFINE_string(message, "Hello world!", "Message to print");
+
+static bool IsNonEmptyMessage(const char *flagname, const std::string &value) {
+  return value[0] != '\0';
+}
+DEFINE_validator(message, &IsNonEmptyMessage);
+
+namespace third_party {
+namespace gflags_test {
+
+TEST(GflagsTest, ParseAndPrint) {
+  gflags::SetUsageMessage("some usage message");
+  gflags::SetVersionString("1.0.0");
+  int argc = 1;
+  char program_name[] = "gflags_test";
+  char **argv = new char *[2];
+  argv[0] = program_name;
+  argv[1] = NULL;
+  gflags::ParseCommandLineFlags(&argc, reinterpret_cast<char ***>(&argv), true);
+  EXPECT_EQ("gflags_test", std::string(gflags::ProgramInvocationShortName()));
+  EXPECT_EQ("Hello world!", FLAGS_message);
+  gflags::ShutDownCommandLineFlags();
+}
+
+}  // namespace gflags_test
+}  // namespace third_party
diff --git a/third_party/glog.BUILD b/third_party/glog.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..a0ff1d6b416c2217b62f64bceee3c6a611c11dfe
--- /dev/null
+++ b/third_party/glog.BUILD
@@ -0,0 +1,128 @@
+licenses(["notice"])
+
+cc_library(
+    visibility=["//visibility:public"],
+    name="glog",
+    includes=[
+        ".",
+        "src",
+    ],
+    copts=[
+        "-D_START_GOOGLE_NAMESPACE_='namespace google {'",
+        "-D_END_GOOGLE_NAMESPACE_='}'",
+        "-DGOOGLE_NAMESPACE='google'",
+        "-DGOOGLE_GLOG_DLL_DECL=''",
+        "-DHAVE_DLADDR",
+        "-DHAVE_SNPRINTF",
+        "-DHAVE_DLFCN_H",
+        "-DHAVE_FCNTL",
+        "-DHAVE_GLOB_H",
+        "-DHAVE_INTTYPES_H",
+        "-DHAVE_LIBPTHREAD",
+        "-DHAVE_SYS_SYSCALL_H",
+        "-DHAVE_MEMORY_H",
+        "-DHAVE_NAMESPACES",
+        "-DHAVE_PREAD",
+        "-DHAVE_PTHREAD",
+        "-DHAVE_PWD_H",
+        "-DHAVE_PWRITE",
+        "-DHAVE_RWLOCK",
+        "-DHAVE_SIGACTION",
+        "-DHAVE_SIGALTSTACK",
+        "-DHAVE_STDINT_H",
+        "-DHAVE_STRING_H",
+        "-DHAVE_SYS_TIME_H",
+        "-DHAVE_SYS_TYPES_H",
+        "-DHAVE_SYS_UCONTEXT_H",
+        "-DHAVE_SYS_UTSNAME_H",
+        "-DHAVE_UNISTD_H",
+        "-DHAVE_USING_OPERATOR",
+        "-DHAVE_HAVE___ATTRIBUTE___",
+        "-DHAVE_HAVE___BUILTIN_EXPECT",
+        #"-DNO_FRAME_POINTER",
+        "-D_GNU_SOURCE",
+        #"-fno-sanitize=thread",
+        #"-fno-sanitize=address",
+        "-Iexternal/glog/src",
+    ],
+    srcs=[
+        "src/demangle.cc",
+        "src/logging.cc",
+        "src/raw_logging.cc",
+        "src/signalhandler.cc",
+        "src/symbolize.cc",
+        "src/utilities.cc",
+        "src/vlog_is_on.cc",
+        ":config_h",
+        ":logging_h",
+        ":raw_logging_h",
+        ":stl_logging_h",
+        ":vlog_is_on_h",
+    ],
+    hdrs=[
+        "src/demangle.h",
+        "src/mock-log.h",
+        "src/stacktrace.h",
+        "src/symbolize.h",
+        "src/utilities.h",
+        "src/base/commandlineflags.h",
+        "src/base/googleinit.h",
+        "src/base/mutex.h",
+        "src/glog/log_severity.h",
+    ])
+
+genrule(
+    name="config_h",
+    srcs=["src/config.h.cmake.in"],
+    outs=["config.h"],
+    cmd="awk '{ gsub(/^#cmakedefine/, \"//cmakedefine\"); print; }' $(<) > $(@)",
+)
+
+genrule(
+    name="logging_h",
+    srcs=["src/glog/logging.h.in"],
+    outs=["glog/logging.h"],
+    cmd="$(location :gen_sh) < $(<) > $(@)",
+    tools=[":gen_sh"])
+
+genrule(
+    name="raw_logging_h",
+    srcs=["src/glog/raw_logging.h.in"],
+    outs=["glog/raw_logging.h"],
+    cmd="$(location :gen_sh) < $(<) > $(@)",
+    tools=[":gen_sh"])
+
+genrule(
+    name="stl_logging_h",
+    srcs=["src/glog/stl_logging.h.in"],
+    outs=["glog/stl_logging.h"],
+    cmd="$(location :gen_sh) < $(<) > $(@)",
+    tools=[":gen_sh"])
+
+genrule(
+    name="vlog_is_on_h",
+    srcs=["src/glog/vlog_is_on.h.in"],
+    outs=["glog/vlog_is_on.h"],
+    cmd="$(location :gen_sh) < $(<) > $(@)",
+    tools=[":gen_sh"])
+
+genrule(
+    name="gen_sh",
+    outs=["gen.sh"],
+    cmd="""
+cat > $@ <<"EOF"
+#! /bin/sh
+sed -e 's/@ac_cv_have_unistd_h@/1/g' \
+    -e 's/@ac_cv_have_stdint_h@/1/g' \
+    -e 's/@ac_cv_have_systypes_h@/1/g' \
+    -e 's/@ac_cv_have_libgflags_h@/1/g' \
+    -e 's/@ac_cv_have_uint16_t@/1/g' \
+    -e 's/@ac_cv_have___builtin_expect@/1/g' \
+    -e 's/@ac_cv_have_.*@/0/g' \
+    -e 's/@ac_google_start_namespace@/namespace google {/g' \
+    -e 's/@ac_google_end_namespace@/}/g' \
+    -e 's/@ac_google_namespace@/google/g' \
+    -e 's/@ac_cv___attribute___noinline@/__attribute__((noinline))/g' \
+    -e 's/@ac_cv___attribute___noreturn@/__attribute__((noreturn))/g' \
+    -e 's/@ac_cv___attribute___printf_4_5@/__attribute__((__format__ (__printf__, 4, 5)))/g'
+EOF""")
diff --git a/third_party/glog_test/BUILD b/third_party/glog_test/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..56d08e95f8e8f063829ae68586fa9ef53306fef6
--- /dev/null
+++ b/third_party/glog_test/BUILD
@@ -0,0 +1,10 @@
+licenses(["notice"])  # Apache 2.0
+
+cc_test(
+    name="glog_test",
+    srcs=["glog_test.cc"],
+    copts=["-Iexternal/gtest/include"],
+    deps=[
+        "@gtest//:gtest",
+        "@glog//:glog",
+    ], )
diff --git a/third_party/glog_test/glog_test.cc b/third_party/glog_test/glog_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f1d737d625d25e8675f636075876903c42881a35
--- /dev/null
+++ b/third_party/glog_test/glog_test.cc
@@ -0,0 +1,7 @@
+#include <iostream>
+#include <string>
+
+#include "glog/logging.h"
+#include "gtest/gtest.h"
+
+TEST(GlogTest, Logging) { LOG(INFO) << "Hello world"; }
diff --git a/third_party/gtest.BUILD b/third_party/gtest.BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..9255b51d9aaa9c7ee5cbc1b2d537815c7ecbfcba
--- /dev/null
+++ b/third_party/gtest.BUILD
@@ -0,0 +1,8 @@
+cc_library(
+    name="gtest",
+    srcs=glob(
+        ["src/*.cc"], exclude=["src/gtest-all.cc"]),
+    hdrs=glob(["include/**/*.h", "src/*.h"]),
+    copts=["-Iexternal/gtest/include"],
+    linkopts=["-pthread"],
+    visibility=["//visibility:public"], )
diff --git a/third_party/protobuf_test/BUILD b/third_party/protobuf_test/BUILD
new file mode 100644
index 0000000000000000000000000000000000000000..67d4293c70eef081f6bb95de9774613a19ba91dd
--- /dev/null
+++ b/third_party/protobuf_test/BUILD
@@ -0,0 +1,24 @@
+licenses(["notice"])  # Apache 2.0
+
+load("@protobuf//:protobuf.bzl", "cc_proto_library")
+
+cc_proto_library(
+    name="example_proto",
+    srcs=["example.proto"],
+    protoc="@protobuf//:protoc",
+    default_runtime="@protobuf//:protobuf", )
+
+cc_library(
+    name="example_lib",
+    srcs=["example_lib.cc"],
+    hdrs=["example_lib.h"],
+    deps=[":example_proto"], )
+
+cc_test(
+    name="example_lib_test",
+    srcs=["example_lib_test.cc"],
+    copts=["-Iexternal/gtest/include"],
+    deps=[
+        "@gtest//:gtest",
+        ":example_lib",
+    ], )
diff --git a/third_party/protobuf_test/README.md b/third_party/protobuf_test/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..e8bdeee6fee66ef79d0b813b4d8dfa4c180754c6
--- /dev/null
+++ b/third_party/protobuf_test/README.md
@@ -0,0 +1 @@
+This package tests that Bazel can build protobuf related rules.
diff --git a/third_party/protobuf_test/example.proto b/third_party/protobuf_test/example.proto
new file mode 100644
index 0000000000000000000000000000000000000000..6a7eada9c14a9df5d3ef8971b636c14a11da3d11
--- /dev/null
+++ b/third_party/protobuf_test/example.proto
@@ -0,0 +1,7 @@
+syntax = "proto3";
+
+package third_party.protobuf_test;
+
+message Greeting {
+  string name = 1;
+}
diff --git a/third_party/protobuf_test/example_lib.cc b/third_party/protobuf_test/example_lib.cc
new file mode 100644
index 0000000000000000000000000000000000000000..ced377bc0a17dde31c5c853dec1a852fa0be7223
--- /dev/null
+++ b/third_party/protobuf_test/example_lib.cc
@@ -0,0 +1,9 @@
+#include "third_party/protobuf_test/example_lib.h"
+
+namespace third_party {
+namespace protobuf_test {
+
+std::string get_greet(const Greeting& who) { return "Hello " + who.name(); }
+
+}  // namespace protobuf_test
+}  // namespace thrid_party
diff --git a/third_party/protobuf_test/example_lib.h b/third_party/protobuf_test/example_lib.h
new file mode 100644
index 0000000000000000000000000000000000000000..516326e812e19eb162f5392b519904a65c66c660
--- /dev/null
+++ b/third_party/protobuf_test/example_lib.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "third_party/protobuf_test/example.pb.h"
+
+#include <string>
+
+namespace third_party {
+namespace protobuf_test {
+
+std::string get_greet(const Greeting &who);
+
+}  // namespace protobuf_test
+}  // namespace third_party
diff --git a/third_party/protobuf_test/example_lib_test.cc b/third_party/protobuf_test/example_lib_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6229f56e6026908fff991765bd6bdaff6f8236ac
--- /dev/null
+++ b/third_party/protobuf_test/example_lib_test.cc
@@ -0,0 +1,15 @@
+#include "third_party/protobuf_test/example_lib.h"
+
+#include "gtest/gtest.h"
+
+namespace third_party {
+namespace protobuf_test {
+
+TEST(ProtobufTest, GetGreet) {
+  Greeting g;
+  g.set_name("Paddle");
+  EXPECT_EQ("Hello Paddle", get_greet(g));
+}
+
+}  // namespace protobuf_test
+}  // namespace third_party