Merge branch 'develop' of github.com:baidu/Paddle into feature/upgrade_to_proto3

79611a27 · Yu Yang · 0c65442c · 438a4ec6 · 79611a27 · 79611a27
149 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -25,8 +25,8 @@ find_package(ZLIB REQUIRED)
 find_package(NumPy REQUIRED)
 find_package(Threads REQUIRED)
 find_package(AVX QUIET)
-find_package(Glog)
-find_package(Gflags QUIET)
+find_package(Glog REQUIRED)
+find_package(Gflags REQUIRED)
 find_package(GTest)
 find_package(Sphinx)
 find_package(Doxygen)
@@ -40,8 +40,6 @@ option(WITH_AVX "Compile PaddlePaddle with avx intrinsics" ${AVX_FOUND})
 option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
 option(WITH_STYLE_CHECK "Style Check for PaddlePaddle" ${PYTHONINTERP_FOUND})
 option(WITH_RDMA "Compile PaddlePaddle with rdma support" OFF)
-option(WITH_GLOG "Compile PaddlePaddle use glog, otherwise use a log implement internally" ${LIBGLOG_FOUND})
-option(WITH_GFLAGS "Compile PaddlePaddle use gflags, otherwise use a flag implement internally" ${GFLAGS_FOUND})
 option(WITH_TIMER "Compile PaddlePaddle use timer" OFF)
 option(WITH_PROFILER "Compile PaddlePaddle use gpu profiler" OFF)
 option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND})
@@ -136,16 +134,12 @@ else(WITH_RDMA)
  add_definitions(-DPADDLE_DISABLE_RDMA)
 endif(WITH_RDMA)

-if(WITH_GLOG)
-    add_definitions(-DPADDLE_USE_GLOG)
-    include_directories(${LIBGLOG_INCLUDE_DIR})
-endif()
+# glog
+include_directories(${LIBGLOG_INCLUDE_DIR})

-if(WITH_GFLAGS)
-    add_definitions(-DPADDLE_USE_GFLAGS)
-    add_definitions(-DGFLAGS_NS=${GFLAGS_NAMESPACE})
-    include_directories(${GFLAGS_INCLUDE_DIRS})
-endif()
+#gflags
+add_definitions(-DGFLAGS_NS=${GFLAGS_NAMESPACE})
+include_directories(${GFLAGS_INCLUDE_DIRS})

 if(WITH_TESTING)
    enable_testing()

--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
+./doc/howto/dev/contribute_to_paddle_en.md
--- a/WORKSPACE
+++ b/WORKSPACE
@@ -3,7 +3,7 @@ http_archive(
    name="protobuf",
    url="http://github.com/google/protobuf/archive/v3.1.0.tar.gz",
    sha256="0a0ae63cbffc274efb573bdde9a253e3f32e458c41261df51c5dbc5ad541e8f7",
-    strip_prefix="protobuf-3.1.0", )
+    strip_prefix="protobuf-3.1.0")

 # External dependency to gtest 1.7.0.  This method comes from
 # https://www.bazel.io/versions/master/docs/tutorial/cpp.html.
@@ -12,4 +12,20 @@ new_http_archive(
    url="https://github.com/google/googletest/archive/release-1.7.0.zip",
    sha256="b58cb7547a28b2c718d1e38aee18a3659c9e3ff52440297e965f5edffe34b6d0",
    build_file="third_party/gtest.BUILD",
-    strip_prefix="googletest-release-1.7.0", )
+    strip_prefix="googletest-release-1.7.0")
+
+# External dependency to gflags.  This method comes from
+# https://github.com/gflags/example/blob/master/WORKSPACE.
+new_git_repository(
+    name="gflags",
+    tag="v2.2.0",
+    remote="https://github.com/gflags/gflags.git",
+    build_file="third_party/gflags.BUILD")
+
+# External dependency to glog.  This method comes from
+# https://github.com/reyoung/bazel_playground/blob/master/WORKSPACE
+new_git_repository(
+    name="glog",
+    remote="https://github.com/google/glog.git",
+    commit="b6a5e0524c28178985f0d228e9eaa43808dbec3c",
+    build_file="third_party/glog.BUILD")
--- a/cmake/FindSphinx.cmake
+++ b/cmake/FindSphinx.cmake
@@ -72,7 +72,7 @@ function( Sphinx_add_target target_name builder conf cache source destination )
    ${source}
    ${destination}
    COMMENT "Generating sphinx documentation: ${builder}"
-    COMMAND ln -s ${destination}/index_*.html ${destination}/index.html
+    COMMAND ln -sf ${destination}/index_*.html ${destination}/index.html
    )

  set_property(

--- a/cmake/check_packages.cmake
+++ b/cmake/check_packages.cmake
@@ -14,13 +14,9 @@ if(WITH_STYLE_CHECK)
  find_package(PythonInterp REQUIRED)
 endif()

-if(WITH_GLOG)
-  find_package(Glog REQUIRED)
-endif()
+find_package(Glog REQUIRED)

-if(WITH_GFLAGS)
-  find_package(Gflags REQUIRED)
-endif()
+find_package(Gflags REQUIRED)

 if(WITH_TESTING)
  find_package(GTest REQUIRED)

--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -65,7 +65,7 @@ endmacro()
 # link_paddle_exe
 # add paddle library for a paddle executable, such as trainer, pserver.
 #
-# It will handle WITH_PYTHON/WITH_GLOG etc.
+# It will handle WITH_PYTHON etc.
 function(link_paddle_exe TARGET_NAME)
    if(WITH_RDMA)
        generate_rdma_links()
@@ -108,6 +108,8 @@ function(link_paddle_exe TARGET_NAME)
        paddle_cuda
        ${METRIC_LIBS}
        ${PROTOBUF_LIBRARY}
+        ${LIBGLOG_LIBRARY}
+        ${GFLAGS_LIBRARIES}
        ${CMAKE_THREAD_LIBS_INIT}
        ${CBLAS_LIBS}
        ${ZLIB_LIBRARIES}
@@ -125,16 +127,6 @@ function(link_paddle_exe TARGET_NAME)
            ${PYTHON_LIBRARIES})
    endif()

-    if(WITH_GLOG)
-        target_link_libraries(${TARGET_NAME}
-            ${LIBGLOG_LIBRARY})
-    endif()
-
-    if(WITH_GFLAGS)
-        target_link_libraries(${TARGET_NAME}
-            ${GFLAGS_LIBRARIES})
-    endif()
-
    if(WITH_GPU)
        if(NOT WITH_DSO OR WITH_METRIC)
            target_link_libraries(${TARGET_NAME}

--- a/demo/semantic_role_labeling/data/extract_dict_feature.py
+++ b/demo/semantic_role_labeling/data/extract_dict_feature.py
@@ -43,13 +43,13 @@ def extract_dict_features(pair_file, feature_file):
            mark[verb_index] = 1
            ctx_0 = sentence_list[verb_index]

-            if verb_index < len(labels_list) - 2:
+            if verb_index < len(labels_list) - 1:
                mark[verb_index + 1] = 1
                ctx_p1 = sentence_list[verb_index + 1]
            else:
                ctx_p1 = 'eos'

-            if verb_index < len(labels_list) - 3:
+            if verb_index < len(labels_list) - 2:
                mark[verb_index + 2] = 1
                ctx_p2 = sentence_list[verb_index + 2]
            else:

--- a/doc/CMakeLists.txt
+++ b/doc/CMakeLists.txt
@@ -16,7 +16,7 @@ set(SPHINX_CACHE_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/_doctrees")
 set(SPHINX_HTML_DIR_EN "${CMAKE_CURRENT_BINARY_DIR}/en/html")

 configure_file(
-    "${CMAKE_CURRENT_SOURCE_DIR}/conf.py.en.in"
+    "${CMAKE_CURRENT_SOURCE_DIR}/templates/conf.py.en.in"
    "${BINARY_BUILD_DIR_EN}/conf.py"
    @ONLY)

@@ -41,7 +41,7 @@ set(SPHINX_CACHE_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/_doctrees")
 set(SPHINX_HTML_DIR_CN "${CMAKE_CURRENT_BINARY_DIR}/cn/html")

 configure_file(
-    "${CMAKE_CURRENT_SOURCE_DIR}/conf.py.cn.in"
+    "${CMAKE_CURRENT_SOURCE_DIR}/templates/conf.py.cn.in"
    "${BINARY_BUILD_DIR_CN}/conf.py"
    @ONLY)


--- a/doc/about/index_cn.md
+++ b/doc/about/index_cn.md
+关于PaddlePaddle
+================
+
+PaddlePaddle是一个最早由百度科学家和工程师共同研发的并行分布式深度学习平台，兼备易用性、高效性、灵活性和可扩展性，目前已被百度内部多个产品线广泛使用。
+PaddlePaddle目前已经开放源码, 但是远未完善，我们希望能在这个基础上不断的改进、扩展和延伸。
+同时我们希望广大开发者积极提供反馈和贡献源代码，建立一个活跃的开源社区。
+
+致谢
+--------
+
+在此，特别感谢PaddlePaddle的[所有贡献者](https://github.com/PaddlePaddle/Paddle/graphs/contributors)。
--- a/doc/about/index_en.rst
+++ b/doc/about/index_en.rst
@@ -11,4 +11,4 @@ We hope to build an active open source community both by providing feedback and
 Credits
 --------

-We owe many thanks to `all contributors and developers <https://github.com/PaddlePaddle/Paddle/blob/develop/authors>`_ of PaddlePaddle!
+We owe many thanks to `all contributors and developers <https://github.com/PaddlePaddle/Paddle/graphs/contributors>`_ of PaddlePaddle!
--- a/doc/api/index_cn.rst
+++ b/doc/api/index_cn.rst
-API
-===
+API中文手册
+============

 DataProvider API
 ----------------

--- a/doc/getstarted/basic_usage/index_cn.rst
+++ b/doc/getstarted/basic_usage/index_cn.rst
-简介
-====
+经典的线性回归任务
+==================

 PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍将向你展示如何利用PaddlePaddle来解决一个经典的线性回归问题。

-1. 一个经典的任务
-----------------
+任务简介
+--------

 我们展示如何用PaddlePaddle解决 `单变量的线性回归 <https://www.baidu.com/s?wd=单变量线性回归>`_ 问题。线性回归的输入是一批点 `(x, y)` ，其中 `y = wx + b + ε`， 而 ε 是一个符合高斯分布的随机变量。线性回归的输出是从这批点估计出来的参数 `w` 和 `b` 。

 一个例子是房产估值。我们假设房产的价格（y）是其大小（x）的一个线性函数，那么我们可以通过收集市场上房子的大小和价格，用来估计线性函数的参数w 和 b。

-2. 准备数据
+准备数据
 -----------

 假设变量 `x` 和 `y` 的真实关系为： `y = 2x + 0.3 + ε`，这里展示如何使用观测数据来拟合这一线性关系。首先，Python代码将随机产生2000个观测点，作为线性回归的输入。下面脚本符合PaddlePaddle期待的读取数据的Python程序的模式。
@@ -28,7 +28,7 @@ PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍
            x = random.random()
            yield [x], [2*x+0.3]

-3. 训练模型
+训练模型
 -----------

 为了还原 `y = 2x + 0.3`，我们先从一条随机的直线 `y' = wx + b` 开始，然后利用观测数据调整 `w` 和 `b` 使得 `y'` 和 `y` 的差距不断减小，最终趋于接近。这个过程就是模型的训练过程，而 `w` 和 `b` 就是模型的参数，即我们的训练目标。
@@ -79,7 +79,7 @@ PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍

 PaddlePaddle将在观测数据集上迭代训练30轮，并将每轮的模型结果存放在 `./output` 路径下。从输出日志可以看到，随着轮数增加误差代价函数的输出在不断的减小，这意味着模型在训练数据上不断的改进，直到逼近真实解：` y = 2x + 0.3 `

-4. 模型检验
+模型检验
 -----------

 训练完成后，我们希望能够检验模型的好坏。一种常用的做法是用学习的模型对另外一组测试数据进行预测，评价预测的效果。在这个例子中，由于已经知道了真实答案，我们可以直接观察模型的参数是否符合预期来进行检验。
@@ -106,10 +106,3 @@ PaddlePaddle将每个模型参数作为一个numpy数组单独存为一个文件
 从图中可以看到，虽然 `w` 和 `b` 都使用随机值初始化，但在起初的几轮训练中它们都在快速逼近真实值，并且后续仍在不断改进，使得最终得到的模型几乎与真实模型一致。

 这样，我们用PaddlePaddle解决了单变量线性回归问题， 包括数据输入、模型训练和最后的结果验证。
-
-5. 推荐后续阅读
---------------
-
- `安装/编译 <../build_and_install/index.html>`_ ：PaddlePaddle的安装与编译文档。
- `快速入门 <../demo/quick_start/index.html>`_ ：使用商品评论分类任务，系统性的介绍如何一步步改进，最终得到产品级的深度模型。
- `示例 <../demo/index.html>`_ ：各种实用案例，涵盖图像、文本、推荐等多个领域。
\ No newline at end of file
--- a/doc/getstarted/basic_usage/index_en.rst
+++ b/doc/getstarted/basic_usage/index_en.rst
-Basic Usage
-=============
+Simple Linear Regression
+========================

 PaddlePaddle is a deep learning platform open-sourced by Baidu. With PaddlePaddle, you can easily train a classic neural network within a couple lines of configuration, or you can build sophisticated models that provide state-of-the-art performance on difficult learning tasks like sentiment analysis, machine translation, image caption and so on.

-1. A Classic Problem
---------------------
+Problem Background
+------------------

 Now, to give you a hint of what using PaddlePaddle looks like, let's start with a fundamental learning problem - `simple linear regression <https://en.wikipedia.org/wiki/Simple_linear_regression>`_: you have observed a set of two-dimensional data points of ``X`` and ``Y``, where ``X`` is an explanatory variable and ``Y`` is corresponding dependent variable, and you want to recover the underlying correlation between ``X`` and ``Y``. Linear regression can be used in many practical scenarios. For example, ``X`` can be a variable about house size, and ``Y`` a variable about house price. You can build a model that captures relationship between them by observing real estate markets.

-2. Prepare the Data
--------------------
+Prepare the Data
+-----------------

 Suppose the true relationship can be characterized as ``Y = 2X + 0.3``, let's see how to recover this pattern only from observed data. Here is a piece of python code that feeds synthetic data to PaddlePaddle. The code is pretty self-explanatory, the only extra thing you need to add for PaddlePaddle is a definition of input data types.

@@ -26,8 +26,8 @@ Suppose the true relationship can be characterized as ``Y = 2X + 0.3``, let's se
                x = random.random()
                yield [x], [2*x+0.3]

-3. Train a NeuralNetwork
-------------------------
+Train a Neural Network
+----------------------

 To recover this relationship between ``X`` and ``Y``, we use a neural network with one layer of linear activation units and a square error cost layer. Don't worry if you are not familiar with these terminologies, it's just saying that we are starting from a random line ``Y' = wX + b`` , then we gradually adapt ``w`` and ``b`` to minimize the difference between ``Y'`` and ``Y``. Here is what it looks like in PaddlePaddle:

@@ -73,8 +73,8 @@ Now that everything is ready, you can train the network with a simple command li
 This means that PaddlePaddle will train this network on the synthectic dataset for 30 passes, and save all the models under path ``./output``. You will see from the messages printed out during training phase that the model cost is decreasing as time goes by, which indicates we are getting a closer guess.


-4. Evaluate the Model
-----------------------
+Evaluate the Model
+-------------------

 Usually, a different dataset that left out during training phase should be used to evalute the models. However, we are lucky enough to know the real answer: ``w=2, b=0.3``, thus a better option is to check out model parameters directly.


--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ b/doc/getstarted/build_and_install/build_from_source_en.md
@@ -49,10 +49,8 @@ PaddlePaddle supports some build options. To enable it, first you need to instal
 <tbody>
 <tr><td class="left">WITH_GPU</td><td class="left">Compile with GPU mode.</td></tr>
 <tr><td class="left">WITH_DOUBLE</td><td class="left">Compile with double precision floating-point, default: single precision.</td></tr>
-<tr><td class="left">WITH_GLOG</td><td class="left">Compile with glog. If not found, default: an internal log implementation.</td></tr>
-<tr><td class="left">WITH_GFLAGS</td><td class="left">Compile with gflags. If not found, default: an internal flag implementation.</td></tr>
 <tr><td class="left">WITH_TESTING</td><td class="left">Compile with gtest for PaddlePaddle's unit testing.</td></tr>
-<tr><td class="left">WITH_DOC</td><td class="left">	Compile to generate PaddlePaddle's docs, default: disabled (OFF).</td></tr>
+<tr><td class="left">WITH_DOC</td><td class="left">    Compile to generate PaddlePaddle's docs, default: disabled (OFF).</td></tr>
 <tr><td class="left">WITH_SWIG_PY</td><td class="left">Compile with python predict API, default: disabled (OFF).</td></tr>
 <tr><td class="left">WITH_STYLE_CHECK</td><td class="left">Compile with code style check, default: enabled (ON).</td></tr>
 </tbody>

--- a/doc/getstarted/build_and_install/cmake/compile_options.csv
+++ b/doc/getstarted/build_and_install/cmake/compile_options.csv
@@ -6,8 +6,6 @@ WITH_AVX,是否编译含有AVX指令集的PaddlePaddle二进制文件,是
 WITH_PYTHON,是否内嵌PYTHON解释器。方便今后的嵌入式移植工作。,是
 WITH_STYLE_CHECK,是否编译时进行代码风格检查,是
 WITH_RDMA,是否开启RDMA,否
-WITH_GLOG,是否开启GLOG。如果不开启，则会使用一个简化版的日志，同时方便今后的嵌入式移植工作。,取决于是否寻找到GLOG
-WITH_GFLAGS,是否使用GFLAGS。如果不开启，则会使用一个简化版的命令行参数解析器，同时方便今后的嵌入式移植工作。,取决于是否寻找到GFLAGS
 WITH_TIMER,是否开启计时功能。如果开启会导致运行略慢，打印的日志变多，但是方便调试和测Benchmark,否
 WITH_TESTING,是否开启单元测试,取决于是否寻找到GTEST
 WITH_DOC,是否编译中英文文档,否

--- a/doc/getstarted/build_and_install/index_cn.rst
+++ b/doc/getstarted/build_and_install/index_cn.rst
 编译与安装
-========================
+==========

 安装
 ++++
@@ -24,4 +24,4 @@ PaddlePaddle提供数个预编译的二进制来进行安装，包括Docker镜
 ..  toctree::
    :maxdepth: 1

-    cmake/build_from_source_cn.rst
\ No newline at end of file
+    cmake/build_from_source_cn.rst
--- a/doc/getstarted/build_and_install/ubuntu_install_cn.rst
+++ b/doc/getstarted/build_and_install/ubuntu_install_cn.rst
@@ -46,8 +46,6 @@ PaddlePaddle提供了ubuntu 14.04 deb安装包。
        with_double: OFF
        with_python: ON
        with_rdma: OFF
-        with_glog: ON
-        with_gflags: ON
        with_metric_learning:
        with_timer: OFF
        with_predict_sdk:

--- a/doc/getstarted/index_cn.rst
+++ b/doc/getstarted/index_cn.rst
-GET STARTED
+新手入门
 ============

 ..  toctree::

--- a/doc/howto/concepts/nn_cn.rst
+++ b/doc/howto/concepts/nn_cn.rst
-TBD
-
-目前正在书写中。敬请期待。
\ No newline at end of file
--- a/doc/howto/concepts/program_concepts_cn.rst
+++ b/doc/howto/concepts/program_concepts_cn.rst
-TBD
-###
-
-目前正在书写中。敬请期待。
\ No newline at end of file
--- a/doc/howto/deep_model/index_cn.rst
+++ b/doc/howto/deep_model/index_cn.rst
-How to Configure Deep Models
-============================
-
-..  toctree::
-  :maxdepth: 1
-
-  rnn/recurrent_group_cn.md
-  rnn/hierarchical_layer_cn.rst
-  rnn/hrnn_rnn_api_compare_cn.rst
-  rnn/hrnn_demo_cn.rst
--- a/doc/howto/deep_model/index_en.rst
+++ b/doc/howto/deep_model/index_en.rst
-How to Configure Deep Models
-============================
-
-..  toctree::
-  :maxdepth: 1
-
-  rnn/rnn_en.rst
--- a/doc/howto/deep_model/rnn/hrnn_demo_cn.rst
+++ b/doc/howto/deep_model/rnn/hrnn_demo_cn.rst
-..	_algo_hrnn_demo:
-
-#################
-双层RNN的使用示例
-#################
-
-TBD
\ No newline at end of file
--- a/doc/howto/deep_model/rnn/index_cn.rst
+++ b/doc/howto/deep_model/rnn/index_cn.rst
+RNN相关模型
+===========
+
+..  toctree::
+  :maxdepth: 1
+
+  recurrent_group_cn.md
+  hierarchical_layer_cn.rst
+  hrnn_rnn_api_compare_cn.rst
--- a/doc/howto/deep_model/rnn/index_en.rst
+++ b/doc/howto/deep_model/rnn/index_en.rst
+RNN Models
+==========
+
+..  toctree::
+  :maxdepth: 1
+
+  rnn_config_en.rst
--- a/doc/howto/deep_model/rnn/rnn_en.rst
+++ b/doc/howto/deep_model/rnn/rnn_en.rst
--- a/doc/howto/new_layer/FullyConnected.jpg
+++ b/doc/howto/new_layer/FullyConnected.jpg
--- a/doc/howto/dev/contribute_to_paddle_cn.md
+++ b/doc/howto/dev/contribute_to_paddle_cn.md
+# 如何贡献代码
+
+我们真诚地感谢您的贡献，欢迎通过 GitHub 的 fork 和 pull request 流程来提交代码。
+ 
+## 代码要求
+- 你的代码必须完全遵守 [doxygen](http://www.stack.nl/~dimitri/doxygen/) 的样式。
+- 确保编译器选项 WITH\_STYLE\_CHECK 已打开，并且编译能通过代码样式检查。
+- 所有代码必须具有单元测试。
+- 通过所有单元测试。
+
+以下教程将指导您提交代码。
+ 
+## [Fork](https://help.github.com/articles/fork-a-repo/)
+ 
+跳转到[PaddlePaddle](https://github.com/PaddlePaddle/Paddle) GitHub首页，然后单击 `Fork` 按钮。
+
+## 克隆（Clone）
+
+Paddle 目前使用[git流分支模型](http://nvie.com/posts/a-successful-git-branching-model/)进行开发，测试，发行和维护。
+**develop** 是主分支，其他用户分支是特征分支（feature branches）。
+
+一旦你创建了一个fork，你可以使用你最喜欢的 git 客户端克隆你的仓库（repo）或只是直接在命令行输入：
+
+```shell
+# 克隆 fork 到本地
+git clone --branch develop https://github.com/USERNAME/Paddle.git
+```
+如果你的仓库不包含 **develop** 分支，你只需自己创建它。
+
+```shell
+git clone https://github.com/USERNAME/Paddle.git Paddle
+cd Paddle
+git checkout -b develop  # 创建 develop 分支
+git remote add upstream https://github.com/PaddlePaddle/Paddle.git  # 添加 upstream 到 baidu/Paddle
+git pull upstream develop  # 更新 upstream
+git submodule update --init --recursive
+```
+
+然后你可以通过做一个本地开发分支开始开发
+
+```shell
+git checkout -b MY_COOL_STUFF_BRANCH
+```
+
+## 使用 `pre-commit` 钩子
+
+Paddle 开发人员使用 [pre-commit](http://pre-commit.com/) 工具来管理 git 预提交钩子。它可以帮助我们格式化源代码（cpp，python），
+并在提交前检查一些基本事宜（每个文件只有一个 EOL，git 中不要添加大文件）。`pre-commit` 测试是 Travis-CI 中单元测试的一部分，
+不满足钩子要求的 PR 不能合入 Paddle。
+
+你可以通过 `pip install pre-commit` 安装 [pre-commit](http://pre-commit.com/)，
+目前 Paddle 使用 `clang-format` 来调整C/C++源代码格式。请确保 clang-format 版本在3.8以上。
+
+然后只需在 Paddle clone 目录中运行 `pre-commit install` 。当你
+提交你的代码时，pre-commit 钩子会检查本地代码是否存在
+不适合提交的东西，等等。
+
+## 提交（Commit）
+
+提交你的代码：
+
+```shell
+# 显示工作树状态
+git status
+# 添加修改过的文件
+git add xx
+env EDITOR=vim git commit  # 你可以用 vim/nano/emacs 写下你的注释
+```
+提交信息的第一行是标题，其他行可以添加一些细节（如果有必要的话）。
+
+## 保持 Fork 状态最新
+
+在发起 pull request 之前，你应该先与最新的 PaddlePaddle 代码同步。
+为此，你需要首先添加远程（remote）：
+
+```shell
+# 观察当前远程仓库配置
+git remote -v
+# 添加上游（upstream）仓库
+git remote add upstream https://github.com/PaddlePaddle/Paddle.git
+# 验证新的 upstream
+git remote -v
+```
+
+用最新的 upstream 更新你的 fork：
+
+```shell
+git pull --rebase upstream develop
+```
+如果本地没有提交，git 将简单地执行快进。但是，如果你一直在做一些改变（绝大多数情况下不应该），你可能要处理冲突。
+
+现在，你的本地主分支与上游修改的一致并是最新的。
+
+## 推送（Push）到 GitHub
+
+```shell
+# 在 GitHub 上 push 你的仓库
+git push -u origin MY_COOL_STUFF_BRANCH  # 创建远程分支 MY_COOL_STUFF_BRANCH 到 origin.
+```
+
+## 拉取请求（Pull Request）
+
+转到 GitHub上 你 fork 的页面，选择你的开发分支并单击 **pull request 按钮**。
+
+## 使用最新版本更新你的 pull 请求
+
+在代码审查（code review）期间，由于 baidu/Paddle 中新的提交导致你的 pull 请求可能会失效。如果没有冲突，GitHub允许自动更新。 你可以点击 pull request 页面中的“更新分支（Update Branch）”按钮。 但是如果存在代码冲突，你需要手动进行更新。你需要在本地仓库执行如下命令：
+
+```shell
+git checkout MY_COOL_STUFF_BRANCH
+git pull upstream develop
+# 你可能需要根据git提示解决冲突
+# 创建并测试你的代码
+git push origin MY_COOL_STUFF_BRANCH
+```
+现在你的 Pull Request 是最新的了。
+
+## 修改你的 pull request
+
+当根据审阅者的意见修改 pull 请求时，请使用“git commit”而不是“git commit --amend”来提交更改，以便审阅者可以看到新的请求和旧的请求之间的区别。
+
+可能的命令是
+
+```shell
+git checkout MY_COOL_STUFF_BRANCH
+git pull upstream develop   # 将本地更新到最新的代码库
+# 可能会发生一些冲突
+# 开始开发吧！
+env EDITOR=vim git commit  # 添加修改日志
+git push origin MY_COOL_STUFF_BRANCH
+```
--- a/doc/howto/contribute_to_paddle_en.md
+++ b/doc/howto/contribute_to_paddle_en.md
-# How to Contribute Code
+# Contribute Code

 We sincerely appreciate your contributions. You can use fork and pull request
-workflow to merge your code. 
- 
+workflow to merge your code.
+
 ## Code Requirements
 - Your code must be fully documented by
  [doxygen](http://www.stack.nl/~dimitri/doxygen/) style.
@@ -12,11 +12,11 @@ workflow to merge your code.
 - Pass all unit tests.

 The following tutorial guides you into submitting your contibution.
- 
+
 ## [Creating a Fork](https://help.github.com/articles/fork-a-repo/)
- 
+
 Just head over to the GitHub page and click the "Fork" button.
-It's just that simple. 
+It's just that simple.

 ## Clone

@@ -25,7 +25,7 @@ The **develop** is the main branch, and other user's branches are feature branch

 Once you've created a fork, you can use your favorite git client to clone your
 repo or just head straight to the command line:
- 
+
 ```shell
 # Clone your fork to your local machine
 git clone --branch develop https://github.com/USERNAME/Paddle.git
@@ -47,6 +47,22 @@ Then you can start to develop by making a local developement branch
 git checkout -b MY_COOL_STUFF_BRANCH
 ```

+## Using `pre-commit` hook
+
+Paddle developers use the [pre-commit](http://pre-commit.com/) tool to manage git
+pre-commit hooks. It helps us format source code (cpp, python) and check some
+basic things before each commit (only one EOL for each file, no huge files added
+to git). The `pre-commit` tests are now part of the unit tests in Travis-CI, so any
+PR that does not pass the hooks cannot be merged into Paddle.
+
+To use [pre-commit](http://pre-commit.com/), install it with
+`pip install pre-commit`. Currently, Paddle uses `clang-format` to format
+C/C++ sources, so please make sure clang-format 3.8+ is installed.
+
+Then just run `pre-commit install` in your Paddle clone directory. When you
+commit your code, the pre-commit hooks will check whether the local changes
+contain anything that is not suitable to commit.
+
 ## Commit

 Commit your changes by following command lines:
@@ -83,7 +99,7 @@ git pull --rebase upstream develop

 If there are no unique commits locally, git will simply perform a fast-forward.
 However, if you have been making changes (in the vast majority of cases you
-probably shouldn't be), you may have to deal with conflicts. 
+probably shouldn't be), you may have to deal with conflicts.

 Now, your local master branch is up-to-date with everything modified upstream.


--- a/doc/howto/new_layer/index_en.rst
+++ b/doc/howto/new_layer/index_en.rst
-=======================
-How to Write New Layers
-=======================
+================
+Write New Layers
+================

 This tutorial will guide you to write customized layers in PaddlePaddle. We will utilize fully connected layer as an example to guide you through the following steps for writing a new layer.


--- a/doc/howto/write_docs/index_cn.rst
+++ b/doc/howto/write_docs/index_cn.rst
-###############################
-如何贡献/修改PaddlePaddle的文档
-###############################
+##################
+如何贡献/修改文档
+##################

 PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成，生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。

@@ -51,4 +51,4 @@ TBD


 ..	_cmake: https://cmake.org/
-..	_sphinx: http://www.sphinx-doc.org/en/1.4.8/
\ No newline at end of file
+..	_sphinx: http://www.sphinx-doc.org/en/1.4.8/
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
-HOW TO
-=======
+进阶指南
+========

-Usage
-------
+使用说明
+--------

 ..  toctree::
  :maxdepth: 1

-  concepts/use_concepts_cn.rst
-  cluster/k8s/paddle_on_k8s_cn.md
-  cluster/k8s/distributed_training_on_k8s_cn.md
+  usage/concepts/use_concepts_cn.rst
+  usage/cluster/k8s/k8s_cn.md
+  usage/cluster/k8s/k8s_distributed_cn.md

-Development
------------
+开发标准
+--------

 ..  toctree::
  :maxdepth: 1

-  write_docs/index_cn.rst
-  deep_model/index_cn.rst
+  dev/write_docs_cn.rst
+  dev/contribute_to_paddle_cn.md

-Optimization
-------------
+模型配置
+--------

 ..  toctree::
  :maxdepth: 1
+
+  deep_model/rnn/index_cn.rst
+
+性能优化
+--------
+
+..  toctree::
+  :maxdepth: 1
+
+  optimization/gpu_profiling_cn.rst
--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -7,9 +7,8 @@ Usage
 ..  toctree::
  :maxdepth: 1

-  cmd_parameter/index_en.md
-  deep_model/index_en.rst
-  cluster/cluster_train_en.md
+  usage/cmd_parameter/index_en.md
+  usage/cluster/cluster_train_en.md

 Development
 ------------
@@ -17,8 +16,16 @@ Development
 ..  toctree::
  :maxdepth: 1

-  new_layer/index_en.rst
-  contribute_to_paddle_en.md
+  dev/new_layer_en.rst
+  dev/contribute_to_paddle_en.md
+
+Configuration
+-------------
+
+..  toctree::
+  :maxdepth: 1
+
+  deep_model/rnn/index_en.rst

 Optimization
 -------------
@@ -26,4 +33,4 @@ Optimization
 ..  toctree::
  :maxdepth: 1

-  optimization/index_en.rst
+  optimization/gpu_profiling_en.rst
--- a/doc/howto/optimization/gpu_profiling_cn.rst
+++ b/doc/howto/optimization/gpu_profiling_cn.rst
+==================
+GPU性能分析与调优
+==================
+
+..  contents::
+
+此教程将向您分步介绍如何使用内置的定时工具、 **nvprof** 或 **nvvp** 来运行性能分析和调优。
+
+- 什么是性能分析？
+- 为什么需要性能分析？
+- 如何进行性能分析？
+- 性能分析工具介绍
+- 详细教程
+- 性能分析小技巧
+
+什么是性能分析？
+================
+在软件工程的范畴里，性能分析（Profiling）是一个动态程序分析的术语，它可以指测量一个程序的空间（内存）复杂度或时间复杂度，
+也可以说是某些特定指令的使用情况，或者是函数调用的频率和耗时等。通常情况下，分析得到的信息用于协助进行程序的优化。
+
+简单来说，性能分析工具是用于给应用程序的性能做定量分析的。如果想很好的理解程序的行为，那程序分析工具是必不可少的利器。简单的性能分析，可以告诉您某个操作到底花了多长时间？而更深入的分析，甚至能解释为什么某个操作花了很长时间？
+
+为什么需要性能分析？
+============================
+训练好一个深层神经网络通常要耗费非常长的时间，所以性能也就逐步变成了深度学习领域最重要的指标。
+而优化性能的首要任务，是需要了解哪些步骤拖慢了整体。
+如果某一块根本就不怎么耗时，那也就不需要急着优化性能啦！
+
+如何进行性能分析？
+========================
+为了达到性能最优，您可以采用下面五个步骤：
+
+- 对代码进行性能分析
+- 找到运行慢的部分
+- 找到运行慢的原因
+- 修改成更快的版本
+- 再次对代码进行性能分析
+
+Usually, a processor has two key performance limits: floating-point throughput and
+memory throughput. A GPU also needs a high degree of parallelism to fulfill its potential.
+This is why they can be so fast.
+
+通常情况下，处理器有两个关键性能限制：一个是浮点计算量，另一个是内存操作量。
+GPU则还需要高并行性，才能发挥其全部能力。这正是它们速度快的原因。
+
+性能分析工具介绍
+======================
+就通常的GPU性能分析来说，市面上已经有NVIDIA或第三方提供的众多工具。
+
+**nvprof** 是Nvidia性能分析工具， **nvvp** 则是带GUI的Nvidia可视化性能分析工具。
+在这个教程中，我们主要会介绍nvprof和nvvp。
+
+:code:`test_GpuProfiler` from :code:`paddle/math/tests` directory will be used to evaluate
+above profilers.
+
+:code:`paddle/math/tests` 目录中的 :code:`test_GpuProfiler` 就是用于展示上述分析工具的用法。
+
+.. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
+   :language: c++
+   :lines: 137-151
+   :linenos:
+
+上述的代码片段包含了两种方法，您可以任意使用一个或两个来对感兴趣的代码段做性能分析。
+
+1. :code:`REGISTER_TIMER_INFO` 是一个内置的定时器封装，可以用来计算CPU函数或cuda内核的时间消耗。
+
+2. :code:`REGISTER_GPU_PROFILER` is a general purpose wrapper object of :code:`cudaProfilerStart` and :code:`cudaProfilerStop` to avoid
+program crashes when CPU version of PaddlePaddle invokes them.
+
+3. :code:`REGISTER_GPU_PROFILER` 是一个封装对象，封装了 :code:`cudaProfilerStart` 和 :code:`cudaProfilerStop` 两个操作；同时其内部实现可以避免纯CPU版本PaddlePaddle在执行本语句时发生崩溃。
+
+您会在接下来的部分中获得更多的细节介绍。
+
+详细教程
+============
+
+内置定时器
+------------
+
+如果想要启用PaddlePaddle的内置定时器，您首先需要在相关代码段中加入 :code:`REGISTER_TIMER_INFO`。
+接下来就可以使用 :code:`printStatus` 或者 :code:`printAllStatus` 函数来将信息输出到界面中。
+下面举个简单的例子：
+
+1. 加入 :code:`REGISTER_TIMER_INFO` 和 :code:`printAllStatus` 函数（如高亮部分）。
+
+    .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
+        :language: c++
+        :lines: 137-151
+        :emphasize-lines: 8-12,14
+        :linenos:
+
+2. cmake配置中将 **WITH_TIMER** 打开，重新编译PaddlePaddle。
+
+    .. code-block:: bash
+
+        cmake .. -DWITH_TIMER=ON
+        make
+
+3. 执行您的代码，并观察结果(如高亮部分）。
+
+    .. code-block:: bash
+        :emphasize-lines: 1,12-15
+
+        > ./paddle/math/tests/test_GpuProfiler
+        I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler
+        I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions
+        I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.
+        [==========] Running 1 test from 1 test case.
+        [----------] Global test environment set-up.
+        [----------] 1 test from Profiler
+        [ RUN      ] Profiler.BilinearFwdBwd
+        I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im
+        gSizeX = 64, imgSizeY = 64"
+        I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751
+        I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ======
+        I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd     total=136.141    avg=136.141    max=136.141    min=136.141   count=1
+        I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ======
+        I1117 11:13:42.981575 2522362816 Stat.cpp:154] --------------------------------------------------
+        [       OK ] Profiler.BilinearFwdBwd (136 ms)
+        [----------] 1 test from Profiler (136 ms total)
+
+        [----------] Global test environment tear-down
+        [==========] 1 test from 1 test case ran. (136 ms total)
+        [  PASSED  ] 1 test.
+
+nvprof 工具
+----------------
+
+要使用命令行分析工具 **nvprof**，您按如下步骤操作即可：
+
+1. 将 :code:`REGISTER_GPU_PROFILER` 函数加到代码中（参考强调部分）。
+
+    .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
+        :language: c++
+        :lines: 137-151
+        :emphasize-lines: 6-7
+        :linenos:
+
+2. cmake中将 **WITH_PROFILER** 配置打开，重新编译PaddlePaddle。
+
+    .. code-block:: bash
+
+        cmake .. -DWITH_PROFILER=ON
+        make
+
+3. 使用 **nvprof** 来分析执行文件。
+
+    .. code-block:: bash
+
+        nvprof  ./paddle/math/tests/test_GpuProfiler
+
+然后，您就能获得如下的分析结果：
+
+.. code-block:: bash
+
+    ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler
+    ==78544== Profiling result:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    27.60%  9.6305ms         5  1.9261ms  3.4560us  6.4035ms  [CUDA memcpy HtoD]
+    26.07%  9.0957ms         1  9.0957ms  9.0957ms  9.0957ms  KeBilinearInterpBw
+    23.78%  8.2977ms         1  8.2977ms  8.2977ms  8.2977ms  KeBilinearInterpFw
+    22.55%  7.8661ms         2  3.9330ms  1.5798ms  6.2863ms  [CUDA memcpy DtoH]
+
+    ==78544== API calls:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    46.85%  682.28ms         8  85.285ms  12.639us  682.03ms  cudaStreamCreateWithFlags
+    39.83%  580.00ms         4  145.00ms     302ns  550.27ms  cudaFree
+    9.82%   143.03ms         9  15.892ms  8.7090us  142.78ms  cudaStreamCreate
+    1.23%   17.983ms         7  2.5690ms  23.210us  6.4563ms  cudaMemcpy
+    1.23%   17.849ms         2  8.9247ms  8.4726ms  9.3768ms  cudaStreamSynchronize
+    0.66%   9.5969ms         7  1.3710ms  288.43us  2.4279ms  cudaHostAlloc
+    0.13%   1.9530ms        11  177.54us  7.6810us  591.06us  cudaMalloc
+    0.07%   1.0424ms         8  130.30us  1.6970us  453.72us  cudaGetDevice
+    0.04%   527.90us        40  13.197us     525ns  253.99us  cudaEventCreateWithFlags
+    0.03%   435.73us       348  1.2520us     124ns  42.704us  cuDeviceGetAttribute
+    0.03%   419.36us         1  419.36us  419.36us  419.36us  cudaGetDeviceCount
+    0.02%   260.75us         2  130.38us  129.32us  131.43us  cudaGetDeviceProperties
+    0.02%   222.32us         2  111.16us  106.94us  115.39us  cudaLaunch
+    0.01%   214.06us         4  53.514us  28.586us  77.655us  cuDeviceGetName
+    0.01%   115.45us         4  28.861us  9.8250us  44.526us  cuDeviceTotalMem
+    0.01%   83.988us         4  20.997us     578ns  77.760us  cudaSetDevice
+    0.00%   38.918us         1  38.918us  38.918us  38.918us  cudaEventCreate
+    0.00%   34.573us        31  1.1150us     279ns  12.784us  cudaDeviceGetAttribute
+    0.00%   17.767us         1  17.767us  17.767us  17.767us  cudaProfilerStart
+    0.00%   15.228us         2  7.6140us  3.5460us  11.682us  cudaConfigureCall
+    0.00%   14.536us         2  7.2680us  1.1490us  13.387us  cudaGetLastError
+    0.00%   8.6080us        26     331ns     173ns     783ns  cudaSetupArgument
+    0.00%   5.5470us         6     924ns     215ns  2.6780us  cuDeviceGet
+    0.00%   5.4090us         6     901ns     328ns  3.3320us  cuDeviceGetCount
+    0.00%   4.1770us         3  1.3920us  1.0630us  1.8300us  cuDriverGetVersion
+    0.00%   3.4650us         3  1.1550us  1.0810us  1.2680us  cuInit
+    0.00%      830ns         1     830ns     830ns     830ns  cudaRuntimeGetVersion
+
+
+nvvp 工具
+--------------
+
+如果想使用可视化的分析器 **nvvp**，您可以导入 :code:`nvprof -o ...` 的输出，或者从工具的界面里运行您的应用。
+
+**备注: nvvp 也支持CPU的性能分析** (需在nvvp界面中选上才能开启）
+
+..  image:: nvvp1.png
+    :align: center
+    :scale: 33%
+
+从内核函数的角度， **nvvp** 可以精确说明一个长耗时操作的具体原因。
+同时，如下图所示， **nvvp** 的内核block使用情况、寄存器使用情况和共享内存使用情况能让我们对GPU的整体使用有更好的理解。
+
+
+..  image:: nvvp2.png
+    :align: center
+    :scale: 33%
+
+而从应用的角度， **nvvp** 可以帮您提供一些定位性能瓶颈的建议。
+例如，下图中就展示了一些关于内存数据迁徙和计算资源利用率的建议，为您做性能调优提供了方向。
+
+..  image:: nvvp3.png
+    :align: center
+    :scale: 33%
+
+..  image:: nvvp4.png
+    :align: center
+    :scale: 33%
+
+性能分析小技巧
+==================
+
+- 开始阶段，从 **nvprof** 和 **nvvp** 的输出信息入手是个不错的选择。
+- 接下来可以考虑下时间线的分析。
+- 如果真想挖掘内核深处的某个秘密，您最好先确认：这一块的耗时比例真的太高，值得深入分析。
+- 可能的情况下，试着让输出的分析数据和理论值对应。
+
+    1) 例如，如果我知道内核花了10ms来移动1GB数据，那我会期望分析工具统计到速度是100GB/s。
+    2) 若有不一致之处，很有可能实际应用就是没有按照您的预期情况运行。
+- 了解您的硬件：如果您的GPU理论可以达到6 TFLOPs（6万亿次浮点运算每秒），而当前已经有5.5 TFLOPs了，那估计这里的潜力就没啥好挖的了……
+
+性能分析是性能优化的关键一步。有的时候简简单单的改变就能在性能上产生明显的优化效果！
+当然，具体情况因人而异。
+
+参考资料
+===========
+Jeremy Appleyard, `GPU Profiling for Deep Learning <http://www.robots.ox.ac.uk/~seminars/seminars/Extra/2015_10_08_JeremyAppleyard.pdf>`_, 2015
--- a/doc/howto/optimization/gpu_profiling_en.rst
+++ b/doc/howto/optimization/gpu_profiling_en.rst
-Profiling on PaddlePaddle
-=========================
+====================
+Tune GPU Performance 
+====================
+
+..  contents::

 This tutorial will guide you step-by-step through how to conduct profiling and performance tuning using built-in timer, **nvprof** and **nvvp**.

@@ -49,11 +52,11 @@ For general GPU profiling, a bunch of tools are provided from both NVIDIA and th
 In this tutorial, we will focus on nvprof and nvvp.

 :code:`test_GpuProfiler` from :code:`paddle/math/tests` directory will be used to evaluate
-above profilers. 
+above profilers.

 .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
   :language: c++
-   :lines: 111-124
+   :lines: 137-151
   :linenos:

 The above code snippet includes two methods, you can use any of them to profile the regions of interest.
@@ -79,8 +82,8 @@ As a simple example, consider the following:

    .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
        :language: c++
-        :lines: 111-124
-        :emphasize-lines: 8-10,13
+        :lines: 137-151
+        :emphasize-lines: 8-12,14
        :linenos:

 2. Configure cmake with **WITH_TIMER** and recompile PaddlePaddle.
@@ -90,31 +93,31 @@ As a simple example, consider the following:
        cmake .. -DWITH_TIMER=ON
        make

-3. Execute your code and observe the results (see the emphasize-lines). 
+3. Execute your code and observe the results (see the emphasize-lines).

    .. code-block:: bash
        :emphasize-lines: 1,12-15

-        > ./paddle/math/tests/test_GpuProfiler                                                                             
-        I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler                                             
-        I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions                                                                      
-        I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.                                                                   
-        [==========] Running 1 test from 1 test case.                                                                                                
-        [----------] Global test environment set-up.                                                                                                 
-        [----------] 1 test from Profiler                                                                                                            
-        [ RUN      ] Profiler.BilinearFwdBwd                                                                                                         
+        > ./paddle/math/tests/test_GpuProfiler
+        I1117 11:13:42.313065 2522362816 Util.cpp:155] commandline: ./paddle/math/tests/test_GpuProfiler
+        I1117 11:13:42.845065 2522362816 Util.cpp:130] Calling runInitFunctions
+        I1117 11:13:42.845208 2522362816 Util.cpp:143] Call runInitFunctions done.
+        [==========] Running 1 test from 1 test case.
+        [----------] Global test environment set-up.
+        [----------] 1 test from Profiler
+        [ RUN      ] Profiler.BilinearFwdBwd
        I1117 11:13:42.845310 2522362816 test_GpuProfiler.cpp:114] Enable GPU Profiler Stat: [testBilinearFwdBwd] "numSamples = 10, channels = 16, im
-        gSizeX = 64, imgSizeY = 64"                                                                                                                  
-        I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751                                           
-        I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ======                                               
-        I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd     total=136.141    avg=136.141    max=136.141    min=136.141   count=1                                                                                                                                  
-        I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ======                                                          
-        I1117 11:13:42.981575 2522362816 Stat.cpp:154] --------------------------------------------------                                            
-        [       OK ] Profiler.BilinearFwdBwd (136 ms)                                                                                                
-        [----------] 1 test from Profiler (136 ms total)                                                                                             
-                                                                                                                                                    
-        [----------] Global test environment tear-down                                                                                               
-        [==========] 1 test from 1 test case ran. (136 ms total)                                                                                     
+        gSizeX = 64, imgSizeY = 64"
+        I1117 11:13:42.850154 2522362816 ThreadLocal.cpp:37] thread use undeterministic rand seed:20659751
+        I1117 11:13:42.981501 2522362816 Stat.cpp:130] ======= StatSet: [GlobalStatInfo] status ======
+        I1117 11:13:42.981539 2522362816 Stat.cpp:133] Stat=testBilinearFwdBwd     total=136.141    avg=136.141    max=136.141    min=136.141   count=1
+        I1117 11:13:42.981572 2522362816 Stat.cpp:141] ======= BarrierStatSet status ======
+        I1117 11:13:42.981575 2522362816 Stat.cpp:154] --------------------------------------------------
+        [       OK ] Profiler.BilinearFwdBwd (136 ms)
+        [----------] 1 test from Profiler (136 ms total)
+
+        [----------] Global test environment tear-down
+        [==========] 1 test from 1 test case ran. (136 ms total)
        [  PASSED  ] 1 test.

 nvprof profiler
@@ -126,7 +129,7 @@ To use this command line profiler **nvprof**, you can simply issue the following

    .. literalinclude:: ../../../paddle/math/tests/test_GpuProfiler.cpp
        :language: c++
-        :lines: 111-124
+        :lines: 137-151
        :emphasize-lines: 6-7
        :linenos:

@@ -147,42 +150,42 @@ Then, you can get the following profiling result:

 .. code-block:: bash

-    ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler                                                                                                      
-    ==78544== Profiling result:                                                                                                                                                
-    Time(%)     Time     Calls       Avg       Min       Max  Name                                                                                                            
-    27.60%  9.6305ms         5  1.9261ms  3.4560us  6.4035ms  [CUDA memcpy HtoD]                                                                                              
-    26.07%  9.0957ms         1  9.0957ms  9.0957ms  9.0957ms  KeBilinearInterpBw                                                                                            
-    23.78%  8.2977ms         1  8.2977ms  8.2977ms  8.2977ms  KeBilinearInterpFw                                                                                        
-    22.55%  7.8661ms         2  3.9330ms  1.5798ms  6.2863ms  [CUDA memcpy DtoH]                                                                                              
-                                                                                                                                                                            
-    ==78544== API calls:                                                                                                                                                       
-    Time(%)     Time     Calls       Avg       Min       Max  Name                                                                                                            
-    46.85%  682.28ms         8  85.285ms  12.639us  682.03ms  cudaStreamCreateWithFlags                                                                                       
-    39.83%  580.00ms         4  145.00ms     302ns  550.27ms  cudaFree                                                                                                        
-    9.82%   143.03ms         9  15.892ms  8.7090us  142.78ms  cudaStreamCreate                                                                                                
-    1.23%   17.983ms         7  2.5690ms  23.210us  6.4563ms  cudaMemcpy                                                                                                      
-    1.23%   17.849ms         2  8.9247ms  8.4726ms  9.3768ms  cudaStreamSynchronize                                                                                           
-    0.66%   9.5969ms         7  1.3710ms  288.43us  2.4279ms  cudaHostAlloc                                                                                                   
-    0.13%   1.9530ms        11  177.54us  7.6810us  591.06us  cudaMalloc                                                                                                      
-    0.07%   1.0424ms         8  130.30us  1.6970us  453.72us  cudaGetDevice                                                                                                   
-    0.04%   527.90us        40  13.197us     525ns  253.99us  cudaEventCreateWithFlags                                                                                        
-    0.03%   435.73us       348  1.2520us     124ns  42.704us  cuDeviceGetAttribute                                                                                            
-    0.03%   419.36us         1  419.36us  419.36us  419.36us  cudaGetDeviceCount                                                                                              
-    0.02%   260.75us         2  130.38us  129.32us  131.43us  cudaGetDeviceProperties                                                                                         
-    0.02%   222.32us         2  111.16us  106.94us  115.39us  cudaLaunch                                                                                                      
-    0.01%   214.06us         4  53.514us  28.586us  77.655us  cuDeviceGetName                                                                                                 
-    0.01%   115.45us         4  28.861us  9.8250us  44.526us  cuDeviceTotalMem                                                                                                
-    0.01%   83.988us         4  20.997us     578ns  77.760us  cudaSetDevice                                                                                                   
-    0.00%   38.918us         1  38.918us  38.918us  38.918us  cudaEventCreate                                                                                                 
-    0.00%   34.573us        31  1.1150us     279ns  12.784us  cudaDeviceGetAttribute                                                                                          
-    0.00%   17.767us         1  17.767us  17.767us  17.767us  cudaProfilerStart                                                                                               
-    0.00%   15.228us         2  7.6140us  3.5460us  11.682us  cudaConfigureCall                                                                                               
-    0.00%   14.536us         2  7.2680us  1.1490us  13.387us  cudaGetLastError                                                                                                
-    0.00%   8.6080us        26     331ns     173ns     783ns  cudaSetupArgument                                                                                               
-    0.00%   5.5470us         6     924ns     215ns  2.6780us  cuDeviceGet                                                                                                     
-    0.00%   5.4090us         6     901ns     328ns  3.3320us  cuDeviceGetCount                                                                                                
-    0.00%   4.1770us         3  1.3920us  1.0630us  1.8300us  cuDriverGetVersion                                                                                              
-    0.00%   3.4650us         3  1.1550us  1.0810us  1.2680us  cuInit                                                                                                          
+    ==78544== Profiling application: ./paddle/math/tests/test_GpuProfiler
+    ==78544== Profiling result:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    27.60%  9.6305ms         5  1.9261ms  3.4560us  6.4035ms  [CUDA memcpy HtoD]
+    26.07%  9.0957ms         1  9.0957ms  9.0957ms  9.0957ms  KeBilinearInterpBw
+    23.78%  8.2977ms         1  8.2977ms  8.2977ms  8.2977ms  KeBilinearInterpFw
+    22.55%  7.8661ms         2  3.9330ms  1.5798ms  6.2863ms  [CUDA memcpy DtoH]
+
+    ==78544== API calls:
+    Time(%)     Time     Calls       Avg       Min       Max  Name
+    46.85%  682.28ms         8  85.285ms  12.639us  682.03ms  cudaStreamCreateWithFlags
+    39.83%  580.00ms         4  145.00ms     302ns  550.27ms  cudaFree
+    9.82%   143.03ms         9  15.892ms  8.7090us  142.78ms  cudaStreamCreate
+    1.23%   17.983ms         7  2.5690ms  23.210us  6.4563ms  cudaMemcpy
+    1.23%   17.849ms         2  8.9247ms  8.4726ms  9.3768ms  cudaStreamSynchronize
+    0.66%   9.5969ms         7  1.3710ms  288.43us  2.4279ms  cudaHostAlloc
+    0.13%   1.9530ms        11  177.54us  7.6810us  591.06us  cudaMalloc
+    0.07%   1.0424ms         8  130.30us  1.6970us  453.72us  cudaGetDevice
+    0.04%   527.90us        40  13.197us     525ns  253.99us  cudaEventCreateWithFlags
+    0.03%   435.73us       348  1.2520us     124ns  42.704us  cuDeviceGetAttribute
+    0.03%   419.36us         1  419.36us  419.36us  419.36us  cudaGetDeviceCount
+    0.02%   260.75us         2  130.38us  129.32us  131.43us  cudaGetDeviceProperties
+    0.02%   222.32us         2  111.16us  106.94us  115.39us  cudaLaunch
+    0.01%   214.06us         4  53.514us  28.586us  77.655us  cuDeviceGetName
+    0.01%   115.45us         4  28.861us  9.8250us  44.526us  cuDeviceTotalMem
+    0.01%   83.988us         4  20.997us     578ns  77.760us  cudaSetDevice
+    0.00%   38.918us         1  38.918us  38.918us  38.918us  cudaEventCreate
+    0.00%   34.573us        31  1.1150us     279ns  12.784us  cudaDeviceGetAttribute
+    0.00%   17.767us         1  17.767us  17.767us  17.767us  cudaProfilerStart
+    0.00%   15.228us         2  7.6140us  3.5460us  11.682us  cudaConfigureCall
+    0.00%   14.536us         2  7.2680us  1.1490us  13.387us  cudaGetLastError
+    0.00%   8.6080us        26     331ns     173ns     783ns  cudaSetupArgument
+    0.00%   5.5470us         6     924ns     215ns  2.6780us  cuDeviceGet
+    0.00%   5.4090us         6     901ns     328ns  3.3320us  cuDeviceGetCount
+    0.00%   4.1770us         3  1.3920us  1.0630us  1.8300us  cuDriverGetVersion
+    0.00%   3.4650us         3  1.1550us  1.0810us  1.2680us  cuInit
    0.00%      830ns         1     830ns     830ns     830ns  cudaRuntimeGetVersion



--- a/doc/howto/optimization/index_en.rst
+++ b/doc/howto/optimization/index_en.rst
-How to Tune GPU Performance
-===========================
-
-.. toctree::
-  :maxdepth: 3
-
-  gpu_profiling_en.rst
--- a/doc/howto/cluster/cluster_train_en.md
+++ b/doc/howto/cluster/cluster_train_en.md
-# How to Run Distributed Training
+# Run Distributed Training

 In this article, we explain how to run distributed Paddle training jobs on clusters.  We will create the distributed version of the single-process training example, [recommendation](https://github.com/baidu/Paddle/tree/develop/demo/recommendation).


--- a/doc/howto/cluster/k8s/Dockerfile
+++ b/doc/howto/cluster/k8s/Dockerfile
--- a/doc/howto/cluster/k8s/job.yaml
+++ b/doc/howto/cluster/k8s/job.yaml
--- a/doc/howto/cluster/k8s/k8s-paddle-arch.png
+++ b/doc/howto/cluster/k8s/k8s-paddle-arch.png
--- a/doc/howto/cluster/k8s/paddle_on_k8s_cn.md
+++ b/doc/howto/cluster/k8s/paddle_on_k8s_cn.md
-# Paddle On Kubernetes：单机训练
+# Kubernetes 单机训练

 在这篇文档里，我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的Paddle训练作业。在下一篇中，我们将介绍如何启动分布式训练作业。


--- a/doc/howto/cluster/k8s/distributed_training_on_k8s_cn.md
+++ b/doc/howto/cluster/k8s/distributed_training_on_k8s_cn.md
-
-# PaddlePaddle on Kubernetes：分布式训练
+# Kubernetes 分布式训练

 前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里，我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练，文章 [Cluster Training](https://github.com/baidu/Paddle/blob/develop/doc/cluster/opensource/cluster_train.md)介绍了一种通过SSH远程分发任务，进行分布式训练的方法，与此不同的是，本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群，进行分布式训练的方案。


--- a/doc/howto/cluster/k8s/start.sh
+++ b/doc/howto/cluster/k8s/start.sh
--- a/doc/howto/cluster/k8s/start_paddle.py
+++ b/doc/howto/cluster/k8s/start_paddle.py
--- a/doc/howto/cmd_parameter/arguments_en.md
+++ b/doc/howto/cmd_parameter/arguments_en.md
--- a/doc/howto/cmd_parameter/detail_introduction_en.md
+++ b/doc/howto/cmd_parameter/detail_introduction_en.md
--- a/doc/howto/cmd_parameter/index_en.md
+++ b/doc/howto/cmd_parameter/index_en.md
 ```eval_rst
 ..  _cmd_line_index:
 ```
-# How to Set Command-line Parameters
+# Set Command-line Parameters

 * [Use Case](use_case_en.md)
 * [Arguments](arguments_en.md)

--- a/doc/howto/cmd_parameter/use_case_en.md
+++ b/doc/howto/cmd_parameter/use_case_en.md
--- a/doc/howto/concepts/src/pserver_topology.dot
+++ b/doc/howto/concepts/src/pserver_topology.dot
--- a/doc/howto/concepts/src/trainer_config.py
+++ b/doc/howto/concepts/src/trainer_config.py
--- a/doc/howto/concepts/use_concepts_cn.rst
+++ b/doc/howto/concepts/use_concepts_cn.rst
-#########################
-PaddlePaddle 基本使用概念
-#########################
+############
+基本使用概念
+############

 PaddlePaddle是一个深度学习框架，支持单机模式和多机模式。


--- a/doc/conf.py.cn.in
+++ b/doc/conf.py.cn.in
--- a/doc/conf.py.en.in
+++ b/doc/conf.py.en.in
--- a/doc/tutorials/index_cn.md
+++ b/doc/tutorials/index_cn.md
-# TUTORIALS
-There are several examples and demos here.
+# 完整教程

-## Quick Start
+## 快速入门

-* [Quick Start](quick_start/index_cn.rst)
+使用商品评论分类任务，系统性地介绍如何一步步改进，最终得到产品级的深度模型。

-## Image
+* [阅读教程](quick_start/index_cn.rst)
+
+## 图像

 * TBD

-## NLP
+## 自然语言处理

-* [Sentiment Analysis](sentiment_analysis/index_cn.md)
-* [Semantic Role Labeling](semantic_role_labeling/index_cn.rst)
+* [情感分类](sentiment_analysis/index_cn.md)
+* [语义角色标注](semantic_role_labeling/index_cn.md)

-## Recommendation
+## 个性化推荐

 * TBD

-## Model Zoo
+## 常用模型

 * TBD
--- a/doc/tutorials/index_en.md
+++ b/doc/tutorials/index_en.md
@@ -17,7 +17,6 @@ There are several examples and demos here.

 ## Recommendation

-* [MovieLens Dataset](rec/ml_dataset_en.md)
 * [MovieLens Regression](rec/ml_regression_en.rst)

 ## Model Zoo

--- a/doc/tutorials/quick_start/index_cn.rst
+++ b/doc/tutorials/quick_start/index_cn.rst
-PaddlePaddle快速入门教程
-========================
+=============
+快速入门教程
+=============

 我们将以 `文本分类问题 <https://en.wikipedia.org/wiki/Document_classification>`_ 为例,
 介绍PaddlePaddle的基本使用方法。

--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -17,22 +17,18 @@ add_library(paddle_api STATIC
        ${API_SOURCES})
 add_dependencies(paddle_api gen_proto_cpp)

+list(LENGTH "${GFLAGS_LIBRARIES}" GFLAGS_LIBRARIES_LENGTH)

-if(WITH_GFLAGS)
-  list(LENGTH "${GFLAGS_LIBRARIES}" GFLAGS_LIBRARIES_LENGTH)
-
-  if(${GFLAGS_LIBRARIES_LENGTH} EQUAL 0 AND TARGET "${GFLAGS_LIBRARIES}")
-    # Because gflags compiled by cmake, so it is imported by cmake target,
-    # not a real library path. Get the real library path here.
-    message(STATUS "GFLAGS Libraries is ${GFLAGS_LIBRARIES}")
-    get_target_property(GFLAGS_LOCATION ${GFLAGS_LIBRARIES} LOCATION)
-    message(STATUS "GFLAGS Target location is ${GFLAGS_LOCATION}")
-  else()
-    set(GFLAGS_LOCATION ${GFLAGS_LIBRARIES})
-  endif()
+if(${GFLAGS_LIBRARIES_LENGTH} EQUAL 0 AND TARGET "${GFLAGS_LIBRARIES}")
+# When gflags is built by CMake, GFLAGS_LIBRARIES is an imported CMake target
+# rather than a real library path, so resolve the real library path here.
+message(STATUS "GFLAGS Libraries is ${GFLAGS_LIBRARIES}")
+get_target_property(GFLAGS_LOCATION ${GFLAGS_LIBRARIES} LOCATION)
+message(STATUS "GFLAGS Target location is ${GFLAGS_LOCATION}")
+else()
+set(GFLAGS_LOCATION ${GFLAGS_LIBRARIES})
 endif()

-
 configure_file(
    paddle_api_config.py.in
    ${PROJ_ROOT}/paddle/api/paddle_api_config.py
@@ -57,7 +53,7 @@ add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/dist/.timestamp
            paddle_trainer
            paddle_api
            paddle_cuda
-	    ${PY_PADDLE_PYTHON_FILES}
+        ${PY_PADDLE_PYTHON_FILES}
 )

 install(DIRECTORY ${PROJ_ROOT}/paddle/dist/

--- a/paddle/api/Trainer.cpp
+++ b/paddle/api/Trainer.cpp
@@ -27,9 +27,9 @@ limitations under the License. */

 using paddle::real;

-P_DECLARE_string(config);
-P_DECLARE_string(init_model_path);
-P_DECLARE_int32(start_pass);
+DECLARE_string(config);
+DECLARE_string(init_model_path);
+DECLARE_int32(start_pass);

 struct TrainerPrivate : public paddle::Trainer {
  bool _trainOneBatch(size_t batchSize);

--- a/paddle/api/paddle_api_config.py.in
+++ b/paddle/api/paddle_api_config.py.in
@@ -8,9 +8,7 @@ CMAKE_DL_LIBS="@CMAKE_DL_LIBS@"

 WITH_PYTHON="@WITH_PYTHON@"
 PYTHON_LIBRARIES="@PYTHON_LIBRARIES@"
-WITH_GLOG="@WITH_GLOG@"
 LIBGLOG_LIBRARY="@LIBGLOG_LIBRARY@"
-WITH_GFLAGS="@WITH_GFLAGS@"
 GFLAGS_LIBRARIES="@GFLAGS_LIBRARIES@"
 GFLAGS_LOCATION="@GFLAGS_LOCATION@"
 CBLAS_LIBRARIES="@CBLAS_LIBS@"

--- a/paddle/api/paddle_ld_flags.py
+++ b/paddle/api/paddle_ld_flags.py
@@ -47,10 +47,8 @@ try:
            self.with_python = PaddleLDFlag.cmake_bool(WITH_PYTHON)
            self.python_libs = PYTHON_LIBRARIES

-            self.with_glog = PaddleLDFlag.cmake_bool(WITH_GLOG)
            self.glog_libs = LIBGLOG_LIBRARY

-            self.with_gflags = PaddleLDFlag.cmake_bool(WITH_GFLAGS)
            self.with_coverage = PaddleLDFlag.cmake_bool(WITH_COVERALLS)
            self.gflags_libs = GFLAGS_LIBRARIES
            self.gflags_location = GFLAGS_LOCATION
@@ -88,6 +86,8 @@ try:
                "-lpaddle_cuda",
                "-lpaddle_api",
                self.normalize_flag(self.protolib),
+                self.normalize_flag(self.glog_libs),
+                self.normalize_flag(self.gflags_libs),
                self.normalize_flag(self.zlib),
                self.normalize_flag(self.thread),
                self.normalize_flag(self.dl_libs),
@@ -96,10 +96,6 @@ try:

            if self.with_python:
                libs.append(self.normalize_flag(self.python_libs))
-            if self.with_glog:
-                libs.append(self.normalize_flag(self.glog_libs))
-            if self.with_gflags:
-                libs.append(self.normalize_flag(self.gflags_libs))
            if self.with_gpu:
                libs.append(self.normalize_flag(self.curt))
            if self.with_coverage:

--- a/paddle/cuda/src/hl_cuda_cudnn.cc
+++ b/paddle/cuda/src/hl_cuda_cudnn.cc
@@ -21,10 +21,10 @@ limitations under the License. */
 #include "paddle/utils/CommandLineParser.h"
 #include "paddle/utils/Logging.h"

-P_DEFINE_int32(cudnn_conv_workspace_limit_in_mb,
-               4096,
-               "Specify cuDNN max workspace limit, in units MB, "
-               "4096MB=4GB by default.");
+DEFINE_int32(cudnn_conv_workspace_limit_in_mb,
+             4096,
+             "Specify cuDNN max workspace limit, in units MB, "
+             "4096MB=4GB by default.");

 namespace dynload {


--- a/paddle/cuda/src/hl_cuda_device.cc
+++ b/paddle/cuda/src/hl_cuda_device.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 // clang-format off
-// Because clang-format 4.X and clang-format 3.8+ format 
+// Because clang-format 4.X and clang-format 3.8+ format
 // following lines in different. So disable clang-format.
 #include "hl_cuda.h"
 #include <cuda_profiler_api.h>
@@ -22,6 +22,7 @@ limitations under the License. */
 #include <sys/time.h>
 #include <unistd.h>
 #include <mutex>
+#include "hl_cuda.h"
 #include "hl_cuda.ph"
 #include "hl_dso_loader.h"
 #include "hl_thread.ph"

--- a/paddle/cuda/src/hl_dso_loader.cc
+++ b/paddle/cuda/src/hl_dso_loader.cc
@@ -16,21 +16,21 @@ limitations under the License. */
 #include "paddle/utils/CommandLineParser.h"
 #include "paddle/utils/Logging.h"

-P_DEFINE_string(cudnn_dir,
-                "",
-                "Specify path for loading libcudnn.so. For instance, "
-                "/usr/local/cudnn/lib. If empty [default], dlopen "
-                "will search cudnn from LD_LIBRARY_PATH");
-
-P_DEFINE_string(cuda_dir,
-                "",
-                "Specify path for loading cuda library, such as libcublas, "
-                "libcurand. For instance, /usr/local/cuda/lib64. (Note: "
-                "libcudart can not be specified by cuda_dir, since some "
-                "build-in function in cudart already ran before main entry). "
-                "If default, dlopen will search cuda from LD_LIBRARY_PATH");
-
-P_DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
+DEFINE_string(cudnn_dir,
+              "",
+              "Specify path for loading libcudnn.so. For instance, "
+              "/usr/local/cudnn/lib. If empty [default], dlopen "
+              "will search cudnn from LD_LIBRARY_PATH");
+
+DEFINE_string(cuda_dir,
+              "",
+              "Specify path for loading cuda library, such as libcublas, "
+              "libcurand. For instance, /usr/local/cuda/lib64. (Note: "
+              "libcudart can not be specified by cuda_dir, since some "
+              "build-in function in cudart already ran before main entry). "
+              "If default, dlopen will search cuda from LD_LIBRARY_PATH");
+
+DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");

 static inline std::string join(const std::string& part1,
                               const std::string& part2) {

--- a/paddle/gserver/dataproviders/ProtoDataProvider.cpp
+++ b/paddle/gserver/dataproviders/ProtoDataProvider.cpp
@@ -22,9 +22,9 @@ limitations under the License. */
 #include "DataProviderGroup.h"
 #include "paddle/utils/Logging.h"

-P_DEFINE_double(memory_threshold_on_load_data,
-                1.0,
-                "stop loading data when memory is not sufficient");
+DEFINE_double(memory_threshold_on_load_data,
+              1.0,
+              "stop loading data when memory is not sufficient");

 namespace paddle {


--- a/paddle/gserver/evaluators/Evaluator.cpp
+++ b/paddle/gserver/evaluators/Evaluator.cpp
@@ -17,7 +17,7 @@ limitations under the License. */

 #include "paddle/gserver/gradientmachines/NeuralNetwork.h"

-P_DECLARE_int32(trainer_id);
+DECLARE_int32(trainer_id);

 namespace paddle {


--- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp
@@ -21,11 +21,11 @@ limitations under the License. */
 #include "NeuralNetwork.h"
 #include "ParallelNeuralNetwork.h"

-P_DEFINE_bool(allow_only_one_model_on_one_gpu,
-              true,
-              "If true, do not allow multiple models on one GPU device");
+DEFINE_bool(allow_only_one_model_on_one_gpu,
+            true,
+            "If true, do not allow multiple models on one GPU device");
 #ifdef PADDLE_METRIC_LEARNING
-P_DECLARE_bool(external);
+DECLARE_bool(external);
 #endif

 namespace paddle {

--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp
@@ -24,7 +24,7 @@ limitations under the License. */
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"

-P_DEFINE_string(diy_beam_search_prob_so, "", "the diy beam search cost so");
+DEFINE_string(diy_beam_search_prob_so, "", "the diy beam search cost so");

 static const char* DIY_CALC_PROB_SYMBOL_NAME = "calc_prob";
 static const char* DIY_START_CALC_PROB_SYMBOL_NAME = "start_calc_prob";

--- a/paddle/gserver/layers/DataLayer.cpp
+++ b/paddle/gserver/layers/DataLayer.cpp
@@ -54,7 +54,7 @@ void DataLayer::copyDataToOutput(Argument& output) {
    output.setFrameWidth(config_.width());
  } else {
    output.setFrameHeight(data_.getFrameHeight());
-    output.setFrameHeight(data_.getFrameHeight());
+    output.setFrameWidth(data_.getFrameWidth());
  }
  output.cpuSequenceDims = data_.cpuSequenceDims;
  output.sequenceStartPositions = data_.sequenceStartPositions;

--- a/paddle/gserver/layers/Layer.cpp
+++ b/paddle/gserver/layers/Layer.cpp
@@ -33,7 +33,7 @@ limitations under the License. */
 #include "TransLayer.h"
 #include "ValidationLayer.h"

-P_DEFINE_bool(log_error_clipping, false, "enable log error clipping or not");
+DEFINE_bool(log_error_clipping, false, "enable log error clipping or not");

 namespace paddle {


--- a/paddle/gserver/layers/LstmLayer.cpp
+++ b/paddle/gserver/layers/LstmLayer.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/math/Matrix.h"
 #include "paddle/utils/Stat.h"

-P_DECLARE_bool(prev_batch_state);
+DECLARE_bool(prev_batch_state);

 namespace paddle {


--- a/paddle/gserver/layers/RecurrentLayer.cpp
+++ b/paddle/gserver/layers/RecurrentLayer.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/utils/CommandLineParser.h"
 #include "paddle/utils/Stat.h"

-P_DEFINE_bool(rnn_use_batch, false, "Using the batch method for calculation.");
+DEFINE_bool(rnn_use_batch, false, "Using the batch method for calculation.");

 namespace paddle {


--- a/paddle/gserver/layers/ValidationLayer.h
+++ b/paddle/gserver/layers/ValidationLayer.h
@@ -18,7 +18,7 @@ limitations under the License. */
 #include "Layer.h"
 #include "paddle/gserver/evaluators/Evaluator.h"

-P_DECLARE_int32(trainer_id);
+DECLARE_int32(trainer_id);

 namespace paddle {


--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
@@ -14,7 +14,7 @@ limitations under the License. */

 #include "LayerGradUtil.h"

-P_DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(thread_local_rand_use_global_seed);

 namespace paddle {
 real getCostSum(LayerPtr& testLayer, MatrixPtr weights) {

--- a/paddle/gserver/tests/TestUtil.cpp
+++ b/paddle/gserver/tests/TestUtil.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/math/SparseMatrix.h"
 #include "paddle/utils/CommandLineParser.h"

-P_DEFINE_int32(fixed_seq_length, 0, "Produce some sequence of fixed length");
+DEFINE_int32(fixed_seq_length, 0, "Produce some sequence of fixed length");

 namespace paddle {


--- a/paddle/gserver/tests/test_ActivationGrad.cpp
+++ b/paddle/gserver/tests/test_ActivationGrad.cpp
@@ -25,8 +25,8 @@ limitations under the License. */
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT

-P_DECLARE_bool(use_gpu);
-P_DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(use_gpu);
+DECLARE_bool(thread_local_rand_use_global_seed);

 void testActivation(const string& act) {
  LOG(INFO) << "test activation: " << act;

--- a/paddle/gserver/tests/test_BatchNorm.cpp
+++ b/paddle/gserver/tests/test_BatchNorm.cpp
@@ -27,11 +27,11 @@ limitations under the License. */
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT

-P_DECLARE_bool(use_gpu);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_double(checkgrad_eps);
-P_DECLARE_bool(thread_local_rand_use_global_seed);
-P_DECLARE_bool(prev_batch_state);
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(prev_batch_state);

 // Test that the batchNormLayer can be followed by a ConvLayer
 TEST(Layer, batchNorm) {

--- a/paddle/gserver/tests/test_ConvTrans.cpp
+++ b/paddle/gserver/tests/test_ConvTrans.cpp
@@ -28,11 +28,11 @@ limitations under the License. */
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT

-P_DECLARE_bool(use_gpu);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_double(checkgrad_eps);
-P_DECLARE_bool(thread_local_rand_use_global_seed);
-P_DECLARE_bool(prev_batch_state);
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(prev_batch_state);

 // Test that the convTrans forward is the same as conv backward
 TEST(Layer, convTransLayerFwd) {

--- a/paddle/gserver/tests/test_ConvUnify.cpp
+++ b/paddle/gserver/tests/test_ConvUnify.cpp
@@ -28,11 +28,11 @@ limitations under the License. */
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT

-P_DECLARE_bool(use_gpu);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_double(checkgrad_eps);
-P_DECLARE_bool(thread_local_rand_use_global_seed);
-P_DECLARE_bool(prev_batch_state);
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(prev_batch_state);

 // Do one forward pass of convTrans layer and check to see if its output
 // matches the given result

--- a/paddle/gserver/tests/test_Evaluator.cpp
+++ b/paddle/gserver/tests/test_Evaluator.cpp
@@ -21,9 +21,9 @@ limitations under the License. */
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT

-P_DECLARE_bool(use_gpu);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);

 enum InputType {
  INPUT_DATA,         // dense vector

--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -26,11 +26,11 @@ limitations under the License. */
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT

-P_DECLARE_bool(use_gpu);
-P_DECLARE_int32(gpu_id);
-P_DECLARE_double(checkgrad_eps);
-P_DECLARE_bool(thread_local_rand_use_global_seed);
-P_DECLARE_bool(prev_batch_state);
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(prev_batch_state);

 TEST(Operator, dot_mul) {
  TestConfig config;

--- a/paddle/gserver/tests/test_NetworkCompare.cpp
+++ b/paddle/gserver/tests/test_NetworkCompare.cpp
@@ -25,10 +25,10 @@ limitations under the License. */
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT

-P_DECLARE_int32(gpu_id);
-P_DECLARE_double(checkgrad_eps);
-P_DEFINE_bool(use_label, true, "input label or sequence label");
-P_DEFINE_bool(static_para, false, "static parameter");
+DECLARE_int32(gpu_id);
+DECLARE_double(checkgrad_eps);
+DEFINE_bool(use_label, true, "input label or sequence label");
+DEFINE_bool(static_para, false, "static parameter");

 struct DataIn {
  std::vector<Argument> inArgs;
@@ -267,8 +267,8 @@ TEST(Compare, img_conv2) {
 }
 #endif

-P_DEFINE_string(config_file_a, "", "config of one network to compare");
-P_DEFINE_string(config_file_b, "", "config of another network to compare");
+DEFINE_string(config_file_a, "", "config of one network to compare");
+DEFINE_string(config_file_b, "", "config of another network to compare");
 TEST(Compare, network) {
  if (FLAGS_config_file_a != "" && FLAGS_config_file_b != "") {
    compareNetwork(FLAGS_config_file_a, FLAGS_config_file_b);

--- a/paddle/gserver/tests/test_PyDataProvider2.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider2.cpp
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Util.h"

-P_DEFINE_string(train_list, "unittest.list", "file list for unittest");
+DEFINE_string(train_list, "unittest.list", "file list for unittest");

 namespace paddle {
 namespace unittest {

--- a/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
+++ b/paddle/gserver/tests/test_RecurrentGradientMachine.cpp
@@ -20,7 +20,7 @@ limitations under the License. */
 #include <paddle/utils/Util.h>
 #include <paddle/utils/Version.h>

-P_DECLARE_int32(seed);
+DECLARE_int32(seed);

 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT

--- a/paddle/gserver/tests/test_RecurrentLayer.cpp
+++ b/paddle/gserver/tests/test_RecurrentLayer.cpp
@@ -23,9 +23,9 @@ limitations under the License. */

 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
-P_DECLARE_bool(use_gpu);
-P_DECLARE_bool(rnn_use_batch);
-P_DECLARE_int32(fixed_seq_length);
+DECLARE_bool(use_gpu);
+DECLARE_bool(rnn_use_batch);
+DECLARE_int32(fixed_seq_length);

 void checkError(const Matrix& matrix1, const Matrix& matrix2) {
  CHECK(matrix1.getHeight() == matrix2.getHeight());

--- a/paddle/gserver/tests/test_SelectiveFCLayer.cpp
+++ b/paddle/gserver/tests/test_SelectiveFCLayer.cpp
@@ -29,11 +29,11 @@ limitations under the License. */
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT

-P_DECLARE_bool(use_gpu);
-P_DECLARE_int32(num_passes);
-P_DECLARE_string(config);
-P_DECLARE_string(init_model_path);
-P_DECLARE_string(config_args);
+DECLARE_bool(use_gpu);
+DECLARE_int32(num_passes);
+DECLARE_string(config);
+DECLARE_string(init_model_path);
+DECLARE_string(config_args);

 size_t fcLayerWidth = 1024;


--- a/paddle/gserver/tests/test_WarpCTCLayer.cpp
+++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp
@@ -25,7 +25,7 @@ limitations under the License. */
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT

-P_DECLARE_bool(use_gpu);
+DECLARE_bool(use_gpu);

 const real* getData(const Matrix& matrix) {
  if (matrix.useGpu()) {

--- a/paddle/math/SparseRowMatrix.cpp
+++ b/paddle/math/SparseRowMatrix.cpp
@@ -24,9 +24,9 @@ limitations under the License. */
 #include "paddle/utils/Thread.h"
 #include "paddle/utils/Util.h"

-P_DEFINE_bool(allow_inefficient_sparse_update,
-              false,
-              "Whether to allow inefficient sparse update");
+DEFINE_bool(allow_inefficient_sparse_update,
+            false,
+            "Whether to allow inefficient sparse update");

 namespace paddle {


--- a/paddle/math/SparseRowMatrix.h
+++ b/paddle/math/SparseRowMatrix.h
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/utils/CommandLineParser.h"
 #include "paddle/utils/Util.h"

-P_DECLARE_bool(allow_inefficient_sparse_update);
+DECLARE_bool(allow_inefficient_sparse_update);

 namespace paddle {


--- a/paddle/math/Storage.cpp
+++ b/paddle/math/Storage.cpp
@@ -16,9 +16,9 @@ limitations under the License. */
 #include "Allocator.h"
 #include "paddle/utils/Util.h"

-P_DEFINE_int32(pool_limit_size,
-               536870912,
-               "maximum memory size managed by a memory pool, default is 512M");
+DEFINE_int32(pool_limit_size,
+             536870912,
+             "maximum memory size managed by a memory pool, default is 512M");

 namespace paddle {


--- a/paddle/math/tests/test_TrainingAlgorithm.cpp
+++ b/paddle/math/tests/test_TrainingAlgorithm.cpp
@@ -22,9 +22,9 @@ limitations under the License. */
 using namespace paddle;  // NOLINT

 #ifndef PADDLE_TYPE_DOUBLE
-P_DEFINE_double(max_diff, 1e-5, "max diff allowed");
+DEFINE_double(max_diff, 1e-5, "max diff allowed");
 #else
-P_DEFINE_double(max_diff, 1e-13, "max diff allowed");
+DEFINE_double(max_diff, 1e-13, "max diff allowed");
 #endif

 class SetMaxDiff {

--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -245,6 +245,8 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src,
                                    bool useGpu,
                                    hl_stream_t stream) {
  dataId = src.dataId;
+  frameWidth = src.frameWidth;
+  frameHeight = src.frameHeight;

  if (!src.sequenceStartPositions) {
    // non-sequence input, copy samples directly

--- a/paddle/parameter/FirstOrderOptimizer.cpp
+++ b/paddle/parameter/FirstOrderOptimizer.cpp
@@ -19,7 +19,7 @@ limitations under the License. */

 #include <cmath>

-P_DEFINE_bool(log_clipping, false, "enable log clipping or not");
+DEFINE_bool(log_clipping, false, "enable log clipping or not");

 namespace paddle {


--- a/paddle/parameter/Parameter.cpp
+++ b/paddle/parameter/Parameter.cpp
@@ -26,11 +26,11 @@ limitations under the License. */
 #include "paddle/utils/CommandLineParser.h"
 #include "paddle/utils/Logging.h"

-P_DEFINE_int32(enable_grad_share,
-               (100 * 1024 * 1024),
-               "threshold for enable gradient parameter share for batch "
-               "multi-cpu training");
-P_DEFINE_int32(
+DEFINE_int32(enable_grad_share,
+             (100 * 1024 * 1024),
+             "threshold for enable gradient parameter share for batch "
+             "multi-cpu training");
+DEFINE_int32(
    grad_share_block_num,
    64,
    "block number of gradient parameter share for batch multi-cpu training");

--- a/paddle/pserver/BaseClient.cpp
+++ b/paddle/pserver/BaseClient.cpp
@@ -18,7 +18,7 @@ limitations under the License. */
 #include "paddle/utils/CommandLineParser.h"
 #include "paddle/utils/Stat.h"

-P_DECLARE_string(pservers);
+DECLARE_string(pservers);

 namespace paddle {


--- a/paddle/pserver/LightNetwork.cpp
+++ b/paddle/pserver/LightNetwork.cpp
@@ -31,23 +31,23 @@ limitations under the License. */
 #include "paddle/utils/Util.h"

 /// quick ack can reduce the latency of small message
-P_DEFINE_bool(small_messages,
-              false,
-              "if message size is small, recommend set it True to enable quick "
-              "ack and no delay");
+DEFINE_bool(small_messages,
+            false,
+            "if message size is small, recommend set it True to enable quick "
+            "ack and no delay");

 /// reasonable sock_send_buf_size can control the traffic injected into switch
 /// network. Injecting too many data into traffic could cause packets loss which
 /// cause long latency and degrade the efficiency of communication.
-P_DEFINE_int32(sock_send_buf_size,
-               1024 * 1024 * 40,
-               "restrict sock send buff size, can reduce network congestion if "
-               "set carefully");
+DEFINE_int32(sock_send_buf_size,
+             1024 * 1024 * 40,
+             "restrict sock send buff size, can reduce network congestion if "
+             "set carefully");

 /// reasonable size can hold bursted packets and reduce packets loss
-P_DEFINE_int32(sock_recv_buf_size,
-               1024 * 1024 * 40,
-               "restrict sock recv buff size");
+DEFINE_int32(sock_recv_buf_size,
+             1024 * 1024 * 40,
+             "restrict sock recv buff size");

 namespace paddle {


--- a/paddle/pserver/ParameterClient2.cpp
+++ b/paddle/pserver/ParameterClient2.cpp
@@ -20,8 +20,8 @@ limitations under the License. */
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/StringUtil.h"

-P_DEFINE_string(pservers, "127.0.0.1", "Comma separated addresses of pservers");
-P_DEFINE_int32(parallel_thread_num, 1, "Thread number for parameter send");
+DEFINE_string(pservers, "127.0.0.1", "Comma separated addresses of pservers");
+DEFINE_int32(parallel_thread_num, 1, "Thread number for parameter send");

 namespace paddle {


--- a/paddle/pserver/ParameterClient2.h
+++ b/paddle/pserver/ParameterClient2.h
@@ -34,7 +34,7 @@ limitations under the License. */
 #include "ProtoServer.h"
 #include "SparseParameterDistribution.h"

-P_DECLARE_int32(parallel_thread_num);
+DECLARE_int32(parallel_thread_num);

 namespace paddle {


--- a/paddle/pserver/ParameterServer2.cpp
+++ b/paddle/pserver/ParameterServer2.cpp
@@ -30,11 +30,11 @@ limitations under the License. */
 #include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/Stat.h"

-P_DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec");
-P_DEFINE_double(async_lagged_ratio_min,
-                1.0,
-                "control config_.async_lagged_grad_discard_ratio() min value");
-P_DEFINE_double(
+DEFINE_int32(pserver_num_threads, 1, "number of threads for sync op exec");
+DEFINE_double(async_lagged_ratio_min,
+              1.0,
+              "control config_.async_lagged_grad_discard_ratio() min value");
+DEFINE_double(
    async_lagged_ratio_default,
    1.5,
    "if async_lagged_grad_discard_ratio is not set in trainer_config.conf"

--- a/paddle/pserver/ParameterServer2.h
+++ b/paddle/pserver/ParameterServer2.h
@@ -38,7 +38,7 @@ limitations under the License. */

 #include "ProtoServer.h"

-P_DECLARE_int32(port);
+DECLARE_int32(port);

 namespace paddle {


--- a/paddle/pserver/SparseParameterDistribution.cpp
+++ b/paddle/pserver/SparseParameterDistribution.cpp
@@ -20,26 +20,26 @@ limitations under the License. */

 #include "SparseParameterDistribution.h"

-P_DEFINE_bool(check_sparse_distribution_in_pserver,
-              false,
-              "check whether sparse parameter exhibts balanced distribution at "
-              "all pservers");
-P_DEFINE_bool(show_check_sparse_distribution_log,
-              false,
-              "show logs details for sparse parameter distribution in pserver");
-P_DEFINE_int32(check_sparse_distribution_batches,
-               100,
-               "run sparse parameter distribution check for N batches");
-P_DEFINE_double(
+DEFINE_bool(check_sparse_distribution_in_pserver,
+            false,
+            "check whether sparse parameter exhibts balanced distribution at "
+            "all pservers");
+DEFINE_bool(show_check_sparse_distribution_log,
+            false,
+            "show logs details for sparse parameter distribution in pserver");
+DEFINE_int32(check_sparse_distribution_batches,
+             100,
+             "run sparse parameter distribution check for N batches");
+DEFINE_double(
    check_sparse_distribution_ratio,
    0.6,
    "if parameters dispatched to different pservers exhibit unbalanced "
    " distribution for check_sparse_distribution_ratio * "
    " check_sparse_distribution_batches times, crash program");
-P_DEFINE_double(check_sparse_distribution_unbalance_degree,
-                2.0,
-                "the ratio of maximum data size and minimun data size for "
-                "different pserver");
+DEFINE_double(check_sparse_distribution_unbalance_degree,
+              2.0,
+              "the ratio of maximum data size and minimun data size for "
+              "different pserver");

 namespace paddle {


--- a/paddle/pserver/test/SocketTest.cpp
+++ b/paddle/pserver/test/SocketTest.cpp
--- a/paddle/pserver/test/test_ParameterServer2.cpp
+++ b/paddle/pserver/test/test_ParameterServer2.cpp
--- a/paddle/pserver/test/test_ProtoServer.cpp
+++ b/paddle/pserver/test/test_ProtoServer.cpp
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
--- a/paddle/scripts/travis/precommit.sh
+++ b/paddle/scripts/travis/precommit.sh
--- a/paddle/trainer/MergeModel.cpp
+++ b/paddle/trainer/MergeModel.cpp
--- a/paddle/trainer/RemoteParameterUpdater.cpp
+++ b/paddle/trainer/RemoteParameterUpdater.cpp
--- a/paddle/trainer/ThreadParameterUpdater.cpp
+++ b/paddle/trainer/ThreadParameterUpdater.cpp
--- a/paddle/trainer/Trainer.cpp
+++ b/paddle/trainer/Trainer.cpp
--- a/paddle/trainer/Trainer.h
+++ b/paddle/trainer/Trainer.h
--- a/paddle/trainer/TrainerBenchmark.cpp
+++ b/paddle/trainer/TrainerBenchmark.cpp
--- a/paddle/trainer/TrainerConfigHelper.cpp
+++ b/paddle/trainer/TrainerConfigHelper.cpp
--- a/paddle/trainer/TrainerInternalConfig.cpp
+++ b/paddle/trainer/TrainerInternalConfig.cpp
--- a/paddle/trainer/TrainerMain.cpp
+++ b/paddle/trainer/TrainerMain.cpp
--- a/paddle/trainer/tests/test_Compare.cpp
+++ b/paddle/trainer/tests/test_Compare.cpp
--- a/paddle/trainer/tests/test_CompareSparse.cpp
+++ b/paddle/trainer/tests/test_CompareSparse.cpp
--- a/paddle/trainer/tests/test_CompareTwoNets.cpp
+++ b/paddle/trainer/tests/test_CompareTwoNets.cpp
--- a/paddle/trainer/tests/test_CompareTwoOpts.cpp
+++ b/paddle/trainer/tests/test_CompareTwoOpts.cpp
--- a/paddle/trainer/tests/test_Prediction.cpp
+++ b/paddle/trainer/tests/test_Prediction.cpp
--- a/paddle/trainer/tests/test_Trainer.cpp
+++ b/paddle/trainer/tests/test_Trainer.cpp
--- a/paddle/trainer/tests/test_TrainerOnePass.cpp
+++ b/paddle/trainer/tests/test_TrainerOnePass.cpp
--- a/paddle/trainer/tests/test_recurrent_machine_generation.cpp
+++ b/paddle/trainer/tests/test_recurrent_machine_generation.cpp
--- a/paddle/utils/BarrierStat.cpp
+++ b/paddle/utils/BarrierStat.cpp
--- a/paddle/utils/CommandLineParser.cpp
+++ b/paddle/utils/CommandLineParser.cpp
--- a/paddle/utils/CommandLineParser.h
+++ b/paddle/utils/CommandLineParser.h
--- a/paddle/utils/CustomStackTrace.cpp
+++ b/paddle/utils/CustomStackTrace.cpp
--- a/paddle/utils/Flags.cpp
+++ b/paddle/utils/Flags.cpp
--- a/paddle/utils/Flags.h
+++ b/paddle/utils/Flags.h
--- a/paddle/utils/Logging.cpp
+++ b/paddle/utils/Logging.cpp
--- a/paddle/utils/Logging.h
+++ b/paddle/utils/Logging.h
--- a/paddle/utils/PythonUtil.cpp
+++ b/paddle/utils/PythonUtil.cpp
--- a/paddle/utils/ThreadLocal.cpp
+++ b/paddle/utils/ThreadLocal.cpp
--- a/paddle/utils/Util.cpp
+++ b/paddle/utils/Util.cpp
--- a/paddle/utils/Version.cpp
+++ b/paddle/utils/Version.cpp
--- a/paddle/utils/tests/CMakeLists.txt
+++ b/paddle/utils/tests/CMakeLists.txt
--- a/paddle/utils/tests/test_CommandLineParser.cpp
+++ b/paddle/utils/tests/test_CommandLineParser.cpp
--- a/paddle/utils/tests/test_CustomStackTrace.cpp
+++ b/paddle/utils/tests/test_CustomStackTrace.cpp
--- a/paddle/utils/tests/test_Logging.cpp
+++ b/paddle/utils/tests/test_Logging.cpp
--- a/paddle/utils/tests/test_SpinLock.cpp
+++ b/paddle/utils/tests/test_SpinLock.cpp
--- a/paddle/utils/tests/test_ThreadBarrier.cpp
+++ b/paddle/utils/tests/test_ThreadBarrier.cpp
--- a/proto/ModelConfig.proto
+++ b/proto/ModelConfig.proto
--- a/third_party/gflags.BUILD
+++ b/third_party/gflags.BUILD
--- a/third_party/gflags_test/BUILD
+++ b/third_party/gflags_test/BUILD
--- a/third_party/gflags_test/gflags_test.cc
+++ b/third_party/gflags_test/gflags_test.cc
--- a/third_party/glog.BUILD
+++ b/third_party/glog.BUILD
--- a/third_party/glog_test/BUILD
+++ b/third_party/glog_test/BUILD
--- a/third_party/glog_test/glog_test.cc
+++ b/third_party/glog_test/glog_test.cc
--- a/third_party/gtest.BUILD
+++ b/third_party/gtest.BUILD
--- a/third_party/protobuf_test/BUILD
+++ b/third_party/protobuf_test/BUILD