diff --git a/RELEASE.md b/RELEASE.md
index f1588cb15bb8d1f79732d6b5195775cb13acef54..146f7afa7dfbc152500b82fde28445ae3155c16c 100644
--- a/RELEASE.md
+++ b/RELEASE.md
@@ -1,11 +1,103 @@
 # Release v0.10.0

+We are glad to announce version 0.10.0. In this version, we are happy to
+release the new
+[Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/).
+
+- Our old Python API is out of date. It is hard to learn and hard to use. To
+  write a PaddlePaddle program using the old API, we had to write at least two
+  Python files: one `data provider` and another one that defines the network
+  topology. Users start a PaddlePaddle job by running the `paddle_trainer` C++
+  program, which calls the Python interpreter to run the network topology
+  configuration script and then starts the training loop, which iteratively
+  calls the data provider function to load minibatches. This prevents us from
+  writing a Python program in a modern way, e.g., in a Jupyter Notebook.
+
+- The new API, which we often refer to as the *v2 API*, allows us to write
+  much shorter Python programs that define the network and the data in a
+  single .py file. Such a program can also run in a Jupyter Notebook, since
+  the entry point is a Python program and PaddlePaddle runs as a shared
+  library loaded and invoked by this Python program (see the sketch further
+  below).
+
+Based on the new API, we delivered an online interactive
+book, [Deep Learning 101](http://book.paddlepaddle.org/index.en.html)
+and [its Chinese version](http://book.paddlepaddle.org/).
+
+We also worked on updating our online documentation to describe the new API.
+This is ongoing work; we will release more documentation improvements in the
+next version.
+
+We also worked on bringing the new API to distributed model training (via MPI
+and Kubernetes). This work is ongoing; we will release more about it in the
+next version.
+
 ## New Features
+* Release the new [Python API](http://research.baidu.com/paddlepaddles-new-api-simplifies-deep-learning-programs/).
+* Deep Learning 101 book in [English](http://book.paddlepaddle.org/index.en.html) and [Chinese](http://book.paddlepaddle.org/).
+* Support rectangular input for CNNs.
+* Support stride pooling for seqlastin and seqfirstin.
+* Expose `seq_concat_layer/seq_reshape_layer` in `trainer_config_helpers`.
+* Add the dataset package: CIFAR, MNIST, IMDB, WMT14, CONLL05, movielens, imikolov.
+* Add a Priorbox layer for Single Shot Multibox Detection.
+* Add the smooth L1 cost.
+* Add a data reader creator and data reader decorators for the v2 API.
+* Add the CPU implementation of the cmrnorm projection.
+
 ## Improvements
+* Support Python virtualenv for `paddle_trainer`.
+* Add pre-commit hooks to automatically format our code.
+* Upgrade protobuf to version 3.x.
+* Add an option to check data types in the Python data provider.
+* Speed up the backward pass of the average layer on GPU.
+* Documentation refinement.
+* Check for dead links in documents using Travis-CI.
+* Add an example explaining `sparse_vector`.
+* Simplify the data processing flow for Quick Start.
+* Support CUDNN Deconv.
+* Add a data feeder to the v2 API.
+* Support predicting samples from `sys.stdin` in the sentiment demo.
+* Provide a multi-process interface for image preprocessing.
+* Add a benchmark document for the v1 API.
+* Add ReLU in `layer_math.py`.
+* Add packages for automatically downloading public datasets.
+* Rename `Argument::sumCost` to `Argument::sum`, since class `Argument` has nothing to do with cost.
+* Expose `Argument::sum` to Python.
+* Add a new `TensorExpression` implementation for matrix-related expression evaluations.
+* Add lazy assignment for optimizing the calculation of a batch of multiple expressions.
+* Add the abstract class `Function` and its implementations:
+  * `PadFunc` and `PadGradFunc`.
+  * `ContextProjectionForwardFunc` and `ContextProjectionBackwardFunc`.
+  * `CosSimBackward` and `CosSimBackwardFunc`.
+  * `CrossMapNormalFunc` and `CrossMapNormalGradFunc`.
+  * `MulFunc`.
+* Add the classes `AutoCompare` and `FunctionCompare`, which make it easier to write unit tests comparing the GPU and CPU versions of a function.
+* Generate `libpaddle_test_main.a` and remove the main function from test files.
+* Support dense numpy vectors in PyDataProvider2.
+* Clean up the code base, removing some copy-and-pasted code snippets:
+  * Extract a `RowBuffer` class for `SparseRowMatrix`.
+  * Clean the interface of `GradientMachine`.
+  * Use the `override` keyword in layers.
+  * Simplify `Evaluator::create`, using `ClassRegister` to create `Evaluator`s.
+* Check the MD5 checksum when downloading demo datasets.
+* Add `paddle::Error`, which is intended to replace `LOG(FATAL)` in Paddle.
+
 ## Bug Fixes
+* Check layer input types for `recurrent_group`.
+* Don't run `clang-format` on .cu source files.
+* Fix bugs in `LogActivation`.
+* Fix the bug triggered when running `test_layerHelpers` multiple times.
+* Fix the bug where the seq2seq demo exceeded the protobuf message size limit.
+* Fix a bug in the data provider converter in GPU mode.
+* Fix a bug in `GatedRecurrentLayer`.
+* Fix a `BatchNorm` bug when testing more than one model.
+* Fix the broken unit test of `paramRelu`.
+* Fix some compile-time warnings about `CpuSparseMatrix`.
+* Fix a `MultiGradientMachine` error when `trainer_count > batch_size`.
+* Fix bugs that prevented asynchronous data loading in `PyDataProvider2`.

 # Release v0.9.0
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index aebb5d9fcb186005607c4849b70ecb61de771deb..0918e6cc633e7067b8bd2d5c5e1622d4139d4d14 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -44,7 +44,6 @@ if(MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
   message(STATUS "Found MKL (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
   set(CBLAS_FOUND ON)
   if(${MKL_LAPACK_INC_DIR})
-    add_definitions(-DPADDLE_USE_LAPACK)
     message(STATUS "Found lapack in MKL (include: ${MKL_LAPACK_INC_DIR})")
   endif()
   return() # return file.
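The release notes above describe the v2 API only in prose; the sketch referenced there follows. It is a minimal single-file illustration in the spirit of the Deep Learning 101 fit-a-line example, not code from this patch: names such as `mse_cost`, `uci_housing`, and the `SGD` trainer signature are assumptions about the v2 API of this era and may differ slightly in 0.10.0.

```python
import paddle.v2 as paddle

paddle.init(use_gpu=False, trainer_count=1)

# The whole job lives in one .py file: no separate data-provider script
# and no paddle_trainer invocation.
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
cost = paddle.layer.mse_cost(input=y_predict, label=y)

parameters = paddle.parameters.create(cost)
trainer = paddle.trainer.SGD(
    cost=cost,
    parameters=parameters,
    update_equation=paddle.optimizer.Momentum(momentum=0))

# shuffle() and batch() are the reader decorators listed under New Features:
# they wrap a plain Python generator into a shuffled minibatch stream.
trainer.train(
    reader=paddle.batch(
        paddle.reader.shuffle(paddle.dataset.uci_housing.train(), buf_size=500),
        batch_size=2),
    num_passes=10)
```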
@@ -80,7 +79,6 @@ if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB AND NOT CBLAS_FOUND) message(STATUS "Found ATLAS (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") set(CBLAS_FOUND ON) if(ATLAS_CLAPACK_INC_DIR) - add_definitions(-DPADDLE_USE_LAPACK) set(CBLAS_INC_DIR ${CBLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR}) message(STATUS "Found lapack in ATLAS (include: ${ATLAS_CLAPACK_INC_DIR})") endif() @@ -115,7 +113,6 @@ if(OPENBLAS_INC_DIR AND OPENBLAS_LIB) message(STATUS "Found OpenBLAS (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})") set(CBLAS_FOUND ON) if(OPENBLAS_LAPACKE_INC_DIR) - add_definitions(-DPADDLE_USE_LAPACK) message(STATUS "Found lapack in OpenBLAS (include: ${OPENBLAS_LAPACKE_INC_DIR})") endif() return() diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 4ddadb9fa3b2c3993d1938fcbf1b823e66db99f2..97b6768decbf27c62af98542a5633eda1c544f29 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -24,45 +24,17 @@ IF(NOT ${CBLAS_FOUND}) SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/${LIBRARY_PREFIX}openblas${STATIC_LIBRARY_SUFFIX}" CACHE FILEPATH "openblas library." FORCE) - # check fortran compiler and library + SET(COMMON_ARGS CC=${CMAKE_C_COMPILER} NO_LAPACK=1 NO_SHARED=1) + IF(ANDROID) SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5") - SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 ARM_SOFTFP_ABI=1 NOFORTRAN=1 USE_THREAD=0 libs) + SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0 libs) ELSEIF(RPI) SET(OPENBLAS_COMMIT "v0.2.19") - SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 NOFORTRAN=1 USE_THREAD=0 libs) + SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER} TARGET=ARMV7 USE_THREAD=0 libs) ELSE() - IF(CMAKE_COMPILER_IS_GNUCC) - ENABLE_LANGUAGE(Fortran) - if (NOT CMAKE_Fortran_COMPILER_VERSION) - # cmake < 3.4 cannot get CMAKE_Fortran_COMPILER_VERSION directly. - execute_process(COMMAND ${CMAKE_Fortran_COMPILER} -dumpversion - OUTPUT_VARIABLE CMAKE_Fortran_COMPILER_VERSION) - endif() - string(REGEX MATCHALL "[0-9]+" Fortran_VERSION ${CMAKE_Fortran_COMPILER_VERSION}) - list(GET Fortran_VERSION 0 Fortran_MAJOR) - list(GET Fortran_VERSION 1 Fortran_MINOR) - find_library(GFORTRAN_LIBRARY NAMES gfortran PATHS - /lib - /usr/lib - /usr/lib/gcc/x86_64-linux-gnu/${Fortran_MAJOR}.${Fortran_MINOR}/ - /usr/lib/gcc/x86_64-linux-gnu/${Fortran_MAJOR}/) - if (NOT GFORTRAN_LIBRARY) - message(FATAL_ERROR "Cannot found gfortran library which it is used by openblas") - endif() - find_package(Threads REQUIRED) - LIST(APPEND CBLAS_LIBRARIES ${GFORTRAN_LIBRARY} ${CMAKE_THREAD_LIBS_INIT}) - ENDIF(CMAKE_COMPILER_IS_GNUCC) - - IF(NOT CMAKE_Fortran_COMPILER) - MESSAGE(FATAL_ERROR "To build lapack in libopenblas, " - "you need to set gfortran compiler: cmake .. 
-DCMAKE_Fortran_COMPILER=...") - ENDIF(NOT CMAKE_Fortran_COMPILER) - - ADD_DEFINITIONS(-DPADDLE_USE_LAPACK) - SET(OPENBLAS_COMMIT "v0.2.19") - SET(OPENBLAS_ARGS FC=${CMAKE_Fortran_COMPILER} DYNAMIC_ARCH=1 libs netlib) + SET(OPENBLAS_ARGS DYNAMIC_ARCH=1 libs) ENDIF() ExternalProject_Add( @@ -73,7 +45,7 @@ IF(NOT ${CBLAS_FOUND}) PREFIX ${CBLAS_SOURCES_DIR} INSTALL_DIR ${CBLAS_INSTALL_DIR} BUILD_IN_SOURCE 1 - BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} CC=${CMAKE_C_COMPILER} NO_SHARED=1 ${OPTIONAL_ARGS} + BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS} INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 PREFIX= UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/cmake/package.cmake b/cmake/package.cmake index 211593f358eb34cf1a5692697247511893dfeb93..ff49a2d08e8f6004320acfce266339aa301eb9c4 100644 --- a/cmake/package.cmake +++ b/cmake/package.cmake @@ -1,5 +1,4 @@ set(CPACK_PACKAGE_NAME paddle) -set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "") set(CPACK_PACKAGE_VERSION_MAJOR ${PADDLE_MAJOR_VERSION}) set(CPACK_PACKAGE_VERSION_MINOR ${PADDLE_MINOR_VERSION}) set(CPACK_PACKAGE_VERSION_PATCH ${PADDLE_PATCH_VERSION}) @@ -10,8 +9,9 @@ set(CPACK_DEBIAN_PACKAGE_ARCHITECTURE amd64) set(CPACK_DEBIAN_PACKAGE_MAINTAINER PaddlePaddle Dev ) set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "Paddle") set(CPACK_PACKAGE_DESCRIPTION "") -set(CPACK_DEBIAN_PACKAGE_DEPENDS "libatlas3-base, libgflags2, libgoogle-glog0, libprotobuf8, libpython2.7, libstdc++6, python-numpy, python-pip, python-pip-whl, python-protobuf") +set(CPACK_DEBIAN_PACKAGE_DEPENDS "libpython2.7-dev, libstdc++6, python-pip, curl, libgfortran3, python-pip-whl") set(CPACK_DEBIAN_PACKAGE_SECTION Devel) +set(CPACK_DEBIAN_PACKAGE_VERSION ${PADDLE_VERSION}) set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJ_ROOT}/paddle/scripts/deb/postinst") #set(CPACK_GENERATOR "DEB") # Start cpack diff --git a/demo/sentiment/trainer_config.py b/demo/sentiment/trainer_config.py index 2defecd178262900c03c1eda60b351dc44629d1f..f1cadaa728ac58107e15f77b5994d31da088caf7 100644 --- a/demo/sentiment/trainer_config.py +++ b/demo/sentiment/trainer_config.py @@ -29,7 +29,7 @@ settings( batch_size=128, learning_rate=2e-3, learning_method=AdamOptimizer(), - average_window=0.5, + model_average=ModelAverage(0.5), regularization=L2Regularization(8e-4), gradient_clipping_threshold=25) diff --git a/demo/seqToseq/seqToseq_net.py b/demo/seqToseq/seqToseq_net.py index e523a34d5a95120d1f0a583be8bbdbff5678d1ab..3d1f86ec3b7eda4fceaf3a1e406e3d0a1a4a2f60 100644 --- a/demo/seqToseq/seqToseq_net.py +++ b/demo/seqToseq/seqToseq_net.py @@ -69,7 +69,8 @@ def gru_encoder_decoder(data_conf, encoder_size=512, decoder_size=512, beam_size=3, - max_length=250): + max_length=250, + error_clipping=50): """ A wrapper for an attention version of GRU Encoder-Decoder network is_generating: whether this config is used for generating @@ -90,9 +91,19 @@ def gru_encoder_decoder(data_conf, input=src_word_id, size=word_vector_dim, param_attr=ParamAttr(name='_source_language_embedding')) - src_forward = simple_gru(input=src_embedding, size=encoder_size) + src_forward = simple_gru( + input=src_embedding, + size=encoder_size, + naive=True, + gru_layer_attr=ExtraLayerAttribute( + error_clipping_threshold=error_clipping)) src_backward = simple_gru( - input=src_embedding, size=encoder_size, reverse=True) + input=src_embedding, + size=encoder_size, + reverse=True, + naive=True, + gru_layer_attr=ExtraLayerAttribute( + error_clipping_threshold=error_clipping)) encoded_vector = concat_layer(input=[src_forward, src_backward]) 
with mixed_layer(size=decoder_size) as encoded_proj: @@ -117,11 +128,13 @@ def gru_encoder_decoder(data_conf, decoder_inputs += full_matrix_projection(input=context) decoder_inputs += full_matrix_projection(input=current_word) - gru_step = gru_step_layer( + gru_step = gru_step_naive_layer( name='gru_decoder', input=decoder_inputs, output_mem=decoder_mem, - size=decoder_size) + size=decoder_size, + layer_attr=ExtraLayerAttribute( + error_clipping_threshold=error_clipping)) with mixed_layer( size=target_dict_dim, bias_attr=True, diff --git a/doc/getstarted/index_cn.rst b/doc/getstarted/index_cn.rst index c6a4d3121c5857cd434acecb389d68f4d4c7a532..cadf092f8f42ca16bbeb23bd21e0d018af8e43cc 100644 --- a/doc/getstarted/index_cn.rst +++ b/doc/getstarted/index_cn.rst @@ -2,7 +2,8 @@ ============ .. toctree:: - :maxdepth: 2 + :maxdepth: 1 build_and_install/index_cn.rst - basic_usage/index_cn.rst + +- `深度学习入门课程 `_ diff --git a/doc/getstarted/index_en.rst b/doc/getstarted/index_en.rst index 55d95d8015e56ddae3363d19315db0fad841caad..9f771e93e8b63eb98e31ec12667bd1aa007af20e 100644 --- a/doc/getstarted/index_en.rst +++ b/doc/getstarted/index_en.rst @@ -2,7 +2,8 @@ GET STARTED ============ .. toctree:: - :maxdepth: 2 + :maxdepth: 1 build_and_install/index_en.rst - basic_usage/index_en.rst + +- `Deep Learning 101 `_ diff --git a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst index 4b328fc9d38bc5dfec35d5e0f0d46136aeeb41bc..79048e92482851af6c2dd7d055868ebcaa7a298b 100644 --- a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst +++ b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst @@ -19,18 +19,18 @@ 在 PaddlePaddle中,下面这些Layer能够接受双层序列作为输入,完成相应的计算。 -pooling_layer -============== +pooling +======== -pooling_layer 的使用示例如下,详细见 :ref:`api_trainer_config_helpers_layers_pooling_layer` 配置API。 +pooling 的使用示例如下,详细见 :ref:`api_v2.layer_pooling` 配置API。 .. code-block:: bash - seq_pool = pooling_layer(input=layer, - pooling_type=AvgPooling(), - agg_level=AggregateLevel.EACH_SEQUENCE) + seq_pool = pooling(input=layer, + pooling_type=pooling.Max(), + agg_level=AggregateLevel.EACH_SEQUENCE) -- `pooling_type` 目前支持两种,分别是:MaxPooling()和AvgPooling()。 +- `pooling_type` 目前支持两种,分别是:pooling.Max()和pooling.Avg()。 - `agg_level=AggregateLevel.EACH_TIMESTEP` 时(默认值): @@ -47,7 +47,7 @@ pooling_layer 的使用示例如下,详细见 :ref:`api_trainer_config_helpers last_seq 和 first_seq ===================== -last_seq 的使用示例如下( :ref:`api_trainer_config_helpers_layers_first_seq` 类似),详细见 :ref:`api_trainer_config_helpers_layers_last_seq` 配置API。 +last_seq 的使用示例如下( :ref:`api_v2.layer_first_seq` 类似),详细见 :ref:`api_v2.layer_last_seq` 配置API。 .. code-block:: bash @@ -65,16 +65,16 @@ last_seq 的使用示例如下( :ref:`api_trainer_config_helpers_layers_first_ - 输入:必须是一个双层序列 - 输出:一个单层序列,其中每个元素是双层序列中每个subseq最后一个(或第一个)元素。 -expand_layer -============ +expand +====== -expand_layer 的使用示例如下,详细见 :ref:`api_trainer_config_helpers_layers_expand_layer` 配置API。 +expand 的使用示例如下,详细见 :ref:`api_v2.layer_expand` 配置API。 .. 
code-block:: bash - expand = expand_layer(input=layer1, - expand_as=layer2, - expand_level=ExpandLevel.FROM_TIMESTEP) + ex = expand(input=layer1, + expand_as=layer2, + expand_level=ExpandLevel.FROM_TIMESTEP) - `expand_level=ExpandLevel.FROM_TIMESTEP` 时(默认值): diff --git a/doc/howto/deep_model/rnn/index_cn.rst b/doc/howto/deep_model/rnn/index_cn.rst index 9ecab5594cff47cde4700b7ce0f58013a960a16e..9e805ca85191b793c8798a239927a318c70b96f5 100644 --- a/doc/howto/deep_model/rnn/index_cn.rst +++ b/doc/howto/deep_model/rnn/index_cn.rst @@ -4,7 +4,6 @@ RNN相关模型 .. toctree:: :maxdepth: 1 - rnn_config_cn.rst recurrent_group_cn.md hierarchical_layer_cn.rst hrnn_rnn_api_compare_cn.rst diff --git a/doc/howto/deep_model/rnn/index_en.rst b/doc/howto/deep_model/rnn/index_en.rst index 7adc79873d699fdfd5a85034bcef964dd1f19132..13a153b05c578e0af82ee29db5ea27fd4b6d6f59 100644 --- a/doc/howto/deep_model/rnn/index_en.rst +++ b/doc/howto/deep_model/rnn/index_en.rst @@ -1,7 +1,2 @@ RNN Models ========== - -.. toctree:: - :maxdepth: 1 - - rnn_config_en.rst diff --git a/doc/index_cn.rst b/doc/index_cn.rst index 460fedb5658a8ea9bbe8b602ee2b5df66502fa62..9279bac7f4b2898c18979630a8d6dfcb2dba70e0 100644 --- a/doc/index_cn.rst +++ b/doc/index_cn.rst @@ -5,7 +5,6 @@ PaddlePaddle 文档 :maxdepth: 1 getstarted/index_cn.rst - tutorials/index_cn.md howto/index_cn.rst api/index_cn.rst faq/index_cn.rst diff --git a/doc/index_en.rst b/doc/index_en.rst index 1d9cca7de720ebc23fe816f32d158930d91c07e7..168c7667c61da677905585d6c4b5037ce80b3765 100644 --- a/doc/index_en.rst +++ b/doc/index_en.rst @@ -5,8 +5,6 @@ PaddlePaddle Documentation :maxdepth: 1 getstarted/index_en.rst - tutorials/index_en.md howto/index_en.rst api/index_en.rst about/index_en.rst - \ No newline at end of file diff --git a/doc_theme/templates/layout.html b/doc_theme/templates/layout.html index 034740369ed10a748856e2205d3315f51a7de62f..65e61c5f298e19adc6330c378779a6edf418752e 100644 --- a/doc_theme/templates/layout.html +++ b/doc_theme/templates/layout.html @@ -114,10 +114,7 @@
@@ -137,7 +134,7 @@ {{ toctree }} {% endblock %} - {% if toc %} + {% if False %} {% endif %}
@@ -168,7 +165,8 @@ VERSION:'{{ release|e }}', COLLAPSE_INDEX:false, FILE_SUFFIX:'{{ '' if no_search_suffix else file_suffix }}', - HAS_SOURCE: {{ has_source|lower }} + HAS_SOURCE: {{ has_source|lower }}, + SOURCELINK_SUFFIX: ".txt", }; {%- for scriptfile in script_files %} diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt index a28ccd6f07cfd56b7f1978f67fdcf6e7e5fe6337..f9061e96deb659dcf7bfb88b46e6509af0425199 100755 --- a/paddle/cuda/CMakeLists.txt +++ b/paddle/cuda/CMakeLists.txt @@ -21,16 +21,13 @@ set(CUDA_CXX_WITH_GPU_SOURCES if(WITH_GPU) set(CUDA_CXX_SOURCES - src/hl_dso_loader.cc src/hl_warpctc_wrap.cc ${CUDA_CXX_WITH_GPU_SOURCES}) set_source_files_properties(${CUDA_CXX_SOURCES} PROPERTIES COMPILE_FLAGS "-D__NVCC__") else() - set(CUDA_CXX_SOURCES - src/hl_dso_loader.cc - src/hl_warpctc_wrap.cc) + set(CUDA_CXX_SOURCES src/hl_warpctc_wrap.cc) endif() set(CUDA_CU_SOURCES @@ -47,7 +44,6 @@ set(CUDA_CU_SOURCES set(CUDA_HEADERS include/hl_time.h - include/hl_dso_loader.h include/hl_warpctc_wrap.h include/hl_sequence.h include/hl_cuda_cublas.h diff --git a/paddle/cuda/include/hl_activation_functions.h b/paddle/cuda/include/hl_activation_functions.h index cdb2dba06cb4123da4be2088e290c6a740e0375b..93957fd9644652c103d15873b732d0b9fa89330f 100644 --- a/paddle/cuda/include/hl_activation_functions.h +++ b/paddle/cuda/include/hl_activation_functions.h @@ -40,18 +40,18 @@ public: namespace gpu { static __device__ Active::forward forward[] = HPPL_ACTIVE_FUNCTION; static __device__ Active::backward backward[] = HPPL_ACTIVE_FUNCTION; -} +} // namespace gpu #else namespace cpu { static Active::forward forward[] = HPPL_ACTIVE_FUNCTION; static Active::backward backward[] = HPPL_ACTIVE_FUNCTION; -} +} // namespace cpu #ifdef __AVX__ namespace avx { static Active<__m256>::forward forward[] = HPPL_ACTIVE_FUNCTION; static Active<__m256>::backward backward[] = HPPL_ACTIVE_FUNCTION; -} +} // namespace avx #endif #endif diff --git a/paddle/cuda/include/hl_cnn.h b/paddle/cuda/include/hl_cnn.h index c5787630abbe105af64888692b1106bd21f4c1e8..f55197c8c9ebb4a0f67ab915abfefd6a45cd13aa 100644 --- a/paddle/cuda/include/hl_cnn.h +++ b/paddle/cuda/include/hl_cnn.h @@ -273,23 +273,23 @@ extern void hl_bilinear_forward(const real* inData, const real ratioW); /** -* @brief Bilinear interpolation backward. -* -* @param[out] inGrad input gradient. -* @param[in] inImgH input image height. -* @param[in] inImgW input image width. -* @param[in] inputH input batchSize. -* @param[in] inputW input image data dim. -* @param[in] outGrad output gradient. -* @param[in] outImgH output image height. -* @param[in] outImgW output image width. -* @param[in] outputH output batchSize. -* @param[in] outputW output image data dim. -* @param[in] numChannels number of channels. -* @param[in] ratioH inImgH / outImgH. -* @param[in] ratioW inImgW / outImgW. -* -*/ + * @brief Bilinear interpolation backward. + * + * @param[out] inGrad input gradient. + * @param[in] inImgH input image height. + * @param[in] inImgW input image width. + * @param[in] inputH input batchSize. + * @param[in] inputW input image data dim. + * @param[in] outGrad output gradient. + * @param[in] outImgH output image height. + * @param[in] outImgW output image width. + * @param[in] outputH output batchSize. + * @param[in] outputW output image data dim. + * @param[in] numChannels number of channels. + * @param[in] ratioH inImgH / outImgH. + * @param[in] ratioW inImgW / outImgW. 
+ * + */ extern void hl_bilinear_backward(real* inGrad, const size_t inImgH, const size_t inImgW, diff --git a/paddle/cuda/src/hl_cuda_cublas.cc b/paddle/cuda/src/hl_cuda_cublas.cc index 182e8ab218cce18448f8a08f5c1a1dab7e38f2b6..6163209e9bc681209712243ba68dec549b7e360a 100644 --- a/paddle/cuda/src/hl_cuda_cublas.cc +++ b/paddle/cuda/src/hl_cuda_cublas.cc @@ -14,10 +14,9 @@ limitations under the License. */ #include "hl_cuda_cublas.h" #include -#include #include "hl_cuda.h" -#include "hl_dso_loader.h" #include "hl_thread.ph" +#include "paddle/utils/DynamicLoader.h" #include "paddle/utils/Logging.h" namespace dynload { diff --git a/paddle/cuda/src/hl_cuda_cudnn.cc b/paddle/cuda/src/hl_cuda_cudnn.cc index 6198f067bab2ec790e641e77dce058fe6a52491a..c53a5636829cab9d575f58cc2326cb3efe383e1c 100644 --- a/paddle/cuda/src/hl_cuda_cudnn.cc +++ b/paddle/cuda/src/hl_cuda_cudnn.cc @@ -15,10 +15,9 @@ limitations under the License. */ #include "hl_cuda_cudnn.h" #include #include -#include #include "hl_cuda_cudnn.ph" -#include "hl_dso_loader.h" #include "hl_thread.ph" +#include "paddle/utils/DynamicLoader.h" #include "paddle/utils/Logging.h" DEFINE_int32(cudnn_conv_workspace_limit_in_mb, diff --git a/paddle/cuda/src/hl_cuda_device.cc b/paddle/cuda/src/hl_cuda_device.cc index 6dfb12e00b80db36ad2e53326b880c7d1ed59263..4042d9742a92f6718406c8923d9129b81afe89e7 100644 --- a/paddle/cuda/src/hl_cuda_device.cc +++ b/paddle/cuda/src/hl_cuda_device.cc @@ -21,11 +21,10 @@ limitations under the License. */ #include #include #include -#include #include "hl_cuda.ph" #include "hl_thread.ph" -#include "hl_dso_loader.h" #include "paddle/utils/Logging.h" +#include "paddle/utils/DynamicLoader.h" // clang-format on namespace dynload { diff --git a/paddle/cuda/src/hl_warpctc_wrap.cc b/paddle/cuda/src/hl_warpctc_wrap.cc index f57efb2b46797c303d99a5468ad96163a3e74972..9f812dd0dead8b4b4e0a4ac58b12a81d1da00aee 100644 --- a/paddle/cuda/src/hl_warpctc_wrap.cc +++ b/paddle/cuda/src/hl_warpctc_wrap.cc @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include "hl_warpctc_wrap.h" #include -#include "hl_dso_loader.h" +#include "paddle/utils/DynamicLoader.h" #include "paddle/utils/Logging.h" namespace dynload { diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt index 1522510e8bb9816cb468fcf406e22560163950cc..233a53709a80f06dd2a06995b159c1aef10e2788 100644 --- a/paddle/function/CMakeLists.txt +++ b/paddle/function/CMakeLists.txt @@ -12,7 +12,7 @@ endif() add_library(paddle_function STATIC ${cpp_files} ${cu_objs}) add_dependencies(paddle_function ${external_project_dependencies}) - +add_dependencies(paddle_function gen_proto_cpp) if(WITH_GPU) if(WITH_TESTING) diff --git a/paddle/function/MulOpTest.cpp b/paddle/function/MulOpTest.cpp index 8748eb0d79fa0fcb0935eac5bb37b44274128aa0..8753057ebf73c99336b2f5d9c610e4aaf293f845 100644 --- a/paddle/function/MulOpTest.cpp +++ b/paddle/function/MulOpTest.cpp @@ -74,9 +74,9 @@ TEST(MulOp, DDDMatrixMul) { } /** - * C += A * B, B, C dense, A sparse - * dense = sparse * dense - */ + * C += A * B, B, C dense, A sparse + * dense = sparse * dense + */ void testFuncDSparseDMatrix( size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { real scaleT = 1.0; @@ -119,9 +119,9 @@ TEST(MuLOp, DSparseDMul) { } /** - * C += A * B, A, C dense, B sparse - * dense = dense * sparse - */ + * C += A * B, A, C dense, B sparse + * dense = dense * sparse + */ void testFuncDDSparseMatrix( size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { real scaleT = 1.0; @@ -165,9 +165,9 @@ TEST(MulOp, DDSparseMul) { } /** - * C += A * B, A sparse, B, C dense - * sparse = dense * dense - */ + * C += A * B, A sparse, B, C dense + * sparse = dense * dense + */ void testFuncSparseDDMatrix( size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) { real scaleT = 1.0; diff --git a/paddle/gserver/gradientmachines/GradientMachine.cpp b/paddle/gserver/gradientmachines/GradientMachine.cpp index 3eb87d9b85c8207a23046fdb4bda06ba8185e2a3..b44e4dc202f01956ed21c175aa897ced8e92546b 100644 --- a/paddle/gserver/gradientmachines/GradientMachine.cpp +++ b/paddle/gserver/gradientmachines/GradientMachine.cpp @@ -21,7 +21,6 @@ limitations under the License. */ #include "MultiGradientMachine.h" #include "MultiNetwork.h" #include "NeuralNetwork.h" -#include "NeuralNetwork.h" #include "ParallelNeuralNetwork.h" #include "hl_gpu.h" diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp index 2ab964b8fc2e080282aa03db4ee6836540e666d7..01158d1dce8d711c67b1ecf29bb644e42ccf6ff5 100644 --- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.cpp @@ -637,7 +637,7 @@ void RecurrentGradientMachine::removeBeamSearchStatisticsCallbacks() { /* create scattered id infomation for all realLayer of inFrameLines one time. * If hasSubseq, will also create scattered sequenceStartPositions infomation * for all realLayer of inFrameLines one time. 
-*/
+ */
 void RecurrentGradientMachine::createInFrameInfo(int inlinkId,
                                                  const Argument& input,
diff --git a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
index 910ca4376bedeac31674c71b9ea1205ef769cda9..c2bc52709ab42bbe21dcc3951f23f2e0b5e6793d 100644
--- a/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
+++ b/paddle/gserver/gradientmachines/RecurrentGradientMachine.h
@@ -107,18 +107,18 @@ public:
       DropCallback;

   /**
-   * @brief NormOrDropNodeCallback
-   *
-   * Normalize a path's probabilities or just drop it by modifying path.logProb
-   *
-   * The first parameter is sequence index in a batch
-   *
-   * The second parameter is path.ids
-   *
-   * The third parameter is probabilites for each node in this path.
-   *
-   * The fourth parameter is the probability of the whole path.
-   */
+   * @brief NormOrDropNodeCallback
+   *
+   * Normalize a path's probabilities or just drop it by modifying path.logProb
+   *
+   * The first parameter is the sequence index in a batch
+   *
+   * The second parameter is path.ids
+   *
+   * The third parameter is the probabilities for each node in this path.
+   *
+   * The fourth parameter is the probability of the whole path.
+   */
   typedef std::function<void(int, std::vector<int>&, std::vector<real>&, real*)>
       NormOrDropNodeCallback;
@@ -348,9 +348,9 @@ protected:
   int targetInfoInlinkId_;

   /* create scattered id infomation for all realLayer of inFrameLines one time.
-   * If hasSubseq, will also create scattered sequenceStartPositions infomation
-   * for all realLayer of inFrameLines one time.
-   */
+   * If hasSubseq, will also create scattered sequenceStartPositions information
+   * for all realLayer of inFrameLines one time.
+   */
   void createInFrameInfo(int inlinks_id,
                          const Argument& input,
                          PassType passType);
diff --git a/paddle/gserver/layers/Layer.h b/paddle/gserver/layers/Layer.h
index 47182c9ecc695f4d79089d06d6a1a61b878ce409..0ed482889d0cea884db3759620088575c5b10201 100644
--- a/paddle/gserver/layers/Layer.h
+++ b/paddle/gserver/layers/Layer.h
@@ -106,9 +106,9 @@ protected:
 public:
   /**
-   * Wait until all input value ready.
-   * Called before Layer::forward() function.
-   */
+   * Wait until all input values are ready.
+   * Called before the Layer::forward() function.
+   */
   virtual void waitInputValue();

   /**
@@ -118,9 +118,9 @@ public:
   virtual void copyOutputToOtherDevice();

   /**
-   * Wait until all output grad ready and merge them to output_.grad.
-   * Called before Layer::backward() function.
-   */
+   * Wait until all output grads are ready and merge them into output_.grad.
+   * Called before the Layer::backward() function.
+   */
   virtual void waitAndMergeOutputGrad();

   /**
diff --git a/paddle/gserver/layers/RotateLayer.h b/paddle/gserver/layers/RotateLayer.h
index 1a64d4d5a51d9c04df07861f02f1bb91eaec088e..d05c2065cb1cb81452c54ee1858c34cb46e6c7f6 100644
--- a/paddle/gserver/layers/RotateLayer.h
+++ b/paddle/gserver/layers/RotateLayer.h
@@ -29,7 +29,7 @@ namespace paddle {
 *
 * The config file api is rotate_layer
 *
-*/
+ */

 class RotateLayer : public Layer {
 public:
diff --git a/paddle/gserver/layers/SequencePoolLayer.cpp b/paddle/gserver/layers/SequencePoolLayer.cpp
index 8c49502011582b534a2ba4113ffeffaa2f06a51c..235d9a9b0f0653df5c0b671092df9e195f08fc48 100644
--- a/paddle/gserver/layers/SequencePoolLayer.cpp
+++ b/paddle/gserver/layers/SequencePoolLayer.cpp
@@ -60,7 +60,7 @@ void SequencePoolLayer::forward(PassType passType) {
    * thus, in this case, output_ has no sequenceStartPositions.
* If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this * case, we should compute the new sequenceStartPositions. - */ + */ if (type_) { CHECK(input.subSequenceStartPositions) << "when trans_type = seq, input must hasSubseq"; diff --git a/paddle/gserver/tests/sequence_layer_group.conf b/paddle/gserver/tests/sequence_layer_group.conf index 68d150d553588c864de56ce1e6f283cc42fbbf2f..50f2d89d0271b2eaa460e57636eb09b6d6aeda18 100644 --- a/paddle/gserver/tests/sequence_layer_group.conf +++ b/paddle/gserver/tests/sequence_layer_group.conf @@ -48,8 +48,7 @@ lstm = lstmemory_group( size=hidden_dim, act=TanhActivation(), gate_act=SigmoidActivation(), - state_act=TanhActivation(), - lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50)) + state_act=TanhActivation()) lstm_last = last_seq(input=lstm) diff --git a/paddle/gserver/tests/sequence_nest_layer_group.conf b/paddle/gserver/tests/sequence_nest_layer_group.conf index 88cb42798baff79fa6a86ef11dabf1781575c0b4..c01b95f7a29ae73c2b3ccd5b56ad1d316cbc72ec 100644 --- a/paddle/gserver/tests/sequence_nest_layer_group.conf +++ b/paddle/gserver/tests/sequence_nest_layer_group.conf @@ -51,8 +51,7 @@ def lstm_group(lstm_group_input): size=hidden_dim, act=TanhActivation(), gate_act=SigmoidActivation(), - state_act=TanhActivation(), - lstm_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50)) + state_act=TanhActivation()) return lstm_output diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp index 6203cd3b9ab9f95853cd3c46750fd55d6dfbba4a..178fce5b0a97442173a035fe85bdaddabba7da17 100644 --- a/paddle/math/MathFunctions.cpp +++ b/paddle/math/MathFunctions.cpp @@ -15,6 +15,54 @@ limitations under the License. */ #include "MathFunctions.h" #include "hl_matrix_apply.cuh" #include "hl_matrix_ops.cuh" +#include "paddle/utils/DynamicLoader.h" + +namespace dynload { + +std::once_flag lapack_dso_flag; +void* lapack_dso_handle = nullptr; + +/** + * The following macro definition can generate structs + * (for each function) to dynamic load lapack routine + * via operator overloading. + * + * note: default dynamic linked libs + */ +#define DYNAMIC_LOAD_LAPACK_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... 
args) -> decltype(__name(args...)) { \ + using lapack_func = decltype(__name(args...)) (*)(Args...); \ + std::call_once(lapack_dso_flag, GetLapackDsoHandle, &lapack_dso_handle); \ + void* p_##__name = dlsym(lapack_dso_handle, #__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + } __name; // struct DynLoad__##__name + +// clang-format off +#ifdef PADDLE_USE_ATLAS + #define PADDLE_SGETRF clapack_sgetrf + #define PADDLE_DGETRF clapack_dgetrf + #define PADDLE_SGETRI clapack_sgetri + #define PADDLE_DGETRI clapack_dgetri +#else + #define PADDLE_SGETRF LAPACKE_sgetrf + #define PADDLE_DGETRF LAPACKE_dgetrf + #define PADDLE_SGETRI LAPACKE_sgetri + #define PADDLE_DGETRI LAPACKE_dgetri +#endif + +#define LAPACK_ROUTINE_EACH(__macro) \ + __macro(PADDLE_SGETRF) \ + __macro(PADDLE_DGETRF) \ + __macro(PADDLE_SGETRI) \ + __macro(PADDLE_DGETRI) +// clang-format on + +LAPACK_ROUTINE_EACH(DYNAMIC_LOAD_LAPACK_WRAP) + +} // namespace dynload namespace paddle { @@ -85,16 +133,7 @@ int getrf(const CBLAS_ORDER order, float* A, const int lda, int* ipiv) { -#ifdef PADDLE_USE_LAPACK -#ifdef PADDLE_USE_ATLAS - return clapack_sgetrf(order, M, N, A, lda, ipiv); -#else - return LAPACKE_sgetrf(order, M, N, A, lda, ipiv); -#endif -#else - LOG(FATAL) << "Not implemented"; -#endif - return 0; + return dynload::PADDLE_SGETRF(order, M, N, A, lda, ipiv); } template <> @@ -104,16 +143,7 @@ int getrf(const CBLAS_ORDER order, double* A, const int lda, int* ipiv) { -#ifdef PADDLE_USE_LAPACK -#ifdef PADDLE_USE_ATLAS - return clapack_dgetrf(order, M, N, A, lda, ipiv); -#else - return LAPACKE_dgetrf(order, M, N, A, lda, ipiv); -#endif -#else - LOG(FATAL) << "Not implemented"; -#endif - return 0; + return dynload::PADDLE_DGETRF(order, M, N, A, lda, ipiv); } template <> @@ -122,16 +152,7 @@ int getri(const CBLAS_ORDER order, float* A, const int lda, const int* ipiv) { -#ifdef PADDLE_USE_LAPACK -#ifdef PADDLE_USE_ATLAS - return clapack_sgetri(order, N, A, lda, ipiv); -#else - return LAPACKE_sgetri(order, N, A, lda, ipiv); -#endif -#else - LOG(FATAL) << "Not implemented"; -#endif - return 0; + return dynload::PADDLE_SGETRI(order, N, A, lda, ipiv); } template <> @@ -140,15 +161,7 @@ int getri(const CBLAS_ORDER order, double* A, const int lda, const int* ipiv) { -#ifdef PADDLE_USE_LAPACK -#ifdef PADDLE_USE_ATLAS - return clapack_dgetri(order, N, A, lda, ipiv); -#else - return LAPACKE_dgetri(order, N, A, lda, ipiv); -#endif -#else - LOG(FATAL) << "Not implemented"; -#endif + return dynload::PADDLE_DGETRI(order, N, A, lda, ipiv); return 0; } diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h index 9f8f84a87c5e60b2a6573844f251c42152d8156b..c8559eefd8378450fc18c2ba821c65b39c8cc046 100644 --- a/paddle/math/MathFunctions.h +++ b/paddle/math/MathFunctions.h @@ -17,14 +17,11 @@ limitations under the License. */ #ifdef PADDLE_USE_MKL #include -#ifdef PADDLE_USE_LAPACK #include -#endif #else extern "C" { #include } -#ifdef PADDLE_USE_LAPACK #ifdef PADDLE_USE_ATLAS extern "C" { #include @@ -33,7 +30,6 @@ extern "C" { #include #endif #endif -#endif #include diff --git a/paddle/math/tests/TestUtils.h b/paddle/math/tests/TestUtils.h index c3020961880484a7944f8cc61377a4f08122e403..713f407f496099c04e5834b2bdcf7b1cf5a86a3a 100644 --- a/paddle/math/tests/TestUtils.h +++ b/paddle/math/tests/TestUtils.h @@ -37,7 +37,7 @@ limitations under the License. 
*/
 *
 * AutoCompare test;
 * test.cmpWithoutArg(function, height, width)
-*/
+ */

#include 
#include "TensorCheck.h"
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index 5210fe3fa1f3e221d7025edbc8a511d74ddaed51..3b1b0065af38d72716194787471889e69e719b9e 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -21,6 +21,7 @@ limitations under the License. */
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"
 #include "paddle/testing/TestUtil.h"
+#include "paddle/utils/DynamicLoader.h"
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"

@@ -235,10 +236,15 @@ TEST(Matrix, unary) {
     testMatrixTranspose(height, width);
     testMatrixRotate(height, width);
   }
-// inverse
-#ifdef PADDLE_USE_LAPACK
-  testMatrixInverse(height);
-#endif
+  // inverse matrix
+  void* dso_handle = nullptr;
+  GetLapackDsoHandle(&dso_handle);
+  if (nullptr == dso_handle) {
+    LOG(WARNING) << "Failed to find liblapack.so, please specify its path "
+                    "using the --lapack_dir flag or LD_LIBRARY_PATH.";
+  } else {
+    testMatrixInverse(height);
+  }
 }
 }
diff --git a/paddle/parameter/FirstOrderOptimizer.h b/paddle/parameter/FirstOrderOptimizer.h
index 095019b74f4f667991a0d4c5d5511e371889539f..caa78acd98ea4b35fc69643689cfce23026275e0 100644
--- a/paddle/parameter/FirstOrderOptimizer.h
+++ b/paddle/parameter/FirstOrderOptimizer.h
@@ -126,7 +126,7 @@ protected:
 /*
  * AdaDelta Optimization.
  * http://www.matthewzeiler.com/pubs/googleTR2012/googleTR2012.pdf
-*/
+ */
 class AdaDeltaParameterOptimizer : public ParameterOptimizer {
 public:
   explicit AdaDeltaParameterOptimizer(const OptimizationConfig& optConfig)
diff --git a/paddle/scripts/deb/postinst b/paddle/scripts/deb/postinst
new file mode 100644
index 0000000000000000000000000000000000000000..91620b1ee7569cd17927f44112dfa9279ddbdd32
--- /dev/null
+++ b/paddle/scripts/deb/postinst
@@ -0,0 +1,6 @@
+#!/bin/bash
+set -e
+echo "Post install paddle debian package."
+echo "Installing the python packages used by paddle. To reinstall them later, run:"
+echo "    pip install /usr/opt/paddle/share/wheels/*.whl"
+find /usr/ -name '*paddle*.whl' | xargs pip install
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index a750c436dc50f906a35313490f667d9a24cc0c00..4172063d923f939dac7229573bc087ec8c62b844 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -5,13 +5,8 @@ set -e
 # Set BASE_IMAGE according to env variables
 if [ ${WITH_GPU} == "ON" ]; then
   BASE_IMAGE="nvidia/cuda:8.0-cudnn5-runtime-ubuntu14.04"
-  # additional packages to install when building gpu images
-  GPU_DOCKER_PKG="python-pip python-dev"
 else
-  BASE_IMAGE="python:2.7.13-slim"
-  # FIXME: python base image uses different python version than WITH_GPU
-  # need to change PYTHONHOME to /usr/local when using python base image
-  CPU_DOCKER_PYTHON_HOME_ENV="ENV PYTHONHOME /usr/local"
+  BASE_IMAGE="ubuntu:14.04"
 fi

 DOCKERFILE_GPU_ENV=""
@@ -66,10 +61,7 @@ if [ ${WITH_DOC} == "ON" ]; then
   rm -rf /paddle/build_doc
 fi
 # generate deb package for current build
-# FIXME(typhoonzero): should we remove paddle/scripts/deb ?
-# FIXME: CPACK_DEBIAN_PACKAGE_DEPENDS removes all dev dependencies, must
-# install them in docker
-cpack -D CPACK_GENERATOR='DEB' -D CPACK_DEBIAN_PACKAGE_DEPENDS="" ..
+cpack -D CPACK_GENERATOR='DEB' ..
if [[ ${WOBOQ:-OFF} == 'ON' ]]; then apt-get install -y clang-3.8 llvm-3.8 libclang-3.8-dev @@ -97,32 +89,30 @@ fi paddle version -if [[ -n ${APT_MIRROR} ]]; then - MIRROR_UPDATE="sed -i '${APT_MIRROR}' /etc/apt/sources.list && \\" -else - MIRROR_UPDATE="\\" -fi - cat > /paddle/build/Dockerfile < ENV HOME /root ENV LANG en_US.UTF-8 # Use Fix locales to en_US.UTF-8 -RUN ${MIRROR_UPDATE} - apt-get update && \ - apt-get install -y libgfortran3 libpython2.7 ${GPU_DOCKER_PKG} && \ - apt-get clean -y && \ - pip install --upgrade pip && \ - pip install -U 'protobuf==3.1.0' requests numpy +EOF + +if [[ -n ${APT_MIRROR} ]]; then +cat >> /paddle/build/Dockerfile <> /paddle/build/Dockerfile <=3.2.2', 'numpy>=1.8.0', # The numpy is required. - 'protobuf>=${PROTOBUF_VERSION}' # The paddle protobuf version + 'protobuf==${PROTOBUF_VERSION}' # The paddle protobuf version ], url='http://www.paddlepaddle.org/', license='Apache 2.0', diff --git a/paddle/trainer/tests/picojson.h b/paddle/trainer/tests/picojson.h index 23bfa164080a6ea392bb6ee15e7e2bec25257ce9..4aa64961d096ce94a4187fe94000b05de4080122 100644 --- a/paddle/trainer/tests/picojson.h +++ b/paddle/trainer/tests/picojson.h @@ -1059,14 +1059,14 @@ inline bool operator==(const value& x, const value& y) { } inline bool operator!=(const value& x, const value& y) { return !(x == y); } -} +} // namespace picojson namespace std { template <> inline void swap(picojson::value& x, picojson::value& y) { x.swap(y); } -} +} // namespace std inline std::istream& operator>>(std::istream& is, picojson::value& x) { picojson::set_last_error(std::string()); diff --git a/paddle/cuda/src/hl_dso_loader.cc b/paddle/utils/DynamicLoader.cpp similarity index 94% rename from paddle/cuda/src/hl_dso_loader.cc rename to paddle/utils/DynamicLoader.cpp index 53164dd27c7c5f5254e743b6fcf1d7b6fc895e31..368c35e15186d4d01f939dd4e4c05e7cac3dd214 100644 --- a/paddle/cuda/src/hl_dso_loader.cc +++ b/paddle/utils/DynamicLoader.cpp @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "hl_dso_loader.h" +#include "DynamicLoader.h" #include -#include "paddle/utils/Logging.h" +#include "Logging.h" DEFINE_string(cudnn_dir, "", @@ -30,6 +30,8 @@ DEFINE_string(cuda_dir, DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so."); +DEFINE_string(lapack_dir, "", "Specify path for loading liblapack.so."); + static inline std::string join(const std::string& part1, const std::string& part2) { // directory separator @@ -160,3 +162,11 @@ void GetWarpCTCDsoHandle(void** dso_handle) { GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, "libwarpctc.so", dso_handle); #endif } + +void GetLapackDsoHandle(void** dso_handle) { +#if defined(__APPLE__) || defined(__OSX__) + GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.dylib", dso_handle); +#else + GetDsoHandleFromSearchPath(FLAGS_lapack_dir, "liblapack.so", dso_handle); +#endif +} diff --git a/paddle/cuda/include/hl_dso_loader.h b/paddle/utils/DynamicLoader.h similarity index 83% rename from paddle/cuda/include/hl_dso_loader.h rename to paddle/utils/DynamicLoader.h index 276a07d3c735c771c851e8b4bd14c720f9ab6569..9b5ad21724afd7176f958619e7e10d12dc08fa49 100644 --- a/paddle/cuda/include/hl_dso_loader.h +++ b/paddle/utils/DynamicLoader.h @@ -12,13 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and
limitations under the License. */

-#ifndef HL_DSO_LOADER_H_
-#define HL_DSO_LOADER_H_
+#ifndef DYNAMIC_LOAD_H_
+#define DYNAMIC_LOAD_H_

 #include 
 #include 
+#include 
 #include 
-#include "hl_base.h"

 /**
  * @brief    load the DSO of CUBLAS
 *
 * @param    **dso_handle   dso handler
 *
 */
 void GetCurandDsoHandle(void** dso_handle);

 */
 void GetWarpCTCDsoHandle(void** dso_handle);

-#endif  // HL_DSO_LOADER_H_
+/**
+ * @brief    load the DSO of lapack
+ *
+ * @param    **dso_handle   dso handler
+ *
+ */
+void GetLapackDsoHandle(void** dso_handle);
+
+#endif  // DYNAMIC_LOAD_H_
diff --git a/python/paddle/trainer_config_helpers/attrs.py b/python/paddle/trainer_config_helpers/attrs.py
index 41ad05c3eb467b9a2e09315f980410d5e9b3853f..7ae9e5cb3050fa6f70fa84785a1ddbdc68c70235 100644
--- a/python/paddle/trainer_config_helpers/attrs.py
+++ b/python/paddle/trainer_config_helpers/attrs.py
@@ -208,12 +208,15 @@ class ExtraLayerAttribute(object):
                  drop_rate=None,
                  device=None):
         self.attr = dict()
-        if isinstance(error_clipping_threshold, float):
-            assert error_clipping_threshold > 0
-            self.attr["error_clipping_threshold"] = error_clipping_threshold
-
-        if isinstance(drop_rate, float):
-            assert drop_rate > 0
+        if error_clipping_threshold is not None:
+            error_clipping_threshold = float(error_clipping_threshold)
+            if error_clipping_threshold < 0:
+                raise ValueError("error_clipping_threshold must be >= 0")
+            self.attr['error_clipping_threshold'] = error_clipping_threshold
+        if drop_rate is not None:
+            drop_rate = float(drop_rate)
+            if drop_rate < 0:
+                raise ValueError("drop_rate must be >= 0")
             self.attr["drop_rate"] = drop_rate

         if isinstance(device, int):
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 46fd752d527fa63578a1e01865356780955bc87a..31652613fb3a55636b32babbc4bde60d65776c61 100755
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -84,6 +84,7 @@ __all__ = [
     'GeneratedInput',
     'SubsequenceInput',
     'gru_step_layer',
+    'gru_step_naive_layer',
     'recurrent_layer',
     'BaseGeneratedInput',
     'conv_operator',
@@ -2286,7 +2287,7 @@ def img_pool_layer(input,

     type_name = pool_type.name + '-projection' \
         if (
-        isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
+            isinstance(pool_type, AvgPooling) or isinstance(pool_type, MaxPooling)) \
         else pool_type.name

     pool_size_y = pool_size if pool_size_y is None else pool_size_y
@@ -3086,6 +3087,78 @@ def gru_step_layer(input,
         activation=act)

+
+@wrap_bias_attr_default()
+@wrap_param_attr_default()
+@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
+@wrap_act_default(act=TanhActivation())
+@wrap_name_default('gru_step_naive')
+@layer_support(ERROR_CLIPPING, DROPOUT)
+def gru_step_naive_layer(input,
+                         output_mem,
+                         size=None,
+                         name=None,
+                         act=None,
+                         gate_act=None,
+                         bias_attr=None,
+                         param_attr=None,
+                         layer_attr=None):
+    """
+    GRU Step Layer, built with MixedLayer so that it supports ERROR_CLIPPING
+    and DROPOUT.
+
+    :param input: the concatenated projections of the three gates, whose
+        width must be three times ``size``.
+    :param output_mem: the memory of the GRU output from the previous step.
+    :param size: the dimension of the GRU state.
+    :param name: the layer name.
+    :param act: activation of the candidate hidden state.
+    :param gate_act: activation of the update and reset gates.
+    :param bias_attr: the bias attribute.
+    :param param_attr: the parameter attribute.
+    :param layer_attr: extra layer attribute, e.g. error clipping or dropout.
+    :return: the GRU step output layer.
+    """
+    if input.size % 3 != 0:
+        raise ValueError("GruStep input size must be divisible by 3")
+    if size is None:
+        size = input.size / 3
+
+    def __gate__(gate_name, offset):
+        with mixed_layer(
+                name=name + "_" + gate_name,
+                size=size,
+                layer_attr=layer_attr,
+                bias_attr=bias_attr,
+                act=gate_act) as gate:
+            gate += identity_projection(input=input, offset=offset)
+            gate += full_matrix_projection(
+                input=output_mem, param_attr=param_attr)
+        return gate
+
+    update_gate = __gate__("update", 0)
+    reset_gate = __gate__("reset", size)
+
+    with mixed_layer(
+            name=name + "_reset_output", bias_attr=False) as reset_output:
+        reset_output += dotmul_operator(a=output_mem, b=reset_gate)
+
+    with mixed_layer(
+            name=name + "_output_candidate",
+            size=size,
+            layer_attr=layer_attr,
+            bias_attr=bias_attr,
+            act=act) as output_candidate:
+        output_candidate += identity_projection(input=input, offset=2 * size)
+        output_candidate += full_matrix_projection(
+            input=reset_output, param_attr=param_attr)
+
+    with mixed_layer(name=name) as output:
+        output += identity_projection(output_mem)
+        output += dotmul_operator(a=output_mem, b=update_gate, scale=-1.0)
+        output += dotmul_operator(a=output_candidate, b=update_gate)
+
+    return output
+
+
 @wrap_name_default()
 @layer_support()
 def get_output_layer(input, arg_name, name=None, layer_attr=None):
diff --git a/python/paddle/trainer_config_helpers/networks.py b/python/paddle/trainer_config_helpers/networks.py
index cadde11ff81658cb309cd1bf7a44bac6374c1e44..fb533a47e0b0585be6f0e019086993f8b3aa7f38 100755
--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -825,7 +825,8 @@ def gru_unit(input,
              gru_param_attr=None,
              act=None,
              gate_act=None,
-             gru_layer_attr=None):
+             gru_layer_attr=None,
+             naive=False):
     """
     Define calculations that a gated recurrent unit performs in a single time
     step. This function itself is not a recurrent layer, so that it can not be
@@ -857,7 +858,12 @@ def gru_unit(input,

     out_mem = memory(name=name, size=size)

-    gru_out = gru_step_layer(
+    if naive:
+        __step__ = gru_step_naive_layer
+    else:
+        __step__ = gru_step_layer
+
+    gru_out = __step__(
         name=name,
         input=input,
         output_mem=out_mem,
@@ -879,7 +885,8 @@ def gru_group(input,
               gru_param_attr=None,
               act=None,
               gate_act=None,
-              gru_layer_attr=None):
+              gru_layer_attr=None,
+              naive=False):
     """
     gru_group is a recurrent layer group version of Gated Recurrent Unit. It
     does exactly the same calculation as the grumemory layer does. A promising
@@ -928,7 +935,8 @@ def gru_group(input,
         gru_param_attr=gru_param_attr,
         act=act,
         gate_act=gate_act,
-        gru_layer_attr=gru_layer_attr)
+        gru_layer_attr=gru_layer_attr,
+        naive=naive)

     return recurrent_group(
         name='%s_recurrent_group' % name,
@@ -949,7 +957,8 @@ def simple_gru(input,
               gru_param_attr=None,
               act=None,
               gate_act=None,
-              gru_layer_attr=None):
+              gru_layer_attr=None,
+              naive=False):
     """
     You maybe see gru_step_layer, grumemory in layers.py, gru_unit, gru_group,
     simple_gru in network.py.
The reason why there are so many interfaces is @@ -1018,7 +1027,8 @@ def simple_gru(input, gru_param_attr=gru_param_attr, act=act, gate_act=gate_act, - gru_layer_attr=gru_layer_attr) + gru_layer_attr=gru_layer_attr, + naive=naive) @wrap_name_default('simple_gru2') diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr index 2afc3afef6d39ce9b8eef05948861284775d5011..d8bd7b9dfb71a392d0dc53872a0d72f47530530f 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr @@ -320,6 +320,7 @@ layers { } } drop_rate: 0.5 + error_clipping_threshold: 40.0 } parameters { name: "___embedding_0__.w0" diff --git a/python/paddle/v2/layer.py b/python/paddle/v2/layer.py index 384de9b9d57f88e84ab6067846174bb037502dc0..89cca7acd34b8dea0572169338649b5e9ff6536a 100644 --- a/python/paddle/v2/layer.py +++ b/python/paddle/v2/layer.py @@ -356,6 +356,9 @@ def mixed(size=0, return MixedLayerV2(size, input, name, act, bias_attr, layer_attr) +mixed.__doc__ = conf_helps.mixed_layer.__doc__ + + class RecurrentLayerInput(Layer): def __init__(self, recurrent_name, index, parent_layers): parents_len = len(parent_layers) @@ -404,6 +407,8 @@ data.__name__ = 'data' AggregateLevel = conf_helps.layers.AggregateLevel ExpandLevel = conf_helps.layers.ExpandLevel memory = MemoryV2 +memory.__name__ = 'memory' +memory.__doc__ = conf_helps.memory.__doc__ def __layer_name_mapping__(inname): @@ -512,6 +517,9 @@ def recurrent_group(step, input, name=None): return retv +recurrent_group.__doc__ = conf_helps.recurrent_group.__doc__ + + @wrap_name_default() def beam_search(step, input, @@ -579,6 +587,8 @@ def beam_search(step, return tmp +beam_search.__doc__ = conf_helps.beam_search.__doc__ + __projection_names__ = filter(lambda x: x.endswith('_projection'), dir(conf_helps)) diff --git a/python/setup.py.in b/python/setup.py.in index 228e762d56f672d7b31ede2b2b92c77f9a126f3c..5dfb46192ae54fdc36b0867312cf156aefb84f84 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -15,6 +15,9 @@ setup(name='paddle', description='Parallel Distributed Deep Learning', install_requires=[ "requests", + "numpy", + "protobuf==${PROTOBUF_VERSION}", + "matplotlib", ], packages=packages, package_dir={
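Closing sketch: how a configuration opts into the `gru_step_naive_layer` path added above. The `naive=True` flag threads from `simple_gru` through `gru_group` and `gru_unit`, and the pattern below mirrors the seqToseq demo changes earlier in this patch; the data and embedding sizes are placeholders.

```python
from paddle.trainer_config_helpers import *

# Placeholder input: a 10000-word vocabulary embedded into 512 dimensions.
src = embedding_layer(input=data_layer(name='word', size=10000), size=512)

# naive=True makes gru_unit call gru_step_naive_layer, which is built from
# mixed_layer and therefore honors ExtraLayerAttribute settings such as
# error clipping and dropout that the fused gru_step_layer does not support.
encoder = simple_gru(
    input=src,
    size=512,
    naive=True,
    gru_layer_attr=ExtraLayerAttribute(error_clipping_threshold=50))
```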