diff --git a/.gitignore b/.gitignore
index 1c9730a5ad57cd70613c0692529bcb1ccf056d59..6aae076a49012b032b8fc0f1dc02c2714fb7b4a3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,6 +9,8 @@ build/
 .pydevproject
 Makefile
 .test_env/
+# External dependencies downloaded and built by cmake/external/*.cmake
+third_party/
 
 *~
 bazel-*
diff --git a/.gitmodules b/.gitmodules
deleted file mode 100644
index f635e65784af47a21df80cc92073ef14eba9a731..0000000000000000000000000000000000000000
--- a/.gitmodules
+++ /dev/null
@@ -1,3 +0,0 @@
-[submodule "warp-ctc"]
-	path = warp-ctc
-	url = https://github.com/baidu-research/warp-ctc.git
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b9902a863d864b28f0fad0fefe64248e356010e4..a6e45028ebc3f53ea20806f0dd2a7acc820607fe 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,7 +2,7 @@
     sha: c25201a00e6b0514370501050cf2a8538ac12270
     hooks:
     -   id: remove-crlf
-        files: (?!.*warp-ctc)^.*$
+        files: (?!.*third_party)^.*$
 -   repo: https://github.com/reyoung/mirrors-yapf.git
     sha: v0.13.2
     hooks:
@@ -15,7 +15,7 @@
     -   id: check-merge-conflict
     -   id: check-symlinks
     -   id: detect-private-key
-        files: (?!.*warp-ctc)^.*$
+        files: (?!.*third_party)^.*$
     -   id: end-of-file-fixer
 -   repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git
     sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
diff --git a/.travis.yml b/.travis.yml
index 047ca6ffe79bdaf013f6ef6dbf1a82bdb2f1f2b3..bc91855a8571985a386b698e7ecd43bad20477ac 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -28,16 +28,9 @@ addons:
       - python
       - python-pip
       - python2.7-dev
-      - m4
-      - python-numpy
-      - python-wheel
-      - libgoogle-glog-dev
-      - libgflags-dev
-      - libgtest-dev
       - curl
-      - lcov
-      - graphviz
       - swig
+      - graphviz
       - clang-format-3.8
       - automake
       - libtool
@@ -53,10 +46,10 @@ before_install:
         fi
       fi
     fi
-  - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
   - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
   - if [[ "$JOB" == "PRE_COMMIT" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
-  - pip install wheel protobuf sphinx recommonmark virtualenv numpy sphinx_rtd_theme pre-commit requests==2.9.2 LinkChecker
+  - pip install --upgrade pip
+  - pip install wheel protobuf sphinx recommonmark sphinx_rtd_theme virtualenv pre-commit requests==2.9.2 LinkChecker
 script:
   - paddle/scripts/travis/main.sh
 notifications:
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 65fbbb481c432f7b905f4dec7ea39c51ec853ae8..9ed757bd1bfbd23ca24445c15e7cf8e13860d26f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,166 +1,89 @@
-cmake_minimum_required(VERSION 2.8)
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+cmake_minimum_required(VERSION 3.0)
 
 project(paddle CXX C)
 
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/cmake")
 set(PROJ_ROOT ${CMAKE_SOURCE_DIR})
-include(package)
-find_package(SWIG 2.0)
-find_package(CUDA QUIET)
-find_package(Protobuf REQUIRED)
 
-# Check protobuf library version.
-execute_process(COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --version
-    OUTPUT_VARIABLE PROTOBUF_VERSION)
-string(REPLACE "libprotoc " "" PROTOBUF_VERSION ${PROTOBUF_VERSION})
+find_package(Sphinx)
+find_package(CUDA QUIET)
+find_package(Git REQUIRED)
+find_package(Threads REQUIRED)
 
-set(PROTOBUF_3 OFF)
-if (${PROTOBUF_VERSION} VERSION_GREATER "3.0.0" OR ${PROTOBUF_VERSION} VERSION_EQUAL "3.0.0")
-    set(PROTOBUF_3 ON)
-endif()
+include(system)
+include(simd)
 
-find_package(PythonLibs 2.7 REQUIRED)
-find_package(PythonInterp 2.7 REQUIRED)
-find_package(ZLIB REQUIRED)
-find_package(NumPy REQUIRED)
-find_package(Threads REQUIRED)
-find_package(AVX QUIET)
-find_package(Glog REQUIRED)
-find_package(Gflags REQUIRED)
-find_package(GTest)
-find_package(Sphinx)
-find_package(Doxygen)
-include(cblas)
-find_program(M4_EXECUTABLE m4)
-###################### Configurations ###########################
+###################### Configurations ############################
 option(WITH_DSO "Compile PaddlePaddle with dynamic linked libraries" ON)
 option(WITH_GPU "Compile PaddlePaddle with gpu" ${CUDA_FOUND})
 option(WITH_DOUBLE "Compile PaddlePaddle with double precision, otherwise use single precision" OFF)
 option(WITH_AVX "Compile PaddlePaddle with avx intrinsics" ${AVX_FOUND})
 option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
-option(WITH_STYLE_CHECK "Style Check for PaddlePaddle" ${PYTHONINTERP_FOUND})
+option(WITH_STYLE_CHECK "Style Check for PaddlePaddle" ON)
 option(WITH_RDMA "Compile PaddlePaddle with rdma support" OFF)
 option(WITH_TIMER "Compile PaddlePaddle use timer" OFF)
 option(WITH_PROFILER "Compile PaddlePaddle use gpu profiler" OFF)
-option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ${GTEST_FOUND})
+option(WITH_TESTING "Compile and run unittest for PaddlePaddle" ON)
 option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
-option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ${SWIG_FOUND})
+option(WITH_SWIG_PY "Compile PaddlePaddle with py PaddlePaddle prediction api" ON)
 option(ON_TRAVIS "Running test on travis-ci or not." OFF)
 option(ON_COVERALLS "Generating code coverage data on coveralls or not." OFF)
 option(COVERALLS_UPLOAD "Uploading the generated coveralls json." ON)
 
+include(external/zlib)      # download, build, install zlib
+include(external/gflags)    # download, build, install gflags
+include(external/glog)      # download, build, install glog
+include(external/gtest)     # download, build, install gtest
+include(external/protobuf)  # download, build, install protobuf
+include(external/python)    # download, build, install python
+include(external/openblas)  # download, build, install openblas
+include(external/swig)      # download, build, install swig
+include(external/warpctc)   # download, build, install warpctc
+
+include(package)            # set paddle packages
+include(cpplint)            # set paddle c++ style
+include(ccache)             # set ccache for compilation
+include(util)               # set unittest and link libs
+include(rdma)               # set rdma libraries
+include(flags)              # set paddle compile flags
+include(cudnn)              # set cudnn libraries
+include(version)            # set PADDLE_VERSION
+include(coveralls)          # set code coverage
+include(python_module)      # set python module
+
+include(configure)          # add paddle env configuration
 
-include(cpplint)
-include(ccache)
-if(WITH_RDMA)
-  include(rdma)
-endif()
-include(util)
-include(flags)
-include(cudnn)
-include(FindPythonModule)
-include(check_packages)
-include(swig)
-include(coveralls)
-
-# Set PaddlePaddle version to Git tag name or Git commit ID.
-find_package(Git REQUIRED)
-# version.cmake will get the current PADDLE_VERSION
-include(version)
-add_definitions(-DPADDLE_VERSION=${PADDLE_VERSION})
-
-if(NOT WITH_GPU)
-    add_definitions(-DPADDLE_ONLY_CPU)
-    add_definitions(-DHPPL_STUB_FUNC)
-
-    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
-else()
-    if(${CUDA_VERSION_MAJOR} VERSION_LESS 7)
-        message(FATAL_ERROR "Paddle need CUDA >= 7.0 to compile")
-    endif()
-
-    if(NOT CUDNN_FOUND)
-        message(FATAL_ERROR "Paddle need cudnn to compile")
-    endif()
-
-    if(WITH_AVX)
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${AVX_FLAG}")
-    else(WITH_AVX)
-        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}")
-    endif(WITH_AVX)
-
-    # Include cuda and cudnn
-    include_directories(${CUDNN_INCLUDE_DIR})
-    include_directories(${CUDA_TOOLKIT_INCLUDE})
-endif(NOT WITH_GPU)
-
-if(WITH_DSO)
-    add_definitions(-DPADDLE_USE_DSO)
-endif(WITH_DSO)
-
-if(WITH_DOUBLE)
-    add_definitions(-DPADDLE_TYPE_DOUBLE)
-    set(ACCURACY double)
-else(WITH_DOUBLE)
-    set(ACCURACY float)
-endif(WITH_DOUBLE)
-
-if(NOT WITH_TIMER)
-    add_definitions(-DPADDLE_DISABLE_TIMER)
-endif(NOT WITH_TIMER)
-
-if(NOT WITH_PROFILER)
-    add_definitions(-DPADDLE_DISABLE_PROFILER)
-endif(NOT WITH_PROFILER)
-
-if(WITH_AVX)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}")
-else(WITH_AVX)
-    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SSE3_FLAG}")
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SSE3_FLAG}")
-endif(WITH_AVX)
-
-if(WITH_PYTHON)
-    include_directories(${PYTHON_INCLUDE_DIR})
-    include_directories(${PYTHON_NUMPY_INCLUDE_DIR})
-else(WITH_PYTHON)
-    add_definitions(-DPADDLE_NO_PYTHON)
-endif(WITH_PYTHON)
-
-if(WITH_RDMA)
-  include_directories("${RDMA_INC_DIR}")
-else(WITH_RDMA)
-  add_definitions(-DPADDLE_DISABLE_RDMA)
-endif(WITH_RDMA)
-
-# glog
-include_directories(${LIBGLOG_INCLUDE_DIR})
-
-#gflags
-add_definitions(-DGFLAGS_NS=${GFLAGS_NAMESPACE})
-include_directories(${GFLAGS_INCLUDE_DIRS})
-
-if(WITH_TESTING)
-    enable_testing()
-    include_directories(${GTEST_INCLUDE_DIRS})
-endif()
-
-include_directories("${CBLAS_INC_DIR}")
 include_directories("${PROJ_ROOT}")
 include_directories("${PROJ_ROOT}/paddle/cuda/include")
-include_directories(${PROTOBUF_INCLUDE_DIRS})
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
-if(EXISTS "${PROJ_ROOT}/paddle/internals/CMakeLists.txt")
-    set(PADDLE_WITH_INTERNAL ON)
-    include(paddle/internals/CMakeLists.txt)
-else()
-    set(PADDLE_WITH_INTERNAL OFF)
-    set(INTERNAL_PROTO_PATH "")
-endif()
+
+set(EXTERNAL_LIBS
+    # gtest is not included here.
+    ${GFLAGS_LIBRARIES}
+    ${GLOG_LIBRARIES}
+    ${CBLAS_LIBRARIES}
+    ${PROTOBUF_LIBRARY}
+    ${ZLIB_LIBRARIES}
+)
+
 add_subdirectory(proto)
 add_subdirectory(paddle)
 add_subdirectory(python)
+
 if(WITH_DOC)
     add_subdirectory(doc)
 endif()
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 685334c6585060c0344e552c6f3fda2c7324de03..4e1ae7dc81231943c4bf3db4d4ac6f073f4fd1c4 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -13,6 +13,7 @@
 # system paths.
 #
 
+set(CBLAS_FOUND OFF)
 
 ## Find MKL First.
 set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains MKL")
@@ -35,11 +36,12 @@ find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
 if(MKL_INCLUDE_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
   set(CBLAS_PROVIDER MKL)
   set(CBLAS_INC_DIR ${MKL_INCLUDE_DIR})
-  set(CBLAS_LIBS ${MKL_INTEL_LP64}
+  set(CBLAS_LIBRARIES ${MKL_INTEL_LP64}
           ${MKL_SEQUENTIAL_LIB}
           ${MKL_CORE_LIB})
   add_definitions(-DPADDLE_USE_MKL)
-  message(STATUS "Found MKL (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
+  message(STATUS "Found MKL (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  set(CBLAS_FOUND ON)
   return() # return file.
 endif()
 
@@ -68,9 +70,10 @@ find_library(ATLAS_LIB NAMES lapack_atlas liblapack_atlas.so.3
 if(ATLAS_INC_DIR AND ATLAS_CBLAS_LIB AND ATLAS_LIB)
   set(CBLAS_PROVIDER ATLAS)
   set(CBLAS_INC_DIR ${ATLAS_INC_DIR} ${ATLAS_CLAPACK_INC_DIR})
-  set(CBLAS_LIBS ${ATLAS_LIB} ${ATLAS_CBLAS_LIB})
+  set(CBLAS_LIBRARIES ${ATLAS_LIB} ${ATLAS_CBLAS_LIB})
   add_definitions(-DPADDLE_USE_ATLAS)  
-  message(STATUS "Found Atlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
+  message(STATUS "Found Atlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  set(CBLAS_FOUND ON)
   return()
 endif()
 
@@ -98,8 +101,9 @@ find_library(OPENBLAS_LIB NAMES openblas
 if(OPENBLAS_INC_DIR AND OPENBLAS_LIB)
   set(CBLAS_PROVIDER OPENBLAS)
   set(CBLAS_INC_DIR ${OPENBLAS_INC_DIR})
-  set(CBLAS_LIBS ${OPENBLAS_LIB})
-  message(STATUS "Found OpenBlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBS})")
+  set(CBLAS_LIBRARIES ${OPENBLAS_LIB})
+  message(STATUS "Found OpenBlas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  set(CBLAS_FOUND ON)
   return()
 endif()
 
@@ -130,9 +134,7 @@ find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS
 if (REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY)
   set(CBLAS_PROVIDER REFERENCE)
   set(CBLAS_INC_DIR ${REFERENCE_CBLAS_INCLUDE_DIR})
-  set(CBLAS_LIBS ${REFERENCE_CBLAS_LIBRARY})
-  return()
+  set(CBLAS_LIBRARIES ${REFERENCE_CBLAS_LIBRARY})
+  message(STATUS "Found reference-cblas (include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  set(CBLAS_FOUND ON)
 endif()
-
-message(FATAL_ERROR "CBlas must be set. Paddle support MKL, ATLAS, OpenBlas, reference-cblas."
-  " Try set MKL_ROOT, ATLAS_ROOT, OPENBLAS_ROOT or REFERENCE_CBLAS_ROOT.")
diff --git a/cmake/check_packages.cmake b/cmake/check_packages.cmake
deleted file mode 100644
index afb84c6ff52af05769a99246d2e93380832c04e0..0000000000000000000000000000000000000000
--- a/cmake/check_packages.cmake
+++ /dev/null
@@ -1,39 +0,0 @@
-# Check package for each cmake option
-
-if(WITH_GPU)
-  find_package(CUDA REQUIRED)  # CUDA is required when use gpu
-endif()
-
-if(WITH_PYTHON)
-  find_package(PythonLibs 2.6 REQUIRED)
-  find_package(PythonInterp REQUIRED)
-  find_package(NumPy REQUIRED)
-endif()
-
-if(WITH_STYLE_CHECK)
-  find_package(PythonInterp REQUIRED)
-endif()
-
-find_package(Glog REQUIRED)
-
-find_package(Gflags REQUIRED)
-
-if(WITH_TESTING)
-  find_package(GTest REQUIRED)
-endif()
-
-if(WITH_DOC)
-  find_package(Sphinx REQUIRED)
-  find_python_module(recommonmark REQUIRED)
-endif()
-
-if(WITH_SWIG_PY)
-  if(NOT SWIG_FOUND)
-    message(FATAL_ERROR "SWIG is not found. Please install swig or disable WITH_SWIG_PY")
-  endif()
-  find_python_module(wheel REQUIRED)  # package wheel
-endif()
-
-if(NOT M4_EXECUTABLE)
-  message(FATAL_ERROR "Paddle need m4 to generate proto file.")
-endif()
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..ae0ec01d94da49f23b56f7d34f862ca57fb39b18
--- /dev/null
+++ b/cmake/configure.cmake
@@ -0,0 +1,64 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+if(WITH_DSO)
+    add_definitions(-DPADDLE_USE_DSO)
+endif(WITH_DSO)
+
+if(WITH_DOUBLE)
+    add_definitions(-DPADDLE_TYPE_DOUBLE)
+endif(WITH_DOUBLE)
+
+if(NOT WITH_TIMER)
+    add_definitions(-DPADDLE_DISABLE_TIMER)
+endif(NOT WITH_TIMER)
+
+if(NOT WITH_PROFILER)
+    add_definitions(-DPADDLE_DISABLE_PROFILER)
+endif(NOT WITH_PROFILER)
+
+if(NOT WITH_GPU)
+    add_definitions(-DPADDLE_ONLY_CPU)
+    add_definitions(-DHPPL_STUB_FUNC)
+
+    list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
+else()
+    FIND_PACKAGE(CUDA REQUIRED)
+
+    if(${CUDA_VERSION_MAJOR} VERSION_LESS 7)
+        message(FATAL_ERROR "Paddle need CUDA >= 7.0 to compile")
+    endif()
+
+    if(NOT CUDNN_FOUND)
+        message(FATAL_ERROR "Paddle need cudnn to compile")
+    endif()
+
+    if(WITH_AVX)
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${AVX_FLAG}")
+    else(WITH_AVX)
+        set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}")
+    endif(WITH_AVX)
+
+    # Include cuda and cudnn
+    include_directories(${CUDNN_INCLUDE_DIR})
+    include_directories(${CUDA_TOOLKIT_INCLUDE})
+endif(NOT WITH_GPU)
+
+if(WITH_AVX)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${AVX_FLAG}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${AVX_FLAG}")
+else(WITH_AVX)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SSE3_FLAG}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SSE3_FLAG}")
+endif(WITH_AVX)
diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake
index 241af9a0835b2f100c8fb8b246426e631e42aef3..38c636b30edc0af1c07255814e8bc2b1ad9514da 100644
--- a/cmake/cpplint.cmake
+++ b/cmake/cpplint.cmake
@@ -53,7 +53,7 @@ macro(add_style_check_target TARGET_NAME)
             if(LINT MATCHES ON)
                 add_custom_command(TARGET ${TARGET_NAME}
                     PRE_BUILD
-                    COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
+                    COMMAND env ${py_env} "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
                                 "--filter=${STYLE_FILTER}" ${filename}
                     WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR})
             endif()
diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..d38b7d1ba2a74d5bb46d0c07e3abe6832d4c8af3
--- /dev/null
+++ b/cmake/external/gflags.cmake
@@ -0,0 +1,39 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+SET(GFLAGS_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/gflags)
+SET(GFLAGS_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/gflags)
+SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE)
+IF(WIN32)
+    set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
+ELSE(WIN32)
+    set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/libgflags.a" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE)
+ENDIF(WIN32)
+
+INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR})
+
+ExternalProject_Add(
+    gflags
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/gflags/gflags.git"
+    PREFIX          ${GFLAGS_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GFLAGS_INSTALL_DIR}
+    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+    CMAKE_ARGS      -DBUILD_TESTING=OFF
+)
+
+LIST(APPEND external_project_dependencies gflags)
diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..bec69f3ddf093b62f084f9080fa1fe4398c93e9a
--- /dev/null
+++ b/cmake/external/glog.cmake
@@ -0,0 +1,41 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+SET(GLOG_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/glog)
+SET(GLOG_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/glog)
+SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include directory." FORCE)
+
+IF(WIN32)
+    SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.lib" CACHE FILEPATH "glog library." FORCE)
+ELSE(WIN32)
+    SET(GLOG_LIBRARIES "${GLOG_INSTALL_DIR}/lib/libglog.a" CACHE FILEPATH "glog library." FORCE)
+ENDIF(WIN32)
+
+INCLUDE_DIRECTORIES(${GLOG_INCLUDE_DIR})
+
+ExternalProject_Add(
+    glog
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/google/glog.git"
+    PREFIX          ${GLOG_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${GLOG_INSTALL_DIR}
+    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+    CMAKE_ARGS      -DWITH_GFLAGS=OFF
+    CMAKE_ARGS      -DBUILD_TESTING=OFF
+)
+
+LIST(APPEND external_project_dependencies glog)
diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..2fcb7893fa30e7fcd84b9e860217f82cf01bf89e
--- /dev/null
+++ b/cmake/external/gtest.cmake
@@ -0,0 +1,51 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+IF(WITH_TESTING)
+    ENABLE_TESTING()
+    INCLUDE(ExternalProject)
+
+    SET(GTEST_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/gtest)
+    SET(GTEST_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/gtest)
+    SET(GTEST_INCLUDE_DIR "${GTEST_INSTALL_DIR}/include" CACHE PATH "gtest include directory." FORCE)
+
+    INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIR})
+
+    IF(WIN32)
+        set(GTEST_LIBRARIES
+            "${GTEST_INSTALL_DIR}/lib/gtest.lib" CACHE FILEPATH "gtest libraries." FORCE)
+        set(GTEST_MAIN_LIBRARIES
+            "${GTEST_INSTALL_DIR}/lib/gtest_main.lib" CACHE FILEPATH "gtest main libraries." FORCE)
+    ELSE(WIN32)
+        set(GTEST_LIBRARIES
+            "${GTEST_INSTALL_DIR}/lib/libgtest.a" CACHE FILEPATH "gtest libraries." FORCE)
+        set(GTEST_MAIN_LIBRARIES
+            "${GTEST_INSTALL_DIR}/lib/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE)
+    ENDIF(WIN32)
+
+    ExternalProject_Add(
+        gtest
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY  "https://github.com/google/googletest.git"
+        GIT_TAG         "release-1.8.0"
+        PREFIX          ${GTEST_SOURCES_DIR}
+        UPDATE_COMMAND  ""
+        CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${GTEST_INSTALL_DIR}
+        CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+        CMAKE_ARGS      -DBUILD_GMOCK=ON
+        CMAKE_ARGS      -Dgtest_disable_pthreads=ON
+        CMAKE_ARGS      -Dgtest_force_shared_crt=ON
+    )
+    LIST(APPEND external_project_dependencies gtest)
+ENDIF(WITH_TESTING)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..677999cc9f5d320b4ac18fe0cc0d67a8e9921f8f
--- /dev/null
+++ b/cmake/external/openblas.cmake
@@ -0,0 +1,46 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(cblas)
+
+IF(NOT ${CBLAS_FOUND})
+    INCLUDE(ExternalProject)
+
+    SET(CBLAS_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/openblas)
+    SET(CBLAS_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/openblas)
+    SET(CBLAS_INC_DIR "${CBLAS_INSTALL_DIR}/include" CACHE PATH "openblas include directory." FORCE)
+
+    IF(WIN32)
+        SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/openblas.lib" CACHE FILEPATH "openblas library." FORCE)
+    ELSE(WIN32)
+        SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/libopenblas.a" CACHE FILEPATH "openblas library" FORCE)
+    ENDIF(WIN32)
+
+    ExternalProject_Add(
+        openblas
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        URL                 "https://github.com/xianyi/OpenBLAS/archive/v0.2.19.tar.gz"
+        PREFIX              ${CBLAS_SOURCES_DIR}
+        INSTALL_DIR         ${CBLAS_INSTALL_DIR}
+        BUILD_IN_SOURCE     1
+        CONFIGURE_COMMAND   ""
+        BUILD_COMMAND       make CC=${CMAKE_C_COMPILER} FC=${CMAKE_Fortran_COMPILER}
+        INSTALL_COMMAND     make install PREFIX=<INSTALL_DIR>
+        UPDATE_COMMAND      ""
+    )
+
+    LIST(APPEND external_project_dependencies openblas)
+ENDIF()
+
+INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..2f2769b4c628d8570c335d344cbf608bda84206f
--- /dev/null
+++ b/cmake/external/protobuf.cmake
@@ -0,0 +1,62 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+SET(PROTOBUF_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/protobuf)
+SET(PROTOBUF_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/protobuf)
+SET(PROTOBUF_INCLUDE_DIR "${PROTOBUF_INSTALL_DIR}/include" CACHE PATH "protobuf include directory." FORCE)
+
+INCLUDE_DIRECTORIES(${PROTOBUF_INCLUDE_DIR})
+
+IF(WIN32)
+  SET(PROTOBUF_LITE_LIBRARY
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf-lite.lib" CACHE FILEPATH "protobuf lite library." FORCE)
+  SET(PROTOBUF_LIBRARY
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotobuf.lib" CACHE FILEPATH "protobuf library." FORCE)
+  SET(PROTOBUF_PROTOC_LIBRARY
+        "${PROTOBUF_INSTALL_DIR}/lib/libprotoc.lib" CACHE FILEPATH "protoc library." FORCE)
+  SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc.exe" CACHE FILEPATH "protobuf executable." FORCE)
+ELSE(WIN32)
+  IF("${HOST_SYSTEM}" STREQUAL "centos")
+    SET(LIB "lib64")
+  ELSE()
+    SET(LIB "lib")
+  ENDIF()
+  SET(PROTOBUF_LITE_LIBRARY
+        "${PROTOBUF_INSTALL_DIR}/${LIB}/libprotobuf-lite.a" CACHE FILEPATH "protobuf lite library." FORCE)
+  SET(PROTOBUF_LIBRARY
+        "${PROTOBUF_INSTALL_DIR}/${LIB}/libprotobuf.a" CACHE FILEPATH "protobuf library." FORCE)
+  SET(PROTOBUF_PROTOC_LIBRARY
+        "${PROTOBUF_INSTALL_DIR}/${LIB}/libprotoc.a" CACHE FILEPATH "protoc library." FORCE)
+  SET(PROTOBUF_PROTOC_EXECUTABLE "${PROTOBUF_INSTALL_DIR}/bin/protoc" CACHE FILEPATH "protobuf executable." FORCE)
+ENDIF(WIN32)
+
+ExternalProject_Add(
+  protobuf
+  ${EXTERNAL_PROJECT_LOG_ARGS}
+  PREFIX          ${PROTOBUF_SOURCES_DIR}
+  UPDATE_COMMAND  ""
+  DEPENDS         zlib
+  GIT_REPOSITORY  "https://github.com/google/protobuf.git"
+  GIT_TAG         "9f75c5aa851cd877fb0d93ccc31b8567a6706546"
+  CONFIGURE_COMMAND
+    ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake
+    -Dprotobuf_BUILD_TESTS=OFF
+    -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+    -DCMAKE_BUILD_TYPE=Release
+    -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
+)
+
+LIST(APPEND external_project_dependencies protobuf)
diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..e4c570479f682e951413017b256a8e16dfce625b
--- /dev/null
+++ b/cmake/external/python.cmake
@@ -0,0 +1,204 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+
+##################################### PYTHON ########################################
+SET(PYTHON_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/python)
+SET(PYTHON_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/python)
+SET(_python_DIR ${PYTHON_INSTALL_DIR})
+
+IF(UNIX)
+    SET(PYTHON_FOUND ON)
+    SET(PYTHON_INCLUDE_DIR "${PYTHON_INSTALL_DIR}/include/python2.7" CACHE PATH "Python include dir" FORCE)
+    SET(PYTHON_LIBRARIES "${PYTHON_INSTALL_DIR}/lib/libpython2.7.a" CACHE FILEPATH "Python library" FORCE)
+    SET(PYTHON_EXECUTABLE ${PYTHON_INSTALL_DIR}/bin/python CACHE FILEPATH "Python executable" FORCE)
+    SET(PY_SITE_PACKAGES_PATH "${PYTHON_INSTALL_DIR}/lib/python2.7/site-packages" CACHE PATH "Python site-packages path" FORCE)
+ELSEIF(WIN32)
+    SET(PYTHON_FOUND ON)
+    SET(PYTHON_INCLUDE_DIR "${PYTHON_INSTALL_DIR}/include" CACHE PATH "Python include dir" FORCE)
+    SET(PYTHON_LIBRARIES "${PYTHON_INSTALL_DIR}/libs/python27.lib" CACHE FILEPATH "Python library" FORCE)
+    SET(PYTHON_EXECUTABLE "${PYTHON_INSTALL_DIR}/bin/python.exe" CACHE FILEPATH "Python executable" FORCE)
+    SET(PY_SITE_PACKAGES_PATH "${PYTHON_INSTALL_DIR}/Lib/site-packages" CACHE PATH "Python site-packages path" FORCE)
+ELSE()
+    MESSAGE(FATAL_ERROR "Unknown system !")
+ENDIF()
+
+SET(py_env
+    PATH=${PYTHON_INSTALL_DIR}/bin/:$ENV{PATH}
+    PYTHONHOME=${PYTHON_INSTALL_DIR}
+    PYTHONPATH=${PYTHON_INSTALL_DIR}/lib:${PYTHON_INSTALL_DIR}/lib/python2.7:${PY_SITE_PACKAGES_PATH})
+
+INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR})
+
+IF(APPLE)
+    LIST(APPEND EXTERNAL_PROJECT_OPTIONAL_CMAKE_ARGS
+        -DCMAKE_BUILD_WITH_INSTALL_RPATH:BOOL=ON
+        )
+ENDIF()
+
+SET(EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS)
+
+# Force Python build to "Release".
+IF(CMAKE_CONFIGURATION_TYPES)  # set only by multi-config generators
+    SET(SAVED_CMAKE_CFG_INTDIR ${CMAKE_CFG_INTDIR})  # NOTE(review): never restored in this file - confirm intended
+    SET(CMAKE_CFG_INTDIR "Release")
+ELSE()
+    LIST(APPEND EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS  # later forwarded to the python project as cache args
+        -DCMAKE_BUILD_TYPE:STRING=Release
+        )
+ENDIF()
+
+ExternalProject_Add(python
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY    "https://github.com/python-cmake-buildsystem/python-cmake-buildsystem.git"
+    PREFIX            ${PYTHON_SOURCES_DIR}
+    UPDATE_COMMAND    ""
+    CMAKE_ARGS        -DPYTHON_VERSION=2.7.12
+    CMAKE_ARGS        -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+    CMAKE_ARGS        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+    CMAKE_CACHE_ARGS
+        -DCMAKE_INSTALL_PREFIX:PATH=${PYTHON_INSTALL_DIR}
+        -DBUILD_LIBPYTHON_SHARED:BOOL=OFF
+        -DUSE_SYSTEM_LIBRARIES:BOOL=OFF
+        -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}
+        -DZLIB_INCLUDE_DIR:PATH=${ZLIB_INCLUDE_DIR}
+        -DZLIB_LIBRARY:FILEPATH=${ZLIB_LIBRARIES}
+        -DDOWNLOAD_SOURCES:BOOL=ON
+        -DINSTALL_WINDOWS_TRADITIONAL:BOOL=OFF
+        ${EXTERNAL_PROJECT_OPTIONAL_CMAKE_CACHE_ARGS}
+        ${EXTERNAL_PROJECT_OPTIONAL_CMAKE_ARGS}
+    DEPENDS zlib
+)
+####################################################################################
+
+##################################### SETUPTOOLS ###################################
+SET(SETUPTOOLS_SOURCES_DIR ${PYTHON_SOURCES_DIR}/setuptools)
+ExternalProject_Add(setuptools
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX              ${SETUPTOOLS_SOURCES_DIR}
+    URL                 "https://pypi.python.org/packages/source/s/setuptools/setuptools-18.3.2.tar.gz"
+    BUILD_IN_SOURCE     1
+    PATCH_COMMAND       ""
+    UPDATE_COMMAND      ""
+    CONFIGURE_COMMAND   ""
+    INSTALL_COMMAND     ""
+    BUILD_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+    DEPENDS             python zlib
+)
+#####################################################################################
+
+##################################### SIX ###########################################
+SET(SIX_SOURCES_DIR ${PYTHON_SOURCES_DIR}/six)
+ExternalProject_Add(six
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX              ${SIX_SOURCES_DIR}
+    URL                 https://pypi.python.org/packages/source/s/six/six-1.10.0.tar.gz
+    BUILD_IN_SOURCE     1
+    PATCH_COMMAND       ""
+    UPDATE_COMMAND      ""
+    CONFIGURE_COMMAND   ""
+    INSTALL_COMMAND     ""
+    BUILD_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+    DEPENDS             python setuptools
+)
+#####################################################################################
+
+##################################### CYTHON ########################################
+SET(CYTHON_SOURCES_DIR ${PYTHON_SOURCES_DIR}/cython)
+ExternalProject_Add(cython
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX                ${CYTHON_SOURCES_DIR}
+    URL                   https://github.com/cython/cython/archive/0.25.2.tar.gz
+    GIT_TAG               0.25.2
+    BUILD_IN_SOURCE       1
+    CONFIGURE_COMMAND     ""
+    PATCH_COMMAND         ""
+    UPDATE_COMMAND        ""
+    INSTALL_COMMAND       ""
+    BUILD_COMMAND         env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+    DEPENDS               python
+)
+####################################################################################
+
+##################################### NUMPY ########################################
+SET(NUMPY_SOURCES_DIR ${PYTHON_SOURCES_DIR}/numpy)
+SET(NUMPY_TAG_VERSION "v1.11.3")
+SET(NUMPY_VERSION "1.11.3")
+
+SET(EGG_NAME "")
+SET(PYTHON_NUMPY_INCLUDE_DIR "")
+IF(WIN32)
+    SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-${HOST_SYSTEM}.egg")
+ELSE(WIN32)
+    IF(APPLE)
+        SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-${HOST_SYSTEM}-${MACOS_VERSION}")
+    ELSE(APPLE)
+        # Every other non-Windows host uses the generic "linux" egg prefix.
+        SET(EGG_NAME "numpy-${NUMPY_VERSION}-py2.7-linux")
+    ENDIF(APPLE)
+
+    FOREACH(suffix x86_64 intel fat64 fat32 universal)  # one candidate include dir per egg architecture suffix
+        LIST(APPEND PYTHON_NUMPY_INCLUDE_DIR ${PY_SITE_PACKAGES_PATH}/${EGG_NAME}-${suffix}.egg/numpy/core/include)
+    ENDFOREACH()
+ENDIF(WIN32)
+
+INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR})
+
+ExternalProject_Add(numpy
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY      https://github.com/numpy/numpy.git
+    GIT_TAG             ${NUMPY_TAG_VERSION}
+    CONFIGURE_COMMAND   ""
+    UPDATE_COMMAND      ""
+    PREFIX              ${NUMPY_SOURCES_DIR}
+    BUILD_COMMAND       env ${py_env} ${PYTHON_EXECUTABLE} setup.py build
+    INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+    BUILD_IN_SOURCE     1
+    DEPENDS             python setuptools cython
+)
+####################################################################################
+
+##################################### WHEEL ########################################
+SET(WHEEL_SOURCES_DIR ${PYTHON_SOURCES_DIR}/wheel)
+ExternalProject_Add(wheel
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    URL                 https://pypi.python.org/packages/source/w/wheel/wheel-0.29.0.tar.gz
+    PREFIX              ${WHEEL_SOURCES_DIR}
+    CONFIGURE_COMMAND   ""
+    UPDATE_COMMAND      ""
+    BUILD_COMMAND       ""
+    INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+    BUILD_IN_SOURCE     1
+    DEPENDS             python setuptools
+)
+####################################################################################
+
+################################### PROTOBUF #######################################
+SET(PY_PROTOBUF_SOURCES_DIR ${PYTHON_SOURCES_DIR}/protobuf)
+ExternalProject_Add(python-protobuf  # protobuf Python package, built and installed with the bundled interpreter
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    URL                   https://pypi.python.org/packages/e0/b0/0a1b364fe8a7d177b4b7d4dca5b798500dc57a7273b93cca73931b305a6a/protobuf-3.1.0.post1.tar.gz
+    URL_MD5               38b5fb160c768d2f8444d0c6d637ff91
+    PREFIX                ${PY_PROTOBUF_SOURCES_DIR}
+    BUILD_IN_SOURCE       1
+    PATCH_COMMAND         ""
+    CONFIGURE_COMMAND     ""  # below: PATH= comes after py_env so py_env's own PATH= entry cannot drop the protoc dir
+    BUILD_COMMAND         env ${py_env} PATH=${PROTOBUF_INSTALL_DIR}/bin:${PYTHON_INSTALL_DIR}/bin/:$ENV{PATH} ${PYTHON_EXECUTABLE} setup.py build
+    INSTALL_COMMAND       env ${py_env} PATH=${PROTOBUF_INSTALL_DIR}/bin:${PYTHON_INSTALL_DIR}/bin/:$ENV{PATH} ${PYTHON_EXECUTABLE} setup.py install
+    DEPENDS               python setuptools six
+)
+
+LIST(APPEND external_project_dependencies python setuptools six cython numpy wheel python-protobuf)
diff --git a/cmake/external/swig.cmake b/cmake/external/swig.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..40088c65ef7166ddef52956a1a7470ccab8087c9
--- /dev/null
+++ b/cmake/external/swig.cmake
@@ -0,0 +1,74 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FIND_PACKAGE(SWIG)
+
+IF(NOT SWIG_FOUND)
+    # build swig as an external project
+    INCLUDE(ExternalProject)
+
+    SET(SWIG_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/swig)
+    SET(SWIG_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/swig)
+    SET(SWIG_TARGET_VERSION "3.0.2")
+    SET(SWIG_DOWNLOAD_SRC_MD5 "62f9b0d010cef36a13a010dc530d0d41")
+    SET(SWIG_DOWNLOAD_WIN_MD5 "3f18de4fc09ab9abb0d3be37c11fbc8f")
+
+    IF(WIN32)
+        # swig.exe available as pre-built binary on Windows:
+        ExternalProject_Add(swig
+            URL                 http://prdownloads.sourceforge.net/swig/swigwin-${SWIG_TARGET_VERSION}.zip
+            URL_MD5             ${SWIG_DOWNLOAD_WIN_MD5}
+            SOURCE_DIR          ${SWIG_SOURCES_DIR}
+            CONFIGURE_COMMAND   ""
+            BUILD_COMMAND       ""
+            INSTALL_COMMAND     ""
+            UPDATE_COMMAND      ""
+        )
+        SET(SWIG_DIR ${SWIG_SOURCES_DIR} CACHE FILEPATH "SWIG Directory" FORCE)
+        SET(SWIG_EXECUTABLE ${SWIG_SOURCES_DIR}/swig.exe  CACHE FILEPATH "SWIG Executable" FORCE)
+    ELSE(WIN32)
+        # From PCRE configure
+        ExternalProject_Add(pcre
+            ${EXTERNAL_PROJECT_LOG_ARGS}
+            GIT_REPOSITORY https://github.com/svn2github/pcre.git
+            PREFIX ${SWIG_SOURCES_DIR}/pcre
+            CMAKE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SWIG_INSTALL_DIR}/pcre
+        )
+
+        # swig uses bison find it by cmake and pass it down
+        FIND_PACKAGE(BISON)
+
+        # From SWIG configure
+        ExternalProject_Add(swig
+            GIT_REPOSITORY      https://github.com/swig/swig.git
+            GIT_TAG             rel-3.0.10  # keep in sync with the version in SWIG_DIR below
+            PREFIX              ${SWIG_SOURCES_DIR}
+            CONFIGURE_COMMAND   cd ${SWIG_SOURCES_DIR}/src/swig && ./autogen.sh &&
+            # Single chained command: autogen.sh first, then configure against the static pcre installed by `pcre`.
+            env "PCRE_LIBS=${SWIG_INSTALL_DIR}/pcre/lib/libpcre.a ${SWIG_INSTALL_DIR}/pcre/lib/libpcrecpp.a ${SWIG_INSTALL_DIR}/pcre/lib/libpcreposix.a"
+            ./configure
+                --prefix=${SWIG_INSTALL_DIR}
+                --with-pcre-prefix=${SWIG_INSTALL_DIR}/pcre
+            BUILD_COMMAND   cd ${SWIG_SOURCES_DIR}/src/swig && make
+            INSTALL_COMMAND cd ${SWIG_SOURCES_DIR}/src/swig && make install
+            UPDATE_COMMAND  ""
+            DEPENDS pcre
+        )
+
+        SET(SWIG_DIR ${SWIG_INSTALL_DIR}/share/swig/3.0.10)  # version of the rel-3.0.10 checkout, not SWIG_TARGET_VERSION
+        SET(SWIG_EXECUTABLE ${SWIG_INSTALL_DIR}/bin/swig)
+    ENDIF(WIN32)
+
+    LIST(APPEND external_project_dependencies swig)
+ENDIF(NOT SWIG_FOUND)
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..d90768b6f1576e6d469d91d694ae0b9d1c7e8384
--- /dev/null
+++ b/cmake/external/warpctc.cmake
@@ -0,0 +1,58 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+SET(WARPCTC_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/warpctc)
+SET(WARPCTC_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/warpctc)
+SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" CACHE PATH "Warp-ctc Directory" FORCE)
+
+INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
+
+SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib" CACHE PATH "Warp-ctc Library Directory" FORCE)
+
+IF(WIN32)
+    SET(WARPCTC_LIBRARIES
+        "${WARPCTC_INSTALL_DIR}/lib/warpctc.dll" CACHE FILEPATH "Warp-ctc Library" FORCE)
+ELSE(WIN32)
+    IF(APPLE)
+        SET(_warpctc_SHARED_SUFFIX dylib)
+    ELSE(APPLE)
+        SET(_warpctc_SHARED_SUFFIX so)
+    ENDIF(APPLE)
+
+    SET(WARPCTC_LIBRARIES
+        "${WARPCTC_INSTALL_DIR}/lib/libwarpctc.${_warpctc_SHARED_SUFFIX}" CACHE FILEPATH "Warp-ctc Library" FORCE)
+ENDIF(WIN32)
+
+IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" )  # OpenMP off for both Clang IDs
+    SET(USE_OMP OFF)  # forwarded to the warp-ctc build as -DWITH_OMP
+ELSE()
+    SET(USE_OMP ON)
+ENDIF()
+
+ExternalProject_Add(
+    warpctc
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/gangliao/warp-ctc.git"
+    PREFIX          ${WARPCTC_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+    CMAKE_ARGS      -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR}
+    CMAKE_ARGS      -DWITH_GPU=${WITH_GPU}
+    CMAKE_ARGS      -DWITH_OMP=${USE_OMP}
+)
+
+LIST(APPEND external_project_dependencies warpctc)
diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..916f6816aae9938aad95ac527cf07ffbe38f7479
--- /dev/null
+++ b/cmake/external/zlib.cmake
@@ -0,0 +1,43 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+INCLUDE(ExternalProject)
+
+SET(ZLIB_SOURCES_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/zlib)  # ExternalProject PREFIX
+SET(ZLIB_INSTALL_DIR ${CMAKE_CURRENT_SOURCE_DIR}/third_party/install/zlib)  # CMAKE_INSTALL_PREFIX for zlib
+SET(ZLIB_ROOT ${ZLIB_INSTALL_DIR} CACHE PATH "zlib root directory." FORCE)  # a directory, hence PATH not FILEPATH
+SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include directory." FORCE)
+
+IF(WIN32)
+  SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE)
+ELSE(WIN32)
+  SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE)
+ENDIF(WIN32)
+
+INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR})
+
+ExternalProject_Add(
+    zlib
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY  "https://github.com/madler/zlib.git"
+    GIT_TAG         "v1.2.8"
+    PREFIX          ${ZLIB_SOURCES_DIR}
+    UPDATE_COMMAND  ""
+    CMAKE_ARGS      -DCMAKE_INSTALL_PREFIX=${ZLIB_INSTALL_DIR}
+    CMAKE_ARGS      -DBUILD_SHARED_LIBS=OFF
+    CMAKE_ARGS      -DCMAKE_POSITION_INDEPENDENT_CODE=ON
+    CMAKE_ARGS      -DCMAKE_MACOSX_RPATH=ON
+)
+
+LIST(APPEND external_project_dependencies zlib)
diff --git a/cmake/FindPythonModule.cmake b/cmake/python_module.cmake
similarity index 100%
rename from cmake/FindPythonModule.cmake
rename to cmake/python_module.cmake
diff --git a/cmake/rdma.cmake b/cmake/rdma.cmake
index e9a4da79aa92a92aa7e5d21bb795ab9aaf60ab8b..9ff1a77cac74fb1bdfe470a78d225ed1767bb1b5 100644
--- a/cmake/rdma.cmake
+++ b/cmake/rdma.cmake
@@ -5,72 +5,76 @@
 # svn co https://svn.baidu.com/sys/ip/trunk/rdma/thirdparty rdma/
 # we use static output in svn repositories to avoid implict bugs from not standard runtime env.
 
-set(RDMA_ROOT $ENV{RDMA_ROOT} CACHE PATH "Folder contains RDMA sock library and thirdparty library")
+if(WITH_RDMA)
+  set(RDMA_ROOT $ENV{RDMA_ROOT} CACHE PATH "Folder contains RDMA sock library and thirdparty library")
 
-function(generate_rdma_links)
-  #redirect to current DIR to isolate the pollution from system runtime environment
-  #it can benifits unified control for different gcc environment. 
-  #e.g, by default gcc48 did not refer /usr/lib64 which could contain low version
-  #runtime libraries that will crash process while loading it. That redirect trick
-  #can fix it.
-  execute_process(
-    COMMAND mkdir -p librdma
-    COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so.1
-    COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so
-    COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so.1
-    COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so 
-    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
-  )
-endfunction(generate_rdma_links)
-
-
-#check and set headers
-find_path(RDMA_INC_SXISOCK sxi_sock.h PATHS ${RDMA_ROOT}/sockrdmav1/output/include)
-find_path(RDMA_INC_XIO libxio.h PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
-find_path(RDMA_INC_EVENT event2 PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-find_path(RDMA_INC_NUMA numa.h PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
-
-#check and set libs
-find_library(RDMA_LIB_SXISOCK NAMES sxisock PATHS ${RDMA_ROOT}/sockrdmav1/output)
-find_library(RDMA_LIB_XIO NAMES xio PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
-find_library(RDMA_LIB_EVENT NAMES event PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-find_library(RDMA_LIB_EVENT_CORE NAMES event_core PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-find_library(RDMA_LIB_EVENT_EXTRA NAMES event_extra PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-find_library(RDMA_LIB_EVENT_PTHREADS NAMES event_pthreads PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
-find_library(RDMA_LIB_NUMA NAMES numa PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
-
-if(
-    RDMA_INC_SXISOCK AND
-    RDMA_INC_XIO AND
-    RDMA_INC_EVENT AND
-    RDMA_INC_NUMA AND
-    RDMA_LIB_SXISOCK AND 
-    RDMA_LIB_XIO AND
-    RDMA_LIB_EVENT AND
-    RDMA_LIB_EVENT_CORE AND
-    RDMA_LIB_EVENT_EXTRA AND
-    RDMA_LIB_EVENT_PTHREADS AND
-    RDMA_LIB_NUMA
+  function(generate_rdma_links)
+    #redirect to current DIR to isolate the pollution from system runtime environment
+    #it can benifits unified control for different gcc environment. 
+    #e.g, by default gcc48 did not refer /usr/lib64 which could contain low version
+    #runtime libraries that will crash process while loading it. That redirect trick
+    #can fix it.
+    execute_process(
+      COMMAND mkdir -p librdma
+      COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so.1
+      COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so
+      COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so.1
+      COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so 
+      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
     )
+  endfunction(generate_rdma_links)
 
-  set(RDMA_INC_DIR 
-    ${RDMA_INC_SXISOCK} 
-    ${RDMA_INC_XIO}
-    ${RDMA_INC_EVENT}
-    ${RDMA_INC_NUMA})
-  set(RDMA_LIBS  
-    ${RDMA_LIB_SXISOCK} 
-    ${RDMA_LIB_XIO} 
-    ${RDMA_LIB_EVENT} 
-    ${RDMA_LIB_EVENT_CORE} 
-    ${RDMA_LIB_EVENT_EXTRA} 
-    ${RDMA_LIB_EVENT_PTHREADS} 
-    ${RDMA_LIB_NUMA} 
-    )
-  set(RDMA_LD_FLAGS "-L./librdma -libverbs -lrdmacm -Xlinker -rpath ./librdma")
-  return()
-endif()
+  #check and set headers
+  find_path(RDMA_INC_SXISOCK sxi_sock.h PATHS ${RDMA_ROOT}/sockrdmav1/output/include)
+  find_path(RDMA_INC_XIO libxio.h PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
+  find_path(RDMA_INC_EVENT event2 PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
+  find_path(RDMA_INC_NUMA numa.h PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
+
+  #check and set libs
+  find_library(RDMA_LIB_SXISOCK NAMES sxisock PATHS ${RDMA_ROOT}/sockrdmav1/output)
+  find_library(RDMA_LIB_XIO NAMES xio PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
+  find_library(RDMA_LIB_EVENT NAMES event PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
+  find_library(RDMA_LIB_EVENT_CORE NAMES event_core PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
+  find_library(RDMA_LIB_EVENT_EXTRA NAMES event_extra PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
+  find_library(RDMA_LIB_EVENT_PTHREADS NAMES event_pthreads PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
+  find_library(RDMA_LIB_NUMA NAMES numa PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
 
-#if this module is not called, RDMA_INC_DIR RDMA_LIBS will be null, so top module always refer this variable
+  if(
+      RDMA_INC_SXISOCK AND
+      RDMA_INC_XIO AND
+      RDMA_INC_EVENT AND
+      RDMA_INC_NUMA AND
+      RDMA_LIB_SXISOCK AND 
+      RDMA_LIB_XIO AND
+      RDMA_LIB_EVENT AND
+      RDMA_LIB_EVENT_CORE AND
+      RDMA_LIB_EVENT_EXTRA AND
+      RDMA_LIB_EVENT_PTHREADS AND
+      RDMA_LIB_NUMA
+      )
 
-message(FATAL_ERROR, "RDMA libraries are not found, try to set RDMA_ROOT or check all related libraries.")
+    set(RDMA_INC_DIR 
+      ${RDMA_INC_SXISOCK} 
+      ${RDMA_INC_XIO}
+      ${RDMA_INC_EVENT}
+      ${RDMA_INC_NUMA})
+    set(RDMA_LIBS  
+      ${RDMA_LIB_SXISOCK} 
+      ${RDMA_LIB_XIO} 
+      ${RDMA_LIB_EVENT} 
+      ${RDMA_LIB_EVENT_CORE} 
+      ${RDMA_LIB_EVENT_EXTRA} 
+      ${RDMA_LIB_EVENT_PTHREADS} 
+      ${RDMA_LIB_NUMA} 
+      )
+    set(RDMA_LD_FLAGS "-L./librdma -libverbs -lrdmacm -Xlinker -rpath ./librdma")
+    include_directories("${RDMA_INC_DIR}")
+  else()
+    # WITH_RDMA is ON but at least one required RDMA header or library was not found: abort configuration.
+    message(FATAL_ERROR "RDMA libraries are not found, try to set RDMA_ROOT or check all related libraries.")
+  endif()
+else(WITH_RDMA)
+  set(RDMA_LIBS "")
+  set(RDMA_LD_FLAGS "")
+  add_definitions(-DPADDLE_DISABLE_RDMA)
+endif(WITH_RDMA)
diff --git a/cmake/FindAVX.cmake b/cmake/simd.cmake
similarity index 100%
rename from cmake/FindAVX.cmake
rename to cmake/simd.cmake
diff --git a/cmake/swig.cmake b/cmake/swig.cmake
deleted file mode 100644
index 97e87aa947791e2c5a88e7e554dec43bcd661664..0000000000000000000000000000000000000000
--- a/cmake/swig.cmake
+++ /dev/null
@@ -1,15 +0,0 @@
-function(generate_python_api target_name)
-    add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
-                              ${PROJ_ROOT}/paddle/Paddle_wrap.cxx
-                              ${PROJ_ROOT}/paddle/Paddle_wrap.h
-        COMMAND swig -python -c++ -outcurrentdir -I../ api/Paddle.swig
-                && mv ${PROJ_ROOT}/paddle/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
-        DEPENDS ${PROJ_ROOT}/paddle/api/Paddle.swig
-                ${PROJ_ROOT}/paddle/api/PaddleAPI.h
-        WORKING_DIRECTORY ${PROJ_ROOT}/paddle
-        COMMENT "Generate Python API from swig")
-    add_custom_target(${target_name} ALL DEPENDS
-                ${PROJ_ROOT}/paddle/Paddle_wrap.cxx
-                ${PROJ_ROOT}/paddle/Paddle_wrap.h
-                ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py)
-endfunction(generate_python_api)
diff --git a/cmake/system.cmake b/cmake/system.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..788db404ebfb6facbaedf2910186f3b1afe775c1
--- /dev/null
+++ b/cmake/system.cmake
@@ -0,0 +1,53 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+# 
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+# 
+# http://www.apache.org/licenses/LICENSE-2.0
+# 
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+IF(WIN32)
+    SET(HOST_SYSTEM "win32")
+ELSE(WIN32)
+    IF(APPLE)
+        EXECUTE_PROCESS(COMMAND sw_vers -productVersion OUTPUT_VARIABLE MACOSX_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE)
+        STRING(REGEX MATCH "[0-9]+\\.[0-9]+" VERSION "${MACOSX_VERSION}")  # keep only the leading major.minor
+        SET(MACOS_VERSION ${VERSION})
+        SET(HOST_SYSTEM "macosx")
+    ELSE(APPLE)
+        IF(EXISTS "/etc/issue")
+            FILE(READ "/etc/issue" LINUX_ISSUE)
+            IF(LINUX_ISSUE MATCHES "CentOS")
+                SET(HOST_SYSTEM "centos")
+            ELSEIF(LINUX_ISSUE MATCHES "Debian")
+                SET(HOST_SYSTEM "debian")
+            ELSEIF(LINUX_ISSUE MATCHES "Ubuntu")
+                SET(HOST_SYSTEM "ubuntu")
+            ENDIF()
+        ENDIF(EXISTS "/etc/issue")
+    ENDIF(APPLE)
+ENDIF(WIN32)
+
+# query number of logical cores
+CMAKE_HOST_SYSTEM_INFORMATION(RESULT CPU_CORES QUERY NUMBER_OF_LOGICAL_CORES)
+
+MARK_AS_ADVANCED(HOST_SYSTEM CPU_CORES)
+
+MESSAGE(STATUS "Found Paddle host system: ${HOST_SYSTEM}")
+MESSAGE(STATUS "Found Paddle host system's CPU: ${CPU_CORES} cores")
+
+# external dependencies log output
+SET(EXTERNAL_PROJECT_LOG_ARGS
+    LOG_DOWNLOAD    0     # Disabled: the download step is not wrapped in a logging script
+    LOG_UPDATE      1     # Wrap update in script to log output
+    LOG_CONFIGURE   1     # Wrap configure in script to log output
+    LOG_BUILD       1     # Wrap build in script to log output
+    LOG_TEST        1     # Wrap test in script to log output
+    LOG_INSTALL     1     # Wrap install in script to log output
+)
diff --git a/cmake/util.cmake b/cmake/util.cmake
index 43a56378df0094200d3c7c95a704c27222654708..a19bf2a7998ed7772a66f6a7eb5f9e858b0e75a2 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -24,7 +24,7 @@ function(target_circle_link_libraries TARGET_NAME)
                 list(APPEND libsInArgn ${arg})
             endif()
         endforeach()
-        if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
+        if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
             list(APPEND LIBS "-undefined dynamic_lookup")
         endif()
         list(REVERSE libsInArgn)
@@ -81,18 +81,6 @@ function(link_paddle_exe TARGET_NAME)
         set(METRIC_LIBS "")
     endif()
 
-    if(PADDLE_WITH_INTERNAL)
-        set(INTERAL_LIBS paddle_internal_gserver paddle_internal_parameter)
-        target_circle_link_libraries(${TARGET_NAME}
-            ARCHIVE_START
-            paddle_internal_gserver
-            paddle_internal_owlqn
-            ARCHIVE_END
-            paddle_internal_parameter)
-    else()
-        set(INTERAL_LIBS "")
-    endif()
-
     target_circle_link_libraries(${TARGET_NAME}
         ARCHIVE_START
         paddle_gserver
@@ -108,24 +96,15 @@ function(link_paddle_exe TARGET_NAME)
         paddle_proto
         paddle_cuda
         ${METRIC_LIBS}
-        ${PROTOBUF_LIBRARY}
-        ${LIBGLOG_LIBRARY}
-        ${GFLAGS_LIBRARIES}
+        ${EXTERNAL_LIBS}
         ${CMAKE_THREAD_LIBS_INIT}
-        ${CBLAS_LIBS}
-        ${ZLIB_LIBRARIES}
-        ${INTERAL_LIBS}
-        ${CMAKE_DL_LIBS})
-
-    if(WITH_RDMA)
-        target_link_libraries(${TARGET_NAME}
-            ${RDMA_LD_FLAGS}
-            ${RDMA_LIBS})
-    endif()
+        ${CMAKE_DL_LIBS}
+        ${RDMA_LD_FLAGS}
+        ${RDMA_LIBS})
 
     if(WITH_PYTHON)
         target_link_libraries(${TARGET_NAME}
-            ${PYTHON_LIBRARIES})
+            ${PYTHON_LIBRARIES} util)
     endif()
 
     if(WITH_GPU)
@@ -141,11 +120,7 @@ function(link_paddle_exe TARGET_NAME)
             target_link_libraries(${TARGET_NAME} rt)
         endif()
     endif()
-
-    if(NOT WITH_DSO)
-        target_link_libraries(${TARGET_NAME}
-            ${WARPCTC_LIBRARY})
-    endif()
+    add_dependencies(${TARGET_NAME} ${external_project_dependencies})
 endfunction()
 
 # link_paddle_test
@@ -156,6 +131,7 @@ function(link_paddle_test TARGET_NAME)
     link_paddle_exe(${TARGET_NAME})
     target_link_libraries(${TARGET_NAME}
                           paddle_test_main
+                          paddle_test_util
                           ${GTEST_LIBRARIES})
 endfunction()
 
diff --git a/cmake/version.cmake b/cmake/version.cmake
index a0518e07e88a1ff468c301523f888c7d95e15185..ac1583a24c828629c46cb9cf4e965f8da2273732 100644
--- a/cmake/version.cmake
+++ b/cmake/version.cmake
@@ -21,4 +21,5 @@ while ("${PADDLE_VERSION}" STREQUAL "")
   endif()
 endwhile()
 
+add_definitions(-DPADDLE_VERSION=${PADDLE_VERSION})
 message(STATUS "Paddle version is ${PADDLE_VERSION}")
diff --git a/doc/howto/deep_model/rnn/index_cn.rst b/doc/howto/deep_model/rnn/index_cn.rst
index 9e805ca85191b793c8798a239927a318c70b96f5..9ecab5594cff47cde4700b7ce0f58013a960a16e 100644
--- a/doc/howto/deep_model/rnn/index_cn.rst
+++ b/doc/howto/deep_model/rnn/index_cn.rst
@@ -4,6 +4,7 @@ RNN相关模型
 ..  toctree::
   :maxdepth: 1
 
+  rnn_config_cn.rst
   recurrent_group_cn.md
   hierarchical_layer_cn.rst
   hrnn_rnn_api_compare_cn.rst
diff --git a/doc/howto/deep_model/rnn/rnn_cn.md b/doc/howto/deep_model/rnn/rnn_cn.md
deleted file mode 100644
index 5ec05b2cab9ba85f9f6e9644375ee14f647a413c..0000000000000000000000000000000000000000
--- a/doc/howto/deep_model/rnn/rnn_cn.md
+++ /dev/null
@@ -1,226 +0,0 @@
-RNN 配置
-=================
-
-本教程将指导你如何在 PaddlePaddle 中配置循环神经网络（RNN）。PaddlePaddle 高度支持灵活和高效的循环神经网络配置。 在本教程中，您将了解如何：
-
--   准备用来学习循环神经网络的序列数据。
--   配置循环神经网络架构。
--   使用学习完成的循环神经网络模型生成序列。
-
-我们将使用 vanilla 循环神经网络和 sequence to sequence 模型来指导你完成这些步骤。sequence to sequence 模型的代码可以在`demo / seqToseq`找到。
-
-准备序列数据
----------------------
-
-PaddlePaddle 不需要对序列数据进行任何预处理，例如填充。唯一需要做的是将相应类型设置为输入。例如，以下代码段定义了三个输入。 它们都是序列，它们的大小是`src_dict`，`trg_dict`和`trg_dict`：
-
-``` sourceCode
-settings.input_types = [
-  integer_value_sequence(len(settings.src_dict)),
-  integer_value_sequence(len(settings.trg_dict)),
-  integer_value_sequence(len(settings.trg_dict))]
-```
-
-在`process`函数中，每个`yield`函数将返回三个整数列表。每个整数列表被视为一个整数序列：
-
-``` sourceCode
-yield src_ids, trg_ids, trg_ids_next
-```
-
-有关如何编写数据提供程序的更多细节描述，请参考 [PyDataProvider2](../../ui/data_provider/index.html)。完整的数据提供文件在 `demo/seqToseq/dataprovider.py`。
-
-配置循环神经网络架构
------------------------------------------------
-
-### 简单门控循环神经网络(Gated Recurrent Neural Network)
-
-循环神经网络在每个时间步骤顺序地处理序列。下面列出了 LSTM 的架构的示例。
-
-![image](../../../tutorials/sentiment_analysis/bi_lstm.jpg)
-
-一般来说，循环网络从 *t* = 1 到 *t* = *T* 或者反向地从 *t* = *T* 到 *t* = 1 执行以下操作。
-
-*x*<sub>*t* + 1</sub> = *f*<sub>*x*</sub>(*x*<sub>*t*</sub>),*y*<sub>*t*</sub> = *f*<sub>*y*</sub>(*x*<sub>*t*</sub>)
-
-其中 *f*<sub>*x*</sub>(.) 称为**单步函数**（即单时间步执行的函数，step function），而 *f*<sub>*y*</sub>(.) 称为**输出函数**。在 vanilla 循环神经网络中，单步函数和输出函数都非常简单。然而，PaddlePaddle 可以通过修改这两个函数来实现复杂的网络配置。我们将使用 sequence to sequence 模型演示如何配置复杂的循环神经网络模型。在本节中，我们将使用简单的 vanilla 循环神经网络作为使用`recurrent_group`配置简单循环神经网络的例子。 注意，如果你只需要使用简单的RNN，GRU或LSTM，那么推荐使用`grumemory`和`lstmemory`，因为它们的计算效率比`recurrent_group`更高。
-
-对于 vanilla RNN，在每个时间步长，**单步函数**为：
-
-*x*<sub>*t* + 1</sub> = *W*<sub>*x*</sub>*x*<sub>*t*</sub> + *W*<sub>*i*</sub>*I*<sub>*t*</sub> + *b*
-
-其中 *x*<sub>*t*</sub> 是RNN状态，并且 *I*<sub>*t*</sub> 是输入，*W*<sub>*x*</sub> 和 *W*<sub>*i*</sub> 分别是RNN状态和输入的变换矩阵。*b* 是偏差。它的**输出函数**只需要*x*<sub>*t*</sub>作为输出。
-
-`recurrent_group`是构建循环神经网络的最重要的工具。 它定义了**单步函数**，**输出函数**和循环神经网络的输入。注意，这个函数的`step`参数需要实现`step function`（单步函数）和`output function`（输出函数）：
-
-
-``` sourceCode
-def simple_rnn(input,
-               size=None,
-               name=None,
-               reverse=False,
-               rnn_bias_attr=None,
-               act=None,
-               rnn_layer_attr=None):
-    def __rnn_step__(ipt):
-       out_mem = memory(name=name, size=size)
-       rnn_out = mixed_layer(input = [full_matrix_projection(ipt),
-                                      full_matrix_projection(out_mem)],
-                             name = name,
-                             bias_attr = rnn_bias_attr,
-                             act = act,
-                             layer_attr = rnn_layer_attr,
-                             size = size)
-       return rnn_out
-    return recurrent_group(name='%s_recurrent_group' % name,
-                           step=__rnn_step__,
-                           reverse=reverse,
-                           input=input)
-```
-
-PaddlePaddle 使用“Memory”（记忆模块）实现单步函数。**Memory**是在PaddlePaddle中构造循环神经网络时最重要的概念。 Memory是在单步函数中循环使用的状态，例如*x*<sub>*t* + 1</sub> = *f*<sub>*x*</sub>(*x*<sub>*t*</sub>)。 一个Memory包含**输出**和**输入**。当前时间步处的Memory的输出作为下一时间步Memory的输入。Memory也可以具有**boot layer(引导层)**，其输出被用作Memory的初始值。 在我们的例子中，门控循环单元的输出被用作输出Memory。请注意，`rnn_out`层的名称与`out_mem`的名称相同。这意味着`rnn_out` (*x*<sub>*t* + 1</sub>)的输出被用作`out_mem`Memory的**输出**。
-
-Memory也可以是序列。在这种情况下，在每个时间步中，我们有一个序列作为循环神经网络的状态。这在构造非常复杂的循环神经网络时是有用的。 其他高级功能包括定义多个Memory，以及使用子序列来定义分级循环神经网络架构。
-
-我们在函数的结尾返回`rnn_out`。 这意味着 `rnn_out` 层的输出被用作门控循环神经网络的**输出**函数。
-
-### Sequence to Sequence Model with Attention
-
-我们将使用 sequence to sequence model with attention 作为例子演示如何配置复杂的循环神经网络模型。该模型的说明如下图所示。
-
-![image](../../../tutorials/text_generation/encoder-decoder-attention-model.png)
-
-在这个模型中，源序列 *S* = {*s*<sub>1</sub>, …, *s*<sub>*T*</sub>} 用双向门控循环神经网络编码。双向门控循环神经网络的隐藏状态 *H*<sub>*S*</sub> = {*H*<sub>1</sub>, …, *H*<sub>*T*</sub>} 被称为 *编码向量*。解码器是门控循环神经网络。当解读每一个*y*<sub>*t*</sub>时, 这个门控循环神经网络生成一系列权重 *W*<sub>*S*</sub><sup>*t*</sup> = {*W*<sub>1</sub><sup>*t*</sup>, …, *W*<sub>*T*</sub><sup>*t*</sup>}, 用于计算编码向量的加权和。加权和用来生成*y*<sub>*t*</sub>。
-
-模型的编码器部分如下所示。它叫做`grumemory`来表示门控循环神经网络。如果网络架构简单，那么推荐使用循环神经网络的方法，因为它比 `recurrent_group` 更快。我们已经实现了大多数常用的循环神经网络架构，可以参考 [Layers](../../ui/api/trainer_config_helpers/layers_index.html) 了解更多细节。
-
-我们还将编码向量投射到 `decoder_size` 维空间。这通过获得反向循环网络的第一个实例，并将其投射到 `decoder_size` 维空间完成：
-
-``` sourceCode
-# 定义源语句的数据层
-src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
-# 计算每个词的词向量
-src_embedding = embedding_layer(
-    input=src_word_id,
-    size=word_vector_dim,
-    param_attr=ParamAttr(name='_source_language_embedding'))
-# 应用前向循环神经网络
-src_forward = grumemory(input=src_embedding, size=encoder_size)
-# 应用反向递归神经网络（reverse=True表示反向循环神经网络）
-src_backward = grumemory(input=src_embedding,
-                          size=encoder_size,
-                          reverse=True)
-# 将循环神经网络的前向和反向部分混合在一起
-encoded_vector = concat_layer(input=[src_forward, src_backward])
-
-# 投射编码向量到 decoder_size
-encoder_proj = mixed_layer(input = [full_matrix_projection(encoded_vector)],
-                           size = decoder_size)
-
-# 计算反向RNN的第一个实例
-backward_first = first_seq(input=src_backward)
-
-# 投射反向RNN的第一个实例到 decoder size
-decoder_boot = mixed_layer(input=[full_matrix_projection(backward_first)], size=decoder_size, act=TanhActivation())
-```
-
-解码器使用 `recurrent_group` 来定义循环神经网络。单步函数和输出函数在 `gru_decoder_with_attention` 中定义：
-
-``` sourceCode
-group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
-              StaticInput(input=encoded_proj,is_seq=True)]
-trg_embedding = embedding_layer(
-    input=data_layer(name='target_language_word',
-                     size=target_dict_dim),
-    size=word_vector_dim,
-    param_attr=ParamAttr(name='_target_language_embedding'))
-group_inputs.append(trg_embedding)
-
-# 对于配备有注意力机制的解码器，在训练中，
-# 目标向量（groudtruth）是数据输入，
-# 而源序列的编码向量可以被无边界的memory访问
-# StaticInput 意味着不同时间步的输入都是相同的值，
-# 否则它以一个序列输入，不同时间步的输入是不同的。
-# 所有输入序列应该有相同的长度。
-decoder = recurrent_group(name=decoder_group_name,
-                          step=gru_decoder_with_attention,
-                          input=group_inputs)
-```
-
-单步函数的实现如下所示。首先，它定义解码网络的**Memory**。然后定义 attention，门控循环单元单步函数和输出函数：
-
-``` sourceCode
-def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
-    # 定义解码器的Memory
-    # Memory的输出定义在 gru_step 内
-    # 注意 gru_step 应该与它的Memory名字相同
-    decoder_mem = memory(name='gru_decoder',
-                         size=decoder_size,
-                         boot_layer=decoder_boot)
-    # 计算 attention 加权编码向量
-    context = simple_attention(encoded_sequence=enc_vec,
-                               encoded_proj=enc_proj,
-                               decoder_state=decoder_mem)
-    # 混合当前词向量和attention加权编码向量
-    decoder_inputs = mixed_layer(inputs = [full_matrix_projection(context),
-                                           full_matrix_projection(current_word)],
-                                 size = decoder_size * 3)
-    # 定义门控循环单元循环神经网络单步函数
-    gru_step = gru_step_layer(name='gru_decoder',
-                              input=decoder_inputs,
-                              output_mem=decoder_mem,
-                              size=decoder_size)
-    # 定义输出函数
-    out = mixed_layer(input=[full_matrix_projection(input=gru_step)],
-                      size=target_dict_dim,
-                      bias_attr=True,
-                      act=SoftmaxActivation())
-    return out
-```
-
-生成序列
------------------
-
-训练模型后，我们可以使用它来生成序列。通常的做法是使用**beam search** 生成序列。以下代码片段定义 beam search 算法。注意，`beam_search` 函数假设 `step` 的输出函数返回的是下一个时刻输出词的 softmax 归一化概率向量。我们对模型进行了以下更改。
-
--   使用 `GeneratedInput` 来表示 trg\_embedding。 `GeneratedInput` 将上一时间步所生成的词的向量来作为当前时间步的输入。
--   使用 `beam_search` 函数。这个函数需要设置：
-    -   `bos_id`: 开始标记。每个句子都以开始标记开头。
-    -   `eos_id`: 结束标记。每个句子都以结束标记结尾。
-    -   `beam_size`: beam search 算法中的beam大小。
-    -   `max_length`: 生成序列的最大长度。
--   使用 `seqtext_printer_evaluator` 根据索引矩阵和字典打印文本。这个函数需要设置：
-    -   `id_input`: 数据的整数ID，用于标识生成的文件中的相应输出。
-    -   `dict_file`: 用于将词ID转换为词的字典文件。
-    -   `result_file`: 生成结果文件的路径。
-
-代码如下：
-
-``` sourceCode
-group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
-              StaticInput(input=encoded_proj,is_seq=True)]
-# 在生成时，解码器基于编码源序列和最后生成的目标词预测下一目标词。
-# 编码源序列（编码器输出）必须由只读Memory的 StaticInput 指定。
-# 这里， GeneratedInputs 自动获取上一个生成的词，并在最开始初始化为起始词，如 <s>。
-trg_embedding = GeneratedInput(
-    size=target_dict_dim,
-    embedding_name='_target_language_embedding',
-    embedding_size=word_vector_dim)
-group_inputs.append(trg_embedding)
-beam_gen = beam_search(name=decoder_group_name,
-                       step=gru_decoder_with_attention,
-                       input=group_inputs,
-                       bos_id=0, # Beginnning token.
-                       eos_id=1, # End of sentence token.
-                       beam_size=beam_size,
-                       max_length=max_length)
-
-seqtext_printer_evaluator(input=beam_gen,
-                          id_input=data_layer(name="sent_id", size=1),
-                          dict_file=trg_dict_path,
-                          result_file=gen_trans_file)
-outputs(beam_gen)
-```
-
-注意，这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务，请参阅 [Semantic Role Labeling Demo](../../demo/semantic_role_labeling/index.html) 了解更多详细信息。
-
-完整的配置文件在`demo/seqToseq/seqToseq_net.py`。
diff --git a/doc/howto/deep_model/rnn_config_cn.rst b/doc/howto/deep_model/rnn/rnn_config_cn.rst
similarity index 86%
rename from doc/howto/deep_model/rnn_config_cn.rst
rename to doc/howto/deep_model/rnn/rnn_config_cn.rst
index e6d8c1133a5e8a481c9bf5340c4641343804dcbe..ac2bd0775f4ab2e0a0c37462e2c23001123b152b 100644
--- a/doc/howto/deep_model/rnn_config_cn.rst
+++ b/doc/howto/deep_model/rnn/rnn_config_cn.rst
@@ -1,4 +1,4 @@
-RNN 配置
+RNN配置
 ========
 
 本教程将指导你如何在 PaddlePaddle
@@ -20,7 +20,7 @@ PaddlePaddle
 不需要对序列数据进行任何预处理，例如填充。唯一需要做的是将相应类型设置为输入。例如，以下代码段定义了三个输入。
 它们都是序列，它们的大小是\ ``src_dict``\ ，\ ``trg_dict``\ 和\ ``trg_dict``\ ：
 
-.. code:: sourcecode
+.. code:: python
 
     settings.input_types = [
       integer_value_sequence(len(settings.src_dict)),
@@ -29,12 +29,11 @@ PaddlePaddle
 
 在\ ``process``\ 函数中，每个\ ``yield``\ 函数将返回三个整数列表。每个整数列表被视为一个整数序列：
 
-.. code:: sourcecode
+.. code:: python
 
     yield src_ids, trg_ids, trg_ids_next
 
-有关如何编写数据提供程序的更多细节描述，请参考
-`PyDataProvider2 <../../ui/data_provider/index.html>`__\ 。完整的数据提供文件在
+有关如何编写数据提供程序的更多细节描述，请参考 :ref:`api_pydataprovider2` 。完整的数据提供文件在
 ``demo/seqToseq/dataprovider.py``\ 。
 
 配置循环神经网络架构
@@ -45,18 +44,17 @@ PaddlePaddle
 
 循环神经网络在每个时间步骤顺序地处理序列。下面列出了 LSTM 的架构的示例。
 
-.. figure:: ../../../tutorials/sentiment_analysis/bi_lstm.jpg
-   :alt: image
+.. image:: ../../../tutorials/sentiment_analysis/bi_lstm.jpg
+      :align: center
 
-   image
+一般来说，循环网络从 :math:`t=1` 到 :math:`t=T` 或者反向地从 :math:`t=T` 到 :math:`t=1` 执行以下操作。
 
-一般来说，循环网络从 *t* = 1 到 *t* = *T* 或者反向地从 *t* = *T* 到 *t*
-= 1 执行以下操作。
+.. math::
 
-*x*\ \ *t* + 1 = *f*\ \ *x*\ (*x*\ \ *t*\ ),\ *y*\ \ *t*\  = *f*\ \ *y*\ (*x*\ \ *t*\ )
+    x_{t+1} = f_x(x_t), y_t = f_y(x_t)
 
-其中 *f*\ \ *x*\ (.) 称为\ **单步函数**\ （即单时间步执行的函数，step
-function），而 *f*\ \ *y*\ (.) 称为\ **输出函数**\ 。在 vanilla
+其中 :math:`f_x(.)` 称为\ **单步函数**\ （即单时间步执行的函数，step
+function），而 :math:`f_y(.)` 称为\ **输出函数**\ 。在 vanilla
 循环神经网络中，单步函数和输出函数都非常简单。然而，PaddlePaddle
 可以通过修改这两个函数来实现复杂的网络配置。我们将使用 sequence to
 sequence
@@ -67,16 +65,17 @@ vanilla
 
 对于 vanilla RNN，在每个时间步长，\ **单步函数**\ 为：
 
-*x*\ \ *t* + 1 = *W*\ \ *x*\ \ *x*\ \ *t*\  + *W*\ \ *i*\ \ *I*\ \ *t*\  + *b*
+.. math::
 
-其中 *x*\ \ *t*\  是RNN状态，并且 *I*\ \ *t*\  是输入，\ *W*\ \ *x*\  和
-*W*\ \ *i*\  分别是RNN状态和输入的变换矩阵。\ *b*
-是偏差。它的\ **输出函数**\ 只需要\ *x*\ \ *t*\ 作为输出。
+    x_{t+1} = W_x x_t + W_i I_t + b
+
+其中 :math:`x_t` 是RNN状态，并且 :math:`I_t` 是输入，:math:`W_x` 和
+:math:`W_i` 分别是RNN状态和输入的变换矩阵。:math:`b` 是偏差。它的\ **输出函数**\ 只需要 :math:`x_t` 作为输出。
 
 ``recurrent_group``\ 是构建循环神经网络的最重要的工具。
 它定义了\ **单步函数**\ ，\ **输出函数**\ 和循环神经网络的输入。注意，这个函数的\ ``step``\ 参数需要实现\ ``step function``\ （单步函数）和\ ``output function``\ （输出函数）：
 
-.. code:: sourcecode
+.. code:: python
 
     def simple_rnn(input,
                    size=None,
@@ -102,7 +101,7 @@ vanilla
 
 PaddlePaddle
 使用“Memory”（记忆模块）实现单步函数。\ **Memory**\ 是在PaddlePaddle中构造循环神经网络时最重要的概念。
-Memory是在单步函数中循环使用的状态，例如\ *x*\ \ *t* + 1 = *f*\ \ *x*\ (*x*\ \ *t*\ )。
+Memory是在单步函数中循环使用的状态，例如 :math:`x_{t+1} = f_x(x_t)` 。
 一个Memory包含\ **输出**\ 和\ **输入**\ 。当前时间步处的Memory的输出作为下一时间步Memory的输入。Memory也可以具有\ **boot
 layer(引导层)**\ ，其输出被用作Memory的初始值。
 在我们的例子中，门控循环单元的输出被用作输出Memory。请注意，\ ``rnn_out``\ 层的名称与\ ``out_mem``\ 的名称相同。这意味着\ ``rnn_out``
@@ -120,30 +119,25 @@ Sequence to Sequence Model with Attention
 我们将使用 sequence to sequence model with attention
 作为例子演示如何配置复杂的循环神经网络模型。该模型的说明如下图所示。
 
-.. figure:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png
-   :alt: image
-
-   image
+.. image:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png
+      :align: center
 
-在这个模型中，源序列 *S* = {*s*\ 1, …, \ *s*\ \ *T*\ }
+在这个模型中，源序列 :math:`S = \{s_1, \dots, s_T\}` 
 用双向门控循环神经网络编码。双向门控循环神经网络的隐藏状态
-*H*\ \ *S*\  = {*H*\ 1, …, \ *H*\ \ *T*\ } 被称为
-*编码向量*\ 。解码器是门控循环神经网络。当解读每一个\ *y*\ \ *t*\ 时,
-这个门控循环神经网络生成一系列权重
-*W*\ \ *S*\ \ *t*\  = {*W*\ 1\ *t*\ , …, \ *W*\ \ *T*\ \ *t*\ },
-用于计算编码向量的加权和。加权和用来生成\ *y*\ \ *t*\ 。
+:math:`H_S = \{H_1, \dots, H_T\}` 被称为
+*编码向量*\ 。解码器是门控循环神经网络。当解读每一个 :math:`y_t` 时,
+这个门控循环神经网络生成一系列权重  :math:`W_S^t = \{W_1^t, \dots, W_T^t\}` ,
+用于计算编码向量的加权和。加权和用来生成 :math:`y_t` 。
 
 模型的编码器部分如下所示。它叫做\ ``grumemory``\ 来表示门控循环神经网络。如果网络架构简单，那么推荐使用循环神经网络的方法，因为它比
 ``recurrent_group``
-更快。我们已经实现了大多数常用的循环神经网络架构，可以参考
-`Layers <../../ui/api/trainer_config_helpers/layers_index.html>`__
-了解更多细节。
+更快。我们已经实现了大多数常用的循环神经网络架构，可以参考 :ref:`api_trainer_config_helpers_layers` 了解更多细节。
 
 我们还将编码向量投射到 ``decoder_size``
 维空间。这通过获得反向循环网络的第一个实例，并将其投射到
 ``decoder_size`` 维空间完成：
 
-.. code:: sourcecode
+.. code:: python
 
     # 定义源语句的数据层
     src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
@@ -174,7 +168,7 @@ Sequence to Sequence Model with Attention
 解码器使用 ``recurrent_group`` 来定义循环神经网络。单步函数和输出函数在
 ``gru_decoder_with_attention`` 中定义：
 
-.. code:: sourcecode
+.. code:: python
 
     group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
                   StaticInput(input=encoded_proj,is_seq=True)]
@@ -198,7 +192,7 @@ Sequence to Sequence Model with Attention
 单步函数的实现如下所示。首先，它定义解码网络的\ **Memory**\ 。然后定义
 attention，门控循环单元单步函数和输出函数：
 
-.. code:: sourcecode
+.. code:: python
 
     def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
         # 定义解码器的Memory
@@ -253,7 +247,7 @@ attention，门控循环单元单步函数和输出函数：
 
 代码如下：
 
-.. code:: sourcecode
+.. code:: python
 
     group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
                   StaticInput(input=encoded_proj,is_seq=True)]
@@ -279,9 +273,6 @@ attention，门控循环单元单步函数和输出函数：
                               result_file=gen_trans_file)
     outputs(beam_gen)
 
-注意，这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务，请参阅
-`Semantic Role Labeling
-Demo <../../demo/semantic_role_labeling/index.html>`__
-了解更多详细信息。
+注意，这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务，请参阅 :ref:`semantic_role_labeling` 了解更多详细信息。
 
 完整的配置文件在\ ``demo/seqToseq/seqToseq_net.py``\ 。
diff --git a/doc/howto/dev/new_layer_cn.rst b/doc/howto/dev/new_layer_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..9489a921c70ad6ee5709f46445554f5d9640162c
--- /dev/null
+++ b/doc/howto/dev/new_layer_cn.rst
@@ -0,0 +1,389 @@
+================
+实现新的网络层
+================
+
+这份教程展示了如何在PaddlePaddle中实现一个自定义的网络层。在这里我们使用全连接层作为例子来展示实现新网络层所需要的四个步骤。
+
+1. 推导该层前向和后向传递的方程。
+2. 实现该层的C++类。
+3. 增加梯度检测的单元测试，以保证梯度的正确计算。
+4. 封装该层的Python接口。
+
+推导方程
+================
+
+首先我们需要推导该网络层的\ *前向传播*\ 和\ *后向传播*\ 的方程。前向传播给定输入，计算输出。后向传播给定输出的梯度，计算输入和参数的梯度。
+
+下图是一个全连接层的示意图。在全连接层中，每个输出节点都连接到所有的输入节点上。
+
+..  image:: FullyConnected.jpg
+    :align: center
+    :scale: 60 %
+
+一个网络层的前向传播部分把输入转化为相应的输出。
+全连接层以一个维度为 :math:`D_i` 的稠密向量作为输入，使用一个尺度为 :math:`D_i \times D_o` 的变换矩阵 :math:`W` 把 :math:`x` 映射到一个维度为 :math:`D_o` 的向量，并在乘积结果上再加上维度为 :math:`D_o` 的偏置向量 :math:`b` 。
+
+.. math::
+
+   y = f(W^T x + b)
+
+其中 :math:`f(.)` 是一个非线性的\ *激活函数*\ ，例如sigmoid、tanh以及ReLU。
+
+变换矩阵 :math:`W` 和偏置向量 :math:`b` 是该网络层的\ *参数*\ 。一个网络层的参数是在\ *反向传播*\ 时被训练的。反向传播根据输出的梯度，分别计算每个参数的梯度，以及输入的梯度。优化器则用链式法则来对每个参数计算损失函数的梯度。
+
+假设损失函数是 :math:`c(y)` ，那么
+
+.. math::
+
+   \frac{\partial c(y)}{\partial x} = \frac{\partial c(y)}{\partial y} \frac{\partial y}{\partial x}
+
+假设 :math:`z = W^T x + b` ，即 :math:`y = f(z)` ，那么
+
+.. math::
+
+   \frac{\partial y}{\partial z} = \frac{\partial f(z)}{\partial z}
+
+PaddlePaddle的base layer类可以自动计算上面的导数。
+
+因此，对全连接层来说，我们需要计算：
+
+.. math::
+
+   \frac{\partial z}{\partial x} = W, \frac{\partial z_j}{\partial W_{ij}} = x_i, \frac{\partial z}{\partial b} = \mathbf 1
+
+其中 :math:`\mathbf 1` 是一个全1的向量， :math:`W_{ij}` 是矩阵 :math:`W` 第i行第j列的数值， :math:`z_j` 是向量 :math:`z` 的第j个值， :math:`x_i` 是向量 :math:`x` 的第i个值。
+
+最后我们使用链式法则计算 :math:`\frac{\partial c}{\partial x}` 以及 :math:`\frac{\partial c}{\partial W}` 。计算的细节将在下面的小节给出。
+
+实现C++类
+===================
+
+一个网络层的C++类需要实现初始化，前向和后向。全连接层的实现位于 :code:`paddle/gserver/layers/FullyConnectedLayer.h` 及 :code:`paddle/gserver/layers/FullyConnectedLayer.cpp` 。这里我们展示一份简化过的代码。
+
+这个类需要继承 :code:`paddle::Layer` 这个基类，并且需要重写基类中的以下几个虚函数：
+
+- 类的构造函数和析构函数。
+- :code:`init` 函数。用于初始化参数和设置。
+- :code:`forward` 。实现网络层的前向传播。
+- :code:`backward` 。实现网络层的后向传播。
+- :code:`prefetch` 。用来从参数服务器预取参数矩阵相应的行。如果网络层不需要远程稀疏更新，则不需要重写该函数。（大多数网络层不需要支持远程稀疏更新）
+
+
+头文件如下：
+
+.. code-block:: c++
+
+    namespace paddle {
+    /**
+     * 全连接层的每个输出都连接到上一层的所有的神经元上。
+     * 它的输入与经过学习的参数做内积并加上偏置（可选）。
+     *
+     * 配置文件接口是fc_layer。
+     */
+
+    class FullyConnectedLayer : public Layer {
+    protected:
+      WeightList weights_;
+      std::unique_ptr<Weight> biases_;
+
+    public:
+      explicit FullyConnectedLayer(const LayerConfig& config)
+          : Layer(config) {}
+      ~FullyConnectedLayer() {}
+
+      bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
+
+      Weight& getWeight(int idx) { return *weights_[idx]; }
+
+      void prefetch();
+      void forward(PassType passType);
+      void backward(const UpdateCallback& callback = nullptr);
+    };
+    }  // namespace paddle
+
+头文件中把参数定义为类的成员变量。我们使用 :code:`Weight` 类作为参数的抽象，它支持多线程更新。该类的实现细节在“实现细节”中详细介绍。
+
+- :code:`weights_` 是存有一系列变换矩阵的权重。在当前的实现方式下，网络层可以有多个输入。因此，它可能有不止一个权重。每个权重对应一个输入。
+- :code:`biases_` 是存有偏置向量的权重。
+
+全连接层没有网络层配置的超参数。如果一个网络层需要配置的话，通常的做法是将配置存于 :code:`LayerConfig& config` 中，并在类构建函数中把它放入一个类成员变量里。
+
+下面的代码片段实现了 :code:`init` 函数。
+
+- 首先，所有的 :code:`init` 函数必须先调用基类中的函数 :code:`Layer::init(layerMap, parameterMap);` 。该语句会为每个层初始化其所需要的变量和连接。
+- 之后初始化所有的权重矩阵 :math:`W` 。当前的实现方式下，网络层可以有多个输入。因此，它可能有不止一个权重。
+- 最后，初始化偏置向量。
+
+
+.. code-block:: c++
+
+    bool FullyConnectedLayer::init(const LayerMap& layerMap,
+                                   const ParameterMap& parameterMap) {
+      /* 初始化父类 */
+      Layer::init(layerMap, parameterMap);
+
+      /* 初始化权重表 */
+      CHECK(inputLayers_.size() == parameters_.size());
+      for (size_t i = 0; i < inputLayers_.size(); i++) {
+        // 获得参数尺寸
+        size_t height = inputLayers_[i]->getSize();
+        size_t width = getSize();
+
+        // 新建一个权重
+        if (parameters_[i]->isSparse()) {
+          CHECK_LE(parameters_[i]->getSize(), width * height);
+        } else {
+          CHECK_EQ(parameters_[i]->getSize(), width * height);
+        }
+        Weight* w = new Weight(height, width, parameters_[i]);
+
+        // 将新建的权重加入权重表
+        weights_.emplace_back(w);
+      }
+
+      /* 初始化biases_ */
+      if (biasParameter_.get() != NULL) {
+        biases_ = std::unique_ptr<Weight>(new Weight(1, getSize(), biasParameter_));
+      }
+
+      return true;
+    }
+
+实现前向传播的部分有下面几个步骤。
+
+- 每个层在其 :code:`forward` 函数的开头必须调用 :code:`Layer::forward(passType);` 。
+- 之后使用 :code:`reserveOutput(batchSize, size);` 为输出分配内存。由于我们支持训练数据有不同的批次大小，所以这一步是必要的。 :code:`reserveOutput`  会相应地改变输出的尺寸。为了保证效率，如果需要扩大矩阵，我们会重新分配内存；如果需要缩减矩阵，我们会继续使用现有的内存块。
+- 之后使用矩阵运算函数来计算 :math:`\sum_i W_i x + b` 。 :code:`getInput(i).value` 返回第i个输入矩阵。每个输入都是一个 :math:`batchSize \times dim` 的矩阵，每行表示一个批次中的单个输入。对于我们支持的全部矩阵操作，请参考 :code:`paddle/math/Matrix.h` 和 :code:`paddle/math/BaseMatrix.h` 。
+- 最终，使用 :code:`forwardActivation();` 进行激活操作。这会自动进行网络配置中声明的激活操作。
+
+
+.. code-block:: c++
+
+    void FullyConnectedLayer::forward(PassType passType) {
+      Layer::forward(passType);
+
+      /* 若有必要，为output_申请内存 */
+      int batchSize = getInput(0).getBatchSize();
+      int size = getSize();
+
+      {
+        // 设置输出的尺寸
+        reserveOutput(batchSize, size);
+      }
+
+      MatrixPtr outV = getOutputValue();
+
+      // 对每个输入乘上变换矩阵
+      for (size_t i = 0; i != inputLayers_.size(); ++i) {
+        auto input = getInput(i);
+        CHECK(input.value) << "The input of 'fc' layer must be matrix";
+        i == 0 ? outV->mul(input.value, weights_[i]->getW(), 1, 0)
+               : outV->mul(input.value, weights_[i]->getW(), 1, 1);
+      }
+
+      /* 加上偏置向量 */
+      if (biases_.get() != NULL) {
+        outV->addBias(*(biases_->getW()), 1);
+      }
+
+      /* 激活 */ {
+        forwardActivation();
+      }
+    }
+
+实现后向传播的部分有下面几个步骤。
+
+- :code:`backwardActivation()` 计算激活函数的梯度。通过 :code:`getOutputGrad()` 来获得输出的梯度，调用该函数后，梯度会就地（不使用额外空间）乘上输出的梯度。
+- 计算偏置的梯度。注意，我们使用 :code:`biases_->getWGrad()` 来得到某个特定参数的梯度矩阵。在一个参数的梯度被更新后，\ **必须**\ 要调用 :code:`getParameterPtr()->incUpdate(callback);` 。这用于在多线程和多机上更新参数。
+- 最后，计算转换矩阵和输入的梯度，并对相应的参数调用 :code:`incUpdate` 。PaddlePaddle可以通过该机制判断是否已经收集齐所有的梯度，从而可以做一些与计算重叠的工作（例如，网络通信）。
+
+
+.. code-block:: c++
+
+    void FullyConnectedLayer::backward(const UpdateCallback& callback) {
+      /* 对激活求导 */ {
+        backwardActivation();
+      }
+
+      if (biases_ && biases_->getWGrad()) {
+        biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
+
+        biases_->getParameterPtr()->incUpdate(callback);
+      }
+
+      bool syncFlag = hl_get_sync_flag();
+
+      for (size_t i = 0; i != inputLayers_.size(); ++i) {
+        /* 计算当前层权重的梯度 */
+        if (weights_[i]->getWGrad()) {
+          MatrixPtr input_T = getInputValue(i)->getTranspose();
+          MatrixPtr oGrad = getOutputGrad();
+          {
+            weights_[i]->getWGrad()->mul(input_T, oGrad, 1, 1);
+          }
+        }
+
+
+        /* 计算输入层的偏差 */
+        MatrixPtr preGrad = getInputGrad(i);
+        if (NULL != preGrad) {
+          MatrixPtr weights_T = weights_[i]->getW()->getTranspose();
+          preGrad->mul(getOutputGrad(), weights_T, 1, 1);
+        }
+
+        {
+          weights_[i]->getParameterPtr()->incUpdate(callback);
+        }
+      }
+    }
+
+:code:`prefetch` 函数指出了在训练时需要从参数服务器取出的行。仅在远程稀疏训练时有效。使用远程稀疏方式训练时，完整的参数矩阵被分布在不同的参数服务器上。当网络层用一个批次做训练时，该批次的输入中仅有一个子集是非零的。因此，该层仅需要这些非零样本位置所对应的变换矩阵的那些行。 :code:`prefetch` 表明了这些行的标号。
+
+大多数层不需要远程稀疏训练函数。这种情况下不需要重写该函数。
+
+.. code-block:: c++
+
+    void FullyConnectedLayer::prefetch() {
+      for (size_t i = 0; i != inputLayers_.size(); ++i) {
+        auto* sparseParam =
+            dynamic_cast<SparsePrefetchRowCpuMatrix*>(weights_[i]->getW().get());
+        if (sparseParam) {
+          MatrixPtr input = getInputValue(i);
+          sparseParam->addRows(input);
+        }
+      }
+    }
+
+最后，使用 :code:`REGISTER_LAYER(fc, FullyConnectedLayer);` 来注册该层。 :code:`fc` 是该层的标识符， :code:`FullyConnectedLayer` 是该层的类名。
+
+.. code-block:: c++
+
+    namespace paddle {
+    REGISTER_LAYER(fc, FullyConnectedLayer);
+    }
+
+若 :code:`cpp` 被放在 :code:`paddle/gserver/layers` 目录下，其会自动被加入编译列表。
+
+
+写梯度检查单元测试
+===============================
+
+写梯度检查单元测试是一个验证新实现的层是否正确的相对简单的办法。梯度检查单元测试通过有限差分法来验证一个层的梯度。首先对输入做一个小的扰动 :math:`\Delta x` ，然后观察到输出的变化为 :math:`\Delta y` ，那么，梯度就可以通过这个方程计算得到 :math:`\frac{\Delta y}{\Delta x }` 。之后，再用这个梯度去和 :code:`backward` 函数得到的梯度去对比，以保证梯度计算的正确性。需要注意的是梯度检查仅仅验证了梯度的计算，并不保证 :code:`forward` 和 :code:`backward` 函数的实现是正确的。你需要一些更复杂的单元测试来保证你实现的网络层是正确的。
+
+所有网络层的梯度检查单测都位于 :code:`paddle/gserver/tests/test_LayerGrad.cpp` 。我们建议你在写新网络层时把测试代码放入新的文件中。下面列出了全连接层的梯度检查单元测试。它包含以下几步：
+
++ 生成网络层配置。网络层配置包含以下几项：
+   - 偏置参数的大小。（例子中是4096）
+   - 层的类型。（例子中是fc）
+   - 层的大小。（例子中是4096）
+   - 激活的类型。（例子中是softmax）
+   - dropout的比例。（例子中是0.1）
++ 配置网络层的输入。在这个例子里，我们仅有一个输入。
+   - 输入的类型（ :code:`INPUT_DATA` ），可以是以下几种：
+       - :code:`INPUT_DATA` ：稠密向量。
+       - :code:`INPUT_LABEL` ：整数。
+       - :code:`INPUT_DATA_TARGET` ：稠密向量，但不用于计算梯度。
+       - :code:`INPUT_SEQUENCE_DATA` ：含有序列信息的稠密向量。
+       - :code:`INPUT_HASSUB_SEQUENCE_DATA` ：含有序列信息和子序列信息的稠密向量。
+       - :code:`INPUT_SEQUENCE_LABEL` ：含有序列信息的整数。
+       - :code:`INPUT_SPARSE_NON_VALUE_DATA` ：0-1稀疏数据。
+       - :code:`INPUT_SPARSE_FLOAT_VALUE_DATA` ：浮点稀疏数据。
+   - 输入的名字。（例子中是 :code:`layer_0` ）
+   - 输入的大小。（例子中是8192）
+   - 非零数字的个数，仅对稀疏数据有效。
+   - 稀疏数据的格式，仅对稀疏数据有效。
++ 对每个输入，都需要调用一次 :code:`config.layerConfig.add_inputs();` 。
++ 调用 :code:`testLayerGrad` 来做梯度检查。它包含以下参数。
+   - 层和输入的配置。（例子中是 :code:`config` ）
+   - 网络层的类型。（例子中是 :code:`fc` ）
+   - 梯度检查的输入数据的批次大小。（例子中是100）
+   - 输入是否是转置的。大多数层需要设置为 :code:`false` 。（例子中是 :code:`false` ）
+   - 是否使用权重。有些层或者激活需要做归一化以保证它们的输出的和是一个常数。例如，softmax激活的输出的和总是1。在这种情况下，我们不能通过常规的梯度检查的方式来计算梯度。因此我们采用输出的加权和（非常数）来计算梯度。（例子中是 :code:`true` ，因为全连接层的激活可以是softmax）
+
+.. code-block:: c++
+
+    void testFcLayer(string format, size_t nnz) {
+      // Create layer configuration.
+      TestConfig config;
+      config.biasSize = 4096;
+      config.layerConfig.set_type("fc");
+      config.layerConfig.set_size(4096);
+      config.layerConfig.set_active_type("softmax");
+      config.layerConfig.set_drop_rate(0.1);
+      // Setup inputs.
+      config.inputDefs.push_back(
+          {INPUT_DATA, "layer_0", 8192, nnz, ParaSparse(format)});
+        config.layerConfig.add_inputs();
+      LOG(INFO) << config.inputDefs[0].sparse.sparse << " "
+                << config.inputDefs[0].sparse.format;
+      for (auto useGpu : {false, true}) {
+        testLayerGrad(config, "fc", 100, /* trans */ false, useGpu,
+                      /* weight */ true);
+      }
+    }
+
+如果你要为了测试而增加新的文件，例如 :code:`paddle/gserver/tests/test_FCGrad.cpp` ，你需要把该文件加入 :code:`paddle/gserver/tests/CMakeLists.txt` 中。下面给出了一个例子。当你执行命令 :code:`make tests` 时，所有的单测都会被执行一次。注意，有些层可能需要高精度来保证梯度检查单测正确执行。你需要在配置cmake时将 :code:`WITH_DOUBLE` 设置为 :code:`ON` 。
+
+.. code-block:: cmake
+
+    add_unittest_without_exec(test_FCGrad
+        test_FCGrad.cpp
+        LayerGradUtil.cpp
+        TestUtil.cpp)
+
+    add_test(NAME test_FCGrad
+        COMMAND test_FCGrad)
+
+
+实现python封装
+========================
+
+python封装的实现使得我们可以在配置文件中使用新实现的网络层。所有的python封装都在 :code:`python/paddle/trainer/config_parser.py` 中。全连接层python封装的例子中包含下面几步：
+
+- 所有的Python封装都使用 :code:`@config_layer('fc')` 这样的装饰器。网络层的标识符为 :code:`fc` 。
+- 实现构造函数 :code:`__init__` 。
+	- 它首先调用基构造函数 :code:`super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)` 。 :code:`FCLayer` 是Python封装的类名。 :code:`fc` 是网络层的标识符。为了封装能够正确工作，这些名字必须要写对。
+	- 之后，计算变换矩阵的大小和格式（是否稀疏）。
+
+.. code-block:: python
+
+    @config_layer('fc')
+    class FCLayer(LayerBase):
+        def __init__(
+                self,
+                name,
+                size,
+                inputs,
+                bias=True,
+                **xargs):
+            super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)
+            for input_index in xrange(len(self.inputs)):
+                input_layer = self.get_input_layer(input_index)
+                psize = self.config.size * input_layer.size
+                dims = [input_layer.size, self.config.size]
+                format = self.inputs[input_index].format
+                sparse = format == "csr" or format == "csc"
+                if sparse:
+                    psize = self.inputs[input_index].nnz
+                self.create_input_parameter(input_index, psize, dims, sparse, format)
+            self.create_bias_parameter(bias, self.config.size)
+
+在网络配置中，网络层的细节可以通过下面这些代码片段来指定。这个类的参数包括：
+
+- :code:`name` 是网络层实例的名字标识符。
+- :code:`type` 是网络层的类型，通过网络层的标识符来指定。
+- :code:`size` 是网络层输出的大小。
+- :code:`bias` 表明这个层的一个实例是否需要偏置。
+- :code:`inputs` 说明这个层的输入，输入是由一个list中的网络层实例的名字组成的。
+
+.. code-block:: python
+
+    Layer(
+        name = "fc1",
+        type = "fc",
+        size = 64,
+        bias = True,
+        inputs = [Input("pool3")]
+    )
+
+我们建议你为你的Python封装实现一个“助手”，使得搭模型时更方便。具体可以参考 :code:`python/paddle/trainer_config_helpers/layers.py` 。
diff --git a/doc/howto/dev/new_layer_en.rst b/doc/howto/dev/new_layer_en.rst
index 0513f068f39ad0d931b03d066a0083a1a8a33b79..46481f5ead33dc6a26507e021fd9ae0f8316e940 100644
--- a/doc/howto/dev/new_layer_en.rst
+++ b/doc/howto/dev/new_layer_en.rst
@@ -209,7 +209,6 @@ The implementation of the backward part has the following steps.
       if (biases_ && biases_->getWGrad()) {
         biases_->getWGrad()->collectBias(*getOutputGrad(), 1);
 
-        /* Increasing the number of gradient */
         biases_->getParameterPtr()->incUpdate(callback);
       }
 
@@ -297,7 +296,7 @@ All the gradient check unit tests are located in :code:`paddle/gserver/tests/tes
 + each inputs needs to call :code:`config.layerConfig.add_inputs();` once.
 + call :code:`testLayerGrad` to perform gradient checks. It has the following arguments.
    - layer and input configurations. (:code:`config` in our example)
-   - type of the input. (:code:`fc` in our example)
+   - type of the layer. (:code:`fc` in our example)
    - batch size of the gradient check. (100 in our example)
    - whether the input is transpose. Most layers need to set it to :code:`false`. (:code:`false` in our example)
    - whether to use weights. Some layers or activations perform normalization so that the sum of their output is a constant. For example, the sum of output of a softmax activation is one. In this case, we cannot correctly compute the gradients using regular gradient check techniques. A weighted sum of the output, which is not a constant, is utilized to compute the gradients. (:code:`true` in our example, because the activation of a fully connected layer can be softmax)
@@ -310,7 +309,7 @@ All the gradient check unit tests are located in :code:`paddle/gserver/tests/tes
       config.biasSize = 4096;
       config.layerConfig.set_type("fc");
       config.layerConfig.set_size(4096);
-      config.layerConfig.set_active_type("sigmoid");
+      config.layerConfig.set_active_type("softmax");
       config.layerConfig.set_drop_rate(0.1);
       // Setup inputs.
       config.inputDefs.push_back(
diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst
index 6a14ce8ae75c3dd372184ea6ea9f6034a3dbf919..bd3d0ec292057037414792b1ac176d12605b90d5 100644
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -7,10 +7,11 @@
 ..  toctree::
   :maxdepth: 1
 
+  usage/cmd_parameter/index_cn.rst
   usage/concepts/use_concepts_cn.rst
   usage/cluster/cluster_train_cn.md
-  usage/cluster/k8s/k8s_cn.md
-  usage/cluster/k8s/k8s_distributed_cn.md
+  usage/k8s/k8s_cn.md
+  usage/k8s/k8s_distributed_cn.md
 
 开发标准
 --------
diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst
index 983dc743eb453a0210bc5fb3c7e4525fa838d428..1fbfcd260b912078f00ed5b720ed607db725c4e2 100644
--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -7,8 +7,10 @@ Usage
 ..  toctree::
   :maxdepth: 1
 
-  usage/cmd_parameter/index_en.md
+  usage/cmd_parameter/index_en.rst
   usage/cluster/cluster_train_en.md
+  usage/k8s/k8s_en.md
+  usage/k8s/k8s_aws_en.md
 
 Development
 ------------
diff --git a/doc/howto/usage/cmd_parameter/index_cn.rst b/doc/howto/usage/cmd_parameter/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4c8729821110b9aec99351fc0a83a1ba75a8a2bb
--- /dev/null
+++ b/doc/howto/usage/cmd_parameter/index_cn.rst
@@ -0,0 +1,11 @@
+..  _cmd_line_index:
+
+设置命令行参数
+===============
+
+..  toctree::
+  :maxdepth: 1
+
+  use_case_cn.md
+  arguments_cn.md
+  detail_introduction_cn.md
diff --git a/doc/howto/usage/cmd_parameter/index_en.md b/doc/howto/usage/cmd_parameter/index_en.md
deleted file mode 100644
index 2a96e7e976c43fd69befccd78753cee431ef61bc..0000000000000000000000000000000000000000
--- a/doc/howto/usage/cmd_parameter/index_en.md
+++ /dev/null
@@ -1,8 +0,0 @@
-```eval_rst
-..  _cmd_line_index:
-```
-# Set Command-line Parameters
-
-* [Use Case](use_case_en.md)
-* [Arguments](arguments_en.md)
-* [Detailed Descriptions](detail_introduction_en.md)
diff --git a/doc/howto/usage/cmd_parameter/index_en.rst b/doc/howto/usage/cmd_parameter/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0e3c72d27aca063f1b6f1c23e55718dba373c40a
--- /dev/null
+++ b/doc/howto/usage/cmd_parameter/index_en.rst
@@ -0,0 +1,11 @@
+..  _cmd_line_index:
+
+Set Command-line Parameters
+===========================
+
+..  toctree::
+  :maxdepth: 1
+
+  use_case_en.md
+  arguments_en.md
+  detail_introduction_en.md
diff --git a/doc/howto/usage/cluster/k8s-aws/README.md b/doc/howto/usage/k8s/k8s_aws_en.md
similarity index 97%
rename from doc/howto/usage/cluster/k8s-aws/README.md
rename to doc/howto/usage/k8s/k8s_aws_en.md
index 593158428803c067a07cd741aabfe601f6f8e194..b04bfba590de42956dfe99256cde325b24adbfab 100644
--- a/doc/howto/usage/cluster/k8s-aws/README.md
+++ b/doc/howto/usage/k8s/k8s_aws_en.md
@@ -1,13 +1,13 @@
-# PaddlePaddle on AWS with Kubernetes
+# Kubernetes on AWS
 
 ## Create AWS Account and IAM Account
 
 To use AWS, we need to sign up an AWS account on Amazon's Web site.
 An AWS account allows us to login to the AWS Console Web interface to
-create IAM users and user groups.  Usually, we create a user group with
+create IAM users and user groups. Usually, we create a user group with
 privileges required to run PaddlePaddle, and we create users for
 those who are going to run PaddlePaddle and add these users into the
-group.  IAM users can identify themselves using password and tokens,
+group. IAM users can identify themselves using passwords and tokens,
 where passwords allows users to log in to the AWS Console, and tokens
 make it easy for users to submit and inspect jobs from the command
 line.
@@ -331,15 +331,15 @@ For sharing the training data across all the Kubernetes nodes, we use EFS (Elast
 1. Make sure you added AmazonElasticFileSystemFullAccess policy in your group.
 
 1. Create the Elastic File System in AWS console, and attach the new VPC with it.
-<img src="create_efs.png" width="800">
+<center>![](src/create_efs.png)</center>
 
 
 1. Modify the Kubernetes security group under ec2/Security Groups, add additional inbound policy "All TCP TCP 0 - 65535 0.0.0.0/0" for Kubernetes default VPC security group. 
-<img src="add_security_group.png" width="800">
+<center>![](src/add_security_group.png)</center>
 
 
 1. Follow the EC2 mount instruction to mount the disk onto all the Kubernetes nodes, we recommend to mount EFS disk onto ~/efs.
-<img src="efs_mount.png" width="800">
+<center>![](src/efs_mount.png)</center>
 
 
 Before starting the training, you should place your user config and divided training data onto EFS. When the training start, each task will copy related files from EFS into container, and it will also write the training results back onto EFS, we will show you how to place the data later in this article.
@@ -360,7 +360,7 @@ In one time of distributed training, user will confirm the PaddlePaddle node num
 
 ####Create PaddlePaddle Node
 
-After Kubernetes master gets the request, it will parse the yaml file and create several pods (defined by PaddlePaddle's node number)， Kubernetes will allocate these pods onto cluster's node. A pod represents a PaddlePaddle node, when pod is successfully allocated onto one physical/virtual machine, Kubernetes will startup the container in the pod, and this container will use the environment variables in yaml file and start up `paddle pserver` and `paddle trainer` processes.
+After the Kubernetes master gets the request, it parses the yaml file and creates several pods (as many as the number of PaddlePaddle nodes), then allocates these pods onto the cluster's nodes. A pod represents a PaddlePaddle node: when a pod is successfully allocated onto a physical/virtual machine, Kubernetes starts up the container in the pod, and this container uses the environment variables in the yaml file to start up the `paddle pserver` and `paddle trainer` processes.
 
 
 ####Start up Training
@@ -661,6 +661,6 @@ Sometimes we might need to create or manage the cluster on AWS manually with lim
 ### Some Presumptions
 
 * Instances run on CoreOS, the official IAM.
-* Kubernetes node use instance storage, no EBS get mounted.  Etcd is running on additional node.
+* Kubernetes nodes use instance storage; no EBS volumes get mounted. Etcd runs on an additional node.
 * For networking, we use Flannel network at this moment, we will use Calico solution later on.
 * When you create a service with Type=LoadBalancer, Kubernetes will create and ELB, and create a security group for the ELB.
diff --git a/doc/howto/usage/cluster/k8s/k8s_cn.md b/doc/howto/usage/k8s/k8s_cn.md
similarity index 99%
rename from doc/howto/usage/cluster/k8s/k8s_cn.md
rename to doc/howto/usage/k8s/k8s_cn.md
index 2575701053ca12cc3af45682af6cd682a88bb987..ab07cb9cd5b135ddea82b3360720537f1dc5a801 100644
--- a/doc/howto/usage/cluster/k8s/k8s_cn.md
+++ b/doc/howto/usage/k8s/k8s_cn.md
@@ -1,4 +1,4 @@
-# Kubernetes 单机训练
+# Kubernetes单机训练
 
 在这篇文档里，我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的Paddle训练作业。在下一篇中，我们将介绍如何启动分布式训练作业。
 
diff --git a/doc/howto/usage/cluster/k8s/k8s_distributed_cn.md b/doc/howto/usage/k8s/k8s_distributed_cn.md
similarity index 99%
rename from doc/howto/usage/cluster/k8s/k8s_distributed_cn.md
rename to doc/howto/usage/k8s/k8s_distributed_cn.md
index 53d0b4676c6a3a2dc8c58e231756638cc0b67765..b63b8437a0114a0165971933912da83c2dd770a6 100644
--- a/doc/howto/usage/cluster/k8s/k8s_distributed_cn.md
+++ b/doc/howto/usage/k8s/k8s_distributed_cn.md
@@ -1,4 +1,4 @@
-# Kubernetes 分布式训练
+# Kubernetes分布式训练
 
 前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里，我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练，文章 [Cluster Training](https://github.com/baidu/Paddle/blob/develop/doc/cluster/opensource/cluster_train.md)介绍了一种通过SSH远程分发任务，进行分布式训练的方法，与此不同的是，本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群，进行分布式训练的方案。
 
@@ -22,7 +22,7 @@
 
 首先，我们需要拥有一个Kubernetes集群，在这个集群中所有node与pod都可以互相通信。关于Kubernetes集群搭建，可以参考[官方文档](http://kubernetes.io/docs/getting-started-guides/kubeadm/)，在以后的文章中我们也会介绍AWS上搭建的方案。本文假设大家能找到几台物理机，并且可以按照官方文档在上面部署Kubernetes。在本文的环境中，Kubernetes集群中所有node都挂载了一个[MFS](http://moosefs.org/)（Moose filesystem，一种分布式文件系统）共享目录，我们通过这个目录来存放训练文件与最终输出的模型。关于MFS的安装部署，可以参考[MooseFS documentation](https://moosefs.com/documentation.html)。在训练之前，用户将配置与训练数据切分好放在MFS目录中，训练时，程序从此目录拷贝文件到容器内进行训练，将结果保存到此目录里。整体的结构图如下：
 
-![paddle on kubernetes结构图](k8s-paddle-arch.png)
+![paddle on kubernetes结构图](src/k8s-paddle-arch.png)
 
 上图描述了一个3节点的分布式训练场景，Kubernetes集群的每个node上都挂载了一个MFS目录，这个目录可以通过volume的形式挂载到容器中。Kubernetes为这次训练创建了3个pod并且调度到了3个node上运行，每个pod包含一个PaddlePaddle容器。在容器创建后，会启动pserver与trainer进程，读取volume中的数据进行这次分布式训练。
 
diff --git a/doc/howto/usage/cluster/k8s/k8s_en.md b/doc/howto/usage/k8s/k8s_en.md
similarity index 100%
rename from doc/howto/usage/cluster/k8s/k8s_en.md
rename to doc/howto/usage/k8s/k8s_en.md
diff --git a/doc/howto/usage/cluster/k8s/Dockerfile b/doc/howto/usage/k8s/src/Dockerfile
similarity index 100%
rename from doc/howto/usage/cluster/k8s/Dockerfile
rename to doc/howto/usage/k8s/src/Dockerfile
diff --git a/doc/howto/usage/cluster/k8s-aws/add_security_group.png b/doc/howto/usage/k8s/src/add_security_group.png
similarity index 100%
rename from doc/howto/usage/cluster/k8s-aws/add_security_group.png
rename to doc/howto/usage/k8s/src/add_security_group.png
diff --git a/doc/howto/usage/cluster/k8s-aws/create_efs.png b/doc/howto/usage/k8s/src/create_efs.png
similarity index 100%
rename from doc/howto/usage/cluster/k8s-aws/create_efs.png
rename to doc/howto/usage/k8s/src/create_efs.png
diff --git a/doc/howto/usage/cluster/k8s-aws/efs_mount.png b/doc/howto/usage/k8s/src/efs_mount.png
similarity index 100%
rename from doc/howto/usage/cluster/k8s-aws/efs_mount.png
rename to doc/howto/usage/k8s/src/efs_mount.png
diff --git a/doc/howto/usage/cluster/k8s/job.yaml b/doc/howto/usage/k8s/src/job.yaml
similarity index 100%
rename from doc/howto/usage/cluster/k8s/job.yaml
rename to doc/howto/usage/k8s/src/job.yaml
diff --git a/doc/howto/usage/cluster/k8s/k8s-paddle-arch.png b/doc/howto/usage/k8s/src/k8s-paddle-arch.png
similarity index 100%
rename from doc/howto/usage/cluster/k8s/k8s-paddle-arch.png
rename to doc/howto/usage/k8s/src/k8s-paddle-arch.png
diff --git a/doc/howto/usage/cluster/k8s-aws/managed_policy.png b/doc/howto/usage/k8s/src/managed_policy.png
similarity index 100%
rename from doc/howto/usage/cluster/k8s-aws/managed_policy.png
rename to doc/howto/usage/k8s/src/managed_policy.png
diff --git a/doc/howto/usage/cluster/k8s/start.sh b/doc/howto/usage/k8s/src/start.sh
similarity index 100%
rename from doc/howto/usage/cluster/k8s/start.sh
rename to doc/howto/usage/k8s/src/start.sh
diff --git a/doc/howto/usage/cluster/k8s/start_paddle.py b/doc/howto/usage/k8s/src/start_paddle.py
similarity index 100%
rename from doc/howto/usage/cluster/k8s/start_paddle.py
rename to doc/howto/usage/k8s/src/start_paddle.py
diff --git a/doc/tutorials/gan/gan.png b/doc/tutorials/gan/gan.png
index 001ed6cc19e8911f9b10f63211c9658160b3a06e..0eafd7cb49b545f412f8e775804bcd0b22c42454 100644
Binary files a/doc/tutorials/gan/gan.png and b/doc/tutorials/gan/gan.png differ
diff --git a/doc/tutorials/gan/index_en.md b/doc/tutorials/gan/index_en.md
index 99c8d730117a469c89abb218eeacf66103c0cbed..ac9ed37b2264778869f92c0910b1cb946fb4427f 100644
--- a/doc/tutorials/gan/index_en.md
+++ b/doc/tutorials/gan/index_en.md
@@ -4,9 +4,7 @@ This demo implements GAN training described in the original [GAN paper](https://
 
 The high-level structure of GAN is shown in Figure. 1 below. It is composed of two major parts: a generator and a discriminator, both of which are based on neural networks. The generator takes in some kind of noise with a known distribution and transforms it into an image. The discriminator takes in an image and determines whether it is artificially generated by the generator or a real image. So the generator and the discriminator are in a competitive game in which generator is trying to generate image to look as real as possible to fool the discriminator, while the discriminator is trying to distinguish between real and fake images. 
 
-<p align="center">
-    <img src="./gan.png" width="500" height="300"> 
-</p>
+<center>![](./gan.png)</center>
 <p align="center">
     Figure 1. GAN-Model-Structure
     <a href="https://ishmaelbelghazi.github.io/ALI/">figure credit</a>
@@ -111,9 +109,7 @@ $python gan_trainer.py -d uniform --useGpu 1
 ```
 The generated samples can be found in ./uniform_samples/ and one example is shown below as Figure 2. One can see that it roughly recovers the 2D uniform distribution. 
 
-<p align="center">
-    <img src="./uniform_sample.png" width="300" height="300"> 
-</p>
+<center>![](./uniform_sample.png)</center>
 <p align="center">
     Figure 2. Uniform Sample
 </p>
@@ -135,9 +131,7 @@ To train the GAN model on mnist data, one can use the following command:
 $python gan_trainer.py -d mnist --useGpu 1
 ```
 The generated sample images can be found at ./mnist_samples/ and one example is shown below as Figure 3. 
-<p align="center">
-    <img src="./mnist_sample.png" width="300" height="300"> 
-</p>
+<center>![](./mnist_sample.png)</center>
 <p align="center">
     Figure 3. MNIST Sample
 </p>
diff --git a/doc/tutorials/gan/uniform_sample.png b/doc/tutorials/gan/uniform_sample.png
index 4a96c45cae82673f5a1df986f2643a8026da7937..e716c48e782019a757bed0cb443f2ed97386cbe2 100644
Binary files a/doc/tutorials/gan/uniform_sample.png and b/doc/tutorials/gan/uniform_sample.png differ
diff --git a/doc/tutorials/index_cn.md b/doc/tutorials/index_cn.md
index 97014d537655d21871295699381c5dd2106d0b56..6a27004d58d24cc466d930322be8cdbb2f434c74 100644
--- a/doc/tutorials/index_cn.md
+++ b/doc/tutorials/index_cn.md
@@ -2,6 +2,7 @@
 
 * [快速入门](quick_start/index_cn.rst)
 * [个性化推荐](rec/ml_regression_cn.rst)
+* [图像分类](image_classification/index_cn.md)
 * [情感分析](sentiment_analysis/index_cn.md)
 * [语义角色标注](semantic_role_labeling/index_cn.md)
 * [机器翻译](text_generation/index_cn.md)
@@ -9,3 +10,4 @@
 ## 常用模型
 
 * [ResNet模型](imagenet_model/resnet_model_cn.md)
+* [词向量模型](embedding_model/index_cn.md)
diff --git a/doc/tutorials/index_en.md b/doc/tutorials/index_en.md
index cce9d3a176a5e5c87e97c16362ec8a202e8eb80a..77331a703b6f0fdf92921ebcc476325b7327e976 100644
--- a/doc/tutorials/index_en.md
+++ b/doc/tutorials/index_en.md
@@ -7,6 +7,7 @@ There are several examples and demos here.
 * [Sentiment Analysis](sentiment_analysis/index_en.md)
 * [Semantic Role Labeling](semantic_role_labeling/index_en.md)
 * [Text Generation](text_generation/index_en.md)
+* [Image Auto-Generation](gan/index_en.md)
 
 ## Model Zoo
 * [ImageNet: ResNet](imagenet_model/resnet_model_en.md)
diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt
index da6dad10cd807654f9ddd03beeb29cef69fc8de0..3ac50e34bb434b14d346f1c4707084f93461284d 100644
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -1,3 +1,21 @@
+# generate_python_api(<target_name>): run SWIG on api/Paddle.swig and add an ALL-built custom
+# target <target_name> that depends on the generated swig_paddle.py / Paddle_wrap.cxx / Paddle_wrap.h.
+FUNCTION(generate_python_api target_name)
+    ADD_CUSTOM_COMMAND(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
+                              ${PROJ_ROOT}/paddle/Paddle_wrap.cxx ${PROJ_ROOT}/paddle/Paddle_wrap.h
+        COMMAND ${SWIG_EXECUTABLE} -python -c++ -outcurrentdir -I../ api/Paddle.swig
+        # swig_paddle.py lands in the working directory; move it into py_paddle/ without a shell `&& mv`.
+        COMMAND ${CMAKE_COMMAND} -E rename ${PROJ_ROOT}/paddle/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
+        DEPENDS ${PROJ_ROOT}/paddle/api/Paddle.swig ${PROJ_ROOT}/paddle/api/PaddleAPI.h
+                ${external_project_dependencies}
+        WORKING_DIRECTORY ${PROJ_ROOT}/paddle
+        COMMENT "Generate Python API from swig")
+    ADD_CUSTOM_TARGET(${target_name} ALL DEPENDS
+                ${PROJ_ROOT}/paddle/Paddle_wrap.cxx ${PROJ_ROOT}/paddle/Paddle_wrap.h
+                ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
+                ${external_project_dependencies})
+ENDFUNCTION(generate_python_api)
+
 set(API_SOURCES
     Arguments.cpp
     ConfigParser.cpp
@@ -42,7 +60,7 @@ file(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py)
 
 # TODO(yuyang18) : make wheel name calculated by cmake
 add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/dist/.timestamp
-    COMMAND ${PYTHON_EXECUTABLE} setup.py  bdist_wheel
+    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
     COMMAND ${CMAKE_COMMAND} -E touch dist/.timestamp
     COMMAND rm -rf py_paddle.egg-info build
     WORKING_DIRECTORY ${PROJ_ROOT}/paddle
@@ -76,5 +94,17 @@ add_dependencies(python_api_wheel python_swig_sources
   paddle_cuda)
 
 if(WITH_TESTING)
+    SET(PIP_SOURCES_DIR ${PYTHON_SOURCES_DIR}/pip)
+    ExternalProject_Add(pip
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY      https://github.com/pypa/pip.git
+        GIT_TAG             9.0.1
+        PREFIX              ${PIP_SOURCES_DIR}
+        CONFIGURE_COMMAND   ""
+        BUILD_COMMAND       ""
+        INSTALL_COMMAND     env ${py_env} ${PYTHON_EXECUTABLE} setup.py install
+        BUILD_IN_SOURCE     1
+        DEPENDS python setuptools python_api_wheel
+    )
     add_subdirectory(test)
 endif()
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
index 81c9eed0bccd5ad63f524cdb011fc73cd568f465..364d19f9414430709108824dce75a1007332d824 100644
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -19,8 +19,8 @@ limitations under the License. */
 #include <stdexcept>
 #include <string>
 #include <vector>
+#include "paddle/utils/Common.h"
 #include "paddle/utils/GlobalConstants.h"
-#include "paddle/utils/common.h"
 
 /// Import PaddlePaddle's enumeration into global namespace.
 using namespace paddle::enumeration_wrapper;  // NOLINT
diff --git a/paddle/api/Util.cpp b/paddle/api/Util.cpp
index c3f739568f50b6ee8b0894d06a4d7f91c7816879..54d67aa62f4d87ad03282962c722019698dc621a 100644
--- a/paddle/api/Util.cpp
+++ b/paddle/api/Util.cpp
@@ -15,12 +15,11 @@ limitations under the License. */
 #include "PaddleAPI.h"
 
 #include "paddle/parameter/Parameter.h"
-#include "paddle/utils/Excepts.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Flags.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Util.h"
 
-#include <fenv.h>
 #include <algorithm>
 #include <iostream>
 #include <iterator>
diff --git a/paddle/api/paddle_api_config.py.in b/paddle/api/paddle_api_config.py.in
index 23542b952b7699d66cf64b47d0354e9078ae06d9..e11ee920362aed3ec79a2e62d447d7dde4a99248 100644
--- a/paddle/api/paddle_api_config.py.in
+++ b/paddle/api/paddle_api_config.py.in
@@ -1,17 +1,17 @@
 PADDLE_BUILD_DIR="@CMAKE_CURRENT_BINARY_DIR@/../"
 WITH_GPU="@WITH_GPU@"
-PROTOBUF_LIB="@PROTOBUF_LIBRARY@"
-ZLIB_LIB="@ZLIB_LIBRARIES@"
+PROTOBUF_LIBRARY="@PROTOBUF_LIBRARY@"
+ZLIB_LIBRARIES="@ZLIB_LIBRARIES@"
 CMAKE_THREAD_LIB="@CMAKE_THREAD_LIBS_INIT@"
 CMAKE_DL_LIBS="@CMAKE_DL_LIBS@"
 
 
 WITH_PYTHON="@WITH_PYTHON@"
 PYTHON_LIBRARIES="@PYTHON_LIBRARIES@"
-LIBGLOG_LIBRARY="@LIBGLOG_LIBRARY@"
+GLOG_LIBRARIES="@GLOG_LIBRARIES@"
 GFLAGS_LIBRARIES="@GFLAGS_LIBRARIES@"
 GFLAGS_LOCATION="@GFLAGS_LOCATION@"
-CBLAS_LIBRARIES="@CBLAS_LIBS@"
+CBLAS_LIBRARIES="@CBLAS_LIBRARIES@"
 
-CUDA_LIBRARIES="@CUDA_LIBRARIES@"
+CUDA_LIBRARIES="@CUDA_cudart_shared_LIBRARY@"
 WITH_COVERALLS="@ON_COVERALLS@"
diff --git a/paddle/api/paddle_ld_flags.py b/paddle/api/paddle_ld_flags.py
index b4d27b1cc728f92b2210f30b69f3f5899fe81d65..ad5dce209bf8e14120320a58c3cd85d6f6a97688 100644
--- a/paddle/api/paddle_ld_flags.py
+++ b/paddle/api/paddle_ld_flags.py
@@ -40,14 +40,14 @@ try:
             self.paddle_build_dir = PADDLE_BUILD_DIR
             self.paddle_build_dir = os.path.abspath(self.paddle_build_dir)
             self.with_gpu = PaddleLDFlag.cmake_bool(WITH_GPU)
-            self.protolib = PROTOBUF_LIB
-            self.zlib = ZLIB_LIB
+            self.protolib = PROTOBUF_LIBRARY
+            self.zlib = ZLIB_LIBRARIES
             self.thread = CMAKE_THREAD_LIB
             self.dl_libs = CMAKE_DL_LIBS
             self.with_python = PaddleLDFlag.cmake_bool(WITH_PYTHON)
             self.python_libs = PYTHON_LIBRARIES
 
-            self.glog_libs = LIBGLOG_LIBRARY
+            self.glog_libs = GLOG_LIBRARIES
 
             self.with_coverage = PaddleLDFlag.cmake_bool(WITH_COVERALLS)
             self.gflags_libs = GFLAGS_LIBRARIES
diff --git a/paddle/api/test/CMakeLists.txt b/paddle/api/test/CMakeLists.txt
index 08a0fe96a004d38b81d0bac881da1faeb52685f4..a2fa623c80087d42e6a2a5c05f62eba4997f8ec4 100644
--- a/paddle/api/test/CMakeLists.txt
+++ b/paddle/api/test/CMakeLists.txt
@@ -1,2 +1,2 @@
 add_test(NAME test_swig_api
-    COMMAND bash ${PROJ_ROOT}/paddle/api/test/run_tests.sh)
+    COMMAND bash ${PROJ_ROOT}/paddle/api/test/run_tests.sh ${PYTHON_EXECUTABLE})
diff --git a/paddle/api/test/run_tests.sh b/paddle/api/test/run_tests.sh
index 2f12ba026430ba7adb6f4dee11ed17ea3ad3f36d..bcf06afa86aaa1a3151aeb966b54f69657c541e3 100755
--- a/paddle/api/test/run_tests.sh
+++ b/paddle/api/test/run_tests.sh
@@ -20,11 +20,7 @@ popd > /dev/null
 
 cd $SCRIPTPATH
 
-rm -rf .test_env
-virtualenv .test_env
-source .test_env/bin/activate
-
-pip --timeout 600  install ../../dist/*.whl
+$1 -m pip install ../../dist/*.whl
 
 test_list="testArguments.py testGradientMachine.py testMatrix.py  testVector.py testTrain.py testTrainer.py"
 
@@ -33,7 +29,7 @@ export PYTHONPATH=$PWD/../../../python/
 for fn in $test_list
 do
   echo "test $fn"
-  python $fn
+  $1 $fn
   if [ $? -ne 0 ]; then
     exit 1
   fi
diff --git a/paddle/cuda/CMakeLists.txt b/paddle/cuda/CMakeLists.txt
index aa1ff4a771c4a1c64be86893e7b2261ae65f0f94..57fb89608f4bcf3e6829fe850a61c2a626adfbdc 100755
--- a/paddle/cuda/CMakeLists.txt
+++ b/paddle/cuda/CMakeLists.txt
@@ -88,6 +88,8 @@ else()
                 ${CUDA_CXX_SOURCES})
 endif()
 
+add_dependencies(paddle_cuda ${external_project_dependencies})
+
 add_style_check_target(paddle_cuda
                        ${CUDA_SOURCES}
                        ${CUDA_HEADERS}
diff --git a/paddle/cuda/include/hl_warpctc_wrap.h b/paddle/cuda/include/hl_warpctc_wrap.h
index 79bf6c3db7f876009d98a62b6523588f021886e8..7885ae570148c0b9870089baf22b6bacb786f995 100644
--- a/paddle/cuda/include/hl_warpctc_wrap.h
+++ b/paddle/cuda/include/hl_warpctc_wrap.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #ifndef HL_WARPCTC_WRAP_H_
 #define HL_WARPCTC_WRAP_H_
 
+#include "ctc.h"
 #include "hl_base.h"
-#include "warp-ctc/include/ctc.h"
 
 typedef ctcStatus_t hl_warpctc_status_t;
 typedef ctcOptions hl_warpctc_options_t;
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 0b3126155d0c0872a70fc83260d4ea34161cb717..de85eeca821742e1d39d5ce26f873238d4359cba 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -10,6 +10,8 @@ if(WITH_GPU)
 endif()
 
 add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
+add_dependencies(paddle_function ${external_project_dependencies})
+
 
 if(WITH_GPU)
 if(WITH_TESTING)
@@ -17,9 +19,7 @@ if(WITH_TESTING)
     # file(GLOB test_files . *OpTest.cpp)
     # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
     add_simple_unittest(CrossMapNormalOpTest)
-    add_unittest(ContextProjectionOpTest
-        ContextProjectionOpTest.cpp
-        ../gserver/tests/TestUtil.cpp)
+    add_simple_unittest(ContextProjectionOpTest)
 endif()
 endif()
 
diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
index bd367a859e10c0522206cd0215970922905905ed..07907fc1ba7973c728c3a882e4be6b1a7ef7a97a 100644
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -85,15 +85,15 @@ public:
   void calc(const Arguments& inputs,
             const Arguments& outputs,
             const Arguments& inouts) override {
-    CHECK_EQ(3, inputs.size());
-    CHECK_EQ(1, outputs.size());
-    CHECK_EQ(0, inouts.size());
+    CHECK_EQ(3, static_cast<int>(inputs.size()));
+    CHECK_EQ(1, static_cast<int>(outputs.size()));
+    CHECK_EQ(0, static_cast<int>(inouts.size()));
 
     CHECK(outputs[0].getData() && inputs[0].getData() && inputs[2].getData());
-    CHECK_EQ(outputs[0].dims_.size(), 2);
-    CHECK_EQ(inputs[0].dims_.size(), 2);
-    CHECK_EQ(inputs[1].dims_.size(), 2);
-    CHECK_EQ(inputs[2].dims_.size(), 1);
+    CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
+    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2);
+    CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 2);
+    CHECK_EQ(static_cast<int>(inputs[2].dims_.size()), 1);
     /// dim of output = dim of input * context_length
     CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
     /// dim of input == dim of weight
@@ -202,15 +202,15 @@ public:
   void calc(const Arguments& inputs,
             const Arguments& outputs,
             const Arguments& inouts) override {
-    CHECK_EQ(3, inputs.size());
-    CHECK_EQ(1, outputs.size());
-    CHECK_EQ(0, inouts.size());
+    CHECK_EQ(3, static_cast<int>(inputs.size()));
+    CHECK_EQ(1, static_cast<int>(outputs.size()));
+    CHECK_EQ(0, static_cast<int>(inouts.size()));
 
     CHECK(outputs[0].getData() && inputs[2].getData());
-    CHECK_EQ(outputs[0].dims_.size(), 2);
-    CHECK_EQ(inputs[0].dims_.size(), 2);
-    CHECK_EQ(inputs[1].dims_.size(), 2);
-    CHECK_EQ(inputs[2].dims_.size(), 1);
+    CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
+    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2);
+    CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 2);
+    CHECK_EQ(static_cast<int>(inputs[2].dims_.size()), 1);
 
     /// dim of input == dim of weight
     CHECK_EQ(inputs[0].dims_[1], inputs[1].dims_[1]);
@@ -269,13 +269,13 @@ public:
   void calc(const Arguments& inputs,
             const Arguments& outputs,
             const Arguments& inouts) override {
-    CHECK_EQ(2, inputs.size());
-    CHECK_EQ(1, outputs.size());
-    CHECK_EQ(0, inouts.size());
+    CHECK_EQ(2, static_cast<int>(inputs.size()));
+    CHECK_EQ(1, static_cast<int>(outputs.size()));
+    CHECK_EQ(0, static_cast<int>(inouts.size()));
     CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
-    CHECK_EQ(outputs[0].dims_.size(), 2);
-    CHECK_EQ(inputs[0].dims_.size(), 2);
-    CHECK_EQ(inputs[1].dims_.size(), 1);
+    CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
+    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2);
+    CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 1);
     CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
     /// input and output has the same batch_size
     CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
@@ -317,14 +317,14 @@ public:
   void calc(const Arguments& inputs,
             const Arguments& outputs,
             const Arguments& inouts) override {
-    CHECK_EQ(2, inputs.size());
-    CHECK_EQ(1, outputs.size());
-    CHECK_EQ(0, inouts.size());
+    CHECK_EQ(2, static_cast<int>(inputs.size()));
+    CHECK_EQ(1, static_cast<int>(outputs.size()));
+    CHECK_EQ(0, static_cast<int>(inouts.size()));
 
     CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
-    CHECK_EQ(outputs[0].dims_.size(), 2);
-    CHECK_EQ(inputs[0].dims_.size(), 2);
-    CHECK_EQ(inputs[1].dims_.size(), 1);
+    CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
+    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2);
+    CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 1);
     CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
 
     auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
diff --git a/paddle/function/ContextProjectionOpTest.cpp b/paddle/function/ContextProjectionOpTest.cpp
index 359428fc03d698145cb880bd735c908838f96f56..6223d2fd23ac3bbb4fbcf51d37d22feaf3b1330b 100644
--- a/paddle/function/ContextProjectionOpTest.cpp
+++ b/paddle/function/ContextProjectionOpTest.cpp
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 #include "FunctionTest.h"
-#include "paddle/gserver/tests/TestUtil.h"
 #include "paddle/math/Matrix.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 
diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp
index f13eb78d27d900064f8cf0dc4194d1e34ded2b14..96a7a30eebbf0f01fa89ea91110ddb826fd2f64b 100644
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -128,11 +128,11 @@ public:
   void calc(const Arguments& inputs,
             const Arguments& outputs,
             const Arguments& inouts) override {
-    CHECK_EQ(1, inputs.size());
-    CHECK_EQ(2, outputs.size());
-    CHECK_EQ(0, inouts.size());
+    CHECK_EQ(1, static_cast<int>(inputs.size()));
+    CHECK_EQ(2, static_cast<int>(outputs.size()));
+    CHECK_EQ(0, static_cast<int>(inouts.size()));
 
-    CHECK_EQ(inputs[0].dims_.size(), 4);
+    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 4);
     for (size_t i = 0; i < inputs[0].dims_.size(); i++) {
       CHECK_EQ(inputs[0].dims_[i], outputs[0].dims_[i]);
       CHECK_EQ(inputs[0].dims_[i], outputs[1].dims_[i]);
@@ -180,11 +180,11 @@ public:
   void calc(const Arguments& inputs,
             const Arguments& outputs,
             const Arguments& inouts) override {
-    CHECK_EQ(4, inputs.size());
-    CHECK_EQ(1, outputs.size());
-    CHECK_EQ(0, inouts.size());
+    CHECK_EQ(4, static_cast<int>(inputs.size()));
+    CHECK_EQ(1, static_cast<int>(outputs.size()));
+    CHECK_EQ(0, static_cast<int>(inouts.size()));
 
-    CHECK_EQ(inputs[0].dims_.size(), 4);
+    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 4);
     for (size_t i = 0; i < inputs[0].dims_.size(); i++) {
       CHECK_EQ(inputs[0].dims_[i], inputs[1].dims_[i]);
       CHECK_EQ(inputs[0].dims_[i], inputs[2].dims_[i]);
diff --git a/paddle/function/Function.cpp b/paddle/function/Function.cpp
index 6f82a8d053bc203eed44bd0d8d4c47d23a15268d..614e76b8ac0c9a9145a27f5b532ea63bef7f90f0 100644
--- a/paddle/function/Function.cpp
+++ b/paddle/function/Function.cpp
@@ -46,28 +46,32 @@ bool FuncConfig::get<bool>(const std::string& key) const {
 
 template <>
 FuncConfig& FuncConfig::set<size_t>(const std::string& key, size_t v) {
-  CHECK_EQ(valueMap_.count(key), 0) << "Duplicated value: " << key;
+  CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
+                                                      << key;
   valueMap_[key].s = v;
   return *this;
 }
 
 template <>
 FuncConfig& FuncConfig::set<real>(const std::string& key, real v) {
-  CHECK_EQ(valueMap_.count(key), 0) << "Duplicated value: " << key;
+  CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
+                                                      << key;
   valueMap_[key].r = v;
   return *this;
 }
 
 template <>
 FuncConfig& FuncConfig::set<int>(const std::string& key, int v) {
-  CHECK_EQ(valueMap_.count(key), 0) << "Duplicated value: " << key;
+  CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
+                                                      << key;
   valueMap_[key].i = v;
   return *this;
 }
 
 template <>
 FuncConfig& FuncConfig::set<bool>(const std::string& key, bool v) {
-  CHECK_EQ(valueMap_.count(key), 0) << "Duplicated value: " << key;
+  CHECK_EQ(static_cast<int>(valueMap_.count(key)), 0) << "Duplicated value: "
+                                                      << key;
   valueMap_[key].b = v;
   return *this;
 }
diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h
index 5f031fc7c0761a8fe97eb16fe1dd8e0a1debfcdb..9a2ad7567f0dc93d0a8e396fd88b2488afe9d049 100644
--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
@@ -30,12 +30,12 @@ limitations under the License. */
 #include "paddle/math/Vector.h"
 #include "paddle/parameter/Argument.h"
 #include "paddle/utils/ClassRegistrar.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Locks.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Queue.h"
 #include "paddle/utils/ThreadLocal.h"
 #include "paddle/utils/Util.h"
-#include "paddle/utils/common.h"
 
 namespace paddle {
 /**
diff --git a/paddle/gserver/dataproviders/PyDataProvider.cpp b/paddle/gserver/dataproviders/PyDataProvider.cpp
index 5bdd55309c8bf8d5dcf84f5dcef2c5c85249a668..b53790e764b9f9ad668abd1f4125695e3533a027 100644
--- a/paddle/gserver/dataproviders/PyDataProvider.cpp
+++ b/paddle/gserver/dataproviders/PyDataProvider.cpp
@@ -13,8 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "PyDataProvider.h"
-#include <fenv.h>
-#include "paddle/utils/Excepts.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Util.h"
 
diff --git a/paddle/gserver/layers/ContextProjection.cpp b/paddle/gserver/layers/ContextProjection.cpp
index e947b2b9ecbebda11db5c049e1606a2d5926c28c..ee4db219890a135d786c46827632d02d1db5b760 100644
--- a/paddle/gserver/layers/ContextProjection.cpp
+++ b/paddle/gserver/layers/ContextProjection.cpp
@@ -111,7 +111,8 @@ void ContextProjection::forward() {
   size_t dim = out_->value->getWidth();
   CHECK_EQ(dim, input_dim * config_.context_length());
   size_t batch_size = in_->value->getHeight();
-  CHECK_EQ(forward_.size(), 1) << "Only one forward function here";
+  CHECK_EQ(static_cast<int>(forward_.size()), 1)
+      << "Only one forward function here";
 
   REGISTER_TIMER_INFO("ContextProjectionForward", getName().c_str());
   bool is_padding = config_.trainable_padding();
@@ -154,7 +155,8 @@ void ContextProjection::backward(const UpdateCallback& callback) {
   CHECK_EQ(dim, input_dim * config_.context_length());
   size_t batch_size = in_->value->getHeight();
   CHECK_EQ(batch_size, out_->value->getHeight());
-  CHECK_EQ(backward_.size(), 1) << "Only one backward function here";
+  CHECK_EQ(static_cast<int>(backward_.size()), 1)
+      << "Only one backward function here";
 
   REGISTER_TIMER_INFO("ContextProjectionBackward", getName().c_str());
   bool is_padding = config_.trainable_padding();
diff --git a/paddle/gserver/layers/ConvProjection.cpp b/paddle/gserver/layers/ConvProjection.cpp
index e1c4b91ace21522a3bc640dfc4eaa1a42668ed02..0281170bc59855f6f4d2f4212523275a92d202d5 100644
--- a/paddle/gserver/layers/ConvProjection.cpp
+++ b/paddle/gserver/layers/ConvProjection.cpp
@@ -130,7 +130,8 @@ void ConvProjection::reshapeTensorDesc(int batchSize) {
 void ConvProjection::reshape(int batchSize) {
   size_t width = calOutputSize();
   CHECK_EQ(width, out_->value->getWidth());
-  CHECK_EQ(channels_ * imageH_ * imageW_, in_->value->getWidth())
+  CHECK_EQ(static_cast<size_t>(channels_ * imageH_ * imageW_),
+           in_->value->getWidth())
       << "Wrong input size for convolution"
       << " channels=" << channels_ << " imageH=" << imageH_
       << " imageW=" << imageW_ << " inputSize=" << in_->value->getWidth();
diff --git a/paddle/gserver/layers/GruCompute.h b/paddle/gserver/layers/GruCompute.h
index a56af21317d1d43c836f7fe599a4dc614804bfec..3340e38e62cc396fd619cfa2a1fad57b0a8cf4c7 100644
--- a/paddle/gserver/layers/GruCompute.h
+++ b/paddle/gserver/layers/GruCompute.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include "ModelConfig.pb.h"
 #include "hl_gpu.h"
-#include "paddle/utils/common.h"
+#include "paddle/utils/Common.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/LstmCompute.h b/paddle/gserver/layers/LstmCompute.h
index 0d65b4158ebdc04f199048bbba98317c89fc8beb..2588fad2793961da2b2af889e8985f49540f1bda 100644
--- a/paddle/gserver/layers/LstmCompute.h
+++ b/paddle/gserver/layers/LstmCompute.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include "ModelConfig.pb.h"
 #include "hl_gpu.h"
-#include "paddle/utils/common.h"
+#include "paddle/utils/Common.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/layers/MultinomialSampler.h b/paddle/gserver/layers/MultinomialSampler.h
index b48073c80b6f57cd86ceb80b9d749548c3acc1ac..546ef9c1f24d1bc8abe68ba8b2fe6ab55f4b03e5 100644
--- a/paddle/gserver/layers/MultinomialSampler.h
+++ b/paddle/gserver/layers/MultinomialSampler.h
@@ -16,7 +16,7 @@ limitations under the License. */
 
 #include <memory>
 #include <random>
-#include "paddle/utils/common.h"
+#include "paddle/utils/Common.h"
 
 namespace paddle {
 
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index c26a2a7f06bc16c113f1812868b5d2b8a5060635..0caa5e1e11e6d42fadfa87149814c4b77b3b6271 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -2,8 +2,7 @@
 
 ################### test_ProtoDataProvider ############
 add_unittest_without_exec(test_ProtoDataProvider
-    test_ProtoDataProvider.cpp
-    TestUtil.cpp)
+    test_ProtoDataProvider.cpp)
 
 # test_ProtoDataProvider will mkdir as same name,
 # so if WORKING_DIRECTORY is default directory, then
@@ -15,53 +14,46 @@ add_test(NAME test_ProtoDataProvider
 ################# test_LayerGrad #######################
 add_unittest_without_exec(test_LayerGrad
     test_LayerGrad.cpp
-    LayerGradUtil.cpp
-    TestUtil.cpp)
+    LayerGradUtil.cpp)
 add_test(NAME test_LayerGrad
     COMMAND test_LayerGrad)
 
 add_unittest_without_exec(test_ActivationGrad
     test_ActivationGrad.cpp
-    LayerGradUtil.cpp
-    TestUtil.cpp)
+    LayerGradUtil.cpp)
 add_test(NAME test_ActivationGrad
     COMMAND test_ActivationGrad)
 ################# test_ConvTrans #######################
 add_unittest_without_exec(test_ConvTrans
     test_ConvTrans.cpp
-    LayerGradUtil.cpp
-    TestUtil.cpp)
+    LayerGradUtil.cpp)
 
 add_test(NAME test_ConvTrans
     COMMAND test_ConvTrans)
 ################# test_PriorBox #######################
 add_unittest_without_exec(test_PriorBox
     test_PriorBox.cpp
-    LayerGradUtil.cpp
-    TestUtil.cpp)
+    LayerGradUtil.cpp)
 
 add_test(NAME test_PriorBox
     COMMAND test_PriorBox)
 ################# test_ConvUnify #######################
 add_unittest_without_exec(test_ConvUnify
     test_ConvUnify.cpp
-    LayerGradUtil.cpp
-    TestUtil.cpp)
+    LayerGradUtil.cpp)
     
 add_test(NAME test_ConvUnify
     COMMAND test_ConvUnify)
 ################# test_BatchNorm #######################
 add_unittest_without_exec(test_BatchNorm
     test_BatchNorm.cpp
-    LayerGradUtil.cpp
-    TestUtil.cpp)
+    LayerGradUtil.cpp)
 
 add_test(NAME test_BatchNorm
     COMMAND test_BatchNorm)
 ################## test_Evaluator #######################
 add_unittest(test_Evaluator
-    test_Evaluator.cpp
-    TestUtil.cpp)
+    test_Evaluator.cpp)
 
 ################ test_LinearChainCRF ####################
 add_simple_unittest(test_LinearChainCRF)
@@ -72,8 +64,7 @@ add_simple_unittest(test_MultinomialSampler)
 ############## test_PyDataProvider ########################
 if(WITH_PYTHON)
     add_unittest_without_exec(test_PyDataProvider
-        test_PyDataProvider.cpp
-        TestUtil.cpp)
+        test_PyDataProvider.cpp)
 
     add_test(NAME test_PyDataProvider
         COMMAND .set_python_path.sh -d ./gserver/tests:${PROJ_ROOT}/python/ ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider
@@ -81,18 +72,15 @@ if(WITH_PYTHON)
 endif()
 
 ############### test_RecurrentLayer #######################
-add_unittest(test_RecurrentLayer
-    test_RecurrentLayer.cpp
-    TestUtil.cpp)
+add_simple_unittest(test_RecurrentLayer)
 
 ############### test_WarpCTCLayer #######################
 if(NOT WITH_DOUBLE)
     add_unittest_without_exec(test_WarpCTCLayer
-        test_WarpCTCLayer.cpp
-        TestUtil.cpp)
+        test_WarpCTCLayer.cpp)
 
     add_test(NAME test_WarpCTCLayer
-        COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${PROJ_ROOT}/warp-ctc/build
+        COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR}
         WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
 endif()
 
@@ -108,8 +96,7 @@ add_test(NAME test_RecurrentGradientMachine
     WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
 
 add_unittest_without_exec(test_NetworkCompare
-    test_NetworkCompare.cpp
-    TestUtil.cpp)
+    test_NetworkCompare.cpp)
 if(WITH_GPU)
     add_test(NAME test_NetworkCompare
         COMMAND .set_python_path.sh -d ${PROJ_ROOT}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=true
diff --git a/paddle/gserver/tests/LayerGradUtil.cpp b/paddle/gserver/tests/LayerGradUtil.cpp
index 57c176810fddf96828c210807673b7d1a3c739c0..ae016e74eaa84f7c43a30c09c8c4577e25360c4e 100644
--- a/paddle/gserver/tests/LayerGradUtil.cpp
+++ b/paddle/gserver/tests/LayerGradUtil.cpp
@@ -310,7 +310,7 @@ void initDataLayer(TestConfig testConf,
         testConf.inputDefs[i].labelSeqStartPositions;
     if (labelSeqStartPositions.size() != 0) {
       CHECK(!sequenceStartPositions);
-      CHECK_GE(labelSeqStartPositions.size(), 2);
+      CHECK_GE(static_cast<int>(labelSeqStartPositions.size()), 2);
 
       sequenceStartPositions =
           ICpuGpuVector::create(labelSeqStartPositions.size(), useGpu);
diff --git a/paddle/gserver/tests/LayerGradUtil.h b/paddle/gserver/tests/LayerGradUtil.h
index 4e88ac0e81ef2596f14995be53f7c5c20ddba2d7..9f68eb64d0b4ad27306d3b20387d74a7e438d910 100644
--- a/paddle/gserver/tests/LayerGradUtil.h
+++ b/paddle/gserver/tests/LayerGradUtil.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/gserver/layers/DataLayer.h"
 #include "paddle/trainer/Trainer.h"
 
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 using namespace std;  // NOLINT
 
 namespace paddle {
diff --git a/paddle/gserver/tests/test_ActivationGrad.cpp b/paddle/gserver/tests/test_ActivationGrad.cpp
index 7d7e68da5c5a9dbcba024002a988f26f7613b724..b201ba8a5a4146ab28cd96454f434f889d72a968 100644
--- a/paddle/gserver/tests/test_ActivationGrad.cpp
+++ b/paddle/gserver/tests/test_ActivationGrad.cpp
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/trainer/Trainer.h"
 
 #include "LayerGradUtil.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/gserver/tests/test_BatchNorm.cpp b/paddle/gserver/tests/test_BatchNorm.cpp
index 7f5fcb670b70aed9f0a04180d344556a0390122f..d07299bfe3c4147742384a45dc6f1698d9c382f4 100644
--- a/paddle/gserver/tests/test_BatchNorm.cpp
+++ b/paddle/gserver/tests/test_BatchNorm.cpp
@@ -22,7 +22,7 @@ limitations under the License. */
 #include "paddle/utils/GlobalConstants.h"
 
 #include "LayerGradUtil.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
@@ -114,8 +114,8 @@ TEST(Layer, batchNorm) {
   bnLayer->forward(PASS_GC);
   convLayer->forward(PASS_GC);
 
-  CHECK_EQ(convLayer->getOutputValue()->getHeight(), 100);
-  CHECK_EQ(convLayer->getOutputValue()->getWidth(), 576);
+  CHECK_EQ(static_cast<int>(convLayer->getOutputValue()->getHeight()), 100);
+  CHECK_EQ(static_cast<int>(convLayer->getOutputValue()->getWidth()), 576);
 }
 
 int main(int argc, char** argv) {
diff --git a/paddle/gserver/tests/test_ConvTrans.cpp b/paddle/gserver/tests/test_ConvTrans.cpp
index dd3378304b433c135881310eb89273b6bf492af2..40bb1e2d73c81280a9b12114c13de851285c276b 100644
--- a/paddle/gserver/tests/test_ConvTrans.cpp
+++ b/paddle/gserver/tests/test_ConvTrans.cpp
@@ -23,7 +23,7 @@ limitations under the License. */
 #include "paddle/utils/GlobalConstants.h"
 
 #include "LayerGradUtil.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/gserver/tests/test_ConvUnify.cpp b/paddle/gserver/tests/test_ConvUnify.cpp
index ad99b50245cf56eb7db227fa582f6e3f41b47a7a..207fc0566fcf4a0d2e971f3c169a14a64146155b 100644
--- a/paddle/gserver/tests/test_ConvUnify.cpp
+++ b/paddle/gserver/tests/test_ConvUnify.cpp
@@ -23,7 +23,7 @@ limitations under the License. */
 #include "paddle/utils/GlobalConstants.h"
 
 #include "LayerGradUtil.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/gserver/tests/test_Evaluator.cpp b/paddle/gserver/tests/test_Evaluator.cpp
index e07066dad84aa6326c2447fc5ee80fa496735fbf..8165eb8269336193858962edac4f9637c2fc1c2f 100644
--- a/paddle/gserver/tests/test_Evaluator.cpp
+++ b/paddle/gserver/tests/test_Evaluator.cpp
@@ -15,7 +15,7 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include <vector>
 #include "ModelConfig.pb.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 #include "paddle/trainer/Trainer.h"
 
 using namespace paddle;  // NOLINT
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index 2cc25f6b211e367fc82c07c30082c3e12c04e53d..66a70ecd41091b9590038dab3194dd2a0c59dd03 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -21,7 +21,7 @@ limitations under the License. */
 #include "paddle/trainer/Trainer.h"
 
 #include "LayerGradUtil.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/gserver/tests/test_NetworkCompare.cpp b/paddle/gserver/tests/test_NetworkCompare.cpp
index 0d261059555c971cd509e64802d6c70abc9d2fef..4db30f37a5bc92d4348caed0aebdd8a589b55712 100644
--- a/paddle/gserver/tests/test_NetworkCompare.cpp
+++ b/paddle/gserver/tests/test_NetworkCompare.cpp
@@ -18,7 +18,7 @@ limitations under the License. */
 #include <algorithm>
 #include <cstdlib>
 
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 #include "paddle/trainer/Trainer.h"
 #include "paddle/utils/Stat.h"
 
diff --git a/paddle/gserver/tests/test_PriorBox.cpp b/paddle/gserver/tests/test_PriorBox.cpp
index a6d6a242696633e66a05bf9fc9eee81a468ed056..ae0e3bc3d24c54eb84c7b5f5053e629607ef4310 100644
--- a/paddle/gserver/tests/test_PriorBox.cpp
+++ b/paddle/gserver/tests/test_PriorBox.cpp
@@ -17,7 +17,7 @@ limitations under the License. */
 #include <vector>
 
 #include "LayerGradUtil.h"
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/gserver/tests/test_ProtoDataProvider.cpp b/paddle/gserver/tests/test_ProtoDataProvider.cpp
index 8fc0aaab69548ae60100696db04d5611570df110..e11bf402c27898b8fdbd3fceeb8aeff8906352db 100644
--- a/paddle/gserver/tests/test_ProtoDataProvider.cpp
+++ b/paddle/gserver/tests/test_ProtoDataProvider.cpp
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/gserver/dataproviders/ProtoDataProvider.h"
 #include "paddle/utils/Util.h"
 
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace std;  // NOLINT
 
diff --git a/paddle/gserver/tests/test_PyDataProvider.cpp b/paddle/gserver/tests/test_PyDataProvider.cpp
index 0f264ecf91837f6681f0577b93be7e35be268c04..db883543c306c1938eb9da188ce20ed768018efb 100644
--- a/paddle/gserver/tests/test_PyDataProvider.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider.cpp
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/gserver/dataproviders/PyDataProvider.h"
 #include "paddle/utils/Util.h"
 
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace std;     // NOLINT
 using namespace paddle;  // NOLINT
diff --git a/paddle/gserver/tests/test_PyDataProvider2.cpp b/paddle/gserver/tests/test_PyDataProvider2.cpp
index 5f8bc5ecd0f77efc6dcda0330f124ca6cab7f277..7e193eb31a03e6a6b8b0b02e89608a0e02b9e248 100644
--- a/paddle/gserver/tests/test_PyDataProvider2.cpp
+++ b/paddle/gserver/tests/test_PyDataProvider2.cpp
@@ -293,7 +293,7 @@ TEST(PyDataProvider2, can_over_batch_size) {
   while (true) {
     int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
     if (realBatchSize) {
-      CHECK_LE(realBatchSize, batchSize);
+      CHECK_LE(static_cast<size_t>(realBatchSize), batchSize);
     } else {
       break;
     }
diff --git a/paddle/gserver/tests/test_RecurrentLayer.cpp b/paddle/gserver/tests/test_RecurrentLayer.cpp
index f91c788863b6963df92b735dbfef2bacee1fff45..16ab0e6aecb6a895b20389992a44dc542eb3b00a 100644
--- a/paddle/gserver/tests/test_RecurrentLayer.cpp
+++ b/paddle/gserver/tests/test_RecurrentLayer.cpp
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/gserver/layers/DataLayer.h"
 #include "paddle/gserver/layers/Layer.h"
 
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp
index dab6366588b7894a6700c00a5331d436ca2a410c..23ae95852e84216c9065f1b123d35ce868fbb90f 100644
--- a/paddle/gserver/tests/test_WarpCTCLayer.cpp
+++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/gserver/layers/Layer.h"
 #include "paddle/gserver/layers/WarpCTCLayer.h"
 
-#include "TestUtil.h"
+#include "paddle/testing/TestUtil.h"
 
 using namespace paddle;  // NOLINT
 using namespace std;     // NOLINT
diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h
index 8f9bc9e823eb8062535920361899ce3cc06ec3a7..8691c87ac3b88499a9676d59af533e0f4713dfc3 100644
--- a/paddle/math/BaseMatrix.h
+++ b/paddle/math/BaseMatrix.h
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <stdint.h>
 #include <cstddef>
 #include "TensorExpression.h"
-#include "paddle/utils/common.h"
+#include "paddle/utils/Common.h"
 
 namespace paddle {
 
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index 4865a081a5aaa010d5b3ce0127ffc6f8330d4a68..ceac0212d25a53ca77403b57aa66d2607ed41c5a 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -26,8 +26,8 @@ limitations under the License. */
 #include "BaseMatrix.h"
 #include "MemoryHandle.h"
 #include "Vector.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/ThreadLocal.h"
-#include "paddle/utils/common.h"
 
 namespace paddle {
 
diff --git a/paddle/math/TensorExpression.h b/paddle/math/TensorExpression.h
index f3d60e400380f7d7d645559318837b0d7706661d..6fd60e7f3c65ea8e31fd1aaaa61b6ad8956ff1cd 100644
--- a/paddle/math/TensorExpression.h
+++ b/paddle/math/TensorExpression.h
@@ -16,8 +16,8 @@ limitations under the License. */
 #include <stdint.h>
 #include <cstddef>
 #include "hl_tensor_ops.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Logging.h"
-#include "paddle/utils/common.h"
 
 namespace paddle {
 
diff --git a/paddle/math/Vector.h b/paddle/math/Vector.h
index b4347a70f874a2a1bf933bbea4d1b15385f36090..9af6e30c9e13895ad95653a787ec1c1ad77a248f 100644
--- a/paddle/math/Vector.h
+++ b/paddle/math/Vector.h
@@ -21,8 +21,8 @@ limitations under the License. */
 
 #include "BaseMatrix.h"
 #include "MemoryHandle.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Thread.h"
-#include "paddle/utils/common.h"
 
 namespace paddle {
 
diff --git a/paddle/math/tests/CMakeLists.txt b/paddle/math/tests/CMakeLists.txt
index a3ea078509704f305672d0b02d272de0f6c97f51..06fc10bae7232fb1278e89e8d9cbdf477fc27b60 100644
--- a/paddle/math/tests/CMakeLists.txt
+++ b/paddle/math/tests/CMakeLists.txt
@@ -7,8 +7,6 @@ add_simple_unittest(test_SparseMatrix)
 
-# TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference.
 add_unittest(test_matrixCompare
-    test_matrixCompare.cpp
-    ../../gserver/tests/TestUtil.cpp)
+    test_matrixCompare.cpp)
 
 add_simple_unittest(test_sparseMatrixCompare)
 add_simple_unittest(test_perturbation)
diff --git a/paddle/math/tests/test_FPException.cpp b/paddle/math/tests/test_FPException.cpp
index 6aa5891bce922c00cbb4f69a511fb3c42d53f319..3836f7fc0fe577c463c9a476d49b21f2967043e5 100644
--- a/paddle/math/tests/test_FPException.cpp
+++ b/paddle/math/tests/test_FPException.cpp
@@ -28,10 +28,10 @@ limitations under the License. */
  * so we can add some tricks to prevent exp calculate an excessive value.
  *
  */
-#include <fenv.h>
+
 #include <gtest/gtest.h>
 #include "paddle/math/Matrix.h"
-#include "paddle/utils/Excepts.h"
+#include "paddle/utils/Common.h"
 
 using namespace paddle;  // NOLINT
 
diff --git a/paddle/math/tests/test_GpuProfiler.cpp b/paddle/math/tests/test_GpuProfiler.cpp
index d490078d909e7940e83a6f461f9386eeda02f53c..e6b5dba446b5a0022ade76b188895c4e0e2a22b4 100644
--- a/paddle/math/tests/test_GpuProfiler.cpp
+++ b/paddle/math/tests/test_GpuProfiler.cpp
@@ -15,9 +15,9 @@ limitations under the License. */
 #ifndef PADDLE_ONLY_CPU
 
 #include <gtest/gtest.h>
-#include "paddle/gserver/tests/TestUtil.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"
+#include "paddle/testing/TestUtil.h"
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"
 
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index 98d63438a57b48340bc3b05ac7ac3d6c5cd90fb0..3a780d26c050ac5870824f2ef35c87edc61900a2 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -18,9 +18,9 @@ limitations under the License. */
 
 #include <gtest/gtest.h>
 #include "TensorCheck.h"
-#include "paddle/gserver/tests/TestUtil.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/SparseMatrix.h"
+#include "paddle/testing/TestUtil.h"
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/Util.h"
 
diff --git a/paddle/parameter/ParallelParameter.h b/paddle/parameter/ParallelParameter.h
index 1ee220d2dc1a26b3f394ca673975cc827f450206..2e7c18b8084dc25b9f2f7630390bb4553ac703c9 100644
--- a/paddle/parameter/ParallelParameter.h
+++ b/paddle/parameter/ParallelParameter.h
@@ -26,9 +26,9 @@ limitations under the License. */
 #include "paddle/math/Vector.h"
 #include "paddle/parameter/Parameter.h"
 #include "paddle/parameter/ParameterUpdateFunctions.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Flags.h"
 #include "paddle/utils/Locks.h"
-#include "paddle/utils/common.h"
 
 #include "ParameterConfig.pb.h"
 
diff --git a/paddle/parameter/Parameter.h b/paddle/parameter/Parameter.h
index e05137b315f254795de26a5ff0ac977e7968f4d8..72c8336799133ad3f5855b0c1aa06639179ff70a 100644
--- a/paddle/parameter/Parameter.h
+++ b/paddle/parameter/Parameter.h
@@ -26,11 +26,11 @@ limitations under the License. */
 #include "ParameterUpdaterHook.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/math/Vector.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/Locks.h"
 #include "paddle/utils/ThreadLocal.h"
 #include "paddle/utils/Util.h"
-#include "paddle/utils/common.h"
 
 namespace paddle {
 
diff --git a/paddle/parameter/ParameterUpdateFunctions.h b/paddle/parameter/ParameterUpdateFunctions.h
index 2cb379871716ffd9e75eede607276b6b3f200e6b..0fca280149c30f0241ec988dfd6719a5519808f4 100644
--- a/paddle/parameter/ParameterUpdateFunctions.h
+++ b/paddle/parameter/ParameterUpdateFunctions.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/math/Vector.h"
-#include "paddle/utils/common.h"
+#include "paddle/utils/Common.h"
 
 namespace paddle {
 
diff --git a/paddle/pserver/BaseClient.h b/paddle/pserver/BaseClient.h
index ccf05ae1ca3ab76fbe9d36237969207768de4dd2..11d7a147bf749ba2de0772b5efd5f73ab0ccdb1a 100644
--- a/paddle/pserver/BaseClient.h
+++ b/paddle/pserver/BaseClient.h
@@ -17,8 +17,8 @@ limitations under the License. */
 #include "ParameterService.pb.h"
 #include "paddle/math/Matrix.h"
 #include "paddle/pserver/ProtoServer.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Queue.h"
-#include "paddle/utils/common.h"
 
 namespace paddle {
 
diff --git a/paddle/pserver/ParameterClient2.h b/paddle/pserver/ParameterClient2.h
index 70cfc6d70072f399ef97eef1a0e6111a127cbd9f..89b3ddd502151e537b81bdbb09f171dd6e13ba26 100644
--- a/paddle/pserver/ParameterClient2.h
+++ b/paddle/pserver/ParameterClient2.h
@@ -23,11 +23,11 @@ limitations under the License. */
 #include "paddle/math/Vector.h"
 #include "paddle/parameter/Parameter.h"
 #include "paddle/pserver/BaseClient.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Flags.h"
 #include "paddle/utils/Locks.h"
 #include "paddle/utils/Queue.h"
 #include "paddle/utils/Util.h"
-#include "paddle/utils/common.h"
 
 #include "ParameterService.pb.h"
 
diff --git a/paddle/pserver/ParameterServer2.h b/paddle/pserver/ParameterServer2.h
index 79d1eb97ff149f4f5ca9a924c1b0b7ba629f1e33..0f5a5895907b20a0cf882b6fa6fb74bd52dce058 100644
--- a/paddle/pserver/ParameterServer2.h
+++ b/paddle/pserver/ParameterServer2.h
@@ -29,10 +29,10 @@ limitations under the License. */
 #include "paddle/math/Vector.h"
 #include "paddle/parameter/Parameter.h"
 #include "paddle/parameter/ParameterOptimizer.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/Locks.h"
 #include "paddle/utils/Stat.h"
 #include "paddle/utils/ThreadLocal.h"
-#include "paddle/utils/common.h"
 
 #include "ParameterService.pb.h"
 
diff --git a/paddle/scripts/docker/Dockerfile b/paddle/scripts/docker/Dockerfile
index b01de499bd1fbcfff1f655535f574ae2caa17707..1522be023f6de32f86fc8a367867bbe2f1c9aeb6 100644
--- a/paddle/scripts/docker/Dockerfile
+++ b/paddle/scripts/docker/Dockerfile
@@ -15,7 +15,7 @@ RUN apt-get update \
     && apt-get clean -y
 RUN cd /usr/src/gtest && cmake . && make && cp *.a /usr/lib
 RUN pip install -U BeautifulSoup docopt PyYAML pillow \
-    sphinx sphinx_rtd_theme recommonmark
+    sphinx sphinx_rtd_theme recommonmark jupyter
 
 ARG WITH_AVX
 ARG WITH_DOC
@@ -43,4 +43,13 @@ RUN echo 'root:root' | chpasswd
 RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
 RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
 EXPOSE 22
-CMD ["/usr/sbin/sshd", "-D"]
+
+# Jupyter Notebook directory.
+RUN mkdir /notes/
+WORKDIR "/notes"
+EXPOSE 8888
+
+RUN mkdir -p /opt/bin
+COPY ./paddle/scripts/docker/entrypoint /opt/bin/
+
+CMD ["/opt/bin/entrypoint"]
diff --git a/paddle/scripts/docker/Dockerfile.gpu b/paddle/scripts/docker/Dockerfile.gpu
index a68cc79b84271c63d41a89494150381d96748b67..09f07043e2172319de257cc952fb81ba53ce89a5 100644
--- a/paddle/scripts/docker/Dockerfile.gpu
+++ b/paddle/scripts/docker/Dockerfile.gpu
@@ -15,7 +15,7 @@ RUN apt-get update \
     && apt-get clean -y
 RUN cd /usr/src/gtest && cmake . && make && cp *.a /usr/lib
 RUN pip install -U BeautifulSoup docopt PyYAML pillow \
-    sphinx sphinx_rtd_theme recommonmark
+    sphinx sphinx_rtd_theme recommonmark jupyter
 
 ARG WITH_AVX
 ARG WITH_DOC
@@ -43,4 +43,13 @@ RUN echo 'root:root' | chpasswd
 RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
 RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
 EXPOSE 22
-CMD ["/usr/sbin/sshd", "-D"]
+
+# Jupyter Notebook directory.
+RUN mkdir /notes/
+WORKDIR "/notes"
+EXPOSE 8888
+
+RUN mkdir -p /opt/bin
+COPY ./paddle/scripts/docker/entrypoint /opt/bin/
+
+CMD ["/opt/bin/entrypoint"]
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index ca3f1c3f1896feaae657f47c121ce6cd858dc2c9..7edba3dd09cdc594383597ac7cf7913d50e9f6e1 100755
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -43,5 +43,7 @@ cp -rv /woboq/data $WOBOQ_OUT/../data
     -o $WOBOQ_OUT \
     -p paddle:/paddle
 /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
-
+cd /woboq
+make clean
+rm -rf /paddle/build
 trap : 0
diff --git a/paddle/scripts/docker/entrypoint b/paddle/scripts/docker/entrypoint
new file mode 100755
index 0000000000000000000000000000000000000000..87083467f50acd689ce57b86951f5f7a03c6a58b
--- /dev/null
+++ b/paddle/scripts/docker/entrypoint
@@ -0,0 +1,8 @@
+#!/bin/bash
+LOG=/var/log/all
+
+touch $LOG
+
+/usr/sbin/sshd -D >> $LOG &
+jupyter notebook --ip=0.0.0.0 /notes/ >> $LOG &
+tail -f $LOG
diff --git a/paddle/scripts/travis/before_install.linux.sh b/paddle/scripts/travis/before_install.linux.sh
deleted file mode 100755
index 9620bff6bcf77c6e87f149e8e33408170dd8e507..0000000000000000000000000000000000000000
--- a/paddle/scripts/travis/before_install.linux.sh
+++ /dev/null
@@ -1,18 +0,0 @@
-#!/bin/bash
-set -e
-pip install protobuf
-cd /tmp
-wget https://github.com/google/protobuf/archive/v3.0.2.tar.gz -O protobuf.tar.gz
-tar xf protobuf.tar.gz
-cd protobuf*
-./autogen.sh
-./configure --prefix=/usr/
-make -j 2 install
-cd ..
-rm -rf protobuf*
-
-pushd /usr/src/gtest
-cmake .
-make
-sudo cp *.a /usr/lib
-popd
diff --git a/paddle/scripts/travis/before_install.osx.sh b/paddle/scripts/travis/before_install.osx.sh
index bd88ed39132f19ca7cfc4f0dd6acdbc6b83e94ab..fd113d313e3140ad11460c1c288927b08fea88c4 100755
--- a/paddle/scripts/travis/before_install.osx.sh
+++ b/paddle/scripts/travis/before_install.osx.sh
@@ -1,12 +1,4 @@
 #!/bin/bash
 brew update
 brew tap homebrew/science
-brew install python
-sudo pip install --upgrade protobuf
-brew install cmake python glog gflags openblas wget md5sha1sum protobuf
-
-wget https://github.com/google/googletest/archive/release-1.8.0.tar.gz -O gtest.tar.gz
-tar xf gtest.tar.gz
-cd googletest-release-1.8.0/
-cmake .
-make install
+brew install openblas md5sha1sum 
diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh
index 9caeb21beb15ee5281f9a6aefcfd59b94b91e48a..ffc48eae66aa615aab1ac007f8987ba6aba3ed8f 100755
--- a/paddle/scripts/travis/build_and_test.sh
+++ b/paddle/scripts/travis/build_and_test.sh
@@ -1,27 +1,26 @@
 #!/bin/bash
-./build_submodules.sh
 source ./common.sh
-CMAKE_EXTRA=""
+
+python -c 'import pip; print(pip.pep425tags.get_supported())'
+
 if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
-  CMAKE_EXTRA="-DPYTHON_LIBRARY=/usr/local/Cellar/python/2.7.12_1/Frameworks/Python.framework/Versions/2.7/lib/python2.7/config/libpython2.7.dylib"
+  CMAKE_EXTRA="-DWITH_SWIG_PY=OFF"
 else
   CMAKE_EXTRA="-DWITH_SWIG_PY=ON"
 fi
 
-
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_TESTING=ON -DON_TRAVIS=ON -DON_COVERALLS=ON ${CMAKE_EXTRA}
+cmake .. -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_TESTING=ON -DON_TRAVIS=ON -DON_COVERALLS=ON ${CMAKE_EXTRA}
 
 NPROC=1
 if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
-  NRPOC=`nproc`
+  NPROC=`nproc`
   make -j $NPROC
   make coveralls
+  sudo make install
 elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
   NPROC=`sysctl -n hw.ncpu`
   make -j $NPROC
   env CTEST_OUTPUT_ON_FAILURE=1 make test ARGS="-j $NPROC"
+  sudo make install
+  sudo paddle version
 fi
-
-
-sudo make install
-sudo paddle version
diff --git a/paddle/scripts/travis/build_submodules.sh b/paddle/scripts/travis/build_submodules.sh
deleted file mode 100755
index d458bf92bf455609de601c60402101d09765dfe4..0000000000000000000000000000000000000000
--- a/paddle/scripts/travis/build_submodules.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-set -e
-WORK_DIR=$PWD
-PROJ_ROOT=$(git rev-parse --show-cdup)
-SUBMODULES=$(grep path ${PROJ_ROOT}.gitmodules | sed 's/^.*path = //')
-
-for module in $SUBMODULES
-do
-  case $module in
-    "warp-ctc")
-      if [ -d ${PROJ_ROOT}warp-ctc/build ]; then
-        rm -rf ${PROJ_ROOT}warp-ctc/build
-      fi
-      mkdir ${PROJ_ROOT}warp-ctc/build
-      cd ${PROJ_ROOT}warp-ctc/build
-      cmake ..; make
-    ;;
-  esac
-done
-cd $WORK_DIR
diff --git a/paddle/setup.py.in b/paddle/setup.py.in
index 464ad632868bd1fd4d88547212421302ca0b2116..e3650bf1c0c4692a50e9731fcd8b832865eaac62 100644
--- a/paddle/setup.py.in
+++ b/paddle/setup.py.in
@@ -14,7 +14,9 @@
 
 # This file is used to build paddle python binding package.
 # It will be invoked by Makefile that generated by COMAKE
+
 from setuptools import setup, Extension
+
 import numpy as np
 import api.paddle_ld_flags
 import platform
diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt
index 584498c8602ee5faad3e21a8588af7bb802d7377..c47add04b081cbdf78b5a5d3bca3a71025b3d9ac 100644
--- a/paddle/testing/CMakeLists.txt
+++ b/paddle/testing/CMakeLists.txt
@@ -3,4 +3,6 @@
 if(WITH_TESTING)
   add_library(paddle_test_main STATIC TestMain.cpp)
   add_dependencies(paddle_test_main gen_proto_cpp)
+  add_library(paddle_test_util STATIC TestUtil.cpp)
+  add_dependencies(paddle_test_util gen_proto_cpp)
 endif()
diff --git a/paddle/gserver/tests/TestUtil.cpp b/paddle/testing/TestUtil.cpp
similarity index 100%
rename from paddle/gserver/tests/TestUtil.cpp
rename to paddle/testing/TestUtil.cpp
diff --git a/paddle/gserver/tests/TestUtil.h b/paddle/testing/TestUtil.h
similarity index 100%
rename from paddle/gserver/tests/TestUtil.h
rename to paddle/testing/TestUtil.h
diff --git a/paddle/trainer/Trainer.cpp b/paddle/trainer/Trainer.cpp
index 09e0a213ab2d71890cfafb905b5969383acfe95a..8465addaf9e03831e914be2c73901c3b1a9d537f 100644
--- a/paddle/trainer/Trainer.cpp
+++ b/paddle/trainer/Trainer.cpp
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #include "Trainer.h"
 
-#include <fenv.h>
 #include <stdio.h>
 
 #include <iomanip>
@@ -24,7 +23,7 @@ limitations under the License. */
 
 #include <google/protobuf/text_format.h>
 
-#include "paddle/utils/Excepts.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/GlobalConstants.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/Stat.h"
diff --git a/paddle/trainer/TrainerMain.cpp b/paddle/trainer/TrainerMain.cpp
index 947f9cadcc983d58ce31ef462e51dc42e41eaf1b..e2fbd21e14afa7c89b82999b08bf91c1de182906 100644
--- a/paddle/trainer/TrainerMain.cpp
+++ b/paddle/trainer/TrainerMain.cpp
@@ -12,9 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include <fenv.h>
 #include "paddle/pserver/ParameterServer2.h"
-#include "paddle/utils/Excepts.h"
+#include "paddle/utils/Common.h"
 #include "paddle/utils/PythonUtil.h"
 #include "paddle/utils/StringUtil.h"
 
diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt
index 28c3d6f2631f9e28e3f1ff086b1e8edf994e73a4..22e07bd0e98a4cd36e6ed5860bcff0d4ae7cb1d2 100644
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
@@ -17,9 +17,16 @@ add_test(NAME test_Compare
 ################# test_Trainer ###########################
 add_unittest_without_exec(test_Trainer
     test_Trainer.cpp)
-set(diy_dll_dir ${CMAKE_CURRENT_BINARY_DIR}/../../gserver/tests)
+# add_test() executes COMMAND directly rather than through a shell, so "&&"
+# cannot chain two programs. Generate the proto data in a separate test and
+# order test_Trainer after it.
+add_test(NAME test_Trainer_gen_proto_data
+  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/paddle/trainer/tests/gen_proto_data.py
+    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
 add_test(NAME test_Trainer
   COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
         ${CMAKE_CURRENT_BINARY_DIR}/test_Trainer
     WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
+set_tests_properties(test_Trainer PROPERTIES DEPENDS test_Trainer_gen_proto_data)
 
@@ -82,5 +89,5 @@ add_test(NAME test_PyDataProviderWrapper
 #################### test_config_parser #########################
 add_test(NAME test_config_parser
   COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        python ${PROJ_ROOT}/paddle/trainer/tests/config_parser_test.py
+        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/paddle/trainer/tests/config_parser_test.py
     WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
diff --git a/paddle/trainer/tests/test_Trainer.cpp b/paddle/trainer/tests/test_Trainer.cpp
index 371282dd6bb9a995bc6ae8b2a5bd708f831d7e33..264bc46ebcd0aa17fd605e537fcb2c316ef31162 100644
--- a/paddle/trainer/tests/test_Trainer.cpp
+++ b/paddle/trainer/tests/test_Trainer.cpp
@@ -96,11 +96,6 @@ TEST(checkGradient, multi) {
 TEST(checkGradient, hsigmoid) { checkGradientTest(configFile2, false, false); }
 
 TEST(checkGradient, chunk) {
-#if defined(__APPLE__) || defined(__OSX__)
-  EXPECT_EQ(0, system("python trainer/tests/gen_proto_data.py"));
-#else
-  EXPECT_EQ(0, system("python2 trainer/tests/gen_proto_data.py"));
-#endif
   checkGradientTest(configFile3, false, false);
 #ifndef PADDLE_ONLY_CPU
   checkGradientTest(configFile3, true, true);
diff --git a/paddle/utils/.gitignore b/paddle/utils/.gitignore
index f2cfd7409412de68f4183daebcb48e7a3ae37672..956b606a18cae1bb11322accfa174ae5ce1580de 100644
--- a/paddle/utils/.gitignore
+++ b/paddle/utils/.gitignore
@@ -1 +1,2 @@
 enable_virtualenv.c
+PythonUtil.cpp
diff --git a/paddle/utils/CMakeLists.txt b/paddle/utils/CMakeLists.txt
index 45240b5002aa18be4a9b7e3ec3b754eb83ca0e09..10d906ee16656a808122b81d8b2fef55b8e7b7e9 100644
--- a/paddle/utils/CMakeLists.txt
+++ b/paddle/utils/CMakeLists.txt
@@ -1,5 +1,10 @@
 # The utilities for paddle
 
+# Expand the @PYTHON_INSTALL_DIR@ placeholder in the PythonUtil.cpp template.
+# @ONLY restricts substitution to @VAR@ references so that any literal ${...}
+# text in the C++ source is copied through untouched.
+configure_file(PythonUtil.cpp.in ${PROJ_ROOT}/paddle/utils/PythonUtil.cpp @ONLY)
+
 file(GLOB UTIL_HEADERS . *.h)
 file(GLOB UTIL_SOURCES . *.cpp)
 create_resources(enable_virtualenv.py enable_virtualenv.c)
diff --git a/paddle/utils/common.h b/paddle/utils/Common.h
similarity index 97%
rename from paddle/utils/common.h
rename to paddle/utils/Common.h
index 202a9d980d8350c230daaf473dd34d4069479e5f..1f1d0255a5eaef824171ddeaf9480167f232007e 100644
--- a/paddle/utils/common.h
+++ b/paddle/utils/Common.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include "Excepts.h"
+
 /**
  * Disable copy macro.
  */
diff --git a/paddle/utils/CpuId.h b/paddle/utils/CpuId.h
index 1218e8194c4e837ca880744f92e769a68ba474de..0f3985cc7b2c018ede9bba9644d2d096561dccee 100644
--- a/paddle/utils/CpuId.h
+++ b/paddle/utils/CpuId.h
@@ -11,7 +11,7 @@ limitations under the License. */
 
 #pragma once
 
-#include "common.h"
+#include "Common.h"
 
 namespace paddle {
 
diff --git a/paddle/utils/Excepts.h b/paddle/utils/Excepts.h
index dc3369b7e8c27cf53a03ce56b18a123f291d2d6d..5c2c504f53a586f2991ccfae891991465fdb39b6 100644
--- a/paddle/utils/Excepts.h
+++ b/paddle/utils/Excepts.h
@@ -15,6 +15,8 @@ limitations under the License. */
 #ifndef EXCEPTS_H_
 #define EXCEPTS_H_
 
+#include <fenv.h>
+
 #if defined(__APPLE__) || defined(__OSX__)
 
 int fegetexcept(void);
diff --git a/paddle/utils/Locks.h b/paddle/utils/Locks.h
index a21872e89ebc172b87c8b5c3731a89302f34f521..e87abb9139f1c3f250f8b8fe1afdd8883f682647 100644
--- a/paddle/utils/Locks.h
+++ b/paddle/utils/Locks.h
@@ -19,7 +19,7 @@ limitations under the License. */
 #include <condition_variable>
 #include <mutex>
 
-#include "common.h"
+#include "Common.h"
 
 namespace paddle {
 
diff --git a/paddle/utils/PythonUtil.cpp b/paddle/utils/PythonUtil.cpp.in
similarity index 98%
rename from paddle/utils/PythonUtil.cpp
rename to paddle/utils/PythonUtil.cpp.in
index 7faeff55c28b9065179ad27b3b604a9f411249e5..e0caaf4cd6cf429e57ee221a0b0957a905b89973 100644
--- a/paddle/utils/PythonUtil.cpp
+++ b/paddle/utils/PythonUtil.cpp.in
@@ -195,6 +195,10 @@ extern const char enable_virtualenv_py[];
 }
 void initPython(int argc, char** argv) {
 #ifndef PADDLE_NO_PYTHON
+  // Py_SetPythonHome() keeps the pointer it is given instead of copying the
+  // string, so the buffer must have static storage duration.
+  static char PythonHome[] = "@PYTHON_INSTALL_DIR@";  // NOLINT
+  Py_SetPythonHome(PythonHome);
   Py_SetProgramName(argv[0]);
   Py_Initialize();
   PySys_SetArgv(argc, argv);
diff --git a/paddle/utils/Util.h b/paddle/utils/Util.h
index dc15ada5862d648af27aa1b0e8c8a5cce012ded8..613844669d2495ada7b8f7a841f47b821b7fdeba 100644
--- a/paddle/utils/Util.h
+++ b/paddle/utils/Util.h
@@ -26,9 +26,9 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 
+#include "Common.h"
 #include "Logging.h"
 #include "TrainerConfig.pb.h"
-#include "common.h"
 
 #include "Flags.h"
 #include "hl_gpu.h"
diff --git a/paddle/utils/Version.h b/paddle/utils/Version.h
index aa5df3243893145dbcc7e7ef2592555fc1c88fc9..f53d6420bbbdf66f8f355af95c6b11c30a3bfab9 100644
--- a/paddle/utils/Version.h
+++ b/paddle/utils/Version.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 #include <stddef.h>
 #include <iostream>
-#include "common.h"
+#include "Common.h"
 
 namespace paddle {
 
diff --git a/paddle/utils/Excepts.cpp b/paddle/utils/arch/osx/Excepts.cpp
similarity index 97%
rename from paddle/utils/Excepts.cpp
rename to paddle/utils/arch/osx/Excepts.cpp
index 4ddce35ed31a8fed3f25cb3b03348b4eda8fcfdd..c8e904d8f9fe29e51447994af43dc62bf3514306 100644
--- a/paddle/utils/Excepts.cpp
+++ b/paddle/utils/arch/osx/Excepts.cpp
@@ -12,12 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "Excepts.h"
+#include "paddle/utils/Excepts.h"
 
 #if defined(__APPLE__) || defined(__OSX__)
 
-#include <fenv.h>
-
 int fegetexcept(void) {
   static fenv_t fenv;
   return fegetenv(&fenv) ? -1 : (fenv.__control & FE_ALL_EXCEPT);
diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt
index 2c40070eca44d8656d7ce82157a1b840092b9965..e854b2b427e550ec491dacf931cc2d2cce7ba6c2 100644
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@@ -18,10 +18,10 @@ foreach(filename ${proto_filenames})
         ${PROTO_GEN}
         ${CUR_PROTO_GEN})
     add_custom_command(OUTPUT ${CUR_PROTO_GEN}
-        COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} 
+        COMMAND env ${py_env} ${PROTOBUF_PROTOC_EXECUTABLE}
                   --cpp_out ${CMAKE_CURRENT_BINARY_DIR}
-		  --proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
-        DEPENDS ${filename})
+                  --proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
+        DEPENDS ${filename} ${external_project_dependencies})
 
     set(CUR_PROTO_GEN_PY
         ${PROJ_ROOT}/paddle/python/paddle/proto/${base_filename}_pb2.py)
@@ -29,9 +29,9 @@ foreach(filename ${proto_filenames})
         ${CUR_PROTO_GEN_PY}
         ${PROTO_GEN_PY})
     add_custom_command(OUTPUT ${CUR_PROTO_GEN_PY}
-        COMMAND ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${PROJ_ROOT}/python/paddle/proto
-	--proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
-        DEPENDS ${filename})
+        COMMAND env ${py_env} ${PROTOBUF_PROTOC_EXECUTABLE} --python_out ${PROJ_ROOT}/python/paddle/proto
+                --proto_path ${PROJ_ROOT}/proto ${PROJ_ROOT}/proto/${filename}
+        DEPENDS ${filename} ${external_project_dependencies})
 endforeach()
 
 include_directories(${CMAKE_CURRENT_BINARY_DIR}/proto)
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index dce0b909524369926eda54763e571706b79daeaf..1cda4762eb2a55175d6c9faee98aaeaa1f763890 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -10,26 +10,17 @@ set(PY_FILES paddle/__init__.py
              ${HELPERS_PY_FILES}
              ${UTILS_PY_FILES})
 
-set(PADDLE_INTERNAL_PACKAGE "")
-if (PADDLE_WITH_INTERNAL)
-    set(PADDLE_INTERNAL_PACKAGE "paddle.internals")
-endif()
-
 configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
 
 add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
-    COMMAND ${PYTHON_EXECUTABLE} setup.py bdist_wheel
+    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
     COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp
-    DEPENDS gen_proto_py ${PY_FILES})
+    DEPENDS gen_proto_py ${PY_FILES} ${external_project_dependencies})
 
 add_custom_target(paddle_python ALL DEPENDS
     ${OUTPUT_DIR}/.timestamp)
 
-find_python_module(pip REQUIRED)
-find_python_module(wheel REQUIRED)
-find_python_module(google.protobuf REQUIRED)
-
 add_subdirectory(paddle/trainer_config_helpers/tests)
 
 install(DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/dist/
diff --git a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
index d1a9843d326669711bf3d0769df1b804cfcfa673..403aafabe9143472dd2f0857ecd25f7acf515b6c 100644
--- a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
+++ b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
@@ -1,12 +1,12 @@
 #################### test_config_parser #########################
 add_test(NAME layers_test
   COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        python ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/layers_test.py
+        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/layers_test.py
     WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
 
 add_test(NAME test_reset_hook
   COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        python ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
+        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
     WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
 
 if (PROTOBUF_3)
@@ -14,12 +14,12 @@ if (PROTOBUF_3)
     ProtobufEqualMain.cpp)
   add_test(NAME test_layerHelpers
     COMMAND
-    ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
+    ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
     ${CMAKE_CURRENT_BINARY_DIR}/protobuf_equal
   )
 else()
   add_test(NAME test_layerHelpers
     COMMAND
-    ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
+    ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
   )
 endif()
diff --git a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
index a54af94ce3db4ed300dee697b30516c3b6448d7c..ee5961af75ebb33af52f9add645f793015288f4e 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/generate_protostr.sh
@@ -10,13 +10,14 @@ protostr=$PWD/protostr
 for conf in ${configs[*]}
 do
     echo "Generating " $conf
-    python -m paddle.utils.dump_config $conf.py > $protostr/$conf.protostr.unittest
-    cat ${conf}.py |python test_config_parser_for_non_file_config.py > $protostr/$conf.protostr.non_file_config.unittest
+    # $1 is the Python interpreter to use; fall back to "python" when omitted.
+    "${1:-python}" -m paddle.utils.dump_config $conf.py > $protostr/$conf.protostr.unittest
+    cat ${conf}.py |"${1:-python}" test_config_parser_for_non_file_config.py > $protostr/$conf.protostr.non_file_config.unittest
 done
 
 for conf in ${whole_configs[*]}
 do
     echo "Generating " $conf
-    python -m paddle.utils.dump_config $conf.py "" --whole > $protostr/$conf.protostr.unittest
-    cat ${conf}.py |python test_config_parser_for_non_file_config.py --whole > $protostr/$conf.protostr.non_file_config.unittest
+    "${1:-python}" -m paddle.utils.dump_config $conf.py "" --whole > $protostr/$conf.protostr.unittest
+    cat ${conf}.py |"${1:-python}" test_config_parser_for_non_file_config.py --whole > $protostr/$conf.protostr.non_file_config.unittest
 done
diff --git a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
index e984ee70625456241b3cfe6202fdadaa3807d33c..a37eb6439e6d2803a417883f0aed2a5d56d059b9 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
@@ -7,7 +7,7 @@ protostr=`dirname $0`/protostr
 
 files=`ls $protostr | grep -v "unittest"`
 
-./generate_protostr.sh
+./generate_protostr.sh "$1"
 
 . ./file_list.sh
 
diff --git a/python/setup.py.in b/python/setup.py.in
index d2fb95f27ff2f0673050e699316dde504dbf28f6..b66a42e87c78701e9eb26b1b7dc8f46a95035a76 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -1,16 +1,11 @@
 from setuptools import setup
 
-INTERNAL_PACKAGE='${PADDLE_INTERNAL_PACKAGE}'
-
 packages=['paddle',
           'paddle.proto',
           'paddle.trainer',
           'paddle.trainer_config_helpers',
           'paddle.utils']
 
-if len(INTERNAL_PACKAGE) != 0:
-    packages.append(INTERNAL_PACKAGE)
-
 setup(name='paddle',
       version='${PADDLE_VERSION}',
       description='Parallel Distributed Deep Learning',
diff --git a/warp-ctc b/warp-ctc
deleted file mode 160000
index bd535c8d44e03c8ebd2d768e06c8c05fdccd11d2..0000000000000000000000000000000000000000
--- a/warp-ctc
+++ /dev/null
@@ -1 +0,0 @@
-Subproject commit bd535c8d44e03c8ebd2d768e06c8c05fdccd11d2