diff --git a/.dockerignore b/.dockerignore deleted file mode 120000 index 3e4e48b0b5fe6b468434d6767749b399319f2da2..0000000000000000000000000000000000000000 --- a/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -.gitignore \ No newline at end of file diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..2b2e74053d33cb6d2878fd3d6da48fa344172f63 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,15 @@ +*.DS_Store +build/ +*.user +.vscode +.idea +.project +.cproject +.pydevproject +Makefile +.test_env/ +third_party/ +*~ +bazel-* + +!build/*.deb diff --git a/.gitignore b/.gitignore index 6aae076a49012b032b8fc0f1dc02c2714fb7b4a3..ee7c6ec370cd7c1f3435b41d915e24023c456af7 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ build/ .project .cproject .pydevproject +.settings/ Makefile .test_env/ third_party/ diff --git a/.gitmodules b/.gitmodules deleted file mode 100644 index 832698b8bfaa5e2275cb5202fb42abb9f3afce75..0000000000000000000000000000000000000000 --- a/.gitmodules +++ /dev/null @@ -1,4 +0,0 @@ -[submodule "book"] - path = book - url = https://github.com/PaddlePaddle/book.git - branch = develop \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index fd233b3439cf6841f6caf9c57e0d93ab6836a549..f03cda950a927ff42b1241d4c2a07f0aa29fa43f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -39,7 +39,7 @@ option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF) option(WITH_TIMER "Compile PaddlePaddle with stats timer" OFF) option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler" OFF) option(WITH_DOC "Compile PaddlePaddle with documentation" OFF) -option(ON_COVERALLS "Compile PaddlePaddle with code coverage" OFF) +option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF) @@ -81,14 +81,21 @@ include_directories("${PROJ_ROOT}/paddle/cuda/include") include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto") set(EXTERNAL_LIBS - # have not include gtest here. 
${GFLAGS_LIBRARIES} ${GLOG_LIBRARIES} ${CBLAS_LIBRARIES} ${PROTOBUF_LIBRARY} ${ZLIB_LIBRARIES} + ${PYTHON_LIBRARIES} ) +if(WITH_GPU) + list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) + if(NOT WITH_DSO) + list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY}) + endif(NOT WITH_DSO) +endif(WITH_GPU) + add_subdirectory(proto) add_subdirectory(paddle) add_subdirectory(python) diff --git a/paddle/scripts/docker/Dockerfile.gpu b/Dockerfile similarity index 77% rename from paddle/scripts/docker/Dockerfile.gpu rename to Dockerfile index a687d490a3af365f68cc6218ecbef7f64e4a1af1..ccd43be668e7acb1a82bb88f5938755a5d3974d1 100644 --- a/paddle/scripts/docker/Dockerfile.gpu +++ b/Dockerfile @@ -1,27 +1,25 @@ +# An image for building Paddle binaries +# Use the CUDA devel base image for both CPU and GPU environments FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 MAINTAINER PaddlePaddle Authors -ARG DEBIAN_FRONTEND=noninteractive ARG UBUNTU_MIRROR -RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' +RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' # ENV variables ARG BUILD_WOBOQ -ARG BUILD_AND_INSTALL +ARG WITH_GPU ARG WITH_AVX ARG WITH_DOC ARG WITH_STYLE_CHECK ENV BUILD_WOBOQ=${BUILD_WOBOQ:-OFF} -ENV BUILD_AND_INSTALL=${BUILD_AND_INSTALL:-OFF} -ENV WITH_GPU=ON +ENV WITH_GPU=${WITH_GPU:-OFF} ENV WITH_AVX=${WITH_AVX:-ON} ENV WITH_DOC=${WITH_DOC:-OFF} ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} -ENV DOCKER_BUILD=TRUE ENV HOME /root - # Add bash enhancements COPY ./paddle/scripts/docker/root/ /root/ @@ -30,7 +28,7 @@ RUN apt-get update && \ apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \ apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \ apt-get install -y python-numpy python-matplotlib gcc g++ gfortran && \ - apt-get install -y automake locales clang-format-3.8 && \ + apt-get install -y automake locales clang-format-3.8 swig && \ apt-get clean -y # git credential to skip password typing @@ -50,10 +48,6 @@ RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \ cd cmake-3.4.1 && ./bootstrap && make -j `nproc` && make install && \ cd .. && rm -rf cmake-3.4.1 -COPY . /paddle/ -RUN cd /paddle/ && git submodule update --init --recursive -RUN /paddle/paddle/scripts/docker/build.sh - VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"] # Configure OpenSSH server. c.f.
https://docs.docker.com/engine/examples/running_ssh_service @@ -63,9 +57,5 @@ RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config EXPOSE 22 -# Jupyter Notebook: Paddle book -EXPOSE 8888 - -COPY ./paddle/scripts/docker/entrypoint /opt/bin/ - -CMD ["/opt/bin/entrypoint"] +# The development image runs the build script by default +CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"] diff --git a/README.md b/README.md index 8a8e15841586ae6a01bb93e94f6074189f556f5a..bcc24b84128df282a2e3f0bc62aafe1ffe172338 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,8 @@ [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) -[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/) -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/cn/index.html) +[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/develop/doc/) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/doc_cn/) [![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) @@ -59,36 +59,36 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl the capability of PaddlePaddle to make a huge impact for your product. ## Installation -Check out the [Install Guide](http://paddlepaddle.org/doc/build/) to install from -pre-built packages (**docker image**, **deb package**) or -directly build on **Linux** and **Mac OS X** from the source code. + +It is recommended to check out the +[Docker installation guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html) +before looking into the +[build from source guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html). ## Documentation -Both [English Docs](http://paddlepaddle.org/doc/) and [Chinese Docs](http://paddlepaddle.org/doc_cn/) are provided for our users and developers.
-- [Quick Start](http://paddlepaddle.org/doc/demo/quick_start/index_en)
- You can follow the quick start tutorial to learn how use PaddlePaddle - step-by-step. +We provide [English](http://www.paddlepaddle.org/develop/doc/) and +[Chinese](http://www.paddlepaddle.org/doc_cn/) documentation. + +- [Deep Learning 101](http://book.paddlepaddle.org/index.en.html) + + You might want to start from this online interactive book that can run in Jupyter Notebook. + +- [Distributed Training](http://www.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html) + + You can run distributed training jobs on MPI clusters. + +- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html) -- [Example and Demo](http://paddlepaddle.org/doc/demo/)
- We provide five demos, including: image classification, sentiment analysis, - sequence to sequence model, recommendation, semantic role labeling. + You can also run distributed training jobs on Kubernetes clusters. -- [Distributed Training](http://paddlepaddle.org/doc/cluster)
- This system supports training deep learning models on multiple machines - with data parallelism. +- [Python API](http://www.paddlepaddle.org/develop/doc/api/index_en.html) -- [Python API](http://paddlepaddle.org/doc/ui/)
- PaddlePaddle supports using either Python interface or C++ to build your - system. We also use SWIG to wrap C++ source code to create a user friendly - interface for Python. You can also use SWIG to create interface for your - favorite programming language. + Our new API enables much shorter programs. -- [How to Contribute](http://paddlepaddle.org/doc/build/contribute_to_paddle.html)
- We sincerely appreciate your interest and contributions. If you would like to - contribute, please read the contribution guide. +- [How to Contribute](http://www.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html) -- [Source Code Documents](http://paddlepaddle.org/doc/source/)
+ We appreciate your contributions! ## Ask Questions diff --git a/book b/book deleted file mode 160000 index 6e3875eb62533de1f2c1088a477719eb57b9732c..0000000000000000000000000000000000000000 --- a/book +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 6e3875eb62533de1f2c1088a477719eb57b9732c diff --git a/cmake/FindSphinx.cmake b/cmake/FindSphinx.cmake index 1c29cb22a31f1e41a6b5575837c6374175cfdea5..f74cd4ff8c9c2c52319b18ac37264167b3718eae 100644 --- a/cmake/FindSphinx.cmake +++ b/cmake/FindSphinx.cmake @@ -72,7 +72,7 @@ function( Sphinx_add_target target_name builder conf cache source destination ) ${source} ${destination} COMMENT "Generating sphinx documentation: ${builder}" - COMMAND cd ${destination} && ln -s ./index_*.html index.html + COMMAND cd ${destination} && ln -sf ./index_*.html index.html ) set_property( diff --git a/cmake/coveralls.cmake b/cmake/coveralls.cmake index 9be7643819efdde3f42e4d39b2849ecc17e0d9fb..ca1471cabb57c0795ee193493d2e60bb5bd9e1cc 100644 --- a/cmake/coveralls.cmake +++ b/cmake/coveralls.cmake @@ -61,7 +61,7 @@ function(code_coverage _COVERAGE_SRCS _COVERALLS_UPLOAD _CMAKE_SCRIPT_PATH) endif() endfunction() -if(ON_COVERALLS) +if(WITH_COVERAGE) set(CMAKE_BUILD_TYPE "Debug") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage") diff --git a/cmake/coverallsGcovJsons.cmake b/cmake/coverallsGcovJsons.cmake index ad9a10cb8616159b9e3aff445e698cb2edb92820..4641184fcf5273b884524d9b9444209ffb65e000 100644 --- a/cmake/coverallsGcovJsons.cmake +++ b/cmake/coverallsGcovJsons.cmake @@ -134,7 +134,7 @@ foreach(GCDA ${GCDA_FILES}) # If -p is not specified then the file is named only "the_file.c.gcov" # execute_process( - COMMAND "${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA} >/dev/null" + COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA} >/dev/null WORKING_DIRECTORY ${GCDA_DIR} ) endforeach() diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 21fd5f41b8ddc9271fb834f62a40cc1aa9261cef..c99b37b62d7f3c2caa1113db5485491c83d13e28 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -47,7 +47,7 @@ IF(NOT ${CBLAS_FOUND}) PREFIX ${CBLAS_SOURCES_DIR} INSTALL_DIR ${CBLAS_INSTALL_DIR} BUILD_IN_SOURCE 1 - BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} NO_SHARED=1 libs netlib + BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} DYNAMIC_ARCH=1 NO_SHARED=1 libs netlib INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 PREFIX= UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 446a7532c55bd3ca66662efe70db93551580b8cc..ad1426fd940c7b163668c33d41731fe75d89dd89 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -14,7 +14,8 @@ INCLUDE(ExternalProject) -FIND_PACKAGE(Protobuf 3.1) +set(PROTOBUF_VERSION 3.1) +FIND_PACKAGE(Protobuf ${PROTOBUF_VERSION}) IF(PROTOBUF_FOUND) EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION) diff --git a/cmake/util.cmake b/cmake/util.cmake index 24ad5c815ca20d9b6b317b1be4d2dc93a9e06fba..bacb64eb9ee65fffc824e4587a22fc432c092b19 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -71,21 +71,10 @@ function(link_paddle_exe TARGET_NAME) generate_rdma_links() endif() - if(WITH_METRIC) - if(WITH_GPU) - set(METRIC_LIBS paddle_metric_learning 
paddle_dserver_lib metric metric_cpu) - else() - set(METRIC_LIBS paddle_metric_learning paddle_dserver_lib metric_cpu) - endif() - else() - set(METRIC_LIBS "") - endif() - target_circle_link_libraries(${TARGET_NAME} ARCHIVE_START paddle_gserver paddle_function - ${METRIC_LIBS} ARCHIVE_END paddle_pserver paddle_trainer_lib @@ -95,33 +84,12 @@ function(link_paddle_exe TARGET_NAME) paddle_parameter paddle_proto paddle_cuda - ${METRIC_LIBS} ${EXTERNAL_LIBS} ${CMAKE_THREAD_LIBS_INIT} ${CMAKE_DL_LIBS} ${RDMA_LD_FLAGS} ${RDMA_LIBS}) - if(WITH_PYTHON) - target_link_libraries(${TARGET_NAME} - ${PYTHON_LIBRARIES} util) - endif() - - if(WITH_GPU) - target_link_libraries(${TARGET_NAME} ${CUDA_CUDART_LIBRARY}) - if(NOT WITH_DSO OR WITH_METRIC) - target_link_libraries(${TARGET_NAME} - ${CUDNN_LIBRARY} - ${CUDA_curand_LIBRARY}) - CUDA_ADD_CUBLAS_TO_TARGET(${TARGET_NAME}) - endif() - - check_library_exists(rt clock_gettime "time.h" HAVE_CLOCK_GETTIME ) - if(HAVE_CLOCK_GETTIME) - target_link_libraries(${TARGET_NAME} rt) - endif() - endif() - add_dependencies(${TARGET_NAME} ${external_project_dependencies}) endfunction() diff --git a/demo/introduction/api_train_v2.py b/demo/introduction/api_train_v2.py index 84125c3b4b621a128fd488ff7fa374a75f620bf1..1ba971b3688ce3dec078998df2c0b183a4e449f8 100644 --- a/demo/introduction/api_train_v2.py +++ b/demo/introduction/api_train_v2.py @@ -14,7 +14,7 @@ def main(): act=paddle.activation.Linear(), bias_attr=paddle.attr.Param(name='b')) y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1)) - cost = paddle.layer.regression_cost(input=y_predict, label=y) + cost = paddle.layer.mse_cost(input=y_predict, label=y) # create parameters parameters = paddle.parameters.create(cost) diff --git a/demo/introduction/trainer_config.py b/demo/introduction/trainer_config.py index ecafe955f9e5c1062168d5d7b6b4c639d6e72a99..651dfaa4b7b4873810a0b393655541a62d1a311b 100644 --- a/demo/introduction/trainer_config.py +++ b/demo/introduction/trainer_config.py @@ -34,5 +34,5 @@ y_predict = fc_layer( size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b')) -cost = regression_cost(input=y_predict, label=y) +cost = mse_cost(input=y_predict, label=y) outputs(cost) diff --git a/demo/recommendation/api_train_v2.py b/demo/recommendation/api_train_v2.py index 9b254933a1de60bf8d74517f0d52401d334703b7..f6a061799e3ac50236a68beedaf700dd6c698a05 100644 --- a/demo/recommendation/api_train_v2.py +++ b/demo/recommendation/api_train_v2.py @@ -61,7 +61,7 @@ def main(): inference = paddle.layer.cos_sim( a=usr_combined_features, b=mov_combined_features, size=1, scale=5) - cost = paddle.layer.regression_cost( + cost = paddle.layer.mse_cost( input=inference, label=paddle.layer.data( name='score', type=paddle.data_type.dense_vector(1))) diff --git a/demo/recommendation/trainer_config.py b/demo/recommendation/trainer_config.py index aabcd335253faf69c940024ac8098a54da030463..25f529d7d7c430f179107fb189ade34760ab309d 100755 --- a/demo/recommendation/trainer_config.py +++ b/demo/recommendation/trainer_config.py @@ -86,10 +86,7 @@ movie_feature = construct_feature("movie") user_feature = construct_feature("user") similarity = cos_sim(a=movie_feature, b=user_feature) if not is_predict: - outputs( - regression_cost( - input=similarity, label=data_layer( - 'rating', size=1))) + outputs(mse_cost(input=similarity, label=data_layer('rating', size=1))) define_py_data_sources2( 'data/train.list', diff --git a/doc/api/v1/trainer_config_helpers/layers.rst b/doc/api/v1/trainer_config_helpers/layers.rst index 
bbea823de4d870f8a4384b6a85ebb7e8182797fe..24389c2d8574dfda4bec9298776aa6b1aee51535 100644 --- a/doc/api/v1/trainer_config_helpers/layers.rst +++ b/doc/api/v1/trainer_config_helpers/layers.rst @@ -432,6 +432,12 @@ multi_binary_label_cross_entropy :members: multi_binary_label_cross_entropy :noindex: +mse_cost +--------- +.. automodule:: paddle.trainer_config_helpers.layers + :members: mse_cost + :noindex: + huber_cost ---------- .. automodule:: paddle.trainer_config_helpers.layers @@ -450,6 +456,12 @@ rank_cost :members: rank_cost :noindex: +sum_cost +--------- +.. automodule:: paddle.trainer_config_helpers.layers + :members: sum_cost + :noindex: + crf_layer ----------------- .. automodule:: paddle.trainer_config_helpers.layers @@ -486,12 +498,6 @@ hsigmoid :members: hsigmoid :noindex: -sum_cost ---------- -.. automodule:: paddle.trainer_config_helpers.layers - :members: sum_cost - :noindex: - Check Layer ============ diff --git a/doc/design/multi_language_interface/why_plain_c.md b/doc/design/multi_language_interface/why_plain_c.md new file mode 100644 index 0000000000000000000000000000000000000000..a3f41ca7b93de8a55d927c88812802ef12246182 --- /dev/null +++ b/doc/design/multi_language_interface/why_plain_c.md @@ -0,0 +1,119 @@ +# Paddle多语言接口实现 +## 背景 + +Paddle需要一个多语言接口,这个接口需要做到: + +* 有标准的,良好的文档 + * 例如Python可以使用[Sphinx](http://www.sphinx-doc.org/en/stable/)生成API文档,golang可以使用[GoDoc](https://godoc.org/golang.org/x/tools/cmd/godoc)生成文档。这都需要这个接口按照约定俗成的规则来注释完备。 +* 不同语言的接口适应不同语言的特性 + * 例如Java与Python的错误处理是直接抛出Exception,而对于golang错误处理应该使用返回值。 + +## 基本要求 + +Paddle的多语言接口实现包括以下几个方面: + +* 我们使用动态库来分发Paddle。在这个动态库中不嵌入任何其他语言的解释器,也不使用其他动态库。 +* 这个动态库使用C99标准的头文件导出一些函数,不使用/导出C++符号。 +* 不导出Paddle内部的结构体、类,仅仅使用`void*`指针作为类型的句柄(handler)。 +* 不使用SWIG这种代码生成器,而是手写多语言绑定。 + + +## 原因 + +### 使用动态库来分发Paddle + +* Paddle的链接方式比较复杂 + * 如果用户要把Paddle的静态库(libpaddle.a)链接到自己的程序里,得使用 `--whole-archive` (for GCC) 或者 `--force_load` (for Clang) 参数,来确保把 libpaddle.a 里所有的符号都写入自己的程序的二进制文件里。这是因为 Paddle 的源码里使用了[object factory design pattern](http://stackoverflow.com/a/1310326/724872)。 +* 编译型语言,例如C/C++使用静态库和动态库难度差不多。但是解释性语言,例如[Python](http://stackoverflow.com/questions/19560594/how-to-import-static-library-in-python)或者[Java](http://stackoverflow.com/questions/24493337/linking-static-library-with-jni),只能调用Paddle的动态库,否则得把Paddle静态库链接到解释器里。 + * 解释性语言实际运行的二进制是解释器本身,如果调用静态库只能将静态库与解释器链接。例如对于Java来说,便是将静态库加入JVM中。这对于通常的Java的开发者来说,是不常见的做法。 + +### 动态库中不嵌入任何其他语言的解释器 + +* 目前Paddle的进程模型是C++内部驱动Python解释器进行模型配置解析和数据读取 +* 我们最终的动态库中不嵌入Python或者其他任何语言的解释器。模型配置解析,数据读取均交由其他语言完成 + +现阶段Paddle有一个问题是,Paddle内嵌的Python解释器和外部使用的Python如果版本不同,会直接报错退出。 + +### Paddle动态库中,不引用其他动态库 + +* 即这个动态库是不依赖于其他任何文件的,可以在任何机器上执行的。 + +### 这个动态库使用C99标准的头文件导出一些函数,不使用/导出C++符号 + +* 由于C++编译器没有[名字修饰](https://en.wikipedia.org/wiki/Name_mangling#C.2B.2B)的规范,不同版本的编译器之间,对于同一段C++代码生成的符号可能不一致。而多语言接口需要直接读取生成的二进制(动态库),需要有稳定的导出符号。 +* C语言是有导出符号的标准的,并且在常见的平台上,都是ABI调用标准的。 +* 大多数语言都支持使用C语言API +* 使用C99而不使用C89,是因为C99支持[Fixed-width integer types](https://en.wikipedia.org/wiki/C_data_types#Fixed-width_integer_types)和[Boolean type](https://en.wikipedia.org/wiki/C_data_types#Boolean_type)。 +* 使用C99而不使用C11的原因是,[C11](https://en.wikipedia.org/wiki/C11_(C_standard_revision))并没有Paddle特别需要的特性,且C99相对于C11使用更加广泛。 + +### 不导出Paddle内部的结构体、类,仅仅使用`void*`指针作为类型的句柄(handler) + +* Paddle内部的类为C++书写,直接导出到C的接口比较困难。 +* 在C-API中使用`void*`来表示Paddle内部类。再在每一个API中自己检查类型。 + +在C的头文件 `paddle_matrix.h` 中: + +```C +typedef void* paddle_matrix; +typedef int paddle_error; + +extern "C" +paddle_error paddle_matrix_shape(paddle_matrix matrix, + uint64_t*
width, + uint64_t* height); +``` +而在CPP里面实现这个C的接口,文件 `paddle_matrix.cpp` + +```cpp +#include "paddle/math/matrix.hpp" +extern "C" +paddle_error paddle_matrix_shape(paddle_matrix matrix, + uint64_t *width, + uint64_t *height) { + auto m = (paddle::math::Matrix*)(matrix); + *width = m->width(); + *height = m->height(); + return 0; // 成功返回0 +} +``` + +其中`paddle/math/matrix.hpp`文件内容为: + +```cpp +namespace paddle { +namespace math { + +class Matrix { + //... +}; + +} // namespace math +} // namespace paddle +``` + +### 不使用SWIG这种代码生成器,而是手写多语言绑定 + +* [SWIG](http://www.swig.org/)是一个多语言接口的代码生成器。它的目标是使用C/C++写代码,SWIG直接读取C/C++的头文件,生成各种语言的绑定代码。 + * 对于多语言接口,SWIG需要写一个interface文件。这个文件具有独特的语法,学习成本高。且增加一个第三方语言,就需要对这个第三方语言增加一些定义。有的时候,interface文件的写法非常[tricky](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/api/Paddle.swig#L36)。社区贡献代码学习成本高。 + * SWIG暴露的接口保留了C++的接口样式,很难保证多语言代码风格的一致性。(函数命名,错误处理) + * 因为SWIG在第三方语言中暴露的函数名,类名和C++中完全一致。C++的命名风格并不能适应其他第三方语言。如果使用SWIG,我们需要在interface文件里将大量的`SomeCppClass`重命名成`some_python_class`,或者`SomeGoTypes`。 + * 对于不同语言,错误处理的方式也不尽相同。例如对于Java或者Python,最常见的错误处理方式是Exception,而对于Golang,错误处理方式是返回值。而SWIG只能简单地暴露C++接口,无法做到对于各种语言错误处理方式的适配。 + * 对于大多数语言,直接使用C语言的.h并不困难。例如Python的[cffi](https://cffi.readthedocs.io/en/latest/overview.html#simple-example-abi-level-in-line)或者[Cython](http://cython.org/), golang的[cgo](https://golang.org/cmd/cgo/)。 + * SWIG支持的语言或者解释器有局限。例如对于Python,使用SWIG只支持CPython解释器,而不支持PyPy解释器。 + + +## 原因列表 + +| 结论 | 对比 | 原因 | +|---| --- | --- | +| 使用动态库 | 不使用静态库 | 解释型语言只能调用动态库,Paddle静态库链接复杂 | +| 不嵌入其他语言解释器 | 不嵌入Python解释器 | Paddle C++目前嵌入Python解释器,会导致不同版本Python在一个进程里的bug | +| 不引用其他动态库 | | Paddle一个动态库可以在任何Linux系统上运行 | +| 使用C99做接口 | 不使用C++做接口 | C有标准的ABI,C99是目前C最广泛的使用标准,且C99支持bool类型和定长整数(uint64_t等)类型 | +| 使用void*作为类句柄 | 不显式地写出每个类具体包含什么| 实现简单,并且让接口脱离实现细节 | +| 手写多语言绑定 | 不使用SWIG | 使用SWIG需要多语言绑定的开发人员熟练掌握SWIG配置,社区参与困难。SWIG生成的代码不能保证多语言代码风格的一致性 | + + +## 简单实现 + +TBD diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst index 6d5367177da2af6276698f94f86664a5b506dca2..df5e172252277a881480cd2816eb901b711abe6b 100644 --- a/doc/faq/index_cn.rst +++ b/doc/faq/index_cn.rst @@ -286,3 +286,16 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字 .. code-block:: bash paddle train --use_gpu=true --trainer_count=2 --gpu_id=2 + + +12. 训练过程中出现 :code:`Floating point exception`, 训练因此退出怎么办?
+------------------------------------------------------------------------ + +Paddle二进制在运行时捕获了浮点数异常,只要出现浮点数异常(即训练过程中出现NaN或者Inf),立刻退出。浮点异常通常的原因是浮点数溢出、除零等问题。 +主要原因包括以下几个方面: + +* 训练过程中参数或者训练过程中的梯度尺度过大,在参数累加、乘除等运算时发生浮点数溢出。 +* 模型一直不收敛,发散到了一个数值特别大的地方。 +* 训练数据有问题,导致参数收敛到了一些奇异的情况。或者输入数据尺度过大,有些特征的取值达到数百万,这时进行矩阵乘法运算就可能导致浮点数溢出。 + +主要的解决办法是减小学习率或者对数据进行归一化处理。 diff --git a/doc/getstarted/basic_usage/index_cn.rst b/doc/getstarted/basic_usage/index_cn.rst index d01cdaaeb75ec7d02480eb9162cabaad2a947db9..428f58830e0b10c024f31238b7404c6df193eecd 100644 --- a/doc/getstarted/basic_usage/index_cn.rst +++ b/doc/getstarted/basic_usage/index_cn.rst @@ -55,7 +55,7 @@ PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍 # 线性计算网络层: ȳ = wx + b ȳ = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b')) # 计算误差函数,即 ȳ 和真实 y 之间的距离 - cost = regression_cost(input= ȳ, label=y) + cost = mse_cost(input= ȳ, label=y) outputs(cost) @@ -69,7 +69,7 @@ PaddlePaddle是源于百度的一个深度学习平台。这份简短的介绍 - **数据层**:数据层 `data_layer` 是神经网络的入口,它读入数据并将它们传输到接下来的网络层。这里数据层有两个,分别对应于变量 `x` 和 `y`。 - **全连接层**:全连接层 `fc_layer` 是基础的计算单元,这里利用它建模变量之间的线性关系。计算单元是神经网络的核心,PaddlePaddle支持大量的计算单元和任意深度的网络连接,从而可以拟合任意的函数来学习复杂的数据关系。 - - **回归误差代价层**:回归误差代价层 `regression_cost` 是众多误差代价函数层的一种,它们在训练过程作为网络的出口,用来计算模型的误差,是模型参数优化的目标函数。 + - **回归误差代价层**:回归误差代价层 `mse_cost` 是众多误差代价函数层的一种,它们在训练过程作为网络的出口,用来计算模型的误差,是模型参数优化的目标函数。 定义了网络结构并保存为 `trainer_config.py` 之后,运行以下训练命令: diff --git a/doc/getstarted/basic_usage/index_en.rst b/doc/getstarted/basic_usage/index_en.rst index c10b897d4292d0c2b062b5c8e23466505afa408a..6775da20c2f51000f305b095d40abd27b8fa6c0e 100644 --- a/doc/getstarted/basic_usage/index_en.rst +++ b/doc/getstarted/basic_usage/index_en.rst @@ -49,7 +49,7 @@ To recover this relationship between ``X`` and ``Y``, we use a neural network wi x = data_layer(name='x', size=1) y = data_layer(name='y', size=1) y_predict = fc_layer(input=x, param_attr=ParamAttr(name='w'), size=1, act=LinearActivation(), bias_attr=ParamAttr(name='b')) - cost = regression_cost(input=y_predict, label=y) + cost = mse_cost(input=y_predict, label=y) outputs(cost) Some of the most fundamental usages of PaddlePaddle are demonstrated: diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md index d9d54bff3096cb3520409971dbd1b2e179ac8be1..69f4501f370dcc9d603ec54a63d68568d66e832e 100644 --- a/doc/getstarted/build_and_install/build_from_source_en.md +++ b/doc/getstarted/build_and_install/build_from_source_en.md @@ -51,7 +51,7 @@ PaddlePaddle supports some build options.
WITH_TIMER Compile PaddlePaddle with stats timer WITH_PROFILER Compile PaddlePaddle with GPU profiler WITH_DOC Compile PaddlePaddle with documentation -ON_COVERALLS Compile PaddlePaddle with code coverage +WITH_COVERAGE Compile PaddlePaddle with code coverage COVERALLS_UPLOAD Package code coverage data to coveralls ON_TRAVIS Exclude special unit test on Travis CI diff --git a/doc/getstarted/build_and_install/docker_install_cn.rst b/doc/getstarted/build_and_install/docker_install_cn.rst index cf7dddd073ceb465781fcb235595788a294f5d96..af889ec9d1b4f43f8e4a266b21822f773ab62ec2 100644 --- a/doc/getstarted/build_and_install/docker_install_cn.rst +++ b/doc/getstarted/build_and_install/docker_install_cn.rst @@ -4,138 +4,137 @@ PaddlePaddle的Docker容器使用方式 PaddlePaddle目前唯一官方支持的运行的方式是Docker容器。因为Docker能在所有主要操作系统(包括Linux,Mac OS X和Windows)上运行。 请注意,您需要更改 `Dockers设置 `_ 才能充分利用Mac OS X和Windows上的硬件资源。 -通过Docker容器开发PaddlePaddle +纯CPU和GPU的docker镜像使用说明 ------------------------------ -开发人员可以在Docker中开发PaddlePaddle。这样开发人员可以以一致的方式在不同的平台上工作 - Linux,Mac OS X和Windows。 +对于每一个PaddlePaddle版本,我们都会发布两个Docker镜像:纯CPU的和GPU的。 +我们通过设置 `dockerhub.com `_ 自动生成最新的docker镜像: +`paddledev/paddle:0.10.0rc1-cpu` 和 `paddledev/paddle:0.10.0rc1-gpu`。 -1. 将开发环境构建为Docker镜像 - - .. code-block:: bash +以交互容器方式运行纯CPU的镜像: - git clone --recursive https://github.com/PaddlePaddle/Paddle - cd Paddle - docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile . +.. code-block:: bash + docker run -it --rm paddledev/paddle:0.10.0rc1-cpu /bin/bash - 请注意,默认情况下,:code:`docker build` 不会将源码导入到镜像中并编译它。如果我们想这样做,需要设置一个参数: +或者,可以以后台进程方式运行容器: - .. code-block:: bash +.. code-block:: bash - docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile --build-arg BUILD_AND_INSTALL=ON . + docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:0.10.0rc1-cpu +然后用密码 :code:`root` SSH进入容器: -2. 运行开发环境 +.. code-block:: bash - 当我们编译好了 :code:`paddle:dev`, 我们可以在docker容器里做开发,源代码可以通过挂载本地文件来被载入Docker的开发环境里面: - - .. code-block:: bash + ssh -p 2202 root@localhost - docker run -d -p 2202:22 -v $PWD:/paddle paddle:dev +SSH方式的一个优点是我们可以从多个终端进入容器。比如,一个终端运行vi,另一个终端运行Python。另一个好处是我们可以把PaddlePaddle容器运行在远程服务器上,并在笔记本上通过SSH与其连接。 - 以上代码会启动一个带有PaddlePaddle开发环境的docker容器,源代码会被挂载到 :code:`/paddle` 。 - 请注意, :code:`paddle:dev` 的默认入口是 :code:`sshd` 。以上的 :code:`docker run` 命令其实会启动一个在2202端口监听的SSHD服务器。这样,我们就能SSH进入我们的开发容器了: - - .. code-block:: bash +以上方法在GPU镜像里也能用-只是请不要忘记安装CUDA驱动,以及告诉Docker: - ssh root@localhost -p 2202 +.. code-block:: bash -3. 在Docker开发环境中编译与安装PaddlPaddle代码 + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0rc1-gpu - 当在容器里面的时候,可以用脚本 :code:`paddle/scripts/docker/build.sh` 来编译、安装与测试PaddlePaddle: - - .. code-block:: bash - - /paddle/paddle/scripts/docker/build.sh - 以上指令会在 :code:`/paddle/build` 中编译PaddlePaddle。通过以下指令可以运行单元测试: - - .. code-block:: bash +运行PaddlePaddle书籍 --------------------- - cd /paddle/build - ctest Jupyter Notebook是一个开源的web程序,大家可以通过它制作和分享带有代码、公式、图表、文字的交互式文档。用户可以通过网页浏览文档。 -4. 在Docker容器中运行PaddlePaddle书籍 +PaddlePaddle书籍是为用户和开发者制作的一个交互式的Jupyter Notebook。 +如果您想要更深入了解deep learning,PaddlePaddle书籍一定是您最好的选择。 - Jupyter Notebook是一个开源的web程序,大家可以通过它制作和分享带有代码、公式、图表、文字的交互式文档。用户可以通过网页浏览文档。 +当您进入容器内之后,只需运行以下命令: - PaddlePaddle书籍是为用户和开发者制作的一个交互式的Jupyter Nodebook。 - 如果您想要更深入了解deep learning,PaddlePaddle书籍一定是您最好的选择。 - - 当您进入容器内之后,只用运行以下命令: +.. code-block:: bash + + jupyter notebook - ..
code-block:: bash - - jupyter notebook +然后在浏览器中输入以下网址: + +.. code-block:: text - 然后在浏览器中输入以下网址: - - .. code-block:: text http://localhost:8888/ +就这么简单,享受您的旅程! - 就这么简单,享受您的旅程! -纯CPU和GPU的docker镜像 ---------------------- +非AVX镜像 +--------- -对于每一个PaddlePaddle版本,我们都会发布两个Docker镜像:纯CPU的和GPU的。我们通过设置 `dockerhub.com `_ 自动运行以下两个命令: +纯CPU镜像以及GPU镜像都会用到AVX指令集,但是2008年之前生产的旧电脑不支持AVX。以下指令能检查Linux电脑是否支持AVX: .. code-block:: bash - docker build -t paddle:cpu -f paddle/scripts/docker/Dockerfile --build-arg BUILD_AND_INSTALL=ON . - docker build -t paddle:gpu -f paddle/scripts/docker/Dockerfile.gpu --build-arg BUILD_AND_INSTALL=ON . + if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi -以交互容器方式运行纯CPU的镜像: +如果输出是No,我们就需要手动编译一个非AVX版本的镜像: .. code-block:: bash - docker run -it --rm paddledev/paddle:0.10.0rc1-cpu /bin/bash + cd ~ + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile . + docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu . -或者,可以以后台进程方式运行容器: -.. code-block:: bash +通过Docker容器开发PaddlePaddle +------------------------------ - docker run -d -p 2202:22 paddledev/paddle:0.10.0rc1-cpu +开发人员可以在Docker中开发PaddlePaddle。这样开发人员可以以一致的方式在不同的平台上工作 - Linux,Mac OS X和Windows。 -然后用密码 :code:`root` SSH进入容器: +1. 将开发环境构建为Docker镜像 + + .. code-block:: bash -.. code-block:: bash + git clone --recursive https://github.com/PaddlePaddle/Paddle + cd Paddle + docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile . - ssh -p 2202 root@localhost -SSH方式的一个优点是我们可以从多个终端进入容器。比如,一个终端运行vi,另一个终端运行Python。另一个好处是我们可以把PaddlePaddle容器运行在远程服务器上,并在笔记本上通过SSH与其连接。 + 请注意,默认情况下,:code:`docker build` 不会将源码导入到镜像中并编译它。如果我们想这样做,需要设置一个参数: + .. code-block:: bash -以上方法在GPU镜像里也能用-只是请不要忘记按装CUDA驱动,以及告诉Docker: + docker build -t paddle:dev -f paddle/scripts/docker/Dockerfile --build-arg BUILD_AND_INSTALL=ON . -.. code-block:: bash - export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" - export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') - docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0rc1-gpu +2. 运行开发环境 + 当我们编译好了 :code:`paddle:dev`, 我们可以在docker容器里做开发,源代码可以通过挂载本地文件来被载入Docker的开发环境里面: + + .. code-block:: bash -非AVX镜像 --------- + docker run -d -p 2202:22 -v $PWD:/paddle paddle:dev -纯CPU镜像以及GPU镜像都会用到AVX指令集,但是2008年之前生产的旧电脑不支持AVX。以下指令能检查Linux电脑是否支持AVX: + 以上代码会启动一个带有PaddlePaddle开发环境的docker容器,源代码会被挂载到 :code:`/paddle` 。 + 请注意, :code:`paddle:dev` 的默认入口是 :code:`sshd` 。以上的 :code:`docker run` 命令其实会启动一个在2202端口监听的SSHD服务器。这样,我们就能SSH进入我们的开发容器了: + + .. code-block:: bash -.. code-block:: bash + ssh root@localhost -p 2202 - if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi +3. 在Docker开发环境中编译与安装PaddlePaddle代码 -如果输出是No,我们就需要手动编译一个非AVX版本的镜像: + 当在容器里面的时候,可以用脚本 :code:`paddle/scripts/docker/build.sh` 来编译、安装与测试PaddlePaddle: + .. code-block:: bash + + /paddle/paddle/scripts/docker/build.sh -.. code-block:: bash + 以上指令会在 :code:`/paddle/build` 中编译PaddlePaddle。通过以下指令可以运行单元测试: + .. code-block:: bash - cd ~ - git clone https://github.com/PaddlePaddle/Paddle.git - cd Paddle - docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile . - docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .
+ cd /paddle/build + ctest 文档 diff --git a/doc/getstarted/build_and_install/docker_install_en.rst b/doc/getstarted/build_and_install/docker_install_en.rst index a4f62b283563adffc725f20efae306240f30bf9d..606746597acc0da00588b7eb05935f6c05c169f2 100644 --- a/doc/getstarted/build_and_install/docker_install_en.rst +++ b/doc/getstarted/build_and_install/docker_install_en.rst @@ -9,6 +9,100 @@ Please be aware that you will need to change `Dockers settings of your hardware resource on Mac OS X and Windows. + +Usage of CPU-only and GPU Images +---------------------------------- + +For each version of PaddlePaddle, we release 2 Docker images, a +CPU-only one and a CUDA GPU one. We do so by configuring +`dockerhub.com `_ to +automatically generate the latest docker images `paddledev/paddle:0.10.0rc1-cpu` +and `paddledev/paddle:0.10.0rc1-gpu`. + +To run the CPU-only image as an interactive container: + +.. code-block:: bash + + docker run -it --rm paddledev/paddle:0.10.0rc1-cpu /bin/bash + +or, we can run it as a daemon container: + +.. code-block:: bash + + docker run -d -p 2202:22 -p 8888:8888 paddledev/paddle:0.10.0rc1-cpu + +and SSH to this container using password :code:`root`: + +.. code-block:: bash + + ssh -p 2202 root@localhost + +An advantage of using SSH is that we can connect to PaddlePaddle from +more than one terminal. For example, one terminal running vi and +another one running Python interpreter. Another advantage is that we +can run the PaddlePaddle container on a remote server and SSH to it +from a laptop. + +Above methods work with the GPU image too -- just please don't forget +to install the CUDA driver and let Docker know about it: + +.. code-block:: bash + + export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" + export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') + docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0rc1-gpu + + +PaddlePaddle Book +------------------ + +The Jupyter Notebook is an open-source web application that allows +you to create and share documents that contain live code, equations, +visualizations and explanatory text in a single browser. + +PaddlePaddle Book is an interactive Jupyter Notebook for users and developers. +We already exposed port 8888 for this book. If you want to +dig deeper into deep learning, PaddlePaddle Book definitely is your best choice. + +Once you are inside the container, simply issue the command: + +.. code-block:: bash + + jupyter notebook + +Then, you can copy and paste the address into the local browser: + +.. code-block:: text + + http://localhost:8888/ + +That's all. Enjoy your journey! + + +Non-AVX Images +-------------- + +Please be aware that the CPU-only and the GPU images both use the AVX +instruction set, but old computers produced before 2008 do not support +AVX. The following command checks if your Linux computer supports +AVX: + +.. code-block:: bash + + if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi + + +If it doesn't, we will need to build non-AVX images manually from +source code: + +.. code-block:: bash + + cd ~ + git clone https://github.com/PaddlePaddle/Paddle.git + cd Paddle + docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile . + docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu . + + Development Using Docker ------------------------ @@ -82,103 +176,6 @@ Windows -- in a consistent way.
cd /paddle/build ctest -4. Run PaddlePaddle Book under Docker Container - - The Jupyter Notebook is an open-source web application that allows - you to create and share documents that contain live code, equations, - visualizations and explanatory text in a single browser. - - PaddlePaddle Book is an interactive Jupyter Notebook for users and developers. - We already exposed port 8888 for this book. If you want to - dig deeper into deep learning, PaddlePaddle Book definitely is your best choice. - - Once you are inside the container, simply issue the command: - - .. code-block:: bash - - jupyter notebook - - Then, you would back and paste the address into the local browser: - - .. code-block:: text - - http://localhost:8888/ - - That's all. Enjoy your journey! - -CPU-only and GPU Images ------------------------ - -For each version of PaddlePaddle, we release 2 Docker images, a -CPU-only one and a CUDA GPU one. We do so by configuring -`dockerhub.com `_ -automatically runs the following commands: - -.. code-block:: bash - - docker build -t paddle:cpu -f paddle/scripts/docker/Dockerfile --build-arg BUILD_AND_INSTALL=ON . - docker build -t paddle:gpu -f paddle/scripts/docker/Dockerfile.gpu --build-arg BUILD_AND_INSTALL=ON . - - -To run the CPU-only image as an interactive container: - -.. code-block:: bash - - docker run -it --rm paddledev/paddle:0.10.0rc1-cpu /bin/bash - -or, we can run it as a daemon container - -.. code-block:: bash - - docker run -d -p 2202:22 paddledev/paddle:0.10.0rc1-cpu - -and SSH to this container using password :code:`root`: - -.. code-block:: bash - - ssh -p 2202 root@localhost - -An advantage of using SSH is that we can connect to PaddlePaddle from -more than one terminals. For example, one terminal running vi and -another one running Python interpreter. Another advantage is that we -can run the PaddlePaddle container on a remote server and SSH to it -from a laptop. - - -Above methods work with the GPU image too -- just please don't forget -to install CUDA driver and let Docker knows about it: - -.. code-block:: bash - - export CUDA_SO="$(\ls /usr/lib64/libcuda* | xargs -I{} echo '-v {}:{}') $(\ls /usr/lib64/libnvidia* | xargs -I{} echo '-v {}:{}')" - export DEVICES=$(\ls /dev/nvidia* | xargs -I{} echo '--device {}:{}') - docker run ${CUDA_SO} ${DEVICES} -it paddledev/paddle:0.10.0rc1-gpu - - -Non-AVX Images --------------- - -Please be aware that the CPU-only and the GPU images both use the AVX -instruction set, but old computers produced before 2008 do not support -AVX. The following command checks if your Linux computer supports -AVX: - -.. code-block:: bash - - if cat /proc/cpuinfo | grep -i avx; then echo Yes; else echo No; fi - - -If it doesn't, we will need to build non-AVX images manually from -source code: - -.. code-block:: bash - - cd ~ - git clone https://github.com/PaddlePaddle/Paddle.git - cd Paddle - docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile . - docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu . 
- Documentation ------------- diff --git a/doc/howto/usage/cmd_parameter/arguments_cn.md b/doc/howto/usage/cmd_parameter/arguments_cn.md index 2e2a2fcc54a09f4f41e4ebbc317e1409591ddd9c..f7aa525054468670f59309ddf9206af55bb77869 100644 --- a/doc/howto/usage/cmd_parameter/arguments_cn.md +++ b/doc/howto/usage/cmd_parameter/arguments_cn.md @@ -228,16 +228,6 @@ √√ - -度量学习(metric learning)external -√√√√ - - - -data_server_port -√√ - - 参数服务器(PServer)start_pserver √√ diff --git a/doc/howto/usage/cmd_parameter/arguments_en.md b/doc/howto/usage/cmd_parameter/arguments_en.md index e5546f0ddc78a9f8bdc306a19c2fe9a415463e5a..d1963067bda949b11ececefed3db7db1432c6223 100644 --- a/doc/howto/usage/cmd_parameter/arguments_en.md +++ b/doc/howto/usage/cmd_parameter/arguments_en.md @@ -228,16 +228,6 @@ It looks like there are a lot of arguments. However, most of them are for develo √√ - -metric learningexternal -√√√√ - - - -data_server_port -√√ - - PServerstart_pserver √√ diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_cn.md b/doc/howto/usage/cmd_parameter/detail_introduction_cn.md index 3b573a324d541b024600a254d5266e517db229c5..b4625ba68cf23e5697554ba94efaf0b873f2c1de 100644 --- a/doc/howto/usage/cmd_parameter/detail_introduction_cn.md +++ b/doc/howto/usage/cmd_parameter/detail_introduction_cn.md @@ -180,15 +180,6 @@  - 用户可以自定义beam search的方法,编译成动态库,供PaddlePaddle加载。 该参数用于指定动态库路径. - 类型: string (默认: "", null). -## 度量学习(Metric Learning) -* `--external` - - 指示是否使用外部机器进行度量学习. - - 类型: bool (默认: 0). - -* `--data_server_port` - - 数据服务器(data server)的监听端口,主要用在度量学习中. - - 类型: int32 (默认: 21134). - ## 数据支持(DataProvider) * `--memory_threshold_on_load_data` diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_en.md b/doc/howto/usage/cmd_parameter/detail_introduction_en.md index 33b7ec0d51a96ee126197e7aa819fdae0d3dc353..b681ebc81a355dfc1a7638a4463dff6979929a45 100644 --- a/doc/howto/usage/cmd_parameter/detail_introduction_en.md +++ b/doc/howto/usage/cmd_parameter/detail_introduction_en.md @@ -184,15 +184,6 @@ - Specify shared dynamic library. It can be defined out of paddle by user. - type: string (default: "", null). -## Metric Learning -* `--external` - - Whether to use external machine for metric learning. - - type: bool (default: 0). - -* `--data_server_port` - - Listening port for dserver (data server), dserver is mainly used in metric learning. - - type: int32 (default: 21134). - ## DataProvider * `--memory_threshold_on_load_data` diff --git a/doc/howto/usage/k8s/k8s_distributed_cn.md b/doc/howto/usage/k8s/k8s_distributed_cn.md index 2a7a6c8c17882a6f2c95e933e051c4b8f1a8eeee..3121b3f59df650c0a22d0bd305a6f793b202d30e 100644 --- a/doc/howto/usage/k8s/k8s_distributed_cn.md +++ b/doc/howto/usage/k8s/k8s_distributed_cn.md @@ -213,7 +213,7 @@ I1116 09:10:17.123440 50 Util.cpp:130] Calling runInitFunctions I1116 09:10:17.123764 50 Util.cpp:143] Call runInitFunctions done. [WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config. 
[INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating] -[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__regression_cost_0__] +[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__mse_cost_0__] I1116 09:10:17.392917 50 Trainer.cpp:170] trainer mode: Normal I1116 09:10:17.613910 50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process I1116 09:10:17.680917 50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt index 6e8fcd114df580a00858d95f0af0d1ec0bd9b4a2..3760c6727c21cfb32ca4d2efc30351352c9b182b 100644 --- a/paddle/api/CMakeLists.txt +++ b/paddle/api/CMakeLists.txt @@ -1,21 +1,3 @@ -FUNCTION(generate_python_api target_name) - ADD_CUSTOM_COMMAND(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py - ${PROJ_ROOT}/paddle/Paddle_wrap.cxx - ${PROJ_ROOT}/paddle/Paddle_wrap.h - COMMAND ${SWIG_EXECUTABLE} -python -c++ -outcurrentdir -I../ api/Paddle.swig - && mv ${PROJ_ROOT}/paddle/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py - DEPENDS ${PROJ_ROOT}/paddle/api/Paddle.swig - ${PROJ_ROOT}/paddle/api/PaddleAPI.h - ${external_project_dependencies} - WORKING_DIRECTORY ${PROJ_ROOT}/paddle - COMMENT "Generate Python API from swig") - ADD_CUSTOM_TARGET(${target_name} ALL DEPENDS - ${PROJ_ROOT}/paddle/Paddle_wrap.cxx - ${PROJ_ROOT}/paddle/Paddle_wrap.h - ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py - ${external_project_dependencies}) -ENDFUNCTION(generate_python_api) - set(API_SOURCES Arguments.cpp ConfigParser.cpp @@ -33,65 +15,86 @@ set(API_HEADER PaddleAPI.h Internal.h) -add_library(paddle_api STATIC - ${API_SOURCES}) +add_library(paddle_api STATIC ${API_SOURCES}) add_dependencies(paddle_api gen_proto_cpp) -list(LENGTH "${GFLAGS_LIBRARIES}" GFLAGS_LIBRARIES_LENGTH) +INCLUDE(${SWIG_USE_FILE}) +INCLUDE_DIRECTORIES(${PROJ_ROOT}/paddle) -if(${GFLAGS_LIBRARIES_LENGTH} EQUAL 0 AND TARGET "${GFLAGS_LIBRARIES}") -# Because gflags compiled by cmake, so it is imported by cmake target, -# not a real library path. Get the real library path here. 
-message(STATUS "GFLAGS Libraries is ${GFLAGS_LIBRARIES}") -get_target_property(GFLAGS_LOCATION ${GFLAGS_LIBRARIES} LOCATION) -message(STATUS "GFLAGS Target location is ${GFLAGS_LOCATION}") -else() -set(GFLAGS_LOCATION ${GFLAGS_LIBRARIES}) -endif() +FILE(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py) + +SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON) + +SET(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR}) +SET(CMAKE_CXX_FLAGS "-std=c++11 -fPIC -Wall") +IF(WITH_COVERAGE) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage") +ENDIF(WITH_COVERAGE) -configure_file( - paddle_api_config.py.in - ${PROJ_ROOT}/paddle/api/paddle_api_config.py +SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS + paddle_parameter + paddle_function + paddle_math + paddle_utils + paddle_gserver + paddle_pserver + paddle_api + paddle_cuda + paddle_trainer_lib + paddle_network + paddle_proto + ${external_project_dependencies} ) -generate_python_api(python_swig_sources) +IF(APPLE) + SET(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load") +ELSE(APPLE) + SET(START_GROUP "-Xlinker -start-group") + SET(END_GROUP "-Xlinker -end-group") + SET(ARCHIVE_START "-Wl,--whole-archive") + SET(ARCHIVE_END "-Wl,--no-whole-archive") +ENDIF(APPLE) -file(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py) +SWIG_ADD_MODULE(swig_paddle python Paddle.i) +SWIG_LINK_LIBRARIES(swig_paddle + ${MACOS_LD_FLAGS} + ${START_GROUP} + ${ARCHIVE_START} + paddle_gserver + paddle_function + ${METRIC_LIBS} + ${ARCHIVE_END} + paddle_pserver + paddle_trainer_lib + paddle_network + paddle_parameter + paddle_math + paddle_utils + paddle_proto + paddle_cuda + paddle_api + ${CMAKE_DL_LIBS} + ${EXTERNAL_LIBS} + ${CMAKE_THREAD_LIBS_INIT} + ${RDMA_LD_FLAGS} + ${RDMA_LIBS} + ${END_GROUP} +) -# TODO(yuyang18) : make wheel name calculated by cmake -add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/dist/.timestamp +add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so + COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle + COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PROJ_ROOT}/paddle/py_paddle COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND ${CMAKE_COMMAND} -E touch dist/.timestamp COMMAND rm -rf py_paddle.egg-info build WORKING_DIRECTORY ${PROJ_ROOT}/paddle - DEPENDS python_swig_sources - paddle_parameter - paddle_function - paddle_math - paddle_utils - paddle_gserver - paddle_pserver - paddle_trainer - paddle_api - paddle_cuda - ${PY_PADDLE_PYTHON_FILES} + DEPENDS _swig_paddle ) -install(DIRECTORY ${PROJ_ROOT}/paddle/dist/ - DESTINATION opt/paddle/share/wheels -) +# TODO(yuyang18) : make wheel name calculated by cmake +add_custom_target(python_api_wheel ALL DEPENDS ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so) -add_custom_target(python_api_wheel ALL DEPENDS - ${PROJ_ROOT}/paddle/dist/.timestamp) -add_dependencies(python_api_wheel python_swig_sources - paddle_parameter - paddle_math - paddle_utils - paddle_gserver - paddle_pserver - paddle_trainer - paddle_api - paddle_cuda) +install(DIRECTORY ${PROJ_ROOT}/paddle/dist/ DESTINATION opt/paddle/share/wheels) if(WITH_TESTING) IF(NOT PY_PIP_FOUND) diff --git a/paddle/api/Paddle.swig b/paddle/api/Paddle.i similarity index 100% rename from paddle/api/Paddle.swig rename to paddle/api/Paddle.i diff --git a/paddle/api/paddle_api_config.py.in b/paddle/api/paddle_api_config.py.in deleted file mode 100644 index
82f45ba6ccec49eb190d1814a67a575f311689e8..0000000000000000000000000000000000000000 --- a/paddle/api/paddle_api_config.py.in +++ /dev/null @@ -1,17 +0,0 @@ -PADDLE_BUILD_DIR="@CMAKE_CURRENT_BINARY_DIR@/../" -WITH_GPU="@WITH_GPU@" -PROTOBUF_LIBRARY="@PROTOBUF_LIBRARY@" -ZLIB_LIBRARIES="@ZLIB_LIBRARIES@" -CMAKE_THREAD_LIB="@CMAKE_THREAD_LIBS_INIT@" -CMAKE_DL_LIBS="@CMAKE_DL_LIBS@" - - -WITH_PYTHON="@WITH_PYTHON@" -PYTHON_LIBRARIES="@PYTHON_LIBRARIES@" -GLOG_LIBRARIES="@GLOG_LIBRARIES@" -GFLAGS_LIBRARIES="@GFLAGS_LIBRARIES@" -GFLAGS_LOCATION="@GFLAGS_LOCATION@" -CBLAS_LIBRARIES="@CBLAS_LIBRARIES@" - -CUDA_LIBRARIES="@CUDA_CUDART_LIBRARY@" -WITH_COVERALLS="@ON_COVERALLS@" diff --git a/paddle/api/paddle_ld_flags.py b/paddle/api/paddle_ld_flags.py deleted file mode 100644 index ad5dce209bf8e14120320a58c3cd85d6f6a97688..0000000000000000000000000000000000000000 --- a/paddle/api/paddle_ld_flags.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -try: - from paddle_api_config import * - import os.path - import platform - - system = platform.system().lower() - is_osx = (system == 'darwin') - is_win = (system == 'windows') - is_lin = (system == 'linux') - - if is_lin: - whole_start = "-Wl,--whole-archive" - whole_end = "-Wl,--no-whole-archive" - elif is_osx: - whole_start = "" - whole_end = "" - - LIB_DIRS = [ - "math", 'function', 'utils', 'parameter', "gserver", "api", "cuda", - "pserver", "trainer" - ] - PARENT_LIB_DIRS = ['proto'] - - class PaddleLDFlag(object): - def __init__(self): - self.paddle_build_dir = PADDLE_BUILD_DIR - self.paddle_build_dir = os.path.abspath(self.paddle_build_dir) - self.with_gpu = PaddleLDFlag.cmake_bool(WITH_GPU) - self.protolib = PROTOBUF_LIBRARY - self.zlib = ZLIB_LIBRARIES - self.thread = CMAKE_THREAD_LIB - self.dl_libs = CMAKE_DL_LIBS - self.with_python = PaddleLDFlag.cmake_bool(WITH_PYTHON) - self.python_libs = PYTHON_LIBRARIES - - self.glog_libs = GLOG_LIBRARIES - - self.with_coverage = PaddleLDFlag.cmake_bool(WITH_COVERALLS) - self.gflags_libs = GFLAGS_LIBRARIES - self.gflags_location = GFLAGS_LOCATION - self.cblas_libs = CBLAS_LIBRARIES - self.curt = CUDA_LIBRARIES - - def ldflag_str(self): - return " ".join( - [self.libs_dir_str(), self.parent_dir_str(), self.libs_str()]) - - def libs_dir_str(self): - libdirs = LIB_DIRS - return " ".join( - map(lambda x: "-L" + os.path.join(self.paddle_build_dir, x), - libdirs)) - - def parent_dir_str(self): - libdirs = PARENT_LIB_DIRS - return " ".join( - map(lambda x: "-L" + os.path.join(self.paddle_build_dir, '..', x), - libdirs)) - - def libs_str(self): - libs = [ - whole_start, - "-lpaddle_gserver", - "-lpaddle_function", - whole_end, - "-lpaddle_pserver", - "-lpaddle_trainer_lib", - "-lpaddle_network", - '-lpaddle_parameter', - "-lpaddle_math", - '-lpaddle_utils', - "-lpaddle_proto", - "-lpaddle_cuda", - "-lpaddle_api", - self.normalize_flag(self.protolib), - self.normalize_flag(self.glog_libs), - 
self.normalize_flag(self.gflags_libs), - self.normalize_flag(self.zlib), - self.normalize_flag(self.thread), - self.normalize_flag(self.dl_libs), - self.normalize_flag(self.cblas_libs), - ] - - if self.with_python: - libs.append(self.normalize_flag(self.python_libs)) - if self.with_gpu: - libs.append(self.normalize_flag(self.curt)) - if self.with_coverage: - libs.append("-fprofile-arcs") - return " ".join(filter(lambda l: len(l) != 0, libs)) - - def normalize_flag(self, cmake_flag): - """ - CMake flag string to ld flag - :type cmake_flag: str - """ - if ";" in cmake_flag: - return " ".join(map(self.normalize_flag, cmake_flag.split(";"))) - if cmake_flag.startswith("/"): # is a path - return cmake_flag - elif cmake_flag.startswith("-l"): # normal link command - return cmake_flag - elif cmake_flag in [ - "gflags-shared", "gflags-static", "gflags_nothreads-shared", - "gflags_nothreads-static" - ]: # special for gflags - assert PaddleLDFlag.cmake_bool(self.gflags_location) - return self.gflags_location - elif len(cmake_flag) != 0: - return "".join(["-l", cmake_flag]) - else: - return "" - - @staticmethod - def cmake_bool(cmake_str): - """ - CMake bool string to bool - :param cmake_str: cmake boolean string - :type cmake_str: str - :rtype: bool - """ - if cmake_str in ["FALSE", "OFF", "NO"] or cmake_str.endswith( - "-NOTFOUND"): - return False - else: - return True - - def c_flag(self): - if self.with_coverage: - return [ - "-fprofile-arcs", "-ftest-coverage", "-O0", "-g", - "-std=c++11" - ] - else: - return ["-std=c++11"] -except ImportError: - - class PaddleLDFlag(object): - def ldflag_str(self): - pass - - def c_flag(self): - pass diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt index 4f92150ec84d637c5b75cba09d7e98501a5a5f5d..93a6a99848aa13bb36c9c5c7091fbaa891fc9823 100644 --- a/paddle/gserver/CMakeLists.txt +++ b/paddle/gserver/CMakeLists.txt @@ -25,12 +25,16 @@ filter_test(GSERVER_HEADER) filter_test(GSERVER_SOURCES) if(NOT WITH_GPU) list(REMOVE_ITEM GSERVER_HEADER + layers/CudnnConvBaseLayer.h layers/CudnnConvLayer.h + layers/CudnnConvTransLayer.h layers/CudnnPoolLayer.h layers/CudnnBatchNormLayer.h) list(REMOVE_ITEM GSERVER_SOURCES + layers/CudnnConvBaseLayer.cpp layers/CudnnConvLayer.cpp + layers/CudnnConvTransLayer.cpp layers/CudnnPoolLayer.cpp layers/CudnnBatchNormLayer.cpp) compile_cu_as_cpp(layers/LstmCompute.cu) diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h index 9a2ad7567f0dc93d0a8e396fd88b2488afe9d049..40036762179ebb1495b90907f16b97e3c60c50d8 100644 --- a/paddle/gserver/dataproviders/DataProvider.h +++ b/paddle/gserver/dataproviders/DataProvider.h @@ -164,15 +164,6 @@ public: argu.value = value; data_.push_back(argu); } - /** - * @brief Append user defined data - * @param[in] ptr user defined data - */ - void appendUserDefinedPtr(UserDefinedVectorPtr ptr) { - Argument argu; - argu.udp = ptr; - data_.push_back(argu); - } /* * @brief Append argument diff --git a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp index 4654d0206413ec198da62af12e294cd5b442e735..6ae60102b3e431727c0954e8b8073bfe0534f8ee 100644 --- a/paddle/gserver/gradientmachines/MultiGradientMachine.cpp +++ b/paddle/gserver/gradientmachines/MultiGradientMachine.cpp @@ -24,9 +24,6 @@ limitations under the License. 
*/ DEFINE_bool(allow_only_one_model_on_one_gpu, true, "If true, do not allow multiple models on one GPU device"); -#ifdef PADDLE_METRIC_LEARNING -DECLARE_bool(external); -#endif namespace paddle { @@ -45,11 +42,7 @@ MultiGradientMachine::MultiGradientMachine(const ModelConfig& config, trainerBarrier_(FLAGS_trainer_count), allBarrier_(FLAGS_trainer_count + 1), inArgsCopied_(false) { -#ifdef PADDLE_METRIC_LEARNING - isPassGrad_ = FLAGS_external; -#else isPassGrad_ = false; -#endif numThreads_ = FLAGS_trainer_count; if (useGpu) { //! TODO(yuyang18): When useGpu=false && paddle is not compiled with gpu, diff --git a/paddle/gserver/layers/ConvBaseOperator.cpp b/paddle/gserver/layers/ConvBaseOperator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5c231986292d2cd26ee30ccc122142fccd5b4949 --- /dev/null +++ b/paddle/gserver/layers/ConvBaseOperator.cpp @@ -0,0 +1,160 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ConvBaseOperator.h" +#include "paddle/math/MathUtils.h" +#include "paddle/math/Matrix.h" + +namespace paddle { + +/** + * @brief ConvBaseOperator takes two inputs to perform the convolution. + * The first input is the image, and the second input is the convolution kernel. + * The height of data for the two inputs is the same. Each data of the first input + * is convolved with each data of the second input independently. + * + * The config file api is conv_operator.
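+ *
+ * For illustration only, a sketch of how a model config might reach this
+ * operator through the v1 trainer_config_helpers API (the layer names and
+ * sizes here are hypothetical, not part of this change):
+ *
+ * @code{.py}
+ * op = conv_operator(img=image, filter=kernel,
+ *                    filter_size=3, num_filters=64, num_channels=8)
+ * out = mixed_layer(input=[op], size=64)
+ * @endcode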
+ */ + +ConvBaseOperator::ConvBaseOperator(const OperatorConfig &config, bool useGpu) + : Operator(config, useGpu) { + CHECK(useGpu); + CHECK_EQ(config_.input_indices_size(), 2L); + + caffeMode_ = true; + getConvParams(); + computeConvSizes(); + + // initialize all to default algorithms + fwdAlgo_ = 0; + bwdFilterAlgo_ = 0; + bwdDataAlgo_ = 0; + fwdLimitBytes_ = 0; + bwdDataLimitBytes_ = 0; + bwdFilterLimitBytes_ = 0; + workSpaceInBytes_ = 0; + workSpace_ = nullptr; + + isSelectAlgo_ = false; +} + +void ConvBaseOperator::allocConvWorkSpace() { + hl_conv_workspace(imageDesc_, + outputDesc_, + filterDesc_, + convDesc_, + &fwdAlgo_, + &fwdLimitBytes_, + &bwdDataAlgo_, + &bwdDataLimitBytes_, + &bwdFilterAlgo_, + &bwdFilterLimitBytes_); + + size_t maxWorkSpace = 0; + maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); + maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_); + + if (maxWorkSpace > workSpaceInBytes_) { + if (workSpaceInBytes_ != 0) { + hl_free_mem_device(workSpace_); + } + // total amount of storage needed + workSpace_ = hl_malloc_device(maxWorkSpace); + workSpaceInBytes_ = maxWorkSpace; + } +} + +void ConvBaseOperator::computeConvSizes() { + hl_create_filter_descriptor( + &filterDesc_, channels_, numFilters_, filterSizeY_, filterSize_); + hl_create_tensor_descriptor(&imageDesc_); + hl_create_tensor_descriptor(&outputDesc_); + hl_create_convolution_descriptor(&convDesc_, + imageDesc_, + filterDesc_, + paddingY_, + padding_, + strideY_, + stride_); +} + +void ConvBaseOperator::reshapeImageDescriptors() { + hl_tensor_reshape(imageDesc_, + 1, + channels_, + imageH_, + imageW_, + channels_ * imageH_ * imageW_, + imageH_ * imageW_, + imageW_, + 1); + hl_tensor_reshape(outputDesc_, + 1, + numFilters_, + outputH_, + outputW_, + numFilters_ * outputH_ * outputW_, + outputH_ * outputW_, + outputW_, + 1); + hl_reset_convolution_descriptor(convDesc_, + imageDesc_, + filterDesc_, + paddingY_, + padding_, + strideY_, + stride_); +} + +void ConvBaseOperator::getConvParams() { + configNumFilters_ = config_.num_filters(); + const ConvConfig &conf = config_.conv_conf(); + padding_ = conf.padding(); + stride_ = conf.stride(); + filterSize_ = conf.filter_size(); + paddingY_ = conf.padding_y(); + strideY_ = conf.stride_y(); + filterSizeY_ = conf.filter_size_y(); + filterPixels_ = filterSize_ * filterSizeY_; + configChannels_ = conf.channels(); + imgSize_ = conf.img_size(); + imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); + imgPixels_ = imgSize_ * imgSizeY_; + CHECK_EQ(conf.groups(), 1U); + filterChannels_ = conf.filter_channels(); + outputX_ = conf.output_x(); + outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); + outputs_ = outputX_ * outputX_; + + isDeconv_ = (config_.type() == "conv") ? false : true; + if (isDeconv_) { + channels_ = configNumFilters_; + numFilters_ = configChannels_; + } else { + channels_ = configChannels_; + numFilters_ = configNumFilters_; + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/ConvBaseOperator.h b/paddle/gserver/layers/ConvBaseOperator.h new file mode 100644 index 0000000000000000000000000000000000000000..2d42169cde2a80a26edcf98bc2d728e00b075728 --- /dev/null +++ b/paddle/gserver/layers/ConvBaseOperator.h @@ -0,0 +1,112 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include "Operator.h" +#include "paddle/math/MathUtils.h" +#include "paddle/math/Matrix.h" + +namespace paddle { + +/** + * @brief ConvBaseOperator takes two inputs to perform the convolution. + * The first input is the image, and the second input is the convolution kernel. + * The heights of the data of the two inputs are the same. Each sample of the + * first input is convolved with the corresponding sample of the second input + * independently. + * + * The config file api is conv_operator. + */ + +class ConvBaseOperator : public Operator { +public: + ConvBaseOperator(const OperatorConfig &config, bool useGpu); + /** + * Free workspace in device and destroy cudnn tensor descriptor. + */ + virtual ~ConvBaseOperator() { + if (workSpaceInBytes_ != 0) { + hl_free_mem_device(workSpace_); + workSpaceInBytes_ = 0; + } + + hl_destroy_tensor_descriptor(imageDesc_); + hl_destroy_tensor_descriptor(outputDesc_); + hl_destroy_filter_descriptor(filterDesc_); + hl_destroy_convolution_descriptor(convDesc_); + } + +protected: + /** + * Get convolution parameters from layer config and + * initialize member variables. + */ + void getConvParams(); + + /** + * Allocate GPU memory for the cudnn convolution algorithms. + */ + void allocConvWorkSpace(); + + /** + * Create cudnn tensor descriptor for convolution operation. + */ + void computeConvSizes(); + + /** + * Reshape cudnn tensor descriptor. + */ + void reshapeImageDescriptors(); + + /** + * Reshape cudnn tensor descriptor. + */ + virtual void reshape(int batchSize) = 0; + + /** + * Check that the filter size equals the size calculated from the + * layer config parameters. + */ + void checkFilterSize(const MatrixPtr &filter) { + CHECK_EQ(static_cast<int>(filter->getWidth()), + filterSize_ * filterSizeY_ * channels_ * numFilters_); + } + + /// Most member variables are the same as in CudnnConvLayer, + /// so they are not explained here. + bool isDeconv_; + int imageH_, imageW_, outputH_, outputW_; + hl_tensor_descriptor imageDesc_; + hl_tensor_descriptor outputDesc_; + hl_filter_descriptor filterDesc_; + hl_convolution_descriptor convDesc_; + bool caffeMode_; + int inputOffset_, outputOffset_, weightOffset_; + int numFilters_, channels_; + + /// from parsing config + int configNumFilters_, configChannels_; + int padding_, stride_, filterSize_, imgSize_, imgSizeY_; + int paddingY_, strideY_, filterSizeY_; + int imgPixels_, filterPixels_, filterChannels_, outputX_, outputY_, outputs_; + + /// The following member variables are the same as in CudnnConvLayer, + /// so they are not explained here. + int fwdAlgo_, bwdFilterAlgo_, bwdDataAlgo_; + size_t fwdLimitBytes_, bwdDataLimitBytes_, bwdFilterLimitBytes_; + size_t workSpaceInBytes_; + void *workSpace_; + bool isSelectAlgo_; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/ConvBaseProjection.cpp b/paddle/gserver/layers/ConvBaseProjection.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d1e932ded595c90cbe6040c330c5c8663d81e2b4 --- /dev/null +++ b/paddle/gserver/layers/ConvBaseProjection.cpp @@ -0,0 +1,195 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ConvBaseProjection.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +ThreadLocalD<std::vector<MemoryHandle *>> ConvBaseProjection::convMem_; + +ConvBaseProjection::ConvBaseProjection(const ProjectionConfig &config, + ParameterPtr parameter, + bool useGpu) + : Projection(config, parameter, useGpu) { + CHECK(useGpu); // only support GPU + getConvParams(); + initCudnn(); + + size_t height = filterH_ * filterW_ * channels_ / groups_; + size_t width = numFilters_; + weight_.reset(new Weight(height, width, parameter)); + weightOffset_ = height * width / groups_; +} + +void ConvBaseProjection::getConvParams() { + const ConvConfig &conf = config_.conv_conf(); + paddingH_ = conf.padding_y(); + paddingW_ = conf.padding(); + + strideH_ = conf.stride_y(); + strideW_ = conf.stride(); + + filterH_ = conf.filter_size_y(); + filterW_ = conf.filter_size(); + + configImgH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); + configImgW_ = conf.img_size(); + + configOutH_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); + configOutW_ = conf.output_x(); + + configChannels_ = conf.channels(); + configNumFilters_ = config_.num_filters(); + + isDeconv_ = (config_.type() == "conv") ? false : true; + + channels_ = (isDeconv_) ? configNumFilters_ : configChannels_; + numFilters_ = (isDeconv_) ? configChannels_ : configNumFilters_; + + groups_ = conf.groups(); + CHECK_EQ(channels_ % groups_, 0); + CHECK_EQ(numFilters_ % groups_, 0); +} + +void ConvBaseProjection::initCudnn() { + hl_create_filter_descriptor(&filterDesc_, + channels_ / groups_, + numFilters_ / groups_, + filterH_, + filterW_); + hl_create_tensor_descriptor(&imageDesc_); + hl_create_tensor_descriptor(&outputDesc_); + hl_create_convolution_descriptor(&convDesc_, + imageDesc_, + filterDesc_, + paddingH_, + paddingW_, + strideH_, + strideW_); + + // initialize all to default algorithms + fwdAlgo_ = 0; + bwdFilterAlgo_ = 0; + bwdDataAlgo_ = 0; + fwdLimitBytes_ = 0; + bwdDataLimitBytes_ = 0; + bwdFilterLimitBytes_ = 0; + workSpaceInBytes_ = 0; + + batchNum_ = 0; + isSelectAlgo_ = false; +} + +void ConvBaseProjection::reshapeTensorDesc(int batchSize) { + // The stride between two consecutive samples in the output of ConvProjection + // may not be numFilters_ * outputH_ * outputW_ (conv) or + // channels_ * imageH_ * imageW_ (deconv); + // for example, in the case of layer ConcatenateLayer2 with two + // ConvProjections, the stride is the output_size of layer ConcatenateLayer2. + // So the calculation of nStride is different from CudnnConvLayer.
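+ // As a hypothetical illustration: if two ConvProjections feed one concatenated output of width 16 and each writes 8 values per sample, the stride between consecutive samples is out_->value->getStride() == 16, not 8.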
+ size_t nStrideImage, nStrideOutput; + if (isDeconv_) { + nStrideImage = out_->value->getStride(); + nStrideOutput = numFilters_ * outputH_ * outputW_; + } else { + nStrideImage = channels_ * imageH_ * imageW_; + nStrideOutput = out_->value->getStride(); + } + + hl_tensor_reshape(imageDesc_, + batchSize, + channels_ / groups_, + imageH_, + imageW_, + nStrideImage, + imageH_ * imageW_, + imageW_, + 1); + + hl_tensor_reshape(outputDesc_, + batchSize, + numFilters_ / groups_, + outputH_, + outputW_, + nStrideOutput, + outputH_ * outputW_, + outputW_, + 1); + + hl_reset_convolution_descriptor(convDesc_, + imageDesc_, + filterDesc_, + paddingH_, + paddingW_, + strideH_, + strideW_); +} + +void ConvBaseProjection::reshape(int batchSize) { + size_t width = calOutputSize(); + CHECK_EQ(width, out_->value->getWidth()); + CHECK_EQ(calInputSize(), in_->value->getWidth()); + + isSelectAlgo_ = (batchSize == batchNum_); + batchNum_ = batchSize; + + if (!isSelectAlgo_) { + reshapeTensorDesc(batchSize); + hl_conv_workspace(imageDesc_, + outputDesc_, + filterDesc_, + convDesc_, + &fwdAlgo_, + &fwdLimitBytes_, + &bwdDataAlgo_, + &bwdDataLimitBytes_, + &bwdFilterAlgo_, + &bwdFilterLimitBytes_); + + size_t maxWorkSpace = 0; + maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); + maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_); + workSpaceInBytes_ = maxWorkSpace; + + VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_ + << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_; + } + + isSelectAlgo_ = true; +} + +void *ConvBaseProjection::getSpaceBytes(size_t size) { + std::vector<MemoryHandle *> &convMem = *convMem_; + if (convMem.empty()) { + int numDevices = hl_get_device_count(); + convMem.resize(numDevices); + } + + int devId = hl_get_device(); + MemoryHandle **localMem = &(convMem[devId]); + if (NULL == *localMem || size > (*localMem)->getAllocSize()) { + *localMem = new GpuMemoryHandle(size); + } + return (*localMem)->getBuf(); +} + +ConvBaseProjection::~ConvBaseProjection() { + hl_destroy_tensor_descriptor(imageDesc_); + hl_destroy_tensor_descriptor(outputDesc_); + hl_destroy_filter_descriptor(filterDesc_); + hl_destroy_convolution_descriptor(convDesc_); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/ConvBaseProjection.h b/paddle/gserver/layers/ConvBaseProjection.h new file mode 100644 index 0000000000000000000000000000000000000000..4a33aa1837dfc36dbead60deaccbc6b772fe4754 --- /dev/null +++ b/paddle/gserver/layers/ConvBaseProjection.h @@ -0,0 +1,116 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Projection.h" +#include "paddle/math/MathUtils.h" + +namespace paddle { + +/** + * @brief Base class for ConvProjection and ConvTransProjection. + */ +class ConvBaseProjection : public Projection { +public: + /** + * Constructor.
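+ * Note that only a GPU implementation exists; the constructor CHECKs that useGpu is true.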
+ */ + ConvBaseProjection(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGpu); + + ~ConvBaseProjection(); + +protected: + void getConvParams(); + void initCudnn(); + + void reshapeTensorDesc(int batchSize); + void reshape(int batchSize); + + virtual size_t calOutputSize() = 0; + virtual size_t calInputSize() = 0; + + static void* getSpaceBytes(size_t size); + + /// True if it is a deconv projection layer; false if it is a ConvProjection layer. + bool isDeconv_; + /// imageH_ and imageW_ / outputH_ and outputW_ + /// are calculated from the input layer. + int imageH_, imageW_; + int outputH_, outputW_; + /// configImgH_ and configImgW_ / configOutH_ and configOutW_ + /// are obtained from the config. + int configImgH_, configImgW_; + int configOutH_, configOutW_; + /// channels_ and numFilters_ are defined in terms of convolution semantics + int channels_, numFilters_; + /// configChannels_ and configNumFilters_ are obtained from the config. + /// For Conv they are the same as channels_ and numFilters_; + /// for ConvTrans they are swapped with channels_ and numFilters_. + int configChannels_, configNumFilters_; + int paddingH_, paddingW_; + int strideH_, strideW_; + int filterH_, filterW_; + /// One group offset of input data. + int inputOffset_; + /// One group offset of output data. + int outputOffset_; + /// One group offset of weight. + int weightOffset_; + int groups_; + + /// Cudnn tensor descriptor for input. + hl_tensor_descriptor imageDesc_; + /// Cudnn tensor descriptor for output. + hl_tensor_descriptor outputDesc_; + /// Cudnn tensor descriptor for filter. + hl_filter_descriptor filterDesc_; + /// Cudnn tensor descriptor for a convolution operation. + hl_convolution_descriptor convDesc_; + + /// Record the algorithm for forward convolution, which is obtained by cudnn + /// api to search the best suited algorithm. + int fwdAlgo_; + /// Record the algorithm for computing convolution gradient with respect to + /// filter coefficients. + int bwdFilterAlgo_; + /// Record the algorithm for computing convolution gradient with respect to + /// the input data. + int bwdDataAlgo_; + /// Amount of GPU memory needed as workspace to be able to execute a + /// forward convolution with the specified algo. + size_t fwdLimitBytes_; + /// Amount of GPU memory needed as workspace to be able to execute a + /// backwardData with the specified algo. + size_t bwdDataLimitBytes_; + /// Amount of GPU memory needed as workspace to be able to execute a + /// backwardFilter with the specified algo. + size_t bwdFilterLimitBytes_; + /// Size of total work space. + size_t workSpaceInBytes_; + + /// Whether to call cuDNN api to choose conv algorithm. + bool isSelectAlgo_; + /// batchNum is used to record the batch size. If the batch size is changed, + /// the selection algorithm will be called. + int batchNum_; + bool bias_; + + std::unique_ptr<Weight> weight_; + static ThreadLocalD<std::vector<MemoryHandle *>> convMem_; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/ConvOperator.cpp b/paddle/gserver/layers/ConvOperator.cpp index f943410dee0dc2f3d356c9d7d8f61398fe2871c8..80932c8c509e3cb013c7e0051cbf4d8ccced0228 100644 --- a/paddle/gserver/layers/ConvOperator.cpp +++ b/paddle/gserver/layers/ConvOperator.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.
*/ -#include "Operator.h" +#include "ConvOperator.h" #include "paddle/math/MathUtils.h" #include "paddle/math/Matrix.h" @@ -27,120 +27,8 @@ namespace paddle { * The config file api is conv_operator. */ -class ConvOperator : public Operator { -public: - ConvOperator(const OperatorConfig &config, bool useGpu); - /** - * Free workspace in device and destroy cudnn tensor descriptor. - */ - virtual ~ConvOperator() { - if (workSpaceInBytes_ != 0) { - hl_free_mem_device(workSpace_); - workSpaceInBytes_ = 0; - } - - hl_destroy_tensor_descriptor(inputDesc_); - hl_destroy_tensor_descriptor(outputDesc_); - hl_destroy_filter_descriptor(filterDesc_); - hl_destroy_convolution_descriptor(convDesc_); - } - virtual void forward(); - virtual void backward(); - -private: - /** - * Get convolution parameters from layer config and - * initialize member variables. - */ - void getConvParams(); - - /** - * Allocate Gpu Memory for cudnn convolution algorithms. - */ - void allocConvWorkSpace(size_t maxWorkSpace); - - /** - * Create cudnn tensor descriptor for convolution operation. - */ - void computeConvSizes(); - - /** - * Reshape cudnn tensor descriptor. - */ - void reshapeImageDescriptors(); - - /** - * Reshape cudnn tensor descriptor. - */ - void reshape(int batchSize); - - /** - * Check filter size is equal to the size calculated by parameters from - * layer config. - */ - void checkFilterSize(const MatrixPtr &filter) { - CHECK_EQ(static_cast(filter->getWidth()), - filterSize_ * filterSizeY_ * channels_ * numFilters_); - } - - /// Most of member variables are same with CudnnConvLayer. - /// There is no explanation here. - int imageH_, imageW_, outputH_, outputW_; - hl_tensor_descriptor inputDesc_; - hl_tensor_descriptor outputDesc_; - hl_filter_descriptor filterDesc_; - hl_convolution_descriptor convDesc_; - bool caffeMode_; - int inputOffset_, outputOffset_, weightOffset_; - int numFilters_; - int padding_, stride_, filterSize_, channels_, imgSize_, imgSizeY_; - int paddingY_, strideY_, filterSizeY_; - int imgPixels_, filterPixels_, filterChannels_, outputX_, outputY_, outputs_; - - /// Following member variables are same with CudnnConvLayer. - /// There is no explanation here. 
- int fwdAlgo_, bwdFilterAlgo_, bwdDataAlgo_; - size_t fwdLimitBytes_, bwdDataLimitBytes_, bwdFilterLimitBytes_; - size_t workSpaceInBytes_; - void *workSpace_; - bool isSelectAlgo_; -}; - REGISTER_OPERATOR(conv, ConvOperator); -ConvOperator::ConvOperator(const OperatorConfig &config, bool useGpu) - : Operator(config, useGpu) { - CHECK(useGpu); - CHECK_EQ(config_.input_indices_size(), 2L); - - caffeMode_ = true; - getConvParams(); - computeConvSizes(); - - // initialize all to default algorithms - fwdAlgo_ = 0; - bwdFilterAlgo_ = 0; - bwdDataAlgo_ = 0; - fwdLimitBytes_ = 0; - bwdDataLimitBytes_ = 0; - bwdFilterLimitBytes_ = 0; - workSpaceInBytes_ = 0; - workSpace_ = nullptr; - - isSelectAlgo_ = false; -} - -void ConvOperator::allocConvWorkSpace(size_t maxWorkSpace) { - if (maxWorkSpace > workSpaceInBytes_) { - if (workSpaceInBytes_ != 0) { - hl_free_mem_device(workSpace_); - } - // total amount of storage needed - workSpace_ = hl_malloc_device(maxWorkSpace); - workSpaceInBytes_ = maxWorkSpace; - } -} - void ConvOperator::reshape(int batchSize) { imageH_ = ins_[0]->getFrameHeight(); imageW_ = ins_[0]->getFrameWidth(); @@ -148,106 +36,25 @@ void ConvOperator::reshape(int batchSize) { if (imageW_ == 0) imageW_ = imgSize_; outputH_ = outputSize(imageH_, filterSizeY_, paddingY_, strideY_, caffeMode_); outputW_ = outputSize(imageW_, filterSize_, padding_, stride_, caffeMode_); - + /// Check that the outputSizes are consistent with config + CHECK_EQ(outputH_, outputY_); + CHECK_EQ(outputW_, outputX_); out_->setFrameHeight(outputH_); out_->setFrameWidth(outputW_); reshapeImageDescriptors(); - if (!isSelectAlgo_) { - hl_conv_workspace(inputDesc_, - outputDesc_, - filterDesc_, - convDesc_, - &fwdAlgo_, - &fwdLimitBytes_, - &bwdDataAlgo_, - &bwdDataLimitBytes_, - &bwdFilterAlgo_, - &bwdFilterLimitBytes_); - - size_t maxWorkSpace = 0; - maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); - maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_); + inputOffset_ = channels_ * imageH_ * imageW_; + outputOffset_ = numFilters_ * outputH_ * outputW_; + weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSizeY_; - allocConvWorkSpace(maxWorkSpace); + if (!isSelectAlgo_) { + allocConvWorkSpace(); } isSelectAlgo_ = true; } -void ConvOperator::computeConvSizes() { - hl_create_filter_descriptor( - &filterDesc_, channels_, numFilters_, filterSizeY_, filterSize_); - hl_create_tensor_descriptor(&inputDesc_); - int outputX = - outputSize(imgSize_, filterSize_, padding_, stride_, caffeMode_); - int outputY = - outputSize(imgSizeY_, filterSizeY_, paddingY_, strideY_, caffeMode_); - CHECK_EQ(outputX, outputX_); - CHECK_EQ(outputY, outputY_); - hl_create_tensor_descriptor(&outputDesc_); - hl_create_convolution_descriptor(&convDesc_, - inputDesc_, - filterDesc_, - paddingY_, - padding_, - strideY_, - stride_); -} - -void ConvOperator::reshapeImageDescriptors() { - hl_tensor_reshape(inputDesc_, - 1, - channels_, - imageH_, - imageW_, - channels_ * imageH_ * imageW_, - imageH_ * imageW_, - imageW_, - 1); - hl_tensor_reshape(outputDesc_, - 1, - numFilters_, - outputH_, - outputW_, - numFilters_ * outputH_ * outputW_, - outputH_ * outputW_, - outputW_, - 1); - hl_reset_convolution_descriptor(convDesc_, - inputDesc_, - filterDesc_, - paddingY_, - padding_, - strideY_, - stride_); - inputOffset_ = channels_ * imageH_ * imageW_; - outputOffset_ = numFilters_ * outputH_ * outputW_; - weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSize_; -} - -void ConvOperator::getConvParams() { - numFilters_ 
= config_.num_filters(); - const ConvConfig &conf = config_.conv_conf(); - padding_ = conf.padding(); - stride_ = conf.stride(); - filterSize_ = conf.filter_size(); - paddingY_ = conf.padding_y(); - strideY_ = conf.stride_y(); - filterSizeY_ = conf.filter_size_y(); - filterPixels_ = filterSize_ * filterSizeY_; - channels_ = conf.channels(); - imgSize_ = conf.img_size(); - imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); - imgPixels_ = imgSize_ * imgSizeY_; - CHECK_EQ(conf.groups(), 1U); - filterChannels_ = conf.filter_channels(); - outputX_ = conf.output_x(); - outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); - outputs_ = outputX_ * outputX_; -} - void ConvOperator::forward() { size_t batchSize = ins_[0]->value->getHeight(); reshape(batchSize); @@ -264,7 +71,7 @@ void ConvOperator::forward() { real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId; real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId; real *outData = out_->value->getData() + outputOffset_ * batchId; - hl_convolution_forward(inputDesc_, + hl_convolution_forward(imageDesc_, inputData, outputDesc_, outData, @@ -287,7 +94,7 @@ void ConvOperator::backward() { if (ins_[1]->grad) { real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId; real *weightGrad = ins_[1]->grad->getData() + weightOffset_ * batchId; - hl_convolution_backward_filter(inputDesc_, + hl_convolution_backward_filter(imageDesc_, inputData, outputDesc_, outGrad, @@ -303,7 +110,7 @@ void ConvOperator::backward() { if (NULL != preGrad) { real *inputGrad = preGrad->getData() + inputOffset_ * batchId; real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId; - hl_convolution_backward_data(inputDesc_, + hl_convolution_backward_data(imageDesc_, inputGrad, outputDesc_, outGrad, diff --git a/paddle/gserver/layers/ConvOperator.h b/paddle/gserver/layers/ConvOperator.h new file mode 100644 index 0000000000000000000000000000000000000000..0f3546c67ac174628044d5fb6e5c7bce06f37995 --- /dev/null +++ b/paddle/gserver/layers/ConvOperator.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include "ConvBaseOperator.h" +#include "paddle/math/MathUtils.h" +#include "paddle/math/Matrix.h" + +namespace paddle { + +/** + * @brief ConvOperator takes two inputs to perform the convolution. + * The first input is the image, and the second input is the convolution kernel. + * The heights of the data of the two inputs are the same. Each sample of the + * first input is convolved with the corresponding sample of the second input + * independently. + * + * The config file api is conv_operator. + */ + +class ConvOperator : public ConvBaseOperator { +public: + ConvOperator(const OperatorConfig &config, bool useGpu) + : ConvBaseOperator(config, useGpu) {} + /** + * Free workspace in device and destroy cudnn tensor descriptor.
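+ * Both are done by the ConvBaseOperator destructor, so this destructor is empty.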
+ */ + virtual ~ConvOperator() {} + void forward() override; + void backward() override; + void reshape(int batchSize) override; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/ConvProjection.cpp b/paddle/gserver/layers/ConvProjection.cpp index 0281170bc59855f6f4d2f4212523275a92d202d5..5b7ecc5560c1e7431305b34a331fe1fbc96c6b06 100644 --- a/paddle/gserver/layers/ConvProjection.cpp +++ b/paddle/gserver/layers/ConvProjection.cpp @@ -19,149 +19,32 @@ namespace paddle { REGISTER_PROJECTION(conv, ConvProjection); -ThreadLocalD> ConvProjection::convMem_; - -ConvProjection::ConvProjection(const ProjectionConfig &config, - ParameterPtr parameter, - bool useGpu) - : Projection(config, parameter, useGpu) { - CHECK(useGpu); // only support GPU - getConvParams(); - initCudnn(); - - size_t height = filterH_ * filterW_ * channels_ / groups_; - size_t width = numFilters_; - weight_.reset(new Weight(height, width, parameter)); - weightOffset_ = height * width / groups_; -} - -void ConvProjection::getConvParams() { - const ConvConfig &conf = config_.conv_conf(); - paddingH_ = conf.padding_y(); - paddingW_ = conf.padding(); - - strideH_ = conf.stride_y(); - strideW_ = conf.stride(); - - filterH_ = conf.filter_size_y(); - filterW_ = conf.filter_size(); - - configImgH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); - configImgW_ = conf.img_size(); - - channels_ = conf.channels(); - numFilters_ = config_.num_filters(); - - groups_ = conf.groups(); - CHECK_EQ(channels_ % groups_, 0); - CHECK_EQ(numFilters_ % groups_, 0); -} - -void ConvProjection::initCudnn() { - hl_create_filter_descriptor(&filterDesc_, - channels_ / groups_, - numFilters_ / groups_, - filterH_, - filterW_); - hl_create_tensor_descriptor(&inputDesc_); - hl_create_tensor_descriptor(&outputDesc_); - hl_create_convolution_descriptor(&convDesc_, - inputDesc_, - filterDesc_, - paddingH_, - paddingW_, - strideH_, - strideW_); - - // initialize all to default algorithms - fwdAlgo_ = 0; - bwdFilterAlgo_ = 0; - bwdDataAlgo_ = 0; - fwdLimitBytes_ = 0; - bwdDataLimitBytes_ = 0; - bwdFilterLimitBytes_ = 0; - workSpaceInBytes_ = 0; - - batchNum_ = 0; - isSelectAlgo_ = false; -} - -void ConvProjection::reshapeTensorDesc(int batchSize) { - hl_tensor_reshape(inputDesc_, - batchSize, - channels_ / groups_, - imageH_, - imageW_, - channels_ * imageH_ * imageW_, - imageH_ * imageW_, - imageW_, - 1); - hl_reset_convolution_descriptor(convDesc_, - inputDesc_, - filterDesc_, - paddingH_, - paddingW_, - strideH_, - strideW_); - - // The stride between two consecutive images in ConvProjection may not be 1, - // for example, in the case of layer ConcatenateLayer2 with two - // ConvProjection, the stride is the output_size of layer ConcatenateLayer2. - // So the calculation of nStride is different from CudnnConvLayer. - // In fact, only "nStride = out_->value->getStride()" is ok. 
- size_t nStride = numFilters_ * outputH_ * outputW_; - if (out_->value->isContiguous()) { - CHECK_EQ(nStride, out_->value->getWidth()); - } else { - nStride = out_->value->getStride(); - } - - hl_tensor_reshape(outputDesc_, - batchSize, - numFilters_ / groups_, - outputH_, - outputW_, - nStride, - outputH_ * outputW_, - outputW_, - 1); +size_t ConvProjection::calOutputSize() { + imageH_ = in_->getFrameHeight(); + imageW_ = in_->getFrameWidth(); + if (imageH_ == 0) imageH_ = configImgH_; + if (imageW_ == 0) imageW_ = configImgW_; + outputH_ = outputSize(imageH_, + filterH_, + paddingH_, + strideH_, + /* caffeMode */ true); + outputW_ = outputSize(imageW_, + filterW_, + paddingW_, + strideW_, + /* caffeMode */ true); + + const_cast<Argument *>(out_)->setFrameHeight(outputH_); + const_cast<Argument *>(out_)->setFrameWidth(outputW_); + + inputOffset_ = (configChannels_ / groups_) * imageH_ * imageW_; + outputOffset_ = (configNumFilters_ / groups_) * outputH_ * outputW_; + return outputH_ * outputW_ * configNumFilters_; } -void ConvProjection::reshape(int batchSize) { - size_t width = calOutputSize(); - CHECK_EQ(width, out_->value->getWidth()); - CHECK_EQ(static_cast<size_t>(channels_ * imageH_ * imageW_), - in_->value->getWidth()) - << "Wrong input size for convolution" - << " channels=" << channels_ << " imageH=" << imageH_ - << " imageW=" << imageW_ << " inputSize=" << in_->value->getWidth(); - - isSelectAlgo_ = (batchSize == batchNum_); - batchNum_ = batchSize; - - if (!isSelectAlgo_) { - reshapeTensorDesc(batchSize); - hl_conv_workspace(inputDesc_, - outputDesc_, - filterDesc_, - convDesc_, - &fwdAlgo_, - &fwdLimitBytes_, - &bwdDataAlgo_, - &bwdDataLimitBytes_, - &bwdFilterAlgo_, - &bwdFilterLimitBytes_); - - size_t maxWorkSpace = 0; - maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); - maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_); - workSpaceInBytes_ = maxWorkSpace; - - VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_ - << " / " << bwdDataAlgo_ - << " / " << bwdFilterAlgo_; - } - - isSelectAlgo_ = true; +size_t ConvProjection::calInputSize() { + return static_cast<size_t>(configChannels_ * imageH_ * imageW_); } void ConvProjection::forward() { @@ -179,7 +62,7 @@ void ConvProjection::forward() { real *inputData = in_->value->getData() + g * inputOffset_; real *wgtData = weight_->getW()->getData() + g * weightOffset_; real *outData = out_->value->getData() + g * outputOffset_; - hl_convolution_forward(inputDesc_, + hl_convolution_forward(imageDesc_, inputData, outputDesc_, outData, @@ -205,7 +88,7 @@ void ConvProjection::backward(const UpdateCallback &callback) { if (weight_->getWGrad()) { real *inputData = in_->value->getData() + g * inputOffset_; real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_; - hl_convolution_backward_filter(inputDesc_, + hl_convolution_backward_filter(imageDesc_, inputData, outputDesc_, outGrad, @@ -221,7 +104,7 @@ void ConvProjection::backward(const UpdateCallback &callback) { if (NULL != preGrad) { real *inputGrad = preGrad->getData() + g * inputOffset_; real *wgtData = weight_->getW()->getData() + g * weightOffset_; - hl_convolution_backward_data(inputDesc_, + hl_convolution_backward_data(imageDesc_, inputGrad, outputDesc_, outGrad, @@ -237,26 +120,4 @@ void ConvProjection::backward(const UpdateCallback &callback) { weight_->getParameterPtr()->incUpdate(callback); } -void *ConvProjection::getSpaceBytes(size_t size) { - std::vector<MemoryHandle *> &convMem = *convMem_; - if (convMem.empty()) { - int numDevices = hl_get_device_count(); -
convMem.resize(numDevices); - } - - int devId = hl_get_device(); - MemoryHandle **localMem = &(convMem[devId]); - if (NULL == *localMem || size > (*localMem)->getAllocSize()) { - *localMem = new GpuMemoryHandle(size); - } - return (*localMem)->getBuf(); -} - -ConvProjection::~ConvProjection() { - hl_destroy_tensor_descriptor(inputDesc_); - hl_destroy_tensor_descriptor(outputDesc_); - hl_destroy_filter_descriptor(filterDesc_); - hl_destroy_convolution_descriptor(convDesc_); -} - } // namespace paddle diff --git a/paddle/gserver/layers/ConvProjection.h b/paddle/gserver/layers/ConvProjection.h index c32e5e1d3ab2f85feb6dd2fb5fbddd7482598e58..b7d7cc9a275529a02a5d8e82d28ed79cb7ce0b43 100644 --- a/paddle/gserver/layers/ConvProjection.h +++ b/paddle/gserver/layers/ConvProjection.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include "Projection.h" +#include "ConvBaseProjection.h" #include "paddle/math/MathUtils.h" namespace paddle { @@ -22,109 +22,22 @@ namespace paddle { /** * @brief Convolution projection do the same calculation with CudnnConvLayer. */ -class ConvProjection : public Projection { +class ConvProjection : public ConvBaseProjection { public: /** * Constructor. */ ConvProjection(const ProjectionConfig& config, ParameterPtr parameter, - bool useGpu); + bool useGpu) + : ConvBaseProjection(config, parameter, useGpu) {} - ~ConvProjection(); + ~ConvProjection() {} virtual void forward(); virtual void backward(const UpdateCallback& callback); - -protected: - void getConvParams(); - void initCudnn(); - - void reshapeTensorDesc(int batchSize); - void reshape(int batchSize); - - size_t calOutputSize() { - imageH_ = in_->getFrameHeight(); - imageW_ = in_->getFrameWidth(); - if (imageH_ == 0) imageH_ = configImgH_; - if (imageW_ == 0) imageW_ = configImgW_; - outputH_ = outputSize(imageH_, - filterH_, - paddingH_, - strideH_, - /* caffeMode */ true); - outputW_ = outputSize(imageW_, - filterW_, - paddingW_, - strideW_, - /* caffeMode */ true); - - const_cast(out_)->setFrameHeight(outputH_); - const_cast(out_)->setFrameWidth(outputW_); - - inputOffset_ = (channels_ / groups_) * imageH_ * imageW_; - outputOffset_ = (numFilters_ / groups_) * outputH_ * outputW_; - return outputH_ * outputW_ * numFilters_; - } - - static void* getSpaceBytes(size_t size); - - /// imageH_ and imageW_ is calculated from the input layer. - int imageH_, imageW_; - /// configImgH_ and configImgW_ is obtained from config. - int configImgH_, configImgW_; - int outputH_, outputW_; - int channels_, numFilters_; - int paddingH_, paddingW_; - int strideH_, strideW_; - int filterH_, filterW_; - /// One group offset of input data. - int inputOffset_; - /// One group offset of output data. - int outputOffset_; - /// One group offset of weight. - int weightOffset_; - int groups_; - - /// Cudnn tensor descriptor for input. - hl_tensor_descriptor inputDesc_; - /// Cudnn tensor descriptor for output. - hl_tensor_descriptor outputDesc_; - /// Cudnn tensor descriptor for filter. - hl_filter_descriptor filterDesc_; - /// Cudnn tensor descriptor for a convolution operation. - hl_convolution_descriptor convDesc_; - - /// Record the algorithm for forward convolution, which is obtained by cudnn - /// api to search the best suited algorithm. - int fwdAlgo_; - /// Record the algorithm for computing convolution gradient with respect to - /// filter coefficients. - int bwdFilterAlgo_; - /// Record the algorithm for computing convolution gradient with respect to - /// the output. 
- int bwdDataAlgo_; - /// Amount of GPU memory needed as workspace to be able to execute a - /// forward convolution with the specified algo. - size_t fwdLimitBytes_; - /// Amount of GPU memory needed as workspace to be able to execute a - /// backwardFilter with the specified algo. - size_t bwdDataLimitBytes_; - /// Amount of GPU memory needed as workspace to be able to execute a - /// backwardData with the specified algo. - size_t bwdFilterLimitBytes_; - /// Size of total work space. - size_t workSpaceInBytes_; - - /// Whether to call cuDNN api to choose conv algorithm. - bool isSelectAlgo_; - /// batchNum is used to record batch size. If the batch size is changed, - /// the selection algorithm will be called. - int batchNum_; - bool bias_; - - std::unique_ptr<Weight> weight_; - static ThreadLocalD<std::vector<MemoryHandle *>> convMem_; + virtual size_t calOutputSize(); + virtual size_t calInputSize(); }; } // namespace paddle diff --git a/paddle/gserver/layers/ConvTransOperator.cpp b/paddle/gserver/layers/ConvTransOperator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..db026337a473f7edf1a7c0db320f60ff3048eb9c --- /dev/null +++ b/paddle/gserver/layers/ConvTransOperator.cpp @@ -0,0 +1,125 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ConvTransOperator.h" +#include "paddle/math/MathUtils.h" +#include "paddle/math/Matrix.h" + +namespace paddle { + +/** + * @brief ConvTransOperator takes two inputs to perform the convolution. + * The first input is the image, and the second input is the convolution kernel. + * The heights of the data of the two inputs are the same. Each sample of the + * first input is convolved with the corresponding sample of the second input + * independently. + * + * The config file api is conv_operator.
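+ * As the transpose of ConvOperator, its forward() calls hl_convolution_backward_data, and its backward() calls hl_convolution_backward_filter and hl_convolution_forward.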
+ */ + +REGISTER_OPERATOR(convt, ConvTransOperator); + +void ConvTransOperator::reshape(int batchSize) { + outputH_ = ins_[0]->getFrameHeight(); + outputW_ = ins_[0]->getFrameWidth(); + if (outputH_ == 0) outputH_ = outputY_; + if (outputW_ == 0) outputW_ = outputX_; + imageH_ = imageSize(outputH_, filterSizeY_, paddingY_, strideY_, caffeMode_); + imageW_ = imageSize(outputW_, filterSize_, padding_, stride_, caffeMode_); + /// Check that the imageSizes are consistent with config + CHECK_EQ(imageH_, imgSizeY_); + CHECK_EQ(imageW_, imgSize_); + out_->setFrameHeight(imageH_); + out_->setFrameWidth(imageW_); + + reshapeImageDescriptors(); + + inputOffset_ = numFilters_ * outputH_ * outputW_; + outputOffset_ = channels_ * imageH_ * imageW_; + weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSizeY_; + + if (!isSelectAlgo_) { + allocConvWorkSpace(); + } + + isSelectAlgo_ = true; +} + +void ConvTransOperator::forward() { + size_t batchSize = ins_[0]->value->getHeight(); + reshape(batchSize); + CHECK_EQ(ins_[1]->value->getHeight(), batchSize); + checkFilterSize(ins_[1]->value); + Matrix::resizeOrCreate( + out_->value, batchSize, imageH_ * imageW_ * channels_, false, useGpu_); + { + AsyncGpuBlock block; + for (size_t batchId = 0; batchId < batchSize; ++batchId) { + real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId; + real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId; + real *outData = out_->value->getData() + outputOffset_ * batchId; + hl_convolution_backward_data(imageDesc_, + outData, + outputDesc_, + inputData, + filterDesc_, + wgtData, + convDesc_, + workSpace_, + workSpaceInBytes_, + bwdDataAlgo_); + } + } +} + +void ConvTransOperator::backward() { + size_t batchSize = ins_[0]->value->getHeight(); + { + AsyncGpuBlock block; + for (size_t batchId = 0; batchId < batchSize; ++batchId) { + real *outGrad = out_->grad->getData() + outputOffset_ * batchId; + if (ins_[1]->grad) { + real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId; + real *weightGrad = ins_[1]->grad->getData() + weightOffset_ * batchId; + hl_convolution_backward_filter(imageDesc_, + outGrad, + outputDesc_, + inputData, + filterDesc_, + weightGrad, + convDesc_, + workSpace_, + workSpaceInBytes_, + bwdFilterAlgo_); + } + + MatrixPtr preGrad = ins_[0]->grad; + if (NULL != preGrad) { + real *inputGrad = preGrad->getData() + inputOffset_ * batchId; + real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId; + hl_convolution_forward(imageDesc_, + outGrad, + outputDesc_, + inputGrad, + filterDesc_, + wgtData, + convDesc_, + workSpace_, + workSpaceInBytes_, + fwdAlgo_); + } + } + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/ConvTransOperator.h b/paddle/gserver/layers/ConvTransOperator.h new file mode 100644 index 0000000000000000000000000000000000000000..ca08dc9aa77d59b45635c16cdd5064c5c3b5f96d --- /dev/null +++ b/paddle/gserver/layers/ConvTransOperator.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include "ConvBaseOperator.h" +#include "paddle/math/MathUtils.h" +#include "paddle/math/Matrix.h" + +namespace paddle { + +/** + * @brief ConvTransOperator takes two inputs to perform the convolution. + * The first input is the image, and the second input is the convolution kernel. + * The heights of the data of the two inputs are the same. Each sample of the + * first input is convolved with the corresponding sample of the second input + * independently. + * + * The config file api is conv_operator. + */ + +class ConvTransOperator : public ConvBaseOperator { +public: + ConvTransOperator(const OperatorConfig &config, bool useGpu) + : ConvBaseOperator(config, useGpu) {} + /** + * Free workspace in device and destroy cudnn tensor descriptor. + */ + virtual ~ConvTransOperator() {} + void forward() override; + void backward() override; + void reshape(int batchSize) override; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/ConvTransProjection.cpp b/paddle/gserver/layers/ConvTransProjection.cpp new file mode 100644 index 0000000000000000000000000000000000000000..48132a3ce4cc4b50fea6d755d84d7254d2055bec --- /dev/null +++ b/paddle/gserver/layers/ConvTransProjection.cpp @@ -0,0 +1,123 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
*/ + +#include "ConvTransProjection.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +REGISTER_PROJECTION(convt, ConvTransProjection); +size_t ConvTransProjection::calOutputSize() { + outputH_ = in_->getFrameHeight(); + outputW_ = in_->getFrameWidth(); + if (outputH_ == 0) outputH_ = configOutH_; + if (outputW_ == 0) outputW_ = configOutW_; + imageH_ = imageSize(outputH_, + filterH_, + paddingH_, + strideH_, + /* caffeMode */ true); + + imageW_ = imageSize(outputW_, + filterW_, + paddingW_, + strideW_, + /* caffeMode */ true); + + const_cast<Argument *>(out_)->setFrameHeight(imageH_); + const_cast<Argument *>(out_)->setFrameWidth(imageW_); + + inputOffset_ = (configChannels_ / groups_) * outputH_ * outputW_; + outputOffset_ = (configNumFilters_ / groups_) * imageH_ * imageW_; + return imageH_ * imageW_ * configNumFilters_; +} + +size_t ConvTransProjection::calInputSize() { + return static_cast<size_t>(configChannels_ * outputH_ * outputW_); +} + +void ConvTransProjection::forward() { + int batchSize = in_->value->getHeight(); + reshape(batchSize); + + void *workSpace = NULL; + if (workSpaceInBytes_ > 0) { + workSpace = getSpaceBytes(workSpaceInBytes_); + } + + for (int g = 0; g < groups_; ++g) { + REGISTER_TIMER_INFO("CudnnConvTransFwTimer", getName().c_str()); + + real *inData = in_->value->getData() + g * inputOffset_; + real *wgtData = weight_->getW()->getData() + g * weightOffset_; + real *outData = out_->value->getData() + g * outputOffset_; + hl_convolution_backward_data(imageDesc_, + outData, + outputDesc_, + inData, + filterDesc_, + wgtData, + convDesc_, + workSpace, + bwdDataLimitBytes_, + bwdDataAlgo_); + } +} + +void ConvTransProjection::backward(const UpdateCallback &callback) { + REGISTER_TIMER_INFO("CudnnConvTransBpTimer", getName().c_str()); + + void *workSpace = NULL; + if (workSpaceInBytes_ > 0) { + workSpace = getSpaceBytes(workSpaceInBytes_); + } + + for (int g = 0; g < groups_; ++g) { + real *outGrad = out_->grad->getData() + g * outputOffset_; + if (weight_->getWGrad()) { + real *inData = in_->value->getData() + g * inputOffset_; + real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_; + hl_convolution_backward_filter(imageDesc_, + outGrad, + outputDesc_, + inData, + filterDesc_, + weightGrad, + convDesc_, + workSpace, + bwdFilterLimitBytes_, + bwdFilterAlgo_); + } + + MatrixPtr preGrad = in_->grad; + if (NULL != preGrad) { + real *inGrad = preGrad->getData() + g * inputOffset_; + real *wgtData = weight_->getW()->getData() + g * weightOffset_; + hl_convolution_forward(imageDesc_, + outGrad, + outputDesc_, + inGrad, + filterDesc_, + wgtData, + convDesc_, + workSpace, + fwdLimitBytes_, + fwdAlgo_); + } + } + + weight_->getParameterPtr()->incUpdate(callback); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/ConvTransProjection.h b/paddle/gserver/layers/ConvTransProjection.h new file mode 100644 index 0000000000000000000000000000000000000000..6508d17b2409aa0cc11cdafb306604816f010718 --- /dev/null +++ b/paddle/gserver/layers/ConvTransProjection.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "ConvBaseProjection.h" +#include "paddle/math/MathUtils.h" + +namespace paddle { + +/** + * @brief Convolution projection that does the same calculation as CudnnConvLayer. + */ +class ConvTransProjection : public ConvBaseProjection { +public: + /** + * Constructor. + */ + ConvTransProjection(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGpu) + : ConvBaseProjection(config, parameter, useGpu) {} + + ~ConvTransProjection() {} + + virtual void forward(); + virtual void backward(const UpdateCallback& callback); + virtual size_t calOutputSize(); + virtual size_t calInputSize(); +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp index 998b8d7d3034cb18fbab242c66656092bfc50fcb..4ae5b828707eb8412e98cbefcf3949d62e81ad1e 100644 --- a/paddle/gserver/layers/CostLayer.cpp +++ b/paddle/gserver/layers/CostLayer.cpp @@ -192,6 +192,59 @@ void SumOfSquaresCostLayer::backwardImp(Matrix& output, outputG.sumOfSquaresBp(output, *label.value); } +// +// class SmoothL1CostLayer +// + +REGISTER_LAYER(smooth_l1, SmoothL1CostLayer); + +bool SmoothL1CostLayer::init(const LayerMap& layerMap, + const ParameterMap& parameterMap) { + return CostLayer::init(layerMap, parameterMap); +} + +void SmoothL1CostLayer::forwardImp(Matrix& output, + Argument& label, + Matrix& target) { + MatrixPtr targetCpu, outputCpu, labelCpu; + if (useGpu_) { + targetCpu = + Matrix::create(target.getHeight(), target.getWidth(), false, false); + outputCpu = + Matrix::create(output.getHeight(), output.getWidth(), false, false); + labelCpu = Matrix::create( + label.value->getHeight(), label.value->getWidth(), false, false); + targetCpu->copyFrom(target); + outputCpu->copyFrom(output); + labelCpu->copyFrom(*label.value); + targetCpu->smoothL1(*outputCpu, *(labelCpu)); + target.copyFrom(*targetCpu); + } else { + target.smoothL1(output, *label.value); + } +} + +void SmoothL1CostLayer::backwardImp(Matrix& output, + Argument& label, + Matrix& outputG) { + MatrixPtr outputGCpu, outputCpu, labelCpu; + if (useGpu_) { + outputGCpu = + Matrix::create(outputG.getHeight(), outputG.getWidth(), false, false); + outputCpu = + Matrix::create(output.getHeight(), output.getWidth(), false, false); + labelCpu = Matrix::create( + label.value->getHeight(), label.value->getWidth(), false, false); + outputGCpu->copyFrom(outputG); + outputCpu->copyFrom(output); + labelCpu->copyFrom(*label.value); + outputGCpu->smoothL1Bp(*outputCpu, *labelCpu); + outputG.copyFrom(*outputGCpu); + } else { + outputG.smoothL1Bp(output, *label.value); + } +} + // // class RankingCost // diff --git a/paddle/gserver/layers/CostLayer.h b/paddle/gserver/layers/CostLayer.h index b3045e0b31308abf2caa90cbd21f105e685ef341..569a6840f0d4432cc827219f590b821df115c7ea 100644 --- a/paddle/gserver/layers/CostLayer.h +++ b/paddle/gserver/layers/CostLayer.h @@ -159,6 +159,29 @@ public: Matrix& outputGrad) override; }; +/** + * This cost layer computes the smooth L1 loss for real-valued regression + * tasks.
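+ * Because its gradient is bounded by 1 in magnitude, it is less sensitive to outliers than the sum-of-squares cost.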
+ * \f[ + * L = \begin{cases} + * 0.5 (output - label)^2, & \text{if } -1 < (output - label) < 1 \\ + * |output - label| - 0.5, & \text{otherwise} + * \end{cases} + * \f] + */ +class SmoothL1CostLayer : public CostLayer { +public: + explicit SmoothL1CostLayer(const LayerConfig& config) : CostLayer(config) {} + + bool init(const LayerMap& layerMap, + const ParameterMap& parameterMap) override; + + void forwardImp(Matrix& output, Argument& label, Matrix& cost) override; + + void backwardImp(Matrix& outputValue, + Argument& label, + Matrix& outputGrad) override; +}; + /** * A cost layer for learning to rank (LTR) task. This layer contains at least * three inputs. diff --git a/paddle/gserver/layers/CudnnConvLayer.cpp b/paddle/gserver/layers/CudnnConvBaseLayer.cpp similarity index 66% rename from paddle/gserver/layers/CudnnConvLayer.cpp rename to paddle/gserver/layers/CudnnConvBaseLayer.cpp index 978c2c1479c64ab2cdebaaff7394059b3d033ab6..24363bb8b09cc354c25abe512257be68566c10e1 100644 --- a/paddle/gserver/layers/CudnnConvLayer.cpp +++ b/paddle/gserver/layers/CudnnConvBaseLayer.cpp @@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "CudnnConvLayer.h" +#include "CudnnConvBaseLayer.h" #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" namespace paddle { +REGISTER_LAYER(cudnn_conv, CudnnConvBaseLayer); +REGISTER_LAYER(cudnn_convt, CudnnConvBaseLayer); -REGISTER_LAYER(cudnn_conv, CudnnConvLayer); - -bool CudnnConvLayer::init(const LayerMap &layerMap, - const ParameterMap &parameterMap) { +bool CudnnConvBaseLayer::init(const LayerMap &layerMap, + const ParameterMap &parameterMap) { if (!ConvBaseLayer::init(layerMap, parameterMap)) return false; CHECK(useGpu_) << "CudnnConvLayer only support gpu"; @@ -33,7 +33,11 @@ bool CudnnConvLayer::init(const LayerMap &layerMap, CHECK(config_.shared_biases()); for (size_t i = 0; i < inputLayers_.size(); i++) { ProjectionConfig *conf = new ProjectionConfig(); - conf->set_type("conv"); + if (isDeconv_) { + conf->set_type("convt"); + } else { + conf->set_type("conv"); + } conf->set_num_filters(numFilters_); ConvConfig *convConf = conf->mutable_conv_conf(); *convConf = *(config_.mutable_inputs(i)->mutable_conv_conf()); @@ -47,14 +51,13 @@ bool CudnnConvLayer::init(const LayerMap &layerMap, if (biases_.get() && sharedBiases_) { hl_create_tensor_descriptor(&biasDesc_); hl_create_tensor_descriptor(&outputDesc_); - hl_tensor_reshape(biasDesc_, 1, numFilters_ / groups_[0], 1, 1); - biasOffset_ = numFilters_ / groups_[0]; + hl_tensor_reshape(biasDesc_, 1, numFilters_, 1, 1); } return true; } -void CudnnConvLayer::forward(PassType passType) { +void CudnnConvBaseLayer::forward(PassType passType) { Layer::forward(passType); int batchSize = getInput(0).getBatchSize(); @@ -67,37 +70,41 @@ void CudnnConvLayer::forward(PassType passType) { if (biases_) { REGISTER_TIMER_INFO("CudnnConvBiasTimer", getName().c_str()); int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); + int outH, outW; + if (isDeconv_) { + outH = imgSizeH_[0]; + outW = imgSizeW_[0]; + } else { + outH = outputH_[0]; + outW = outputW_[0]; + } + hl_tensor_reshape(outputDesc_, batchSize, - numFilters_ / groups_[0], - outputH_[0], - outputW_[0], - numFilters_ * outputH_[0] * outputW_[0], - outputH_[0] * outputW_[0], - outputW_[0], + numFilters_, + outH, + outW, + numFilters_ * outH * outW, + outH * outW, + outW, 1); - outputOffset_ = getOutputValue()->getWidth() / groups_[0]; - for (int g = 0; g <
groups_[0]; ++g) { - real *biasData = biases_->getW()->getData() + biasOffset_ * g; - real *outData = getOutputValue()->getData() + outputOffset_ * g; - hl_convolution_forward_add_bias( - biasDesc_, biasData, outputDesc_, outData); - } + real *outData = getOutputValue()->getData(); + real *biasData = biases_->getW()->getData(); + hl_convolution_forward_add_bias(biasDesc_, biasData, outputDesc_, outData); } forwardActivation(); } -void CudnnConvLayer::backward(const UpdateCallback &callback) { +void CudnnConvBaseLayer::backward(const UpdateCallback &callback) { backwardActivation(); if (biases_ && biases_->getWGrad()) { REGISTER_TIMER_INFO("CudnnConvBpBiasTimer", getName().c_str()); - for (int g = 0; g < groups_[0]; ++g) { - real *biasGrad = biases_->getWGrad()->getData() + biasOffset_ * g; - real *outGrad = getOutputGrad()->getData() + outputOffset_ * g; - hl_convolution_backward_bias(biasDesc_, biasGrad, outputDesc_, outGrad); - } + real *biasGrad = biases_->getWGrad()->getData(); + real *outGrad = getOutputGrad()->getData(); + hl_convolution_backward_bias(biasDesc_, biasGrad, outputDesc_, outGrad); + biases_->getParameterPtr()->incUpdate(callback); } @@ -106,7 +113,7 @@ void CudnnConvLayer::backward(const UpdateCallback &callback) { } } -CudnnConvLayer::~CudnnConvLayer() { +CudnnConvBaseLayer::~CudnnConvBaseLayer() { if (biases_) { hl_destroy_tensor_descriptor(biasDesc_); hl_destroy_tensor_descriptor(outputDesc_); diff --git a/paddle/gserver/layers/CudnnConvLayer.h b/paddle/gserver/layers/CudnnConvBaseLayer.h similarity index 86% rename from paddle/gserver/layers/CudnnConvLayer.h rename to paddle/gserver/layers/CudnnConvBaseLayer.h index 919b1efc4e453219a6c2ab1a11c61ccb99404084..93a05f94c7717f9170818b9d5ce3d27a6d18cef5 100644 --- a/paddle/gserver/layers/CudnnConvLayer.h +++ b/paddle/gserver/layers/CudnnConvBaseLayer.h @@ -30,27 +30,24 @@ namespace paddle { * * The config file api is img_conv_layer. 
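+ * It is registered for both the cudnn_conv and cudnn_convt layer types; it creates one conv or convt projection per input and adds the shared bias itself.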
*/ -class CudnnConvLayer : public ConvBaseLayer { +class CudnnConvBaseLayer : public ConvBaseLayer { protected: std::vector<std::unique_ptr<ProjectionConfig>> projConf_; std::vector<std::unique_ptr<Projection>> projections_; hl_tensor_descriptor biasDesc_; hl_tensor_descriptor outputDesc_; - int biasOffset_; - int outputOffset_; public: - explicit CudnnConvLayer(const LayerConfig& config) : ConvBaseLayer(config) {} + explicit CudnnConvBaseLayer(const LayerConfig& config) + : ConvBaseLayer(config) {} - ~CudnnConvLayer(); + ~CudnnConvBaseLayer(); + void forward(PassType passType) override; + void backward(const UpdateCallback& callback) override; bool init(const LayerMap& layerMap, const ParameterMap& parameterMap) override; - void forward(PassType passType) override; - void backward(const UpdateCallback& callback) override; - void addBiases(); - void bpropBiases(); }; } // namespace paddle diff --git a/paddle/gserver/layers/Layer.cpp b/paddle/gserver/layers/Layer.cpp index f76d41ad3e8a3b1730f9d50c0773ee4f61ddb541..125aaf947f3c9d976b117667d1d1b7700a029cc6 100644 --- a/paddle/gserver/layers/Layer.cpp +++ b/paddle/gserver/layers/Layer.cpp @@ -381,8 +381,7 @@ void Layer::backwardActivation() { void Layer::forwardDropOut() { auto& outV = getOutputValue(); - if (passType_ == PASS_TRAIN || passType_ == PASS_METRIC_TRAIN || - passType_ == PASS_METRIC_TRAIN_WITH_NOERROR) { + if (passType_ == PASS_TRAIN) { // new dropOutMask_ if dropOutMask_ is null ptr Matrix::resizeOrCreate(dropOutMask_, outV->getHeight(), diff --git a/paddle/gserver/layers/SequencePoolLayer.cpp b/paddle/gserver/layers/SequencePoolLayer.cpp index 35260ca912d5d0e00213ffb7074bd8963da265da..5807c4249620db44fed82a6bb69a77d807d9f0a0 100644 --- a/paddle/gserver/layers/SequencePoolLayer.cpp +++ b/paddle/gserver/layers/SequencePoolLayer.cpp @@ -56,17 +56,16 @@ void SequencePoolLayer::forward(PassType passType) { CHECK_EQ(newBatchSize_, starts->getSize() - 1); resetOutput(newBatchSize_, dim); - if (type_) { - CHECK(input.subSequenceStartPositions) - << "when trans_type = seq, input must hasSubseq"; - } + /* If type_ = kNonSeq, a seq, with or without sub-seq, degrades to a non-seq; * thus, in this case, output_ has no sequenceStartPositions. * If type_ = kSeq, a seq with sub-seq degrades to a seq; thus, only in this * case should we compute the new sequenceStartPositions. */ if (type_) { - output_.degradeSequence(input, useGpu_); + CHECK(input.subSequenceStartPositions) + << "when trans_type = seq, input must hasSubseq"; + output_.degradeSequence(input); } } diff --git a/paddle/gserver/tests/test_ConvUnify.cpp b/paddle/gserver/tests/test_ConvUnify.cpp index 207fc0566fcf4a0d2e971f3c169a14a64146155b..54b72375b743fe025e0ded5fdbce5699a0b4be1a 100644 --- a/paddle/gserver/tests/test_ConvUnify.cpp +++ b/paddle/gserver/tests/test_ConvUnify.cpp @@ -34,8 +34,7 @@ DECLARE_double(checkgrad_eps); DECLARE_bool(thread_local_rand_use_global_seed); DECLARE_bool(prev_batch_state); -// Do one forward pass of convTrans layer and check to see if its output -// matches the given result +// Do one forward pass of ConvLayer using either exconv or cudnn_conv MatrixPtr doOneConvTest(size_t imgSize, size_t output_x, size_t stride, @@ -46,22 +45,35 @@ MatrixPtr doOneConvTest(size_t imgSize, size_t groups, MatrixPtr& inputData, real* param, - bool useGpu) { + bool useGpu, + bool isDeconv = false) { TestConfig config; config.biasSize = numfilters; + string layerType; if (useGpu) { - config.layerConfig.set_type("cudnn_conv"); + layerType = (isDeconv) ?
"cudnn_convt" : "cudnn_conv"; } else { - config.layerConfig.set_type("exconv"); + layerType = (isDeconv) ? "exconvt" : "exconv"; } + config.layerConfig.set_type(layerType); config.layerConfig.set_num_filters(numfilters); config.layerConfig.set_partial_sum(1); config.layerConfig.set_shared_biases(true); size_t weightSize = channel * filter_size * filter_size * config.layerConfig.num_filters() / groups; - config.inputDefs.push_back( - {INPUT_DATA, "layer_0", imgSize * imgSize * channel, weightSize}); + if (isDeconv) { + config.inputDefs.push_back( + {INPUT_DATA, "layer_0", output_x * output_x * channel, weightSize}); + config.layerConfig.set_size(imgSize * imgSize * + config.layerConfig.num_filters()); + } else { + config.inputDefs.push_back( + {INPUT_DATA, "layer_0", imgSize * imgSize * channel, weightSize}); + config.layerConfig.set_size(output_x * output_x * + config.layerConfig.num_filters()); + } + LayerInputConfig* input = config.layerConfig.add_inputs(); ConvConfig* conv = input->mutable_conv_conf(); conv->set_filter_size(filter_size); @@ -72,12 +84,15 @@ MatrixPtr doOneConvTest(size_t imgSize, conv->set_stride(stride); conv->set_stride_y(stride); conv->set_groups(groups); - conv->set_filter_channels(channel / groups); conv->set_img_size(imgSize); conv->set_output_x(output_x); - config.layerConfig.set_size(conv->output_x() * conv->output_x() * - config.layerConfig.num_filters()); + if (isDeconv) { + conv->set_filter_channels(numfilters / groups); + } else { + conv->set_filter_channels(channel / groups); + } + config.layerConfig.set_name("conv"); std::vector dataLayers; @@ -105,6 +120,8 @@ MatrixPtr doOneConvTest(size_t imgSize, TEST(Layer, convParaUnified) { #ifndef PADDLE_ONLY_CPU MatrixPtr input, resultCpu, resultGpu; + + /// TEST1 for conv /// input = Matrix::create(1, 4 * 4, false, false); real inputData[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; real param[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 7, 6, 5, 4, 3, 2, 1}; @@ -121,7 +138,7 @@ TEST(Layer, convParaUnified) { /*groups*/ 1, input, param, - false); + /*useGpu*/ false); resultGpu = doOneConvTest(/* imgSize */ 4, /* output_x */ 2, @@ -133,9 +150,42 @@ TEST(Layer, convParaUnified) { /*groups*/ 1, input, param, - true); + /*useGpu*/ true); checkMatrixEqual(resultCpu, resultGpu); + /// TEST1 for deconv /// + input = Matrix::create(1, 2 * 2, false, false); + real inputDataT[] = {1, 2, 3, 4}; + input->setData(inputDataT); + + resultCpu = doOneConvTest(/* imgSize */ 4, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 3, + /*channel*/ 1, + /*numfilters*/ 2, + /*groups*/ 1, + input, + param, + /*useGpu*/ false, + /*isDeconv*/ true); + + resultGpu = doOneConvTest(/* imgSize */ 4, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 3, + /*channel*/ 1, + /*numfilters*/ 2, + /*groups*/ 1, + input, + param, + /*useGpu*/ true, + /*isDeconv*/ true); + checkMatrixEqual(resultCpu, resultGpu); + + /// TEST2 for conv /// input = Matrix::create(1, 3 * 3 * 2, false, false); real inputData2[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; @@ -153,7 +203,7 @@ TEST(Layer, convParaUnified) { /*groups*/ 1, input, param2, - false); + /*useGpu*/ false); resultGpu = doOneConvTest(/* imgSize */ 3, /* output_x */ 2, @@ -165,9 +215,10 @@ TEST(Layer, convParaUnified) { /*groups*/ 1, input, param2, - true); + /*useGpu*/ true); checkMatrixEqual(resultCpu, resultGpu); + /// TEST3 for conv /// real param3[] = {1, 2, 3, 4, 4, 3, 2, 1}; resultCpu = doOneConvTest(/* imgSize */ 
3, @@ -180,7 +231,66 @@ TEST(Layer, convParaUnified) { /*groups*/ 2, input, param3, - false); + /*useGpu*/ false); + + resultGpu = doOneConvTest(/* imgSize */ 3, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 2, + /*channel*/ 2, + /*numfilters*/ 2, + /*groups*/ 2, + input, + param3, + /*useGpu*/ true); + checkMatrixEqual(resultCpu, resultGpu); + + /// TEST2 for deconv /// + input = Matrix::create(1, 2 * 2 * 2, false, false); + real inputData2T[] = {1, 2, 3, 4, 5, 6, 7, 8}; + input->setData(inputData2T); + + resultCpu = doOneConvTest(/* imgSize */ 3, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 2, + /*channel*/ 2, + /*numfilters*/ 2, + /*groups*/ 1, + input, + param2, + /*useGpu*/ false, + /*isDeconv*/ true); + + resultGpu = doOneConvTest(/* imgSize */ 3, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 2, + /*channel*/ 2, + /*numfilters*/ 2, + /*groups*/ 1, + input, + param2, + /*useGpu*/ true, + /*isDeconv*/ true); + checkMatrixEqual(resultCpu, resultGpu); + + /// TEST3 for deconv /// + resultCpu = doOneConvTest(/* imgSize */ 3, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 2, + /*channel*/ 2, + /*numfilters*/ 2, + /*groups*/ 2, + input, + param3, + /*useGpu*/ false, + /*isDeconv*/ true); resultGpu = doOneConvTest(/* imgSize */ 3, /* output_x */ 2, @@ -192,7 +302,8 @@ TEST(Layer, convParaUnified) { /*groups*/ 2, input, param3, - true); + /*useGpu*/ true, + /*isDeconv*/ true); checkMatrixEqual(resultCpu, resultGpu); #endif } diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index ceb69359c992128635c199e56805d3f603ca4271..5f8a7b79a06e014e3d9cb03ab033e0bce47a432a 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -166,15 +166,19 @@ TEST(Projection, scaling) { } } -void testProjectionConv(size_t groups) { +void testProjectionConv(size_t groups, bool isDeconv) { const int NUM_FILTERS = 18; const int FILTER_SIZE = 2; - const int FILTER_SIZE_Y = 3; + const int FILTER_SIZE_Y = 4; const int CHANNELS = 3; const int IMAGE_SIZE = 16; ProjectionConfig conf; - conf.set_type("conv"); + if (isDeconv) { + conf.set_type("convt"); + } else { + conf.set_type("conv"); + } conf.set_num_filters(NUM_FILTERS); ConvConfig* conv = conf.mutable_conv_conf(); @@ -186,7 +190,11 @@ void testProjectionConv(size_t groups) { conv->set_stride(2); conv->set_stride_y(2); conv->set_groups(groups); - conv->set_filter_channels(conv->channels() / conv->groups()); + if (isDeconv) { + conv->set_filter_channels(NUM_FILTERS / conv->groups()); + } else { + conv->set_filter_channels(conv->channels() / conv->groups()); + } conv->set_img_size(IMAGE_SIZE); int output_x = outputSize(conv->img_size(), conv->filter_size(), @@ -199,8 +207,14 @@ void testProjectionConv(size_t groups) { conv->stride_y(), /* caffeMode */ true); conv->set_output_x(output_x); - conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS); - conf.set_output_size(output_x * output_y * NUM_FILTERS); + conv->set_output_y(output_y); + if (isDeconv) { + conf.set_input_size(output_x * output_y * CHANNELS); + conf.set_output_size(IMAGE_SIZE * IMAGE_SIZE * NUM_FILTERS); + } else { + conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS); + conf.set_output_size(output_x * output_y * NUM_FILTERS); + } testProjectionGrad(conf, INPUT_DATA, @@ -215,8 +229,12 @@ void testProjectionConv(size_t groups) { #ifndef PADDLE_ONLY_CPU TEST(Projection, conv) { - 
testProjectionConv(1); - testProjectionConv(3); + /// test ConvProjection + testProjectionConv(1, false); + testProjectionConv(3, false); + /// test ConvTransProjection + testProjectionConv(1, true); + testProjectionConv(3, true); } #endif @@ -385,11 +403,11 @@ void testConvTransLayer(const string& type, bool trans, bool useGpu) { config.layerConfig.set_partial_sum(1); config.layerConfig.set_shared_biases(true); - config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 288}); + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 384}); LayerInputConfig* input = config.layerConfig.add_inputs(); ConvConfig* conv = input->mutable_conv_conf(); conv->set_filter_size(2); - conv->set_filter_size_y(3); + conv->set_filter_size_y(4); conv->set_channels(16); conv->set_padding(0); conv->set_padding_y(1); @@ -416,6 +434,9 @@ TEST(Layer, convTransLayer) { for (auto useGpu : {false, true}) { testConvTransLayer("exconvt", /* trans= */ false, /* useGpu= */ useGpu); } +#ifndef PADDLE_ONLY_CPU + testConvTransLayer("cudnn_convt", /* trans= */ false, /* useGpu= */ true); +#endif } TEST(Layer, blockExpandLayer) { @@ -1482,16 +1503,20 @@ TEST(Layer, BatchNormalizationLayer) { #endif } -TEST(Operator, conv) { +void testConvOperator(bool isDeconv) { TestConfig config; const int NUM_FILTERS = 16; const int FILTER_SIZE = 2; const int FILTER_SIZE_Y = 3; const int CHANNELS = 3; const int IMAGE_SIZE = 16; - const int IMAGE_SIZE_Y = 8; + const int IMAGE_SIZE_Y = 9; OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs(); - operatorConf.set_type("conv"); + if (isDeconv) { + operatorConf.set_type("convt"); + } else { + operatorConf.set_type("conv"); + } ConvConfig* conv = operatorConf.mutable_conv_conf(); operatorConf.set_num_filters(NUM_FILTERS); conv->set_filter_size(FILTER_SIZE); @@ -1502,7 +1527,6 @@ TEST(Operator, conv) { conv->set_stride(2); conv->set_stride_y(2); conv->set_groups(1); - conv->set_filter_channels(conv->channels() / conv->groups()); conv->set_img_size(IMAGE_SIZE); conv->set_img_size_y(IMAGE_SIZE_Y); conv->set_output_x(outputSize(conv->img_size(), @@ -1515,11 +1539,22 @@ TEST(Operator, conv) { conv->padding_y(), conv->stride_y(), /* caffeMode */ true)); - config.layerConfig.set_size(conv->output_x() * conv->output_y() * - NUM_FILTERS); - config.inputDefs.push_back( - {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE_Y * CHANNELS, 0}); + if (isDeconv) { + conv->set_filter_channels(NUM_FILTERS / conv->groups()); + config.inputDefs.push_back({INPUT_DATA, + "layer_0", + conv->output_x() * conv->output_y() * CHANNELS, + 0}); + config.layerConfig.set_size(IMAGE_SIZE * IMAGE_SIZE_Y * NUM_FILTERS); + } else { + conv->set_filter_channels(conv->channels() / conv->groups()); + config.inputDefs.push_back( + {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE_Y * CHANNELS, 0}); + config.layerConfig.set_size(conv->output_x() * conv->output_y() * + NUM_FILTERS); + } + config.inputDefs.push_back( {INPUT_DATA, "layer_1", @@ -1531,6 +1566,11 @@ TEST(Operator, conv) { testOperatorGrad(config, operatorConf, 100, /*useGpu*/ true, false); } +TEST(Operator, conv) { + testConvOperator(/*isDeconv*/ true); + testConvOperator(/*isDeconv*/ false); +} + TEST(Layer, FeatureMapExpandLayer) { TestConfig config; config.layerConfig.set_type("featmap_expand"); @@ -1602,6 +1642,20 @@ TEST(Layer, PadLayer) { } } +TEST(Layer, smooth_l1) { + TestConfig config; + config.layerConfig.set_type("smooth_l1"); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0}); + config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 
1, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "smooth_l1", 100, false, useGpu, false, 2.0); + } +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); initMain(argc, argv); diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 8189f3fd3d6c408e6e8fb4a08d170ecd34458606..b408abbf321f7b8ddc3a30bf11434e3c3211966d 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -3557,6 +3557,55 @@ void CpuMatrix::sumOfSquaresBp(Matrix& output, Matrix& label) { } } +void CpuMatrix::smoothL1(Matrix& output, Matrix& label) { + CHECK(output.useGpu_ == false && label.useGpu_ == false) + << "Matrix types are not equal"; + + size_t numSamples = getHeight(); + size_t dim = output.getWidth(); + CHECK_EQ(label.getHeight(), numSamples); + CHECK_EQ(output.getHeight(), numSamples); + CHECK_EQ(label.getWidth(), dim); + CHECK_EQ(getWidth(), (size_t)1); + real* out = output.getData(); + real* cost = getData(); + real* lbl = label.getData(); + + // f(x) = 0.5 * x^2 if |x| < 1 + // = |x| - 0.5 otherwise + // The cost matrix is numSamples x 1, so accumulate one value per sample + // instead of advancing the cost pointer by dim (which would write out of + // bounds whenever dim > 1). + for (size_t i = 0; i < numSamples; ++i, out += dim, lbl += dim) { + cost[i] = 0; + for (size_t j = 0; j < dim; ++j) { + real absVal = std::fabs(out[j] - lbl[j]); + if (absVal < 1.0) + cost[i] += 0.5 * absVal * absVal; + else + cost[i] += absVal - 0.5; + } + } +} + +void CpuMatrix::smoothL1Bp(Matrix& output, Matrix& label) { + CHECK(output.useGpu_ == false && label.useGpu_ == false) + << "Matrix types are not equal"; + + size_t numSamples = getHeight(); + size_t dim = output.getWidth(); + CHECK_EQ(label.getHeight(), numSamples); + CHECK_EQ(output.getHeight(), numSamples); + CHECK_EQ(label.getWidth(), dim); + // the gradient matrix carries one entry per output element + CHECK_EQ(getWidth(), dim); + real* out = output.getData(); + real* cost = getData(); + real* lbl = label.getData(); + + // f'(x) = x if |x| < 1 + // = sign(x) otherwise + for (size_t i = 0; i < numSamples; ++i, out += dim, cost += dim, lbl += dim) { + for (size_t j = 0; j < dim; ++j) { + cost[j] = out[j] - lbl[j]; + if (std::fabs(cost[j]) >= 1) cost[j] = (0 < cost[j]) - (cost[j] < 0); + } + } +} + void CpuMatrix::tanh(Matrix& output) { CHECK(isContiguous()); CHECK(output.isContiguous()); diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index d0ba2e93feabfcc11ac1d261bc40c9c6973a8c29..dbdb629614546b7c7b569d7473d96a06d0c5a9c7 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -783,6 +783,14 @@ public: LOG(FATAL) << "Not implemented"; } + virtual void smoothL1(Matrix& output, Matrix& label) { + LOG(FATAL) << "Not implemented"; + } + + virtual void smoothL1Bp(Matrix& outputV, Matrix& label) { + LOG(FATAL) << "Not implemented"; + } + virtual void tanh(Matrix& output) { LOG(FATAL) << "Not implemented"; } virtual void tanhDerivative(Matrix& output) { @@ -1720,6 +1728,9 @@ public: /// gradient of sumOfSquares.
void sumOfSquaresBp(Matrix& outputV, Matrix& label); + void smoothL1(Matrix& output, Matrix& label); + void smoothL1Bp(Matrix& output, Matrix& label); + void tanh(Matrix& output); void tanhDerivative(Matrix& output); diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp index 7a343cca33f5b420be6192231ac73ca1c2da5fb9..4139f59a2c8e665daf410b5b16539ff74b77ecfe 100644 --- a/paddle/parameter/Argument.cpp +++ b/paddle/parameter/Argument.cpp @@ -123,46 +123,6 @@ static void resizeAndCopy(ICpuGpuVectorPtr& dest, } } -static void resizeAndCopy(UserDefinedVectorPtr& dest, - const UserDefinedVectorPtr& src, - bool useGpu, - hl_stream_t stream) { - if (src) { - CHECK(!useGpu) << "not implemented"; - size_t height = src->size(); - if (!dest) { - dest = std::make_shared>(height); - } else { - dest->resize(height); - } - std::copy_n(src->begin(), height, dest->begin()); - } else { - dest.reset(); - } -} - -static void resizeAndCopy(UserDefinedVectorPtr& dest, - const UserDefinedVectorPtr& src, - int32_t startPos, - int32_t copySize, - bool useGpu, - hl_stream_t stream = HPPL_STREAM_DEFAULT) { - if (src) { - CHECK(!useGpu) << "not implemented"; - CHECK_LE((size_t)startPos + copySize, src->size()); - - size_t height = copySize; - if (!dest) { - dest = std::make_shared>(height); - } else { - dest->resize(height); - } - std::copy_n(src->begin() + startPos, height, dest->begin()); - } else { - dest.reset(); - } -} - static void resizeAndCopy(SVectorPtr& dest, const SVectorPtr& src, bool useGpu, @@ -223,7 +183,6 @@ void Argument::resizeAndCopyFrom(const Argument& src, false /* useGpu */, stream); } - resizeAndCopy(udp, src.udp, useGpu, stream); resizeAndCopy(strs, src.strs, useGpu, stream); frameWidth = src.frameWidth; frameHeight = src.frameHeight; @@ -255,7 +214,6 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src, resizeAndCopy(value, src.value, startRow, copySize, useGpu, stream); resizeAndCopy(grad, src.grad, startRow, copySize, useGpu, stream); resizeAndCopy(ids, src.ids, startRow, copySize, useGpu, stream); - resizeAndCopy(udp, src.udp, startRow, copySize, useGpu, stream); resizeAndCopy(strs, src.strs, startRow, copySize, useGpu, stream); return copySize; } else { @@ -268,7 +226,6 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src, resizeAndCopy(value, src.value, startRow, copyFeatureSize, useGpu, stream); resizeAndCopy(grad, src.grad, startRow, copyFeatureSize, useGpu, stream); resizeAndCopy(ids, src.ids, startRow, copyFeatureSize, useGpu, stream); - resizeAndCopy(udp, src.udp, startRow, copySize, useGpu, stream); resizeAndCopy(sequenceStartPositions, src.sequenceStartPositions, startSeq, @@ -583,7 +540,7 @@ void Argument::checkSubset() const { } } -void Argument::degradeSequence(const Argument& input, bool useGpu) { +void Argument::degradeSequence(const Argument& input) { CHECK_EQ(input.hasSubseq(), 1UL); size_t numSequences = input.getNumSequences(); size_t numSubSequences = input.getNumSubSequences(); diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h index 9ef44be0cb3b960db1e789f3f26bb66d1fe63c81..9fd84bc4b7e0aa54d81f5d5df9e5acb3fbb70d29 100644 --- a/paddle/parameter/Argument.h +++ b/paddle/parameter/Argument.h @@ -24,8 +24,6 @@ limitations under the License. 
*/ namespace paddle { -// vector of user defined pointers -typedef std::shared_ptr> UserDefinedVectorPtr; typedef std::shared_ptr> SVectorPtr; struct Argument { @@ -40,7 +38,6 @@ struct Argument { sequenceStartPositions(nullptr), subSequenceStartPositions(nullptr), cpuSequenceDims(nullptr), - udp(nullptr), deviceId(-1), allCount(0), valueCount(0), @@ -63,7 +60,6 @@ struct Argument { sequenceStartPositions = argument.sequenceStartPositions; subSequenceStartPositions = argument.subSequenceStartPositions; cpuSequenceDims = argument.cpuSequenceDims; - udp = argument.udp; deviceId = argument.deviceId; allCount = argument.allCount; frameHeight = argument.frameHeight; @@ -96,8 +92,6 @@ struct Argument { // dimension of sequence, stored only in CPU IVectorPtr cpuSequenceDims; - UserDefinedVectorPtr udp; // user defined pointer - int deviceId; // the GPU device id which the argument in int allCount; // the number of output layers using this argument mutable int valueCount; // waiting this member when layer do forward @@ -137,7 +131,6 @@ struct Argument { if (ids) return ids->getSize(); if (grad) return grad->getHeight(); if (in) return in->getHeight(); - if (udp) return udp->size(); if (strs) return strs->size(); return 0; } @@ -296,7 +289,7 @@ struct Argument { /* sequence has sub-sequence degrades to a sequence. */ - void degradeSequence(const Argument& input, bool useGpu); + void degradeSequence(const Argument& input); /** * @brief getValueString will return the argument's output in string. There diff --git a/paddle/pserver/BaseClient.h b/paddle/pserver/BaseClient.h index 11d7a147bf749ba2de0772b5efd5f73ab0ccdb1a..667bc451d16aa1436ac5d74dd96edbd70556edd0 100644 --- a/paddle/pserver/BaseClient.h +++ b/paddle/pserver/BaseClient.h @@ -30,9 +30,6 @@ namespace paddle { * the first solution arms with sendThreads_/recvThreads_ and sendJobQueue_/ * recvJobQueue_. the second solution use some shared thread pool to manage * connections. - * In addition to pserver, metric learning also uses network to exchange - * features within multi-machines, so this class just abstracts some basic - * threads and queue buffer creation for them */ class BaseClient { protected: diff --git a/paddle/pserver/ParameterServer2.cpp b/paddle/pserver/ParameterServer2.cpp index 9dd9a2d291c1cbac66df9e4947dca22c5d8d0bb3..19ff40ba7e9584f772043f939bcb31caf666163d 100644 --- a/paddle/pserver/ParameterServer2.cpp +++ b/paddle/pserver/ParameterServer2.cpp @@ -369,11 +369,8 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, std::vector* outputBuffers) { VLOG(1) << "pserver: addGradient"; -/// forwardbackward delta from all trainers -/// indicate the fluctuation caused by forwardbackward. -#ifndef PADDLE_METRIC_LEARNING - // @TODO(yanfei): - // add support tuning forwardbackward balance for metric learning + // forwardbackward delta from all trainers + // indicate the fluctuation caused by forwardbackward. if (!numPassFinishClients_) { REGISTER_BARRIER_DELTA_SERVER_SET( *statSet_, @@ -383,7 +380,6 @@ void ParameterServer2::addGradient(const SendParameterRequest& request, request.forwardbackward_time(), isSparseServer_ ? 
"_sparseUpdater" : "_denseUpdater"); } -#endif { /// approximately pure network overhead diff --git a/paddle/py_paddle/.gitignore b/paddle/py_paddle/.gitignore index 9e8ad4bf1638a69ab7ef19badfbf867e116548d2..80d1f76fbc05627e21e334af55d63a4a534434c6 100644 --- a/paddle/py_paddle/.gitignore +++ b/paddle/py_paddle/.gitignore @@ -1 +1,2 @@ swig_paddle.py +_swig_paddle.so diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py index c009b05cdeeb9dbe2dc70048e6827a12445f677e..6d6a406cf61d467cb2701ca5e85e99648eea36eb 100644 --- a/paddle/py_paddle/dataprovider_converter.py +++ b/paddle/py_paddle/dataprovider_converter.py @@ -16,11 +16,25 @@ import paddle.trainer.PyDataProvider2 as dp2 import collections import swig_paddle import numpy +import itertools __all__ = ['DataProviderConverter'] class IScanner(object): + """ + The scanner will scan Python object two passes, then convert it to Paddle's + argument. + + In the first pass, `pre_scan` will be invoked by every data instance, and + then invoke `finish_pre_scan` to arguments. And the second pass do the same + thing except the functions changed to `scan`, `finish_scan`. + + During the first pass, a scanner may count the shape of input matrix and + allocate memory for this argument. Then fill the data into this argument + in second pass. + """ + def __init__(self, input_type, pos): self.input_type = input_type if not isinstance(self.input_type, dp2.InputType): @@ -36,10 +50,40 @@ class IScanner(object): self.data_in_gpu = swig_paddle.isUsingGpu( ) and swig_paddle.getTrainerCount() == 1 + def pre_scan(self, dat): + """ + First pass scan method. During this method, the scanner could count the + data number, and get the total memory size this batch would use. + + :param dat: The python object. + """ + pass + + def finish_pre_scan(self, argument): + """ + Finish first scan pass. Allocate the memory. + + :param argument: Output arguments object. + :type argument: swig_paddle.Arguments + :return: + """ + pass + def scan(self, dat): + """ + Second pass scan method. Copy the data to arguments. + + :param dat: The python object. + """ pass def finish_scan(self, argument): + """ + Finish second pass. Finalize the resources, etc. + + :param argument: Output arguments object. 
+ :type argument: swig_paddle.Arguments + """ pass @@ -51,12 +95,19 @@ class DenseScanner(IScanner): def __init__(self, input_type, pos): IScanner.__init__(self, input_type, pos) self.__mat__ = None + self.__height__ = 0 + + def pre_scan(self, dat): + self.__height__ += 1 + + def finish_pre_scan(self, argument): + self.__mat__ = numpy.ndarray( + shape=(self.__height__, self.input_type.dim), dtype=numpy.float32) + self.__height__ = 0 def scan(self, dat): - if self.__mat__ is None: - self.__mat__ = numpy.array([dat], dtype='float32') - else: - self.__mat__ = numpy.append(self.__mat__, [dat], axis=0) + self.__mat__[self.__height__] = dat + self.__height__ += 1 def finish_scan(self, argument): assert isinstance(argument, swig_paddle.Arguments) @@ -163,7 +214,14 @@ class DataProviderConverter(object): ] for each_sample in dat: - for each_step, scanner in zip(each_sample, scanners): + for each_step, scanner in itertools.izip(each_sample, scanners): + scanner.pre_scan(each_step) + + for scanner in scanners: + scanner.finish_pre_scan(argument) + + for each_sample in dat: + for each_step, scanner in itertools.izip(each_sample, scanners): scanner.scan(each_step) for scanner in scanners: diff --git a/paddle/scripts/deb/build_scripts/.gitignore b/paddle/scripts/deb/build_scripts/.gitignore deleted file mode 100644 index 1521c8b7652b1eec8ed4fe50877aae880c758ee3..0000000000000000000000000000000000000000 --- a/paddle/scripts/deb/build_scripts/.gitignore +++ /dev/null @@ -1 +0,0 @@ -dist diff --git a/paddle/scripts/deb/build_scripts/Dockerfile b/paddle/scripts/deb/build_scripts/Dockerfile deleted file mode 100644 index db365a65b7d33429dc1260b2ce69d6dc46abe487..0000000000000000000000000000000000000000 --- a/paddle/scripts/deb/build_scripts/Dockerfile +++ /dev/null @@ -1,5 +0,0 @@ -FROM paddledev/paddle:gpu-latest -MAINTAINER PaddlePaddle Dev Team -COPY build.sh /root/ -CMD cd /root/ && bash build.sh - diff --git a/paddle/scripts/deb/build_scripts/build.sh b/paddle/scripts/deb/build_scripts/build.sh deleted file mode 100755 index d13dea514841b110c304b8aa0e65ad16e42c75f3..0000000000000000000000000000000000000000 --- a/paddle/scripts/deb/build_scripts/build.sh +++ /dev/null @@ -1,35 +0,0 @@ -#!/bin/bash -set -e -apt-get update -apt-get install -y dh-make -cd ~ -mkdir -p ~/dist/gpu -mkdir -p ~/dist/cpu -mkdir -p ~/dist/cpu-noavx -mkdir -p ~/dist/gpu-noavx -cd paddle -mkdir build -cd build -cmake .. -DWITH_GPU=OFF -DWITH_SWIG_PY=ON -DWITH_AVX=ON -make -j `nproc` -cpack -D CPACK_GENERATOR='DEB' .. -mv *.deb ~/dist/cpu - -rm -rf * -cmake .. -DWITH_GPU=ON -DWITH_SWIG_PY=ON -DWITH_AVX=ON -DCUDNN_ROOT=/usr/ -make -j `nproc` -cpack -D CPACK_GENERATOR='DEB' .. -mv *.deb ~/dist/gpu - - -rm -rf * -cmake .. -DWITH_GPU=OFF -DWITH_SWIG_PY=ON -DWITH_AVX=OFF -make -j `nproc` -cpack -D CPACK_GENERATOR='DEB' .. -mv *.deb ~/dist/cpu-noavx - -rm -rf * -cmake .. -DWITH_GPU=ON -DWITH_SWIG_PY=ON -DWITH_AVX=OFF -DCUDNN_ROOT=/usr/ -make -j `nproc` -cpack -D CPACK_GENERATOR='DEB' .. -mv *.deb ~/dist/gpu-noavx diff --git a/paddle/scripts/deb/build_scripts/build_deb.sh b/paddle/scripts/deb/build_scripts/build_deb.sh deleted file mode 100755 index c38c6299f840345b7f6f6e0aad7482241d36198a..0000000000000000000000000000000000000000 --- a/paddle/scripts/deb/build_scripts/build_deb.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -set -e -docker build -t build_paddle_deb . 
-rm -rf dist -mkdir -p dist -docker run -v$PWD/dist:/root/dist -v $PWD/../../../..:/root/paddle --name tmp_build_deb_container build_paddle_deb -docker rm tmp_build_deb_container -docker rmi build_paddle_deb diff --git a/paddle/scripts/deb/postinst b/paddle/scripts/deb/postinst deleted file mode 100644 index 1d2dd3171a132966832d87ae758d4e620475aed1..0000000000000000000000000000000000000000 --- a/paddle/scripts/deb/postinst +++ /dev/null @@ -1,7 +0,0 @@ -#!/bin/bash -set -e -echo "Post install paddle debian package." -echo "Install some python package used for paddle. You can run " -echo " pip install /usr/opt/paddle/share/wheels/*.whl to install them." -pip install /usr/opt/paddle/share/wheels/*.whl - diff --git a/paddle/scripts/docker/Dockerfile b/paddle/scripts/docker/Dockerfile deleted file mode 100644 index 48af9e5b5fe83f552b17cec5d843da74845497bc..0000000000000000000000000000000000000000 --- a/paddle/scripts/docker/Dockerfile +++ /dev/null @@ -1,71 +0,0 @@ -FROM ubuntu:14.04 -MAINTAINER PaddlePaddle Authors - -ARG DEBIAN_FRONTEND=noninteractive -ARG UBUNTU_MIRROR -RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' - -# ENV variables -ARG BUILD_WOBOQ -ARG BUILD_AND_INSTALL -ARG WITH_AVX -ARG WITH_DOC -ARG WITH_STYLE_CHECK - -ENV BUILD_WOBOQ=${BUILD_WOBOQ:-OFF} -ENV BUILD_AND_INSTALL=${BUILD_AND_INSTALL:-OFF} -ENV WITH_GPU=OFF -ENV WITH_AVX=${WITH_AVX:-ON} -ENV WITH_DOC=${WITH_DOC:-OFF} -ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} -ENV DOCKER_BUILD=TRUE - -ENV HOME /root - -# Add bash enhancements -COPY ./paddle/scripts/docker/root/ /root/ - -RUN apt-get update && \ - apt-get install -y git python-pip python-dev openssh-server bison && \ - apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \ - apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \ - apt-get install -y python-numpy python-matplotlib gcc g++ gfortran && \ - apt-get install -y automake locales clang-format-3.8 && \ - apt-get clean -y - -# git credential to skip password typing -RUN git config --global credential.helper store - -# Fix locales to en_US.UTF-8 -RUN localedef -i en_US -f UTF-8 en_US.UTF-8 - -RUN pip install --upgrade pip && \ - pip install -U 'protobuf==3.1.0' && \ - pip install -U wheel pillow BeautifulSoup && \ - pip install -U docopt PyYAML sphinx && \ - pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \ - pip install -U pre-commit 'requests==2.9.2' jupyter - -RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \ - cd cmake-3.4.1 && ./bootstrap && make -j `nproc` && make install && \ - cd .. && rm -rf cmake-3.4.1 - -COPY . /paddle/ -RUN cd /paddle/ && git submodule update --init --recursive -RUN /paddle/paddle/scripts/docker/build.sh - -VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"] - -# Configure OpenSSH server. c.f. 
https://docs.docker.com/engine/examples/running_ssh_service -RUN mkdir /var/run/sshd -RUN echo 'root:root' | chpasswd -RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config -RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config -EXPOSE 22 - -# Jupyter Notebook: Paddle book -EXPOSE 8888 - -COPY ./paddle/scripts/docker/entrypoint /opt/bin/ - -CMD ["/opt/bin/entrypoint"] diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e5af5c9a1e6f96c5112895a1ec0b0c6ac57da666 --- /dev/null +++ b/paddle/scripts/docker/README.md @@ -0,0 +1,164 @@ +# Building PaddlePaddle + +## Goals + +We want the building procedure to generate Docker images so that we can run PaddlePaddle applications on Kubernetes clusters. + +We want to build .deb packages so that enterprise users can run PaddlePaddle applications without Docker. + +We want to minimize the size of generated Docker images and .deb packages so as to reduce download time. + +We want to encapsulate building tools and dependencies in a *development* Docker image so as to ease tool installation for developers. + +Developers use various editors (emacs, vim, Eclipse, Jupyter Notebook), so the development Docker image contains only building tools, not editing tools, and developers are supposed to git clone source code into their development computers and map the code into the development container. + +We want the procedure and tools to also work with testing, continuous integration, and releasing. + + +## Docker Images + +So we need two Docker images for each version of PaddlePaddle: + +1. `paddle:<version>-dev` + + This is a development image that contains only the development tools and standardizes the building procedure. Users include: + + - developers -- no longer need to install development tools on the host, and can build their current work on the host (development computer). + - release engineers -- use this to build the official release from certain branch/tag on Github.com. + - document writers / Website developers -- Our documents are in the source repo in the form of .md/.rst files and comments in source code. We need tools to extract the information, typeset, and generate Web pages. + + Of course, developers can install building tools on their development computers. But different versions of PaddlePaddle might require different sets or versions of build tools. Also, it makes collaborative debugging easier if all developers use a unified development environment. + + The development image should include the following tools: + + - gcc/clang + - nvcc + - Python + - sphinx + - woboq + - sshd + + Many developers work on a remote computer with GPU; they could ssh into the computer and `docker exec` into the development container. However, running `sshd` in the container allows developers to ssh into the container directly. + +1. `paddle:<version>` + + This is the production image, generated using the development image. This image might have multiple variants: + + - GPU/AVX `paddle:<version>-gpu` + - GPU/no-AVX `paddle:<version>-gpu-noavx` + - no-GPU/AVX `paddle:<version>` + - no-GPU/no-AVX `paddle:<version>-noavx` + + We allow users to choose between GPU and no-GPU because the GPU version image is much larger than the no-GPU version. + + We allow users the choice between AVX and no-AVX, because some cloud providers don't provide AVX-enabled VMs. + + +## Development Environment + +Here we describe how to use the above two images. We start from considering our daily development environment.
+ +Developers work on a computer, which is usually a laptop or desktop: + +![](doc/paddle-development-environment.png) + +or, they might rely on a more sophisticated box (like with GPUs): + +![](doc/paddle-development-environment-gpu.png) + +A principle here is that source code lies on the development computer (host) so that editors like Eclipse can parse the source code to support auto-completion. + + +## Usages + +### Build the Development Docker Image + +The following commands check out the source code to the host and build the development image `paddle:dev`: + +```bash +git clone https://github.com/PaddlePaddle/Paddle paddle +cd paddle +docker build -t paddle:dev . +``` + +The `docker build` command assumes that `Dockerfile` is in the root source tree. Note that in this design, this `Dockerfile` is the only one in our repo. + +Users can specify an Ubuntu mirror server for faster downloading: + +```bash +docker build -t paddle:dev --build-arg UBUNTU_MIRROR=mirror://mirrors.ubuntu.com/mirrors.txt . +``` + +### Build PaddlePaddle from Source Code + +Given the development image `paddle:dev`, the following command builds PaddlePaddle from the source tree on the development computer (host): + +```bash +docker run -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "TEST=OFF" paddle:dev +``` + +This command mounts the source directory on the host into `/paddle` in the container, so the default entry point of `paddle:dev`, `build.sh`, can build the source code with any local changes. When it writes to `/paddle/build` in the container, it actually writes to `$PWD/build` on the host. + +`build.sh` builds the following: + +- PaddlePaddle binaries, +- `$PWD/build/paddle-<version>.deb` for production installation, and +- `$PWD/build/Dockerfile`, which builds the production Docker image. + +Users can specify the following Docker build arguments with a value of either "ON" or "OFF": +- `WITH_GPU`: ***Required***. Generates NVIDIA CUDA GPU code and relies on CUDA libraries. +- `WITH_AVX`: ***Required***. Setting it to "OFF" prevents generating AVX instructions. If you don't know what AVX is, you might want to set it to "ON". +- `TEST`: ***Optional, default OFF***. Build unit tests and run them after building. + +### Build the Production Docker Image + +The following command builds the production image: + +```bash +docker build -t paddle -f build/Dockerfile . +``` + +This production image is minimal -- it includes the binary `paddle`, the shared library `libpaddle.so`, and the Python runtime. + +### Run PaddlePaddle Applications + +Again, development happens on the host. Suppose we have a simple application program in `a.py`; we can test and run it using the production image: + +```bash +docker run -it -v $PWD:/work paddle /work/a.py +``` + +But this works only if all dependencies of `a.py` are in the production image. If this is not the case, we need to build a new Docker image from the production image with more dependencies installed. + +### Build and Run PaddlePaddle Applications + +We need a Dockerfile in https://github.com/paddlepaddle/book that builds the Docker image `paddlepaddle/book:<version>`, based on the PaddlePaddle production image: + +``` +FROM paddlepaddle/paddle:<version> +RUN pip install -U matplotlib jupyter ... +COPY . /book +EXPOSE 8080 +CMD ["jupyter"] +``` + +The book image is an example of a PaddlePaddle application image. We can build it: + +```bash +git clone https://github.com/paddlepaddle/book +cd book +docker build -t book .
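+# (optional) run the image we just built; mapping port 8080 is an assumption +# based on the EXPOSE 8080 in the book Dockerfile above. +docker run -d -p 8080:8080 book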
+``` + +### Build and Run Distributed Applications + +In our [API design doc](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/api.md#distributed-training), we proposed an API that starts a distributed training job on a cluster. This API needs to build a PaddlePaddle application into a Docker image as above and call kubectl to run it on the cluster. The API might also need to generate a Dockerfile like the one above and call `docker build`. + +Of course, we can manually build an application image and launch the job using the kubectl tool: + +```bash +docker build -f some/Dockerfile -t myapp . +docker tag myapp me/myapp +docker push me/myapp +kubectl ... +``` diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh old mode 100755 new mode 100644 index 668b6e6b84191c7042a4905f45879b7405be331c..8d50ced23fc7430edd23d380c9fa12b2cd200a39 --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -1,68 +1,102 @@ #!/bin/bash -function abort(){ - echo "An error occurred. Exiting..." 1>&2 - exit 1 -} - -trap 'abort' 0 set -e -# If Dockerfile.* sets BUILD_AND_INSTALL to 'ON', it would have copied -# source tree to /paddle, and this scripts should build it into -# /paddle/build. -if [[ ${BUILD_AND_INSTALL:-OFF} == 'ON' ]]; then - if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then - ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/lib/libcudnn.so - fi +# Set BASE_IMAGE according to env variables +if [ ${WITH_GPU} == "ON" ]; then + BASE_IMAGE="nvidia/cuda:7.5-cudnn5-runtime-ubuntu14.04" + # additional packages to install when building gpu images + GPU_DOCKER_PKG="python-pip python-dev" +else + BASE_IMAGE="python:2.7.13-slim" +fi + +DOCKERFILE_GPU_ENV="" +if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then + DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" + + # for cmake to find cudnn + ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/lib/libcudnn.so +fi + +mkdir -p /paddle/build +cd /paddle/build + +# build script will not fail if *.deb does not exist +rm *.deb 2>/dev/null || true - mkdir -p /paddle/build # -p means no error if exists - # clean local cmake and third_party cache - cd /paddle/build && rm -rf * && rm -rf ../third_party - cmake .. \ - -DWITH_DOC=${WITH_DOC:-OFF} \ - -DWITH_GPU=${WITH_GPU:-OFF} \ - -DWITH_AVX=${WITH_AVX:-OFF} \ - -DWITH_SWIG_PY=ON \ - -DCUDNN_ROOT=/usr/ \ - -DWITH_STYLE_CHECK=OFF \ - -DCMAKE_EXPORT_COMPILE_COMMANDS=ON - make -j `nproc` - make install +cmake .. \ + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_DOC=${WITH_DOC:-OFF} \ + -DWITH_GPU=${WITH_GPU:-OFF} \ + -DWITH_AVX=${WITH_AVX:-OFF} \ + -DWITH_SWIG_PY=ON \ + -DCUDNN_ROOT=/usr/ \ + -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} \ + -DWITH_COVERAGE=${TEST:-OFF} \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=ON +make -j `nproc` +if [[ ${TEST:-OFF} == "ON" ]]; then + make coveralls +fi +make install - if [[ ${BUILD_WOBOQ:-OFF} == 'ON' ]]; then - apt-get install -y clang-3.8 llvm-3.8 libclang-3.8-dev - # Install woboq_codebrowser. - git clone https://github.com/woboq/woboq_codebrowser /woboq - cd /woboq - cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ - -DCMAKE_BUILD_TYPE=Release \ - . - make +# generate deb package for current build +# FIXME(typhoonzero): should we remove paddle/scripts/deb ? +# FIXME: CPACK_DEBIAN_PACKAGE_DEPENDS removes all dev dependencies, must +# install them in docker +cpack -D CPACK_GENERATOR='DEB' -D CPACK_DEBIAN_PACKAGE_DEPENDS="" ..
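+# NOTE: the production Dockerfile generated at the end of this script ADDs +# the build/*.deb produced by this cpack invocation.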
- export WOBOQ_OUT=/usr/share/nginx/html/paddle - export BUILD_DIR=/paddle/build - mkdir -p $WOBOQ_OUT - cp -rv /woboq/data $WOBOQ_OUT/../data - /woboq/generator/codebrowser_generator \ +if [[ ${BUILD_WOBOQ:-OFF} == 'ON' ]]; then + apt-get install -y clang-3.8 llvm-3.8 libclang-3.8-dev + # Install woboq_codebrowser. + git clone https://github.com/woboq/woboq_codebrowser /woboq + cd /woboq + cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ + -DCMAKE_BUILD_TYPE=Release \ + . + make + + export WOBOQ_OUT=/usr/share/nginx/html/paddle + export BUILD_DIR=/paddle/build + mkdir -p $WOBOQ_OUT + cp -rv /woboq/data $WOBOQ_OUT/../data + /woboq/generator/codebrowser_generator \ -b /paddle/build \ -a \ -o $WOBOQ_OUT \ -p paddle:/paddle - /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT - cd /woboq - make clean - fi + /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT + cd /woboq + make clean +fi - pip install /usr/local/opt/paddle/share/wheels/py_paddle*linux*.whl - pip install /usr/local/opt/paddle/share/wheels/paddle*.whl - paddle version +paddle version - if [[ ${DOCKER_BUILD:-FALSE} == 'TRUE' ]]; then - # reduce docker image size - rm -rf /paddle/build - rm -rf /usr/local/opt/paddle/share/wheels/ - fi +# generate production docker image Dockerfile +if [ ${USE_MIRROR} ]; then + MIRROR_UPDATE="sed 's@http:\/\/archive.ubuntu.com\/ubuntu\/@mirror:\/\/mirrors.ubuntu.com\/mirrors.txt@' -i /etc/apt/sources.list && \\" +else + MIRROR_UPDATE="\\" fi -trap : 0 +cat > /paddle/build/Dockerfile <<EOF +FROM ${BASE_IMAGE} +MAINTAINER PaddlePaddle Authors +ENV HOME /root +ENV LANG en_US.UTF-8 +# Fix locales to en_US.UTF-8 +RUN ${MIRROR_UPDATE} + apt-get update && \ + apt-get install -y libgfortran3 libpython2.7 ${GPU_DOCKER_PKG} && \ + apt-get clean -y && \ + pip install --upgrade pip && \ + pip install -U 'protobuf==3.1.0' requests numpy +# Use different deb file when building different type of images +ADD build/*.deb /usr/local/opt/paddle/deb/ +# run paddle version to install python packages first +RUN dpkg -i /usr/local/opt/paddle/deb/*.deb && rm -f /usr/local/opt/paddle/deb/*.deb && paddle version +${DOCKERFILE_GPU_ENV} +# default command shows the paddle version and exit +CMD ["paddle", "version"] +EOF diff --git a/paddle/scripts/docker/doc/paddle-development-environment-gpu.graffle b/paddle/scripts/docker/doc/paddle-development-environment-gpu.graffle new file mode 100644 index 0000000000000000000000000000000000000000..4629f9b9da7ababdafa0b964db18a98a819c6a9e Binary files /dev/null and b/paddle/scripts/docker/doc/paddle-development-environment-gpu.graffle differ diff --git a/paddle/scripts/docker/doc/paddle-development-environment-gpu.png b/paddle/scripts/docker/doc/paddle-development-environment-gpu.png new file mode 100644 index 0000000000000000000000000000000000000000..61a96d7198d013f08f0f9c269cc352da5f7dd2e9 Binary files /dev/null and b/paddle/scripts/docker/doc/paddle-development-environment-gpu.png differ diff --git a/paddle/scripts/docker/doc/paddle-development-environment.graffle b/paddle/scripts/docker/doc/paddle-development-environment.graffle new file mode 100644 index 0000000000000000000000000000000000000000..5b164c4832809de94ead7309af49c579135d7f48 Binary files /dev/null and b/paddle/scripts/docker/doc/paddle-development-environment.graffle differ diff --git a/paddle/scripts/docker/doc/paddle-development-environment.png b/paddle/scripts/docker/doc/paddle-development-environment.png new file mode 100644 index 0000000000000000000000000000000000000000..707ed45a335a981c23b3533984045f53848b55e2 Binary files /dev/null and
b/paddle/scripts/docker/doc/paddle-development-environment.png differ diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in index f29d32f0d947dc7cde6112160e4f79ce8113505f..5a45df4072b9197a713bd19ee766296279bfcbc8 100644 --- a/paddle/scripts/submit_local.sh.in +++ b/paddle/scripts/submit_local.sh.in @@ -94,16 +94,22 @@ else: EOF if [ $? -eq 1 ]; then # Older version installed, or not installed at all - echo "First time run paddle, need to install some python dependencies." - BASEDIR=$(dirname "$0") - pip install ${BASEDIR}/../opt/paddle/share/wheels/*-@PADDLE_VERSION@-*.whl - if [ $? -ne 0 ]; then - echo "pip install wheels failed. " - echo "Please use 'sudo paddle' at the first time you use PaddlePaddle" - echo "PaddlePaddle will install some python dependencies automatically." - exit 1 - fi - echo "Python dependencies are installed." + echo "First time run paddle, need to install some python dependencies." + # setuptools normalizes package version, so we need to use normalized + # package version for paddle python package + PYTHON_PADDLE_VERSION=$(python -c 'import packaging +import setuptools +print str(packaging.version.Version("@PADDLE_VERSION@")) +' 2>/dev/null) + BASEDIR=$(dirname "$0") + pip install ${BASEDIR}/../opt/paddle/share/wheels/*-${PYTHON_PADDLE_VERSION}-*.whl + if [ $? -ne 0 ]; then + echo "pip install wheels failed. " + echo "Please use 'sudo paddle' at the first time you use PaddlePaddle" + echo "PaddlePaddle will install some python dependencies automatically." + exit 1 + fi + echo "Python dependencies are installed." fi case "$1" in diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh index 7deb3e62e88de7e1306fcbfc5a28aa4372d678e6..f2cbc561652a3c7502de94be37d75783fc40b9c1 100755 --- a/paddle/scripts/travis/build_and_test.sh +++ b/paddle/scripts/travis/build_and_test.sh @@ -5,7 +5,7 @@ NPROC=1 export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages export PYTHONHOME=/opt/python/2.7.12 export PATH=/opt/python/2.7.12/bin:${PATH} -cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS} +cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DWITH_COVERAGE=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS} NRPOC=`nproc` make -j $NPROC make coveralls diff --git a/paddle/setup.py.in b/paddle/setup.py.in index 382d5be6ecfc26b4a524bb6a775bd1a805a34d96..0b62436a7f81682d5279c3b307ac1abf09eafffb 100644 --- a/paddle/setup.py.in +++ b/paddle/setup.py.in @@ -12,68 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -# This file is used to build paddle python binding package. -# It will be invoked by Makefile that generated by COMAKE from setuptools import setup, Extension -import numpy as np -import api.paddle_ld_flags -import platform -import os - -system = platform.system().lower() - -is_osx = (system == 'darwin') -is_win = (system == 'windows') -is_lin = (system == 'linux') - - -# The extra links will passed from COMAKE -# because generate paddle LDFLAGS is too complicated to do in setup.py -# it just read COMAKE generated LDFLAGS. 
-extra_comps = [] -extra_links = [] -obj = api.paddle_ld_flags.PaddleLDFlag() -extra_comps = obj.c_flag() -ldflags = obj.ldflag_str() -if ldflags is not None: - extra_links.extend(ldflags.split(" ")) - -try: - with open('.py_paddle_extra_link_flags', 'r') as f: - for line in f: - extra_links += line.split() -except: - pass - -if is_lin == True: - extra_links = ["-Xlinker", '-start-group'] + extra_links + ["-Xlinker", "-end-group"] -elif is_osx == True: - os.environ["ARCHFLAGS"] = "-arch x86_64" - extra_links = ["-Wl,-all_load"] + extra_links - -include_dirs = [np.get_include(), "../"] # include numpy and paddle. - -os.environ["CC"] = "@CMAKE_C_COMPILER@" -os.environ["CXX"] = "@CMAKE_CXX_COMPILER@" - setup(name="py_paddle", - version="@PADDLE_VERSION@", - ext_modules=[ - Extension('py_paddle._swig_paddle', # Build SWIG Extension. - ['Paddle_wrap.cxx'], - language = "c++", - include_dirs = include_dirs, - extra_link_args = extra_links, - extra_compile_args = extra_comps - ) - ], - packages=['py_paddle'], - include_dirs = include_dirs, - install_requires = [ - 'nltk>=3.2.2', - 'numpy>=1.8.0', # The numpy is required. - 'protobuf>=3.0.0' # The paddle protobuf version - ], + version="${PADDLE_VERSION}", + packages=['py_paddle'], + include_package_data=True, + package_data={'py_paddle':['*.py','_swig_paddle.so']}, + install_requires = [ + 'nltk>=3.2.2', + 'numpy>=1.8.0', # The numpy is required. + 'protobuf>=${PROTOBUF_VERSION}' # The paddle protobuf version + ], + url='http://www.paddlepaddle.org/', + license='Apache 2.0', ) diff --git a/paddle/trainer/Trainer.h b/paddle/trainer/Trainer.h index c8ee4726c24c335ceda22ea3a20049b01d11c149..fac589d1d711affcd008f90edf87d865c8362f69 100644 --- a/paddle/trainer/Trainer.h +++ b/paddle/trainer/Trainer.h @@ -30,10 +30,6 @@ limitations under the License. */ #include "TrainerConfigHelper.h" #include "TrainerInternal.h" -#ifdef PADDLE_METRIC_LEARNING -#include "paddle/internals/metric_learning/MetricTrainer.h" -#endif - DECLARE_int32(num_passes); namespace paddle { @@ -201,12 +197,8 @@ protected: // parameter util std::unique_ptr paramUtil_; -#ifdef PADDLE_METRIC_LEARNING - MetricTrainer trainerInternal_; -#else // trainer Internal TrainerInternal trainerInternal_; -#endif }; } // namespace paddle diff --git a/paddle/utils/CpuId.h b/paddle/utils/CpuId.h index 385bc36a8bccc322323901ac9f974652c4a973b7..869be5be541dafd699a87a8e8893aadadf59b711 100644 --- a/paddle/utils/CpuId.h +++ b/paddle/utils/CpuId.h @@ -12,6 +12,7 @@ limitations under the License. */ #pragma once #include "Common.h" +#include "Error.h" namespace paddle { @@ -99,4 +100,37 @@ private: #define HAS_NEON HAS_SIMD(SIMD_NEON) // clang-format on +/** + * Invoke checkCPUFeature() before Paddle initialization to + * check target machine whether support compiled instructions. + * If not, simply throw out an error. + */ +inline Error __must_check checkCPUFeature() { + Error err; +#ifndef __AVX__ + if (HAS_AVX) { + LOG(WARNING) << "PaddlePaddle wasn't compiled to use avx instructions, " + << "but these are available on your machine and could " + << "speed up CPU computations via CMAKE .. -DWITH_AVX=ON"; + } +#else + if (!HAS_AVX) { + err = Error( + "PaddlePaddle was compiled to use avx instructions, " + "but these aren't available on your machine, please " + "disable it via CMAKE .. -DWITH_AVX=OFF"); + } +#endif // __AVX__ +#ifdef __SSE3__ + if (!HAS_SSE3) { + err = Error( + "PaddlePaddle was compiled to use sse3 instructions, " + "which is the minimum requirement of PaddlePaddle. 
" + "But these aren't available on your current machine."); + } +#endif // __SSE3__ + + return err; +} + } // namespace paddle diff --git a/paddle/utils/Flags.cpp b/paddle/utils/Flags.cpp index e8f31bc811ac30d83e8203b784ee1f93a8d35d90..320f671ed97dbadc4fa1b4b52d5611cf9239e7dd 100644 --- a/paddle/utils/Flags.cpp +++ b/paddle/utils/Flags.cpp @@ -30,7 +30,6 @@ DEFINE_bool(parallel_nn, DEFINE_int32(trainer_count, 1, "Defined how many trainers to train"); DEFINE_int32(gpu_id, 0, "Which gpu core to use"); DEFINE_int32(port, 20134, "Listening port for pserver"); -DEFINE_int32(data_server_port, 21134, "Listening port for dserver"); DEFINE_int32(ports_num, 1, "Number of ports for sending dense parameter," diff --git a/paddle/utils/Flags.h b/paddle/utils/Flags.h index 3e72f8356d883b353127ccae80f2881320d20b2b..dc4faef8331ed47b9ce3e952389b6469cd9fda2e 100644 --- a/paddle/utils/Flags.h +++ b/paddle/utils/Flags.h @@ -19,7 +19,6 @@ limitations under the License. */ DECLARE_bool(parallel_nn); DECLARE_int32(async_count); DECLARE_int32(port); -DECLARE_int32(data_server_port); DECLARE_bool(use_gpu); DECLARE_int32(gpu_id); DECLARE_int32(trainer_count); diff --git a/paddle/utils/GlobalConstants.h b/paddle/utils/GlobalConstants.h index 707346f2c76e59b50722f4f8805ebe56c3cf861b..0ec1c28dfbb2a7db9fa84c9eb2bc4dad806b78e9 100644 --- a/paddle/utils/GlobalConstants.h +++ b/paddle/utils/GlobalConstants.h @@ -23,11 +23,6 @@ enum PassType { PASS_TEST, // Test pass PASS_GC, // Gradient Check pass PASS_METRIC, // pass for generate template output with no drop rate. - // pass for metric learning training with metric learning error, only used - // when we are doing KNN evaluation. - PASS_METRIC_TRAIN, - PASS_METRIC_TRAIN_WITH_NOERROR, // Pass for metric learning training - // with no evaluation. }; enum ParameterType { diff --git a/paddle/utils/PythonUtil.cpp.in b/paddle/utils/PythonUtil.cpp.in index 66b5795e29fb9fa751ed802e87ced0a71aea4c51..a51b8f765f41f6febb86002f371b14e8797e7e4d 100644 --- a/paddle/utils/PythonUtil.cpp.in +++ b/paddle/utils/PythonUtil.cpp.in @@ -195,9 +195,14 @@ extern const char enable_virtualenv_py[]; } void initPython(int argc, char** argv) { #ifndef PADDLE_NO_PYTHON - char pyHome[] = "@PYTHON_INSTALL_DIR@"; // NOLINT - if (strlen(pyHome)) { - Py_SetPythonHome(pyHome); + std::string pyHome; +#if defined(__APPLE__) || defined(__OSX__) + pyHome = "/usr/local/Frameworks/Python.framework/Versions/2.7"; + Py_SetPythonHome(const_cast(pyHome.c_str())); +#endif + pyHome = "@PYTHON_INSTALL_DIR@"; // NOLINT + if (!pyHome.empty()) { + Py_SetPythonHome(const_cast(pyHome.c_str())); } Py_SetProgramName(argv[0]); Py_Initialize(); diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp index 96302a45f36ee192db1bc04ac5787691088a6630..b18b73e06a6c39c3bf9717280bc6323917c80efb 100644 --- a/paddle/utils/Util.cpp +++ b/paddle/utils/Util.cpp @@ -31,6 +31,7 @@ limitations under the License. 
*/ #include +#include "CpuId.h" #include "CustomStackTrace.h" #include "Logging.h" #include "StringUtil.h" @@ -194,6 +195,7 @@ void initMain(int argc, char** argv) { } version::printVersion(); + checkCPUFeature().check(); runInitFunctions(); } diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index e257aa568facb1555944dba7e76c5d8bce7f1c7d..1394773b4ff12aa751b8659a4461f94ee706892e 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -686,25 +686,17 @@ class ContextProjection(Projection): @config_class -class ConvProjection(Projection): - type = 'conv' - +class ConvBaseProjection(Projection): def __init__(self, input_layer_name, num_filters=None, conv_conf=None, **xargs): - super(ConvProjection, self).__init__(input_layer_name, **xargs) + super(ConvBaseProjection, self).__init__(input_layer_name, **xargs) if num_filters is not None: self.proj_conf.num_filters = num_filters - parse_conv(conv_conf, input_layer_name, self.proj_conf.conv_conf, - num_filters) - self.proj_conf.output_size = self.proj_conf.conv_conf.output_x * \ - self.proj_conf.conv_conf.output_y * \ - num_filters - def calc_output_size(self, input_layer_config): return self.proj_conf.output_size @@ -723,6 +715,48 @@ class ConvProjection(Projection): return None +@config_class +class ConvProjection(ConvBaseProjection): + type = 'conv' + + def __init__(self, + input_layer_name, + num_filters=None, + conv_conf=None, + **xargs): + super(ConvProjection, self).__init__(input_layer_name, num_filters, + conv_conf, **xargs) + + parse_conv(conv_conf, self.input_layer_name, self.proj_conf.conv_conf, + num_filters) + self.proj_conf.output_size = self.proj_conf.conv_conf.output_x * \ + self.proj_conf.conv_conf.output_y * \ + num_filters + + +@config_class +class ConvTransProjection(ConvBaseProjection): + type = 'convt' + + def __init__(self, + input_layer_name, + num_filters=None, + conv_conf=None, + **xargs): + super(ConvTransProjection, self).__init__(input_layer_name, num_filters, + conv_conf, **xargs) + + parse_conv( + conv_conf, + self.input_layer_name, + self.proj_conf.conv_conf, + num_filters, + trans=True) + self.proj_conf.output_size = self.proj_conf.conv_conf.img_size_y * \ + self.proj_conf.conv_conf.img_size * \ + num_filters + + # Define a operator for mixed layer @config_class class Operator(Cfg): @@ -789,6 +823,36 @@ class ConvOperator(Operator): return self.operator_conf.output_size +@config_class +class ConvTransOperator(Operator): + type = 'convt' + + def __init__(self, + input_layer_names, + num_filters=None, + conv_conf=None, + **xargs): + super(ConvTransOperator, self).__init__(input_layer_names, **xargs) + if num_filters is not None: + self.operator_conf.num_filters = num_filters + + parse_conv( + conv_conf, + MakeLayerNameInSubmodel(input_layer_names[0]), + self.operator_conf.conv_conf, + num_filters, + trans=True) + self.operator_conf.output_size = \ + self.operator_conf.conv_conf.img_size * \ + self.operator_conf.conv_conf.img_size_y * \ + num_filters + + config_assert(len(input_layer_names) == 2, "Conv is binary operator") + + def calc_output_size(self, input_sizes): + return self.operator_conf.output_size + + # please refer to the comments in proto/ModelConfig.proto @config_class class Conv(Cfg): @@ -1772,8 +1836,17 @@ class ConvTransLayerBase(LayerBase): use_gpu = int(g_command_config_args.get("use_gpu", 0)) parallel_nn = int(g_command_config_args.get("parallel_nn", 0)) - # cudnn_convt has not been implemented so use exconvt 
only - self.layer_type = "exconvt" + # Automatically select cudnn_convt for GPU and exconvt for CPU + # when the generic type is used, while still preserving the user's + # explicit choice of exconvt or cudnn_convt. + if self.layer_type == "cudnn_convt": + config_assert(use_gpu, "cudnn_convt only supports GPU") + + if (use_gpu == 1 and self.layer_type != "exconvt" and + (parallel_nn == 0 or self.config.device > -1)): + self.layer_type = "cudnn_convt" + else: + self.layer_type = "exconvt" # need to specify layer in config self.config.type = self.layer_type @@ -1790,10 +1863,9 @@ class ConvTransLayerBase(LayerBase): trans=True) conv_conf = self.config.inputs[input_index].conv_conf psize = self.calc_parameter_size(conv_conf) - print("output size for %s is %d " % (name, conv_conf.output_x)) self.create_input_parameter(input_index, psize) - self.set_layer_size( - (conv_conf.img_size**2) * self.config.num_filters) + self.set_cnn_layer(name, conv_conf.img_size_y, conv_conf.img_size, + self.config.num_filters) psize = self.config.size if shared_biases: @@ -1810,6 +1882,11 @@ class ConvTransLayer(ConvTransLayerBase): layer_type = 'exconvt' +@config_layer('cudnn_convt') +class ConvTransLayer(ConvTransLayerBase): + layer_type = 'cudnn_convt' + + @config_layer('norm') class NormLayer(LayerBase): def __init__(self, name, inputs, **xargs): @@ -2222,7 +2299,10 @@ def Link( # memory for recurrent layer group. # *name* and *size* are actual layer's name and size. -# will return name of the memory, +# If *name* is None, need to provide *memory_name* and need to use +# SetMemoryInput() later to specify the layer which this memory remembers. +# +# return the name of the memory, # use this name if you assign the memory as other layer's input # # boot frame of memory is zeroed by default, @@ -2234,15 +2314,18 @@ def Link( # can only be initailized by a *boot_layer* which is a sequence. # @config_func -def Memory( - name, - size, - is_sequence=False, - boot_layer=None, - boot_bias=False, - boot_bias_active_type="", - boot_with_const_id=None, ): - agent_name = name + "+delay1" +def Memory(name, + size, + is_sequence=False, + boot_layer=None, + boot_bias=False, + boot_bias_active_type="", + boot_with_const_id=None, + memory_name=None): + if not memory_name: + config_assert(name is not None, "name cannot be None") + memory_name = name + "+delay1" + agent_name = memory_name if is_sequence: agent_layer = SequenceAgentLayer(agent_name, size) else: @@ -2250,7 +2333,8 @@ def Memory( config_assert(g_current_submodel.is_recurrent_layer_group, 'Memory should be used in recurrent layer group only') memory = g_current_submodel.memories.add() - memory.layer_name = MakeLayerNameInSubmodel(name) + if name is not None: + memory.layer_name = MakeLayerNameInSubmodel(name) memory.link_name = MakeLayerNameInSubmodel(agent_name) memory.is_sequence = is_sequence options = sum((boot_layer is not None, bool(boot_bias), @@ -2274,6 +2358,17 @@ def Memory( return agent_name +@config_func +def SetMemoryInput(memory_name, layer_name): + memory_name = MakeLayerNameInSubmodel(memory_name) + layer_name = MakeLayerNameInSubmodel(layer_name) + for mem in g_current_submodel.memories: + if mem.link_name == memory_name: + mem.layer_name = layer_name + return + logger.fatal("Nonexistent memory name: " + memory_name) + + # Generator for recurrent layer group, to use it: # 1. define a id layer as output of layer group # 2.
define a memory of this id layer, and assign a boot id(begin of sequence) diff --git a/python/paddle/trainer_config_helpers/default_decorators.py b/python/paddle/trainer_config_helpers/default_decorators.py index 2f25579fcdd9793e4c165439c9934a2bccb63617..69d860d9dab9c1d90e4d6a6940d66fcb551f6eb6 100644 --- a/python/paddle/trainer_config_helpers/default_decorators.py +++ b/python/paddle/trainer_config_helpers/default_decorators.py @@ -97,13 +97,13 @@ def reset_hook(): register_parse_config_hook(reset_hook) -def wrap_name_default(name_prefix=None): +def wrap_name_default(name_prefix=None, name_param="name"): """ Decorator to set "name" arguments default to "{name_prefix}_{invoke_count}". .. code:: python - @default_name("some_name") + @wrap_name_default("some_name") def func(name=None): print name # name will never be None. If name is not set, # name will be "some_name_%d" @@ -115,7 +115,7 @@ def wrap_name_default(name_prefix=None): """ factory = DefaultNameFactory(name_prefix) _name_factories.append(factory) - return wrap_param_default(["name"], factory) + return wrap_param_default([name_param], factory) def wrap_param_attr_default(param_names=None, default_factory=None): diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index b94f8f9a783552519ca73e7cfc0937b302d3445b..b006eb46d99fd09c7bc31e5de41ebdb39659b663 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -52,6 +52,7 @@ __all__ = [ "cos_sim", "hsigmoid", "conv_projection", + "mse_cost", "regression_cost", 'classification_cost', "LayerOutput", @@ -287,6 +288,14 @@ class LayerOutput(object): """ assert False, "this method should not be invoked" + def set_input(self, input): + """ + Set the input for a memory layer. Can only be used for memory layer + """ + assert isinstance(input, LayerOutput) + assert self.layer_type == LayerType.MEMORY + SetMemoryInput(self.name, input.name) + ERROR_CLIPPING = 'error_clipping_threshold' DROPOUT = 'drop_rate' @@ -703,8 +712,9 @@ class MixedLayerType(LayerOutput): assert len(self.inputs) == 0 return self - def __exit__(self, *args, **kwargs): - del args, kwargs # unused parameter to suppress warning + def __exit__(self, exc_type, exc_value, tb): + if exc_value is not None: + raise exc_value assert len(self.inputs) != 0 ml = MixedLayer( name=self.name, @@ -2035,8 +2045,9 @@ def img_conv_layer(input, :param trans: true if it is a convTransLayer, false if it is a convLayer :type trans: bool :param layer_type: specify the layer_type, default is None. If trans=True, - layer_type has to be "exconvt", otherwise layer_type - has to be either "exconv" or "cudnn_conv" + layer_type has to be "exconvt" or "cudnn_convt", + otherwise layer_type has to be either "exconv" or + "cudnn_conv" :type layer_type: String :return: LayerOutput object. :rtype: LayerOutput @@ -2076,7 +2087,7 @@ def img_conv_layer(input, if layer_type: if trans: - assert layer_type in ["exconvt"] + assert layer_type in ["exconvt", "cudnn_convt"] else: assert layer_type in ["exconv", "cudnn_conv"] lt = layer_type @@ -2758,8 +2769,10 @@ def seq_concat_layer(a, b, act=None, name=None, layer_attr=None, size=a.size) +@wrap_name_default("memory", "memory_name") def memory(name, size, + memory_name=None, is_seq=False, boot_layer=None, boot_bias=None, @@ -2781,14 +2794,32 @@ def memory(name, If boot_layer is not null, the memory is just the boot_layer's output. Set :code:`is_seq` is true boot layer is sequence. 
- The same name layer in recurrent group will set memory on each time step. - :param name: memory's name. + .. code-block:: python + + mem = memory(size=256, name='state') + state = fc_layer(input=mem, size=256, name='state') + + If you do not want to specify the name, you can equivalently use set_input() + to specify the layer to be remembered, as follows: + + .. code-block:: python + + mem = memory(size=256) + state = fc_layer(input=mem, size=256) + mem.set_input(state) + + + :param name: the name of the layer which this memory remembers. + If name is None, the user should call set_input() to specify the + name of the layer which this memory remembers. :type name: basestring :param size: size of memory. :type size: int + :param memory_name: the name of the memory. + It is ignored when name is provided. + :type memory_name: basestring :param is_seq: whether the boot_layer is a sequence :type is_seq: bool :param boot_layer: boot layer of memory. @@ -2810,13 +2841,21 @@ def memory(name, boot_bias = ParamAttr.to_bias(boot_bias) assert boot_layer is None or isinstance(boot_layer, LayerOutput) + if name is not None: + memory_name = None - agent_name = Memory(name, size, is_seq, boot_layer.name - if boot_layer is not None else None, boot_bias, - boot_bias_active_type.name, boot_with_const_id) + memory_name = Memory( + name, + size, + is_sequence=is_seq, + boot_layer=boot_layer.name if boot_layer is not None else None, + boot_bias=boot_bias, + boot_bias_active_type=boot_bias_active_type.name, + boot_with_const_id=boot_with_const_id, + memory_name=memory_name) lout = LayerOutput( - name=agent_name, + name=memory_name, size=size, layer_type=LayerType.MEMORY, parents=[boot_layer] if boot_layer is not None else None) @@ -3564,7 +3603,7 @@ def __cost_input__(input, label, weight=None): ipts = [Input(input.name), Input(label.name)] parents = [input, label] if weight is not None: - assert weight.layer_type == LayerType.DATA + assert weight.size == 1 ipts.append(Input(weight.name)) parents.append(weight) return ipts, parents @@ -3572,11 +3611,14 @@ def __cost_input__(input, label, weight=None): @wrap_name_default() @layer_support() -def regression_cost(input, label, weight=None, name=None, layer_attr=None): +def mse_cost(input, label, weight=None, name=None, layer_attr=None): """ - Regression Layer. + Mean squared error cost: + + .. math:: + + \frac{1}{N}\sum_{i=1}^{N}(t_i - y_i)^2 - TODO(yuyang18): Complete this method. :param name: layer name. :type name: basestring @@ -3602,6 +3644,9 @@ def regression_cost(input, label, weight=None, name=None, layer_attr=None): return LayerOutput(name, LayerType.COST, parents=parents, size=1) +regression_cost = mse_cost + + @wrap_name_default("cost") @layer_support() def classification_cost(input, @@ -3672,7 +3717,8 @@ def conv_operator(img, padding=0, filter_size_y=None, stride_y=None, - padding_y=None): + padding_y=None, + trans=False): """ Different from img_conv_layer, conv_op is an Operator, which can be used in mixed_layer. And conv_op takes two inputs to perform convolution.
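Taken together, the memory()/set_input() change and the mse_cost rename above combine as in the following minimal sketch. It assumes the patched trainer_config_helpers API; the data-layer names ('seq_in', 'lbl') and the 100/200 sizes are illustrative, not part of the patch.

.. code-block:: python

    from paddle.trainer_config_helpers import *

    def step(s):
        # An unnamed memory: the layer it remembers is attached afterwards
        # via set_input(), instead of being matched by a shared name.
        m = memory(name=None, size=200)
        hidden = fc_layer(input=[s, m], size=200)
        m.set_input(hidden)  # this memory now remembers `hidden`
        return hidden

    seq_in = data_layer(name='seq_in', size=100)
    rnn_out = last_seq(input=recurrent_group(step=step, input=seq_in))

    # mse_cost (formerly regression_cost) averages the squared error,
    # (1/N) * sum_i (t_i - y_i)^2, over the batch.
    lbl = data_layer(name='lbl', size=200)
    outputs(mse_cost(input=rnn_out, label=lbl))

The step function mirrors generate_rnn_simple_no_name() added to test_rnn_group.py further below.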
@@ -3728,7 +3774,9 @@ def conv_operator(img, if filter.size is not None: filter.size = filter_size * filter_size_y * num_filters * num_channels - op = ConvOperator( + opCls = ConvTransOperator if trans else ConvOperator + + op = opCls( input_layer_names=[img.name, filter.name], num_filters=num_filters, conv_conf=Conv( @@ -3740,6 +3788,7 @@ def conv_operator(img, padding_y=padding_y, stride_y=stride_y, groups=1)) + op.origin = [img, filter] return op @@ -3755,7 +3804,8 @@ def conv_projection(input, stride_y=None, padding_y=None, groups=1, - param_attr=None): + param_attr=None, + trans=False): """ Different from img_conv_layer and conv_op, conv_projection is a Projection, which can be used in mixed_layer and concat_layer. It uses cudnn to implement @@ -3794,6 +3844,8 @@ def conv_projection(input, :type groups: int :param param_attr: Convolution param attribute. None means default attribute :type param_attr: ParameterAttribute + :param trans: whether it is a convTrans projection or a conv projection + :type trans: boolean :return: A DotMulProjection Object. :rtype: DotMulProjection """ @@ -3830,7 +3882,9 @@ def conv_projection(input, param_attr.attr["initial_strategy"] = 0 param_attr.attr["initial_smart"] = False - proj = ConvProjection( + projCls = ConvTransProjection if trans else ConvProjection + + proj = projCls( input_layer_name=input.name, num_filters=num_filters, conv_conf=Conv( @@ -4939,7 +4993,12 @@ def lambda_cost(input, @wrap_name_default() @layer_support() -def cross_entropy(input, label, name=None, coeff=1.0, layer_attr=None): +def cross_entropy(input, + label, + name=None, + coeff=1.0, + weight=None, + layer_attr=None): """ A loss layer for multi-class cross entropy. @@ -4954,22 +5013,27 @@ def cross_entropy(input, label, name=None, coeff=1.0, layer_attr=None): :type input: LayerOutput. :param name: The name of this layer. It is optional. :type name: None|basestring. - :param coeff: The coefficient affects the gradient in the backward. + :param coeff: The cost is multiplied by coeff. + The coefficient affects the gradient in the backward pass. :type coeff: float. + :param weight: The cost of each sample is multiplied by its weight. + The weight should be a layer with size=1. Note that the + gradient will not be calculated for weight. + :type weight: LayerOutput :param layer_attr: Extra Layer Attribute. :type layer_attr: ExtraLayerAttribute :return: LayerOutput object. :rtype: LayerOutput.
""" + ipts, parents = __cost_input__(input, label, weight) Layer( name=name, type=LayerType.CROSS_ENTROPY, - inputs=[input.name, label.name], + inputs=ipts, coeff=coeff, **ExtraLayerAttribute.to_kwargs(layer_attr)) - return LayerOutput( - name, LayerType.CROSS_ENTROPY, parents=[input, label], size=1) + return LayerOutput(name, LayerType.CROSS_ENTROPY, parents=parents, size=1) @wrap_name_default() diff --git a/python/paddle/trainer_config_helpers/tests/configs/projections.py b/python/paddle/trainer_config_helpers/tests/configs/projections.py index aa4521dcd5db3f845871cfaaedb02a86bcbddc38..dc8975cb311582a621eb4a5a166ddc34348fe3e9 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/projections.py +++ b/python/paddle/trainer_config_helpers/tests/configs/projections.py @@ -34,11 +34,31 @@ flt = data_layer(name='filter', size=3 * 3 * 1 * 64) with mixed_layer() as m7: m7 += conv_operator( img=img, filter=flt, num_filters=64, num_channels=1, filter_size=3) + m7 += conv_projection(img, filter_size=3, num_filters=64, num_channels=1) +with mixed_layer() as m8: + m8 += conv_operator( + img=img, + filter=flt, + num_filters=64, + num_channels=1, + filter_size=3, + stride=2, + padding=1, + trans=True) + m8 += conv_projection( + img, + filter_size=3, + num_filters=64, + num_channels=1, + stride=2, + padding=1, + trans=True) end = mixed_layer( input=[ full_matrix_projection(input=m5), - trans_full_matrix_projection(input=m6), full_matrix_projection(input=m7) + trans_full_matrix_projection(input=m6), + full_matrix_projection(input=m7), full_matrix_projection(input=m8) ], size=100, layer_attr=ExtraAttr( diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr index 6934fd0da62f90f9bbddef9a98798bf168f7fa8e..2818389b16cca75f5030b75fc4de8c89c06c5e02 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr @@ -33,6 +33,8 @@ layers { bias_parameter_name: "___conv_0__.wbias" num_filters: 64 shared_biases: true + height: 256 + width: 256 } layers { name: "__batch_norm_0__" @@ -58,6 +60,8 @@ layers { } bias_parameter_name: "___batch_norm_0__.wbias" moving_average_fraction: 0.9 + height: 256 + width: 256 } layers { name: "__crmnorm_0__" diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr index 2943ab130bd7d6f3b78ea611f1c35850ccaf5e92..2afc3afef6d39ce9b8eef05948861284775d5011 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr @@ -154,13 +154,40 @@ layers { inputs { input_layer_name: "img" } + inputs { + input_layer_name: "img" + input_parameter_name: "___mixed_6__.w1" + proj_conf { + type: "conv" + name: "___mixed_6__.w1" + input_size: 1024 + output_size: 57600 + conv_conf { + filter_size: 3 + channels: 1 + stride: 1 + padding: 0 + groups: 1 + filter_channels: 1 + output_x: 30 + img_size: 32 + caffe_mode: true + filter_size_y: 3 + padding_y: 0 + stride_y: 1 + output_y: 30 + img_size_y: 32 + } + num_filters: 64 + } + } inputs { input_layer_name: "filter" } operator_confs { type: "conv" input_indices: 0 - input_indices: 1 + input_indices: 2 input_sizes: 1024 input_sizes: 576 output_size: 57600 @@ 
-186,38 +213,112 @@ layers { layers { name: "__mixed_7__" type: "mixed" + size: 254016 + active_type: "" + inputs { + input_layer_name: "img" + } + inputs { + input_layer_name: "img" + input_parameter_name: "___mixed_7__.w1" + proj_conf { + type: "convt" + name: "___mixed_7__.w1" + input_size: 1024 + output_size: 254016 + conv_conf { + filter_size: 3 + channels: 1 + stride: 2 + padding: 1 + groups: 1 + filter_channels: 64 + output_x: 32 + img_size: 63 + caffe_mode: true + filter_size_y: 3 + padding_y: 1 + stride_y: 2 + output_y: 32 + img_size_y: 63 + } + num_filters: 64 + } + } + inputs { + input_layer_name: "filter" + } + operator_confs { + type: "convt" + input_indices: 0 + input_indices: 2 + input_sizes: 1024 + input_sizes: 576 + output_size: 254016 + conv_conf { + filter_size: 3 + channels: 1 + stride: 2 + padding: 1 + groups: 1 + filter_channels: 64 + output_x: 32 + img_size: 63 + caffe_mode: true + filter_size_y: 3 + padding_y: 1 + stride_y: 2 + output_y: 32 + img_size_y: 63 + } + num_filters: 64 + } +} +layers { + name: "__mixed_8__" + type: "mixed" size: 100 active_type: "" inputs { input_layer_name: "__mixed_4__" - input_parameter_name: "___mixed_7__.w0" + input_parameter_name: "___mixed_8__.w0" proj_conf { type: "fc" - name: "___mixed_7__.w0" + name: "___mixed_8__.w0" input_size: 300 output_size: 100 } } inputs { input_layer_name: "__mixed_5__" - input_parameter_name: "___mixed_7__.w1" + input_parameter_name: "___mixed_8__.w1" proj_conf { type: "trans_fc" - name: "___mixed_7__.w1" + name: "___mixed_8__.w1" input_size: 100 output_size: 100 } } inputs { input_layer_name: "__mixed_6__" - input_parameter_name: "___mixed_7__.w2" + input_parameter_name: "___mixed_8__.w2" proj_conf { type: "fc" - name: "___mixed_7__.w2" + name: "___mixed_8__.w2" input_size: 57600 output_size: 100 } } + inputs { + input_layer_name: "__mixed_7__" + input_parameter_name: "___mixed_8__.w3" + proj_conf { + type: "fc" + name: "___mixed_8__.w3" + input_size: 254016 + output_size: 100 + } + } drop_rate: 0.5 } parameters { @@ -281,7 +382,23 @@ parameters { initial_smart: true } parameters { - name: "___mixed_7__.w0" + name: "___mixed_6__.w1" + size: 576 + initial_mean: 0.0 + initial_std: 0.471404520791 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___mixed_7__.w1" + size: 576 + initial_mean: 0.0 + initial_std: 0.471404520791 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___mixed_8__.w0" size: 30000 initial_mean: 0.0 initial_std: 0.057735026919 @@ -291,7 +408,7 @@ parameters { initial_smart: true } parameters { - name: "___mixed_7__.w1" + name: "___mixed_8__.w1" size: 10000 initial_mean: 0.0 initial_std: 0.1 @@ -301,7 +418,7 @@ parameters { initial_smart: true } parameters { - name: "___mixed_7__.w2" + name: "___mixed_8__.w2" size: 5760000 initial_mean: 0.0 initial_std: 0.00416666666667 @@ -310,10 +427,20 @@ parameters { initial_strategy: 0 initial_smart: true } +parameters { + name: "___mixed_8__.w3" + size: 25401600 + initial_mean: 0.0 + initial_std: 0.00198412698413 + dims: 254016 + dims: 100 + initial_strategy: 0 + initial_smart: true +} input_layer_names: "test" input_layer_names: "img" input_layer_names: "filter" -output_layer_names: "__mixed_7__" +output_layer_names: "__mixed_8__" sub_models { name: "root" layer_names: "test" @@ -328,10 +455,11 @@ sub_models { layer_names: "filter" layer_names: "__mixed_6__" layer_names: "__mixed_7__" + layer_names: "__mixed_8__" input_layer_names: "test" input_layer_names: "img" input_layer_names: "filter" - 
output_layer_names: "__mixed_7__" + output_layer_names: "__mixed_8__" is_recurrent_layer_group: false } diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr index 811b38ae4a51e8faedb59fea2b81a8be3cceeae6..3244181a63109335c4fba6ca4dd04ac8f0446313 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_cost_layers_with_weight.protostr @@ -45,7 +45,7 @@ layers { coeff: 1.0 } layers { - name: "__regression_cost_0__" + name: "__mse_cost_0__" type: "square_error" size: 1 active_type: "" @@ -84,7 +84,7 @@ input_layer_names: "input" input_layer_names: "label" input_layer_names: "weight" output_layer_names: "__cost_0__" -output_layer_names: "__regression_cost_0__" +output_layer_names: "__mse_cost_0__" evaluators { name: "classification_error_evaluator" type: "classification_error" @@ -99,12 +99,12 @@ sub_models { layer_names: "weight" layer_names: "__fc_layer_0__" layer_names: "__cost_0__" - layer_names: "__regression_cost_0__" + layer_names: "__mse_cost_0__" input_layer_names: "input" input_layer_names: "label" input_layer_names: "weight" output_layer_names: "__cost_0__" - output_layer_names: "__regression_cost_0__" + output_layer_names: "__mse_cost_0__" evaluator_names: "classification_error_evaluator" is_recurrent_layer_group: false } diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr index 3e9d28416ed5066461e960f0a9f085e057c28346..a0fb729e062bdf6fd7d2a7c2ae364d1a2b32811d 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr @@ -331,6 +331,54 @@ layers { } trans_type: "non-seq" } +layers { + name: "__recurrent_group_3__" + type: "recurrent_layer_group" + active_type: "" +} +layers { + name: "seq_input@__recurrent_group_3__" + type: "scatter_agent" + size: 100 + active_type: "" +} +layers { + name: "__memory_6__@__recurrent_group_3__" + type: "agent" + size: 200 + active_type: "" +} +layers { + name: "__fc_layer_0__@__recurrent_group_3__" + type: "fc" + size: 200 + active_type: "tanh" + inputs { + input_layer_name: "seq_input@__recurrent_group_3__" + input_parameter_name: "___fc_layer_0__@__recurrent_group_3__.w0" + } + inputs { + input_layer_name: "__memory_6__@__recurrent_group_3__" + input_parameter_name: "___fc_layer_0__@__recurrent_group_3__.w1" + } + bias_parameter_name: "___fc_layer_0__@__recurrent_group_3__.wbias" +} +layers { + name: "__fc_layer_0__" + type: "gather_agent" + size: 200 + active_type: "" +} +layers { + name: "__last_seq_4__" + type: "seqlastins" + size: 200 + active_type: "linear" + inputs { + input_layer_name: "__fc_layer_0__" + } + trans_type: "non-seq" +} parameters { name: "___mixed_0__.w0" size: 40000 @@ -481,6 +529,36 @@ parameters { initial_strategy: 0 initial_smart: false } +parameters { + name: "___fc_layer_0__@__recurrent_group_3__.w0" + size: 20000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 200 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___fc_layer_0__@__recurrent_group_3__.w1" + size: 40000 + initial_mean: 0.0 + initial_std: 0.0707106781187 + dims: 200 + dims: 200 + 
initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___fc_layer_0__@__recurrent_group_3__.wbias" + size: 200 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 200 + initial_strategy: 0 + initial_smart: false +} input_layer_names: "seq_input" input_layer_names: "sub_seq_input" output_layer_names: "__last_seq_0__" @@ -488,6 +566,7 @@ output_layer_names: "__first_seq_0__" output_layer_names: "__last_seq_1__" output_layer_names: "__last_seq_2__" output_layer_names: "__last_seq_3__" +output_layer_names: "__last_seq_4__" sub_models { name: "root" layer_names: "seq_input" @@ -510,6 +589,9 @@ sub_models { layer_names: "__gru_group_0___recurrent_group" layer_names: "__gru_group_0__" layer_names: "__last_seq_3__" + layer_names: "__recurrent_group_3__" + layer_names: "__fc_layer_0__" + layer_names: "__last_seq_4__" input_layer_names: "seq_input" input_layer_names: "sub_seq_input" output_layer_names: "__last_seq_0__" @@ -517,6 +599,7 @@ sub_models { output_layer_names: "__last_seq_1__" output_layer_names: "__last_seq_2__" output_layer_names: "__last_seq_3__" + output_layer_names: "__last_seq_4__" is_recurrent_layer_group: false } sub_models { @@ -647,4 +730,28 @@ sub_models { } target_inlinkid: -1 } +sub_models { + name: "__recurrent_group_3__" + layer_names: "seq_input@__recurrent_group_3__" + layer_names: "__memory_6__@__recurrent_group_3__" + layer_names: "__fc_layer_0__@__recurrent_group_3__" + is_recurrent_layer_group: true + reversed: false + memories { + layer_name: "__fc_layer_0__@__recurrent_group_3__" + link_name: "__memory_6__@__recurrent_group_3__" + is_sequence: false + } + in_links { + layer_name: "seq_input" + link_name: "seq_input@__recurrent_group_3__" + has_subseq: false + } + out_links { + layer_name: "__fc_layer_0__@__recurrent_group_3__" + link_name: "__fc_layer_0__" + has_subseq: false + } + target_inlinkid: -1 +} diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py index d30f70a55c5b1834074966dfb3f378e01447c8ab..1c0aa7f9b9ee45b9eaf82dc46a2648d834dcd4ad 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_cost_layers_with_weight.py @@ -10,5 +10,5 @@ fc = fc_layer(input=data, size=10, act=SoftmaxActivation()) outputs( classification_cost( input=fc, label=lbl, weight=wt), - regression_cost( + mse_cost( input=fc, label=lbl, weight=wt)) diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py b/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py index 60b4849d69d497109ef5af3257e212df233a2d0b..91010759e4847f087eb4e05ad98ae794a2129365 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py @@ -16,6 +16,16 @@ def generate_rnn_simple(name): return rnn_simple +def generate_rnn_simple_no_name(): + def rnn_simple(s): + m = memory(name=None, size=200) + fc = fc_layer(input=[s, m], size=200) + m.set_input(fc) + return fc + + return rnn_simple + + with mixed_layer() as lstm_param: # test lstm unit, rnn group lstm_param += full_matrix_projection(input=seq, size=100 * 4) @@ -33,4 +43,6 @@ outputs( last_seq(input=lstmemory_group( input=lstm_param, size=100)), last_seq(input=gru_group( - input=gru_param, size=100))) + input=gru_param, size=100)), + last_seq(input=recurrent_group( + 
step=generate_rnn_simple_no_name(), input=seq)), ) diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py index f5a16d51477f9cfbf0cd32af54098406fbbd2b41..c686870a497668517d1c78c11c616ad8a71a2980 100644 --- a/python/paddle/v2/dataset/wmt14.py +++ b/python/paddle/v2/dataset/wmt14.py @@ -23,7 +23,7 @@ __all__ = ['train', 'test', 'build_dict'] URL_DEV_TEST = 'http://www-lium.univ-lemans.fr/~schwenk/cslm_joint_paper/data/dev+test.tgz' MD5_DEV_TEST = '7d7897317ddd8ba0ae5c5fa7248d3ff5' # This is a small dataset for testing. The original data is too large and will be added later. -URL_TRAIN = 'http://paddlepaddle.bj.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz' +URL_TRAIN = 'http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz' MD5_TRAIN = 'a755315dd01c2c35bde29a744ede23a6' START = "<s>" diff --git a/python/paddle/v2/tests/test_layer.py b/python/paddle/v2/tests/test_layer.py index 0055679a91801a2f9b6432797665ec17caf3beb1..89cc928dd7f624612ba717b4e5c2d6c2de7f8bed 100644 --- a/python/paddle/v2/tests/test_layer.py +++ b/python/paddle/v2/tests/test_layer.py @@ -22,7 +22,9 @@ import paddle.v2.networks as networks pixel = layer.data(name='pixel', type=data_type.dense_vector(128)) label = layer.data(name='label', type=data_type.integer_value(10)) -weight = layer.data(name='weight', type=data_type.dense_vector(10)) +weight = layer.data(name='weight', type=data_type.dense_vector(1)) +combine_weight = layer.data( + name='weight_combine', type=data_type.dense_vector(10)) score = layer.data(name='score', type=data_type.dense_vector(1)) hidden = layer.fc(input=pixel, @@ -81,7 +83,8 @@ class AggregateLayerTest(unittest.TestCase): class MathLayerTest(unittest.TestCase): def test_math_layer(self): addto = layer.addto(input=[pixel, pixel]) - linear_comb = layer.linear_comb(weights=weight, vectors=hidden, size=10) + linear_comb = layer.linear_comb( + weights=combine_weight, vectors=hidden, size=10) interpolation = layer.interpolation( input=[hidden, hidden], weight=score) bilinear = layer.bilinear_interp(input=conv, out_size_x=4, out_size_y=4) @@ -126,9 +129,8 @@ class CostLayerTest(unittest.TestCase): cost3 = layer.cross_entropy_cost(input=inference, label=label) cost4 = layer.cross_entropy_with_selfnorm_cost( input=inference, label=label) - cost5 = layer.regression_cost(input=inference, label=label) - cost6 = layer.regression_cost( - input=inference, label=label, weight=weight) + cost5 = layer.mse_cost(input=inference, label=label) + cost6 = layer.mse_cost(input=inference, label=label, weight=weight) cost7 = layer.multi_binary_label_cross_entropy_cost( input=inference, label=label) cost8 = layer.rank_cost(left=score, right=score, label=score)
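As a closing reference for the weighted-cost surface these tests exercise, a usage sketch follows. The layer names and sizes are illustrative; the one hard constraint, per the new assert in __cost_input__, is that the weight layer has size=1.

.. code-block:: python

    from paddle.trainer_config_helpers import *

    settings(learning_rate=1e-4, batch_size=1000)

    data = data_layer(name='input', size=300)
    lbl = data_layer(name='label', size=1)   # class id
    wt = data_layer(name='weight', size=1)   # per-sample weight, must be size=1

    fc = fc_layer(input=data, size=10, act=SoftmaxActivation())

    # Both costs accept the optional weight layer; it scales each
    # sample's cost and receives no gradient itself.
    outputs(
        cross_entropy(input=fc, label=lbl, weight=wt),
        mse_cost(input=fc, label=lbl, weight=wt))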