diff --git a/.dockerignore b/.dockerignore deleted file mode 120000 index 3e4e48b0b5fe6b468434d6767749b399319f2da2..0000000000000000000000000000000000000000 --- a/.dockerignore +++ /dev/null @@ -1 +0,0 @@ -.gitignore \ No newline at end of file diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..2b2e74053d33cb6d2878fd3d6da48fa344172f63 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,15 @@ +*.DS_Store +build/ +*.user +.vscode +.idea +.project +.cproject +.pydevproject +Makefile +.test_env/ +third_party/ +*~ +bazel-* + +!build/*.deb diff --git a/.gitignore b/.gitignore index 6aae076a49012b032b8fc0f1dc02c2714fb7b4a3..ee7c6ec370cd7c1f3435b41d915e24023c456af7 100644 --- a/.gitignore +++ b/.gitignore @@ -7,6 +7,7 @@ build/ .project .cproject .pydevproject +.settings/ Makefile .test_env/ third_party/ diff --git a/CMakeLists.txt b/CMakeLists.txt index e991a9a0ea0cf2a3d4f5f1e900bfc38e703aaf39..e78ccdf6d3b09c0170df5bd091fd1620b131216a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,7 +40,7 @@ option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF) option(WITH_TIMER "Compile PaddlePaddle with stats timer" OFF) option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler" OFF) option(WITH_DOC "Compile PaddlePaddle with documentation" OFF) -option(ON_COVERALLS "Compile PaddlePaddle with code coverage" OFF) +option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF) option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF) @@ -90,14 +90,21 @@ include_directories("${PROJ_ROOT}/paddle/cuda/include") include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto") set(EXTERNAL_LIBS - # have not include gtest here. 
${GFLAGS_LIBRARIES} ${GLOG_LIBRARIES} ${CBLAS_LIBRARIES} ${PROTOBUF_LIBRARY} ${ZLIB_LIBRARIES} + ${PYTHON_LIBRARIES} ) +if(WITH_GPU) + list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY}) + if(NOT WITH_DSO) + list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY}) + endif(NOT WITH_DSO) +endif(WITH_GPU) + add_subdirectory(proto) add_subdirectory(paddle) add_subdirectory(python) diff --git a/Dockerfile b/Dockerfile index 536adb0716447aa8b8c10beef8b974ae3f016f05..ccd43be668e7acb1a82bb88f5938755a5d3974d1 100644 --- a/Dockerfile +++ b/Dockerfile @@ -3,20 +3,17 @@ FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 MAINTAINER PaddlePaddle Authors -ARG DEBIAN_FRONTEND=noninteractive ARG UBUNTU_MIRROR RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' # ENV variables ARG BUILD_WOBOQ -ARG BUILD_AND_INSTALL ARG WITH_GPU ARG WITH_AVX ARG WITH_DOC ARG WITH_STYLE_CHECK ENV BUILD_WOBOQ=${BUILD_WOBOQ:-OFF} -ENV BUILD_AND_INSTALL=${BUILD_AND_INSTALL:-OFF} ENV WITH_GPU=${WITH_AVX:-OFF} ENV WITH_AVX=${WITH_AVX:-ON} ENV WITH_DOC=${WITH_DOC:-OFF} @@ -31,7 +28,7 @@ RUN apt-get update && \ apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \ apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \ apt-get install -y python-numpy python-matplotlib gcc g++ gfortran && \ - apt-get install -y automake locales clang-format-3.8 && \ + apt-get install -y automake locales clang-format-3.8 swig && \ apt-get clean -y # git credential to skip password typing @@ -51,8 +48,6 @@ RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \ cd cmake-3.4.1 && ./bootstrap && make -j `nproc` && make install && \ cd .. && rm -rf cmake-3.4.1 -RUN apt-get install -y swig - VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"] # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service diff --git a/README.md b/README.md index 8a8e15841586ae6a01bb93e94f6074189f556f5a..bcc24b84128df282a2e3f0bc62aafe1ffe172338 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,8 @@ [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) -[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/) -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/cn/index.html) +[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/develop/doc/) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/doc_cn/) [![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) @@ -59,36 +59,36 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl the capability of PaddlePaddle to make a huge impact for your product.
## Installation -Check out the [Install Guide](http://paddlepaddle.org/doc/build/) to install from -pre-built packages (**docker image**, **deb package**) or -directly build on **Linux** and **Mac OS X** from the source code. + +It is recommended to check out the +[Docker installation guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html) +before looking into the +[build from source guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html). ## Documentation -Both [English Docs](http://paddlepaddle.org/doc/) and [Chinese Docs](http://paddlepaddle.org/doc_cn/) are provided for our users and developers. -- [Quick Start](http://paddlepaddle.org/doc/demo/quick_start/index_en) - You can follow the quick start tutorial to learn how use PaddlePaddle - step-by-step. +We provide [English](http://www.paddlepaddle.org/develop/doc/) and +[Chinese](http://www.paddlepaddle.org/doc_cn/) documentation. + +- [Deep Learning 101](http://book.paddlepaddle.org/index.en.html) + + You might want to start from this online interactive book, which runs in Jupyter Notebook. + +- [Distributed Training](http://www.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html) + + You can run distributed training jobs on MPI clusters. + +- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html) -- [Example and Demo](http://paddlepaddle.org/doc/demo/)
- We provide five demos, including: image classification, sentiment analysis, - sequence to sequence model, recommendation, semantic role labeling. + You can also run distributed training jobs on Kubernetes clusters. -- [Distributed Training](http://paddlepaddle.org/doc/cluster)
- This system supports training deep learning models on multiple machines - with data parallelism. +- [Python API](http://www.paddlepaddle.org/develop/doc/api/index_en.html) -- [Python API](http://paddlepaddle.org/doc/ui/)
- PaddlePaddle supports using either Python interface or C++ to build your - system. We also use SWIG to wrap C++ source code to create a user friendly - interface for Python. You can also use SWIG to create interface for your - favorite programming language. + Our new API enables much shorter programs. -- [How to Contribute](http://paddlepaddle.org/doc/build/contribute_to_paddle.html)
- We sincerely appreciate your interest and contributions. If you would like to - contribute, please read the contribution guide. +- [How to Contribute](http://www.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html) -- [Source Code Documents](http://paddlepaddle.org/doc/source/)
+ We appreciate your contributions! ## Ask Questions diff --git a/cmake/coveralls.cmake b/cmake/coveralls.cmake index 9be7643819efdde3f42e4d39b2849ecc17e0d9fb..ca1471cabb57c0795ee193493d2e60bb5bd9e1cc 100644 --- a/cmake/coveralls.cmake +++ b/cmake/coveralls.cmake @@ -61,7 +61,7 @@ function(code_coverage _COVERAGE_SRCS _COVERALLS_UPLOAD _CMAKE_SCRIPT_PATH) endif() endfunction() -if(ON_COVERALLS) +if(WITH_COVERAGE) set(CMAKE_BUILD_TYPE "Debug") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage") diff --git a/cmake/coverallsGcovJsons.cmake b/cmake/coverallsGcovJsons.cmake index ad9a10cb8616159b9e3aff445e698cb2edb92820..4641184fcf5273b884524d9b9444209ffb65e000 100644 --- a/cmake/coverallsGcovJsons.cmake +++ b/cmake/coverallsGcovJsons.cmake @@ -134,7 +134,7 @@ foreach(GCDA ${GCDA_FILES}) # If -p is not specified then the file is named only "the_file.c.gcov" # execute_process( - COMMAND "${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA} >/dev/null" + COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA} >/dev/null WORKING_DIRECTORY ${GCDA_DIR} ) endforeach() diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 29d17691db9f4575bae4372c61a0e1964e163fc9..00dde9a9fdd4d4825947b987b3e8e0460f4a5f3a 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -45,7 +45,7 @@ IF(NOT ${CBLAS_FOUND}) PREFIX ${CBLAS_SOURCES_DIR} INSTALL_DIR ${CBLAS_INSTALL_DIR} BUILD_IN_SOURCE 1 - BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} NO_SHARED=1 libs netlib + BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} DYNAMIC_ARCH=1 NO_SHARED=1 libs netlib INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 PREFIX= UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 446a7532c55bd3ca66662efe70db93551580b8cc..ad1426fd940c7b163668c33d41731fe75d89dd89 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -14,7 +14,8 @@ INCLUDE(ExternalProject) -FIND_PACKAGE(Protobuf 3.1) +set(PROTOBUF_VERSION 3.1) +FIND_PACKAGE(Protobuf ${PROTOBUF_VERSION}) IF(PROTOBUF_FOUND) EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION) diff --git a/cmake/util.cmake b/cmake/util.cmake index 3640e4651fdd8b491f63875a7ea886afcadf978a..bacb64eb9ee65fffc824e4587a22fc432c092b19 100644 --- a/cmake/util.cmake +++ b/cmake/util.cmake @@ -90,26 +90,6 @@ function(link_paddle_exe TARGET_NAME) ${RDMA_LD_FLAGS} ${RDMA_LIBS}) - if(WITH_PYTHON) - target_link_libraries(${TARGET_NAME} - ${PYTHON_LIBRARIES} util) - endif() - - if(WITH_GPU) - target_link_libraries(${TARGET_NAME} ${CUDA_CUDART_LIBRARY}) - if(NOT WITH_DSO OR WITH_METRIC) - target_link_libraries(${TARGET_NAME} - ${CUDNN_LIBRARY} - ${CUDA_curand_LIBRARY}) - CUDA_ADD_CUBLAS_TO_TARGET(${TARGET_NAME}) - endif() - - check_library_exists(rt clock_gettime "time.h" HAVE_CLOCK_GETTIME ) - if(HAVE_CLOCK_GETTIME) - target_link_libraries(${TARGET_NAME} rt) - endif() - endif() - add_dependencies(${TARGET_NAME} ${external_project_dependencies}) endfunction() diff --git a/doc/api/v2/config/layer.rst b/doc/api/v2/config/layer.rst index db33a20487e579cda67a01c52ee646829df0f4e6..05817ec85455ac58566e90956a54cb86541f8488 100644 --- a/doc/api/v2/config/layer.rst +++ b/doc/api/v2/config/layer.rst @@ -109,6 +109,12 @@ 
sum_to_one_norm :members: sum_to_one_norm :noindex: +cross_channel_norm +------------------ +.. automodule:: paddle.v2.layer + :members: cross_channel_norm + :noindex: + Recurrent Layers ================ diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md index d9d54bff3096cb3520409971dbd1b2e179ac8be1..69f4501f370dcc9d603ec54a63d68568d66e832e 100644 --- a/doc/getstarted/build_and_install/build_from_source_en.md +++ b/doc/getstarted/build_and_install/build_from_source_en.md @@ -51,7 +51,7 @@ PaddlePaddle supports some build options. WITH_TIMER: Compile PaddlePaddle with stats timer WITH_PROFILER: Compile PaddlePaddle with GPU profiler WITH_DOC: Compile PaddlePaddle with documentation -ON_COVERALLS: Compile PaddlePaddle with code coverage +WITH_COVERAGE: Compile PaddlePaddle with code coverage COVERALLS_UPLOAD: Package code coverage data to coveralls ON_TRAVIS: Exclude special unit test on Travis CI diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt index 6e8fcd114df580a00858d95f0af0d1ec0bd9b4a2..3760c6727c21cfb32ca4d2efc30351352c9b182b 100644 --- a/paddle/api/CMakeLists.txt +++ b/paddle/api/CMakeLists.txt @@ -1,21 +1,3 @@ -FUNCTION(generate_python_api target_name) - ADD_CUSTOM_COMMAND(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py - ${PROJ_ROOT}/paddle/Paddle_wrap.cxx - ${PROJ_ROOT}/paddle/Paddle_wrap.h - COMMAND ${SWIG_EXECUTABLE} -python -c++ -outcurrentdir -I../ api/Paddle.swig - && mv ${PROJ_ROOT}/paddle/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py - DEPENDS ${PROJ_ROOT}/paddle/api/Paddle.swig - ${PROJ_ROOT}/paddle/api/PaddleAPI.h - ${external_project_dependencies} - WORKING_DIRECTORY ${PROJ_ROOT}/paddle - COMMENT "Generate Python API from swig") - ADD_CUSTOM_TARGET(${target_name} ALL DEPENDS - ${PROJ_ROOT}/paddle/Paddle_wrap.cxx - ${PROJ_ROOT}/paddle/Paddle_wrap.h - ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py - ${external_project_dependencies}) -ENDFUNCTION(generate_python_api) - set(API_SOURCES Arguments.cpp ConfigParser.cpp @@ -33,65 +15,86 @@ set(API_HEADER PaddleAPI.h Internal.h) -add_library(paddle_api STATIC - ${API_SOURCES}) +add_library(paddle_api STATIC ${API_SOURCES}) add_dependencies(paddle_api gen_proto_cpp) -list(LENGTH "${GFLAGS_LIBRARIES}" GFLAGS_LIBRARIES_LENGTH) +INCLUDE(${SWIG_USE_FILE}) +INCLUDE_DIRECTORIES(${PROJ_ROOT}/paddle) -if(${GFLAGS_LIBRARIES_LENGTH} EQUAL 0 AND TARGET "${GFLAGS_LIBRARIES}") -# Because gflags compiled by cmake, so it is imported by cmake target, -# not a real library path. Get the real library path here.
-message(STATUS "GFLAGS Libraries is ${GFLAGS_LIBRARIES}") -get_target_property(GFLAGS_LOCATION ${GFLAGS_LIBRARIES} LOCATION) -message(STATUS "GFLAGS Target location is ${GFLAGS_LOCATION}") -else() -set(GFLAGS_LOCATION ${GFLAGS_LIBRARIES}) -endif() +FILE(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py) + +SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON) + +SET(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR}) +SET(CMAKE_CXX_FLAGS "-std=c++11 -fPIC -Wall") +IF(WITH_COVERAGE) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage") +ENDIF(WITH_COVERAGE) -configure_file( - paddle_api_config.py.in - ${PROJ_ROOT}/paddle/api/paddle_api_config.py +SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS + paddle_parameter + paddle_function + paddle_math + paddle_utils + paddle_gserver + paddle_pserver + paddle_api + paddle_cuda + paddle_trainer_lib + paddle_network + paddle_proto + ${external_project_dependencies} ) -generate_python_api(python_swig_sources) +IF(APPLE) + SET(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load") +ELSE(APPLE) + SET(START_GROUP "-Xlinker -start-group") + SET(END_GROUP "-Xlinker -end-group") + SET(ARCHIVE_START "-Wl,--whole-archive") + SET(ARCHIVE_END "-Wl,--no-whole-archive") +ENDIF(APPLE) -file(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py) +SWIG_ADD_MODULE(swig_paddle python Paddle.i) +SWIG_LINK_LIBRARIES(swig_paddle + ${MACOS_LD_FLAGS} + ${START_GROUP} + ${ARCHIVE_START} + paddle_gserver + paddle_function + ${METRIC_LIBS} + ${ARCHIVE_END} + paddle_pserver + paddle_trainer_lib + paddle_network + paddle_parameter + paddle_math + paddle_utils + paddle_proto + paddle_cuda + paddle_api + ${CMAKE_DL_LIBS} + ${EXTERNAL_LIBS} + ${CMAKE_THREAD_LIBS_INIT} + ${RDMA_LD_FLAGS} + ${RDMA_LIBS} + ${START_END} +) -# TODO(yuyang18) : make wheel name calculated by cmake -add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/dist/.timestamp +add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so + COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle + COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PROJ_ROOT}/paddle/py_paddle COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND ${CMAKE_COMMAND} -E touch dist/.timestamp COMMAND rm -rf py_paddle.egg-info build WORKING_DIRECTORY ${PROJ_ROOT}/paddle - DEPENDS python_swig_sources - paddle_parameter - paddle_function - paddle_math - paddle_utils - paddle_gserver - paddle_pserver - paddle_trainer - paddle_api - paddle_cuda - ${PY_PADDLE_PYTHON_FILES} + DEPENDS _swig_paddle ) -install(DIRECTORY ${PROJ_ROOT}/paddle/dist/ - DESTINATION opt/paddle/share/wheels -) +# TODO(yuyang18) : make wheel name calculated by cmake +add_custom_target(python_api_wheel ALL DEPENDS ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so) -add_custom_target(python_api_wheel ALL DEPENDS - ${PROJ_ROOT}/paddle/dist/.timestamp) -add_dependencies(python_api_wheel python_swig_sources - paddle_parameter - paddle_math - paddle_utils - paddle_gserver - paddle_pserver - paddle_trainer - paddle_api - paddle_cuda) +install(DIRECTORY ${PROJ_ROOT}/paddle/dist/ DESTINATION opt/paddle/share/wheels) if(WITH_TESTING) IF(NOT PY_PIP_FOUND) diff --git a/paddle/api/Paddle.swig b/paddle/api/Paddle.i similarity index 100% rename from paddle/api/Paddle.swig rename to paddle/api/Paddle.i diff --git a/paddle/api/paddle_api_config.py.in b/paddle/api/paddle_api_config.py.in deleted file mode 100644 index 
82f45ba6ccec49eb190d1814a67a575f311689e8..0000000000000000000000000000000000000000 --- a/paddle/api/paddle_api_config.py.in +++ /dev/null @@ -1,17 +0,0 @@ -PADDLE_BUILD_DIR="@CMAKE_CURRENT_BINARY_DIR@/../" -WITH_GPU="@WITH_GPU@" -PROTOBUF_LIBRARY="@PROTOBUF_LIBRARY@" -ZLIB_LIBRARIES="@ZLIB_LIBRARIES@" -CMAKE_THREAD_LIB="@CMAKE_THREAD_LIBS_INIT@" -CMAKE_DL_LIBS="@CMAKE_DL_LIBS@" - - -WITH_PYTHON="@WITH_PYTHON@" -PYTHON_LIBRARIES="@PYTHON_LIBRARIES@" -GLOG_LIBRARIES="@GLOG_LIBRARIES@" -GFLAGS_LIBRARIES="@GFLAGS_LIBRARIES@" -GFLAGS_LOCATION="@GFLAGS_LOCATION@" -CBLAS_LIBRARIES="@CBLAS_LIBRARIES@" - -CUDA_LIBRARIES="@CUDA_CUDART_LIBRARY@" -WITH_COVERALLS="@ON_COVERALLS@" diff --git a/paddle/api/paddle_ld_flags.py b/paddle/api/paddle_ld_flags.py deleted file mode 100644 index ad5dce209bf8e14120320a58c3cd85d6f6a97688..0000000000000000000000000000000000000000 --- a/paddle/api/paddle_ld_flags.py +++ /dev/null @@ -1,157 +0,0 @@ -# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -try: - from paddle_api_config import * - import os.path - import platform - - system = platform.system().lower() - is_osx = (system == 'darwin') - is_win = (system == 'windows') - is_lin = (system == 'linux') - - if is_lin: - whole_start = "-Wl,--whole-archive" - whole_end = "-Wl,--no-whole-archive" - elif is_osx: - whole_start = "" - whole_end = "" - - LIB_DIRS = [ - "math", 'function', 'utils', 'parameter', "gserver", "api", "cuda", - "pserver", "trainer" - ] - PARENT_LIB_DIRS = ['proto'] - - class PaddleLDFlag(object): - def __init__(self): - self.paddle_build_dir = PADDLE_BUILD_DIR - self.paddle_build_dir = os.path.abspath(self.paddle_build_dir) - self.with_gpu = PaddleLDFlag.cmake_bool(WITH_GPU) - self.protolib = PROTOBUF_LIBRARY - self.zlib = ZLIB_LIBRARIES - self.thread = CMAKE_THREAD_LIB - self.dl_libs = CMAKE_DL_LIBS - self.with_python = PaddleLDFlag.cmake_bool(WITH_PYTHON) - self.python_libs = PYTHON_LIBRARIES - - self.glog_libs = GLOG_LIBRARIES - - self.with_coverage = PaddleLDFlag.cmake_bool(WITH_COVERALLS) - self.gflags_libs = GFLAGS_LIBRARIES - self.gflags_location = GFLAGS_LOCATION - self.cblas_libs = CBLAS_LIBRARIES - self.curt = CUDA_LIBRARIES - - def ldflag_str(self): - return " ".join( - [self.libs_dir_str(), self.parent_dir_str(), self.libs_str()]) - - def libs_dir_str(self): - libdirs = LIB_DIRS - return " ".join( - map(lambda x: "-L" + os.path.join(self.paddle_build_dir, x), - libdirs)) - - def parent_dir_str(self): - libdirs = PARENT_LIB_DIRS - return " ".join( - map(lambda x: "-L" + os.path.join(self.paddle_build_dir, '..', x), - libdirs)) - - def libs_str(self): - libs = [ - whole_start, - "-lpaddle_gserver", - "-lpaddle_function", - whole_end, - "-lpaddle_pserver", - "-lpaddle_trainer_lib", - "-lpaddle_network", - '-lpaddle_parameter', - "-lpaddle_math", - '-lpaddle_utils', - "-lpaddle_proto", - "-lpaddle_cuda", - "-lpaddle_api", - self.normalize_flag(self.protolib), - self.normalize_flag(self.glog_libs), - 
self.normalize_flag(self.gflags_libs), - self.normalize_flag(self.zlib), - self.normalize_flag(self.thread), - self.normalize_flag(self.dl_libs), - self.normalize_flag(self.cblas_libs), - ] - - if self.with_python: - libs.append(self.normalize_flag(self.python_libs)) - if self.with_gpu: - libs.append(self.normalize_flag(self.curt)) - if self.with_coverage: - libs.append("-fprofile-arcs") - return " ".join(filter(lambda l: len(l) != 0, libs)) - - def normalize_flag(self, cmake_flag): - """ - CMake flag string to ld flag - :type cmake_flag: str - """ - if ";" in cmake_flag: - return " ".join(map(self.normalize_flag, cmake_flag.split(";"))) - if cmake_flag.startswith("/"): # is a path - return cmake_flag - elif cmake_flag.startswith("-l"): # normal link command - return cmake_flag - elif cmake_flag in [ - "gflags-shared", "gflags-static", "gflags_nothreads-shared", - "gflags_nothreads-static" - ]: # special for gflags - assert PaddleLDFlag.cmake_bool(self.gflags_location) - return self.gflags_location - elif len(cmake_flag) != 0: - return "".join(["-l", cmake_flag]) - else: - return "" - - @staticmethod - def cmake_bool(cmake_str): - """ - CMake bool string to bool - :param cmake_str: cmake boolean string - :type cmake_str: str - :rtype: bool - """ - if cmake_str in ["FALSE", "OFF", "NO"] or cmake_str.endswith( - "-NOTFOUND"): - return False - else: - return True - - def c_flag(self): - if self.with_coverage: - return [ - "-fprofile-arcs", "-ftest-coverage", "-O0", "-g", - "-std=c++11" - ] - else: - return ["-std=c++11"] -except ImportError: - - class PaddleLDFlag(object): - def ldflag_str(self): - pass - - def c_flag(self): - pass diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt index 4f92150ec84d637c5b75cba09d7e98501a5a5f5d..93a6a99848aa13bb36c9c5c7091fbaa891fc9823 100644 --- a/paddle/gserver/CMakeLists.txt +++ b/paddle/gserver/CMakeLists.txt @@ -25,12 +25,16 @@ filter_test(GSERVER_HEADER) filter_test(GSERVER_SOURCES) if(NOT WITH_GPU) list(REMOVE_ITEM GSERVER_HEADER + layers/CudnnConvBaseLayer.h layers/CudnnConvLayer.h + layers/CudnnConvTransLayer.h layers/CudnnPoolLayer.h layers/CudnnBatchNormLayer.h) list(REMOVE_ITEM GSERVER_SOURCES + layers/CudnnConvBaseLayer.cpp layers/CudnnConvLayer.cpp + layers/CudnnConvTransLayer.cpp layers/CudnnPoolLayer.cpp layers/CudnnBatchNormLayer.cpp) compile_cu_as_cpp(layers/LstmCompute.cu) diff --git a/paddle/gserver/dataproviders/DataProvider.h b/paddle/gserver/dataproviders/DataProvider.h index 9a2ad7567f0dc93d0a8e396fd88b2488afe9d049..40036762179ebb1495b90907f16b97e3c60c50d8 100644 --- a/paddle/gserver/dataproviders/DataProvider.h +++ b/paddle/gserver/dataproviders/DataProvider.h @@ -164,15 +164,6 @@ public: argu.value = value; data_.push_back(argu); } - /** - * @brief Append user defined data - * @param[in] ptr user defined data - */ - void appendUserDefinedPtr(UserDefinedVectorPtr ptr) { - Argument argu; - argu.udp = ptr; - data_.push_back(argu); - } /* * @brief Append argument diff --git a/paddle/gserver/layers/ConvBaseOperator.cpp b/paddle/gserver/layers/ConvBaseOperator.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5c231986292d2cd26ee30ccc122142fccd5b4949 --- /dev/null +++ b/paddle/gserver/layers/ConvBaseOperator.cpp @@ -0,0 +1,150 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ConvBaseOperator.h" +#include "paddle/math/MathUtils.h" +#include "paddle/math/Matrix.h" + +namespace paddle { + +/** + * @brief ConvBaseOperator takes two inputs to perform the convolution. + * The first input is the image, and the second input is the convolution kernel. + * The data heights of the two inputs are the same. Each sample of the first input + * is convolved with the corresponding sample of the second input independently. + * + * The config file api is conv_operator. + */ + +ConvBaseOperator::ConvBaseOperator(const OperatorConfig &config, bool useGpu) + : Operator(config, useGpu) { + CHECK(useGpu); + CHECK_EQ(config_.input_indices_size(), 2L); + + caffeMode_ = true; + getConvParams(); + computeConvSizes(); + + // initialize all to default algorithms + fwdAlgo_ = 0; + bwdFilterAlgo_ = 0; + bwdDataAlgo_ = 0; + fwdLimitBytes_ = 0; + bwdDataLimitBytes_ = 0; + bwdFilterLimitBytes_ = 0; + workSpaceInBytes_ = 0; + workSpace_ = nullptr; + + isSelectAlgo_ = false; +} + +void ConvBaseOperator::allocConvWorkSpace() { + hl_conv_workspace(imageDesc_, + outputDesc_, + filterDesc_, + convDesc_, + &fwdAlgo_, + &fwdLimitBytes_, + &bwdDataAlgo_, + &bwdDataLimitBytes_, + &bwdFilterAlgo_, + &bwdFilterLimitBytes_); + + size_t maxWorkSpace = 0; + maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); + maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_); + + if (maxWorkSpace > workSpaceInBytes_) { + if (workSpaceInBytes_ != 0) { + hl_free_mem_device(workSpace_); + } + // total amount of storage needed + workSpace_ = hl_malloc_device(maxWorkSpace); + workSpaceInBytes_ = maxWorkSpace; + } +} + +void ConvBaseOperator::computeConvSizes() { + hl_create_filter_descriptor( + &filterDesc_, channels_, numFilters_, filterSizeY_, filterSize_); + hl_create_tensor_descriptor(&imageDesc_); + hl_create_tensor_descriptor(&outputDesc_); + hl_create_convolution_descriptor(&convDesc_, + imageDesc_, + filterDesc_, + paddingY_, + padding_, + strideY_, + stride_); +} + +void ConvBaseOperator::reshapeImageDescriptors() { + hl_tensor_reshape(imageDesc_, + 1, + channels_, + imageH_, + imageW_, + channels_ * imageH_ * imageW_, + imageH_ * imageW_, + imageW_, + 1); + hl_tensor_reshape(outputDesc_, + 1, + numFilters_, + outputH_, + outputW_, + numFilters_ * outputH_ * outputW_, + outputH_ * outputW_, + outputW_, + 1); + hl_reset_convolution_descriptor(convDesc_, + imageDesc_, + filterDesc_, + paddingY_, + padding_, + strideY_, + stride_); +} + +void ConvBaseOperator::getConvParams() { + configNumFilters_ = config_.num_filters(); + const ConvConfig &conf = config_.conv_conf(); + padding_ = conf.padding(); + stride_ = conf.stride(); + filterSize_ = conf.filter_size(); + paddingY_ = conf.padding_y(); + strideY_ = conf.stride_y(); + filterSizeY_ = conf.filter_size_y(); + filterPixels_ = filterSize_ * filterSizeY_; + configChannels_ = conf.channels(); + imgSize_ = conf.img_size(); + imgSizeY_ = conf.has_img_size_y() ?
conf.img_size_y() : conf.img_size(); + imgPixels_ = imgSize_ * imgSizeY_; + CHECK_EQ(conf.groups(), 1U); + filterChannels_ = conf.filter_channels(); + outputX_ = conf.output_x(); + outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); + outputs_ = outputX_ * outputX_; + + isDeconv_ = (config_.type() == "conv") ? false : true; + if (isDeconv_) { + channels_ = configNumFilters_; + numFilters_ = configChannels_; + } else { + channels_ = configChannels_; + numFilters_ = configNumFilters_; + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/ConvBaseOperator.h b/paddle/gserver/layers/ConvBaseOperator.h new file mode 100644 index 0000000000000000000000000000000000000000..2d42169cde2a80a26edcf98bc2d728e00b075728 --- /dev/null +++ b/paddle/gserver/layers/ConvBaseOperator.h @@ -0,0 +1,112 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include "Operator.h" +#include "paddle/math/MathUtils.h" +#include "paddle/math/Matrix.h" + +namespace paddle { + +/** + * @brief ConvBaseOperator takes two inputs to perform the convolution. + * The first input is the image, and the second input is the convolution kernel. + * The data heights of the two inputs are the same. Each sample of the first input + * is convolved with the corresponding sample of the second input independently. + * + * The config file api is conv_operator. + */ + +class ConvBaseOperator : public Operator {public: + ConvBaseOperator(const OperatorConfig &config, bool useGpu); + /** + * Free workspace in device and destroy cudnn tensor descriptor. + */ + virtual ~ConvBaseOperator() { + if (workSpaceInBytes_ != 0) { + hl_free_mem_device(workSpace_); + workSpaceInBytes_ = 0; + } + + hl_destroy_tensor_descriptor(imageDesc_); + hl_destroy_tensor_descriptor(outputDesc_); + hl_destroy_filter_descriptor(filterDesc_); + hl_destroy_convolution_descriptor(convDesc_); + } + +protected: + /** + * Get convolution parameters from layer config and + * initialize member variables. + */ + void getConvParams(); + + /** + * Allocate Gpu Memory for cudnn convolution algorithms. + */ + void allocConvWorkSpace(); + + /** + * Create cudnn tensor descriptor for convolution operation. + */ + void computeConvSizes(); + + /** + * Reshape cudnn tensor descriptor. + */ + void reshapeImageDescriptors(); + + /** + * Reshape cudnn tensor descriptor. + */ + virtual void reshape(int batchSize) = 0; + + /** + * Check filter size is equal to the size calculated by parameters from + * layer config. + */ + void checkFilterSize(const MatrixPtr &filter) { + CHECK_EQ(static_cast<int>(filter->getWidth()), + filterSize_ * filterSizeY_ * channels_ * numFilters_); + } + + /// Most member variables are the same as in CudnnConvLayer, + /// so they are not documented again here.
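 + /// One exception worth noting: isDeconv_ is true when the operator type is + /// not "conv"; getConvParams() then swaps channels_ and numFilters_ + /// relative to configChannels_ and configNumFilters_.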
+ bool isDeconv_; + int imageH_, imageW_, outputH_, outputW_; + hl_tensor_descriptor imageDesc_; + hl_tensor_descriptor outputDesc_; + hl_filter_descriptor filterDesc_; + hl_convolution_descriptor convDesc_; + bool caffeMode_; + int inputOffset_, outputOffset_, weightOffset_; + int numFilters_, channels_; + + /// from parsing config + int configNumFilters_, configChannels_; + int padding_, stride_, filterSize_, imgSize_, imgSizeY_; + int paddingY_, strideY_, filterSizeY_; + int imgPixels_, filterPixels_, filterChannels_, outputX_, outputY_, outputs_; + + /// The following member variables are the same as in CudnnConvLayer, + /// so they are not documented again here. + int fwdAlgo_, bwdFilterAlgo_, bwdDataAlgo_; + size_t fwdLimitBytes_, bwdDataLimitBytes_, bwdFilterLimitBytes_; + size_t workSpaceInBytes_; + void *workSpace_; + bool isSelectAlgo_; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/ConvBaseProjection.cpp b/paddle/gserver/layers/ConvBaseProjection.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d1e932ded595c90cbe6040c330c5c8663d81e2b4 --- /dev/null +++ b/paddle/gserver/layers/ConvBaseProjection.cpp @@ -0,0 +1,195 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "ConvBaseProjection.h" +#include "paddle/utils/Stat.h" + +namespace paddle { + +ThreadLocalD<std::vector<MemoryHandle*>> ConvBaseProjection::convMem_; + +ConvBaseProjection::ConvBaseProjection(const ProjectionConfig &config, + ParameterPtr parameter, + bool useGpu) + : Projection(config, parameter, useGpu) { + CHECK(useGpu); // only support GPU + getConvParams(); + initCudnn(); + + size_t height = filterH_ * filterW_ * channels_ / groups_; + size_t width = numFilters_; + weight_.reset(new Weight(height, width, parameter)); + weightOffset_ = height * width / groups_; +} + +void ConvBaseProjection::getConvParams() { + const ConvConfig &conf = config_.conv_conf(); + paddingH_ = conf.padding_y(); + paddingW_ = conf.padding(); + + strideH_ = conf.stride_y(); + strideW_ = conf.stride(); + + filterH_ = conf.filter_size_y(); + filterW_ = conf.filter_size(); + + configImgH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); + configImgW_ = conf.img_size(); + + configOutH_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); + configOutW_ = conf.output_x(); + + configChannels_ = conf.channels(); + configNumFilters_ = config_.num_filters(); + + isDeconv_ = (config_.type() == "conv") ? false : true; + + channels_ = (isDeconv_) ? configNumFilters_ : configChannels_; + numFilters_ = (isDeconv_) ?
configChannels_ : configNumFilters_; + + groups_ = conf.groups(); + CHECK_EQ(channels_ % groups_, 0); + CHECK_EQ(numFilters_ % groups_, 0); +} + +void ConvBaseProjection::initCudnn() { + hl_create_filter_descriptor(&filterDesc_, + channels_ / groups_, + numFilters_ / groups_, + filterH_, + filterW_); + hl_create_tensor_descriptor(&imageDesc_); + hl_create_tensor_descriptor(&outputDesc_); + hl_create_convolution_descriptor(&convDesc_, + imageDesc_, + filterDesc_, + paddingH_, + paddingW_, + strideH_, + strideW_); + + // initialize all to default algorithms + fwdAlgo_ = 0; + bwdFilterAlgo_ = 0; + bwdDataAlgo_ = 0; + fwdLimitBytes_ = 0; + bwdDataLimitBytes_ = 0; + bwdFilterLimitBytes_ = 0; + workSpaceInBytes_ = 0; + + batchNum_ = 0; + isSelectAlgo_ = false; +} + +void ConvBaseProjection::reshapeTensorDesc(int batchSize) { + // The stride between two consecutive samples in the output of ConvProjection + // may not be numFilters_ * outputH_ * outputW_ (conv) or + // channels_ * imageH_ * imageW_ (deconv) + // for example, in the case of layer ConcatenateLayer2 with two + // ConvProjection, the stride is the output_size of layer ConcatenateLayer2. + // So the calculation of nStride is different from CudnnConvLayer. + size_t nStrideImage, nStrideOutput; + if (isDeconv_) { + nStrideImage = out_->value->getStride(); + nStrideOutput = numFilters_ * outputH_ * outputW_; + } else { + nStrideImage = channels_ * imageH_ * imageW_; + nStrideOutput = out_->value->getStride(); + } + + hl_tensor_reshape(imageDesc_, + batchSize, + channels_ / groups_, + imageH_, + imageW_, + nStrideImage, + imageH_ * imageW_, + imageW_, + 1); + + hl_tensor_reshape(outputDesc_, + batchSize, + numFilters_ / groups_, + outputH_, + outputW_, + nStrideOutput, + outputH_ * outputW_, + outputW_, + 1); + + hl_reset_convolution_descriptor(convDesc_, + imageDesc_, + filterDesc_, + paddingH_, + paddingW_, + strideH_, + strideW_); +} + +void ConvBaseProjection::reshape(int batchSize) { + size_t width = calOutputSize(); + CHECK_EQ(width, out_->value->getWidth()); + CHECK_EQ(calInputSize(), in_->value->getWidth()); + + isSelectAlgo_ = (batchSize == batchNum_); + batchNum_ = batchSize; + + if (!isSelectAlgo_) { + reshapeTensorDesc(batchSize); + hl_conv_workspace(imageDesc_, + outputDesc_, + filterDesc_, + convDesc_, + &fwdAlgo_, + &fwdLimitBytes_, + &bwdDataAlgo_, + &bwdDataLimitBytes_, + &bwdFilterAlgo_, + &bwdFilterLimitBytes_); + + size_t maxWorkSpace = 0; + maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); + maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_); + workSpaceInBytes_ = maxWorkSpace; + + VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_ + << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_; + } + + isSelectAlgo_ = true; +} + +void *ConvBaseProjection::getSpaceBytes(size_t size) { + std::vector<MemoryHandle*> &convMem = *convMem_; + if (convMem.empty()) { + int numDevices = hl_get_device_count(); + convMem.resize(numDevices); + } + + int devId = hl_get_device(); + MemoryHandle **localMem = &(convMem[devId]); + if (NULL == *localMem || size > (*localMem)->getAllocSize()) { + *localMem = new GpuMemoryHandle(size); + } + return (*localMem)->getBuf(); +} + +ConvBaseProjection::~ConvBaseProjection() { + hl_destroy_tensor_descriptor(imageDesc_); + hl_destroy_tensor_descriptor(outputDesc_); + hl_destroy_filter_descriptor(filterDesc_); + hl_destroy_convolution_descriptor(convDesc_); +} + +} // namespace paddle diff --git a/paddle/gserver/layers/ConvBaseProjection.h
b/paddle/gserver/layers/ConvBaseProjection.h new file mode 100644 index 0000000000000000000000000000000000000000..4a33aa1837dfc36dbead60deaccbc6b772fe4754 --- /dev/null +++ b/paddle/gserver/layers/ConvBaseProjection.h @@ -0,0 +1,116 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "Projection.h" +#include "paddle/math/MathUtils.h" + +namespace paddle { + +/** + * @brief Base class for ConvProjection and ConvTransProjection. + */ +class ConvBaseProjection : public Projection {public: + /** + * Constructor. + */ + ConvBaseProjection(const ProjectionConfig& config, + ParameterPtr parameter, + bool useGpu); + + ~ConvBaseProjection(); + +protected: + void getConvParams(); + void initCudnn(); + + void reshapeTensorDesc(int batchSize); + void reshape(int batchSize); + + virtual size_t calOutputSize() = 0; + virtual size_t calInputSize() = 0; + + static void* getSpaceBytes(size_t size); + + /// True if it's a deconv projection layer, false if it's a ConvProjection layer. + bool isDeconv_; + /// imageH_ and imageW_ / outputH_ and outputW_ + /// are calculated from the input layer. + int imageH_, imageW_; + int outputH_, outputW_; + /// configImgH_ and configImgW_ / configOutH_ and configOutW_ + /// are obtained from the config. + int configImgH_, configImgW_; + int configOutH_, configOutW_; + /// channels_ and numFilters_ are defined in terms of convolution semantics. + int channels_, numFilters_; + /// configChannels_ and configNumFilters_ are obtained from the config. + /// For Conv they are the same as channels_ and numFilters_; + /// for ConvTrans they are opposite to channels_ and numFilters_. + int configChannels_, configNumFilters_; + int paddingH_, paddingW_; + int strideH_, strideW_; + int filterH_, filterW_; + /// One group offset of input data. + int inputOffset_; + /// One group offset of output data. + int outputOffset_; + /// One group offset of weight. + int weightOffset_; + int groups_; + + /// Cudnn tensor descriptor for input. + hl_tensor_descriptor imageDesc_; + /// Cudnn tensor descriptor for output. + hl_tensor_descriptor outputDesc_; + /// Cudnn filter descriptor. + hl_filter_descriptor filterDesc_; + /// Cudnn descriptor for a convolution operation. + hl_convolution_descriptor convDesc_; + + /// Record the algorithm for forward convolution, which is obtained by the + /// cudnn api that searches for the best suited algorithm. + int fwdAlgo_; + /// Record the algorithm for computing convolution gradient with respect to + /// filter coefficients. + int bwdFilterAlgo_; + /// Record the algorithm for computing convolution gradient with respect to + /// the input data. + int bwdDataAlgo_; + /// Amount of GPU memory needed as workspace to be able to execute a + /// forward convolution with the specified algo. + size_t fwdLimitBytes_; + /// Amount of GPU memory needed as workspace to be able to execute a + /// backwardData with the specified algo.
+ size_t bwdDataLimitBytes_; + /// Amount of GPU memory needed as workspace to be able to execute a + /// backwardFilter with the specified algo. + size_t bwdFilterLimitBytes_; + /// Size of total work space. + size_t workSpaceInBytes_; + + /// Whether to call cuDNN api to choose conv algorithm. + bool isSelectAlgo_; + /// batchNum is used to record batch size. If the batch size is changed, + /// the selection algorithm will be called. + int batchNum_; + bool bias_; + + std::unique_ptr<Weight> weight_; + static ThreadLocalD<std::vector<MemoryHandle*>> convMem_; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/ConvOperator.cpp b/paddle/gserver/layers/ConvOperator.cpp index f943410dee0dc2f3d356c9d7d8f61398fe2871c8..80932c8c509e3cb013c7e0051cbf4d8ccced0228 100644 --- a/paddle/gserver/layers/ConvOperator.cpp +++ b/paddle/gserver/layers/ConvOperator.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "Operator.h" +#include "ConvOperator.h" #include "paddle/math/MathUtils.h" #include "paddle/math/Matrix.h" @@ -27,120 +27,8 @@ namespace paddle { * The config file api is conv_operator. */ -class ConvOperator : public Operator {-public: - ConvOperator(const OperatorConfig &config, bool useGpu); - /** - * Free workspace in device and destroy cudnn tensor descriptor. - */ - virtual ~ConvOperator() { - if (workSpaceInBytes_ != 0) { - hl_free_mem_device(workSpace_); - workSpaceInBytes_ = 0; - } - - hl_destroy_tensor_descriptor(inputDesc_); - hl_destroy_tensor_descriptor(outputDesc_); - hl_destroy_filter_descriptor(filterDesc_); - hl_destroy_convolution_descriptor(convDesc_); - } - virtual void forward(); - virtual void backward(); - -private: - /** - * Get convolution parameters from layer config and - * initialize member variables. - */ - void getConvParams(); - - /** - * Allocate Gpu Memory for cudnn convolution algorithms. - */ - void allocConvWorkSpace(size_t maxWorkSpace); - - /** - * Create cudnn tensor descriptor for convolution operation. - */ - void computeConvSizes(); - - /** - * Reshape cudnn tensor descriptor. - */ - void reshapeImageDescriptors(); - - /** - * Reshape cudnn tensor descriptor. - */ - void reshape(int batchSize); - - /** - * Check filter size is equal to the size calculated by parameters from - * layer config. - */ - void checkFilterSize(const MatrixPtr &filter) { - CHECK_EQ(static_cast<int>(filter->getWidth()), - filterSize_ * filterSizeY_ * channels_ * numFilters_); - } - - /// Most of member variables are same with CudnnConvLayer. - /// There is no explanation here. - int imageH_, imageW_, outputH_, outputW_; - hl_tensor_descriptor inputDesc_; - hl_tensor_descriptor outputDesc_; - hl_filter_descriptor filterDesc_; - hl_convolution_descriptor convDesc_; - bool caffeMode_; - int inputOffset_, outputOffset_, weightOffset_; - int numFilters_; - int padding_, stride_, filterSize_, channels_, imgSize_, imgSizeY_; - int paddingY_, strideY_, filterSizeY_; - int imgPixels_, filterPixels_, filterChannels_, outputX_, outputY_, outputs_; - - /// Following member variables are same with CudnnConvLayer. - /// There is no explanation here.
- int fwdAlgo_, bwdFilterAlgo_, bwdDataAlgo_; - size_t fwdLimitBytes_, bwdDataLimitBytes_, bwdFilterLimitBytes_; - size_t workSpaceInBytes_; - void *workSpace_; - bool isSelectAlgo_; -}; - REGISTER_OPERATOR(conv, ConvOperator); -ConvOperator::ConvOperator(const OperatorConfig &config, bool useGpu) - : Operator(config, useGpu) { - CHECK(useGpu); - CHECK_EQ(config_.input_indices_size(), 2L); - - caffeMode_ = true; - getConvParams(); - computeConvSizes(); - - // initialize all to default algorithms - fwdAlgo_ = 0; - bwdFilterAlgo_ = 0; - bwdDataAlgo_ = 0; - fwdLimitBytes_ = 0; - bwdDataLimitBytes_ = 0; - bwdFilterLimitBytes_ = 0; - workSpaceInBytes_ = 0; - workSpace_ = nullptr; - - isSelectAlgo_ = false; -} - -void ConvOperator::allocConvWorkSpace(size_t maxWorkSpace) { - if (maxWorkSpace > workSpaceInBytes_) { - if (workSpaceInBytes_ != 0) { - hl_free_mem_device(workSpace_); - } - // total amount of storage needed - workSpace_ = hl_malloc_device(maxWorkSpace); - workSpaceInBytes_ = maxWorkSpace; - } -} - void ConvOperator::reshape(int batchSize) { imageH_ = ins_[0]->getFrameHeight(); imageW_ = ins_[0]->getFrameWidth(); @@ -148,106 +36,25 @@ void ConvOperator::reshape(int batchSize) { if (imageW_ == 0) imageW_ = imgSize_; outputH_ = outputSize(imageH_, filterSizeY_, paddingY_, strideY_, caffeMode_); outputW_ = outputSize(imageW_, filterSize_, padding_, stride_, caffeMode_); - + /// Check that the outputSizes are consistent with config + CHECK_EQ(outputH_, outputY_); + CHECK_EQ(outputW_, outputX_); out_->setFrameHeight(outputH_); out_->setFrameWidth(outputW_); reshapeImageDescriptors(); - if (!isSelectAlgo_) { - hl_conv_workspace(inputDesc_, - outputDesc_, - filterDesc_, - convDesc_, - &fwdAlgo_, - &fwdLimitBytes_, - &bwdDataAlgo_, - &bwdDataLimitBytes_, - &bwdFilterAlgo_, - &bwdFilterLimitBytes_); - - size_t maxWorkSpace = 0; - maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); - maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_); + inputOffset_ = channels_ * imageH_ * imageW_; + outputOffset_ = numFilters_ * outputH_ * outputW_; + weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSizeY_; - allocConvWorkSpace(maxWorkSpace); + if (!isSelectAlgo_) { + allocConvWorkSpace(); } isSelectAlgo_ = true; } -void ConvOperator::computeConvSizes() { - hl_create_filter_descriptor( - &filterDesc_, channels_, numFilters_, filterSizeY_, filterSize_); - hl_create_tensor_descriptor(&inputDesc_); - int outputX = - outputSize(imgSize_, filterSize_, padding_, stride_, caffeMode_); - int outputY = - outputSize(imgSizeY_, filterSizeY_, paddingY_, strideY_, caffeMode_); - CHECK_EQ(outputX, outputX_); - CHECK_EQ(outputY, outputY_); - hl_create_tensor_descriptor(&outputDesc_); - hl_create_convolution_descriptor(&convDesc_, - inputDesc_, - filterDesc_, - paddingY_, - padding_, - strideY_, - stride_); -} - -void ConvOperator::reshapeImageDescriptors() { - hl_tensor_reshape(inputDesc_, - 1, - channels_, - imageH_, - imageW_, - channels_ * imageH_ * imageW_, - imageH_ * imageW_, - imageW_, - 1); - hl_tensor_reshape(outputDesc_, - 1, - numFilters_, - outputH_, - outputW_, - numFilters_ * outputH_ * outputW_, - outputH_ * outputW_, - outputW_, - 1); - hl_reset_convolution_descriptor(convDesc_, - inputDesc_, - filterDesc_, - paddingY_, - padding_, - strideY_, - stride_); - inputOffset_ = channels_ * imageH_ * imageW_; - outputOffset_ = numFilters_ * outputH_ * outputW_; - weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSize_; -} - -void ConvOperator::getConvParams() { - numFilters_ 
= config_.num_filters(); - const ConvConfig &conf = config_.conv_conf(); - padding_ = conf.padding(); - stride_ = conf.stride(); - filterSize_ = conf.filter_size(); - paddingY_ = conf.padding_y(); - strideY_ = conf.stride_y(); - filterSizeY_ = conf.filter_size_y(); - filterPixels_ = filterSize_ * filterSizeY_; - channels_ = conf.channels(); - imgSize_ = conf.img_size(); - imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); - imgPixels_ = imgSize_ * imgSizeY_; - CHECK_EQ(conf.groups(), 1U); - filterChannels_ = conf.filter_channels(); - outputX_ = conf.output_x(); - outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x(); - outputs_ = outputX_ * outputX_; -} - void ConvOperator::forward() { size_t batchSize = ins_[0]->value->getHeight(); reshape(batchSize); @@ -264,7 +71,7 @@ void ConvOperator::forward() { real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId; real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId; real *outData = out_->value->getData() + outputOffset_ * batchId; - hl_convolution_forward(inputDesc_, + hl_convolution_forward(imageDesc_, inputData, outputDesc_, outData, @@ -287,7 +94,7 @@ void ConvOperator::backward() { if (ins_[1]->grad) { real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId; real *weightGrad = ins_[1]->grad->getData() + weightOffset_ * batchId; - hl_convolution_backward_filter(inputDesc_, + hl_convolution_backward_filter(imageDesc_, inputData, outputDesc_, outGrad, @@ -303,7 +110,7 @@ if (NULL != preGrad) { real *inputGrad = preGrad->getData() + inputOffset_ * batchId; real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId; - hl_convolution_backward_data(inputDesc_, + hl_convolution_backward_data(imageDesc_, inputGrad, outputDesc_, outGrad, diff --git a/paddle/gserver/layers/ConvOperator.h b/paddle/gserver/layers/ConvOperator.h new file mode 100644 index 0000000000000000000000000000000000000000..0f3546c67ac174628044d5fb6e5c7bce06f37995 --- /dev/null +++ b/paddle/gserver/layers/ConvOperator.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#pragma once + +#include "ConvBaseOperator.h" +#include "paddle/math/MathUtils.h" +#include "paddle/math/Matrix.h" + +namespace paddle { + +/** + * @brief ConvOperator takes two inputs to perform the convolution. + * The first input is the image, and the second input is the convolution kernel. + * The data heights of the two inputs are the same. Each sample of the first input + * is convolved with the corresponding sample of the second input independently. + * + * The config file api is conv_operator. + */ + +class ConvOperator : public ConvBaseOperator {public: + ConvOperator(const OperatorConfig &config, bool useGpu) + : ConvBaseOperator(config, useGpu) {} + /** + * Free workspace in device and destroy cudnn tensor descriptor.
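 + * (The actual cleanup happens in ~ConvBaseOperator; this destructor is + * intentionally empty.)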
+ */ + virtual ~ConvOperator() {} + void forward() override; + void backward() override; + void reshape(int batchSize) override; +}; + +} // namespace paddle diff --git a/paddle/gserver/layers/ConvProjection.cpp b/paddle/gserver/layers/ConvProjection.cpp index 0281170bc59855f6f4d2f4212523275a92d202d5..5b7ecc5560c1e7431305b34a331fe1fbc96c6b06 100644 --- a/paddle/gserver/layers/ConvProjection.cpp +++ b/paddle/gserver/layers/ConvProjection.cpp @@ -19,149 +19,32 @@ namespace paddle { REGISTER_PROJECTION(conv, ConvProjection); -ThreadLocalD<std::vector<MemoryHandle*>> ConvProjection::convMem_; - -ConvProjection::ConvProjection(const ProjectionConfig &config, - ParameterPtr parameter, - bool useGpu) - : Projection(config, parameter, useGpu) { - CHECK(useGpu); // only support GPU - getConvParams(); - initCudnn(); - - size_t height = filterH_ * filterW_ * channels_ / groups_; - size_t width = numFilters_; - weight_.reset(new Weight(height, width, parameter)); - weightOffset_ = height * width / groups_; -} - -void ConvProjection::getConvParams() { - const ConvConfig &conf = config_.conv_conf(); - paddingH_ = conf.padding_y(); - paddingW_ = conf.padding(); - - strideH_ = conf.stride_y(); - strideW_ = conf.stride(); - - filterH_ = conf.filter_size_y(); - filterW_ = conf.filter_size(); - - configImgH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size(); - configImgW_ = conf.img_size(); - - channels_ = conf.channels(); - numFilters_ = config_.num_filters(); - - groups_ = conf.groups(); - CHECK_EQ(channels_ % groups_, 0); - CHECK_EQ(numFilters_ % groups_, 0); -} - -void ConvProjection::initCudnn() { - hl_create_filter_descriptor(&filterDesc_, - channels_ / groups_, - numFilters_ / groups_, - filterH_, - filterW_); - hl_create_tensor_descriptor(&inputDesc_); - hl_create_tensor_descriptor(&outputDesc_); - hl_create_convolution_descriptor(&convDesc_, - inputDesc_, - filterDesc_, - paddingH_, - paddingW_, - strideH_, - strideW_); - - // initialize all to default algorithms - fwdAlgo_ = 0; - bwdFilterAlgo_ = 0; - bwdDataAlgo_ = 0; - fwdLimitBytes_ = 0; - bwdDataLimitBytes_ = 0; - bwdFilterLimitBytes_ = 0; - workSpaceInBytes_ = 0; - - batchNum_ = 0; - isSelectAlgo_ = false; -} - -void ConvProjection::reshapeTensorDesc(int batchSize) { - hl_tensor_reshape(inputDesc_, - batchSize, - channels_ / groups_, - imageH_, - imageW_, - channels_ * imageH_ * imageW_, - imageH_ * imageW_, - imageW_, - 1); - hl_reset_convolution_descriptor(convDesc_, - inputDesc_, - filterDesc_, - paddingH_, - paddingW_, - strideH_, - strideW_); - - // The stride between two consecutive images in ConvProjection may not be 1, - // for example, in the case of layer ConcatenateLayer2 with two - // ConvProjection, the stride is the output_size of layer ConcatenateLayer2. - // So the calculation of nStride is different from CudnnConvLayer. - // In fact, only "nStride = out_->value->getStride()" is ok.
- size_t nStride = numFilters_ * outputH_ * outputW_; - if (out_->value->isContiguous()) { - CHECK_EQ(nStride, out_->value->getWidth()); - } else { - nStride = out_->value->getStride(); - } - - hl_tensor_reshape(outputDesc_, - batchSize, - numFilters_ / groups_, - outputH_, - outputW_, - nStride, - outputH_ * outputW_, - outputW_, - 1); +size_t ConvProjection::calOutputSize() { + imageH_ = in_->getFrameHeight(); + imageW_ = in_->getFrameWidth(); + if (imageH_ == 0) imageH_ = configImgH_; + if (imageW_ == 0) imageW_ = configImgW_; + outputH_ = outputSize(imageH_, + filterH_, + paddingH_, + strideH_, + /* caffeMode */ true); + outputW_ = outputSize(imageW_, + filterW_, + paddingW_, + strideW_, + /* caffeMode */ true); + + const_cast<Argument*>(out_)->setFrameHeight(outputH_); + const_cast<Argument*>(out_)->setFrameWidth(outputW_); + + inputOffset_ = (configChannels_ / groups_) * imageH_ * imageW_; + outputOffset_ = (configNumFilters_ / groups_) * outputH_ * outputW_; + return outputH_ * outputW_ * configNumFilters_; } -void ConvProjection::reshape(int batchSize) { - size_t width = calOutputSize(); - CHECK_EQ(width, out_->value->getWidth()); - CHECK_EQ(static_cast<size_t>(channels_ * imageH_ * imageW_), - in_->value->getWidth()) - << "Wrong input size for convolution" - << " channels=" << channels_ << " imageH=" << imageH_ - << " imageW=" << imageW_ << " inputSize=" << in_->value->getWidth(); - - isSelectAlgo_ = (batchSize == batchNum_); - batchNum_ = batchSize; - - if (!isSelectAlgo_) { - reshapeTensorDesc(batchSize); - hl_conv_workspace(inputDesc_, - outputDesc_, - filterDesc_, - convDesc_, - &fwdAlgo_, - &fwdLimitBytes_, - &bwdDataAlgo_, - &bwdDataLimitBytes_, - &bwdFilterAlgo_, - &bwdFilterLimitBytes_); - - size_t maxWorkSpace = 0; - maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_); - maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_); - workSpaceInBytes_ = maxWorkSpace; - - VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_ - << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_; - } - - isSelectAlgo_ = true; +size_t ConvProjection::calInputSize() { + return static_cast<size_t>(configChannels_ * imageH_ * imageW_); } void ConvProjection::forward() { @@ -179,7 +62,7 @@ void ConvProjection::forward() { real *inputData = in_->value->getData() + g * inputOffset_; real *wgtData = weight_->getW()->getData() + g * weightOffset_; real *outData = out_->value->getData() + g * outputOffset_; - hl_convolution_forward(inputDesc_, + hl_convolution_forward(imageDesc_, inputData, outputDesc_, outData, @@ -205,7 +88,7 @@ void ConvProjection::backward(const UpdateCallback &callback) { if (weight_->getWGrad()) { real *inputData = in_->value->getData() + g * inputOffset_; real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_; - hl_convolution_backward_filter(inputDesc_, + hl_convolution_backward_filter(imageDesc_, inputData, outputDesc_, outGrad, @@ -221,7 +104,7 @@ if (NULL != preGrad) { real *inputGrad = preGrad->getData() + g * inputOffset_; real *wgtData = weight_->getW()->getData() + g * weightOffset_; - hl_convolution_backward_data(inputDesc_, + hl_convolution_backward_data(imageDesc_, inputGrad, outputDesc_, outGrad, @@ -237,26 +120,4 @@ void ConvProjection::backward(const UpdateCallback &callback) { weight_->getParameterPtr()->incUpdate(callback); } -void *ConvProjection::getSpaceBytes(size_t size) { - std::vector<MemoryHandle*> &convMem = *convMem_; - if (convMem.empty()) { - int numDevices = hl_get_device_count(); -
convMem.resize(numDevices); - } - - int devId = hl_get_device(); - MemoryHandle **localMem = &(convMem[devId]); - if (NULL == *localMem || size > (*localMem)->getAllocSize()) { - *localMem = new GpuMemoryHandle(size); - } - return (*localMem)->getBuf(); -} - -ConvProjection::~ConvProjection() { - hl_destroy_tensor_descriptor(inputDesc_); - hl_destroy_tensor_descriptor(outputDesc_); - hl_destroy_filter_descriptor(filterDesc_); - hl_destroy_convolution_descriptor(convDesc_); -} - } // namespace paddle diff --git a/paddle/gserver/layers/ConvProjection.h b/paddle/gserver/layers/ConvProjection.h index c32e5e1d3ab2f85feb6dd2fb5fbddd7482598e58..b7d7cc9a275529a02a5d8e82d28ed79cb7ce0b43 100644 --- a/paddle/gserver/layers/ConvProjection.h +++ b/paddle/gserver/layers/ConvProjection.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#include "Projection.h" +#include "ConvBaseProjection.h" #include "paddle/math/MathUtils.h" namespace paddle { @@ -22,109 +22,22 @@ namespace paddle { /** * @brief Convolution projection do the same calculation with CudnnConvLayer. */ -class ConvProjection : public Projection { +class ConvProjection : public ConvBaseProjection { public: /** * Constructor. */ ConvProjection(const ProjectionConfig& config, ParameterPtr parameter, - bool useGpu); + bool useGpu) + : ConvBaseProjection(config, parameter, useGpu) {} - ~ConvProjection(); + ~ConvProjection() {} virtual void forward(); virtual void backward(const UpdateCallback& callback); - -protected: - void getConvParams(); - void initCudnn(); - - void reshapeTensorDesc(int batchSize); - void reshape(int batchSize); - - size_t calOutputSize() { - imageH_ = in_->getFrameHeight(); - imageW_ = in_->getFrameWidth(); - if (imageH_ == 0) imageH_ = configImgH_; - if (imageW_ == 0) imageW_ = configImgW_; - outputH_ = outputSize(imageH_, - filterH_, - paddingH_, - strideH_, - /* caffeMode */ true); - outputW_ = outputSize(imageW_, - filterW_, - paddingW_, - strideW_, - /* caffeMode */ true); - - const_cast(out_)->setFrameHeight(outputH_); - const_cast(out_)->setFrameWidth(outputW_); - - inputOffset_ = (channels_ / groups_) * imageH_ * imageW_; - outputOffset_ = (numFilters_ / groups_) * outputH_ * outputW_; - return outputH_ * outputW_ * numFilters_; - } - - static void* getSpaceBytes(size_t size); - - /// imageH_ and imageW_ is calculated from the input layer. - int imageH_, imageW_; - /// configImgH_ and configImgW_ is obtained from config. - int configImgH_, configImgW_; - int outputH_, outputW_; - int channels_, numFilters_; - int paddingH_, paddingW_; - int strideH_, strideW_; - int filterH_, filterW_; - /// One group offset of input data. - int inputOffset_; - /// One group offset of output data. - int outputOffset_; - /// One group offset of weight. - int weightOffset_; - int groups_; - - /// Cudnn tensor descriptor for input. - hl_tensor_descriptor inputDesc_; - /// Cudnn tensor descriptor for output. - hl_tensor_descriptor outputDesc_; - /// Cudnn tensor descriptor for filter. - hl_filter_descriptor filterDesc_; - /// Cudnn tensor descriptor for a convolution operation. - hl_convolution_descriptor convDesc_; - - /// Record the algorithm for forward convolution, which is obtained by cudnn - /// api to search the best suited algorithm. - int fwdAlgo_; - /// Record the algorithm for computing convolution gradient with respect to - /// filter coefficients. - int bwdFilterAlgo_; - /// Record the algorithm for computing convolution gradient with respect to - /// the output. 
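As the workspace bookkeeping is deleted here and re-homed in `ConvBaseProjection`, note the sizing policy the deleted `reshape()` used: one buffer shared by all three cuDNN algorithms. A sketch of that policy (hypothetical helper name; the real per-algorithm limits come from `hl_conv_workspace`):

```cpp
#include <algorithm>
#include <cstddef>

// A single workspace is shared by the forward, backward-data and
// backward-filter algorithms, so it must hold the largest requirement.
size_t sharedConvWorkspaceBytes(size_t fwdLimitBytes,
                                size_t bwdDataLimitBytes,
                                size_t bwdFilterLimitBytes) {
  return std::max({fwdLimitBytes, bwdDataLimitBytes, bwdFilterLimitBytes});
}
```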
-  int bwdDataAlgo_;
-  /// Amount of GPU memory needed as workspace to be able to execute a
-  /// forward convolution with the specified algo.
-  size_t fwdLimitBytes_;
-  /// Amount of GPU memory needed as workspace to be able to execute a
-  /// backwardFilter with the specified algo.
-  size_t bwdDataLimitBytes_;
-  /// Amount of GPU memory needed as workspace to be able to execute a
-  /// backwardData with the specified algo.
-  size_t bwdFilterLimitBytes_;
-  /// Size of total work space.
-  size_t workSpaceInBytes_;
-
-  /// Whether to call cuDNN api to choose conv algorithm.
-  bool isSelectAlgo_;
-  /// batchNum is used to record batch size. If the batch size is changed,
-  /// the selection algorithm will be called.
-  int batchNum_;
-  bool bias_;
-
-  std::unique_ptr<Weight> weight_;
-  static ThreadLocalD<std::vector<MemoryHandle *>> convMem_;
+  virtual size_t calOutputSize();
+  virtual size_t calInputSize();
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/ConvTransOperator.cpp b/paddle/gserver/layers/ConvTransOperator.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..db026337a473f7edf1a7c0db320f60ff3048eb9c
--- /dev/null
+++ b/paddle/gserver/layers/ConvTransOperator.cpp
@@ -0,0 +1,125 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ConvTransOperator.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief ConvTransOperator takes two inputs to perform the transposed
+ * convolution. The first input is the image, and the second input is the
+ * convolution kernel. The heights of the data of the two inputs are the
+ * same. Each data of the first input is convolved with each data of the
+ * second input independently.
+ *
+ * The config file api is conv_operator.
+ */ + +REGISTER_OPERATOR(convt, ConvTransOperator); + +void ConvTransOperator::reshape(int batchSize) { + outputH_ = ins_[0]->getFrameHeight(); + outputW_ = ins_[0]->getFrameWidth(); + if (outputH_ == 0) outputH_ = outputY_; + if (outputW_ == 0) outputW_ = outputX_; + imageH_ = imageSize(outputH_, filterSizeY_, paddingY_, strideY_, caffeMode_); + imageW_ = imageSize(outputW_, filterSize_, padding_, stride_, caffeMode_); + /// Check that the imageSizes are consistent with config + CHECK_EQ(imageH_, imgSizeY_); + CHECK_EQ(imageW_, imgSize_); + out_->setFrameHeight(imageH_); + out_->setFrameWidth(imageW_); + + reshapeImageDescriptors(); + + inputOffset_ = numFilters_ * outputH_ * outputW_; + outputOffset_ = channels_ * imageH_ * imageW_; + weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSizeY_; + + if (!isSelectAlgo_) { + allocConvWorkSpace(); + } + + isSelectAlgo_ = true; +} + +void ConvTransOperator::forward() { + size_t batchSize = ins_[0]->value->getHeight(); + reshape(batchSize); + CHECK_EQ(ins_[1]->value->getHeight(), batchSize); + checkFilterSize(ins_[1]->value); + Matrix::resizeOrCreate( + out_->value, batchSize, imageH_ * imageW_ * channels_, false, useGpu_); + { + AsyncGpuBlock block; + for (size_t batchId = 0; batchId < batchSize; ++batchId) { + real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId; + real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId; + real *outData = out_->value->getData() + outputOffset_ * batchId; + hl_convolution_backward_data(imageDesc_, + outData, + outputDesc_, + inputData, + filterDesc_, + wgtData, + convDesc_, + workSpace_, + workSpaceInBytes_, + bwdDataAlgo_); + } + } +} + +void ConvTransOperator::backward() { + size_t batchSize = ins_[0]->value->getHeight(); + { + AsyncGpuBlock block; + for (size_t batchId = 0; batchId < batchSize; ++batchId) { + real *outGrad = out_->grad->getData() + outputOffset_ * batchId; + if (ins_[1]->grad) { + real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId; + real *weightGrad = ins_[1]->grad->getData() + weightOffset_ * batchId; + hl_convolution_backward_filter(imageDesc_, + outGrad, + outputDesc_, + inputData, + filterDesc_, + weightGrad, + convDesc_, + workSpace_, + workSpaceInBytes_, + bwdFilterAlgo_); + } + + MatrixPtr preGrad = ins_[0]->grad; + if (NULL != preGrad) { + real *inputGrad = preGrad->getData() + inputOffset_ * batchId; + real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId; + hl_convolution_forward(imageDesc_, + outGrad, + outputDesc_, + inputGrad, + filterDesc_, + wgtData, + convDesc_, + workSpace_, + workSpaceInBytes_, + fwdAlgo_); + } + } + } +} + +} // namespace paddle diff --git a/paddle/gserver/layers/ConvTransOperator.h b/paddle/gserver/layers/ConvTransOperator.h new file mode 100644 index 0000000000000000000000000000000000000000..ca08dc9aa77d59b45635c16cdd5064c5c3b5f96d --- /dev/null +++ b/paddle/gserver/layers/ConvTransOperator.h @@ -0,0 +1,44 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+
+#include "ConvBaseOperator.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+/**
+ * @brief ConvTransOperator takes two inputs to perform the transposed
+ * convolution. The first input is the image, and the second input is the
+ * convolution kernel. The heights of the data of the two inputs are the
+ * same. Each data of the first input is convolved with each data of the
+ * second input independently.
+ *
+ * The config file api is conv_operator.
+ */
+
+class ConvTransOperator : public ConvBaseOperator {
+public:
+  ConvTransOperator(const OperatorConfig &config, bool useGpu)
+      : ConvBaseOperator(config, useGpu) {}
+  /**
+   * Free workspace in device and destroy cudnn tensor descriptor.
+   */
+  virtual ~ConvTransOperator() {}
+  void forward() override;
+  void backward() override;
+  void reshape(int batchSize) override;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvTransProjection.cpp b/paddle/gserver/layers/ConvTransProjection.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..48132a3ce4cc4b50fea6d755d84d7254d2055bec
--- /dev/null
+++ b/paddle/gserver/layers/ConvTransProjection.cpp
@@ -0,0 +1,123 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
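`ConvTransProjection::forward` in the file that follows calls `hl_convolution_backward_data`, because the forward pass of a transposed convolution is exactly the backward-data pass of an ordinary convolution. A naive 1-D illustration of that adjoint relationship (hypothetical helper; assumes stride 1, no padding, a single channel, and non-empty inputs):

```cpp
#include <vector>

// Each input element scatters into the output through the filter; this
// scatter-add is the adjoint of the gather an ordinary convolution does.
std::vector<float> convTransForward1D(const std::vector<float> &in,
                                      const std::vector<float> &filter) {
  std::vector<float> out(in.size() + filter.size() - 1, 0.f);
  for (size_t i = 0; i < in.size(); ++i) {
    for (size_t k = 0; k < filter.size(); ++k) {
      out[i + k] += in[i] * filter[k];
    }
  }
  return out;
}
```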
+
+#include "ConvTransProjection.h"
+#include "paddle/utils/Stat.h"
+
+namespace paddle {
+
+REGISTER_PROJECTION(convt, ConvTransProjection);
+size_t ConvTransProjection::calOutputSize() {
+  outputH_ = in_->getFrameHeight();
+  outputW_ = in_->getFrameWidth();
+  if (outputH_ == 0) outputH_ = configOutH_;
+  if (outputW_ == 0) outputW_ = configOutW_;
+  imageH_ = imageSize(outputH_,
+                      filterH_,
+                      paddingH_,
+                      strideH_,
+                      /* caffeMode */ true);
+
+  imageW_ = imageSize(outputW_,
+                      filterW_,
+                      paddingW_,
+                      strideW_,
+                      /* caffeMode */ true);
+
+  const_cast<Argument *>(out_)->setFrameHeight(imageH_);
+  const_cast<Argument *>(out_)->setFrameWidth(imageW_);
+
+  inputOffset_ = (configChannels_ / groups_) * outputH_ * outputW_;
+  outputOffset_ = (configNumFilters_ / groups_) * imageH_ * imageW_;
+  return imageH_ * imageW_ * configNumFilters_;
+}
+
+size_t ConvTransProjection::calInputSize() {
+  return static_cast<size_t>(configChannels_ * outputH_ * outputW_);
+}
+
+void ConvTransProjection::forward() {
+  int batchSize = in_->value->getHeight();
+  reshape(batchSize);
+
+  void *workSpace = NULL;
+  if (workSpaceInBytes_ > 0) {
+    workSpace = getSpaceBytes(workSpaceInBytes_);
+  }
+
+  for (int g = 0; g < groups_; ++g) {
+    REGISTER_TIMER_INFO("CudnnConvTransFwTimer", getName().c_str());
+
+    real *inData = in_->value->getData() + g * inputOffset_;
+    real *wgtData = weight_->getW()->getData() + g * weightOffset_;
+    real *outData = out_->value->getData() + g * outputOffset_;
+    hl_convolution_backward_data(imageDesc_,
+                                 outData,
+                                 outputDesc_,
+                                 inData,
+                                 filterDesc_,
+                                 wgtData,
+                                 convDesc_,
+                                 workSpace,
+                                 bwdDataLimitBytes_,
+                                 bwdDataAlgo_);
+  }
+}
+
+void ConvTransProjection::backward(const UpdateCallback &callback) {
+  REGISTER_TIMER_INFO("CudnnConvTransBpTimer", getName().c_str());
+
+  void *workSpace = NULL;
+  if (workSpaceInBytes_ > 0) {
+    workSpace = getSpaceBytes(workSpaceInBytes_);
+  }
+
+  for (int g = 0; g < groups_; ++g) {
+    real *outGrad = out_->grad->getData() + g * outputOffset_;
+    if (weight_->getWGrad()) {
+      real *inData = in_->value->getData() + g * inputOffset_;
+      real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_;
+      hl_convolution_backward_filter(imageDesc_,
+                                     outGrad,
+                                     outputDesc_,
+                                     inData,
+                                     filterDesc_,
+                                     weightGrad,
+                                     convDesc_,
+                                     workSpace,
+                                     bwdFilterLimitBytes_,
+                                     bwdFilterAlgo_);
+    }
+
+    MatrixPtr preGrad = in_->grad;
+    if (NULL != preGrad) {
+      real *inGrad = preGrad->getData() + g * inputOffset_;
+      real *wgtData = weight_->getW()->getData() + g * weightOffset_;
+      hl_convolution_forward(imageDesc_,
+                             outGrad,
+                             outputDesc_,
+                             inGrad,
+                             filterDesc_,
+                             wgtData,
+                             convDesc_,
+                             workSpace,
+                             fwdLimitBytes_,
+                             fwdAlgo_);
+    }
+  }
+
+  weight_->getParameterPtr()->incUpdate(callback);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/ConvTransProjection.h b/paddle/gserver/layers/ConvTransProjection.h
new file mode 100644
index 0000000000000000000000000000000000000000..6508d17b2409aa0cc11cdafb306604816f010718
--- /dev/null
+++ b/paddle/gserver/layers/ConvTransProjection.h
@@ -0,0 +1,43 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
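`ConvTransProjection::calOutputSize` above recovers `imageH_`/`imageW_` from the output size by inverting the caffe-mode rule sketched earlier; `convImageSize` below is a hypothetical stand-in for the `imageSize` helper in `paddle/math/MathUtils`:

```cpp
// Inverse of the caffe-mode output-size rule: the image size that an
// ordinary convolution would have reduced to the given output size.
//   image = (output - 1) * stride + filter - 2 * padding
int convImageSize(int outputSize, int filterSize, int padding, int stride) {
  return (outputSize - 1) * stride + filterSize - 2 * padding;
}
```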
+#pragma once
+
+#include "ConvBaseProjection.h"
+#include "paddle/math/MathUtils.h"
+
+namespace paddle {
+
+/**
+ * @brief Convolution transpose projection; it does the same calculation as
+ * the cudnn_convt layer.
+ */
+class ConvTransProjection : public ConvBaseProjection {
+public:
+  /**
+   * Constructor.
+   */
+  ConvTransProjection(const ProjectionConfig& config,
+                      ParameterPtr parameter,
+                      bool useGpu)
+      : ConvBaseProjection(config, parameter, useGpu) {}
+
+  ~ConvTransProjection() {}
+
+  virtual void forward();
+  virtual void backward(const UpdateCallback& callback);
+  virtual size_t calOutputSize();
+  virtual size_t calInputSize();
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/CostLayer.cpp b/paddle/gserver/layers/CostLayer.cpp
index 998b8d7d3034cb18fbab242c66656092bfc50fcb..4ae5b828707eb8412e98cbefcf3949d62e81ad1e 100644
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -192,6 +192,59 @@ void SumOfSquaresCostLayer::backwardImp(Matrix& output,
   outputG.sumOfSquaresBp(output, *label.value);
 }
 
+//
+// class SmoothL1CostLayer
+//
+
+REGISTER_LAYER(smooth_l1, SmoothL1CostLayer);
+
+bool SmoothL1CostLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  return CostLayer::init(layerMap, parameterMap);
+}
+
+void SmoothL1CostLayer::forwardImp(Matrix& output,
+                                   Argument& label,
+                                   Matrix& target) {
+  MatrixPtr targetCpu, outputCpu, labelCpu;
+  if (useGpu_) {
+    targetCpu =
+        Matrix::create(target.getHeight(), target.getWidth(), false, false);
+    outputCpu =
+        Matrix::create(output.getHeight(), output.getWidth(), false, false);
+    labelCpu = Matrix::create(
+        label.value->getHeight(), label.value->getWidth(), false, false);
+    targetCpu->copyFrom(target);
+    outputCpu->copyFrom(output);
+    labelCpu->copyFrom(*label.value);
+    targetCpu->smoothL1(*outputCpu, *(labelCpu));
+    target.copyFrom(*targetCpu);
+  } else {
+    target.smoothL1(output, *label.value);
+  }
+}
+
+void SmoothL1CostLayer::backwardImp(Matrix& output,
+                                    Argument& label,
+                                    Matrix& outputG) {
+  MatrixPtr outputGCpu, outputCpu, labelCpu;
+  if (useGpu_) {
+    outputGCpu =
+        Matrix::create(outputG.getHeight(), outputG.getWidth(), false, false);
+    outputCpu =
+        Matrix::create(output.getHeight(), output.getWidth(), false, false);
+    labelCpu = Matrix::create(
+        label.value->getHeight(), label.value->getWidth(), false, false);
+    outputGCpu->copyFrom(outputG);
+    outputCpu->copyFrom(output);
+    labelCpu->copyFrom(*label.value);
+    outputGCpu->smoothL1Bp(*outputCpu, *labelCpu);
+    outputG.copyFrom(*outputGCpu);
+  } else {
+    outputG.smoothL1Bp(output, *label.value);
+  }
+}
+
 //
 // class RankingCost
 //
diff --git a/paddle/gserver/layers/CostLayer.h b/paddle/gserver/layers/CostLayer.h
index b3045e0b31308abf2caa90cbd21f105e685ef341..569a6840f0d4432cc827219f590b821df115c7ea 100644
--- a/paddle/gserver/layers/CostLayer.h
+++ b/paddle/gserver/layers/CostLayer.h
@@ -159,6 +159,29 @@ public:
                   Matrix& outputGrad) override;
 };
 
+/**
+ * This cost layer computes the smooth L1 loss for real-valued regression
+ * tasks.
+ * \f[
+ * L = \begin{cases}
+ *       0.5 \, (output - label)^2, & \text{if } |output - label| < 1 \\
+ *       |output - label| - 0.5,    & \text{otherwise}
+ *     \end{cases}
+ * \f]
+ */
+class SmoothL1CostLayer : public CostLayer {
+public:
+  explicit SmoothL1CostLayer(const LayerConfig& config) : CostLayer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
+
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
+};
+
 /**
  * A cost layer for learning to rank (LTR) task. This layer contains at least
  * three inputs.
diff --git a/paddle/gserver/layers/CrossChannelNormLayer.cpp b/paddle/gserver/layers/CrossChannelNormLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3fbccc11032caa4878ce8dcfe7c34a261acee68b
--- /dev/null
+++ b/paddle/gserver/layers/CrossChannelNormLayer.cpp
@@ -0,0 +1,122 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+#include "NormLayer.h"
+#include "paddle/math/BaseMatrix.h"
+#include "paddle/math/Matrix.h"
+
+namespace paddle {
+
+MatrixPtr CrossChannelNormLayer::createSampleMatrix(MatrixPtr data,
+                                                    size_t iter,
+                                                    size_t spatialDim) {
+  return Matrix::create(data->getData() + iter * channels_ * spatialDim,
+                        channels_,
+                        spatialDim,
+                        false,
+                        useGpu_);
+}
+
+MatrixPtr CrossChannelNormLayer::createSpatialMatrix(MatrixPtr data,
+                                                     size_t iter,
+                                                     size_t spatialDim) {
+  return Matrix::create(
+      data->getData() + iter * spatialDim, 1, spatialDim, false, useGpu_);
+}
+
+void CrossChannelNormLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  MatrixPtr inV = getInputValue(0);
+
+  size_t batchSize = inV->getHeight();
+  size_t dataDim = inV->getWidth();
+  CHECK_EQ(getSize(), dataDim);
+
+  reserveOutput(batchSize, dataDim);
+  MatrixPtr outV = getOutputValue();
+  size_t spatialDim = dataDim / channels_;
+
+  Matrix::resizeOrCreate(dataBuffer_, batchSize, dataDim, false, useGpu_);
+  Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_);
+  Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_);
+  normBuffer_->zeroMem();
+  // add a small eps to avoid dividing by zero later
+  normBuffer_->addScalar(*normBuffer_, 1e-6);
+  inV->square2(*dataBuffer_);
+  for (size_t i = 0; i < batchSize; i++) {
+    const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim);
+    const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim);
+    MatrixPtr outVTmp = createSampleMatrix(outV, i, spatialDim);
+    MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim);
+
+    // compute norm.
+    spatialBuffer_->sumCols(*dataTmp, 1, 0);
+    spatialBuffer_->sqrt2(*spatialBuffer_);
+    normTmp->copyFrom(*spatialBuffer_);
+    outVTmp->copyFrom(*inVTmp);
+    outVTmp->divRowVector(*spatialBuffer_);
+    // scale the layer.
+    outVTmp->mulColVector(*scale_->getW());
+  }
+}
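With `forward` closed above and `backward` following, the per-position computation is easier to see in scalar form. A minimal CPU sketch of the same normalization (a hypothetical helper, not the layer itself; input laid out as `[channels][spatialDim]` and assumed non-empty; the small eps keeps the division finite, mirroring the 1e-6 added to `normBuffer_`):

```cpp
#include <cmath>
#include <vector>

void crossChannelNorm(const std::vector<std::vector<float>> &in,   // [C][S]
                      const std::vector<float> &scale,             // [C]
                      std::vector<std::vector<float>> &out,        // [C][S]
                      float eps = 1e-6f) {
  const size_t C = in.size(), S = in[0].size();
  for (size_t s = 0; s < S; ++s) {
    float sq = eps;  // accumulate the sum of squares across channels
    for (size_t c = 0; c < C; ++c) sq += in[c][s] * in[c][s];
    const float norm = std::sqrt(sq);
    for (size_t c = 0; c < C; ++c) {
      out[c][s] = scale[c] * in[c][s] / norm;  // normalize, then rescale
    }
  }
}
```

The backward pass below differentiates exactly this map with respect to both the input and the per-channel scale.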
+
+void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inG = getInputGrad(0);
+  MatrixPtr inV = getInputValue(0);
+  MatrixPtr outG = getOutputGrad();
+  MatrixPtr outV = getOutputValue();
+
+  size_t batchSize = inG->getHeight();
+  size_t dataDim = inG->getWidth();
+  size_t spatialDim = dataDim / channels_;
+
+  dataBuffer_->dotMul(*outG, *outV);
+  Matrix::resizeOrCreate(scaleDiff_, channels_, 1, false, useGpu_);
+  Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_);
+  Matrix::resizeOrCreate(sampleBuffer_, channels_, spatialDim, false, useGpu_);
+  scaleDiff_->zeroMem();
+  for (size_t i = 0; i < batchSize; i++) {
+    MatrixPtr outGTmp = createSampleMatrix(outG, i, spatialDim);
+    const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim);
+    const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim);
+    const MatrixPtr inGTmp = createSampleMatrix(inG, i, spatialDim);
+    const MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim);
+
+    channelBuffer_->sumRows(*dataTmp, 1, 0);
+    channelBuffer_->dotDiv(*channelBuffer_, *(scale_->getW()));
+    // store a / scale[i] in scaleDiff_ temporarily
+    scaleDiff_->add(*channelBuffer_, 1.);
+
+    sampleBuffer_->dotMul(*inVTmp, *outGTmp);
+    spatialBuffer_->sumCols(*sampleBuffer_, 1., 1.);
+    // scale the grad
+    inGTmp->copyFrom(*inVTmp);
+    inGTmp->mulRowVector(*spatialBuffer_);
+    // divide by square of norm
+    spatialBuffer_->dotMul(*normTmp, *normTmp);
+    inGTmp->divRowVector(*spatialBuffer_);
+    // subtract
+    inGTmp->add(*outGTmp, -1, 1);
+    // divide by norm
+    inGTmp->divRowVector(*normTmp);
+    // scale the diff
+    inGTmp->mulColVector(*scale_->getW());
+  }
+  // update scale
+  if (scale_->getWGrad()) scale_->getWGrad()->copyFrom(*scaleDiff_);
+  scale_->getParameterPtr()->incUpdate(callback);
+}
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/CudnnConvLayer.cpp b/paddle/gserver/layers/CudnnConvBaseLayer.cpp
similarity index 66%
rename from paddle/gserver/layers/CudnnConvLayer.cpp
rename to paddle/gserver/layers/CudnnConvBaseLayer.cpp
index 978c2c1479c64ab2cdebaaff7394059b3d033ab6..24363bb8b09cc354c25abe512257be68566c10e1 100644
--- a/paddle/gserver/layers/CudnnConvLayer.cpp
+++ b/paddle/gserver/layers/CudnnConvBaseLayer.cpp
@@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
*/ -#include "CudnnConvLayer.h" +#include "CudnnConvBaseLayer.h" #include "paddle/utils/Logging.h" #include "paddle/utils/Stat.h" namespace paddle { +REGISTER_LAYER(cudnn_conv, CudnnConvBaseLayer); +REGISTER_LAYER(cudnn_convt, CudnnConvBaseLayer); -REGISTER_LAYER(cudnn_conv, CudnnConvLayer); - -bool CudnnConvLayer::init(const LayerMap &layerMap, - const ParameterMap ¶meterMap) { +bool CudnnConvBaseLayer::init(const LayerMap &layerMap, + const ParameterMap ¶meterMap) { if (!ConvBaseLayer::init(layerMap, parameterMap)) return false; CHECK(useGpu_) << "CudnnConvLayer only support gpu"; @@ -33,7 +33,11 @@ bool CudnnConvLayer::init(const LayerMap &layerMap, CHECK(config_.shared_biases()); for (size_t i = 0; i < inputLayers_.size(); i++) { ProjectionConfig *conf = new ProjectionConfig(); - conf->set_type("conv"); + if (isDeconv_) { + conf->set_type("convt"); + } else { + conf->set_type("conv"); + } conf->set_num_filters(numFilters_); ConvConfig *convConf = conf->mutable_conv_conf(); *convConf = *(config_.mutable_inputs(i)->mutable_conv_conf()); @@ -47,14 +51,13 @@ bool CudnnConvLayer::init(const LayerMap &layerMap, if (biases_.get() && sharedBiases_) { hl_create_tensor_descriptor(&biasDesc_); hl_create_tensor_descriptor(&outputDesc_); - hl_tensor_reshape(biasDesc_, 1, numFilters_ / groups_[0], 1, 1); - biasOffset_ = numFilters_ / groups_[0]; + hl_tensor_reshape(biasDesc_, 1, numFilters_, 1, 1); } return true; } -void CudnnConvLayer::forward(PassType passType) { +void CudnnConvBaseLayer::forward(PassType passType) { Layer::forward(passType); int batchSize = getInput(0).getBatchSize(); @@ -67,37 +70,41 @@ void CudnnConvLayer::forward(PassType passType) { if (biases_) { REGISTER_TIMER_INFO("CudnnConvBiasTimer", getName().c_str()); int batchSize = inputLayers_[0]->getOutputValue()->getHeight(); + int outH, outW; + if (isDeconv_) { + outH = imgSizeH_[0]; + outW = imgSizeW_[0]; + } else { + outH = outputH_[0]; + outW = outputW_[0]; + } + hl_tensor_reshape(outputDesc_, batchSize, - numFilters_ / groups_[0], - outputH_[0], - outputW_[0], - numFilters_ * outputH_[0] * outputW_[0], - outputH_[0] * outputW_[0], - outputW_[0], + numFilters_, + outH, + outW, + numFilters_ * outH * outW, + outH * outW, + outW, 1); - outputOffset_ = getOutputValue()->getWidth() / groups_[0]; - for (int g = 0; g < groups_[0]; ++g) { - real *biasData = biases_->getW()->getData() + biasOffset_ * g; - real *outData = getOutputValue()->getData() + outputOffset_ * g; - hl_convolution_forward_add_bias( - biasDesc_, biasData, outputDesc_, outData); - } + real *outData = getOutputValue()->getData(); + real *biasData = biases_->getW()->getData(); + hl_convolution_forward_add_bias(biasDesc_, biasData, outputDesc_, outData); } forwardActivation(); } -void CudnnConvLayer::backward(const UpdateCallback &callback) { +void CudnnConvBaseLayer::backward(const UpdateCallback &callback) { backwardActivation(); if (biases_ && biases_->getWGrad()) { REGISTER_TIMER_INFO("CudnnConvBpBiasTimer", getName().c_str()); - for (int g = 0; g < groups_[0]; ++g) { - real *biasGrad = biases_->getWGrad()->getData() + biasOffset_ * g; - real *outGrad = getOutputGrad()->getData() + outputOffset_ * g; - hl_convolution_backward_bias(biasDesc_, biasGrad, outputDesc_, outGrad); - } + real *biasGrad = biases_->getWGrad()->getData(); + real *outGrad = getOutputGrad()->getData(); + hl_convolution_backward_bias(biasDesc_, biasGrad, outputDesc_, outGrad); + biases_->getParameterPtr()->incUpdate(callback); } @@ -106,7 +113,7 @@ void CudnnConvLayer::backward(const 
UpdateCallback &callback) {
   }
 }
 
-CudnnConvLayer::~CudnnConvLayer() {
+CudnnConvBaseLayer::~CudnnConvBaseLayer() {
   if (biases_) {
     hl_destroy_tensor_descriptor(biasDesc_);
     hl_destroy_tensor_descriptor(outputDesc_);
diff --git a/paddle/gserver/layers/CudnnConvLayer.h b/paddle/gserver/layers/CudnnConvBaseLayer.h
similarity index 86%
rename from paddle/gserver/layers/CudnnConvLayer.h
rename to paddle/gserver/layers/CudnnConvBaseLayer.h
index 919b1efc4e453219a6c2ab1a11c61ccb99404084..93a05f94c7717f9170818b9d5ce3d27a6d18cef5 100644
--- a/paddle/gserver/layers/CudnnConvLayer.h
+++ b/paddle/gserver/layers/CudnnConvBaseLayer.h
@@ -30,27 +30,24 @@ namespace paddle {
  *
  * The config file api is img_conv_layer.
  */
-class CudnnConvLayer : public ConvBaseLayer {
+class CudnnConvBaseLayer : public ConvBaseLayer {
 protected:
   std::vector<std::unique_ptr<ProjectionConfig>> projConf_;
   std::vector<std::unique_ptr<Projection>> projections_;
   hl_tensor_descriptor biasDesc_;
   hl_tensor_descriptor outputDesc_;
-  int biasOffset_;
-  int outputOffset_;
 
 public:
-  explicit CudnnConvLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
+  explicit CudnnConvBaseLayer(const LayerConfig& config)
+      : ConvBaseLayer(config) {}
 
-  ~CudnnConvLayer();
+  ~CudnnConvBaseLayer();
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
   bool init(const LayerMap& layerMap,
             const ParameterMap& parameterMap) override;
 
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-  void addBiases();
-  void bpropBiases();
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/NormLayer.cpp b/paddle/gserver/layers/NormLayer.cpp
index 3db0af2515ee9f64aa6c0b0a441e88562d9e398e..e094078bfe86e30c06e1b80ebc04c8213fe9abcf 100644
--- a/paddle/gserver/layers/NormLayer.cpp
+++ b/paddle/gserver/layers/NormLayer.cpp
@@ -26,6 +26,8 @@ Layer* NormLayer::create(const LayerConfig& config) {
     return new ResponseNormLayer(config);
   } else if (norm == "cmrnorm-projection") {
     return new CMRProjectionNormLayer(config);
+  } else if (norm == "cross-channel-norm") {
+    return new CrossChannelNormLayer(config);
   } else {
     LOG(FATAL) << "Unknown norm type: " << norm;
     return nullptr;
@@ -54,4 +56,14 @@ bool ResponseNormLayer::init(const LayerMap& layerMap,
   return true;
 }
 
+bool CrossChannelNormLayer::init(const LayerMap& layerMap,
+                                 const ParameterMap& parameterMap) {
+  Layer::init(layerMap, parameterMap);
+  CHECK(parameters_[0]);
+  const NormConfig& conf = config_.inputs(0).norm_conf();
+  channels_ = conf.channels();
+  scale_.reset(new Weight(channels_, 1, parameters_[0]));
+  return true;
+}
+
 }  // namespace paddle
diff --git a/paddle/gserver/layers/NormLayer.h b/paddle/gserver/layers/NormLayer.h
index e77faaa322570933b3ea2de877b7859857306432..7c238ac944e52c3a83c2aa5deac18de3aff6db61 100644
--- a/paddle/gserver/layers/NormLayer.h
+++ b/paddle/gserver/layers/NormLayer.h
@@ -65,4 +65,35 @@ public:
   }
 };
 
+/**
+ * This layer applies normalization across the channels of each sample to a
+ * conv layer's output, and scales the output by a group of trainable factors
+ * whose dimension equals the number of channels.
+ * - Input: One and only one input layer is accepted.
+ * - Output: The normalized data of the input data.
+ * Reference:
+ *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
+ *    Cheng-Yang Fu, Alexander C. Berg.
SSD: Single Shot MultiBox Detector + */ +class CrossChannelNormLayer : public NormLayer { +public: + explicit CrossChannelNormLayer(const LayerConfig& config) + : NormLayer(config) {} + bool init(const LayerMap& layerMap, const ParameterMap& parameterMap); + void forward(PassType passType); + void backward(const UpdateCallback& callback); + MatrixPtr createSampleMatrix(MatrixPtr data, size_t iter, size_t spatialDim); + MatrixPtr createSpatialMatrix(MatrixPtr data, size_t iter, size_t spatialDim); + +protected: + size_t channels_; + std::unique_ptr scale_; + MatrixPtr scaleDiff_; + MatrixPtr normBuffer_; + MatrixPtr dataBuffer_; + MatrixPtr channelBuffer_; + MatrixPtr spatialBuffer_; + MatrixPtr sampleBuffer_; +}; + } // namespace paddle diff --git a/paddle/gserver/layers/PriorBox.cpp b/paddle/gserver/layers/PriorBox.cpp index bcf5e912a50fef2cec8ebdf1e0dad9efa43fba2f..331bc7672ec0d39a7317c39f1d14e8dcadea471a 100644 --- a/paddle/gserver/layers/PriorBox.cpp +++ b/paddle/gserver/layers/PriorBox.cpp @@ -20,7 +20,7 @@ namespace paddle { /** * @brief A layer for generating priorbox locations and variances. * - Input: Two and only two input layer are accepted. The input layer must be - * be a data output layer and a convolution output layer. + * be a data output layer and a convolution output layer. * - Output: The priorbox locations and variances of the input data. * Reference: * Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed, @@ -45,27 +45,32 @@ protected: MatrixPtr buffer_; }; +REGISTER_LAYER(priorbox, PriorBoxLayer); + bool PriorBoxLayer::init(const LayerMap& layerMap, const ParameterMap& parameterMap) { Layer::init(layerMap, parameterMap); auto pbConf = config_.inputs(0).priorbox_conf(); + std::vector tmp; + aspectRatio_.push_back(1.); std::copy(pbConf.min_size().begin(), pbConf.min_size().end(), std::back_inserter(minSize_)); std::copy(pbConf.max_size().begin(), pbConf.max_size().end(), std::back_inserter(maxSize_)); - std::copy(pbConf.aspect_ratio().begin(), - pbConf.aspect_ratio().end(), - std::back_inserter(aspectRatio_)); std::copy(pbConf.variance().begin(), pbConf.variance().end(), std::back_inserter(variance_)); + std::copy(pbConf.aspect_ratio().begin(), + pbConf.aspect_ratio().end(), + std::back_inserter(tmp)); // flip - int inputRatioLength = aspectRatio_.size(); - for (int index = 0; index < inputRatioLength; index++) - aspectRatio_.push_back(1 / aspectRatio_[index]); - aspectRatio_.push_back(1.); + int inputRatioLength = tmp.size(); + for (int index = 0; index < inputRatioLength; index++) { + aspectRatio_.push_back(tmp[index]); + aspectRatio_.push_back(1 / tmp[index]); + } numPriors_ = aspectRatio_.size(); if (maxSize_.size() > 0) numPriors_++; return true; @@ -94,12 +99,12 @@ void PriorBoxLayer::forward(PassType passType) { for (int w = 0; w < layerWidth; ++w) { real centerX = (w + 0.5) * stepW; real centerY = (h + 0.5) * stepH; - int minSize = 0; + real minSize = 0; for (size_t s = 0; s < minSize_.size(); s++) { // first prior. minSize = minSize_[s]; - int boxWidth = minSize; - int boxHeight = minSize; + real boxWidth = minSize; + real boxHeight = minSize; // xmin, ymin, xmax, ymax. tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth; tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight; @@ -112,7 +117,7 @@ void PriorBoxLayer::forward(PassType passType) { CHECK_EQ(minSize_.size(), maxSize_.size()); // second prior. 
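As context before the second-prior loop continues below: after the `init()` change earlier in this hunk, the ratio list holds the implicit 1.0 plus each configured ratio and its reciprocal, so the number of priors per feature-map cell (the layer's `numPriors_`) follows directly. A sketch with a hypothetical helper:

```cpp
#include <cstddef>
#include <vector>

size_t numPriorsPerCell(const std::vector<float> &configuredRatios,
                        bool hasMaxSize) {
  size_t n = 1 + 2 * configuredRatios.size();  // 1.0, then r and 1/r each
  return hasMaxSize ? n + 1 : n;  // extra sqrt(min * max) prior if max sizes
}
```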
for (size_t s = 0; s < maxSize_.size(); s++) { - int maxSize = maxSize_[s]; + real maxSize = maxSize_[s]; boxWidth = boxHeight = sqrt(minSize * maxSize); tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth; tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight; @@ -145,6 +150,5 @@ void PriorBoxLayer::forward(PassType passType) { MatrixPtr outV = getOutputValue(); outV->copyFrom(buffer_->data_, dim * 2); } -REGISTER_LAYER(priorbox, PriorBoxLayer); } // namespace paddle diff --git a/paddle/gserver/layers/SequencePoolLayer.cpp b/paddle/gserver/layers/SequencePoolLayer.cpp index 35260ca912d5d0e00213ffb7074bd8963da265da..5807c4249620db44fed82a6bb69a77d807d9f0a0 100644 --- a/paddle/gserver/layers/SequencePoolLayer.cpp +++ b/paddle/gserver/layers/SequencePoolLayer.cpp @@ -56,17 +56,16 @@ void SequencePoolLayer::forward(PassType passType) { CHECK_EQ(newBatchSize_, starts->getSize() - 1); resetOutput(newBatchSize_, dim); - if (type_) { - CHECK(input.subSequenceStartPositions) - << "when trans_type = seq, input must hasSubseq"; - } + /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq, * thus, in this case, output_ has no sequenceStartPositions. * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this * case, we should compute the new sequenceStartPositions. */ if (type_) { - output_.degradeSequence(input, useGpu_); + CHECK(input.subSequenceStartPositions) + << "when trans_type = seq, input must hasSubseq"; + output_.degradeSequence(input); } } diff --git a/paddle/gserver/tests/test_ConvUnify.cpp b/paddle/gserver/tests/test_ConvUnify.cpp index 207fc0566fcf4a0d2e971f3c169a14a64146155b..54b72375b743fe025e0ded5fdbce5699a0b4be1a 100644 --- a/paddle/gserver/tests/test_ConvUnify.cpp +++ b/paddle/gserver/tests/test_ConvUnify.cpp @@ -34,8 +34,7 @@ DECLARE_double(checkgrad_eps); DECLARE_bool(thread_local_rand_use_global_seed); DECLARE_bool(prev_batch_state); -// Do one forward pass of convTrans layer and check to see if its output -// matches the given result +// Do one forward pass of ConvLayer using either exconv or cudnn_conv MatrixPtr doOneConvTest(size_t imgSize, size_t output_x, size_t stride, @@ -46,22 +45,35 @@ MatrixPtr doOneConvTest(size_t imgSize, size_t groups, MatrixPtr& inputData, real* param, - bool useGpu) { + bool useGpu, + bool isDeconv = false) { TestConfig config; config.biasSize = numfilters; + string layerType; if (useGpu) { - config.layerConfig.set_type("cudnn_conv"); + layerType = (isDeconv) ? "cudnn_convt" : "cudnn_conv"; } else { - config.layerConfig.set_type("exconv"); + layerType = (isDeconv) ? 
"exconvt" : "exconv"; } + config.layerConfig.set_type(layerType); config.layerConfig.set_num_filters(numfilters); config.layerConfig.set_partial_sum(1); config.layerConfig.set_shared_biases(true); size_t weightSize = channel * filter_size * filter_size * config.layerConfig.num_filters() / groups; - config.inputDefs.push_back( - {INPUT_DATA, "layer_0", imgSize * imgSize * channel, weightSize}); + if (isDeconv) { + config.inputDefs.push_back( + {INPUT_DATA, "layer_0", output_x * output_x * channel, weightSize}); + config.layerConfig.set_size(imgSize * imgSize * + config.layerConfig.num_filters()); + } else { + config.inputDefs.push_back( + {INPUT_DATA, "layer_0", imgSize * imgSize * channel, weightSize}); + config.layerConfig.set_size(output_x * output_x * + config.layerConfig.num_filters()); + } + LayerInputConfig* input = config.layerConfig.add_inputs(); ConvConfig* conv = input->mutable_conv_conf(); conv->set_filter_size(filter_size); @@ -72,12 +84,15 @@ MatrixPtr doOneConvTest(size_t imgSize, conv->set_stride(stride); conv->set_stride_y(stride); conv->set_groups(groups); - conv->set_filter_channels(channel / groups); conv->set_img_size(imgSize); conv->set_output_x(output_x); - config.layerConfig.set_size(conv->output_x() * conv->output_x() * - config.layerConfig.num_filters()); + if (isDeconv) { + conv->set_filter_channels(numfilters / groups); + } else { + conv->set_filter_channels(channel / groups); + } + config.layerConfig.set_name("conv"); std::vector dataLayers; @@ -105,6 +120,8 @@ MatrixPtr doOneConvTest(size_t imgSize, TEST(Layer, convParaUnified) { #ifndef PADDLE_ONLY_CPU MatrixPtr input, resultCpu, resultGpu; + + /// TEST1 for conv /// input = Matrix::create(1, 4 * 4, false, false); real inputData[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}; real param[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 7, 6, 5, 4, 3, 2, 1}; @@ -121,7 +138,7 @@ TEST(Layer, convParaUnified) { /*groups*/ 1, input, param, - false); + /*useGpu*/ false); resultGpu = doOneConvTest(/* imgSize */ 4, /* output_x */ 2, @@ -133,9 +150,42 @@ TEST(Layer, convParaUnified) { /*groups*/ 1, input, param, - true); + /*useGpu*/ true); checkMatrixEqual(resultCpu, resultGpu); + /// TEST1 for deconv /// + input = Matrix::create(1, 2 * 2, false, false); + real inputDataT[] = {1, 2, 3, 4}; + input->setData(inputDataT); + + resultCpu = doOneConvTest(/* imgSize */ 4, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 3, + /*channel*/ 1, + /*numfilters*/ 2, + /*groups*/ 1, + input, + param, + /*useGpu*/ false, + /*isDeconv*/ true); + + resultGpu = doOneConvTest(/* imgSize */ 4, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 3, + /*channel*/ 1, + /*numfilters*/ 2, + /*groups*/ 1, + input, + param, + /*useGpu*/ true, + /*isDeconv*/ true); + checkMatrixEqual(resultCpu, resultGpu); + + /// TEST2 for conv /// input = Matrix::create(1, 3 * 3 * 2, false, false); real inputData2[] = { 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18}; @@ -153,7 +203,7 @@ TEST(Layer, convParaUnified) { /*groups*/ 1, input, param2, - false); + /*useGpu*/ false); resultGpu = doOneConvTest(/* imgSize */ 3, /* output_x */ 2, @@ -165,9 +215,10 @@ TEST(Layer, convParaUnified) { /*groups*/ 1, input, param2, - true); + /*useGpu*/ true); checkMatrixEqual(resultCpu, resultGpu); + /// TEST3 for conv /// real param3[] = {1, 2, 3, 4, 4, 3, 2, 1}; resultCpu = doOneConvTest(/* imgSize */ 3, @@ -180,7 +231,66 @@ TEST(Layer, convParaUnified) { /*groups*/ 2, input, param3, - false); + /*useGpu*/ 
false); + + resultGpu = doOneConvTest(/* imgSize */ 3, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 2, + /*channel*/ 2, + /*numfilters*/ 2, + /*groups*/ 2, + input, + param3, + /*useGpu*/ true); + checkMatrixEqual(resultCpu, resultGpu); + + /// TEST2 for deconv /// + input = Matrix::create(1, 2 * 2 * 2, false, false); + real inputData2T[] = {1, 2, 3, 4, 5, 6, 7, 8}; + input->setData(inputData2T); + + resultCpu = doOneConvTest(/* imgSize */ 3, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 2, + /*channel*/ 2, + /*numfilters*/ 2, + /*groups*/ 1, + input, + param2, + /*useGpu*/ false, + /*isDeconv*/ true); + + resultGpu = doOneConvTest(/* imgSize */ 3, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 2, + /*channel*/ 2, + /*numfilters*/ 2, + /*groups*/ 1, + input, + param2, + /*useGpu*/ true, + /*isDeconv*/ true); + checkMatrixEqual(resultCpu, resultGpu); + + /// TEST3 for deconv /// + resultCpu = doOneConvTest(/* imgSize */ 3, + /* output_x */ 2, + /* stride */ 1, + /* padding */ 0, + /* filter_size */ 2, + /*channel*/ 2, + /*numfilters*/ 2, + /*groups*/ 2, + input, + param3, + /*useGpu*/ false, + /*isDeconv*/ true); resultGpu = doOneConvTest(/* imgSize */ 3, /* output_x */ 2, @@ -192,7 +302,8 @@ TEST(Layer, convParaUnified) { /*groups*/ 2, input, param3, - true); + /*useGpu*/ true, + /*isDeconv*/ true); checkMatrixEqual(resultCpu, resultGpu); #endif } diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp index ceb69359c992128635c199e56805d3f603ca4271..0c22896d6e58f8705f4284b95d0a6e132cb8903d 100644 --- a/paddle/gserver/tests/test_LayerGrad.cpp +++ b/paddle/gserver/tests/test_LayerGrad.cpp @@ -166,15 +166,19 @@ TEST(Projection, scaling) { } } -void testProjectionConv(size_t groups) { +void testProjectionConv(size_t groups, bool isDeconv) { const int NUM_FILTERS = 18; const int FILTER_SIZE = 2; - const int FILTER_SIZE_Y = 3; + const int FILTER_SIZE_Y = 4; const int CHANNELS = 3; const int IMAGE_SIZE = 16; ProjectionConfig conf; - conf.set_type("conv"); + if (isDeconv) { + conf.set_type("convt"); + } else { + conf.set_type("conv"); + } conf.set_num_filters(NUM_FILTERS); ConvConfig* conv = conf.mutable_conv_conf(); @@ -186,7 +190,11 @@ void testProjectionConv(size_t groups) { conv->set_stride(2); conv->set_stride_y(2); conv->set_groups(groups); - conv->set_filter_channels(conv->channels() / conv->groups()); + if (isDeconv) { + conv->set_filter_channels(NUM_FILTERS / conv->groups()); + } else { + conv->set_filter_channels(conv->channels() / conv->groups()); + } conv->set_img_size(IMAGE_SIZE); int output_x = outputSize(conv->img_size(), conv->filter_size(), @@ -199,8 +207,14 @@ void testProjectionConv(size_t groups) { conv->stride_y(), /* caffeMode */ true); conv->set_output_x(output_x); - conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS); - conf.set_output_size(output_x * output_y * NUM_FILTERS); + conv->set_output_y(output_y); + if (isDeconv) { + conf.set_input_size(output_x * output_y * CHANNELS); + conf.set_output_size(IMAGE_SIZE * IMAGE_SIZE * NUM_FILTERS); + } else { + conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS); + conf.set_output_size(output_x * output_y * NUM_FILTERS); + } testProjectionGrad(conf, INPUT_DATA, @@ -215,8 +229,12 @@ void testProjectionConv(size_t groups) { #ifndef PADDLE_ONLY_CPU TEST(Projection, conv) { - testProjectionConv(1); - testProjectionConv(3); + /// test ConvProjection + testProjectionConv(1, false); + 
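The tests above and below configure `filter_channels` differently for conv and deconv: an ordinary convolution slices its input channels per group, while a transposed convolution slices its filter count per group. A sketch of the rule (hypothetical helper; it mirrors the `set_filter_channels` calls in this diff):

```cpp
int filterChannels(bool isDeconv, int channels, int numFilters, int groups) {
  return isDeconv ? numFilters / groups : channels / groups;
}
```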
testProjectionConv(3, false); + /// test ConvTransProjection + testProjectionConv(1, true); + testProjectionConv(3, true); } #endif @@ -385,11 +403,11 @@ void testConvTransLayer(const string& type, bool trans, bool useGpu) { config.layerConfig.set_partial_sum(1); config.layerConfig.set_shared_biases(true); - config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 288}); + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 384}); LayerInputConfig* input = config.layerConfig.add_inputs(); ConvConfig* conv = input->mutable_conv_conf(); conv->set_filter_size(2); - conv->set_filter_size_y(3); + conv->set_filter_size_y(4); conv->set_channels(16); conv->set_padding(0); conv->set_padding_y(1); @@ -416,6 +434,9 @@ TEST(Layer, convTransLayer) { for (auto useGpu : {false, true}) { testConvTransLayer("exconvt", /* trans= */ false, /* useGpu= */ useGpu); } +#ifndef PADDLE_ONLY_CPU + testConvTransLayer("cudnn_convt", /* trans= */ false, /* useGpu= */ true); +#endif } TEST(Layer, blockExpandLayer) { @@ -1482,16 +1503,20 @@ TEST(Layer, BatchNormalizationLayer) { #endif } -TEST(Operator, conv) { +void testConvOperator(bool isDeconv) { TestConfig config; const int NUM_FILTERS = 16; const int FILTER_SIZE = 2; const int FILTER_SIZE_Y = 3; const int CHANNELS = 3; const int IMAGE_SIZE = 16; - const int IMAGE_SIZE_Y = 8; + const int IMAGE_SIZE_Y = 9; OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs(); - operatorConf.set_type("conv"); + if (isDeconv) { + operatorConf.set_type("convt"); + } else { + operatorConf.set_type("conv"); + } ConvConfig* conv = operatorConf.mutable_conv_conf(); operatorConf.set_num_filters(NUM_FILTERS); conv->set_filter_size(FILTER_SIZE); @@ -1502,7 +1527,6 @@ TEST(Operator, conv) { conv->set_stride(2); conv->set_stride_y(2); conv->set_groups(1); - conv->set_filter_channels(conv->channels() / conv->groups()); conv->set_img_size(IMAGE_SIZE); conv->set_img_size_y(IMAGE_SIZE_Y); conv->set_output_x(outputSize(conv->img_size(), @@ -1515,11 +1539,22 @@ TEST(Operator, conv) { conv->padding_y(), conv->stride_y(), /* caffeMode */ true)); - config.layerConfig.set_size(conv->output_x() * conv->output_y() * - NUM_FILTERS); - config.inputDefs.push_back( - {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE_Y * CHANNELS, 0}); + if (isDeconv) { + conv->set_filter_channels(NUM_FILTERS / conv->groups()); + config.inputDefs.push_back({INPUT_DATA, + "layer_0", + conv->output_x() * conv->output_y() * CHANNELS, + 0}); + config.layerConfig.set_size(IMAGE_SIZE * IMAGE_SIZE_Y * NUM_FILTERS); + } else { + conv->set_filter_channels(conv->channels() / conv->groups()); + config.inputDefs.push_back( + {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE_Y * CHANNELS, 0}); + config.layerConfig.set_size(conv->output_x() * conv->output_y() * + NUM_FILTERS); + } + config.inputDefs.push_back( {INPUT_DATA, "layer_1", @@ -1531,6 +1566,11 @@ TEST(Operator, conv) { testOperatorGrad(config, operatorConf, 100, /*useGpu*/ true, false); } +TEST(Operator, conv) { + testConvOperator(/*isDeconv*/ true); + testConvOperator(/*isDeconv*/ false); +} + TEST(Layer, FeatureMapExpandLayer) { TestConfig config; config.layerConfig.set_type("featmap_expand"); @@ -1602,6 +1642,39 @@ TEST(Layer, PadLayer) { } } +TEST(Layer, CrossChannelNormLayer) { + TestConfig config; + config.layerConfig.set_type("norm"); + config.layerConfig.set_size(100); + LayerInputConfig* input = config.layerConfig.add_inputs(); + NormConfig* norm = input->mutable_norm_conf(); + norm->set_norm_type("cross-channel-norm"); + norm->set_channels(10); + 
norm->set_size(100); + norm->set_scale(0); + norm->set_pow(0); + norm->set_blocked(0); + config.inputDefs.push_back({INPUT_DATA, "layer_0", 100, 10}); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false, 5); + } +} + +TEST(Layer, smooth_l1) { + TestConfig config; + config.layerConfig.set_type("smooth_l1"); + + config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0}); + config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 1, 0}); + config.layerConfig.add_inputs(); + config.layerConfig.add_inputs(); + + for (auto useGpu : {false, true}) { + testLayerGrad(config, "smooth_l1", 100, false, useGpu, false, 2.0); + } +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); initMain(argc, argv); diff --git a/paddle/math/BaseMatrix.cu b/paddle/math/BaseMatrix.cu index 0a0d92d1ae65f5b6020eb71fe2a6db5a3c625d9c..de48b6fac9c7d8125a552022c52353ef6bcef995 100644 --- a/paddle/math/BaseMatrix.cu +++ b/paddle/math/BaseMatrix.cu @@ -1453,6 +1453,24 @@ void BaseMatrixT::divRowVector(BaseMatrixT& b) { true_type() /* bAsRowVector */, false_type()); } +template +void BaseMatrixT::mulColVector(BaseMatrixT& b) { + MatrixOffset offset(0, 0, 0, 0); + int numRows = height_; + int numCols = width_; + applyBinary(binary::DotMul(), b, numRows, numCols, offset, + false_type(), true_type() /* bAsColVector */); +} + +template +void BaseMatrixT::divColVector(BaseMatrixT& b) { + MatrixOffset offset(0, 0, 0, 0); + int numRows = height_; + int numCols = width_; + applyBinary(binary::DotDiv(), b, numRows, numCols, offset, + false_type(), true_type() /* bAsColVector */); +} + template<> template int BaseMatrixT::applyRow(Agg agg, BaseMatrixT& b) { diff --git a/paddle/math/BaseMatrix.h b/paddle/math/BaseMatrix.h index 8691c87ac3b88499a9676d59af533e0f4713dfc3..6ed48c8d88ee698689de6f7a7f470b97a094ea5b 100644 --- a/paddle/math/BaseMatrix.h +++ b/paddle/math/BaseMatrix.h @@ -545,6 +545,9 @@ public: void mulRowVector(BaseMatrixT& b); void divRowVector(BaseMatrixT& b); + void mulColVector(BaseMatrixT& b); + void divColVector(BaseMatrixT& b); + void addP2P(BaseMatrixT& b); /** diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp index 07450bfb0ef709840f7e8253e87c227276529a2a..9eead5b62c690b0a3310d8b68bfa3f1870be17c2 100644 --- a/paddle/math/Matrix.cpp +++ b/paddle/math/Matrix.cpp @@ -3590,6 +3590,55 @@ void CpuMatrix::sumOfSquaresBp(Matrix& output, Matrix& label) { } } +void CpuMatrix::smoothL1(Matrix& output, Matrix& label) { + CHECK(output.useGpu_ == false && label.useGpu_ == false) + << "Matrix type are not equal"; + + size_t numSamples = getHeight(); + size_t dim = output.getWidth(); + CHECK_EQ(label.getHeight(), numSamples); + CHECK_EQ(output.getHeight(), numSamples); + CHECK_EQ(label.getWidth(), dim); + CHECK_EQ(getWidth(), (size_t)1); + real* out = output.getData(); + real* cost = getData(); + real* lbl = label.getData(); + + for (size_t i = 0; i < numSamples; ++i, out += dim, cost += dim, lbl += dim) { + for (size_t j = 0; j < dim; ++j) { + cost[j] = std::fabs(out[j] - lbl[j]); + if (cost[j] < 1.0) + cost[j] = 0.5 * cost[j] * cost[j]; + else + cost[j] = cost[j] - 0.5; + } + } +} + +void CpuMatrix::smoothL1Bp(Matrix& output, Matrix& label) { + CHECK(output.useGpu_ == false && label.useGpu_ == false) + << "Matrix type are not equal"; + + size_t numSamples = getHeight(); + size_t dim = output.getWidth(); + CHECK_EQ(label.getHeight(), numSamples); + CHECK_EQ(output.getHeight(), numSamples); + CHECK_EQ(label.getWidth(), dim); + 
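For reference while reading `smoothL1` above and the remainder of `smoothL1Bp` below, these are the per-element function and derivative they implement, written as scalar sketches (hypothetical helpers):

```cpp
#include <cmath>

// Smooth L1: quadratic near zero, linear in the tails.
float smoothL1(float diff) {  // diff = output - label
  const float a = std::fabs(diff);
  return a < 1.f ? 0.5f * a * a : a - 0.5f;
}

// Derivative w.r.t. diff: identity near zero, sign in the tails.
float smoothL1Grad(float diff) {
  return std::fabs(diff) < 1.f ? diff : (diff > 0.f ? 1.f : -1.f);
}
```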
CHECK_EQ(getWidth(), (size_t)1); + real* out = output.getData(); + real* cost = getData(); + real* lbl = label.getData(); + + // f'(x) = x if |x| < 1 + // = sign(x) otherwise + for (size_t i = 0; i < numSamples; ++i, out += dim, cost += dim, lbl += dim) { + for (size_t j = 0; j < dim; ++j) { + cost[j] = out[j] - lbl[j]; + if (std::fabs(cost[j]) >= 1) cost[j] = (0 < cost[j]) - (cost[j] < 0); + } + } +} + void CpuMatrix::tanh(Matrix& output) { CHECK(isContiguous()); CHECK(output.isContiguous()); diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h index d0ba2e93feabfcc11ac1d261bc40c9c6973a8c29..dbdb629614546b7c7b569d7473d96a06d0c5a9c7 100644 --- a/paddle/math/Matrix.h +++ b/paddle/math/Matrix.h @@ -783,6 +783,14 @@ public: LOG(FATAL) << "Not implemented"; } + virtual void smoothL1(Matrix& output, Matrix& label) { + LOG(FATAL) << "Not implemented"; + } + + virtual void smoothL1Bp(Matrix& outputV, Matrix& label) { + LOG(FATAL) << "Not implemented"; + } + virtual void tanh(Matrix& output) { LOG(FATAL) << "Not implemented"; } virtual void tanhDerivative(Matrix& output) { @@ -1720,6 +1728,9 @@ public: /// gradient of sumOfSquares. void sumOfSquaresBp(Matrix& outputV, Matrix& label); + void smoothL1(Matrix& output, Matrix& label); + void smoothL1Bp(Matrix& output, Matrix& label); + void tanh(Matrix& output); void tanhDerivative(Matrix& output); diff --git a/paddle/math/tests/test_BaseMatrix.cpp b/paddle/math/tests/test_BaseMatrix.cpp index 21918b86e1ad98766ceaf09dea3020d6e8592191..22ce39701fca7b650fc03794cb0701e0987d2dae 100644 --- a/paddle/math/tests/test_BaseMatrix.cpp +++ b/paddle/math/tests/test_BaseMatrix.cpp @@ -110,6 +110,8 @@ TEST(BaseMatrix, BaseMatrix) { compare(&BaseMatrix::addRowVector); compare(&BaseMatrix::mulRowVector); compare(&BaseMatrix::divRowVector); + compare(&BaseMatrix::mulColVector); + compare(&BaseMatrix::divColVector); compare(&BaseMatrix::addP2P); compare(&BaseMatrix::invSqrt); } diff --git a/paddle/parameter/Argument.cpp b/paddle/parameter/Argument.cpp index 7a343cca33f5b420be6192231ac73ca1c2da5fb9..4139f59a2c8e665daf410b5b16539ff74b77ecfe 100644 --- a/paddle/parameter/Argument.cpp +++ b/paddle/parameter/Argument.cpp @@ -123,46 +123,6 @@ static void resizeAndCopy(ICpuGpuVectorPtr& dest, } } -static void resizeAndCopy(UserDefinedVectorPtr& dest, - const UserDefinedVectorPtr& src, - bool useGpu, - hl_stream_t stream) { - if (src) { - CHECK(!useGpu) << "not implemented"; - size_t height = src->size(); - if (!dest) { - dest = std::make_shared>(height); - } else { - dest->resize(height); - } - std::copy_n(src->begin(), height, dest->begin()); - } else { - dest.reset(); - } -} - -static void resizeAndCopy(UserDefinedVectorPtr& dest, - const UserDefinedVectorPtr& src, - int32_t startPos, - int32_t copySize, - bool useGpu, - hl_stream_t stream = HPPL_STREAM_DEFAULT) { - if (src) { - CHECK(!useGpu) << "not implemented"; - CHECK_LE((size_t)startPos + copySize, src->size()); - - size_t height = copySize; - if (!dest) { - dest = std::make_shared>(height); - } else { - dest->resize(height); - } - std::copy_n(src->begin() + startPos, height, dest->begin()); - } else { - dest.reset(); - } -} - static void resizeAndCopy(SVectorPtr& dest, const SVectorPtr& src, bool useGpu, @@ -223,7 +183,6 @@ void Argument::resizeAndCopyFrom(const Argument& src, false /* useGpu */, stream); } - resizeAndCopy(udp, src.udp, useGpu, stream); resizeAndCopy(strs, src.strs, useGpu, stream); frameWidth = src.frameWidth; frameHeight = src.frameHeight; @@ -255,7 +214,6 @@ int32_t 
Argument::resizeAndCopyFrom(const Argument& src, resizeAndCopy(value, src.value, startRow, copySize, useGpu, stream); resizeAndCopy(grad, src.grad, startRow, copySize, useGpu, stream); resizeAndCopy(ids, src.ids, startRow, copySize, useGpu, stream); - resizeAndCopy(udp, src.udp, startRow, copySize, useGpu, stream); resizeAndCopy(strs, src.strs, startRow, copySize, useGpu, stream); return copySize; } else { @@ -268,7 +226,6 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src, resizeAndCopy(value, src.value, startRow, copyFeatureSize, useGpu, stream); resizeAndCopy(grad, src.grad, startRow, copyFeatureSize, useGpu, stream); resizeAndCopy(ids, src.ids, startRow, copyFeatureSize, useGpu, stream); - resizeAndCopy(udp, src.udp, startRow, copySize, useGpu, stream); resizeAndCopy(sequenceStartPositions, src.sequenceStartPositions, startSeq, @@ -583,7 +540,7 @@ void Argument::checkSubset() const { } } -void Argument::degradeSequence(const Argument& input, bool useGpu) { +void Argument::degradeSequence(const Argument& input) { CHECK_EQ(input.hasSubseq(), 1UL); size_t numSequences = input.getNumSequences(); size_t numSubSequences = input.getNumSubSequences(); diff --git a/paddle/parameter/Argument.h b/paddle/parameter/Argument.h index 9ef44be0cb3b960db1e789f3f26bb66d1fe63c81..9fd84bc4b7e0aa54d81f5d5df9e5acb3fbb70d29 100644 --- a/paddle/parameter/Argument.h +++ b/paddle/parameter/Argument.h @@ -24,8 +24,6 @@ limitations under the License. */ namespace paddle { -// vector of user defined pointers -typedef std::shared_ptr> UserDefinedVectorPtr; typedef std::shared_ptr> SVectorPtr; struct Argument { @@ -40,7 +38,6 @@ struct Argument { sequenceStartPositions(nullptr), subSequenceStartPositions(nullptr), cpuSequenceDims(nullptr), - udp(nullptr), deviceId(-1), allCount(0), valueCount(0), @@ -63,7 +60,6 @@ struct Argument { sequenceStartPositions = argument.sequenceStartPositions; subSequenceStartPositions = argument.subSequenceStartPositions; cpuSequenceDims = argument.cpuSequenceDims; - udp = argument.udp; deviceId = argument.deviceId; allCount = argument.allCount; frameHeight = argument.frameHeight; @@ -96,8 +92,6 @@ struct Argument { // dimension of sequence, stored only in CPU IVectorPtr cpuSequenceDims; - UserDefinedVectorPtr udp; // user defined pointer - int deviceId; // the GPU device id which the argument in int allCount; // the number of output layers using this argument mutable int valueCount; // waiting this member when layer do forward @@ -137,7 +131,6 @@ struct Argument { if (ids) return ids->getSize(); if (grad) return grad->getHeight(); if (in) return in->getHeight(); - if (udp) return udp->size(); if (strs) return strs->size(); return 0; } @@ -296,7 +289,7 @@ struct Argument { /* sequence has sub-sequence degrades to a sequence. */ - void degradeSequence(const Argument& input, bool useGpu); + void degradeSequence(const Argument& input); /** * @brief getValueString will return the argument's output in string. There diff --git a/paddle/py_paddle/.gitignore b/paddle/py_paddle/.gitignore index 9e8ad4bf1638a69ab7ef19badfbf867e116548d2..80d1f76fbc05627e21e334af55d63a4a534434c6 100644 --- a/paddle/py_paddle/.gitignore +++ b/paddle/py_paddle/.gitignore @@ -1 +1,2 @@ swig_paddle.py +_swig_paddle.so diff --git a/paddle/scripts/docker/README.md b/paddle/scripts/docker/README.md index 8c35411fc390ef218e395c58808d644e7a35095e..7c90316ad82a6430d6c12d72e07b166b6d9d98a9 100644 --- a/paddle/scripts/docker/README.md +++ b/paddle/scripts/docker/README.md @@ -83,13 +83,18 @@ docker build -t paddle:dev . 
The `docker build` command assumes that `Dockerfile` is in the root source tree. Note that in this design, this `Dockerfile` is the only one in our repo. +Users can specify an Ubuntu mirror server for faster downloading: + +```bash +docker build -t paddle:dev --build-arg UBUNTU_MIRROR=mirror://mirrors.ubuntu.com/mirrors.txt . +``` ### Build PaddlePaddle from Source Code Given the development image `paddle:dev`, the following command builds PaddlePaddle from the source tree on the development computer (host): ```bash -docker run -v $PWD:/paddle -e "GPU=OFF" -e "AVX=ON" -e "TEST=ON" paddle:dev +docker run -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=OFF" -e "RUN_TEST=OFF" paddle:dev ``` This command mounts the source directory on the host into `/paddle` in the container, so the default entry point of `paddle:dev`, `build.sh`, can build the source code with possible local changes. When it writes to `/paddle/build` in the container, it actually writes to `$PWD/build` on the host. @@ -100,6 +105,14 @@ This command mounts the source directory on the host into `/paddle` in the conta - `$PWD/build/paddle-<version>.deb` for production installation, and - `$PWD/build/Dockerfile`, which builds the production Docker image. +Users can specify the following Docker build arguments with either an "ON" or "OFF" value: +- `WITH_GPU`: ***Required***. Generates NVIDIA CUDA GPU code and relies on CUDA libraries. +- `WITH_AVX`: ***Required***. Setting it to "OFF" prevents generating AVX instructions. If you don't know what AVX is, you probably want to set it to "ON". +- `WITH_TEST`: ***Optional, default OFF***. Build unit test binaries. Once you've built the unit tests, you can run them manually with the following command: + ```bash + docker run -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" paddle:dev sh -c "cd /paddle/build; make coveralls" + ``` +- `RUN_TEST`: ***Optional, default OFF***. Run unit tests after building. This requires `WITH_TEST=ON`, since the tests cannot run without being built first. ### Build the Production Docker Image diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh old mode 100755 new mode 100644 index c44874eede03a8b1060b15e175ad89622f925940..a0da561dfe962b7a0a0515d4104940175ebdecad --- a/paddle/scripts/docker/build.sh +++ b/paddle/scripts/docker/build.sh @@ -1,104 +1,78 @@ #!/bin/bash -function abort(){ - echo "An error occurred. Exiting..." 1>&2 - exit 1 -} - -trap 'abort' 0 set -e -mkdir -p /paddle/dist/cpu -mkdir -p /paddle/dist/gpu -mkdir -p /paddle/dist/cpu-noavx -mkdir -p /paddle/dist/gpu-noavx -# Set BASE_IMAGE and DEB_PATH according to env variables + +# Set BASE_IMAGE according to env variables if [ ${WITH_GPU} == "ON" ]; then BASE_IMAGE="nvidia/cuda:7.5-cudnn5-runtime-ubuntu14.04" # additional packages to install when building gpu images - GPU_DOCKER_PKG="python-pip" - if [ ${WITH_AVX} == "ON" ]; then - DEB_PATH="dist/gpu/" - DOCKER_SUFFIX="gpu" - else - DEB_PATH="dist/gpu-noavx/" - DOCKER_SUFFIX="gpu-noavx" - fi + GPU_DOCKER_PKG="python-pip python-dev" else BASE_IMAGE="python:2.7.13-slim" - if [ ${WITH_AVX} == "ON" ]; then - DEB_PATH="dist/cpu/" - DOCKER_SUFFIX="cpu" - else - DEB_PATH="dist/cpu-noavx/" - DOCKER_SUFFIX="noavx" - fi fi -# If Dockerfile.* sets BUILD_AND_INSTALL to 'ON', it would have copied -source tree to /paddle, and this scripts should build it into -/paddle/build.
-if [[ ${BUILD_AND_INSTALL:-OFF} == 'ON' ]]; then - if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then - ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/lib/libcudnn.so - fi - mkdir -p /paddle/build # -p means no error if exists - cd /paddle/build - # clean local cmake and third_party cache - if [ ${DELETE_BUILD_CACHE} == 'ON' ]; then - rm -rf * && rm -rf ../third_party - fi - cmake .. \ - -DWITH_DOC=${WITH_DOC:-OFF} \ - -DWITH_GPU=${WITH_GPU:-OFF} \ - -DWITH_AVX=${WITH_AVX:-OFF} \ - -DWITH_SWIG_PY=ON \ - -DCUDNN_ROOT=/usr/ \ - -DWITH_STYLE_CHECK=OFF \ - -DCMAKE_EXPORT_COMPILE_COMMANDS=ON - make -j `nproc` - make install - # generate deb package for current build - # FIXME(typhoonzero): should we remove paddle/scripts/deb ? - # FIXME: CPACK_DEBIAN_PACKAGE_DEPENDS removes all dev dependencies, must - # install them in docker - cpack -D CPACK_GENERATOR='DEB' -D CPACK_DEBIAN_PACKAGE_DEPENDS="" .. - mv /paddle/build/*.deb /paddle/${DEB_PATH} +DOCKERFILE_GPU_ENV="" +if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then + DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}" - if [[ ${BUILD_WOBOQ:-OFF} == 'ON' ]]; then - apt-get install -y clang-3.8 llvm-3.8 libclang-3.8-dev - # Install woboq_codebrowser. - git clone https://github.com/woboq/woboq_codebrowser /woboq - cd /woboq - cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ - -DCMAKE_BUILD_TYPE=Release \ - . - make + # for cmake to find cudnn + ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/lib/libcudnn.so +fi - export WOBOQ_OUT=/usr/share/nginx/html/paddle - export BUILD_DIR=/paddle/build - mkdir -p $WOBOQ_OUT - cp -rv /woboq/data $WOBOQ_OUT/../data - /woboq/generator/codebrowser_generator \ +mkdir -p /paddle/build +cd /paddle/build + +# build script will not fail if *.deb does not exist +rm *.deb 2>/dev/null || true + +cmake .. \ + -DCMAKE_BUILD_TYPE=Release \ + -DWITH_DOC=${WITH_DOC:-OFF} \ + -DWITH_GPU=${WITH_GPU:-OFF} \ + -DWITH_AVX=${WITH_AVX:-OFF} \ + -DWITH_SWIG_PY=ON \ + -DCUDNN_ROOT=/usr/ \ + -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} \ + -DON_COVERALLS=${WITH_TEST:-OFF} \ + -DCMAKE_EXPORT_COMPILE_COMMANDS=ON +make -j `nproc` +if [[ ${RUN_TEST:-OFF} == "ON" ]]; then + make coveralls +fi +make install + +# generate deb package for current build +# FIXME(typhoonzero): should we remove paddle/scripts/deb ? +# FIXME: CPACK_DEBIAN_PACKAGE_DEPENDS removes all dev dependencies, must +# install them in docker +cpack -D CPACK_GENERATOR='DEB' -D CPACK_DEBIAN_PACKAGE_DEPENDS="" .. + +if [[ ${BUILD_WOBOQ:-OFF} == 'ON' ]]; then + apt-get install -y clang-3.8 llvm-3.8 libclang-3.8-dev + # Install woboq_codebrowser. + git clone https://github.com/woboq/woboq_codebrowser /woboq + cd /woboq + cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ + -DCMAKE_BUILD_TYPE=Release \ + . 
+ make + + export WOBOQ_OUT=/usr/share/nginx/html/paddle + export BUILD_DIR=/paddle/build + mkdir -p $WOBOQ_OUT + cp -rv /woboq/data $WOBOQ_OUT/../data + /woboq/generator/codebrowser_generator \ -b /paddle/build \ -a \ -o $WOBOQ_OUT \ -p paddle:/paddle - /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT - cd /woboq - make clean - fi - - pip install /usr/local/opt/paddle/share/wheels/py_paddle*linux*.whl - pip install /usr/local/opt/paddle/share/wheels/paddle*.whl - paddle version - - if [[ ${DOCKER_BUILD:-FALSE} == 'TRUE' ]]; then - # reduce docker image size - rm -rf /paddle/build - rm -rf /usr/local/opt/paddle/share/wheels/ - fi + /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT + cd /woboq + make clean fi +paddle version + # generate production docker image Dockerfile if [ ${USE_MIRROR} ]; then MIRROR_UPDATE="sed 's@http:\/\/archive.ubuntu.com\/ubuntu\/@mirror:\/\/mirrors.ubuntu.com\/mirrors.txt@' -i /etc/apt/sources.list && \\" @@ -106,39 +80,23 @@ else MIRROR_UPDATE="\\" fi -cat > /paddle/build/Dockerfile.${DOCKER_SUFFIX} <<EOF +cat > /paddle/build/Dockerfile <<EOF FROM ${BASE_IMAGE} MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com> - -# ENV variables -ARG WITH_AVX -ARG WITH_DOC -ARG WITH_STYLE_CHECK - -ENV WITH_GPU=${WITH_GPU} -ENV WITH_AVX=\${WITH_AVX:-ON} -ENV WITH_DOC=\${WITH_DOC:-OFF} -ENV WITH_STYLE_CHECK=\${WITH_STYLE_CHECK:-OFF} - ENV HOME /root ENV LANG en_US.UTF-8 - # Use Fix locales to en_US.UTF-8 - RUN ${MIRROR_UPDATE} apt-get update && \ - apt-get install -y libgfortran3 ${GPU_DOCKER_PKG} && \ + apt-get install -y libgfortran3 libpython2.7 ${GPU_DOCKER_PKG} && \ apt-get clean -y && \ pip install --upgrade pip && \ - pip install -U 'protobuf==3.1.0' requests -RUN pip install numpy + pip install -U 'protobuf==3.1.0' requests numpy # Use different deb file when building different type of images -ADD \$PWD/${DEB_PATH}*.deb /usr/local/opt/paddle/deb/ -RUN dpkg --force-all -i /usr/local/opt/paddle/deb/*.deb && rm -f /usr/local/opt/paddle/deb/*.deb - -ENV PATH="/usr/local/opt/paddle/bin/:${PATH}" +ADD build/*.deb /usr/local/opt/paddle/deb/ +# run paddle version to install python packages first +RUN dpkg -i /usr/local/opt/paddle/deb/*.deb && rm -f /usr/local/opt/paddle/deb/*.deb && paddle version +${DOCKERFILE_GPU_ENV} # default command shows the paddle version and exit CMD ["paddle", "version"] EOF - -trap : 0 diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in index f29d32f0d947dc7cde6112160e4f79ce8113505f..5a45df4072b9197a713bd19ee766296279bfcbc8 100644 --- a/paddle/scripts/submit_local.sh.in +++ b/paddle/scripts/submit_local.sh.in @@ -94,16 +94,22 @@ else: EOF if [ $? -eq 1 ]; then # Older version installed, or not installed at all - echo "First time run paddle, need to install some python dependencies." - BASEDIR=$(dirname "$0") - pip install ${BASEDIR}/../opt/paddle/share/wheels/*-@PADDLE_VERSION@-*.whl - if [ $? -ne 0 ]; then - echo "pip install wheels failed. " - echo "Please use 'sudo paddle' at the first time you use PaddlePaddle" - echo "PaddlePaddle will install some python dependencies automatically." - exit 1 - fi - echo "Python dependencies are installed." + echo "First time run paddle, need to install some python dependencies."
+ # setuptools normalizes package version, so we need to use normalized + # package version for paddle python package + PYTHON_PADDLE_VERSION=$(python -c 'import packaging +import setuptools +print str(packaging.version.Version("@PADDLE_VERSION@")) +' 2>/dev/null) + BASEDIR=$(dirname "$0") + pip install ${BASEDIR}/../opt/paddle/share/wheels/*-${PYTHON_PADDLE_VERSION}-*.whl + if [ $? -ne 0 ]; then + echo "pip install wheels failed. " + echo "Please use 'sudo paddle' at the first time you use PaddlePaddle" + echo "PaddlePaddle will install some python dependencies automatically." + exit 1 + fi + echo "Python dependencies are installed." fi case "$1" in
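The normalization the script relies on can be reproduced on its own. A minimal sketch, not from the repo, assuming `setuptools` and its `packaging` dependency are installed; the raw strings are illustrative stand-ins for whatever `@PADDLE_VERSION@` expands to:

```python
# Sketch of the PEP 440 normalization that setuptools applies to package
# versions: separators and case in pre-release tags are canonicalized,
# which is why the wheel filename must be matched against the normalized
# string rather than the raw version value.
from packaging.version import Version

for raw in ["0.10.0rc1", "0.10.0-RC1", "0.10.0.rc1"]:
    print("%s -> %s" % (raw, Version(raw)))  # each normalizes to "0.10.0rc1"
```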
diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh index 7deb3e62e88de7e1306fcbfc5a28aa4372d678e6..f2cbc561652a3c7502de94be37d75783fc40b9c1 100755 --- a/paddle/scripts/travis/build_and_test.sh +++ b/paddle/scripts/travis/build_and_test.sh @@ -5,7 +5,7 @@ NPROC=1 export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages export PYTHONHOME=/opt/python/2.7.12 export PATH=/opt/python/2.7.12/bin:${PATH} -cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS} +cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DWITH_COVERAGE=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS} NRPOC=`nproc` make -j $NPROC make coveralls diff --git a/paddle/setup.py.in b/paddle/setup.py.in index 382d5be6ecfc26b4a524bb6a775bd1a805a34d96..0b62436a7f81682d5279c3b307ac1abf09eafffb 100644 --- a/paddle/setup.py.in +++ b/paddle/setup.py.in @@ -12,68 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -# This file is used to build paddle python binding package. -# It will be invoked by Makefile that generated by COMAKE from setuptools import setup, Extension -import numpy as np -import api.paddle_ld_flags -import platform -import os - -system = platform.system().lower() - -is_osx = (system == 'darwin') -is_win = (system == 'windows') -is_lin = (system == 'linux') - - -# The extra links will passed from COMAKE -# because generate paddle LDFLAGS is too complicated to do in setup.py -# it just read COMAKE generated LDFLAGS. -extra_comps = [] -extra_links = [] -obj = api.paddle_ld_flags.PaddleLDFlag() -extra_comps = obj.c_flag() -ldflags = obj.ldflag_str() -if ldflags is not None: - extra_links.extend(ldflags.split(" ")) - -try: - with open('.py_paddle_extra_link_flags', 'r') as f: - for line in f: - extra_links += line.split() -except: - pass - -if is_lin == True: - extra_links = ["-Xlinker", '-start-group'] + extra_links + ["-Xlinker", "-end-group"] -elif is_osx == True: - os.environ["ARCHFLAGS"] = "-arch x86_64" - extra_links = ["-Wl,-all_load"] + extra_links - -include_dirs = [np.get_include(), "../"] # include numpy and paddle. - -os.environ["CC"] = "@CMAKE_C_COMPILER@" -os.environ["CXX"] = "@CMAKE_CXX_COMPILER@" - setup(name="py_paddle", - version="@PADDLE_VERSION@", - ext_modules=[ - Extension('py_paddle._swig_paddle', # Build SWIG Extension. - ['Paddle_wrap.cxx'], - language = "c++", - include_dirs = include_dirs, - extra_link_args = extra_links, - extra_compile_args = extra_comps - ) - ], - packages=['py_paddle'], - include_dirs = include_dirs, - install_requires = [ - 'nltk>=3.2.2', - 'numpy>=1.8.0', # The numpy is required. - 'protobuf>=3.0.0' # The paddle protobuf version - ], + version="${PADDLE_VERSION}", + packages=['py_paddle'], + include_package_data=True, + package_data={'py_paddle':['*.py','_swig_paddle.so']}, + install_requires = [ + 'nltk>=3.2.2', + 'numpy>=1.8.0', # The numpy is required. + 'protobuf>=${PROTOBUF_VERSION}' # The paddle protobuf version + ], + url='http://www.paddlepaddle.org/', + license='Apache 2.0', ) diff --git a/paddle/utils/CpuId.h b/paddle/utils/CpuId.h index 0f3985cc7b2c018ede9bba9644d2d096561dccee..5fc610964d4f5b8064f16ebf1b26bbb002264ce1 100644 --- a/paddle/utils/CpuId.h +++ b/paddle/utils/CpuId.h @@ -12,6 +12,7 @@ limitations under the License. */ #pragma once #include "Common.h" +#include "Error.h" namespace paddle { @@ -97,4 +98,37 @@ private: #define HAS_AVX512 HAS_SIMD(SIMD_AVX512) // clang-format on +/** + * Invoke checkCPUFeature() before Paddle initialization to + * check whether the target machine supports the compiled instructions. + * If not, it simply returns an error. + */ +inline Error __must_check checkCPUFeature() { + Error err; +#ifndef __AVX__ + if (HAS_AVX) { + LOG(WARNING) << "PaddlePaddle wasn't compiled to use avx instructions, " + << "but these are available on your machine and could " + << "speed up CPU computations via CMAKE .. -DWITH_AVX=ON"; + } +#else + if (!HAS_AVX) { + err = Error( + "PaddlePaddle was compiled to use avx instructions, " + "but these aren't available on your machine, please " + "disable it via CMAKE .. -DWITH_AVX=OFF"); + } +#endif // __AVX__ +#ifdef __SSE3__ + if (!HAS_SSE3) { + err = Error( + "PaddlePaddle was compiled to use sse3 instructions, " + "which is the minimum requirement of PaddlePaddle. " + "But these aren't available on your current machine."); + } +#endif // __SSE3__ + + return err; +} + } // namespace paddle diff --git a/paddle/utils/PythonUtil.cpp.in b/paddle/utils/PythonUtil.cpp.in index 66b5795e29fb9fa751ed802e87ced0a71aea4c51..a51b8f765f41f6febb86002f371b14e8797e7e4d 100644 --- a/paddle/utils/PythonUtil.cpp.in +++ b/paddle/utils/PythonUtil.cpp.in @@ -195,9 +195,14 @@ extern const char enable_virtualenv_py[]; } void initPython(int argc, char** argv) { #ifndef PADDLE_NO_PYTHON - char pyHome[] = "@PYTHON_INSTALL_DIR@"; // NOLINT - if (strlen(pyHome)) { - Py_SetPythonHome(pyHome); + std::string pyHome; +#if defined(__APPLE__) || defined(__OSX__) + pyHome = "/usr/local/Frameworks/Python.framework/Versions/2.7"; + Py_SetPythonHome(const_cast<char*>(pyHome.c_str())); +#endif + pyHome = "@PYTHON_INSTALL_DIR@"; // NOLINT + if (!pyHome.empty()) { + Py_SetPythonHome(const_cast<char*>(pyHome.c_str())); } Py_SetProgramName(argv[0]); Py_Initialize(); diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp index dbab4ec43ca2fa691445131d2cb14f51721a2e4c..1f56b6b8a96602d298507452fc7182d46179de41 100644 --- a/paddle/utils/Util.cpp +++ b/paddle/utils/Util.cpp @@ -26,6 +26,7 @@ limitations under the License.
*/ #include +#include "CpuId.h" #include "CustomStackTrace.h" #include "Logging.h" #include "StringUtil.h" @@ -185,6 +186,7 @@ void initMain(int argc, char** argv) { } version::printVersion(); + checkCPUFeature().check(); runInitFunctions(); } diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py index e257aa568facb1555944dba7e76c5d8bce7f1c7d..77361f8bc751446d89d8a812f48d33cd3dffc665 100644 --- a/python/paddle/trainer/config_parser.py +++ b/python/paddle/trainer/config_parser.py @@ -686,25 +686,17 @@ class ContextProjection(Projection): @config_class -class ConvProjection(Projection): - type = 'conv' - +class ConvBaseProjection(Projection): def __init__(self, input_layer_name, num_filters=None, conv_conf=None, **xargs): - super(ConvProjection, self).__init__(input_layer_name, **xargs) + super(ConvBaseProjection, self).__init__(input_layer_name, **xargs) if num_filters is not None: self.proj_conf.num_filters = num_filters - parse_conv(conv_conf, input_layer_name, self.proj_conf.conv_conf, - num_filters) - self.proj_conf.output_size = self.proj_conf.conv_conf.output_x * \ - self.proj_conf.conv_conf.output_y * \ - num_filters - def calc_output_size(self, input_layer_config): return self.proj_conf.output_size @@ -723,6 +715,48 @@ class ConvProjection(Projection): return None +@config_class +class ConvProjection(ConvBaseProjection): + type = 'conv' + + def __init__(self, + input_layer_name, + num_filters=None, + conv_conf=None, + **xargs): + super(ConvProjection, self).__init__(input_layer_name, num_filters, + conv_conf, **xargs) + + parse_conv(conv_conf, self.input_layer_name, self.proj_conf.conv_conf, + num_filters) + self.proj_conf.output_size = self.proj_conf.conv_conf.output_x * \ + self.proj_conf.conv_conf.output_y * \ + num_filters + + +@config_class +class ConvTransProjection(ConvBaseProjection): + type = 'convt' + + def __init__(self, + input_layer_name, + num_filters=None, + conv_conf=None, + **xargs): + super(ConvTransProjection, self).__init__(input_layer_name, num_filters, + conv_conf, **xargs) + + parse_conv( + conv_conf, + self.input_layer_name, + self.proj_conf.conv_conf, + num_filters, + trans=True) + self.proj_conf.output_size = self.proj_conf.conv_conf.img_size_y * \ + self.proj_conf.conv_conf.img_size * \ + num_filters + + # Define a operator for mixed layer @config_class class Operator(Cfg): @@ -789,6 +823,36 @@ class ConvOperator(Operator): return self.operator_conf.output_size +@config_class +class ConvTransOperator(Operator): + type = 'convt' + + def __init__(self, + input_layer_names, + num_filters=None, + conv_conf=None, + **xargs): + super(ConvTransOperator, self).__init__(input_layer_names, **xargs) + if num_filters is not None: + self.operator_conf.num_filters = num_filters + + parse_conv( + conv_conf, + MakeLayerNameInSubmodel(input_layer_names[0]), + self.operator_conf.conv_conf, + num_filters, + trans=True) + self.operator_conf.output_size = \ + self.operator_conf.conv_conf.img_size * \ + self.operator_conf.conv_conf.img_size_y * \ + num_filters + + config_assert(len(input_layer_names) == 2, "Conv is binary operator") + + def calc_output_size(self, input_sizes): + return self.operator_conf.output_size + + # please refer to the comments in proto/ModelConfig.proto @config_class class Conv(Cfg): @@ -1156,9 +1220,11 @@ def parse_image(image, input_layer_name, image_conf): def parse_norm(norm, input_layer_name, norm_conf): norm_conf.norm_type = norm.norm_type - config_assert(norm.norm_type in ['rnorm', 'cmrnorm-projection'], - 
"norm-type %s is not in [rnorm, 'cmrnorm-projection']" % - norm.norm_type) + config_assert( + norm.norm_type in + ['rnorm', 'cmrnorm-projection', 'cross-channel-norm'], + "norm-type %s is not in [rnorm, cmrnorm-projection, cross-channel-norm]" + % norm.norm_type) norm_conf.channels = norm.channels norm_conf.size = norm.size norm_conf.scale = norm.scale @@ -1772,8 +1838,17 @@ class ConvTransLayerBase(LayerBase): use_gpu = int(g_command_config_args.get("use_gpu", 0)) parallel_nn = int(g_command_config_args.get("parallel_nn", 0)) - # cudnn_convt has not been implemented so use exconvt only - self.layer_type = "exconvt" + # Automatically select cudnn_type for GPU and exconvt for CPU + # if set type=exconvt, but still reserve the way user specify + # exconvt or cudnn_convt manually. + if self.layer_type == "cudnn_convt": + config_assert(use_gpu, "cudnn_convt only support GPU") + + if (use_gpu == 1 and self.layer_type != "exconvt" and + (parallel_nn == 0 or self.config.device > -1)): + self.layer_type = "cudnn_convt" + else: + self.layer_type = "exconvt" # need to specify layer in config self.config.type = self.layer_type @@ -1790,10 +1865,9 @@ class ConvTransLayerBase(LayerBase): trans=True) conv_conf = self.config.inputs[input_index].conv_conf psize = self.calc_parameter_size(conv_conf) - print("output size for %s is %d " % (name, conv_conf.output_x)) self.create_input_parameter(input_index, psize) - self.set_layer_size( - (conv_conf.img_size**2) * self.config.num_filters) + self.set_cnn_layer(name, conv_conf.img_size_y, conv_conf.img_size, + self.config.num_filters) psize = self.config.size if shared_biases: @@ -1810,6 +1884,11 @@ class ConvTransLayer(ConvTransLayerBase): layer_type = 'exconvt' +@config_layer('cudnn_convt') +class ConvTransLayer(ConvTransLayerBase): + layer_type = 'cudnn_convt' + + @config_layer('norm') class NormLayer(LayerBase): def __init__(self, name, inputs, **xargs): @@ -1821,6 +1900,9 @@ class NormLayer(LayerBase): norm_conf) self.set_cnn_layer(name, norm_conf.output_y, norm_conf.output_x, norm_conf.channels, False) + if norm_conf.norm_type == "cross-channel-norm": + self.create_input_parameter(0, norm_conf.channels, + [norm_conf.channels, 1]) @config_layer('pool') @@ -2222,7 +2304,10 @@ def Link( # memory for recurrent layer group. # *name* and *size* are actual layer's name and size. -# will return name of the memory, +# If *name* is None, need to provide *memory_name* and need to use +# SetMemoryInput() later to specify the layer which this memory remembers. +# +# return the name of the memory, # use this name if you assign the memory as other layer's input # # boot frame of memory is zeroed by default, @@ -2234,15 +2319,18 @@ def Link( # can only be initailized by a *boot_layer* which is a sequence. 
diff --git a/python/paddle/trainer_config_helpers/default_decorators.py b/python/paddle/trainer_config_helpers/default_decorators.py index 2f25579fcdd9793e4c165439c9934a2bccb63617..69d860d9dab9c1d90e4d6a6940d66fcb551f6eb6 100644 --- a/python/paddle/trainer_config_helpers/default_decorators.py +++ b/python/paddle/trainer_config_helpers/default_decorators.py @@ -97,13 +97,13 @@ def reset_hook(): register_parse_config_hook(reset_hook) -def wrap_name_default(name_prefix=None): +def wrap_name_default(name_prefix=None, name_param="name"): """ Decorator to set "name" arguments default to "{name_prefix}_{invoke_count}". .. code:: python - @default_name("some_name") + @wrap_name_default("some_name") def func(name=None): print name # name will never be None. If name is not set, # name will be "some_name_%d" @@ -115,7 +115,7 @@ def wrap_name_default(name_prefix=None): """ factory = DefaultNameFactory(name_prefix) _name_factories.append(factory) - return wrap_param_default(["name"], factory) + return wrap_param_default([name_param], factory) def wrap_param_attr_default(param_names=None, default_factory=None):
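The new `name_param` argument makes the decorator fill in a parameter other than `name`. A small illustrative sketch; the decorated function below is made up, not part of the library:

```python
# Illustrative only: default the memory_name parameter instead of name.
# Generated defaults follow the library's usual "__<prefix>_<n>__" pattern.
@wrap_name_default("memory", "memory_name")
def make_memory(name=None, memory_name=None):
    return name, memory_name

print(make_memory())                 # (None, '__memory_0__')
print(make_memory(memory_name="m"))  # (None, 'm')
```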
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py index 7cd3ce91312b86c96e46530e45ff9427db0a0a45..8d2329292b5b8b408473c2e33fc43b2e586d89b6 100755 --- a/python/paddle/trainer_config_helpers/layers.py +++ b/python/paddle/trainer_config_helpers/layers.py @@ -112,6 +112,7 @@ __all__ = [ 'out_prod_layer', 'print_layer', 'priorbox_layer', + 'cross_channel_norm_layer', 'spp_layer', 'pad_layer', 'eos_layer', @@ -288,6 +289,14 @@ class LayerOutput(object): """ assert False, "this method should not be invoked" + def set_input(self, input): + """ + Set the input for a memory layer. It can only be used on a memory layer. + """ + assert isinstance(input, LayerOutput) + assert self.layer_type == LayerType.MEMORY + SetMemoryInput(self.name, input.name) + ERROR_CLIPPING = 'error_clipping_threshold' DROPOUT = 'drop_rate' @@ -704,8 +713,9 @@ class MixedLayerType(LayerOutput): assert len(self.inputs) == 0 return self - def __exit__(self, *args, **kwargs): - del args, kwargs # unused parameter to suppress warning + def __exit__(self, exc_type, exc_value, tb): + if exc_value is not None: + raise exc_value assert len(self.inputs) != 0 ml = MixedLayer( name=self.name, @@ -999,6 +1009,46 @@ def priorbox_layer(input, size=size) +@wrap_name_default("cross_channel_norm") +def cross_channel_norm_layer(input, name=None, param_attr=None): + """ + Normalize a layer's output. This layer is necessary for SSD. + It applies normalization across the channels of each sample of + a conv layer's output and scales the output by a group of trainable + factors whose dimensions equal the number of channels. + + :param name: The Layer Name. + :type name: basestring + :param input: The input layer. + :type input: LayerOutput + :param param_attr: The Parameter Attribute|list. + :type param_attr: ParameterAttribute + :return: LayerOutput + """ + assert input.num_filters is not None + Layer( + name=name, + type=LayerType.NORM_LAYER, + inputs=[ + Input( + input.name, + norm=Norm( + norm_type="cross-channel-norm", + channels=input.num_filters, + size=input.size, + scale=0, + pow=0, + blocked=0), + **param_attr.attr) + ]) + return LayerOutput( + name, + LayerType.NORM_LAYER, + parents=input, + num_filters=input.num_filters, + size=input.size) + + @wrap_name_default("seq_pooling") @wrap_bias_attr_default(has_bias=False) @wrap_param_default(['pooling_type'], default_factory=lambda _: MaxPooling())
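A short usage sketch for the new layer; the data and conv layers here are placeholders, and the parameter attribute simply pins the initial per-channel scales:

```python
# Illustrative wiring: cross_channel_norm_layer needs an input with
# num_filters set (i.e. a convolution output) and creates one trainable
# scale per channel.
img = data_layer(name='image', size=3 * 32 * 32)
conv = img_conv_layer(input=img, num_channels=3, num_filters=16,
                      filter_size=3, stride=1, padding=1,
                      act=ReluActivation())
norm = cross_channel_norm_layer(
    input=conv, param_attr=ParamAttr(initial_mean=1.0, initial_std=0.0))
```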
@@ -2036,8 +2086,9 @@ def img_conv_layer(input, :param trans: true if it is a convTransLayer, false if it is a convLayer :type trans: bool :param layer_type: specify the layer_type, default is None. If trans=True, - layer_type has to be "exconvt", otherwise layer_type - has to be either "exconv" or "cudnn_conv" + layer_type has to be "exconvt" or "cudnn_convt", + otherwise layer_type has to be either "exconv" or + "cudnn_conv" :type layer_type: String :return: LayerOutput object. :rtype: LayerOutput @@ -2077,7 +2128,7 @@ def img_conv_layer(input, if layer_type: if trans: - assert layer_type in ["exconvt"] + assert layer_type in ["exconvt", "cudnn_convt"] else: assert layer_type in ["exconv", "cudnn_conv"] lt = layer_type @@ -2759,8 +2810,10 @@ def seq_concat_layer(a, b, act=None, name=None, layer_attr=None, size=a.size) +@wrap_name_default("memory", "memory_name") def memory(name, size, + memory_name=None, is_seq=False, boot_layer=None, boot_bias=None, @@ -2782,14 +2835,32 @@ def memory(name, If boot_layer is not null, the memory is just the boot_layer's output. Set :code:`is_seq` is true boot layer is sequence. - The same name layer in recurrent group will set memory on each time step. - :param name: memory's name. + .. code-block:: python + + mem = memory(size=256, name='state') + state = fc_layer(input=mem, size=256, name='state') + + If you do not want to specify the name, you can equivalently use set_input() + to specify the layer to be remembered, as follows: + + .. code-block:: python + mem = memory(size=256) + state = fc_layer(input=mem, size=256) + mem.set_input(state) + + + :param name: the name of the layer which this memory remembers. + If name is None, the user should call set_input() to specify the + name of the layer which this memory remembers. :type name: basestring :param size: size of memory. :type size: int + :param memory_name: the name of the memory. + It is ignored when name is provided. + :type memory_name: basestring :param is_seq: is sequence for boot_layer :type is_seq: bool :param boot_layer: boot layer of memory. @@ -2811,13 +2882,21 @@ def memory(name, boot_bias = ParamAttr.to_bias(boot_bias) assert boot_layer is None or isinstance(boot_layer, LayerOutput) + if name is not None: + memory_name = None - agent_name = Memory(name, size, is_seq, boot_layer.name if boot_layer is not None else None, boot_bias, boot_bias_active_type.name, boot_with_const_id) + memory_name = Memory( + name, + size, + is_sequence=is_seq, + boot_layer=boot_layer.name if boot_layer is not None else None, + boot_bias=boot_bias, + boot_bias_active_type=boot_bias_active_type.name, + boot_with_const_id=boot_with_const_id, + memory_name=memory_name) lout = LayerOutput( - name=agent_name, + name=memory_name, size=size, layer_type=LayerType.MEMORY, parents=[boot_layer] if boot_layer is not None else None) @@ -3565,7 +3644,7 @@ def __cost_input__(input, label, weight=None): ipts = [Input(input.name), Input(label.name)] parents = [input, label] if weight is not None: - assert weight.layer_type == LayerType.DATA + assert weight.size == 1 ipts.append(Input(weight.name)) parents.append(weight) return ipts, parents @@ -3679,7 +3758,8 @@ def conv_operator(img, padding=0, filter_size_y=None, stride_y=None, - padding_y=None): + padding_y=None, + trans=False): """ Different from img_conv_layer, conv_op is an Operator, which can be used in mixed_layer. It takes two inputs to perform convolution. @@ -3735,7 +3815,9 @@ def conv_operator(img, if filter.size is not None: filter.size = filter_size * filter_size_y * num_filters * num_channels - op = ConvOperator( + opCls = ConvTransOperator if trans else ConvOperator + + op = opCls( input_layer_names=[img.name, filter.name], num_filters=num_filters, conv_conf=Conv( @@ -3747,6 +3829,7 @@ def conv_operator(img, padding_y=padding_y, stride_y=stride_y, groups=1)) + op.origin = [img, filter] return op @@ -3762,7 +3845,8 @@ def conv_projection(input, stride_y=None, padding_y=None, groups=1, - param_attr=None): + param_attr=None, + trans=False): """ Different from img_conv_layer and conv_op, conv_projection is a Projection, which can be used in mixed_layer and concat_layer. It uses cudnn to implement @@ -3801,6 +3885,8 @@ def conv_projection(input, :type groups: int :param param_attr: Convolution param attribute. None means default attribute :type param_attr: ParameterAttribute + :param trans: whether it is convTrans or conv + :type trans: boolean :return: A DotMulProjection Object. :rtype: DotMulProjection """ @@ -3837,7 +3923,9 @@ def conv_projection(input, param_attr.attr["initial_strategy"] = 0 param_attr.attr["initial_smart"] = False - proj = ConvProjection( + projCls = ConvTransProjection if trans else ConvProjection + + proj = projCls( input_layer_name=input.name, num_filters=num_filters, conv_conf=Conv(
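A brief sketch of what the new `trans` switch enables in both entry points; the layer names and shapes are illustrative, and the updated projections test config further below exercises the same path:

```python
# Illustrative: with trans=True the same entry points build the
# transposed-convolution variants (ConvTransOperator/ConvTransProjection).
img = data_layer(name='img', size=32 * 32)
flt = data_layer(name='filter', size=3 * 3 * 1 * 64)
with mixed_layer() as m:
    m += conv_operator(img=img, filter=flt, num_filters=64,
                       num_channels=1, filter_size=3,
                       stride=2, padding=1, trans=True)
    m += conv_projection(img, filter_size=3, num_filters=64,
                         num_channels=1, stride=2, padding=1, trans=True)
```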
@@ -4946,7 +5034,12 @@ def lambda_cost(input, @wrap_name_default() @layer_support() -def cross_entropy(input, label, name=None, coeff=1.0, layer_attr=None): +def cross_entropy(input, + label, + name=None, + coeff=1.0, + weight=None, + layer_attr=None): """ A loss layer for multi class entropy. @@ -4961,22 +5054,27 @@ def cross_entropy(input, label, name=None, coeff=1.0, layer_attr=None): :type input: LayerOutput. :param name: The name of this layer. It is not necessary. :type name: None|basestring. - :param coeff: The coefficient affects the gradient in the backward. + :param coeff: The cost is multiplied by coeff. + The coefficient affects the gradient in the backward. :type coeff: float. + :param weight: The cost of each sample is multiplied by its weight. + The weight should be a layer with size=1. Note that gradient + will not be calculated for weight. + :type weight: LayerOutput :param layer_attr: Extra Layer Attribute. :type layer_attr: ExtraLayerAttribute :return: LayerOutput object. :rtype: LayerOutput. """ + ipts, parents = __cost_input__(input, label, weight) Layer( name=name, type=LayerType.CROSS_ENTROPY, - inputs=[input.name, label.name], + inputs=ipts, coeff=coeff, **ExtraLayerAttribute.to_kwargs(layer_attr)) - return LayerOutput( name, LayerType.CROSS_ENTROPY, parents=[input, label], size=1) + return LayerOutput(name, LayerType.CROSS_ENTROPY, parents=parents, size=1) @wrap_name_default() diff --git a/python/paddle/trainer_config_helpers/tests/configs/projections.py b/python/paddle/trainer_config_helpers/tests/configs/projections.py index aa4521dcd5db3f845871cfaaedb02a86bcbddc38..dc8975cb311582a621eb4a5a166ddc34348fe3e9 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/projections.py +++ b/python/paddle/trainer_config_helpers/tests/configs/projections.py @@ -34,11 +34,31 @@ flt = data_layer(name='filter', size=3 * 3 * 1 * 64) with mixed_layer() as m7: m7 += conv_operator( img=img, filter=flt, num_filters=64, num_channels=1, filter_size=3) + m7 += conv_projection(img, filter_size=3, num_filters=64, num_channels=1) +with mixed_layer() as m8: + m8 += conv_operator( + img=img, + filter=flt, + num_filters=64, + num_channels=1, + filter_size=3, + stride=2, + padding=1, + trans=True) + m8 += conv_projection( + img, + filter_size=3, + num_filters=64, + num_channels=1, + stride=2, + padding=1, + trans=True) end = mixed_layer( input=[ full_matrix_projection(input=m5), - trans_full_matrix_projection(input=m6), full_matrix_projection(input=m7) + trans_full_matrix_projection(input=m6), + full_matrix_projection(input=m7), full_matrix_projection(input=m8) ], size=100, layer_attr=ExtraAttr( diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr index 6934fd0da62f90f9bbddef9a98798bf168f7fa8e..2818389b16cca75f5030b75fc4de8c89c06c5e02 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr @@ -33,6 +33,8 @@ layers { bias_parameter_name: "___conv_0__.wbias" num_filters: 64 shared_biases: true + height: 256 + width: 256 } layers { name: "__batch_norm_0__" @@ -58,6 +60,8 @@ layers { } bias_parameter_name: "___batch_norm_0__.wbias" moving_average_fraction: 0.9 + height: 256 + width: 256 } layers { name: "__crmnorm_0__" diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr index 2943ab130bd7d6f3b78ea611f1c35850ccaf5e92..2afc3afef6d39ce9b8eef05948861284775d5011 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr +++ 
b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr @@ -154,13 +154,40 @@ layers { inputs { input_layer_name: "img" } + inputs { + input_layer_name: "img" + input_parameter_name: "___mixed_6__.w1" + proj_conf { + type: "conv" + name: "___mixed_6__.w1" + input_size: 1024 + output_size: 57600 + conv_conf { + filter_size: 3 + channels: 1 + stride: 1 + padding: 0 + groups: 1 + filter_channels: 1 + output_x: 30 + img_size: 32 + caffe_mode: true + filter_size_y: 3 + padding_y: 0 + stride_y: 1 + output_y: 30 + img_size_y: 32 + } + num_filters: 64 + } + } inputs { input_layer_name: "filter" } operator_confs { type: "conv" input_indices: 0 - input_indices: 1 + input_indices: 2 input_sizes: 1024 input_sizes: 576 output_size: 57600 @@ -186,38 +213,112 @@ layers { layers { name: "__mixed_7__" type: "mixed" + size: 254016 + active_type: "" + inputs { + input_layer_name: "img" + } + inputs { + input_layer_name: "img" + input_parameter_name: "___mixed_7__.w1" + proj_conf { + type: "convt" + name: "___mixed_7__.w1" + input_size: 1024 + output_size: 254016 + conv_conf { + filter_size: 3 + channels: 1 + stride: 2 + padding: 1 + groups: 1 + filter_channels: 64 + output_x: 32 + img_size: 63 + caffe_mode: true + filter_size_y: 3 + padding_y: 1 + stride_y: 2 + output_y: 32 + img_size_y: 63 + } + num_filters: 64 + } + } + inputs { + input_layer_name: "filter" + } + operator_confs { + type: "convt" + input_indices: 0 + input_indices: 2 + input_sizes: 1024 + input_sizes: 576 + output_size: 254016 + conv_conf { + filter_size: 3 + channels: 1 + stride: 2 + padding: 1 + groups: 1 + filter_channels: 64 + output_x: 32 + img_size: 63 + caffe_mode: true + filter_size_y: 3 + padding_y: 1 + stride_y: 2 + output_y: 32 + img_size_y: 63 + } + num_filters: 64 + } +} +layers { + name: "__mixed_8__" + type: "mixed" size: 100 active_type: "" inputs { input_layer_name: "__mixed_4__" - input_parameter_name: "___mixed_7__.w0" + input_parameter_name: "___mixed_8__.w0" proj_conf { type: "fc" - name: "___mixed_7__.w0" + name: "___mixed_8__.w0" input_size: 300 output_size: 100 } } inputs { input_layer_name: "__mixed_5__" - input_parameter_name: "___mixed_7__.w1" + input_parameter_name: "___mixed_8__.w1" proj_conf { type: "trans_fc" - name: "___mixed_7__.w1" + name: "___mixed_8__.w1" input_size: 100 output_size: 100 } } inputs { input_layer_name: "__mixed_6__" - input_parameter_name: "___mixed_7__.w2" + input_parameter_name: "___mixed_8__.w2" proj_conf { type: "fc" - name: "___mixed_7__.w2" + name: "___mixed_8__.w2" input_size: 57600 output_size: 100 } } + inputs { + input_layer_name: "__mixed_7__" + input_parameter_name: "___mixed_8__.w3" + proj_conf { + type: "fc" + name: "___mixed_8__.w3" + input_size: 254016 + output_size: 100 + } + } drop_rate: 0.5 } parameters { @@ -281,7 +382,23 @@ parameters { initial_smart: true } parameters { - name: "___mixed_7__.w0" + name: "___mixed_6__.w1" + size: 576 + initial_mean: 0.0 + initial_std: 0.471404520791 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___mixed_7__.w1" + size: 576 + initial_mean: 0.0 + initial_std: 0.471404520791 + initial_strategy: 0 + initial_smart: false +} +parameters { + name: "___mixed_8__.w0" size: 30000 initial_mean: 0.0 initial_std: 0.057735026919 @@ -291,7 +408,7 @@ parameters { initial_smart: true } parameters { - name: "___mixed_7__.w1" + name: "___mixed_8__.w1" size: 10000 initial_mean: 0.0 initial_std: 0.1 @@ -301,7 +418,7 @@ parameters { initial_smart: true } parameters { - name: "___mixed_7__.w2" + name: 
"___mixed_8__.w2" size: 5760000 initial_mean: 0.0 initial_std: 0.00416666666667 @@ -310,10 +427,20 @@ parameters { initial_strategy: 0 initial_smart: true } +parameters { + name: "___mixed_8__.w3" + size: 25401600 + initial_mean: 0.0 + initial_std: 0.00198412698413 + dims: 254016 + dims: 100 + initial_strategy: 0 + initial_smart: true +} input_layer_names: "test" input_layer_names: "img" input_layer_names: "filter" -output_layer_names: "__mixed_7__" +output_layer_names: "__mixed_8__" sub_models { name: "root" layer_names: "test" @@ -328,10 +455,11 @@ sub_models { layer_names: "filter" layer_names: "__mixed_6__" layer_names: "__mixed_7__" + layer_names: "__mixed_8__" input_layer_names: "test" input_layer_names: "img" input_layer_names: "filter" - output_layer_names: "__mixed_7__" + output_layer_names: "__mixed_8__" is_recurrent_layer_group: false } diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr index 3e9d28416ed5066461e960f0a9f085e057c28346..a0fb729e062bdf6fd7d2a7c2ae364d1a2b32811d 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr +++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr @@ -331,6 +331,54 @@ layers { } trans_type: "non-seq" } +layers { + name: "__recurrent_group_3__" + type: "recurrent_layer_group" + active_type: "" +} +layers { + name: "seq_input@__recurrent_group_3__" + type: "scatter_agent" + size: 100 + active_type: "" +} +layers { + name: "__memory_6__@__recurrent_group_3__" + type: "agent" + size: 200 + active_type: "" +} +layers { + name: "__fc_layer_0__@__recurrent_group_3__" + type: "fc" + size: 200 + active_type: "tanh" + inputs { + input_layer_name: "seq_input@__recurrent_group_3__" + input_parameter_name: "___fc_layer_0__@__recurrent_group_3__.w0" + } + inputs { + input_layer_name: "__memory_6__@__recurrent_group_3__" + input_parameter_name: "___fc_layer_0__@__recurrent_group_3__.w1" + } + bias_parameter_name: "___fc_layer_0__@__recurrent_group_3__.wbias" +} +layers { + name: "__fc_layer_0__" + type: "gather_agent" + size: 200 + active_type: "" +} +layers { + name: "__last_seq_4__" + type: "seqlastins" + size: 200 + active_type: "linear" + inputs { + input_layer_name: "__fc_layer_0__" + } + trans_type: "non-seq" +} parameters { name: "___mixed_0__.w0" size: 40000 @@ -481,6 +529,36 @@ parameters { initial_strategy: 0 initial_smart: false } +parameters { + name: "___fc_layer_0__@__recurrent_group_3__.w0" + size: 20000 + initial_mean: 0.0 + initial_std: 0.1 + dims: 100 + dims: 200 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___fc_layer_0__@__recurrent_group_3__.w1" + size: 40000 + initial_mean: 0.0 + initial_std: 0.0707106781187 + dims: 200 + dims: 200 + initial_strategy: 0 + initial_smart: true +} +parameters { + name: "___fc_layer_0__@__recurrent_group_3__.wbias" + size: 200 + initial_mean: 0.0 + initial_std: 0.0 + dims: 1 + dims: 200 + initial_strategy: 0 + initial_smart: false +} input_layer_names: "seq_input" input_layer_names: "sub_seq_input" output_layer_names: "__last_seq_0__" @@ -488,6 +566,7 @@ output_layer_names: "__first_seq_0__" output_layer_names: "__last_seq_1__" output_layer_names: "__last_seq_2__" output_layer_names: "__last_seq_3__" +output_layer_names: "__last_seq_4__" sub_models { name: "root" layer_names: "seq_input" @@ -510,6 +589,9 @@ sub_models { layer_names: "__gru_group_0___recurrent_group" layer_names: 
"__gru_group_0__" layer_names: "__last_seq_3__" + layer_names: "__recurrent_group_3__" + layer_names: "__fc_layer_0__" + layer_names: "__last_seq_4__" input_layer_names: "seq_input" input_layer_names: "sub_seq_input" output_layer_names: "__last_seq_0__" @@ -517,6 +599,7 @@ sub_models { output_layer_names: "__last_seq_1__" output_layer_names: "__last_seq_2__" output_layer_names: "__last_seq_3__" + output_layer_names: "__last_seq_4__" is_recurrent_layer_group: false } sub_models { @@ -647,4 +730,28 @@ sub_models { } target_inlinkid: -1 } +sub_models { + name: "__recurrent_group_3__" + layer_names: "seq_input@__recurrent_group_3__" + layer_names: "__memory_6__@__recurrent_group_3__" + layer_names: "__fc_layer_0__@__recurrent_group_3__" + is_recurrent_layer_group: true + reversed: false + memories { + layer_name: "__fc_layer_0__@__recurrent_group_3__" + link_name: "__memory_6__@__recurrent_group_3__" + is_sequence: false + } + in_links { + layer_name: "seq_input" + link_name: "seq_input@__recurrent_group_3__" + has_subseq: false + } + out_links { + layer_name: "__fc_layer_0__@__recurrent_group_3__" + link_name: "__fc_layer_0__" + has_subseq: false + } + target_inlinkid: -1 +} diff --git a/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py b/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py index 60b4849d69d497109ef5af3257e212df233a2d0b..91010759e4847f087eb4e05ad98ae794a2129365 100644 --- a/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py +++ b/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py @@ -16,6 +16,16 @@ def generate_rnn_simple(name): return rnn_simple +def generate_rnn_simple_no_name(): + def rnn_simple(s): + m = memory(name=None, size=200) + fc = fc_layer(input=[s, m], size=200) + m.set_input(fc) + return fc + + return rnn_simple + + with mixed_layer() as lstm_param: # test lstm unit, rnn group lstm_param += full_matrix_projection(input=seq, size=100 * 4) @@ -33,4 +43,6 @@ outputs( last_seq(input=lstmemory_group( input=lstm_param, size=100)), last_seq(input=gru_group( - input=gru_param, size=100))) + input=gru_param, size=100)), + last_seq(input=recurrent_group( + step=generate_rnn_simple_no_name(), input=seq)), ) diff --git a/python/paddle/v2/dataset/cifar.py b/python/paddle/v2/dataset/cifar.py index d9f7a830ee60a331b55a1e218923e690103e1c5b..3a8b98b8f045b0eb58be69649486cbd0a571f118 100644 --- a/python/paddle/v2/dataset/cifar.py +++ b/python/paddle/v2/dataset/cifar.py @@ -20,7 +20,7 @@ TODO(yuyang18): Complete the comments. 
import cPickle import itertools import numpy -import paddle.v2.dataset.common +from common import download import tarfile __all__ = ['train100', 'test100', 'train10', 'test10'] @@ -55,23 +55,23 @@ def reader_creator(filename, sub_name): def train100(): return reader_creator( - paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5), - 'train') + download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'train') def test100(): - return reader_creator( - paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5), - 'test') + return reader_creator(download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'test') def train10(): return reader_creator( - paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), - 'data_batch') + download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'data_batch') def test10(): return reader_creator( - paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5), - 'test_batch') + download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'test_batch') + + +def fetch(): + download(CIFAR10_URL, 'cifar', CIFAR10_MD5) + download(CIFAR100_URL, 'cifar', CIFAR100_MD5) diff --git a/python/paddle/v2/dataset/common.py b/python/paddle/v2/dataset/common.py index 3021b68ddb02ecaa874e21681796c0912ad4cc06..7021a6da05dec6be216534112c2df2586e73390f 100644 --- a/python/paddle/v2/dataset/common.py +++ b/python/paddle/v2/dataset/common.py @@ -17,6 +17,8 @@ import hashlib import os import shutil import sys +import importlib +import paddle.v2.dataset __all__ = ['DATA_HOME', 'download', 'md5file'] @@ -69,3 +71,13 @@ def dict_add(a_dict, ele): a_dict[ele] += 1 else: a_dict[ele] = 1 + + +def fetch_all(): + for module_name in filter(lambda x: not x.startswith("__"), + dir(paddle.v2.dataset)): + if "fetch" in dir( + importlib.import_module("paddle.v2.dataset.%s" % module_name)): + getattr( + importlib.import_module("paddle.v2.dataset.%s" % module_name), + "fetch")()
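A usage sketch for the new pre-fetch hooks; files land wherever `DATA_HOME` resolves to on the machine:

```python
# Illustrative: pre-download every dataset that defines a fetch() hook
# in one call, e.g. to warm the cache before running offline.
import paddle.v2.dataset.common

paddle.v2.dataset.common.fetch_all()
```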
paddle.v2.dataset.common.download(URL, "imikolov", MD5) diff --git a/python/paddle/v2/dataset/mnist.py b/python/paddle/v2/dataset/mnist.py index 16f2fcb99de4cb1971a7375a97b5daa209ee95ef..48a39b5493a8004d6eb034498a797af9c662bd19 100644 --- a/python/paddle/v2/dataset/mnist.py +++ b/python/paddle/v2/dataset/mnist.py @@ -106,3 +106,10 @@ def test(): TEST_IMAGE_MD5), paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5), 100) + + +def fetch(): + paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5) + paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) + paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5) + paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) diff --git a/python/paddle/v2/dataset/movielens.py b/python/paddle/v2/dataset/movielens.py index 25fd8227da2f219d75c6b830e65627ecf35be453..e148ddeca0370cd76128a31ce3a4d488e9737d98 100644 --- a/python/paddle/v2/dataset/movielens.py +++ b/python/paddle/v2/dataset/movielens.py @@ -30,6 +30,9 @@ __all__ = [ age_table = [1, 18, 25, 35, 45, 50, 56] +URL = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip' +MD5 = 'c4d9eecfca2ab87c1945afe126590906' + class MovieInfo(object): def __init__(self, index, categories, title): @@ -77,10 +80,7 @@ USER_INFO = None def __initialize_meta_info__(): - fn = download( - url='http://files.grouplens.org/datasets/movielens/ml-1m.zip', - module_name='movielens', - md5sum='c4d9eecfca2ab87c1945afe126590906') + fn = download(URL, "movielens", MD5) global MOVIE_INFO if MOVIE_INFO is None: pattern = re.compile(r'^(.*)\((\d+)\)$') @@ -205,5 +205,9 @@ def unittest(): print train_count, test_count +def fetch(): + download(URL, "movielens", MD5) + + if __name__ == '__main__': unittest() diff --git a/python/paddle/v2/dataset/sentiment.py b/python/paddle/v2/dataset/sentiment.py index 71689fd61b6b14a7b5072caff4e2fd48a7f74072..0eeb6d5affd8c280fb74edc82cf24bf418ca8ef9 100644 --- a/python/paddle/v2/dataset/sentiment.py +++ b/python/paddle/v2/dataset/sentiment.py @@ -125,3 +125,7 @@ def test(): """ data_set = load_sentiment_data() return reader_creator(data_set[NUM_TRAINING_INSTANCES:]) + + +def fetch(): + nltk.download('movie_reviews', download_dir=common.DATA_HOME) diff --git a/python/paddle/v2/dataset/uci_housing.py b/python/paddle/v2/dataset/uci_housing.py index 27f454b137e3a40febd19cf085e2f4034cc16b24..dab8620441c966b19d8218025f8d8fa5b40d1c2c 100644 --- a/python/paddle/v2/dataset/uci_housing.py +++ b/python/paddle/v2/dataset/uci_housing.py @@ -89,3 +89,7 @@ def test(): yield d[:-1], d[-1:] return reader + + +def fetch(): + download(URL, 'uci_housing', MD5) diff --git a/python/paddle/v2/dataset/wmt14.py b/python/paddle/v2/dataset/wmt14.py index c686870a497668517d1c78c11c616ad8a71a2980..ee63a93f5ad918b5bbc949ae6ba29082b3f6abd5 100644 --- a/python/paddle/v2/dataset/wmt14.py +++ b/python/paddle/v2/dataset/wmt14.py @@ -16,7 +16,7 @@ wmt14 dataset """ import tarfile -import paddle.v2.dataset.common +from paddle.v2.dataset.common import download __all__ = ['train', 'test', 'build_dict'] @@ -95,11 +95,13 @@ def reader_creator(tar_file, file_name, dict_size): def train(dict_size): return reader_creator( - paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), - 'train/train', dict_size) + download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'train/train', dict_size) def test(dict_size): return reader_creator( - paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), - 'test/test', dict_size) + 
download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'test/test', dict_size) + + +def fetch(): + download(URL_TRAIN, 'wmt14', MD5_TRAIN) diff --git a/python/paddle/v2/tests/test_layer.py b/python/paddle/v2/tests/test_layer.py index 5ccd3d6913e1755a37b4da7c4f182147b880d3cb..89cc928dd7f624612ba717b4e5c2d6c2de7f8bed 100644 --- a/python/paddle/v2/tests/test_layer.py +++ b/python/paddle/v2/tests/test_layer.py @@ -22,7 +22,9 @@ import paddle.v2.networks as networks pixel = layer.data(name='pixel', type=data_type.dense_vector(128)) label = layer.data(name='label', type=data_type.integer_value(10)) -weight = layer.data(name='weight', type=data_type.dense_vector(10)) +weight = layer.data(name='weight', type=data_type.dense_vector(1)) +combine_weight = layer.data( + name='weight_combine', type=data_type.dense_vector(10)) score = layer.data(name='score', type=data_type.dense_vector(1)) hidden = layer.fc(input=pixel, @@ -81,7 +83,8 @@ class AggregateLayerTest(unittest.TestCase): class MathLayerTest(unittest.TestCase): def test_math_layer(self): addto = layer.addto(input=[pixel, pixel]) - linear_comb = layer.linear_comb(weights=weight, vectors=hidden, size=10) + linear_comb = layer.linear_comb( + weights=combine_weight, vectors=hidden, size=10) interpolation = layer.interpolation( input=[hidden, hidden], weight=score) bilinear = layer.bilinear_interp(input=conv, out_size_x=4, out_size_y=4)