Merge branch 'develop' of github.com:baidu/Paddle into feature/c_api

9c1c19b6 · Yu Yang · d49c6274 · 5961b52b · d49c6274 · 9c1c19b6
77 changed file
--- a/.dockerignore
+++ b/.dockerignore
-.gitignore
\ No newline at end of file
--- a/.dockerignore
+++ b/.dockerignore
+*.DS_Store
+build/
+*.user
+.vscode
+.idea
+.project
+.cproject
+.pydevproject
+Makefile
+.test_env/
+third_party/
+*~
+bazel-*
+!build/*.deb
--- a/.gitignore
+++ b/.gitignore
@@ -7,6 +7,7 @@ build/
 .project
 .cproject
 .pydevproject
+.settings/
 Makefile
 .test_env/
 third_party/

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -40,7 +40,7 @@ option(WITH_RDMA        "Compile PaddlePaddle with RDMA support"        OFF)
 option(WITH_TIMER       "Compile PaddlePaddle with stats timer"         OFF)
 option(WITH_PROFILER    "Compile PaddlePaddle with GPU profiler"        OFF)
 option(WITH_DOC         "Compile PaddlePaddle with documentation"       OFF)
-option(ON_COVERALLS     "Compile PaddlePaddle with code coverage"       OFF)
+option(WITH_COVERAGE    "Compile PaddlePaddle with code coverage"       OFF)
 option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
 option(ON_TRAVIS        "Exclude special unit test on Travis CI"        OFF)
 option(WITH_C_API       "Compile PaddlePaddle with C-API(Prediction)"   OFF)
@@ -90,14 +90,21 @@ include_directories("${PROJ_ROOT}/paddle/cuda/include")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
 set(EXTERNAL_LIBS
-    # have not include gtest here.
    ${GFLAGS_LIBRARIES}
    ${GLOG_LIBRARIES}
    ${CBLAS_LIBRARIES}
    ${PROTOBUF_LIBRARY}
    ${ZLIB_LIBRARIES}
+    ${PYTHON_LIBRARIES}
 )
+# GPU builds append the CUDA libraries to EXTERNAL_LIBS (the list defined
+# above); cuDNN, cuBLAS and cuRAND are appended only when WITH_DSO is off.
+if(WITH_GPU)
+    list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
+    if(NOT WITH_DSO)
+        list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
+    endif(NOT WITH_DSO)
+endif(WITH_GPU)
 add_subdirectory(proto)
 add_subdirectory(paddle)
 add_subdirectory(python)

--- a/Dockerfile
+++ b/Dockerfile
@@ -3,20 +3,17 @@
 FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04
 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
-ARG DEBIAN_FRONTEND=noninteractive
 ARG UBUNTU_MIRROR
 RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
 # ENV variables
 ARG BUILD_WOBOQ
-ARG BUILD_AND_INSTALL
 ARG WITH_GPU
 ARG WITH_AVX
 ARG WITH_DOC
 ARG WITH_STYLE_CHECK
 ENV BUILD_WOBOQ=${BUILD_WOBOQ:-OFF}
-ENV BUILD_AND_INSTALL=${BUILD_AND_INSTALL:-OFF}
 ENV WITH_GPU=${WITH_AVX:-OFF}
 ENV WITH_AVX=${WITH_AVX:-ON}
 ENV WITH_DOC=${WITH_DOC:-OFF}
@@ -31,7 +28,7 @@ RUN apt-get update && \
    apt-get install -y wget unzip tar xz-utils bzip2 gzip coreutils && \
    apt-get install -y curl sed grep graphviz libjpeg-dev zlib1g-dev && \
    apt-get install -y python-numpy python-matplotlib gcc g++ gfortran && \
-    apt-get install -y automake locales clang-format-3.8 && \
+    apt-get install -y automake locales clang-format-3.8 swig && \
    apt-get clean -y
 # git credential to skip password typing
@@ -51,8 +48,6 @@ RUN curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
    cd cmake-3.4.1 && ./bootstrap && make -j `nproc` && make install && \
    cd .. && rm -rf cmake-3.4.1
-RUN apt-get install -y swig
 VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"]
 # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service

--- a/README.md
+++ b/README.md
@@ -2,8 +2,8 @@
 [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle)
-[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/)
+[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://www.paddlepaddle.org/develop/doc/)
-[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/cn/index.html)
+[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://www.paddlepaddle.org/doc_cn/)
 [![Coverage Status](https://coveralls.io/repos/github/PaddlePaddle/Paddle/badge.svg?branch=develop)](https://coveralls.io/github/PaddlePaddle/Paddle?branch=develop)
 [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases)
 [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)
@@ -59,36 +59,36 @@ Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddl
    the capability of PaddlePaddle to make a huge impact for your product.
 ## Installation
-Check out the [Install Guide](http://paddlepaddle.org/doc/build/) to install from
-pre-built packages (**docker image**, **deb package**) or
+It is recommended to check out the
-directly build on **Linux** and **Mac OS X** from the source code.
+[Docker installation guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html)
+before looking into the
+[build from source guide](http://www.paddlepaddle.org/develop/doc/getstarted/build_and_install/build_from_source_en.html).
 ## Documentation
-Both [English Docs](http://paddlepaddle.org/doc/) and [Chinese Docs](http://paddlepaddle.org/doc_cn/) are provided for our users and developers.
- [Quick Start](http://paddlepaddle.org/doc/demo/quick_start/index_en) <br>
+We provide [English](http://www.paddlepaddle.org/develop/doc/) and
-   You can follow the quick start tutorial to learn how use PaddlePaddle
+[Chinese](http://www.paddlepaddle.org/doc_cn/) documentation.
-   step-by-step.
+- [Deep Learning 101](http://book.paddlepaddle.org/index.en.html)
+  You might want to start from this online interactive book that can run in Jupyter Notebook.
+- [Distributed Training](http://www.paddlepaddle.org/develop/doc/howto/usage/cluster/cluster_train_en.html)
+  You can run distributed training jobs on MPI clusters.
+- [Distributed Training on Kubernetes](http://www.paddlepaddle.org/develop/doc/howto/usage/k8s/k8s_en.html)
- [Example and Demo](http://paddlepaddle.org/doc/demo/) <br>
+   You can also run distributed training jobs on Kubernetes clusters.
-   We provide five demos, including: image classification, sentiment analysis,
-   sequence to sequence model, recommendation, semantic role labeling.
- [Distributed Training](http://paddlepaddle.org/doc/cluster) <br>
+- [Python API](http://www.paddlepaddle.org/develop/doc/api/index_en.html)
-  This system supports training deep learning models on multiple machines
-  with data parallelism.
- [Python API](http://paddlepaddle.org/doc/ui/) <br>
+   Our new API enables much shorter programs.
-   PaddlePaddle supports using either Python interface or C++ to build your
-   system. We also use SWIG to wrap C++ source code to create a user friendly
-   interface for Python. You can also use SWIG to create interface for your
-   favorite programming language.
- [How to Contribute](http://paddlepaddle.org/doc/build/contribute_to_paddle.html) <br>
+- [How to Contribute](http://www.paddlepaddle.org/develop/doc/howto/dev/contribute_to_paddle_en.html)
-   We sincerely appreciate your interest and contributions. If you would like to
-   contribute, please read the contribution guide.
- [Source Code Documents](http://paddlepaddle.org/doc/source/) <br>
+   We appreciate your contributions!
 ## Ask Questions

--- a/cmake/coveralls.cmake
+++ b/cmake/coveralls.cmake
@@ -61,7 +61,7 @@ function(code_coverage _COVERAGE_SRCS _COVERALLS_UPLOAD _CMAKE_SCRIPT_PATH)
    endif()
 endfunction()
-if(ON_COVERALLS)
+if(WITH_COVERAGE)
    set(CMAKE_BUILD_TYPE "Debug")
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")

--- a/cmake/coverallsGcovJsons.cmake
+++ b/cmake/coverallsGcovJsons.cmake
@@ -134,7 +134,7 @@ foreach(GCDA ${GCDA_FILES})
 	# If -p is not specified then the file is named only "the_file.c.gcov"
 	#
 	execute_process(
-		COMMAND "${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA} >/dev/null"
+		COMMAND ${GCOV_EXECUTABLE} -p -o ${GCDA_DIR} ${GCDA}
 		WORKING_DIRECTORY ${GCDA_DIR}
+		# execute_process runs without a shell, so stdout is discarded
+		# with OUTPUT_QUIET rather than a ">/dev/null" argument.
+		OUTPUT_QUIET
 	)
 endforeach()

--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -45,7 +45,7 @@ IF(NOT ${CBLAS_FOUND})
        PREFIX              ${CBLAS_SOURCES_DIR}
        INSTALL_DIR         ${CBLAS_INSTALL_DIR}
        BUILD_IN_SOURCE     1
-        BUILD_COMMAND       ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} NO_SHARED=1 libs netlib
+        BUILD_COMMAND       ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} DYNAMIC_ARCH=1 NO_SHARED=1 libs netlib
        INSTALL_COMMAND     ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 PREFIX=<INSTALL_DIR>
        UPDATE_COMMAND      ""
        CONFIGURE_COMMAND   ""

--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -14,7 +14,8 @@
 INCLUDE(ExternalProject)
-FIND_PACKAGE(Protobuf 3.1)
+set(PROTOBUF_VERSION 3.1)
+FIND_PACKAGE(Protobuf ${PROTOBUF_VERSION})
 IF(PROTOBUF_FOUND)
    EXEC_PROGRAM(${PROTOBUF_PROTOC_EXECUTABLE} ARGS --version OUTPUT_VARIABLE PROTOBUF_VERSION)

--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -90,26 +90,6 @@ function(link_paddle_exe TARGET_NAME)
        ${RDMA_LD_FLAGS}
        ${RDMA_LIBS})
-    if(WITH_PYTHON)
-        target_link_libraries(${TARGET_NAME}
-            ${PYTHON_LIBRARIES} util)
-    endif()
-    if(WITH_GPU)
-        target_link_libraries(${TARGET_NAME} ${CUDA_CUDART_LIBRARY})
-        if(NOT WITH_DSO OR WITH_METRIC)
-            target_link_libraries(${TARGET_NAME}
-                ${CUDNN_LIBRARY}
-                ${CUDA_curand_LIBRARY})
-            CUDA_ADD_CUBLAS_TO_TARGET(${TARGET_NAME})
-        endif()
-        check_library_exists(rt clock_gettime "time.h" HAVE_CLOCK_GETTIME )
-        if(HAVE_CLOCK_GETTIME)
-            target_link_libraries(${TARGET_NAME} rt)
-        endif()
-    endif()
    add_dependencies(${TARGET_NAME} ${external_project_dependencies})
 endfunction()

--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -109,6 +109,12 @@ sum_to_one_norm
    :members: sum_to_one_norm
    :noindex:
+cross_channel_norm
+------------------
+..  automodule:: paddle.v2.layer
+    :members: cross_channel_norm
+    :noindex:
 Recurrent Layers
 ================

--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ b/doc/getstarted/build_and_install/build_from_source_en.md
@@ -51,7 +51,7 @@ PaddlePaddle supports some build options.
 <tr><td class="left">WITH_TIMER</td><td class="left">Compile PaddlePaddle with stats timer</td></tr>
 <tr><td class="left">WITH_PROFILER</td><td class="left">Compile PaddlePaddle with GPU profiler</td></tr>
 <tr><td class="left">WITH_DOC</td><td class="left">Compile PaddlePaddle with documentation</td></tr>
-<tr><td class="left">ON_COVERALLS</td><td class="left">Compile PaddlePaddle with code coverage</td></tr>
+<tr><td class="left">WITH_COVERAGE</td><td class="left">Compile PaddlePaddle with code coverage</td></tr>
 <tr><td class="left">COVERALLS_UPLOAD</td><td class="left">Package code coverage data to coveralls</td></tr>
 <tr><td class="left">ON_TRAVIS</td><td class="left">Exclude special unit test on Travis CI</td></tr>
 </tbody>

--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
-FUNCTION(generate_python_api target_name)
-    ADD_CUSTOM_COMMAND(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
-                              ${PROJ_ROOT}/paddle/Paddle_wrap.cxx
-                              ${PROJ_ROOT}/paddle/Paddle_wrap.h
-        COMMAND ${SWIG_EXECUTABLE} -python -c++ -outcurrentdir -I../ api/Paddle.swig
-                && mv ${PROJ_ROOT}/paddle/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
-        DEPENDS ${PROJ_ROOT}/paddle/api/Paddle.swig
-                ${PROJ_ROOT}/paddle/api/PaddleAPI.h
-                ${external_project_dependencies}
-        WORKING_DIRECTORY ${PROJ_ROOT}/paddle
-        COMMENT "Generate Python API from swig")
-    ADD_CUSTOM_TARGET(${target_name} ALL DEPENDS
-                ${PROJ_ROOT}/paddle/Paddle_wrap.cxx
-                ${PROJ_ROOT}/paddle/Paddle_wrap.h
-                ${PROJ_ROOT}/paddle/py_paddle/swig_paddle.py
-                ${external_project_dependencies})
-ENDFUNCTION(generate_python_api)
 set(API_SOURCES
    Arguments.cpp
    ConfigParser.cpp
@@ -33,65 +15,86 @@ set(API_HEADER
    PaddleAPI.h
    Internal.h)
-add_library(paddle_api STATIC
+add_library(paddle_api STATIC ${API_SOURCES})
-        ${API_SOURCES})
 add_dependencies(paddle_api gen_proto_cpp)
-list(LENGTH "${GFLAGS_LIBRARIES}" GFLAGS_LIBRARIES_LENGTH)
+INCLUDE(${SWIG_USE_FILE})
+INCLUDE_DIRECTORIES(${PROJ_ROOT}/paddle)
-if(${GFLAGS_LIBRARIES_LENGTH} EQUAL 0 AND TARGET "${GFLAGS_LIBRARIES}")
+FILE(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py)
-# Because gflags compiled by cmake, so it is imported by cmake target,
-# not a real library path. Get the real library path here.
+SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON)
-message(STATUS "GFLAGS Libraries is ${GFLAGS_LIBRARIES}")
-get_target_property(GFLAGS_LOCATION ${GFLAGS_LIBRARIES} LOCATION)
+SET(CMAKE_SWIG_OUTDIR ${CMAKE_CURRENT_BINARY_DIR})
-message(STATUS "GFLAGS Target location is ${GFLAGS_LOCATION}")
+SET(CMAKE_CXX_FLAGS "-std=c++11 -fPIC -Wall")
-else()
+IF(WITH_COVERAGE)
-set(GFLAGS_LOCATION ${GFLAGS_LIBRARIES})
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O0 -fprofile-arcs -ftest-coverage")
-endif()
+ENDIF(WITH_COVERAGE)
-configure_file(
+SET(SWIG_MODULE_swig_paddle_EXTRA_DEPS
-    paddle_api_config.py.in
+    paddle_parameter
-    ${PROJ_ROOT}/paddle/api/paddle_api_config.py
+    paddle_function
+    paddle_math
+    paddle_utils
+    paddle_gserver
+    paddle_pserver
+    paddle_api
+    paddle_cuda
+    paddle_trainer_lib
+    paddle_network
+    paddle_proto
+    ${external_project_dependencies}
 )
-generate_python_api(python_swig_sources)
+IF(APPLE)
+    SET(MACOS_LD_FLAGS "-undefined dynamic_lookup -Wl,-all_load")
+ELSE(APPLE)
+    SET(START_GROUP "-Xlinker -start-group")
+    SET(END_GROUP "-Xlinker -end-group")
+    SET(ARCHIVE_START "-Wl,--whole-archive")
+    SET(ARCHIVE_END "-Wl,--no-whole-archive")
+ENDIF(APPLE)
-file(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py)
+SWIG_ADD_MODULE(swig_paddle python Paddle.i)
+# Link the SWIG module against the Paddle libraries. paddle_gserver,
+# paddle_function and METRIC_LIBS sit between ARCHIVE_START/ARCHIVE_END;
+# the whole list is bracketed by START_GROUP/END_GROUP (both are only set
+# on non-Apple platforms and expand to nothing otherwise).
+SWIG_LINK_LIBRARIES(swig_paddle
+    ${MACOS_LD_FLAGS}
+    ${START_GROUP}
+    ${ARCHIVE_START}
+    paddle_gserver
+    paddle_function
+    ${METRIC_LIBS}
+    ${ARCHIVE_END}
+    paddle_pserver
+    paddle_trainer_lib
+    paddle_network
+    paddle_parameter
+    paddle_math
+    paddle_utils
+    paddle_proto
+    paddle_cuda
+    paddle_api
+    ${CMAKE_DL_LIBS}
+    ${EXTERNAL_LIBS}
+    ${CMAKE_THREAD_LIBS_INIT}
+    ${RDMA_LD_FLAGS}
+    ${RDMA_LIBS}
+    ${END_GROUP}
+)
-# TODO(yuyang18) : make wheel name calculated by cmake
+add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so
-add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/dist/.timestamp
+    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle
+    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PROJ_ROOT}/paddle/py_paddle
    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
    COMMAND ${CMAKE_COMMAND} -E touch dist/.timestamp
    COMMAND rm -rf py_paddle.egg-info build
    WORKING_DIRECTORY ${PROJ_ROOT}/paddle
-    DEPENDS python_swig_sources
+    DEPENDS _swig_paddle
-            paddle_parameter
-            paddle_function
-            paddle_math
-            paddle_utils
-            paddle_gserver
-            paddle_pserver
-            paddle_trainer
-            paddle_api
-            paddle_cuda
-        ${PY_PADDLE_PYTHON_FILES}
 )
-install(DIRECTORY ${PROJ_ROOT}/paddle/dist/
+# TODO(yuyang18) : make wheel name calculated by cmake
-    DESTINATION opt/paddle/share/wheels
+add_custom_target(python_api_wheel ALL DEPENDS ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so)
-)
-add_custom_target(python_api_wheel ALL DEPENDS
+install(DIRECTORY ${PROJ_ROOT}/paddle/dist/ DESTINATION opt/paddle/share/wheels)
-  ${PROJ_ROOT}/paddle/dist/.timestamp)
-add_dependencies(python_api_wheel python_swig_sources
-  paddle_parameter
-  paddle_math
-  paddle_utils
-  paddle_gserver
-  paddle_pserver
-  paddle_trainer
-  paddle_api
-  paddle_cuda)
 if(WITH_TESTING)
    IF(NOT PY_PIP_FOUND)

--- a/paddle/api/Paddle.swig
+++ b/paddle/api/Paddle.swig
--- a/paddle/api/paddle_api_config.py.in
+++ b/paddle/api/paddle_api_config.py.in
-PADDLE_BUILD_DIR="@CMAKE_CURRENT_BINARY_DIR@/../"
-WITH_GPU="@WITH_GPU@"
-PROTOBUF_LIBRARY="@PROTOBUF_LIBRARY@"
-ZLIB_LIBRARIES="@ZLIB_LIBRARIES@"
-CMAKE_THREAD_LIB="@CMAKE_THREAD_LIBS_INIT@"
-CMAKE_DL_LIBS="@CMAKE_DL_LIBS@"
-WITH_PYTHON="@WITH_PYTHON@"
-PYTHON_LIBRARIES="@PYTHON_LIBRARIES@"
-GLOG_LIBRARIES="@GLOG_LIBRARIES@"
-GFLAGS_LIBRARIES="@GFLAGS_LIBRARIES@"
-GFLAGS_LOCATION="@GFLAGS_LOCATION@"
-CBLAS_LIBRARIES="@CBLAS_LIBRARIES@"
-CUDA_LIBRARIES="@CUDA_CUDART_LIBRARY@"
-WITH_COVERALLS="@ON_COVERALLS@"
--- a/paddle/api/paddle_ld_flags.py
+++ b/paddle/api/paddle_ld_flags.py
-# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-try:
-    from paddle_api_config import *
-    import os.path
-    import platform
-    system = platform.system().lower()
-    is_osx = (system == 'darwin')
-    is_win = (system == 'windows')
-    is_lin = (system == 'linux')
-    if is_lin:
-        whole_start = "-Wl,--whole-archive"
-        whole_end = "-Wl,--no-whole-archive"
-    elif is_osx:
-        whole_start = ""
-        whole_end = ""
-    LIB_DIRS = [
-        "math", 'function', 'utils', 'parameter', "gserver", "api", "cuda",
-        "pserver", "trainer"
-    ]
-    PARENT_LIB_DIRS = ['proto']
-    class PaddleLDFlag(object):
-        def __init__(self):
-            self.paddle_build_dir = PADDLE_BUILD_DIR
-            self.paddle_build_dir = os.path.abspath(self.paddle_build_dir)
-            self.with_gpu = PaddleLDFlag.cmake_bool(WITH_GPU)
-            self.protolib = PROTOBUF_LIBRARY
-            self.zlib = ZLIB_LIBRARIES
-            self.thread = CMAKE_THREAD_LIB
-            self.dl_libs = CMAKE_DL_LIBS
-            self.with_python = PaddleLDFlag.cmake_bool(WITH_PYTHON)
-            self.python_libs = PYTHON_LIBRARIES
-            self.glog_libs = GLOG_LIBRARIES
-            self.with_coverage = PaddleLDFlag.cmake_bool(WITH_COVERALLS)
-            self.gflags_libs = GFLAGS_LIBRARIES
-            self.gflags_location = GFLAGS_LOCATION
-            self.cblas_libs = CBLAS_LIBRARIES
-            self.curt = CUDA_LIBRARIES
-        def ldflag_str(self):
-            return " ".join(
-                [self.libs_dir_str(), self.parent_dir_str(), self.libs_str()])
-        def libs_dir_str(self):
-            libdirs = LIB_DIRS
-            return " ".join(
-                map(lambda x: "-L" + os.path.join(self.paddle_build_dir, x),
-                    libdirs))
-        def parent_dir_str(self):
-            libdirs = PARENT_LIB_DIRS
-            return " ".join(
-                map(lambda x: "-L" + os.path.join(self.paddle_build_dir, '..', x),
-                    libdirs))
-        def libs_str(self):
-            libs = [
-                whole_start,
-                "-lpaddle_gserver",
-                "-lpaddle_function",
-                whole_end,
-                "-lpaddle_pserver",
-                "-lpaddle_trainer_lib",
-                "-lpaddle_network",
-                '-lpaddle_parameter',
-                "-lpaddle_math",
-                '-lpaddle_utils',
-                "-lpaddle_proto",
-                "-lpaddle_cuda",
-                "-lpaddle_api",
-                self.normalize_flag(self.protolib),
-                self.normalize_flag(self.glog_libs),
-                self.normalize_flag(self.gflags_libs),
-                self.normalize_flag(self.zlib),
-                self.normalize_flag(self.thread),
-                self.normalize_flag(self.dl_libs),
-                self.normalize_flag(self.cblas_libs),
-            ]
-            if self.with_python:
-                libs.append(self.normalize_flag(self.python_libs))
-            if self.with_gpu:
-                libs.append(self.normalize_flag(self.curt))
-            if self.with_coverage:
-                libs.append("-fprofile-arcs")
-            return " ".join(filter(lambda l: len(l) != 0, libs))
-        def normalize_flag(self, cmake_flag):
-            """
-            CMake flag string to ld flag
-            :type cmake_flag: str
-            """
-            if ";" in cmake_flag:
-                return " ".join(map(self.normalize_flag, cmake_flag.split(";")))
-            if cmake_flag.startswith("/"):  # is a path
-                return cmake_flag
-            elif cmake_flag.startswith("-l"):  # normal link command
-                return cmake_flag
-            elif cmake_flag in [
-                    "gflags-shared", "gflags-static", "gflags_nothreads-shared",
-                    "gflags_nothreads-static"
-            ]:  # special for gflags
-                assert PaddleLDFlag.cmake_bool(self.gflags_location)
-                return self.gflags_location
-            elif len(cmake_flag) != 0:
-                return "".join(["-l", cmake_flag])
-            else:
-                return ""
-        @staticmethod
-        def cmake_bool(cmake_str):
-            """
-            CMake bool string to bool
-            :param cmake_str: cmake boolean string
-            :type cmake_str: str
-            :rtype: bool
-            """
-            if cmake_str in ["FALSE", "OFF", "NO"] or cmake_str.endswith(
-                    "-NOTFOUND"):
-                return False
-            else:
-                return True
-        def c_flag(self):
-            if self.with_coverage:
-                return [
-                    "-fprofile-arcs", "-ftest-coverage", "-O0", "-g",
-                    "-std=c++11"
-                ]
-            else:
-                return ["-std=c++11"]
-except ImportError:
-    class PaddleLDFlag(object):
-        def ldflag_str(self):
-            pass
-        def c_flag(self):
-            pass
--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
@@ -25,12 +25,16 @@ filter_test(GSERVER_HEADER)
 filter_test(GSERVER_SOURCES)
 if(NOT WITH_GPU)
    list(REMOVE_ITEM GSERVER_HEADER
+        layers/CudnnConvBaseLayer.h
        layers/CudnnConvLayer.h
+        layers/CudnnConvTransLayer.h
        layers/CudnnPoolLayer.h
        layers/CudnnBatchNormLayer.h)
    list(REMOVE_ITEM GSERVER_SOURCES
+        layers/CudnnConvBaseLayer.cpp
        layers/CudnnConvLayer.cpp
+        layers/CudnnConvTransLayer.cpp
        layers/CudnnPoolLayer.cpp
        layers/CudnnBatchNormLayer.cpp)
    compile_cu_as_cpp(layers/LstmCompute.cu)

--- a/paddle/gserver/dataproviders/DataProvider.h
+++ b/paddle/gserver/dataproviders/DataProvider.h
@@ -164,15 +164,6 @@ public:
    argu.value = value;
    data_.push_back(argu);
  }
-  /**
-   * @brief Append user defined data
-   * @param[in]  ptr     user defined data
-   */
-  void appendUserDefinedPtr(UserDefinedVectorPtr ptr) {
-    Argument argu;
-    argu.udp = ptr;
-    data_.push_back(argu);
-  }
  /*
   * @brief Append argument

--- a/paddle/gserver/layers/ConvBaseOperator.cpp
+++ b/paddle/gserver/layers/ConvBaseOperator.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "ConvBaseOperator.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"
+namespace paddle {
+/**
+ * @brief ConvBaseOperator takes two inputs to perform the convolution.
+ * The first input is the image, and the second input is the convolution kernel.
+ * The height of data for two inputs are the same. Each data of the first input
+ * is convolved with each data of the second input independently.
+ *
+ * The config file api is conv_operator.
+ */
+ConvBaseOperator::ConvBaseOperator(const OperatorConfig &config, bool useGpu)
+    : Operator(config, useGpu) {
+  CHECK(useGpu);
+  CHECK_EQ(config_.input_indices_size(), 2L);
+  caffeMode_ = true;
+  getConvParams();
+  computeConvSizes();
+  // initialize all to default algorithms
+  fwdAlgo_ = 0;
+  bwdFilterAlgo_ = 0;
+  bwdDataAlgo_ = 0;
+  fwdLimitBytes_ = 0;
+  bwdDataLimitBytes_ = 0;
+  bwdFilterLimitBytes_ = 0;
+  workSpaceInBytes_ = 0;
+  workSpace_ = nullptr;
+  isSelectAlgo_ = false;
+}
+void ConvBaseOperator::allocConvWorkSpace() {
+  hl_conv_workspace(imageDesc_,
+                    outputDesc_,
+                    filterDesc_,
+                    convDesc_,
+                    &fwdAlgo_,
+                    &fwdLimitBytes_,
+                    &bwdDataAlgo_,
+                    &bwdDataLimitBytes_,
+                    &bwdFilterAlgo_,
+                    &bwdFilterLimitBytes_);
+  size_t maxWorkSpace = 0;
+  maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
+  maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_);
+  if (maxWorkSpace > workSpaceInBytes_) {
+    if (workSpaceInBytes_ != 0) {
+      hl_free_mem_device(workSpace_);
+    }
+    // total amount of storage needed
+    workSpace_ = hl_malloc_device(maxWorkSpace);
+    workSpaceInBytes_ = maxWorkSpace;
+  }
+}
+void ConvBaseOperator::computeConvSizes() {
+  hl_create_filter_descriptor(
+      &filterDesc_, channels_, numFilters_, filterSizeY_, filterSize_);
+  hl_create_tensor_descriptor(&imageDesc_);
+  hl_create_tensor_descriptor(&outputDesc_);
+  hl_create_convolution_descriptor(&convDesc_,
+                                   imageDesc_,
+                                   filterDesc_,
+                                   paddingY_,
+                                   padding_,
+                                   strideY_,
+                                   stride_);
+}
+void ConvBaseOperator::reshapeImageDescriptors() {
+  hl_tensor_reshape(imageDesc_,
+                    1,
+                    channels_,
+                    imageH_,
+                    imageW_,
+                    channels_ * imageH_ * imageW_,
+                    imageH_ * imageW_,
+                    imageW_,
+                    1);
+  hl_tensor_reshape(outputDesc_,
+                    1,
+                    numFilters_,
+                    outputH_,
+                    outputW_,
+                    numFilters_ * outputH_ * outputW_,
+                    outputH_ * outputW_,
+                    outputW_,
+                    1);
+  hl_reset_convolution_descriptor(convDesc_,
+                                  imageDesc_,
+                                  filterDesc_,
+                                  paddingY_,
+                                  padding_,
+                                  strideY_,
+                                  stride_);
+}
+/**
+ * Copy the convolution settings from the operator config into member
+ * variables and derive channels_/numFilters_, which are swapped when the
+ * operator type is anything other than "conv" (i.e. deconvolution).
+ */
+void ConvBaseOperator::getConvParams() {
+  configNumFilters_ = config_.num_filters();
+  const ConvConfig &conf = config_.conv_conf();
+  padding_ = conf.padding();
+  stride_ = conf.stride();
+  filterSize_ = conf.filter_size();
+  paddingY_ = conf.padding_y();
+  strideY_ = conf.stride_y();
+  filterSizeY_ = conf.filter_size_y();
+  filterPixels_ = filterSize_ * filterSizeY_;
+  configChannels_ = conf.channels();
+  imgSize_ = conf.img_size();
+  // Fall back to a square image when img_size_y is not set.
+  imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
+  imgPixels_ = imgSize_ * imgSizeY_;
+  // Only a single group is supported by this operator.
+  CHECK_EQ(conf.groups(), 1U);
+  filterChannels_ = conf.filter_channels();
+  outputX_ = conf.output_x();
+  // Fall back to a square output when output_y is not set.
+  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
+  // Use both dimensions; identical to the previous outputX_ * outputX_
+  // whenever the output is square.
+  outputs_ = outputX_ * outputY_;
+  isDeconv_ = (config_.type() != "conv");
+  if (isDeconv_) {
+    // For deconvolution the roles of input channels and filters are swapped.
+    channels_ = configNumFilters_;
+    numFilters_ = configChannels_;
+  } else {
+    channels_ = configChannels_;
+    numFilters_ = configNumFilters_;
+  }
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/ConvBaseOperator.h
+++ b/paddle/gserver/layers/ConvBaseOperator.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "Operator.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"
+namespace paddle {
+/**
+ * @brief ConvBaseOperator takes two inputs to perform the convolution.
+ * The first input is the image, and the second input is the convolution kernel.
+ * The height of data for two inputs are the same. Each data of the first input
+ * is convolved with each data of the second input independently.
+ *
+ * The config file api is conv_operator.
+ */
+class ConvBaseOperator : public Operator {
+public:
+  ConvBaseOperator(const OperatorConfig &config, bool useGpu);
+  /**
+   * Free workspace in device and destroy cudnn tensor descriptor.
+   */
+  virtual ~ConvBaseOperator() {
+    if (workSpaceInBytes_ != 0) {
+      hl_free_mem_device(workSpace_);
+      workSpaceInBytes_ = 0;
+    }
+    hl_destroy_tensor_descriptor(imageDesc_);
+    hl_destroy_tensor_descriptor(outputDesc_);
+    hl_destroy_filter_descriptor(filterDesc_);
+    hl_destroy_convolution_descriptor(convDesc_);
+  }
+protected:
+  /**
+   * Get convolution parameters from layer config and
+   * initialize member variables.
+   */
+  void getConvParams();
+  /**
+   * Allocate Gpu Memory for cudnn convolution algorithms.
+   */
+  void allocConvWorkSpace();
+  /**
+   * Create cudnn tensor descriptor for convolution operation.
+   */
+  void computeConvSizes();
+  /**
+   * Reshape cudnn tensor descriptor.
+   */
+  void reshapeImageDescriptors();
+  /**
+   * Reshape for the given batch size; pure virtual, implemented by subclasses.
+   */
+  virtual void reshape(int batchSize) = 0;
+  /**
+   * Check filter size is equal to the size calculated by parameters from
+   * layer config.
+   */
+  void checkFilterSize(const MatrixPtr &filter) {
+    CHECK_EQ(static_cast<int>(filter->getWidth()),
+             filterSize_ * filterSizeY_ * channels_ * numFilters_);
+  }
+  /// Most of member variables are same with CudnnConvLayer.
+  /// There is no explanation here.
+  bool isDeconv_;
+  int imageH_, imageW_, outputH_, outputW_;
+  hl_tensor_descriptor imageDesc_;
+  hl_tensor_descriptor outputDesc_;
+  hl_filter_descriptor filterDesc_;
+  hl_convolution_descriptor convDesc_;
+  bool caffeMode_;
+  int inputOffset_, outputOffset_, weightOffset_;
+  int numFilters_, channels_;
+  /// from parsing config
+  int configNumFilters_, configChannels_;
+  int padding_, stride_, filterSize_, imgSize_, imgSizeY_;
+  int paddingY_, strideY_, filterSizeY_;
+  int imgPixels_, filterPixels_, filterChannels_, outputX_, outputY_, outputs_;
+  /// Following member variables are same with CudnnConvLayer.
+  /// There is no explanation here.
+  int fwdAlgo_, bwdFilterAlgo_, bwdDataAlgo_;
+  size_t fwdLimitBytes_, bwdDataLimitBytes_, bwdFilterLimitBytes_;
+  size_t workSpaceInBytes_;
+  void *workSpace_;
+  bool isSelectAlgo_;
+};
+}  // namespace paddle
--- a/paddle/gserver/layers/ConvBaseProjection.cpp
+++ b/paddle/gserver/layers/ConvBaseProjection.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "ConvBaseProjection.h"
+#include "paddle/utils/Stat.h"
+namespace paddle {
+// Storage for the static workspace-handle cache declared in
+// ConvBaseProjection; getSpaceBytes() indexes it by device id.
+ThreadLocalD<std::vector<MemoryHandle *>> ConvBaseProjection::convMem_;
+ConvBaseProjection::ConvBaseProjection(const ProjectionConfig &config,
+                                       ParameterPtr parameter,
+                                       bool useGpu)
+    : Projection(config, parameter, useGpu) {
+  // This projection is implemented on top of cudnn only, so a CPU
+  // instantiation is rejected right away.
+  CHECK(useGpu);
+  // Read the convolution geometry first; the descriptors and the weight
+  // shape below are both derived from it.
+  getConvParams();
+  initCudnn();
+  // The weight is stored as a (filterH * filterW * channels / groups) x
+  // numFilters matrix backed by the projection parameter.
+  const size_t weightHeight = filterH_ * filterW_ * channels_ / groups_;
+  const size_t weightWidth = numFilters_;
+  weight_.reset(new Weight(weightHeight, weightWidth, parameter));
+  // Number of weight elements belonging to one group.
+  weightOffset_ = weightHeight * weightWidth / groups_;
+}
+void ConvBaseProjection::getConvParams() {
+  const ConvConfig &convConf = config_.conv_conf();
+  // Settings stored in the plain (non-"_y") config fields go to the *W_
+  // members.
+  filterW_ = convConf.filter_size();
+  paddingW_ = convConf.padding();
+  strideW_ = convConf.stride();
+  configImgW_ = convConf.img_size();
+  configOutW_ = convConf.output_x();
+  // Settings stored in the "_y" config fields go to the *H_ members. The
+  // image and output heights fall back to the width value when the config
+  // carries no explicit y field.
+  filterH_ = convConf.filter_size_y();
+  paddingH_ = convConf.padding_y();
+  strideH_ = convConf.stride_y();
+  if (convConf.has_img_size_y()) {
+    configImgH_ = convConf.img_size_y();
+  } else {
+    configImgH_ = convConf.img_size();
+  }
+  if (convConf.has_output_y()) {
+    configOutH_ = convConf.output_y();
+  } else {
+    configOutH_ = convConf.output_x();
+  }
+  configChannels_ = convConf.channels();
+  configNumFilters_ = config_.num_filters();
+  // Every projection type other than "conv" is handled as a deconvolution,
+  // in which case the roles of channels and filters are exchanged.
+  isDeconv_ = (config_.type() != "conv");
+  if (isDeconv_) {
+    channels_ = configNumFilters_;
+    numFilters_ = configChannels_;
+  } else {
+    channels_ = configChannels_;
+    numFilters_ = configNumFilters_;
+  }
+  // Both channel counts must split evenly across the groups.
+  groups_ = convConf.groups();
+  CHECK_EQ(channels_ % groups_, 0);
+  CHECK_EQ(numFilters_ % groups_, 0);
+}
+void ConvBaseProjection::initCudnn() {
+  // Bookkeeping starts out at the default algorithms with no workspace
+  // requirement and no batch seen yet; reshape() fills in the real values.
+  fwdAlgo_ = bwdFilterAlgo_ = bwdDataAlgo_ = 0;
+  fwdLimitBytes_ = bwdDataLimitBytes_ = bwdFilterLimitBytes_ = 0;
+  workSpaceInBytes_ = 0;
+  batchNum_ = 0;
+  isSelectAlgo_ = false;
+  // The filter descriptor is sized per group.
+  const int channelsPerGroup = channels_ / groups_;
+  const int filtersPerGroup = numFilters_ / groups_;
+  hl_create_filter_descriptor(
+      &filterDesc_, channelsPerGroup, filtersPerGroup, filterH_, filterW_);
+  // The tensor descriptors are only created here; their shapes are set in
+  // reshapeTensorDesc().
+  hl_create_tensor_descriptor(&imageDesc_);
+  hl_create_tensor_descriptor(&outputDesc_);
+  // The convolution descriptor is built from the image and filter
+  // descriptors, so it comes last.
+  hl_create_convolution_descriptor(
+      &convDesc_, imageDesc_, filterDesc_, paddingH_, paddingW_, strideH_,
+      strideW_);
+}
+void ConvBaseProjection::reshapeTensorDesc(int batchSize) {
+  // The distance between two consecutive samples in the projection output is
+  // not necessarily numFilters_ * outputH_ * outputW_ (conv) or
+  // channels_ * imageH_ * imageW_ (deconv). With a ConcatenateLayer2 built
+  // from two ConvProjections, for instance, it is the output size of the
+  // ConcatenateLayer2. The sample stride of the side that maps to out_ is
+  // therefore taken from the output matrix, unlike in CudnnConvLayer.
+  const size_t outMatrixStride = out_->value->getStride();
+  // For deconv the "image" descriptor describes out_; for conv the "output"
+  // descriptor does. The other side uses its dense size.
+  const size_t imageSampleStride =
+      isDeconv_ ? outMatrixStride
+                : static_cast<size_t>(channels_ * imageH_ * imageW_);
+  const size_t outputSampleStride =
+      isDeconv_ ? static_cast<size_t>(numFilters_ * outputH_ * outputW_)
+                : outMatrixStride;
+  const int imagePlane = imageH_ * imageW_;
+  const int outputPlane = outputH_ * outputW_;
+  hl_tensor_reshape(imageDesc_,
+                    batchSize,
+                    channels_ / groups_,
+                    imageH_,
+                    imageW_,
+                    imageSampleStride,
+                    imagePlane,
+                    imageW_,
+                    1);
+  hl_tensor_reshape(outputDesc_,
+                    batchSize,
+                    numFilters_ / groups_,
+                    outputH_,
+                    outputW_,
+                    outputSampleStride,
+                    outputPlane,
+                    outputW_,
+                    1);
+  // Rebind the convolution descriptor to the reshaped image descriptor.
+  hl_reset_convolution_descriptor(
+      convDesc_, imageDesc_, filterDesc_, paddingH_, paddingW_, strideH_,
+      strideW_);
+}
+void ConvBaseProjection::reshape(int batchSize) {
+  // calOutputSize() runs first; calInputSize() is only evaluated after it.
+  const size_t expectedOutWidth = calOutputSize();
+  CHECK_EQ(expectedOutWidth, out_->value->getWidth());
+  CHECK_EQ(calInputSize(), in_->value->getWidth());
+  // The previous selection is reused as long as the batch size is unchanged.
+  // NOTE(review): re-selection is keyed on the batch size only; a change of
+  // image size at the same batch size does not reach reshapeTensorDesc() --
+  // confirm this is intended.
+  isSelectAlgo_ = (batchNum_ == batchSize);
+  batchNum_ = batchSize;
+  if (isSelectAlgo_) {
+    return;
+  }
+  reshapeTensorDesc(batchSize);
+  hl_conv_workspace(imageDesc_,
+                    outputDesc_,
+                    filterDesc_,
+                    convDesc_,
+                    &fwdAlgo_,
+                    &fwdLimitBytes_,
+                    &bwdDataAlgo_,
+                    &bwdDataLimitBytes_,
+                    &bwdFilterAlgo_,
+                    &bwdFilterLimitBytes_);
+  // One workspace serves all three passes, so it has to fit the largest
+  // of the three requirements.
+  workSpaceInBytes_ = std::max(std::max(fwdLimitBytes_, bwdDataLimitBytes_),
+                               bwdFilterLimitBytes_);
+  VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_
+          << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_;
+  isSelectAlgo_ = true;
+}
+/**
+ * Return the buffer of the workspace handle cached for the current device.
+ *
+ * The cache (convMem_) holds one slot per device and is sized on first use.
+ * A new GpuMemoryHandle of `size` bytes is allocated when the slot is still
+ * empty or when the cached handle's allocated size is smaller than `size`.
+ * In the latter case the outgrown handle is released; previously it was
+ * simply overwritten, which leaked it on every growth.
+ *
+ * @param size number of bytes requested.
+ * @return pointer obtained from getBuf() of the cached handle.
+ *
+ * NOTE(review): a pointer returned by an earlier call becomes invalid once
+ * the slot grows. Callers are presumed to fetch the pointer anew for every
+ * use -- confirm against ConvProjection / ConvTransProjection.
+ */
+void *ConvBaseProjection::getSpaceBytes(size_t size) {
+  std::vector<MemoryHandle *> &convMem = *convMem_;
+  if (convMem.empty()) {
+    int numDevices = hl_get_device_count();
+    convMem.resize(numDevices);
+  }
+  int devId = hl_get_device();
+  MemoryHandle *&localMem = convMem[devId];
+  if (NULL == localMem || size > localMem->getAllocSize()) {
+    // Allocate the replacement first so the slot never dangles if the
+    // allocation fails.
+    MemoryHandle *grown = new GpuMemoryHandle(size);
+    // Every handle stored in this cache is created right here as a
+    // GpuMemoryHandle; deleting through the concrete type avoids depending on
+    // the accessibility of the base-class destructor. Deleting NULL is a
+    // no-op.
+    delete static_cast<GpuMemoryHandle *>(localMem);
+    localMem = grown;
+  }
+  return localMem->getBuf();
+}
+// Destroy the descriptors created in initCudnn(). The workspace handles kept
+// in the static convMem_ cache are not touched here.
+ConvBaseProjection::~ConvBaseProjection() {
+  hl_destroy_tensor_descriptor(imageDesc_);
+  hl_destroy_tensor_descriptor(outputDesc_);
+  hl_destroy_filter_descriptor(filterDesc_);
+  hl_destroy_convolution_descriptor(convDesc_);
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/ConvBaseProjection.h
+++ b/paddle/gserver/layers/ConvBaseProjection.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "Projection.h"
+#include "paddle/math/MathUtils.h"
+namespace paddle {
+/**
+ * @brief Base class for ConvProjection and ConvTransProjection.
+ *
+ * It holds the convolution geometry read from the projection config, the
+ * cudnn descriptors, and the bookkeeping for algorithm selection and
+ * workspace size. Subclasses supply calOutputSize() and calInputSize().
+ */
+class ConvBaseProjection : public Projection {
+public:
+  /**
+   * Constructor. CHECK-fails unless useGpu is true, then reads the config,
+   * creates the cudnn descriptors and creates the weight.
+   */
+  ConvBaseProjection(const ProjectionConfig& config,
+                     ParameterPtr parameter,
+                     bool useGpu);
+  /**
+   * Destroys the cudnn descriptors.
+   */
+  ~ConvBaseProjection();
+protected:
+  /**
+   * Read the convolution settings from the projection config into the
+   * member variables below.
+   */
+  void getConvParams();
+  /**
+   * Create the cudnn descriptors and reset the algorithm / workspace
+   * bookkeeping to its defaults.
+   */
+  void initCudnn();
+  /**
+   * Reshape the image and output tensor descriptors for batchSize and reset
+   * the convolution descriptor.
+   */
+  void reshapeTensorDesc(int batchSize);
+  /**
+   * Check the input and output widths against calInputSize() and
+   * calOutputSize(). When batchSize differs from the previously seen one,
+   * reshape the descriptors and query the algorithms and workspace sizes
+   * again.
+   */
+  void reshape(int batchSize);
+  /// Expected width of the output matrix; implemented by the subclass.
+  virtual size_t calOutputSize() = 0;
+  /// Expected width of the input matrix; implemented by the subclass.
+  virtual size_t calInputSize() = 0;
+  /**
+   * Return the buffer of the workspace handle cached for the current
+   * device, allocating a GpuMemoryHandle of size bytes when there is none
+   * yet or the cached one is smaller than size.
+   */
+  static void* getSpaceBytes(size_t size);
+  /// True if it's deconv projection layer, false if it's ConvProjection layer
+  bool isDeconv_;
+  /// imageH_ and imageW_ / outputH_ and outputW_
+  /// are calculated from the input layer.
+  int imageH_, imageW_;
+  int outputH_, outputW_;
+  /// configImgH_ and configImgW_ / configOutH_ and configOutW_
+  /// are obtained from config.
+  int configImgH_, configImgW_;
+  int configOutH_, configOutW_;
+  /// channels_ and numFilters_ are defined in terms of convolution semantics
+  int channels_, numFilters_;
+  /// configChannels_ and configNumFilters_ are obtained from config.
+  /// For Conv they are the same as channels_ and numFilters_.
+  /// For ConvTrans they are opposite to channels_ and numFilters_.
+  int configChannels_, configNumFilters_;
+  int paddingH_, paddingW_;
+  int strideH_, strideW_;
+  int filterH_, filterW_;
+  /// One group offset of input data.
+  int inputOffset_;
+  /// One group offset of output data.
+  int outputOffset_;
+  /// One group offset of weight.
+  int weightOffset_;
+  int groups_;
+  /// Cudnn tensor descriptor for input.
+  hl_tensor_descriptor imageDesc_;
+  /// Cudnn tensor descriptor for output.
+  hl_tensor_descriptor outputDesc_;
+  /// Cudnn filter descriptor.
+  hl_filter_descriptor filterDesc_;
+  /// Cudnn descriptor for the convolution operation.
+  hl_convolution_descriptor convDesc_;
+  /// Record the algorithm for forward convolution, which is obtained by cudnn
+  /// api to search the best suited algorithm.
+  int fwdAlgo_;
+  /// Record the algorithm for computing convolution gradient with respect to
+  /// filter coefficients.
+  int bwdFilterAlgo_;
+  /// Record the algorithm for computing convolution gradient with respect to
+  /// the output.
+  int bwdDataAlgo_;
+  /// Amount of GPU memory needed as workspace to be able to execute a
+  /// forward convolution with the specified algo.
+  size_t fwdLimitBytes_;
+  /// Amount of GPU memory needed as workspace to be able to execute a
+  /// backwardData with the specified algo.
+  size_t bwdDataLimitBytes_;
+  /// Amount of GPU memory needed as workspace to be able to execute a
+  /// backwardFilter with the specified algo.
+  size_t bwdFilterLimitBytes_;
+  /// Size of total work space.
+  size_t workSpaceInBytes_;
+  /// True once the conv algorithms have been selected for the current batch
+  /// size; reshape() selects again only when the batch size has changed.
+  bool isSelectAlgo_;
+  /// batchNum is used to record batch size. If the batch size is changed,
+  /// the selection algorithm will be called.
+  int batchNum_;
+  /// NOTE(review): not assigned anywhere in the visible ConvBaseProjection
+  /// code -- confirm it is still needed.
+  bool bias_;
+  /// Weight created in the constructor from the projection parameter.
+  std::unique_ptr<Weight> weight_;
+  /// Workspace handles, indexed by device id (see getSpaceBytes()).
+  static ThreadLocalD<std::vector<MemoryHandle*>> convMem_;
+};
+}  // namespace paddle
--- a/paddle/gserver/layers/ConvOperator.cpp
+++ b/paddle/gserver/layers/ConvOperator.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "Operator.h"
+#include "ConvOperator.h"
 #include "paddle/math/MathUtils.h"
 #include "paddle/math/Matrix.h"
@@ -27,120 +27,8 @@ namespace paddle {
 * The config file api is conv_operator.
 */
-class ConvOperator : public Operator {
-public:
-  ConvOperator(const OperatorConfig &config, bool useGpu);
-  /**
-   * Free workspace in device and destroy cudnn tensor descriptor.
-   */
-  virtual ~ConvOperator() {
-    if (workSpaceInBytes_ != 0) {
-      hl_free_mem_device(workSpace_);
-      workSpaceInBytes_ = 0;
-    }
-    hl_destroy_tensor_descriptor(inputDesc_);
-    hl_destroy_tensor_descriptor(outputDesc_);
-    hl_destroy_filter_descriptor(filterDesc_);
-    hl_destroy_convolution_descriptor(convDesc_);
-  }
-  virtual void forward();
-  virtual void backward();
-private:
-  /**
-   * Get convolution parameters from layer config and
-   * initialize member variables.
-   */
-  void getConvParams();
-  /**
-   * Allocate Gpu Memory for cudnn convolution algorithms.
-   */
-  void allocConvWorkSpace(size_t maxWorkSpace);
-  /**
-   * Create cudnn tensor descriptor for convolution operation.
-   */
-  void computeConvSizes();
-  /**
-   * Reshape cudnn tensor descriptor.
-   */
-  void reshapeImageDescriptors();
-  /**
-   * Reshape cudnn tensor descriptor.
-   */
-  void reshape(int batchSize);
-  /**
-   * Check filter size is equal to the size calculated by parameters from
-   * layer config.
-   */
-  void checkFilterSize(const MatrixPtr &filter) {
-    CHECK_EQ(static_cast<int>(filter->getWidth()),
-             filterSize_ * filterSizeY_ * channels_ * numFilters_);
-  }
-  /// Most of member variables are same with CudnnConvLayer.
-  /// There is no explanation here.
-  int imageH_, imageW_, outputH_, outputW_;
-  hl_tensor_descriptor inputDesc_;
-  hl_tensor_descriptor outputDesc_;
-  hl_filter_descriptor filterDesc_;
-  hl_convolution_descriptor convDesc_;
-  bool caffeMode_;
-  int inputOffset_, outputOffset_, weightOffset_;
-  int numFilters_;
-  int padding_, stride_, filterSize_, channels_, imgSize_, imgSizeY_;
-  int paddingY_, strideY_, filterSizeY_;
-  int imgPixels_, filterPixels_, filterChannels_, outputX_, outputY_, outputs_;
-  /// Following member variables are same with CudnnConvLayer.
-  /// There is no explanation here.
-  int fwdAlgo_, bwdFilterAlgo_, bwdDataAlgo_;
-  size_t fwdLimitBytes_, bwdDataLimitBytes_, bwdFilterLimitBytes_;
-  size_t workSpaceInBytes_;
-  void *workSpace_;
-  bool isSelectAlgo_;
-};
 REGISTER_OPERATOR(conv, ConvOperator);
-ConvOperator::ConvOperator(const OperatorConfig &config, bool useGpu)
-    : Operator(config, useGpu) {
-  CHECK(useGpu);
-  CHECK_EQ(config_.input_indices_size(), 2L);
-  caffeMode_ = true;
-  getConvParams();
-  computeConvSizes();
-  // initialize all to default algorithms
-  fwdAlgo_ = 0;
-  bwdFilterAlgo_ = 0;
-  bwdDataAlgo_ = 0;
-  fwdLimitBytes_ = 0;
-  bwdDataLimitBytes_ = 0;
-  bwdFilterLimitBytes_ = 0;
-  workSpaceInBytes_ = 0;
-  workSpace_ = nullptr;
-  isSelectAlgo_ = false;
-}
-void ConvOperator::allocConvWorkSpace(size_t maxWorkSpace) {
-  if (maxWorkSpace > workSpaceInBytes_) {
-    if (workSpaceInBytes_ != 0) {
-      hl_free_mem_device(workSpace_);
-    }
-    // total amount of storage needed
-    workSpace_ = hl_malloc_device(maxWorkSpace);
-    workSpaceInBytes_ = maxWorkSpace;
-  }
-}
 void ConvOperator::reshape(int batchSize) {
  imageH_ = ins_[0]->getFrameHeight();
  imageW_ = ins_[0]->getFrameWidth();
@@ -148,106 +36,25 @@ void ConvOperator::reshape(int batchSize) {
  if (imageW_ == 0) imageW_ = imgSize_;
  outputH_ = outputSize(imageH_, filterSizeY_, paddingY_, strideY_, caffeMode_);
  outputW_ = outputSize(imageW_, filterSize_, padding_, stride_, caffeMode_);
+  /// Check that the outputSizes are consistent with config
+  CHECK_EQ(outputH_, outputY_);
+  CHECK_EQ(outputW_, outputX_);
  out_->setFrameHeight(outputH_);
  out_->setFrameWidth(outputW_);
  reshapeImageDescriptors();
-  if (!isSelectAlgo_) {
+  inputOffset_ = channels_ * imageH_ * imageW_;
-    hl_conv_workspace(inputDesc_,
+  outputOffset_ = numFilters_ * outputH_ * outputW_;
-                      outputDesc_,
+  weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSizeY_;
-                      filterDesc_,
-                      convDesc_,
-                      &fwdAlgo_,
-                      &fwdLimitBytes_,
-                      &bwdDataAlgo_,
-                      &bwdDataLimitBytes_,
-                      &bwdFilterAlgo_,
-                      &bwdFilterLimitBytes_);
-    size_t maxWorkSpace = 0;
-    maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
-    maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_);
-    allocConvWorkSpace(maxWorkSpace);
+  if (!isSelectAlgo_) {
+    allocConvWorkSpace();
  }
  isSelectAlgo_ = true;
 }
-void ConvOperator::computeConvSizes() {
-  hl_create_filter_descriptor(
-      &filterDesc_, channels_, numFilters_, filterSizeY_, filterSize_);
-  hl_create_tensor_descriptor(&inputDesc_);
-  int outputX =
-      outputSize(imgSize_, filterSize_, padding_, stride_, caffeMode_);
-  int outputY =
-      outputSize(imgSizeY_, filterSizeY_, paddingY_, strideY_, caffeMode_);
-  CHECK_EQ(outputX, outputX_);
-  CHECK_EQ(outputY, outputY_);
-  hl_create_tensor_descriptor(&outputDesc_);
-  hl_create_convolution_descriptor(&convDesc_,
-                                   inputDesc_,
-                                   filterDesc_,
-                                   paddingY_,
-                                   padding_,
-                                   strideY_,
-                                   stride_);
-}
-void ConvOperator::reshapeImageDescriptors() {
-  hl_tensor_reshape(inputDesc_,
-                    1,
-                    channels_,
-                    imageH_,
-                    imageW_,
-                    channels_ * imageH_ * imageW_,
-                    imageH_ * imageW_,
-                    imageW_,
-                    1);
-  hl_tensor_reshape(outputDesc_,
-                    1,
-                    numFilters_,
-                    outputH_,
-                    outputW_,
-                    numFilters_ * outputH_ * outputW_,
-                    outputH_ * outputW_,
-                    outputW_,
-                    1);
-  hl_reset_convolution_descriptor(convDesc_,
-                                  inputDesc_,
-                                  filterDesc_,
-                                  paddingY_,
-                                  padding_,
-                                  strideY_,
-                                  stride_);
-  inputOffset_ = channels_ * imageH_ * imageW_;
-  outputOffset_ = numFilters_ * outputH_ * outputW_;
-  weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSize_;
-}
-void ConvOperator::getConvParams() {
-  numFilters_ = config_.num_filters();
-  const ConvConfig &conf = config_.conv_conf();
-  padding_ = conf.padding();
-  stride_ = conf.stride();
-  filterSize_ = conf.filter_size();
-  paddingY_ = conf.padding_y();
-  strideY_ = conf.stride_y();
-  filterSizeY_ = conf.filter_size_y();
-  filterPixels_ = filterSize_ * filterSizeY_;
-  channels_ = conf.channels();
-  imgSize_ = conf.img_size();
-  imgSizeY_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  imgPixels_ = imgSize_ * imgSizeY_;
-  CHECK_EQ(conf.groups(), 1U);
-  filterChannels_ = conf.filter_channels();
-  outputX_ = conf.output_x();
-  outputY_ = conf.has_output_y() ? conf.output_y() : conf.output_x();
-  outputs_ = outputX_ * outputX_;
-}
 void ConvOperator::forward() {
  size_t batchSize = ins_[0]->value->getHeight();
  reshape(batchSize);
@@ -264,7 +71,7 @@ void ConvOperator::forward() {
      real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId;
      real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId;
      real *outData = out_->value->getData() + outputOffset_ * batchId;
-      hl_convolution_forward(inputDesc_,
+      hl_convolution_forward(imageDesc_,
                             inputData,
                             outputDesc_,
                             outData,
@@ -287,7 +94,7 @@ void ConvOperator::backward() {
      if (ins_[1]->grad) {
        real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId;
        real *weightGrad = ins_[1]->grad->getData() + weightOffset_ * batchId;
-        hl_convolution_backward_filter(inputDesc_,
+        hl_convolution_backward_filter(imageDesc_,
                                       inputData,
                                       outputDesc_,
                                       outGrad,
@@ -303,7 +110,7 @@ void ConvOperator::backward() {
      if (NULL != preGrad) {
        real *inputGrad = preGrad->getData() + inputOffset_ * batchId;
        real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId;
-        hl_convolution_backward_data(inputDesc_,
+        hl_convolution_backward_data(imageDesc_,
                                     inputGrad,
                                     outputDesc_,
                                     outGrad,

--- a/paddle/gserver/layers/ConvOperator.h
+++ b/paddle/gserver/layers/ConvOperator.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "ConvBaseOperator.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"
+namespace paddle {
+/**
+ * @brief ConvOperator takes two inputs to perform the convolution.
+ * The first input is the image, and the second input is the convolution kernel.
+ * The data of both inputs have the same height. Each data of the first input
+ * is convolved with each data of the second input independently.
+ *
+ * The config file api is conv_operator.
+ */
+class ConvOperator : public ConvBaseOperator {
+public:
+  ConvOperator(const OperatorConfig &config, bool useGpu)
+      : ConvBaseOperator(config, useGpu) {}
+  /**
+   * Empty on purpose: this class declares no data members of its own, so
+   * there is nothing to release here.
+   */
+  virtual ~ConvOperator() {}
+  /// The three overrides below are implemented in ConvOperator.cpp.
+  void forward() override;
+  void backward() override;
+  void reshape(int batchSize) override;
+};
+}  // namespace paddle
--- a/paddle/gserver/layers/ConvProjection.cpp
+++ b/paddle/gserver/layers/ConvProjection.cpp
@@ -19,149 +19,32 @@ namespace paddle {
 REGISTER_PROJECTION(conv, ConvProjection);
-ThreadLocalD<std::vector<MemoryHandle *>> ConvProjection::convMem_;
+size_t ConvProjection::calOutputSize() {
+  imageH_ = in_->getFrameHeight();
-ConvProjection::ConvProjection(const ProjectionConfig &config,
+  imageW_ = in_->getFrameWidth();
-                               ParameterPtr parameter,
+  if (imageH_ == 0) imageH_ = configImgH_;
-                               bool useGpu)
+  if (imageW_ == 0) imageW_ = configImgW_;
-    : Projection(config, parameter, useGpu) {
+  outputH_ = outputSize(imageH_,
-  CHECK(useGpu);  // only support GPU
+                        filterH_,
-  getConvParams();
+                        paddingH_,
-  initCudnn();
+                        strideH_,
+                        /* caffeMode */ true);
-  size_t height = filterH_ * filterW_ * channels_ / groups_;
+  outputW_ = outputSize(imageW_,
-  size_t width = numFilters_;
+                        filterW_,
-  weight_.reset(new Weight(height, width, parameter));
+                        paddingW_,
-  weightOffset_ = height * width / groups_;
+                        strideW_,
-}
+                        /* caffeMode */ true);
-void ConvProjection::getConvParams() {
+  const_cast<Argument *>(out_)->setFrameHeight(outputH_);
-  const ConvConfig &conf = config_.conv_conf();
+  const_cast<Argument *>(out_)->setFrameWidth(outputW_);
-  paddingH_ = conf.padding_y();
-  paddingW_ = conf.padding();
+  inputOffset_ = (configChannels_ / groups_) * imageH_ * imageW_;
+  outputOffset_ = (configNumFilters_ / groups_) * outputH_ * outputW_;
-  strideH_ = conf.stride_y();
+  return outputH_ * outputW_ * configNumFilters_;
-  strideW_ = conf.stride();
-  filterH_ = conf.filter_size_y();
-  filterW_ = conf.filter_size();
-  configImgH_ = conf.has_img_size_y() ? conf.img_size_y() : conf.img_size();
-  configImgW_ = conf.img_size();
-  channels_ = conf.channels();
-  numFilters_ = config_.num_filters();
-  groups_ = conf.groups();
-  CHECK_EQ(channels_ % groups_, 0);
-  CHECK_EQ(numFilters_ % groups_, 0);
-}
-void ConvProjection::initCudnn() {
-  hl_create_filter_descriptor(&filterDesc_,
-                              channels_ / groups_,
-                              numFilters_ / groups_,
-                              filterH_,
-                              filterW_);
-  hl_create_tensor_descriptor(&inputDesc_);
-  hl_create_tensor_descriptor(&outputDesc_);
-  hl_create_convolution_descriptor(&convDesc_,
-                                   inputDesc_,
-                                   filterDesc_,
-                                   paddingH_,
-                                   paddingW_,
-                                   strideH_,
-                                   strideW_);
-  // initialize all to default algorithms
-  fwdAlgo_ = 0;
-  bwdFilterAlgo_ = 0;
-  bwdDataAlgo_ = 0;
-  fwdLimitBytes_ = 0;
-  bwdDataLimitBytes_ = 0;
-  bwdFilterLimitBytes_ = 0;
-  workSpaceInBytes_ = 0;
-  batchNum_ = 0;
-  isSelectAlgo_ = false;
-}
-void ConvProjection::reshapeTensorDesc(int batchSize) {
-  hl_tensor_reshape(inputDesc_,
-                    batchSize,
-                    channels_ / groups_,
-                    imageH_,
-                    imageW_,
-                    channels_ * imageH_ * imageW_,
-                    imageH_ * imageW_,
-                    imageW_,
-                    1);
-  hl_reset_convolution_descriptor(convDesc_,
-                                  inputDesc_,
-                                  filterDesc_,
-                                  paddingH_,
-                                  paddingW_,
-                                  strideH_,
-                                  strideW_);
-  // The stride between two consecutive images in ConvProjection may not be 1,
-  // for example, in the case of layer ConcatenateLayer2 with two
-  // ConvProjection, the stride is the output_size of layer ConcatenateLayer2.
-  // So the calculation of nStride is different from CudnnConvLayer.
-  // In fact, only "nStride = out_->value->getStride()" is ok.
-  size_t nStride = numFilters_ * outputH_ * outputW_;
-  if (out_->value->isContiguous()) {
-    CHECK_EQ(nStride, out_->value->getWidth());
-  } else {
-    nStride = out_->value->getStride();
-  }
-  hl_tensor_reshape(outputDesc_,
-                    batchSize,
-                    numFilters_ / groups_,
-                    outputH_,
-                    outputW_,
-                    nStride,
-                    outputH_ * outputW_,
-                    outputW_,
-                    1);
 }
-void ConvProjection::reshape(int batchSize) {
+size_t ConvProjection::calInputSize() {
-  size_t width = calOutputSize();
+  return static_cast<size_t>(configChannels_ * imageH_ * imageW_);
-  CHECK_EQ(width, out_->value->getWidth());
-  CHECK_EQ(static_cast<size_t>(channels_ * imageH_ * imageW_),
-           in_->value->getWidth())
-      << "Wrong input size for convolution"
-      << " channels=" << channels_ << " imageH=" << imageH_
-      << " imageW=" << imageW_ << " inputSize=" << in_->value->getWidth();
-  isSelectAlgo_ = (batchSize == batchNum_);
-  batchNum_ = batchSize;
-  if (!isSelectAlgo_) {
-    reshapeTensorDesc(batchSize);
-    hl_conv_workspace(inputDesc_,
-                      outputDesc_,
-                      filterDesc_,
-                      convDesc_,
-                      &fwdAlgo_,
-                      &fwdLimitBytes_,
-                      &bwdDataAlgo_,
-                      &bwdDataLimitBytes_,
-                      &bwdFilterAlgo_,
-                      &bwdFilterLimitBytes_);
-    size_t maxWorkSpace = 0;
-    maxWorkSpace = std::max(fwdLimitBytes_, bwdDataLimitBytes_);
-    maxWorkSpace = std::max(maxWorkSpace, bwdFilterLimitBytes_);
-    workSpaceInBytes_ = maxWorkSpace;
-    VLOG(3) << getName() << " Fwd / BwdData / BwdFilter algo: " << fwdAlgo_
-            << " / " << bwdDataAlgo_ << " / " << bwdFilterAlgo_;
-  }
-  isSelectAlgo_ = true;
 }
 void ConvProjection::forward() {
@@ -179,7 +62,7 @@ void ConvProjection::forward() {
    real *inputData = in_->value->getData() + g * inputOffset_;
    real *wgtData = weight_->getW()->getData() + g * weightOffset_;
    real *outData = out_->value->getData() + g * outputOffset_;
-    hl_convolution_forward(inputDesc_,
+    hl_convolution_forward(imageDesc_,
                           inputData,
                           outputDesc_,
                           outData,
@@ -205,7 +88,7 @@ void ConvProjection::backward(const UpdateCallback &callback) {
    if (weight_->getWGrad()) {
      real *inputData = in_->value->getData() + g * inputOffset_;
      real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_;
-      hl_convolution_backward_filter(inputDesc_,
+      hl_convolution_backward_filter(imageDesc_,
                                     inputData,
                                     outputDesc_,
                                     outGrad,
@@ -221,7 +104,7 @@ void ConvProjection::backward(const UpdateCallback &callback) {
    if (NULL != preGrad) {
      real *inputGrad = preGrad->getData() + g * inputOffset_;
      real *wgtData = weight_->getW()->getData() + g * weightOffset_;
-      hl_convolution_backward_data(inputDesc_,
+      hl_convolution_backward_data(imageDesc_,
                                   inputGrad,
                                   outputDesc_,
                                   outGrad,
@@ -237,26 +120,4 @@ void ConvProjection::backward(const UpdateCallback &callback) {
  weight_->getParameterPtr()->incUpdate(callback);
 }
-void *ConvProjection::getSpaceBytes(size_t size) {
-  std::vector<MemoryHandle *> &convMem = *convMem_;
-  if (convMem.empty()) {
-    int numDevices = hl_get_device_count();
-    convMem.resize(numDevices);
-  }
-  int devId = hl_get_device();
-  MemoryHandle **localMem = &(convMem[devId]);
-  if (NULL == *localMem || size > (*localMem)->getAllocSize()) {
-    *localMem = new GpuMemoryHandle(size);
-  }
-  return (*localMem)->getBuf();
-}
-ConvProjection::~ConvProjection() {
-  hl_destroy_tensor_descriptor(inputDesc_);
-  hl_destroy_tensor_descriptor(outputDesc_);
-  hl_destroy_filter_descriptor(filterDesc_);
-  hl_destroy_convolution_descriptor(convDesc_);
-}
 }  // namespace paddle
--- a/paddle/gserver/layers/ConvProjection.h
+++ b/paddle/gserver/layers/ConvProjection.h
@@ -14,7 +14,7 @@ limitations under the License. */
 #pragma once
-#include "Projection.h"
+#include "ConvBaseProjection.h"
 #include "paddle/math/MathUtils.h"
 namespace paddle {
@@ -22,109 +22,22 @@ namespace paddle {
 /**
 * @brief Convolution projection do the same calculation with CudnnConvLayer.
 */
-class ConvProjection : public Projection {
+class ConvProjection : public ConvBaseProjection {
 public:
  /**
   * Constructor.
   */
  ConvProjection(const ProjectionConfig& config,
                 ParameterPtr parameter,
-                 bool useGpu);
+                 bool useGpu)
+      : ConvBaseProjection(config, parameter, useGpu) {}
-  ~ConvProjection();
+  ~ConvProjection() {}
  virtual void forward();
  virtual void backward(const UpdateCallback& callback);
+  virtual size_t calOutputSize();
-protected:
+  virtual size_t calInputSize();
-  void getConvParams();
-  void initCudnn();
-  void reshapeTensorDesc(int batchSize);
-  void reshape(int batchSize);
-  size_t calOutputSize() {
-    imageH_ = in_->getFrameHeight();
-    imageW_ = in_->getFrameWidth();
-    if (imageH_ == 0) imageH_ = configImgH_;
-    if (imageW_ == 0) imageW_ = configImgW_;
-    outputH_ = outputSize(imageH_,
-                          filterH_,
-                          paddingH_,
-                          strideH_,
-                          /* caffeMode */ true);
-    outputW_ = outputSize(imageW_,
-                          filterW_,
-                          paddingW_,
-                          strideW_,
-                          /* caffeMode */ true);
-    const_cast<Argument*>(out_)->setFrameHeight(outputH_);
-    const_cast<Argument*>(out_)->setFrameWidth(outputW_);
-    inputOffset_ = (channels_ / groups_) * imageH_ * imageW_;
-    outputOffset_ = (numFilters_ / groups_) * outputH_ * outputW_;
-    return outputH_ * outputW_ * numFilters_;
-  }
-  static void* getSpaceBytes(size_t size);
-  /// imageH_ and imageW_ is calculated from the input layer.
-  int imageH_, imageW_;
-  /// configImgH_ and configImgW_ is obtained from config.
-  int configImgH_, configImgW_;
-  int outputH_, outputW_;
-  int channels_, numFilters_;
-  int paddingH_, paddingW_;
-  int strideH_, strideW_;
-  int filterH_, filterW_;
-  /// One group offset of input data.
-  int inputOffset_;
-  /// One group offset of output data.
-  int outputOffset_;
-  /// One group offset of weight.
-  int weightOffset_;
-  int groups_;
-  /// Cudnn tensor descriptor for input.
-  hl_tensor_descriptor inputDesc_;
-  /// Cudnn tensor descriptor for output.
-  hl_tensor_descriptor outputDesc_;
-  /// Cudnn tensor descriptor for filter.
-  hl_filter_descriptor filterDesc_;
-  /// Cudnn tensor descriptor for a convolution operation.
-  hl_convolution_descriptor convDesc_;
-  /// Record the algorithm for forward convolution, which is obtained by cudnn
-  /// api to search the best suited algorithm.
-  int fwdAlgo_;
-  /// Record the algorithm for computing convolution gradient with respect to
-  /// filter coefficients.
-  int bwdFilterAlgo_;
-  /// Record the algorithm for computing convolution gradient with respect to
-  /// the output.
-  int bwdDataAlgo_;
-  /// Amount of GPU memory needed as workspace to be able to execute a
-  /// forward convolution with the specified algo.
-  size_t fwdLimitBytes_;
-  /// Amount of GPU memory needed as workspace to be able to execute a
-  /// backwardFilter with the specified algo.
-  size_t bwdDataLimitBytes_;
-  /// Amount of GPU memory needed as workspace to be able to execute a
-  /// backwardData with the specified algo.
-  size_t bwdFilterLimitBytes_;
-  /// Size of total work space.
-  size_t workSpaceInBytes_;
-  /// Whether to call cuDNN api to choose conv algorithm.
-  bool isSelectAlgo_;
-  /// batchNum is used to record batch size. If the batch size is changed,
-  /// the selection algorithm will be called.
-  int batchNum_;
-  bool bias_;
-  std::unique_ptr<Weight> weight_;
-  static ThreadLocalD<std::vector<MemoryHandle*>> convMem_;
 };
 }  // namespace paddle
--- a/paddle/gserver/layers/ConvTransOperator.cpp
+++ b/paddle/gserver/layers/ConvTransOperator.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "ConvTransOperator.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"
+namespace paddle {
+/**
+ * @brief ConvTransOperator takes two inputs to perform the convolution.
+ * The first input is the image, and the second input is the convolution kernel.
+ * The height of data for two inputs are the same. Each data of the first input
+ * is convolved with each data of the second input independently.
+ *
+ * The config file api is conv_operator.
+ */
+REGISTER_OPERATOR(convt, ConvTransOperator);
+/**
+ * Recompute the frame geometry, the per-sample offsets and the image
+ * descriptors from the current input.
+ *
+ * The frame size of the first input is stored as outputH_/outputW_, and the
+ * frame size written to out_ (imageH_/imageW_) is derived from it with
+ * imageSize().
+ *
+ * @param batchSize is not read anywhere in this function body.
+ */
+void ConvTransOperator::reshape(int batchSize) {
+  outputH_ = ins_[0]->getFrameHeight();
+  outputW_ = ins_[0]->getFrameWidth();
+  // Fall back to outputY_/outputX_ when the input reports a zero frame size.
+  if (outputH_ == 0) outputH_ = outputY_;
+  if (outputW_ == 0) outputW_ = outputX_;
+  imageH_ = imageSize(outputH_, filterSizeY_, paddingY_, strideY_, caffeMode_);
+  imageW_ = imageSize(outputW_, filterSize_, padding_, stride_, caffeMode_);
+  /// Check that the imageSizes are consistent with config
+  CHECK_EQ(imageH_, imgSizeY_);
+  CHECK_EQ(imageW_, imgSize_);
+  out_->setFrameHeight(imageH_);
+  out_->setFrameWidth(imageW_);
+  // Called after imageH_/imageW_ and outputH_/outputW_ have been refreshed.
+  reshapeImageDescriptors();
+  // Element strides between consecutive samples; forward() and backward()
+  // multiply them by the batch index.
+  inputOffset_ = numFilters_ * outputH_ * outputW_;
+  outputOffset_ = channels_ * imageH_ * imageW_;
+  weightOffset_ = numFilters_ * channels_ * filterSize_ * filterSizeY_;
+  // allocConvWorkSpace() runs only while isSelectAlgo_ is false; the flag is
+  // set right after, so later calls skip it unless it is reset elsewhere.
+  if (!isSelectAlgo_) {
+    allocConvWorkSpace();
+  }
+  isSelectAlgo_ = true;
+}
+/**
+ * Forward pass: for every sample, call hl_convolution_backward_data with the
+ * first input as the outputDesc_ tensor, the second input as the filter and
+ * out_ as the imageDesc_ tensor.
+ */
+void ConvTransOperator::forward() {
+  size_t batchSize = ins_[0]->value->getHeight();
+  reshape(batchSize);
+  // Both inputs must have the same number of rows (one filter set per sample).
+  CHECK_EQ(ins_[1]->value->getHeight(), batchSize);
+  checkFilterSize(ins_[1]->value);
+  Matrix::resizeOrCreate(
+      out_->value, batchSize, imageH_ * imageW_ * channels_, false, useGpu_);
+  {
+    // NOTE(review): AsyncGpuBlock presumably scopes asynchronous GPU
+    // execution for the loop below -- confirm against its definition.
+    AsyncGpuBlock block;
+    for (size_t batchId = 0; batchId < batchSize; ++batchId) {
+      // The offsets computed in reshape() select this sample's data.
+      real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId;
+      real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId;
+      real *outData = out_->value->getData() + outputOffset_ * batchId;
+      // Arguments are positional: the layer output is passed with
+      // imageDesc_ and the layer input with outputDesc_.
+      hl_convolution_backward_data(imageDesc_,
+                                   outData,
+                                   outputDesc_,
+                                   inputData,
+                                   filterDesc_,
+                                   wgtData,
+                                   convDesc_,
+                                   workSpace_,
+                                   workSpaceInBytes_,
+                                   bwdDataAlgo_);
+    }
+  }
+}
+/**
+ * Backward pass, one sample at a time:
+ *  - if the second input has a gradient matrix, call
+ *    hl_convolution_backward_filter to produce the filter gradient;
+ *  - if the first input has a gradient matrix, call hl_convolution_forward
+ *    with the output gradient as the imageDesc_ tensor to produce the input
+ *    gradient.
+ *
+ * Uses the offsets left behind by the last reshape() call.
+ */
+void ConvTransOperator::backward() {
+  size_t batchSize = ins_[0]->value->getHeight();
+  {
+    AsyncGpuBlock block;
+    for (size_t batchId = 0; batchId < batchSize; ++batchId) {
+      real *outGrad = out_->grad->getData() + outputOffset_ * batchId;
+      // Filter gradient, skipped when the filter input has no grad matrix.
+      if (ins_[1]->grad) {
+        real *inputData = ins_[0]->value->getData() + inputOffset_ * batchId;
+        real *weightGrad = ins_[1]->grad->getData() + weightOffset_ * batchId;
+        hl_convolution_backward_filter(imageDesc_,
+                                       outGrad,
+                                       outputDesc_,
+                                       inputData,
+                                       filterDesc_,
+                                       weightGrad,
+                                       convDesc_,
+                                       workSpace_,
+                                       workSpaceInBytes_,
+                                       bwdFilterAlgo_);
+      }
+      // Input gradient, skipped when the first input has no grad matrix.
+      MatrixPtr preGrad = ins_[0]->grad;
+      if (NULL != preGrad) {
+        real *inputGrad = preGrad->getData() + inputOffset_ * batchId;
+        real *wgtData = ins_[1]->value->getData() + weightOffset_ * batchId;
+        hl_convolution_forward(imageDesc_,
+                               outGrad,
+                               outputDesc_,
+                               inputGrad,
+                               filterDesc_,
+                               wgtData,
+                               convDesc_,
+                               workSpace_,
+                               workSpaceInBytes_,
+                               fwdAlgo_);
+      }
+    }
+  }
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/ConvTransOperator.h
+++ b/paddle/gserver/layers/ConvTransOperator.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "ConvBaseOperator.h"
+#include "paddle/math/MathUtils.h"
+#include "paddle/math/Matrix.h"
+namespace paddle {
+/**
+ * @brief ConvTransOperator takes two inputs to perform the convolution.
+ * The first input is the image, and the second input is the convolution kernel.
+ * The height of data for two inputs are the same. Each data of the first input
+ * is convolved with each data of the second input independently.
+ *
+ * The config file api is conv_operator.
+ */
+class ConvTransOperator : public ConvBaseOperator {
+public:
+  /// Forwards both arguments unchanged to ConvBaseOperator.
+  ConvTransOperator(const OperatorConfig &config, bool useGpu)
+      : ConvBaseOperator(config, useGpu) {}
+  /**
+   * Nothing is released here; the body is empty.
+   * NOTE(review): the workspace and cudnn descriptors are presumably freed
+   * by ConvBaseOperator -- confirm against its destructor.
+   */
+  virtual ~ConvTransOperator() {}
+  /// Compute the output from the two inputs, one sample at a time.
+  void forward() override;
+  /// Compute gradients for whichever inputs have a gradient matrix.
+  void backward() override;
+  /// Recompute frame sizes, per-sample offsets and image descriptors.
+  void reshape(int batchSize) override;
+};
+}  // namespace paddle
--- a/paddle/gserver/layers/ConvTransProjection.cpp
+++ b/paddle/gserver/layers/ConvTransProjection.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "ConvTransProjection.h"
+#include "paddle/utils/Stat.h"
+namespace paddle {
+REGISTER_PROJECTION(convt, ConvTransProjection);
+/**
+ * Derive the frame size written to out_ from the input frame size, cache the
+ * per-group input/output offsets, and return the output width
+ * (imageH_ * imageW_ * configNumFilters_).
+ */
+size_t ConvTransProjection::calOutputSize() {
+  // Height: take it from the input frame, or from the config when it is 0.
+  outputH_ = in_->getFrameHeight();
+  if (outputH_ == 0) outputH_ = configOutH_;
+  // Width: same fallback rule.
+  outputW_ = in_->getFrameWidth();
+  if (outputW_ == 0) outputW_ = configOutW_;
+  const bool caffeMode = true;
+  imageH_ = imageSize(outputH_, filterH_, paddingH_, strideH_, caffeMode);
+  imageW_ = imageSize(outputW_, filterW_, paddingW_, strideW_, caffeMode);
+  // out_ is held as const here, so the frame size is set through a cast.
+  Argument *writableOut = const_cast<Argument *>(out_);
+  writableOut->setFrameHeight(imageH_);
+  writableOut->setFrameWidth(imageW_);
+  // Offsets between consecutive groups in the input and output data.
+  const auto channelsPerGroup = configChannels_ / groups_;
+  const auto filtersPerGroup = configNumFilters_ / groups_;
+  inputOffset_ = channelsPerGroup * outputH_ * outputW_;
+  outputOffset_ = filtersPerGroup * imageH_ * imageW_;
+  return imageH_ * imageW_ * configNumFilters_;
+}
+/**
+ * Return configChannels_ * outputH_ * outputW_, using the outputH_/outputW_
+ * values currently stored on the projection.
+ */
+size_t ConvTransProjection::calInputSize() {
+  const auto inputElems = configChannels_ * outputH_ * outputW_;
+  return static_cast<size_t>(inputElems);
+}
+/**
+ * Forward pass: for every group, call hl_convolution_backward_data with the
+ * projection input as the outputDesc_ tensor and the projection output as
+ * the imageDesc_ tensor.
+ */
+void ConvTransProjection::forward() {
+  int batchSize = in_->value->getHeight();
+  reshape(batchSize);
+  // Only fetch a workspace buffer when a non-zero size is required.
+  void *workSpace = NULL;
+  if (workSpaceInBytes_ > 0) {
+    workSpace = getSpaceBytes(workSpaceInBytes_);
+  }
+  for (int g = 0; g < groups_; ++g) {
+    REGISTER_TIMER_INFO("CudnnConvTransFwTimer", getName().c_str());
+    // Each group reads and writes its own slice of input, weight and output.
+    real *inData = in_->value->getData() + g * inputOffset_;
+    real *wgtData = weight_->getW()->getData() + g * weightOffset_;
+    real *outData = out_->value->getData() + g * outputOffset_;
+    // NOTE(review): the buffer was requested with workSpaceInBytes_ but the
+    // call is given bwdDataLimitBytes_; assumes the latter is not larger --
+    // TODO confirm.
+    hl_convolution_backward_data(imageDesc_,
+                                 outData,
+                                 outputDesc_,
+                                 inData,
+                                 filterDesc_,
+                                 wgtData,
+                                 convDesc_,
+                                 workSpace,
+                                 bwdDataLimitBytes_,
+                                 bwdDataAlgo_);
+  }
+}
+/**
+ * Backward pass, one group at a time:
+ *  - if the weight has a gradient matrix, call
+ *    hl_convolution_backward_filter to produce the weight gradient;
+ *  - if the input has a gradient matrix, call hl_convolution_forward with
+ *    the output gradient as the imageDesc_ tensor to produce the input
+ *    gradient.
+ * Finally notifies the parameter through incUpdate(callback).
+ *
+ * @param callback passed unchanged to Parameter::incUpdate.
+ */
+void ConvTransProjection::backward(const UpdateCallback &callback) {
+  REGISTER_TIMER_INFO("CudnnConvTransBpTimer", getName().c_str());
+  // Only fetch a workspace buffer when a non-zero size is required.
+  void *workSpace = NULL;
+  if (workSpaceInBytes_ > 0) {
+    workSpace = getSpaceBytes(workSpaceInBytes_);
+  }
+  for (int g = 0; g < groups_; ++g) {
+    real *outGrad = out_->grad->getData() + g * outputOffset_;
+    // Weight gradient, skipped when the weight has no grad matrix.
+    if (weight_->getWGrad()) {
+      real *inData = in_->value->getData() + g * inputOffset_;
+      real *weightGrad = weight_->getWGrad()->getData() + g * weightOffset_;
+      hl_convolution_backward_filter(imageDesc_,
+                                     outGrad,
+                                     outputDesc_,
+                                     inData,
+                                     filterDesc_,
+                                     weightGrad,
+                                     convDesc_,
+                                     workSpace,
+                                     bwdFilterLimitBytes_,
+                                     bwdFilterAlgo_);
+    }
+    // Input gradient, skipped when the input has no grad matrix.
+    MatrixPtr preGrad = in_->grad;
+    if (NULL != preGrad) {
+      real *inGrad = preGrad->getData() + g * inputOffset_;
+      real *wgtData = weight_->getW()->getData() + g * weightOffset_;
+      hl_convolution_forward(imageDesc_,
+                             outGrad,
+                             outputDesc_,
+                             inGrad,
+                             filterDesc_,
+                             wgtData,
+                             convDesc_,
+                             workSpace,
+                             fwdLimitBytes_,
+                             fwdAlgo_);
+    }
+  }
+  weight_->getParameterPtr()->incUpdate(callback);
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/ConvTransProjection.h
+++ b/paddle/gserver/layers/ConvTransProjection.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "ConvBaseProjection.h"
+#include "paddle/math/MathUtils.h"
+namespace paddle {
+/**
+ * @brief Transposed-convolution projection built on ConvBaseProjection.
+ *
+ * It swaps the roles of the convolution routines: forward() calls
+ * hl_convolution_backward_data, and backward() calls
+ * hl_convolution_backward_filter and hl_convolution_forward.
+ */
+class ConvTransProjection : public ConvBaseProjection {
+public:
+  /**
+   * Constructor. All arguments are forwarded to ConvBaseProjection.
+   */
+  ConvTransProjection(const ProjectionConfig& config,
+                      ParameterPtr parameter,
+                      bool useGpu)
+      : ConvBaseProjection(config, parameter, useGpu) {}
+  ~ConvTransProjection() {}
+  /// Compute the output for every group.
+  virtual void forward();
+  /// Compute weight/input gradients, then call incUpdate(callback).
+  virtual void backward(const UpdateCallback& callback);
+  /// Update frame sizes and group offsets; returns the output width.
+  virtual size_t calOutputSize();
+  /// Returns configChannels_ * outputH_ * outputW_.
+  virtual size_t calInputSize();
+};
+}  // namespace paddle
--- a/paddle/gserver/layers/CostLayer.cpp
+++ b/paddle/gserver/layers/CostLayer.cpp
@@ -192,6 +192,59 @@ void SumOfSquaresCostLayer::backwardImp(Matrix& output,
  outputG.sumOfSquaresBp(output, *label.value);
 }
+//
+// class SmoothL1CostLayer
+//
+REGISTER_LAYER(smooth_l1, SmoothL1CostLayer);
+/// No layer-specific setup: the result of CostLayer::init is returned as is.
+bool SmoothL1CostLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  const bool baseInitialized = CostLayer::init(layerMap, parameterMap);
+  return baseInitialized;
+}
+/**
+ * Evaluate the smooth L1 cost into target.
+ *
+ * Without useGpu_ the cost is computed directly on the given matrices.
+ * With useGpu_ all three operands are copied into matrices created with
+ * useGpu = false, the cost is computed there, and the result is copied back
+ * into target.
+ */
+void SmoothL1CostLayer::forwardImp(Matrix& output,
+                                   Argument& label,
+                                   Matrix& target) {
+  if (!useGpu_) {
+    target.smoothL1(output, *label.value);
+    return;
+  }
+  // Stage the cost, the prediction and the label in non-GPU matrices.
+  MatrixPtr hostTarget =
+      Matrix::create(target.getHeight(), target.getWidth(), false, false);
+  MatrixPtr hostOutput =
+      Matrix::create(output.getHeight(), output.getWidth(), false, false);
+  MatrixPtr hostLabel = Matrix::create(
+      label.value->getHeight(), label.value->getWidth(), false, false);
+  hostTarget->copyFrom(target);
+  hostOutput->copyFrom(output);
+  hostLabel->copyFrom(*label.value);
+  // Compute on the staged copies, then publish the cost.
+  hostTarget->smoothL1(*hostOutput, *hostLabel);
+  target.copyFrom(*hostTarget);
+}
+/**
+ * Evaluate the smooth L1 gradient into outputG.
+ *
+ * Without useGpu_ the gradient is computed directly on the given matrices.
+ * With useGpu_ all three operands are copied into matrices created with
+ * useGpu = false, the gradient is computed there, and the result is copied
+ * back into outputG.
+ */
+void SmoothL1CostLayer::backwardImp(Matrix& output,
+                                    Argument& label,
+                                    Matrix& outputG) {
+  if (!useGpu_) {
+    outputG.smoothL1Bp(output, *label.value);
+    return;
+  }
+  // Stage the gradient, the prediction and the label in non-GPU matrices.
+  MatrixPtr hostGrad =
+      Matrix::create(outputG.getHeight(), outputG.getWidth(), false, false);
+  MatrixPtr hostOutput =
+      Matrix::create(output.getHeight(), output.getWidth(), false, false);
+  MatrixPtr hostLabel = Matrix::create(
+      label.value->getHeight(), label.value->getWidth(), false, false);
+  hostGrad->copyFrom(outputG);
+  hostOutput->copyFrom(output);
+  hostLabel->copyFrom(*label.value);
+  // Compute on the staged copies, then publish the gradient.
+  hostGrad->smoothL1Bp(*hostOutput, *hostLabel);
+  outputG.copyFrom(*hostGrad);
+}
 //
 // class RankingCost
 //

--- a/paddle/gserver/layers/CostLayer.h
+++ b/paddle/gserver/layers/CostLayer.h
@@ -159,6 +159,29 @@ public:
                   Matrix& outputGrad) override;
 };
+/**
+ * This cost layer computes the smooth L1 loss for real-valued regression
+ * tasks.
+ * \f[
+ * L = \begin{cases}
+ *   0.5 \, (output - label)^2, & |output - label| < 1 \\
+ *   |output - label| - 0.5,    & \text{otherwise}
+ * \end{cases}
+ * \f]
+ * NOTE(review): the absolute value in the second branch follows the usual
+ * smooth L1 definition; confirm against Matrix::smoothL1.
+ */
+class SmoothL1CostLayer : public CostLayer {
+public:
+  explicit SmoothL1CostLayer(const LayerConfig& config) : CostLayer(config) {}
+  /// Delegates to CostLayer::init.
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  /// Compute the cost of output against label into cost.
+  void forwardImp(Matrix& output, Argument& label, Matrix& cost) override;
+  /// Compute the gradient of the cost into outputGrad.
+  void backwardImp(Matrix& outputValue,
+                   Argument& label,
+                   Matrix& outputGrad) override;
+};
 /**
 * A cost layer for learning to rank (LTR) task. This layer contains at leat
 * three inputs.

--- a/paddle/gserver/layers/CrossChannelNormLayer.cpp
+++ b/paddle/gserver/layers/CrossChannelNormLayer.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "Layer.h"
+#include "NormLayer.h"
+#include "paddle/math/BaseMatrix.h"
+#include "paddle/math/Matrix.h"
+namespace paddle {
+/**
+ * Build a channels_ x spatialDim matrix over the data of sample `iter`.
+ * Sample `iter` begins iter * channels_ * spatialDim elements into `data`.
+ */
+MatrixPtr CrossChannelNormLayer::createSampleMatrix(MatrixPtr data,
+                                                    size_t iter,
+                                                    size_t spatialDim) {
+  real* sampleBegin = data->getData() + iter * channels_ * spatialDim;
+  return Matrix::create(sampleBegin, channels_, spatialDim, false, useGpu_);
+}
+/**
+ * Build a 1 x spatialDim matrix over row `iter` of `data`.
+ * Row `iter` begins iter * spatialDim elements into `data`.
+ */
+MatrixPtr CrossChannelNormLayer::createSpatialMatrix(MatrixPtr data,
+                                                     size_t iter,
+                                                     size_t spatialDim) {
+  real* rowBegin = data->getData() + iter * spatialDim;
+  return Matrix::create(rowBegin, 1, spatialDim, false, useGpu_);
+}
+/**
+ * Forward pass: for every sample, divide each spatial position by its norm
+ * taken across channels, then multiply every channel by its scale weight.
+ *
+ * The per-position norms are kept in normBuffer_ (one row per sample) for
+ * use in backward().
+ */
+void CrossChannelNormLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  MatrixPtr inV = getInputValue(0);
+  size_t batchSize = inV->getHeight();
+  size_t dataDim = inV->getWidth();
+  CHECK_EQ(getSize(), dataDim);
+  reserveOutput(batchSize, dataDim);
+  MatrixPtr outV = getOutputValue();
+  size_t spatialDim = dataDim / channels_;
+  Matrix::resizeOrCreate(dataBuffer_, batchSize, dataDim, false, useGpu_);
+  Matrix::resizeOrCreate(spatialBuffer_, 1, spatialDim, false, useGpu_);
+  Matrix::resizeOrCreate(normBuffer_, batchSize, spatialDim, false, useGpu_);
+  // normBuffer_ needs no initialisation: every row is overwritten by
+  // copyFrom() in the loop below.
+  // NOTE(review): presumably stores the element-wise square of inV in
+  // dataBuffer_ -- confirm against BaseMatrix::square2.
+  inV->square2(*dataBuffer_);
+  for (size_t i = 0; i < batchSize; i++) {
+    const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim);
+    const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim);
+    MatrixPtr outVTmp = createSampleMatrix(outV, i, spatialDim);
+    MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim);
+    // compute norm.
+    spatialBuffer_->sumCols(*dataTmp, 1, 0);
+    // Add eps to the sum of squares before the square root, so a position
+    // whose channels are all zero cannot produce a zero norm and a division
+    // by zero below (and in backward()).
+    spatialBuffer_->addScalar(*spatialBuffer_, 1e-6);
+    spatialBuffer_->sqrt2(*spatialBuffer_);
+    normTmp->copyFrom(*spatialBuffer_);
+    outVTmp->copyFrom(*inVTmp);
+    outVTmp->divRowVector(*spatialBuffer_);
+    // scale the layer.
+    outVTmp->mulColVector(*scale_->getW());
+  }
+}
+/**
+ * Backward pass: compute the input gradient sample by sample from the
+ * output gradient, the input value and the norms saved by forward(), and
+ * accumulate the scale gradient over the batch.
+ *
+ * Relies on dataBuffer_, spatialBuffer_ and normBuffer_ having been sized
+ * by the preceding forward() call.
+ *
+ * @param callback passed unchanged to Parameter::incUpdate.
+ */
+void CrossChannelNormLayer::backward(const UpdateCallback& callback) {
+  MatrixPtr inG = getInputGrad(0);
+  MatrixPtr inV = getInputValue(0);
+  MatrixPtr outG = getOutputGrad();
+  MatrixPtr outV = getOutputValue();
+  size_t batchSize = inG->getHeight();
+  size_t dataDim = inG->getWidth();
+  size_t spatialDim = dataDim / channels_;
+  dataBuffer_->dotMul(*outG, *outV);
+  Matrix::resizeOrCreate(scaleDiff_, channels_, 1, false, useGpu_);
+  Matrix::resizeOrCreate(channelBuffer_, channels_, 1, false, useGpu_);
+  Matrix::resizeOrCreate(sampleBuffer_, channels_, spatialDim, false, useGpu_);
+  scaleDiff_->zeroMem();
+  for (size_t i = 0; i < batchSize; i++) {
+    MatrixPtr outGTmp = createSampleMatrix(outG, i, spatialDim);
+    const MatrixPtr dataTmp = createSampleMatrix(dataBuffer_, i, spatialDim);
+    const MatrixPtr inVTmp = createSampleMatrix(inV, i, spatialDim);
+    const MatrixPtr inGTmp = createSampleMatrix(inG, i, spatialDim);
+    const MatrixPtr normTmp = createSpatialMatrix(normBuffer_, i, spatialDim);
+    channelBuffer_->sumRows(*dataTmp, 1, 0);
+    channelBuffer_->dotDiv(*channelBuffer_, *(scale_->getW()));
+    // store a / scale[i] in scaleDiff_ temporary
+    scaleDiff_->add(*channelBuffer_, 1.);
+    sampleBuffer_->dotMul(*inVTmp, *outGTmp);
+    // Overwrite (scaleDest = 0) instead of accumulating: spatialBuffer_
+    // still holds the previous sample's squared norm, or the norm left by
+    // forward(), and must not leak into this sample's sum.
+    spatialBuffer_->sumCols(*sampleBuffer_, 1., 0.);
+    // scale the grad
+    inGTmp->copyFrom(*inVTmp);
+    inGTmp->mulRowVector(*spatialBuffer_);
+    // divide by square of norm
+    spatialBuffer_->dotMul(*normTmp, *normTmp);
+    inGTmp->divRowVector(*spatialBuffer_);
+    // subtract
+    inGTmp->add(*outGTmp, -1, 1);
+    // divide by norm
+    inGTmp->divRowVector(*normTmp);
+    // scale the diff
+    inGTmp->mulColVector(*scale_->getW());
+  }
+  // update scale
+  if (scale_->getWGrad()) scale_->getWGrad()->copyFrom(*scaleDiff_);
+  scale_->getParameterPtr()->incUpdate(callback);
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/CudnnConvLayer.cpp
+++ b/paddle/gserver/layers/CudnnConvLayer.cpp
@@ -12,16 +12,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "CudnnConvLayer.h"
+#include "CudnnConvBaseLayer.h"
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 namespace paddle {
+REGISTER_LAYER(cudnn_conv, CudnnConvBaseLayer);
+REGISTER_LAYER(cudnn_convt, CudnnConvBaseLayer);
-REGISTER_LAYER(cudnn_conv, CudnnConvLayer);
+bool CudnnConvBaseLayer::init(const LayerMap &layerMap,
+                              const ParameterMap &parameterMap) {
-bool CudnnConvLayer::init(const LayerMap &layerMap,
-                          const ParameterMap &parameterMap) {
  if (!ConvBaseLayer::init(layerMap, parameterMap)) return false;
  CHECK(useGpu_) << "CudnnConvLayer only support gpu";
@@ -33,7 +33,11 @@ bool CudnnConvLayer::init(const LayerMap &layerMap,
  CHECK(config_.shared_biases());
  for (size_t i = 0; i < inputLayers_.size(); i++) {
    ProjectionConfig *conf = new ProjectionConfig();
-    conf->set_type("conv");
+    if (isDeconv_) {
+      conf->set_type("convt");
+    } else {
+      conf->set_type("conv");
+    }
    conf->set_num_filters(numFilters_);
    ConvConfig *convConf = conf->mutable_conv_conf();
    *convConf = *(config_.mutable_inputs(i)->mutable_conv_conf());
@@ -47,14 +51,13 @@ bool CudnnConvLayer::init(const LayerMap &layerMap,
  if (biases_.get() && sharedBiases_) {
    hl_create_tensor_descriptor(&biasDesc_);
    hl_create_tensor_descriptor(&outputDesc_);
-    hl_tensor_reshape(biasDesc_, 1, numFilters_ / groups_[0], 1, 1);
+    hl_tensor_reshape(biasDesc_, 1, numFilters_, 1, 1);
-    biasOffset_ = numFilters_ / groups_[0];
  }
  return true;
 }
-void CudnnConvLayer::forward(PassType passType) {
+void CudnnConvBaseLayer::forward(PassType passType) {
  Layer::forward(passType);
  int batchSize = getInput(0).getBatchSize();
@@ -67,37 +70,41 @@ void CudnnConvLayer::forward(PassType passType) {
  if (biases_) {
    REGISTER_TIMER_INFO("CudnnConvBiasTimer", getName().c_str());
    int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
+    int outH, outW;
+    if (isDeconv_) {
+      outH = imgSizeH_[0];
+      outW = imgSizeW_[0];
+    } else {
+      outH = outputH_[0];
+      outW = outputW_[0];
+    }
    hl_tensor_reshape(outputDesc_,
                      batchSize,
-                      numFilters_ / groups_[0],
+                      numFilters_,
-                      outputH_[0],
+                      outH,
-                      outputW_[0],
+                      outW,
-                      numFilters_ * outputH_[0] * outputW_[0],
+                      numFilters_ * outH * outW,
-                      outputH_[0] * outputW_[0],
+                      outH * outW,
-                      outputW_[0],
+                      outW,
                      1);
-    outputOffset_ = getOutputValue()->getWidth() / groups_[0];
+    real *outData = getOutputValue()->getData();
-    for (int g = 0; g < groups_[0]; ++g) {
+    real *biasData = biases_->getW()->getData();
-      real *biasData = biases_->getW()->getData() + biasOffset_ * g;
+    hl_convolution_forward_add_bias(biasDesc_, biasData, outputDesc_, outData);
-      real *outData = getOutputValue()->getData() + outputOffset_ * g;
-      hl_convolution_forward_add_bias(
-          biasDesc_, biasData, outputDesc_, outData);
-    }
  }
  forwardActivation();
 }
-void CudnnConvLayer::backward(const UpdateCallback &callback) {
+void CudnnConvBaseLayer::backward(const UpdateCallback &callback) {
  backwardActivation();
  if (biases_ && biases_->getWGrad()) {
    REGISTER_TIMER_INFO("CudnnConvBpBiasTimer", getName().c_str());
-    for (int g = 0; g < groups_[0]; ++g) {
+    real *biasGrad = biases_->getWGrad()->getData();
-      real *biasGrad = biases_->getWGrad()->getData() + biasOffset_ * g;
+    real *outGrad = getOutputGrad()->getData();
-      real *outGrad = getOutputGrad()->getData() + outputOffset_ * g;
+    hl_convolution_backward_bias(biasDesc_, biasGrad, outputDesc_, outGrad);
-      hl_convolution_backward_bias(biasDesc_, biasGrad, outputDesc_, outGrad);
-    }
    biases_->getParameterPtr()->incUpdate(callback);
  }
@@ -106,7 +113,7 @@ void CudnnConvLayer::backward(const UpdateCallback &callback) {
  }
 }
-CudnnConvLayer::~CudnnConvLayer() {
+CudnnConvBaseLayer::~CudnnConvBaseLayer() {
  if (biases_) {
    hl_destroy_tensor_descriptor(biasDesc_);
    hl_destroy_tensor_descriptor(outputDesc_);

--- a/paddle/gserver/layers/CudnnConvLayer.h
+++ b/paddle/gserver/layers/CudnnConvLayer.h
@@ -30,27 +30,24 @@ namespace paddle {
 *
 * The config file api is img_conv_layer.
 */
-class CudnnConvLayer : public ConvBaseLayer {
+class CudnnConvBaseLayer : public ConvBaseLayer {
 protected:
  std::vector<std::unique_ptr<ProjectionConfig>> projConf_;
  std::vector<std::unique_ptr<Projection>> projections_;
  hl_tensor_descriptor biasDesc_;
  hl_tensor_descriptor outputDesc_;
-  int biasOffset_;
-  int outputOffset_;
 public:
-  explicit CudnnConvLayer(const LayerConfig& config) : ConvBaseLayer(config) {}
+  explicit CudnnConvBaseLayer(const LayerConfig& config)
+      : ConvBaseLayer(config) {}
-  ~CudnnConvLayer();
+  ~CudnnConvBaseLayer();
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback) override;
  bool init(const LayerMap& layerMap,
            const ParameterMap& parameterMap) override;
-  void forward(PassType passType) override;
-  void backward(const UpdateCallback& callback) override;
-  void addBiases();
-  void bpropBiases();
 };
 }  // namespace paddle
--- a/paddle/gserver/layers/NormLayer.cpp
+++ b/paddle/gserver/layers/NormLayer.cpp
@@ -26,6 +26,8 @@ Layer* NormLayer::create(const LayerConfig& config) {
    return new ResponseNormLayer(config);
  } else if (norm == "cmrnorm-projection") {
    return new CMRProjectionNormLayer(config);
+  } else if (norm == "cross-channel-norm") {
+    return new CrossChannelNormLayer(config);
  } else {
    LOG(FATAL) << "Unknown norm type: " << norm;
    return nullptr;
@@ -54,4 +56,14 @@ bool ResponseNormLayer::init(const LayerMap& layerMap,
  return true;
 }
+/**
+ * Initialise the layer: run the base Layer initialisation, read the channel
+ * count from the first input's norm config, and create the per-channel
+ * scale weight (channels_ x 1) from the first parameter.
+ *
+ * @return false if Layer::init fails, true otherwise.
+ */
+bool CrossChannelNormLayer::init(const LayerMap& layerMap,
+                                 const ParameterMap& parameterMap) {
+  // Propagate a base-class failure instead of continuing on a layer that
+  // was not set up.
+  if (!Layer::init(layerMap, parameterMap)) return false;
+  CHECK(parameters_[0]);
+  const NormConfig& conf = config_.inputs(0).norm_conf();
+  channels_ = conf.channels();
+  scale_.reset(new Weight(channels_, 1, parameters_[0]));
+  return true;
+}
 }  // namespace paddle
--- a/paddle/gserver/layers/NormLayer.h
+++ b/paddle/gserver/layers/NormLayer.h
@@ -65,4 +65,35 @@ public:
  }
 };
+/**
+ * This layer applies normalization across the channels of each sample to a
+ * conv layer's output, and scales the output by a group of trainable factors
+ * whose dimensions equal to the number of channels.
+ * - Input: One and only one input layer are accepted.
+ * - Output: The normalized data of the input data.
+ * Reference:
+ *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
+ *    Cheng-Yang Fu, Alexander C. Berg. SSD: Single Shot MultiBox Detector
+ */
+class CrossChannelNormLayer : public NormLayer {
+public:
+  explicit CrossChannelNormLayer(const LayerConfig& config)
+      : NormLayer(config) {}
+  /// Read the channel count and create the per-channel scale weight.
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+  /// Normalize every sample across channels and apply the scale.
+  void forward(PassType passType) override;
+  /// Compute the input gradient and the scale gradient.
+  void backward(const UpdateCallback& callback) override;
+  /// Build a channels_ x spatialDim matrix over sample `iter` of `data`.
+  MatrixPtr createSampleMatrix(MatrixPtr data, size_t iter, size_t spatialDim);
+  /// Build a 1 x spatialDim matrix over row `iter` of `data`.
+  MatrixPtr createSpatialMatrix(MatrixPtr data, size_t iter, size_t spatialDim);
+protected:
+  /// Number of channels, read from the norm config in init().
+  size_t channels_;
+  /// Trainable per-channel scale (channels_ x 1).
+  std::unique_ptr<Weight> scale_;
+  /// Scale gradient accumulated over the batch in backward().
+  MatrixPtr scaleDiff_;
+  /// Per-sample, per-position norms saved by forward() for backward().
+  MatrixPtr normBuffer_;
+  /// Scratch buffer with the same shape as the input (batch x dataDim).
+  MatrixPtr dataBuffer_;
+  /// Scratch buffer, channels_ x 1.
+  MatrixPtr channelBuffer_;
+  /// Scratch buffer, 1 x spatialDim.
+  MatrixPtr spatialBuffer_;
+  /// Scratch buffer, channels_ x spatialDim.
+  MatrixPtr sampleBuffer_;
+};
 }  // namespace paddle
--- a/paddle/gserver/layers/PriorBox.cpp
+++ b/paddle/gserver/layers/PriorBox.cpp
@@ -20,7 +20,7 @@ namespace paddle {
 /**
 * @brief A layer for generating priorbox locations and variances.
 * - Input: Two and only two input layer are accepted. The input layer must be
- *        be a data output layer and a convolution output layer.
+ *          a data output layer and a convolution output layer.
 * - Output: The priorbox locations and variances of the input data.
 * Reference:
 *    Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, Scott Reed,
@@ -45,27 +45,32 @@ protected:
  MatrixPtr buffer_;
 };
+REGISTER_LAYER(priorbox, PriorBoxLayer);
 bool PriorBoxLayer::init(const LayerMap& layerMap,
                         const ParameterMap& parameterMap) {
  Layer::init(layerMap, parameterMap);
  auto pbConf = config_.inputs(0).priorbox_conf();
+  std::vector<real> tmp;
+  aspectRatio_.push_back(1.);
  std::copy(pbConf.min_size().begin(),
            pbConf.min_size().end(),
            std::back_inserter(minSize_));
  std::copy(pbConf.max_size().begin(),
            pbConf.max_size().end(),
            std::back_inserter(maxSize_));
-  std::copy(pbConf.aspect_ratio().begin(),
-            pbConf.aspect_ratio().end(),
-            std::back_inserter(aspectRatio_));
  std::copy(pbConf.variance().begin(),
            pbConf.variance().end(),
            std::back_inserter(variance_));
+  std::copy(pbConf.aspect_ratio().begin(),
+            pbConf.aspect_ratio().end(),
+            std::back_inserter(tmp));
  // flip
-  int inputRatioLength = aspectRatio_.size();
+  int inputRatioLength = tmp.size();
-  for (int index = 0; index < inputRatioLength; index++)
+  for (int index = 0; index < inputRatioLength; index++) {
-    aspectRatio_.push_back(1 / aspectRatio_[index]);
+    aspectRatio_.push_back(tmp[index]);
-  aspectRatio_.push_back(1.);
+    aspectRatio_.push_back(1 / tmp[index]);
+  }
  numPriors_ = aspectRatio_.size();
  if (maxSize_.size() > 0) numPriors_++;
  return true;
@@ -94,12 +99,12 @@ void PriorBoxLayer::forward(PassType passType) {
    for (int w = 0; w < layerWidth; ++w) {
      real centerX = (w + 0.5) * stepW;
      real centerY = (h + 0.5) * stepH;
-      int minSize = 0;
+      real minSize = 0;
      for (size_t s = 0; s < minSize_.size(); s++) {
        // first prior.
        minSize = minSize_[s];
-        int boxWidth = minSize;
+        real boxWidth = minSize;
-        int boxHeight = minSize;
+        real boxHeight = minSize;
        // xmin, ymin, xmax, ymax.
        tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
        tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
@@ -112,7 +117,7 @@ void PriorBoxLayer::forward(PassType passType) {
          CHECK_EQ(minSize_.size(), maxSize_.size());
          // second prior.
          for (size_t s = 0; s < maxSize_.size(); s++) {
-            int maxSize = maxSize_[s];
+            real maxSize = maxSize_[s];
            boxWidth = boxHeight = sqrt(minSize * maxSize);
            tmpPtr[idx++] = (centerX - boxWidth / 2.) / imageWidth;
            tmpPtr[idx++] = (centerY - boxHeight / 2.) / imageHeight;
@@ -145,6 +150,5 @@ void PriorBoxLayer::forward(PassType passType) {
  MatrixPtr outV = getOutputValue();
  outV->copyFrom(buffer_->data_, dim * 2);
 }
-REGISTER_LAYER(priorbox, PriorBoxLayer);
 }  // namespace paddle
--- a/paddle/gserver/layers/SequencePoolLayer.cpp
+++ b/paddle/gserver/layers/SequencePoolLayer.cpp
@@ -56,17 +56,16 @@ void SequencePoolLayer::forward(PassType passType) {
  CHECK_EQ(newBatchSize_, starts->getSize() - 1);
  resetOutput(newBatchSize_, dim);
-  if (type_) {
-    CHECK(input.subSequenceStartPositions)
-        << "when trans_type = seq, input must hasSubseq";
-  }
  /* If type_ = kNonSeq, both seq has or not has sub-seq degrade to a non-seq,
   * thus, in this case, output_ has no sequenceStartPositions.
   * If type_ = kSeq, seq has sub-seq degrades to a seq, thus, only in this
   * case, we should compute the new sequenceStartPositions.
  */
  if (type_) {
-    output_.degradeSequence(input, useGpu_);
+    CHECK(input.subSequenceStartPositions)
+        << "when trans_type = seq, input must hasSubseq";
+    output_.degradeSequence(input);
  }
 }

--- a/paddle/gserver/tests/test_ConvUnify.cpp
+++ b/paddle/gserver/tests/test_ConvUnify.cpp
@@ -34,8 +34,7 @@ DECLARE_double(checkgrad_eps);
 DECLARE_bool(thread_local_rand_use_global_seed);
 DECLARE_bool(prev_batch_state);
-// Do one forward pass of convTrans layer and check to see if its output
+// Do one forward pass of ConvLayer using either exconv or cudnn_conv
-// matches the given result
 MatrixPtr doOneConvTest(size_t imgSize,
                        size_t output_x,
                        size_t stride,
@@ -46,22 +45,35 @@ MatrixPtr doOneConvTest(size_t imgSize,
                        size_t groups,
                        MatrixPtr& inputData,
                        real* param,
-                        bool useGpu) {
+                        bool useGpu,
+                        bool isDeconv = false) {
  TestConfig config;
  config.biasSize = numfilters;
+  string layerType;
  if (useGpu) {
-    config.layerConfig.set_type("cudnn_conv");
+    layerType = (isDeconv) ? "cudnn_convt" : "cudnn_conv";
  } else {
-    config.layerConfig.set_type("exconv");
+    layerType = (isDeconv) ? "exconvt" : "exconv";
  }
+  config.layerConfig.set_type(layerType);
  config.layerConfig.set_num_filters(numfilters);
  config.layerConfig.set_partial_sum(1);
  config.layerConfig.set_shared_biases(true);
  size_t weightSize = channel * filter_size * filter_size *
                      config.layerConfig.num_filters() / groups;
-  config.inputDefs.push_back(
+  if (isDeconv) {
-      {INPUT_DATA, "layer_0", imgSize * imgSize * channel, weightSize});
+    config.inputDefs.push_back(
+        {INPUT_DATA, "layer_0", output_x * output_x * channel, weightSize});
+    config.layerConfig.set_size(imgSize * imgSize *
+                                config.layerConfig.num_filters());
+  } else {
+    config.inputDefs.push_back(
+        {INPUT_DATA, "layer_0", imgSize * imgSize * channel, weightSize});
+    config.layerConfig.set_size(output_x * output_x *
+                                config.layerConfig.num_filters());
+  }
  LayerInputConfig* input = config.layerConfig.add_inputs();
  ConvConfig* conv = input->mutable_conv_conf();
  conv->set_filter_size(filter_size);
@@ -72,12 +84,15 @@ MatrixPtr doOneConvTest(size_t imgSize,
  conv->set_stride(stride);
  conv->set_stride_y(stride);
  conv->set_groups(groups);
-  conv->set_filter_channels(channel / groups);
  conv->set_img_size(imgSize);
  conv->set_output_x(output_x);
-  config.layerConfig.set_size(conv->output_x() * conv->output_x() *
+  if (isDeconv) {
-                              config.layerConfig.num_filters());
+    conv->set_filter_channels(numfilters / groups);
+  } else {
+    conv->set_filter_channels(channel / groups);
+  }
  config.layerConfig.set_name("conv");
  std::vector<DataLayerPtr> dataLayers;
@@ -105,6 +120,8 @@ MatrixPtr doOneConvTest(size_t imgSize,
 TEST(Layer, convParaUnified) {
 #ifndef PADDLE_ONLY_CPU
  MatrixPtr input, resultCpu, resultGpu;
+  /// TEST1 for conv ///
  input = Matrix::create(1, 4 * 4, false, false);
  real inputData[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16};
  real param[] = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 7, 6, 5, 4, 3, 2, 1};
@@ -121,7 +138,7 @@ TEST(Layer, convParaUnified) {
                            /*groups*/ 1,
                            input,
                            param,
-                            false);
+                            /*useGpu*/ false);
  resultGpu = doOneConvTest(/* imgSize */ 4,
                            /* output_x */ 2,
@@ -133,9 +150,42 @@ TEST(Layer, convParaUnified) {
                            /*groups*/ 1,
                            input,
                            param,
-                            true);
+                            /*useGpu*/ true);
  checkMatrixEqual(resultCpu, resultGpu);
+  /// TEST1 for deconv ///
+  input = Matrix::create(1, 2 * 2, false, false);
+  real inputDataT[] = {1, 2, 3, 4};
+  input->setData(inputDataT);
+  resultCpu = doOneConvTest(/* imgSize */ 4,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 3,
+                            /*channel*/ 1,
+                            /*numfilters*/ 2,
+                            /*groups*/ 1,
+                            input,
+                            param,
+                            /*useGpu*/ false,
+                            /*isDeconv*/ true);
+  resultGpu = doOneConvTest(/* imgSize */ 4,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 3,
+                            /*channel*/ 1,
+                            /*numfilters*/ 2,
+                            /*groups*/ 1,
+                            input,
+                            param,
+                            /*useGpu*/ true,
+                            /*isDeconv*/ true);
+  checkMatrixEqual(resultCpu, resultGpu);
+  /// TEST2 for conv ///
  input = Matrix::create(1, 3 * 3 * 2, false, false);
  real inputData2[] = {
      1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18};
@@ -153,7 +203,7 @@ TEST(Layer, convParaUnified) {
                            /*groups*/ 1,
                            input,
                            param2,
-                            false);
+                            /*useGpu*/ false);
  resultGpu = doOneConvTest(/* imgSize */ 3,
                            /* output_x */ 2,
@@ -165,9 +215,10 @@ TEST(Layer, convParaUnified) {
                            /*groups*/ 1,
                            input,
                            param2,
-                            true);
+                            /*useGpu*/ true);
  checkMatrixEqual(resultCpu, resultGpu);
+  /// TEST3 for conv ///
  real param3[] = {1, 2, 3, 4, 4, 3, 2, 1};
  resultCpu = doOneConvTest(/* imgSize */ 3,
@@ -180,7 +231,66 @@ TEST(Layer, convParaUnified) {
                            /*groups*/ 2,
                            input,
                            param3,
-                            false);
+                            /*useGpu*/ false);
+  resultGpu = doOneConvTest(/* imgSize */ 3,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 2,
+                            /*channel*/ 2,
+                            /*numfilters*/ 2,
+                            /*groups*/ 2,
+                            input,
+                            param3,
+                            /*useGpu*/ true);
+  checkMatrixEqual(resultCpu, resultGpu);
+  /// TEST2 for deconv ///
+  input = Matrix::create(1, 2 * 2 * 2, false, false);
+  real inputData2T[] = {1, 2, 3, 4, 5, 6, 7, 8};
+  input->setData(inputData2T);
+  resultCpu = doOneConvTest(/* imgSize */ 3,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 2,
+                            /*channel*/ 2,
+                            /*numfilters*/ 2,
+                            /*groups*/ 1,
+                            input,
+                            param2,
+                            /*useGpu*/ false,
+                            /*isDeconv*/ true);
+  resultGpu = doOneConvTest(/* imgSize */ 3,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 2,
+                            /*channel*/ 2,
+                            /*numfilters*/ 2,
+                            /*groups*/ 1,
+                            input,
+                            param2,
+                            /*useGpu*/ true,
+                            /*isDeconv*/ true);
+  checkMatrixEqual(resultCpu, resultGpu);
+  /// TEST3 for deconv ///
+  resultCpu = doOneConvTest(/* imgSize */ 3,
+                            /* output_x */ 2,
+                            /* stride */ 1,
+                            /* padding */ 0,
+                            /* filter_size */ 2,
+                            /*channel*/ 2,
+                            /*numfilters*/ 2,
+                            /*groups*/ 2,
+                            input,
+                            param3,
+                            /*useGpu*/ false,
+                            /*isDeconv*/ true);
  resultGpu = doOneConvTest(/* imgSize */ 3,
                            /* output_x */ 2,
@@ -192,7 +302,8 @@ TEST(Layer, convParaUnified) {
                            /*groups*/ 2,
                            input,
                            param3,
-                            true);
+                            /*useGpu*/ true,
+                            /*isDeconv*/ true);
  checkMatrixEqual(resultCpu, resultGpu);
 #endif
 }

--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -166,15 +166,19 @@ TEST(Projection, scaling) {
  }
 }
-void testProjectionConv(size_t groups) {
+void testProjectionConv(size_t groups, bool isDeconv) {
  const int NUM_FILTERS = 18;
  const int FILTER_SIZE = 2;
-  const int FILTER_SIZE_Y = 3;
+  const int FILTER_SIZE_Y = 4;
  const int CHANNELS = 3;
  const int IMAGE_SIZE = 16;
  ProjectionConfig conf;
-  conf.set_type("conv");
+  if (isDeconv) {
+    conf.set_type("convt");
+  } else {
+    conf.set_type("conv");
+  }
  conf.set_num_filters(NUM_FILTERS);
  ConvConfig* conv = conf.mutable_conv_conf();
@@ -186,7 +190,11 @@ void testProjectionConv(size_t groups) {
  conv->set_stride(2);
  conv->set_stride_y(2);
  conv->set_groups(groups);
-  conv->set_filter_channels(conv->channels() / conv->groups());
+  if (isDeconv) {
+    conv->set_filter_channels(NUM_FILTERS / conv->groups());
+  } else {
+    conv->set_filter_channels(conv->channels() / conv->groups());
+  }
  conv->set_img_size(IMAGE_SIZE);
  int output_x = outputSize(conv->img_size(),
                            conv->filter_size(),
@@ -199,8 +207,14 @@ void testProjectionConv(size_t groups) {
                            conv->stride_y(),
                            /* caffeMode */ true);
  conv->set_output_x(output_x);
-  conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS);
+  conv->set_output_y(output_y);
-  conf.set_output_size(output_x * output_y * NUM_FILTERS);
+  if (isDeconv) {
+    conf.set_input_size(output_x * output_y * CHANNELS);
+    conf.set_output_size(IMAGE_SIZE * IMAGE_SIZE * NUM_FILTERS);
+  } else {
+    conf.set_input_size(IMAGE_SIZE * IMAGE_SIZE * CHANNELS);
+    conf.set_output_size(output_x * output_y * NUM_FILTERS);
+  }
  testProjectionGrad(conf,
                     INPUT_DATA,
@@ -215,8 +229,12 @@ void testProjectionConv(size_t groups) {
 #ifndef PADDLE_ONLY_CPU
 TEST(Projection, conv) {
-  testProjectionConv(1);
+  /// test ConvProjection
-  testProjectionConv(3);
+  testProjectionConv(1, false);
+  testProjectionConv(3, false);
+  /// test ConvTransProjection
+  testProjectionConv(1, true);
+  testProjectionConv(3, true);
 }
 #endif
@@ -385,11 +403,11 @@ void testConvTransLayer(const string& type, bool trans, bool useGpu) {
  config.layerConfig.set_partial_sum(1);
  config.layerConfig.set_shared_biases(true);
-  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 288});
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1024, 384});
  LayerInputConfig* input = config.layerConfig.add_inputs();
  ConvConfig* conv = input->mutable_conv_conf();
  conv->set_filter_size(2);
-  conv->set_filter_size_y(3);
+  conv->set_filter_size_y(4);
  conv->set_channels(16);
  conv->set_padding(0);
  conv->set_padding_y(1);
@@ -416,6 +434,9 @@ TEST(Layer, convTransLayer) {
  for (auto useGpu : {false, true}) {
    testConvTransLayer("exconvt", /* trans= */ false, /* useGpu= */ useGpu);
  }
+#ifndef PADDLE_ONLY_CPU
+  testConvTransLayer("cudnn_convt", /* trans= */ false, /* useGpu= */ true);
+#endif
 }
 TEST(Layer, blockExpandLayer) {
@@ -1482,16 +1503,20 @@ TEST(Layer, BatchNormalizationLayer) {
 #endif
 }
-TEST(Operator, conv) {
+void testConvOperator(bool isDeconv) {
  TestConfig config;
  const int NUM_FILTERS = 16;
  const int FILTER_SIZE = 2;
  const int FILTER_SIZE_Y = 3;
  const int CHANNELS = 3;
  const int IMAGE_SIZE = 16;
-  const int IMAGE_SIZE_Y = 8;
+  const int IMAGE_SIZE_Y = 9;
  OperatorConfig& operatorConf = *config.layerConfig.add_operator_confs();
-  operatorConf.set_type("conv");
+  if (isDeconv) {
+    operatorConf.set_type("convt");
+  } else {
+    operatorConf.set_type("conv");
+  }
  ConvConfig* conv = operatorConf.mutable_conv_conf();
  operatorConf.set_num_filters(NUM_FILTERS);
  conv->set_filter_size(FILTER_SIZE);
@@ -1502,7 +1527,6 @@ TEST(Operator, conv) {
  conv->set_stride(2);
  conv->set_stride_y(2);
  conv->set_groups(1);
-  conv->set_filter_channels(conv->channels() / conv->groups());
  conv->set_img_size(IMAGE_SIZE);
  conv->set_img_size_y(IMAGE_SIZE_Y);
  conv->set_output_x(outputSize(conv->img_size(),
@@ -1515,11 +1539,22 @@ TEST(Operator, conv) {
                                conv->padding_y(),
                                conv->stride_y(),
                                /*  caffeMode */ true));
-  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
-                              NUM_FILTERS);
-  config.inputDefs.push_back(
+  if (isDeconv) {
-      {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE_Y * CHANNELS, 0});
+    conv->set_filter_channels(NUM_FILTERS / conv->groups());
+    config.inputDefs.push_back({INPUT_DATA,
+                                "layer_0",
+                                conv->output_x() * conv->output_y() * CHANNELS,
+                                0});
+    config.layerConfig.set_size(IMAGE_SIZE * IMAGE_SIZE_Y * NUM_FILTERS);
+  } else {
+    conv->set_filter_channels(conv->channels() / conv->groups());
+    config.inputDefs.push_back(
+        {INPUT_DATA, "layer_0", IMAGE_SIZE * IMAGE_SIZE_Y * CHANNELS, 0});
+    config.layerConfig.set_size(conv->output_x() * conv->output_y() *
+                                NUM_FILTERS);
+  }
  config.inputDefs.push_back(
      {INPUT_DATA,
       "layer_1",
@@ -1531,6 +1566,11 @@ TEST(Operator, conv) {
  testOperatorGrad(config, operatorConf, 100, /*useGpu*/ true, false);
 }
+TEST(Operator, conv) {
+  // Run the "convt" (isDeconv) operator first, then the plain "conv" one.
+  for (bool isDeconv : {true, false}) {
+    testConvOperator(isDeconv);
+  }
+}
 TEST(Layer, FeatureMapExpandLayer) {
  TestConfig config;
  config.layerConfig.set_type("featmap_expand");
@@ -1602,6 +1642,39 @@ TEST(Layer, PadLayer) {
  }
 }
+TEST(Layer, CrossChannelNormLayer) {
+  TestConfig config;
+  // A single data input of size 100 with a parameter of size 10.
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 100, 10});
+  config.layerConfig.set_type("norm");
+  config.layerConfig.set_size(100);
+  // Select the cross-channel variant through the input's norm configuration.
+  NormConfig* normConf = config.layerConfig.add_inputs()->mutable_norm_conf();
+  normConf->set_norm_type("cross-channel-norm");
+  normConf->set_channels(10);
+  normConf->set_size(100);
+  normConf->set_scale(0);
+  normConf->set_pow(0);
+  normConf->set_blocked(0);
+  // Check gradients on both CPU and GPU.
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(config, "cross-channel-norm", 10, false, useGpu, false, 5);
+  }
+}
+TEST(Layer, smooth_l1) {
+  TestConfig config;
+  config.layerConfig.set_type("smooth_l1");
+  // First input: width-1 data without parameters.
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 1, 0});
+  config.layerConfig.add_inputs();
+  // Second input: the width-1 target, also without parameters.
+  config.inputDefs.push_back({INPUT_DATA_TARGET, "layer_1", 1, 0});
+  config.layerConfig.add_inputs();
+  // Check gradients on both CPU and GPU.
+  for (bool useGpu : {false, true}) {
+    testLayerGrad(config, "smooth_l1", 100, false, useGpu, false, 2.0);
+  }
+}
 int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  initMain(argc, argv);

--- a/paddle/math/BaseMatrix.cu
+++ b/paddle/math/BaseMatrix.cu
@@ -1453,6 +1453,24 @@ void BaseMatrixT<T>::divRowVector(BaseMatrixT& b) {
              true_type() /* bAsRowVector */, false_type());
 }
+// Applies binary::DotMul between this matrix and b, with b flagged as a
+// column vector (bAsColVector).
+template<class T>
+void BaseMatrixT<T>::mulColVector(BaseMatrixT& b) {
+  // Operate on the full height_ x width_ extent with all-zero offsets.
+  int rows = height_;
+  int cols = width_;
+  MatrixOffset zeroOffset(0, 0, 0, 0);
+  applyBinary(binary::DotMul<T>(), b, rows, cols, zeroOffset,
+              false_type(), true_type() /* bAsColVector */);
+}
+// Applies binary::DotDiv between this matrix and b, with b flagged as a
+// column vector (bAsColVector).
+template<class T>
+void BaseMatrixT<T>::divColVector(BaseMatrixT& b) {
+  // Operate on the full height_ x width_ extent with all-zero offsets.
+  int rows = height_;
+  int cols = width_;
+  MatrixOffset zeroOffset(0, 0, 0, 0);
+  applyBinary(binary::DotDiv<T>(), b, rows, cols, zeroOffset,
+              false_type(), true_type() /* bAsColVector */);
+}
 template<>
 template <class Agg>
 int BaseMatrixT<real>::applyRow(Agg agg, BaseMatrixT& b) {

--- a/paddle/math/BaseMatrix.h
+++ b/paddle/math/BaseMatrix.h
@@ -545,6 +545,9 @@ public:
  void mulRowVector(BaseMatrixT& b);
  void divRowVector(BaseMatrixT& b);
+  void mulColVector(BaseMatrixT& b);
+  void divColVector(BaseMatrixT& b);
  void addP2P(BaseMatrixT& b);
  /**

--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -3590,6 +3590,55 @@ void CpuMatrix::sumOfSquaresBp(Matrix& output, Matrix& label) {
  }
 }
+// Smooth L1 cost of `output` against `label`, one value per sample.
+//
+// This matrix receives the result and must be numSamples x 1; `output` and
+// `label` must both be numSamples x dim CPU matrices. For every sample the
+// per-element terms are summed over the `dim` columns:
+//   f(x) = 0.5 * x^2   if |x| < 1
+//        = |x| - 0.5   otherwise,   x = output - label
+void CpuMatrix::smoothL1(Matrix& output, Matrix& label) {
+  CHECK(output.useGpu_ == false && label.useGpu_ == false)
+      << "Matrix type are not equal";
+  size_t numSamples = getHeight();
+  size_t dim = output.getWidth();
+  CHECK_EQ(label.getHeight(), numSamples);
+  CHECK_EQ(output.getHeight(), numSamples);
+  CHECK_EQ(label.getWidth(), dim);
+  CHECK_EQ(getWidth(), (size_t)1);
+  real* out = output.getData();
+  real* cost = getData();
+  real* lbl = label.getData();
+  // The cost matrix has a single column, so sample i owns exactly cost[i].
+  // Only `out` and `lbl` advance by `dim`; stepping `cost` by `dim` and
+  // indexing it with j would write past the end of the buffer for dim > 1.
+  for (size_t i = 0; i < numSamples; ++i, out += dim, lbl += dim) {
+    real sum = 0;
+    for (size_t j = 0; j < dim; ++j) {
+      real absVal = std::fabs(out[j] - lbl[j]);
+      if (absVal < 1.0)
+        sum += 0.5 * absVal * absVal;
+      else
+        sum += absVal - 0.5;
+    }
+    cost[i] = sum;
+  }
+}
+// Derivative of the smooth L1 cost with respect to `output`.
+//
+// The result is written into this matrix, one entry per element of
+// `output`, so this matrix, `output` and `label` must all be
+// numSamples x dim CPU matrices.
+void CpuMatrix::smoothL1Bp(Matrix& output, Matrix& label) {
+  CHECK(output.useGpu_ == false && label.useGpu_ == false)
+      << "Matrix type are not equal";
+  size_t numSamples = getHeight();
+  size_t dim = output.getWidth();
+  CHECK_EQ(label.getHeight(), numSamples);
+  CHECK_EQ(output.getHeight(), numSamples);
+  CHECK_EQ(label.getWidth(), dim);
+  // The loop below writes `dim` entries per row of this matrix, so its width
+  // has to be `dim`. Checking against 1 is only consistent when dim == 1.
+  CHECK_EQ(getWidth(), dim);
+  real* out = output.getData();
+  real* grad = getData();
+  real* lbl = label.getData();
+  // f'(x) = x         if |x| < 1
+  //       = sign(x)   otherwise
+  for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim, lbl += dim) {
+    for (size_t j = 0; j < dim; ++j) {
+      grad[j] = out[j] - lbl[j];
+      if (std::fabs(grad[j]) >= 1) grad[j] = (0 < grad[j]) - (grad[j] < 0);
+    }
+  }
+}
 void CpuMatrix::tanh(Matrix& output) {
  CHECK(isContiguous());
  CHECK(output.isContiguous());

--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -783,6 +783,14 @@ public:
    LOG(FATAL) << "Not implemented";
  }
+  // Smooth L1 cost of `output` against `label`. This default only logs a
+  // fatal "Not implemented" error; NOTE(review): presumably overridden by
+  // CpuMatrix::smoothL1 — confirm the class hierarchy.
+  virtual void smoothL1(Matrix& output, Matrix& label) {
+    LOG(FATAL) << "Not implemented";
+  }
+  // Gradient counterpart of smoothL1. This default only logs a fatal
+  // "Not implemented" error; NOTE(review): presumably overridden by
+  // CpuMatrix::smoothL1Bp — confirm the class hierarchy.
+  virtual void smoothL1Bp(Matrix& outputV, Matrix& label) {
+    LOG(FATAL) << "Not implemented";
+  }
  virtual void tanh(Matrix& output) { LOG(FATAL) << "Not implemented"; }
  virtual void tanhDerivative(Matrix& output) {
@@ -1720,6 +1728,9 @@ public:
  /// gradient of sumOfSquares.
  void sumOfSquaresBp(Matrix& outputV, Matrix& label);
+  /// smooth L1 cost of output against label.
+  void smoothL1(Matrix& output, Matrix& label);
+  /// gradient of smoothL1.
+  void smoothL1Bp(Matrix& output, Matrix& label);
  void tanh(Matrix& output);
  void tanhDerivative(Matrix& output);

--- a/paddle/math/tests/test_BaseMatrix.cpp
+++ b/paddle/math/tests/test_BaseMatrix.cpp
@@ -110,6 +110,8 @@ TEST(BaseMatrix, BaseMatrix) {
      compare(&BaseMatrix::addRowVector);
      compare(&BaseMatrix::mulRowVector);
      compare(&BaseMatrix::divRowVector);
+      compare(&BaseMatrix::mulColVector);
+      compare(&BaseMatrix::divColVector);
      compare(&BaseMatrix::addP2P);
      compare(&BaseMatrix::invSqrt);
    }

--- a/paddle/parameter/Argument.cpp
+++ b/paddle/parameter/Argument.cpp
@@ -123,46 +123,6 @@ static void resizeAndCopy(ICpuGpuVectorPtr& dest,
  }
 }
-static void resizeAndCopy(UserDefinedVectorPtr& dest,
-                          const UserDefinedVectorPtr& src,
-                          bool useGpu,
-                          hl_stream_t stream) {
-  if (src) {
-    CHECK(!useGpu) << "not implemented";
-    size_t height = src->size();
-    if (!dest) {
-      dest = std::make_shared<std::vector<void*>>(height);
-    } else {
-      dest->resize(height);
-    }
-    std::copy_n(src->begin(), height, dest->begin());
-  } else {
-    dest.reset();
-  }
-}
-static void resizeAndCopy(UserDefinedVectorPtr& dest,
-                          const UserDefinedVectorPtr& src,
-                          int32_t startPos,
-                          int32_t copySize,
-                          bool useGpu,
-                          hl_stream_t stream = HPPL_STREAM_DEFAULT) {
-  if (src) {
-    CHECK(!useGpu) << "not implemented";
-    CHECK_LE((size_t)startPos + copySize, src->size());
-    size_t height = copySize;
-    if (!dest) {
-      dest = std::make_shared<std::vector<void*>>(height);
-    } else {
-      dest->resize(height);
-    }
-    std::copy_n(src->begin() + startPos, height, dest->begin());
-  } else {
-    dest.reset();
-  }
-}
 static void resizeAndCopy(SVectorPtr& dest,
                          const SVectorPtr& src,
                          bool useGpu,
@@ -223,7 +183,6 @@ void Argument::resizeAndCopyFrom(const Argument& src,
                  false /* useGpu */,
                  stream);
  }
-  resizeAndCopy(udp, src.udp, useGpu, stream);
  resizeAndCopy(strs, src.strs, useGpu, stream);
  frameWidth = src.frameWidth;
  frameHeight = src.frameHeight;
@@ -255,7 +214,6 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src,
    resizeAndCopy(value, src.value, startRow, copySize, useGpu, stream);
    resizeAndCopy(grad, src.grad, startRow, copySize, useGpu, stream);
    resizeAndCopy(ids, src.ids, startRow, copySize, useGpu, stream);
-    resizeAndCopy(udp, src.udp, startRow, copySize, useGpu, stream);
    resizeAndCopy(strs, src.strs, startRow, copySize, useGpu, stream);
    return copySize;
  } else {
@@ -268,7 +226,6 @@ int32_t Argument::resizeAndCopyFrom(const Argument& src,
    resizeAndCopy(value, src.value, startRow, copyFeatureSize, useGpu, stream);
    resizeAndCopy(grad, src.grad, startRow, copyFeatureSize, useGpu, stream);
    resizeAndCopy(ids, src.ids, startRow, copyFeatureSize, useGpu, stream);
-    resizeAndCopy(udp, src.udp, startRow, copySize, useGpu, stream);
    resizeAndCopy(sequenceStartPositions,
                  src.sequenceStartPositions,
                  startSeq,
@@ -583,7 +540,7 @@ void Argument::checkSubset() const {
  }
 }
-void Argument::degradeSequence(const Argument& input, bool useGpu) {
+void Argument::degradeSequence(const Argument& input) {
  CHECK_EQ(input.hasSubseq(), 1UL);
  size_t numSequences = input.getNumSequences();
  size_t numSubSequences = input.getNumSubSequences();

--- a/paddle/parameter/Argument.h
+++ b/paddle/parameter/Argument.h
@@ -24,8 +24,6 @@ limitations under the License. */
 namespace paddle {
-// vector of user defined pointers
-typedef std::shared_ptr<std::vector<void*>> UserDefinedVectorPtr;
 typedef std::shared_ptr<std::vector<std::string>> SVectorPtr;
 struct Argument {
@@ -40,7 +38,6 @@ struct Argument {
        sequenceStartPositions(nullptr),
        subSequenceStartPositions(nullptr),
        cpuSequenceDims(nullptr),
-        udp(nullptr),
        deviceId(-1),
        allCount(0),
        valueCount(0),
@@ -63,7 +60,6 @@ struct Argument {
    sequenceStartPositions = argument.sequenceStartPositions;
    subSequenceStartPositions = argument.subSequenceStartPositions;
    cpuSequenceDims = argument.cpuSequenceDims;
-    udp = argument.udp;
    deviceId = argument.deviceId;
    allCount = argument.allCount;
    frameHeight = argument.frameHeight;
@@ -96,8 +92,6 @@ struct Argument {
  // dimension of sequence, stored only in CPU
  IVectorPtr cpuSequenceDims;
-  UserDefinedVectorPtr udp;  // user defined pointer
  int deviceId;            // the GPU device id which the argument in
  int allCount;            // the number of output layers using this argument
  mutable int valueCount;  // waiting this member when layer do forward
@@ -137,7 +131,6 @@ struct Argument {
    if (ids) return ids->getSize();
    if (grad) return grad->getHeight();
    if (in) return in->getHeight();
-    if (udp) return udp->size();
    if (strs) return strs->size();
    return 0;
  }
@@ -296,7 +289,7 @@ struct Argument {
  /*
   sequence has sub-sequence degrades to a sequence.
   */
-  void degradeSequence(const Argument& input, bool useGpu);
+  void degradeSequence(const Argument& input);
  /**
   * @brief getValueString will return the argument's output in string. There

--- a/paddle/py_paddle/.gitignore
+++ b/paddle/py_paddle/.gitignore
 swig_paddle.py
+_swig_paddle.so
--- a/paddle/scripts/docker/README.md
+++ b/paddle/scripts/docker/README.md
@@ -83,13 +83,18 @@ docker build -t paddle:dev .
 The `docker build` command assumes that `Dockerfile` is in the root source tree.  Note that in this design, this `Dockerfile` is the only one in our repo.
+Users can specify an Ubuntu mirror server for faster downloading:
+```bash
+docker build -t paddle:dev --build-arg UBUNTU_MIRROR=mirror://mirrors.ubuntu.com/mirrors.txt .
+```
 ### Build PaddlePaddle from Source Code
 Given the development image `paddle:dev`, the following command builds PaddlePaddle from the source tree on the development computer (host):
 ```bash
-docker run -v $PWD:/paddle -e "GPU=OFF" -e "AVX=ON" -e "TEST=ON" paddle:dev
+docker run -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" -e "WITH_TEST=OFF" -e "RUN_TEST=OFF" paddle:dev
 ```
 This command mounts the source directory on the host into `/paddle` in the container, so the default entry point of `paddle:dev`, `build.sh`, could build the source code with possible local changes.  When it writes to `/paddle/build` in the container, it writes to `$PWD/build` on the host indeed.
@@ -100,6 +105,14 @@ This command mounts the source directory on the host into `/paddle` in the conta
 - `$PWD/build/paddle-<version>.deb` for production installation, and
 - `$PWD/build/Dockerfile`, which builds the production Docker image.
+Users can set the following environment variables (passed to `docker run` with `-e`) to either "ON" or "OFF":
+- `WITH_GPU`: ***Required***. Generates NVIDIA CUDA GPU code and relies on CUDA libraries.
+- `WITH_AVX`: ***Required***. Setting it to "OFF" prevents AVX instructions from being generated. If you don't know what AVX is, you probably want to set it to "ON".
+- `WITH_TEST`: ***Optional, default OFF***. Builds the unit test binaries. Once you've built the unit tests, you can run them manually with the following command:
+  ```bash
+    docker run -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_AVX=ON" paddle:dev sh -c "cd /paddle/build; make coveralls"
+  ```
+- `RUN_TEST`: ***Optional, default OFF***. Runs the unit tests after building. You can't run the unit tests without building them first.
 ### Build the Production Docker Image

--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
 #!/bin/bash
-function abort(){
-    echo "An error occurred. Exiting..." 1>&2
-    exit 1
-}
-trap 'abort' 0
 set -e
-mkdir -p /paddle/dist/cpu
-mkdir -p /paddle/dist/gpu
+# Set BASE_IMAGE according to env variables
-mkdir -p /paddle/dist/cpu-noavx
-mkdir -p /paddle/dist/gpu-noavx
-# Set BASE_IMAGE and DEB_PATH according to env variables
 if [ ${WITH_GPU} == "ON" ]; then
  BASE_IMAGE="nvidia/cuda:7.5-cudnn5-runtime-ubuntu14.04"
  # additional packages to install when building gpu images
-  GPU_DOCKER_PKG="python-pip"
+  GPU_DOCKER_PKG="python-pip python-dev"
-  if [ ${WITH_AVX} == "ON" ]; then
-    DEB_PATH="dist/gpu/"
-    DOCKER_SUFFIX="gpu"
-  else
-    DEB_PATH="dist/gpu-noavx/"
-    DOCKER_SUFFIX="gpu-noavx"
-  fi
 else
  BASE_IMAGE="python:2.7.13-slim"
-  if [ ${WITH_AVX} == "ON" ]; then
-    DEB_PATH="dist/cpu/"
-    DOCKER_SUFFIX="cpu"
-  else
-    DEB_PATH="dist/cpu-noavx/"
-    DOCKER_SUFFIX="noavx"
-  fi
 fi
-# If Dockerfile.* sets BUILD_AND_INSTALL to 'ON', it would have copied
-# source tree to /paddle, and this scripts should build it into
-# /paddle/build.
-if [[ ${BUILD_AND_INSTALL:-OFF} == 'ON' ]]; then
-    if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
-	ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/lib/libcudnn.so
-    fi
-    mkdir -p /paddle/build # -p means no error if exists
+DOCKERFILE_GPU_ENV=""
-    cd /paddle/build
+if [[ ${WITH_GPU:-OFF} == 'ON' ]]; then
-    # clean local cmake and third_party cache
+    DOCKERFILE_GPU_ENV="ENV LD_LIBRARY_PATH /usr/lib/x86_64-linux-gnu:${LD_LIBRARY_PATH}"
-    if [ ${DELETE_BUILD_CACHE} == 'ON' ]; then
-      rm -rf * && rm -rf ../third_party
-    fi
-    cmake .. \
-	  -DWITH_DOC=${WITH_DOC:-OFF} \
-	  -DWITH_GPU=${WITH_GPU:-OFF} \
-	  -DWITH_AVX=${WITH_AVX:-OFF} \
-	  -DWITH_SWIG_PY=ON \
-	  -DCUDNN_ROOT=/usr/ \
-	  -DWITH_STYLE_CHECK=OFF \
-	  -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-    make -j `nproc`
-    make install
-    # generate deb package for current build
-    # FIXME(typhoonzero): should we remove paddle/scripts/deb ?
-    # FIXME: CPACK_DEBIAN_PACKAGE_DEPENDS removes all dev dependencies, must
-    # install them in docker
-    cpack -D CPACK_GENERATOR='DEB' -D CPACK_DEBIAN_PACKAGE_DEPENDS="" ..
-    mv /paddle/build/*.deb /paddle/${DEB_PATH}
-    if [[ ${BUILD_WOBOQ:-OFF} == 'ON' ]]; then
+    # for cmake to find cudnn
-        apt-get install -y clang-3.8 llvm-3.8 libclang-3.8-dev
+    ln -s /usr/lib/x86_64-linux-gnu/libcudnn.so /usr/lib/libcudnn.so
-        # Install woboq_codebrowser.
+fi
-        git clone https://github.com/woboq/woboq_codebrowser /woboq
-        cd /woboq
-        cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
-        -DCMAKE_BUILD_TYPE=Release \
-        .
-        make
-        export WOBOQ_OUT=/usr/share/nginx/html/paddle
+mkdir -p /paddle/build
-        export BUILD_DIR=/paddle/build
+cd /paddle/build
-        mkdir -p $WOBOQ_OUT
-        cp -rv /woboq/data $WOBOQ_OUT/../data
+# build script will not fail if *.deb does not exist
-        /woboq/generator/codebrowser_generator \
+rm *.deb 2>/dev/null || true
+cmake .. \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DWITH_DOC=${WITH_DOC:-OFF} \
+      -DWITH_GPU=${WITH_GPU:-OFF} \
+      -DWITH_AVX=${WITH_AVX:-OFF} \
+      -DWITH_SWIG_PY=ON \
+      -DCUDNN_ROOT=/usr/ \
+      -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF} \
+      -DON_COVERALLS=${WITH_TEST:-OFF} \
+      -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
+make -j `nproc`
+if [[ ${RUN_TEST:-OFF} == "ON" ]]; then
+    make coveralls
+fi
+make install
+# generate deb package for current build
+# FIXME(typhoonzero): should we remove paddle/scripts/deb ?
+# FIXME: CPACK_DEBIAN_PACKAGE_DEPENDS removes all dev dependencies, must
+# install them in docker
+cpack -D CPACK_GENERATOR='DEB' -D CPACK_DEBIAN_PACKAGE_DEPENDS="" ..
+if [[ ${BUILD_WOBOQ:-OFF} == 'ON' ]]; then
+    apt-get install -y clang-3.8 llvm-3.8 libclang-3.8-dev
+    # Install woboq_codebrowser.
+    git clone https://github.com/woboq/woboq_codebrowser /woboq
+    cd /woboq
+    cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
+          -DCMAKE_BUILD_TYPE=Release \
+          .
+    make
+    export WOBOQ_OUT=/usr/share/nginx/html/paddle
+    export BUILD_DIR=/paddle/build
+    mkdir -p $WOBOQ_OUT
+    cp -rv /woboq/data $WOBOQ_OUT/../data
+    /woboq/generator/codebrowser_generator \
        -b /paddle/build \
        -a \
        -o $WOBOQ_OUT \
        -p paddle:/paddle
-        /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
+    /woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
-        cd /woboq
+    cd /woboq
-        make clean
+    make clean
-    fi
-    pip install /usr/local/opt/paddle/share/wheels/py_paddle*linux*.whl
-    pip install /usr/local/opt/paddle/share/wheels/paddle*.whl
-    paddle version
-    if [[ ${DOCKER_BUILD:-FALSE} == 'TRUE' ]]; then
-	# reduce docker image size
-	rm -rf /paddle/build
-	rm -rf /usr/local/opt/paddle/share/wheels/
-    fi
 fi
+paddle version
 # generate production docker image Dockerfile
 if [ ${USE_MIRROR} ]; then
  MIRROR_UPDATE="sed 's@http:\/\/archive.ubuntu.com\/ubuntu\/@mirror:\/\/mirrors.ubuntu.com\/mirrors.txt@' -i /etc/apt/sources.list && \\"
@@ -106,39 +80,23 @@ else
  MIRROR_UPDATE="\\"
 fi
-cat > /paddle/build/Dockerfile.${DOCKER_SUFFIX} <<EOF
+cat > /paddle/build/Dockerfile <<EOF
 FROM ${BASE_IMAGE}
 MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
-# ENV variables
-ARG WITH_AVX
-ARG WITH_DOC
-ARG WITH_STYLE_CHECK
-ENV WITH_GPU=${WITH_GPU}
-ENV WITH_AVX=\${WITH_AVX:-ON}
-ENV WITH_DOC=\${WITH_DOC:-OFF}
-ENV WITH_STYLE_CHECK=\${WITH_STYLE_CHECK:-OFF}
 ENV HOME /root
 ENV LANG en_US.UTF-8
 # Use Fix locales to en_US.UTF-8
 RUN ${MIRROR_UPDATE}
    apt-get update && \
-    apt-get install -y libgfortran3 ${GPU_DOCKER_PKG} && \
+    apt-get install -y libgfortran3 libpython2.7 ${GPU_DOCKER_PKG} && \
    apt-get clean -y && \
    pip install --upgrade pip && \
-    pip install -U 'protobuf==3.1.0' requests
+    pip install -U 'protobuf==3.1.0' requests numpy
-RUN pip install numpy
 # Use different deb file when building different type of images
-ADD \$PWD/${DEB_PATH}*.deb /usr/local/opt/paddle/deb/
+ADD build/*.deb /usr/local/opt/paddle/deb/
-RUN dpkg --force-all -i /usr/local/opt/paddle/deb/*.deb && rm -f /usr/local/opt/paddle/deb/*.deb
+# run paddle version to install python packages first
+RUN dpkg -i /usr/local/opt/paddle/deb/*.deb && rm -f /usr/local/opt/paddle/deb/*.deb && paddle version
-ENV PATH="/usr/local/opt/paddle/bin/:${PATH}"
+${DOCKERFILE_GPU_ENV}
 # default command shows the paddle version and exit
 CMD ["paddle", "version"]
 EOF
-trap : 0
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -94,16 +94,22 @@ else:
 EOF
 if [ $? -eq 1 ]; then  # Older version installed, or not installed at all
-   echo "First time run paddle, need to install some python dependencies."
+    echo "First time run paddle, need to install some python dependencies."
-   BASEDIR=$(dirname "$0")
+    # setuptools normalizes package version, so we need to use normalized
-   pip install ${BASEDIR}/../opt/paddle/share/wheels/*-@PADDLE_VERSION@-*.whl
+    # package version for paddle python package
-   if [ $? -ne 0 ]; then
+    PYTHON_PADDLE_VERSION=$(python -c 'import packaging
-      echo "pip install wheels failed. "
+import setuptools
-      echo "Please use 'sudo paddle' at the first time you use PaddlePaddle"
+print str(packaging.version.Version("@PADDLE_VERSION@"))
-      echo "PaddlePaddle will install some python dependencies automatically."
+' 2>/dev/null)
-      exit 1
+    BASEDIR=$(dirname "$0")
-   fi
+    pip install ${BASEDIR}/../opt/paddle/share/wheels/*-${PYTHON_PADDLE_VERSION}-*.whl
-   echo "Python dependencies are installed."
+    if [ $? -ne 0 ]; then
+	echo "pip install wheels failed. "
+	echo "Please use 'sudo paddle' at the first time you use PaddlePaddle"
+	echo "PaddlePaddle will install some python dependencies automatically."
+	exit 1
+    fi
+    echo "Python dependencies are installed."
 fi
 case "$1" in

--- a/paddle/scripts/travis/build_and_test.sh
+++ b/paddle/scripts/travis/build_and_test.sh
@@ -5,7 +5,7 @@ NPROC=1
 export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages
 export PYTHONHOME=/opt/python/2.7.12
 export PATH=/opt/python/2.7.12/bin:${PATH}
-cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS}
+cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DWITH_COVERAGE=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS}
 NRPOC=`nproc`
 make -j $NPROC
 make coveralls

--- a/paddle/setup.py.in
+++ b/paddle/setup.py.in
@@ -12,68 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-# This file is used to build paddle python binding package.
-# It will be invoked by Makefile that generated by COMAKE
 from setuptools import setup, Extension
-import numpy as np
-import api.paddle_ld_flags
-import platform
-import os
-system = platform.system().lower()
-is_osx = (system == 'darwin')
-is_win = (system == 'windows')
-is_lin = (system == 'linux')
-# The extra links will passed from COMAKE
-#   because generate paddle LDFLAGS is too complicated to do in setup.py
-#   it just read COMAKE generated LDFLAGS.
-extra_comps = []
-extra_links = []
-obj = api.paddle_ld_flags.PaddleLDFlag()
-extra_comps = obj.c_flag()
-ldflags = obj.ldflag_str()
-if ldflags is not None:
-  extra_links.extend(ldflags.split(" "))
-try:
-  with open('.py_paddle_extra_link_flags', 'r') as f:
-    for line in f:
-      extra_links += line.split()
-except:
-  pass
-if is_lin == True:
-    extra_links = ["-Xlinker", '-start-group'] + extra_links + ["-Xlinker", "-end-group"]
-elif is_osx == True:
-    os.environ["ARCHFLAGS"] = "-arch x86_64"
-    extra_links = ["-Wl,-all_load"] + extra_links
-include_dirs = [np.get_include(), "../"]    # include numpy and paddle.
-os.environ["CC"] = "@CMAKE_C_COMPILER@"
-os.environ["CXX"] = "@CMAKE_CXX_COMPILER@"
 setup(name="py_paddle",
-  version="@PADDLE_VERSION@",
+      version="${PADDLE_VERSION}",
-  ext_modules=[
+      packages=['py_paddle'],
-    Extension('py_paddle._swig_paddle',      # Build SWIG Extension.
+      include_package_data=True,
-       ['Paddle_wrap.cxx'],
+      package_data={'py_paddle':['*.py','_swig_paddle.so']},
-       language = "c++",
+      install_requires = [
-       include_dirs = include_dirs,
+        'nltk>=3.2.2',
-       extra_link_args = extra_links,
+        'numpy>=1.8.0',      # The numpy is required.
-       extra_compile_args = extra_comps
+        'protobuf>=${PROTOBUF_VERSION}'    # The paddle protobuf version
-    )
+      ],
-  ],
+      url='http://www.paddlepaddle.org/',
-  packages=['py_paddle'],
+      license='Apache 2.0',
-  include_dirs = include_dirs,
-  install_requires = [
-    'nltk>=3.2.2',
-    'numpy>=1.8.0',      # The numpy is required.
-    'protobuf>=3.0.0'    # The paddle protobuf version
-  ],
 )
--- a/paddle/utils/CpuId.h
+++ b/paddle/utils/CpuId.h
@@ -12,6 +12,7 @@ limitations under the License. */
 #pragma once
 #include "Common.h"
+#include "Error.h"
 namespace paddle {
@@ -97,4 +98,37 @@ private:
 #define HAS_AVX512  HAS_SIMD(SIMD_AVX512)
 // clang-format on
+/**
+ * Invoke checkCPUFeature() before Paddle initialization to
+ * check whether the target machine supports the instructions
+ * PaddlePaddle was compiled to use. If not, an Error is returned.
+ */
+inline Error __must_check checkCPUFeature() {
+  Error err;
+#ifndef __AVX__
+  if (HAS_AVX) {
+    LOG(WARNING) << "PaddlePaddle wasn't compiled to use avx instructions, "
+                 << "but these are available on your machine and could "
+                 << "speed up CPU computations via CMAKE .. -DWITH_AVX=ON";
+  }
+#else
+  if (!HAS_AVX) {
+    err = Error(
+        "PaddlePaddle was compiled to use avx instructions, "
+        "but these aren't available on your machine, please "
+        "disable it via CMAKE .. -DWITH_AVX=OFF");
+  }
+#endif  // __AVX__
+#ifdef __SSE3__
+  if (!HAS_SSE3) {
+    err = Error(
+        "PaddlePaddle was compiled to use sse3 instructions, "
+        "which is the minimum requirement of PaddlePaddle. "
+        "But these aren't available on your current machine.");
+  }
+#endif  // __SSE3__
+  return err;
+}
 }  // namespace paddle
--- a/paddle/utils/PythonUtil.cpp.in
+++ b/paddle/utils/PythonUtil.cpp.in
@@ -195,9 +195,14 @@ extern const char enable_virtualenv_py[];
 }
 void initPython(int argc, char** argv) {
 #ifndef PADDLE_NO_PYTHON
-  char pyHome[] = "@PYTHON_INSTALL_DIR@"; // NOLINT
+  std::string pyHome;
-  if (strlen(pyHome)) {
+#if defined(__APPLE__) || defined(__OSX__)
-    Py_SetPythonHome(pyHome);
+  pyHome = "/usr/local/Frameworks/Python.framework/Versions/2.7";
+  Py_SetPythonHome(const_cast<char*>(pyHome.c_str()));
+#endif
+  pyHome = "@PYTHON_INSTALL_DIR@"; // NOLINT
+  if (!pyHome.empty()) {
+    Py_SetPythonHome(const_cast<char*>(pyHome.c_str()));
  }
  Py_SetProgramName(argv[0]);
  Py_Initialize();

--- a/paddle/utils/Util.cpp
+++ b/paddle/utils/Util.cpp
@@ -26,6 +26,7 @@ limitations under the License. */
 #include <gflags/gflags.h>
+#include "CpuId.h"
 #include "CustomStackTrace.h"
 #include "Logging.h"
 #include "StringUtil.h"
@@ -185,6 +186,7 @@ void initMain(int argc, char** argv) {
  }
  version::printVersion();
+  checkCPUFeature().check();
  runInitFunctions();
 }

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -686,25 +686,17 @@ class ContextProjection(Projection):
 @config_class
-class ConvProjection(Projection):
+class ConvBaseProjection(Projection):
-    type = 'conv'
    def __init__(self,
                 input_layer_name,
                 num_filters=None,
                 conv_conf=None,
                 **xargs):
-        super(ConvProjection, self).__init__(input_layer_name, **xargs)
+        super(ConvBaseProjection, self).__init__(input_layer_name, **xargs)
        if num_filters is not None:
            self.proj_conf.num_filters = num_filters
-        parse_conv(conv_conf, input_layer_name, self.proj_conf.conv_conf,
-                   num_filters)
-        self.proj_conf.output_size = self.proj_conf.conv_conf.output_x * \
-                                     self.proj_conf.conv_conf.output_y * \
-                                     num_filters
    def calc_output_size(self, input_layer_config):
        return self.proj_conf.output_size
@@ -723,6 +715,48 @@ class ConvProjection(Projection):
        return None
+@config_class
+class ConvProjection(ConvBaseProjection):
+    type = 'conv'
+    def __init__(self,
+                 input_layer_name,
+                 num_filters=None,
+                 conv_conf=None,
+                 **xargs):
+        super(ConvProjection, self).__init__(input_layer_name, num_filters,
+                                             conv_conf, **xargs)
+        parse_conv(conv_conf, self.input_layer_name, self.proj_conf.conv_conf,
+                   num_filters)
+        self.proj_conf.output_size = self.proj_conf.conv_conf.output_x * \
+                                     self.proj_conf.conv_conf.output_y * \
+                                     num_filters
+@config_class
+class ConvTransProjection(ConvBaseProjection):
+    type = 'convt'
+    def __init__(self,
+                 input_layer_name,
+                 num_filters=None,
+                 conv_conf=None,
+                 **xargs):
+        super(ConvTransProjection, self).__init__(input_layer_name, num_filters,
+                                                  conv_conf, **xargs)
+        parse_conv(
+            conv_conf,
+            self.input_layer_name,
+            self.proj_conf.conv_conf,
+            num_filters,
+            trans=True)
+        self.proj_conf.output_size = self.proj_conf.conv_conf.img_size_y * \
+                                     self.proj_conf.conv_conf.img_size * \
+                                     num_filters
 # Define a operator for mixed layer
 @config_class
 class Operator(Cfg):
@@ -789,6 +823,36 @@ class ConvOperator(Operator):
        return self.operator_conf.output_size
+@config_class
+class ConvTransOperator(Operator):
+    type = 'convt'
+    def __init__(self,
+                 input_layer_names,
+                 num_filters=None,
+                 conv_conf=None,
+                 **xargs):
+        super(ConvTransOperator, self).__init__(input_layer_names, **xargs)
+        if num_filters is not None:
+            self.operator_conf.num_filters = num_filters
+        parse_conv(
+            conv_conf,
+            MakeLayerNameInSubmodel(input_layer_names[0]),
+            self.operator_conf.conv_conf,
+            num_filters,
+            trans=True)
+        self.operator_conf.output_size = \
+            self.operator_conf.conv_conf.img_size * \
+            self.operator_conf.conv_conf.img_size_y * \
+            num_filters
+        config_assert(len(input_layer_names) == 2, "Conv is binary operator")
+    def calc_output_size(self, input_sizes):
+        return self.operator_conf.output_size
 # please refer to the comments in proto/ModelConfig.proto
 @config_class
 class Conv(Cfg):
@@ -1156,9 +1220,11 @@ def parse_image(image, input_layer_name, image_conf):
 def parse_norm(norm, input_layer_name, norm_conf):
    norm_conf.norm_type = norm.norm_type
-    config_assert(norm.norm_type in ['rnorm', 'cmrnorm-projection'],
+    config_assert(
-                  "norm-type %s is not in [rnorm, 'cmrnorm-projection']" %
+        norm.norm_type in
-                  norm.norm_type)
+        ['rnorm', 'cmrnorm-projection', 'cross-channel-norm'],
+        "norm-type %s is not in [rnorm, cmrnorm-projection, cross-channel-norm]"
+        % norm.norm_type)
    norm_conf.channels = norm.channels
    norm_conf.size = norm.size
    norm_conf.scale = norm.scale
@@ -1772,8 +1838,17 @@ class ConvTransLayerBase(LayerBase):
        use_gpu = int(g_command_config_args.get("use_gpu", 0))
        parallel_nn = int(g_command_config_args.get("parallel_nn", 0))
-        # cudnn_convt has not been implemented so use exconvt only
+        # Automatically select cudnn_type for GPU and exconvt for CPU
-        self.layer_type = "exconvt"
+        # if set type=exconvt, but still reserve the way user specify
+        # exconvt or cudnn_convt manually.
+        if self.layer_type == "cudnn_convt":
+            config_assert(use_gpu, "cudnn_convt only support GPU")
+        if (use_gpu == 1 and self.layer_type != "exconvt" and
+            (parallel_nn == 0 or self.config.device > -1)):
+            self.layer_type = "cudnn_convt"
+        else:
+            self.layer_type = "exconvt"
        # need to specify layer in config
        self.config.type = self.layer_type
@@ -1790,10 +1865,9 @@ class ConvTransLayerBase(LayerBase):
                trans=True)
            conv_conf = self.config.inputs[input_index].conv_conf
            psize = self.calc_parameter_size(conv_conf)
-            print("output size for %s is %d " % (name, conv_conf.output_x))
            self.create_input_parameter(input_index, psize)
-            self.set_layer_size(
+            self.set_cnn_layer(name, conv_conf.img_size_y, conv_conf.img_size,
-                (conv_conf.img_size**2) * self.config.num_filters)
+                               self.config.num_filters)
        psize = self.config.size
        if shared_biases:
@@ -1810,6 +1884,11 @@ class ConvTransLayer(ConvTransLayerBase):
    layer_type = 'exconvt'
+@config_layer('cudnn_convt')
+class ConvTransLayer(ConvTransLayerBase):
+    layer_type = 'cudnn_convt'
 @config_layer('norm')
 class NormLayer(LayerBase):
    def __init__(self, name, inputs, **xargs):
@@ -1821,6 +1900,9 @@ class NormLayer(LayerBase):
                       norm_conf)
            self.set_cnn_layer(name, norm_conf.output_y, norm_conf.output_x,
                               norm_conf.channels, False)
+            if norm_conf.norm_type == "cross-channel-norm":
+                self.create_input_parameter(0, norm_conf.channels,
+                                            [norm_conf.channels, 1])
 @config_layer('pool')
@@ -2222,7 +2304,10 @@ def Link(
 # memory for recurrent layer group.
 # *name* and *size* are actual layer's name and size.
-# will return name of the memory,
+# If *name* is None, need to provide *memory_name* and need to use
+# SetMemoryInput() later to specify the layer which this memory remembers.
+#
+# return the name of the memory,
 # use this name if you assign the memory as other layer's input
 #
 # boot frame of memory is zeroed by default,
@@ -2234,15 +2319,18 @@ def Link(
 # can only be initailized by a *boot_layer* which is a sequence.
 #
 @config_func
-def Memory(
+def Memory(name,
-        name,
+           size,
-        size,
+           is_sequence=False,
-        is_sequence=False,
+           boot_layer=None,
-        boot_layer=None,
+           boot_bias=False,
-        boot_bias=False,
+           boot_bias_active_type="",
-        boot_bias_active_type="",
+           boot_with_const_id=None,
-        boot_with_const_id=None, ):
+           memory_name=None):
-    agent_name = name + "+delay1"
+    if not memory_name:
+        config_assert(name is not None, "name needs cannot be None")
+        memory_name = name + "+delay1"
+    agent_name = memory_name
    if is_sequence:
        agent_layer = SequenceAgentLayer(agent_name, size)
    else:
@@ -2250,7 +2338,8 @@ def Memory(
    config_assert(g_current_submodel.is_recurrent_layer_group,
                  'Memory should be used in recurrent layer group only')
    memory = g_current_submodel.memories.add()
-    memory.layer_name = MakeLayerNameInSubmodel(name)
+    if name is not None:
+        memory.layer_name = MakeLayerNameInSubmodel(name)
    memory.link_name = MakeLayerNameInSubmodel(agent_name)
    memory.is_sequence = is_sequence
    options = sum((boot_layer is not None, bool(boot_bias),
@@ -2274,6 +2363,17 @@ def Memory(
    return agent_name
+@config_func
+def SetMemoryInput(memory_name, layer_name):
+    memory_name = MakeLayerNameInSubmodel(memory_name)
+    layer_name = MakeLayerNameInSubmodel(layer_name)
+    for mem in g_current_submodel.memories:
+        if mem.link_name == memory_name:
+            mem.layer_name = layer_name
+            return
+    logger.fatal("Nonexistent memory name: " + memory_name)
 # Generator for recurrent layer group, to use it:
 #  1. define a id layer as output of layer group
 #  2. define a memory of this id layer, and assign a boot id(begin of sequence)

--- a/python/paddle/trainer_config_helpers/default_decorators.py
+++ b/python/paddle/trainer_config_helpers/default_decorators.py
@@ -97,13 +97,13 @@ def reset_hook():
 register_parse_config_hook(reset_hook)
-def wrap_name_default(name_prefix=None):
+def wrap_name_default(name_prefix=None, name_param="name"):
    """
    Decorator to set "name" arguments default to "{name_prefix}_{invoke_count}".
    ..  code:: python
-        @default_name("some_name")
+        @wrap_name_default("some_name")
        def func(name=None):
            print name      # name will never be None. If name is not set,
                            # name will be "some_name_%d"
@@ -115,7 +115,7 @@ def wrap_name_default(name_prefix=None):
    """
    factory = DefaultNameFactory(name_prefix)
    _name_factories.append(factory)
-    return wrap_param_default(["name"], factory)
+    return wrap_param_default([name_param], factory)
 def wrap_param_attr_default(param_names=None, default_factory=None):

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -112,6 +112,7 @@ __all__ = [
    'out_prod_layer',
    'print_layer',
    'priorbox_layer',
+    'cross_channel_norm_layer',
    'spp_layer',
    'pad_layer',
    'eos_layer',
@@ -288,6 +289,14 @@ class LayerOutput(object):
        """
        assert False, "this method should not be invoked"
+    def set_input(self, input):
+        """
+        Set the input for a memory layer. Can only be used for memory layer
+        """
+        assert isinstance(input, LayerOutput)
+        assert self.layer_type == LayerType.MEMORY
+        SetMemoryInput(self.name, input.name)
 ERROR_CLIPPING = 'error_clipping_threshold'
 DROPOUT = 'drop_rate'
@@ -704,8 +713,9 @@ class MixedLayerType(LayerOutput):
        assert len(self.inputs) == 0
        return self
-    def __exit__(self, *args, **kwargs):
+    def __exit__(self, exc_type, exc_value, tb):
-        del args, kwargs  # unused parameter to suppress warning
+        if exc_value is not None:
+            raise exc_value
        assert len(self.inputs) != 0
        ml = MixedLayer(
            name=self.name,
@@ -999,6 +1009,46 @@ def priorbox_layer(input,
        size=size)
+@wrap_name_default("cross_channel_norm")
+def cross_channel_norm_layer(input, name=None, param_attr=None):
+    """
+    Normalize a layer's output. This layer is necessary for ssd.
+    This layer applies normalization across the channels of each sample of
+    a conv layer's output and scales the output by a group of trainable
+    factors whose dimension equals the number of channels.
+    :param name: The Layer Name.
+    :type name: basestring
+    :param input: The input layer.
+    :type input: LayerOutput
+    :param param_attr: The Parameter Attribute|list.
+    :type param_attr: ParameterAttribute
+    :return: LayerOutput
+    """
+    assert input.num_filters is not None
+    Layer(
+        name=name,
+        type=LayerType.NORM_LAYER,
+        inputs=[
+            Input(
+                input.name,
+                norm=Norm(
+                    norm_type="cross-channel-norm",
+                    channels=input.num_filters,
+                    size=input.size,
+                    scale=0,
+                    pow=0,
+                    blocked=0),
+                **param_attr.attr)
+        ])
+    return LayerOutput(
+        name,
+        LayerType.NORM_LAYER,
+        parents=input,
+        num_filters=input.num_filters,
+        size=input.size)
 @wrap_name_default("seq_pooling")
 @wrap_bias_attr_default(has_bias=False)
 @wrap_param_default(['pooling_type'], default_factory=lambda _: MaxPooling())
@@ -2036,8 +2086,9 @@ def img_conv_layer(input,
    :param trans: true if it is a convTransLayer, false if it is a convLayer
    :type trans: bool
    :param layer_type: specify the layer_type, default is None. If trans=True,
-                       layer_type has to be "exconvt", otherwise layer_type
+                       layer_type has to be "exconvt" or "cudnn_convt", 
-                       has to be either "exconv" or "cudnn_conv"
+                       otherwise layer_type has to be either "exconv" or 
+                       "cudnn_conv"
    :type layer_type: String
    :return: LayerOutput object.
    :rtype: LayerOutput
@@ -2077,7 +2128,7 @@ def img_conv_layer(input,
    if layer_type:
        if trans:
-            assert layer_type in ["exconvt"]
+            assert layer_type in ["exconvt", "cudnn_convt"]
        else:
            assert layer_type in ["exconv", "cudnn_conv"]
        lt = layer_type
@@ -2759,8 +2810,10 @@ def seq_concat_layer(a, b, act=None, name=None, layer_attr=None,
        size=a.size)
+@wrap_name_default("memory", "memory_name")
 def memory(name,
           size,
+           memory_name=None,
           is_seq=False,
           boot_layer=None,
           boot_bias=None,
@@ -2782,14 +2835,32 @@ def memory(name,
    If boot_layer is not null, the memory is just the boot_layer's output.
    Set :code:`is_seq` is true boot layer is sequence.
    The same name layer in recurrent group will set memory on each time
    step.
-    :param name: memory's name.
+    .. code-block:: python
+       mem = memory(size=256, name='state')
+       state = fc_layer(input=mem, size=256, name='state')
+    If you do not want to specify the name, you can equivalently use set_input()
+    to specify the layer that needs to be remembered, as follows:
+    .. code-block:: python
+       mem = memory(size=256)
+       state = fc_layer(input=mem, size=256)
+       mem.set_input(state)
+    :param name: the name of the layer which this memory remembers.
+                 If name is None, user should call set_input() to specify the
+                 name of the layer which this memory remembers.
    :type name: basestring
    :param size: size of memory.
    :type size: int
+    :param memory_name: the name of the memory.
+                        It is ignored when name is provided.
+    :type memory_name: basestring
    :param is_seq: is sequence for boot_layer
    :type is_seq: bool
    :param boot_layer: boot layer of memory.
@@ -2811,13 +2882,21 @@ def memory(name,
        boot_bias = ParamAttr.to_bias(boot_bias)
    assert boot_layer is None or isinstance(boot_layer, LayerOutput)
+    if name is not None:
+        memory_name = None
-    agent_name = Memory(name, size, is_seq, boot_layer.name
+    memory_name = Memory(
-                        if boot_layer is not None else None, boot_bias,
+        name,
-                        boot_bias_active_type.name, boot_with_const_id)
+        size,
+        is_sequence=is_seq,
+        boot_layer=boot_layer.name if boot_layer is not None else None,
+        boot_bias=boot_bias,
+        boot_bias_active_type=boot_bias_active_type.name,
+        boot_with_const_id=boot_with_const_id,
+        memory_name=memory_name)
    lout = LayerOutput(
-        name=agent_name,
+        name=memory_name,
        size=size,
        layer_type=LayerType.MEMORY,
        parents=[boot_layer] if boot_layer is not None else None)
@@ -3565,7 +3644,7 @@ def __cost_input__(input, label, weight=None):
    ipts = [Input(input.name), Input(label.name)]
    parents = [input, label]
    if weight is not None:
-        assert weight.layer_type == LayerType.DATA
+        assert weight.size == 1
        ipts.append(Input(weight.name))
        parents.append(weight)
    return ipts, parents
@@ -3679,7 +3758,8 @@ def conv_operator(img,
                  padding=0,
                  filter_size_y=None,
                  stride_y=None,
-                  padding_y=None):
+                  padding_y=None,
+                  trans=False):
    """
    Different from img_conv_layer, conv_op is an Operator, which can be used
    in mixed_layer. And conv_op takes two inputs to perform convolution.
@@ -3735,7 +3815,9 @@ def conv_operator(img,
    if filter.size is not None:
        filter.size = filter_size * filter_size_y * num_filters * num_channels
-    op = ConvOperator(
+    opCls = ConvTransOperator if trans else ConvOperator
+    op = opCls(
        input_layer_names=[img.name, filter.name],
        num_filters=num_filters,
        conv_conf=Conv(
@@ -3747,6 +3829,7 @@ def conv_operator(img,
            padding_y=padding_y,
            stride_y=stride_y,
            groups=1))
    op.origin = [img, filter]
    return op
@@ -3762,7 +3845,8 @@ def conv_projection(input,
                    stride_y=None,
                    padding_y=None,
                    groups=1,
-                    param_attr=None):
+                    param_attr=None,
+                    trans=False):
    """
    Different from img_conv_layer and conv_op, conv_projection is a Projection,
    which can be used in mixed_layer and concat_layer. It uses cudnn to implement
@@ -3801,6 +3885,8 @@ def conv_projection(input,
    :type groups: int
    :param param_attr: Convolution param attribute. None means default attribute
    :type param_attr: ParameterAttribute
+    :param trans: whether it is convTrans or conv
+    :type trans: boolean
    :return: A DotMulProjection Object.
    :rtype: DotMulProjection
    """
@@ -3837,7 +3923,9 @@ def conv_projection(input,
        param_attr.attr["initial_strategy"] = 0
        param_attr.attr["initial_smart"] = False
-    proj = ConvProjection(
+    projCls = ConvTransProjection if trans else ConvProjection
+    proj = projCls(
        input_layer_name=input.name,
        num_filters=num_filters,
        conv_conf=Conv(
@@ -4946,7 +5034,12 @@ def lambda_cost(input,
 @wrap_name_default()
 @layer_support()
-def cross_entropy(input, label, name=None, coeff=1.0, layer_attr=None):
+def cross_entropy(input,
+                  label,
+                  name=None,
+                  coeff=1.0,
+                  weight=None,
+                  layer_attr=None):
    """
    A loss layer for multi class entropy.
@@ -4961,22 +5054,27 @@ def cross_entropy(input, label, name=None, coeff=1.0, layer_attr=None):
    :type input: LayerOutput.
    :param name: The name of this layer. It is optional.
    :type name: None|basestring.
-    :param coeff: The coefficient affects the gradient in the backward.
+    :param coeff: The cost is multiplied with coeff.
+                  The coefficient affects the gradient in the backward.
    :type coeff: float.
+    :param weight: The cost of each sample is multiplied with each weight.
+                   The weight should be a layer with size=1. Note that gradient
+                   will not be calculated for weight.
+    :type weight: LayerOutput
    :param layer_attr: Extra Layer Attribute.
    :type layer_attr: ExtraLayerAttribute
    :return: LayerOutput object.
    :rtype: LayerOutput.
    """
+    ipts, parents = __cost_input__(input, label, weight)
    Layer(
        name=name,
        type=LayerType.CROSS_ENTROPY,
-        inputs=[input.name, label.name],
+        inputs=ipts,
        coeff=coeff,
        **ExtraLayerAttribute.to_kwargs(layer_attr))
-    return LayerOutput(
+    return LayerOutput(name, LayerType.CROSS_ENTROPY, parents=parents, size=1)
-        name, LayerType.CROSS_ENTROPY, parents=[input, label], size=1)
 @wrap_name_default()

--- a/python/paddle/trainer_config_helpers/tests/configs/projections.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/projections.py
@@ -34,11 +34,31 @@ flt = data_layer(name='filter', size=3 * 3 * 1 * 64)
 with mixed_layer() as m7:
    m7 += conv_operator(
        img=img, filter=flt, num_filters=64, num_channels=1, filter_size=3)
+    m7 += conv_projection(img, filter_size=3, num_filters=64, num_channels=1)
+with mixed_layer() as m8:
+    m8 += conv_operator(
+        img=img,
+        filter=flt,
+        num_filters=64,
+        num_channels=1,
+        filter_size=3,
+        stride=2,
+        padding=1,
+        trans=True)
+    m8 += conv_projection(
+        img,
+        filter_size=3,
+        num_filters=64,
+        num_channels=1,
+        stride=2,
+        padding=1,
+        trans=True)
 end = mixed_layer(
    input=[
        full_matrix_projection(input=m5),
-        trans_full_matrix_projection(input=m6), full_matrix_projection(input=m7)
+        trans_full_matrix_projection(input=m6),
+        full_matrix_projection(input=m7), full_matrix_projection(input=m8)
    ],
    size=100,
    layer_attr=ExtraAttr(

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/img_trans_layers.protostr
@@ -33,6 +33,8 @@ layers {
  bias_parameter_name: "___conv_0__.wbias"
  num_filters: 64
  shared_biases: true
+  height: 256
+  width: 256
 }
 layers {
  name: "__batch_norm_0__"
@@ -58,6 +60,8 @@ layers {
  }
  bias_parameter_name: "___batch_norm_0__.wbias"
  moving_average_fraction: 0.9
+  height: 256
+  width: 256
 }
 layers {
  name: "__crmnorm_0__"

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/projections.protostr
@@ -154,13 +154,40 @@ layers {
  inputs {
    input_layer_name: "img"
  }
+  inputs {
+    input_layer_name: "img"
+    input_parameter_name: "___mixed_6__.w1"
+    proj_conf {
+      type: "conv"
+      name: "___mixed_6__.w1"
+      input_size: 1024
+      output_size: 57600
+      conv_conf {
+        filter_size: 3
+        channels: 1
+        stride: 1
+        padding: 0
+        groups: 1
+        filter_channels: 1
+        output_x: 30
+        img_size: 32
+        caffe_mode: true
+        filter_size_y: 3
+        padding_y: 0
+        stride_y: 1
+        output_y: 30
+        img_size_y: 32
+      }
+      num_filters: 64
+    }
+  }
  inputs {
    input_layer_name: "filter"
  }
  operator_confs {
    type: "conv"
    input_indices: 0
-    input_indices: 1
+    input_indices: 2
    input_sizes: 1024
    input_sizes: 576
    output_size: 57600
@@ -186,38 +213,112 @@ layers {
 layers {
  name: "__mixed_7__"
  type: "mixed"
+  size: 254016
+  active_type: ""
+  inputs {
+    input_layer_name: "img"
+  }
+  inputs {
+    input_layer_name: "img"
+    input_parameter_name: "___mixed_7__.w1"
+    proj_conf {
+      type: "convt"
+      name: "___mixed_7__.w1"
+      input_size: 1024
+      output_size: 254016
+      conv_conf {
+        filter_size: 3
+        channels: 1
+        stride: 2
+        padding: 1
+        groups: 1
+        filter_channels: 64
+        output_x: 32
+        img_size: 63
+        caffe_mode: true
+        filter_size_y: 3
+        padding_y: 1
+        stride_y: 2
+        output_y: 32
+        img_size_y: 63
+      }
+      num_filters: 64
+    }
+  }
+  inputs {
+    input_layer_name: "filter"
+  }
+  operator_confs {
+    type: "convt"
+    input_indices: 0
+    input_indices: 2
+    input_sizes: 1024
+    input_sizes: 576
+    output_size: 254016
+    conv_conf {
+      filter_size: 3
+      channels: 1
+      stride: 2
+      padding: 1
+      groups: 1
+      filter_channels: 64
+      output_x: 32
+      img_size: 63
+      caffe_mode: true
+      filter_size_y: 3
+      padding_y: 1
+      stride_y: 2
+      output_y: 32
+      img_size_y: 63
+    }
+    num_filters: 64
+  }
+}
+layers {
+  name: "__mixed_8__"
+  type: "mixed"
  size: 100
  active_type: ""
  inputs {
    input_layer_name: "__mixed_4__"
-    input_parameter_name: "___mixed_7__.w0"
+    input_parameter_name: "___mixed_8__.w0"
    proj_conf {
      type: "fc"
-      name: "___mixed_7__.w0"
+      name: "___mixed_8__.w0"
      input_size: 300
      output_size: 100
    }
  }
  inputs {
    input_layer_name: "__mixed_5__"
-    input_parameter_name: "___mixed_7__.w1"
+    input_parameter_name: "___mixed_8__.w1"
    proj_conf {
      type: "trans_fc"
-      name: "___mixed_7__.w1"
+      name: "___mixed_8__.w1"
      input_size: 100
      output_size: 100
    }
  }
  inputs {
    input_layer_name: "__mixed_6__"
-    input_parameter_name: "___mixed_7__.w2"
+    input_parameter_name: "___mixed_8__.w2"
    proj_conf {
      type: "fc"
-      name: "___mixed_7__.w2"
+      name: "___mixed_8__.w2"
      input_size: 57600
      output_size: 100
    }
  }
+  inputs {
+    input_layer_name: "__mixed_7__"
+    input_parameter_name: "___mixed_8__.w3"
+    proj_conf {
+      type: "fc"
+      name: "___mixed_8__.w3"
+      input_size: 254016
+      output_size: 100
+    }
+  }
  drop_rate: 0.5
 }
 parameters {
@@ -281,7 +382,23 @@ parameters {
  initial_smart: true
 }
 parameters {
-  name: "___mixed_7__.w0"
+  name: "___mixed_6__.w1"
+  size: 576
+  initial_mean: 0.0
+  initial_std: 0.471404520791
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___mixed_7__.w1"
+  size: 576
+  initial_mean: 0.0
+  initial_std: 0.471404520791
+  initial_strategy: 0
+  initial_smart: false
+}
+parameters {
+  name: "___mixed_8__.w0"
  size: 30000
  initial_mean: 0.0
  initial_std: 0.057735026919
@@ -291,7 +408,7 @@ parameters {
  initial_smart: true
 }
 parameters {
-  name: "___mixed_7__.w1"
+  name: "___mixed_8__.w1"
  size: 10000
  initial_mean: 0.0
  initial_std: 0.1
@@ -301,7 +418,7 @@ parameters {
  initial_smart: true
 }
 parameters {
-  name: "___mixed_7__.w2"
+  name: "___mixed_8__.w2"
  size: 5760000
  initial_mean: 0.0
  initial_std: 0.00416666666667
@@ -310,10 +427,20 @@ parameters {
  initial_strategy: 0
  initial_smart: true
 }
+parameters {
+  name: "___mixed_8__.w3"
+  size: 25401600
+  initial_mean: 0.0
+  initial_std: 0.00198412698413
+  dims: 254016
+  dims: 100
+  initial_strategy: 0
+  initial_smart: true
+}
 input_layer_names: "test"
 input_layer_names: "img"
 input_layer_names: "filter"
-output_layer_names: "__mixed_7__"
+output_layer_names: "__mixed_8__"
 sub_models {
  name: "root"
  layer_names: "test"
@@ -328,10 +455,11 @@ sub_models {
  layer_names: "filter"
  layer_names: "__mixed_6__"
  layer_names: "__mixed_7__"
+  layer_names: "__mixed_8__"
  input_layer_names: "test"
  input_layer_names: "img"
  input_layer_names: "filter"
-  output_layer_names: "__mixed_7__"
+  output_layer_names: "__mixed_8__"
  is_recurrent_layer_group: false
 }
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
@@ -331,6 +331,54 @@ layers {
  }
  trans_type: "non-seq"
 }
+layers {
+  name: "__recurrent_group_3__"
+  type: "recurrent_layer_group"
+  active_type: ""
+}
+layers {
+  name: "seq_input@__recurrent_group_3__"
+  type: "scatter_agent"
+  size: 100
+  active_type: ""
+}
+layers {
+  name: "__memory_6__@__recurrent_group_3__"
+  type: "agent"
+  size: 200
+  active_type: ""
+}
+layers {
+  name: "__fc_layer_0__@__recurrent_group_3__"
+  type: "fc"
+  size: 200
+  active_type: "tanh"
+  inputs {
+    input_layer_name: "seq_input@__recurrent_group_3__"
+    input_parameter_name: "___fc_layer_0__@__recurrent_group_3__.w0"
+  }
+  inputs {
+    input_layer_name: "__memory_6__@__recurrent_group_3__"
+    input_parameter_name: "___fc_layer_0__@__recurrent_group_3__.w1"
+  }
+  bias_parameter_name: "___fc_layer_0__@__recurrent_group_3__.wbias"
+}
+layers {
+  name: "__fc_layer_0__"
+  type: "gather_agent"
+  size: 200
+  active_type: ""
+}
+layers {
+  name: "__last_seq_4__"
+  type: "seqlastins"
+  size: 200
+  active_type: "linear"
+  inputs {
+    input_layer_name: "__fc_layer_0__"
+  }
+  trans_type: "non-seq"
+}
 parameters {
  name: "___mixed_0__.w0"
  size: 40000
@@ -481,6 +529,36 @@ parameters {
  initial_strategy: 0
  initial_smart: false
 }
+parameters {
+  name: "___fc_layer_0__@__recurrent_group_3__.w0"
+  size: 20000
+  initial_mean: 0.0
+  initial_std: 0.1
+  dims: 100
+  dims: 200
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___fc_layer_0__@__recurrent_group_3__.w1"
+  size: 40000
+  initial_mean: 0.0
+  initial_std: 0.0707106781187
+  dims: 200
+  dims: 200
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___fc_layer_0__@__recurrent_group_3__.wbias"
+  size: 200
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 200
+  initial_strategy: 0
+  initial_smart: false
+}
 input_layer_names: "seq_input"
 input_layer_names: "sub_seq_input"
 output_layer_names: "__last_seq_0__"
@@ -488,6 +566,7 @@ output_layer_names: "__first_seq_0__"
 output_layer_names: "__last_seq_1__"
 output_layer_names: "__last_seq_2__"
 output_layer_names: "__last_seq_3__"
+output_layer_names: "__last_seq_4__"
 sub_models {
  name: "root"
  layer_names: "seq_input"
@@ -510,6 +589,9 @@ sub_models {
  layer_names: "__gru_group_0___recurrent_group"
  layer_names: "__gru_group_0__"
  layer_names: "__last_seq_3__"
+  layer_names: "__recurrent_group_3__"
+  layer_names: "__fc_layer_0__"
+  layer_names: "__last_seq_4__"
  input_layer_names: "seq_input"
  input_layer_names: "sub_seq_input"
  output_layer_names: "__last_seq_0__"
@@ -517,6 +599,7 @@ sub_models {
  output_layer_names: "__last_seq_1__"
  output_layer_names: "__last_seq_2__"
  output_layer_names: "__last_seq_3__"
+  output_layer_names: "__last_seq_4__"
  is_recurrent_layer_group: false
 }
 sub_models {
@@ -647,4 +730,28 @@ sub_models {
  }
  target_inlinkid: -1
 }
+sub_models {
+  name: "__recurrent_group_3__"
+  layer_names: "seq_input@__recurrent_group_3__"
+  layer_names: "__memory_6__@__recurrent_group_3__"
+  layer_names: "__fc_layer_0__@__recurrent_group_3__"
+  is_recurrent_layer_group: true
+  reversed: false
+  memories {
+    layer_name: "__fc_layer_0__@__recurrent_group_3__"
+    link_name: "__memory_6__@__recurrent_group_3__"
+    is_sequence: false
+  }
+  in_links {
+    layer_name: "seq_input"
+    link_name: "seq_input@__recurrent_group_3__"
+    has_subseq: false
+  }
+  out_links {
+    layer_name: "__fc_layer_0__@__recurrent_group_3__"
+    link_name: "__fc_layer_0__"
+    has_subseq: false
+  }
+  target_inlinkid: -1
+}
--- a/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_rnn_group.py
@@ -16,6 +16,16 @@ def generate_rnn_simple(name):
    return rnn_simple
+def generate_rnn_simple_no_name():
+    """
+    Build a recurrent_group step function whose memory is created with
+    ``name=None`` and is bound to its source layer via ``set_input``.
+
+    :return: the step function ``rnn_simple``.
+    """
+    def rnn_simple(s):
+        # Unnamed memory: it is not tied to any layer by name at creation.
+        m = memory(name=None, size=200)
+        fc = fc_layer(input=[s, m], size=200)
+        # Explicitly make the fc output the layer this memory reads from.
+        m.set_input(fc)
+        return fc
+    return rnn_simple
 with mixed_layer() as lstm_param:  # test lstm unit, rnn group
    lstm_param += full_matrix_projection(input=seq, size=100 * 4)
@@ -33,4 +43,6 @@ outputs(
    last_seq(input=lstmemory_group(
        input=lstm_param, size=100)),
    last_seq(input=gru_group(
-        input=gru_param, size=100)))
+        input=gru_param, size=100)),
+    last_seq(input=recurrent_group(
+        step=generate_rnn_simple_no_name(), input=seq)), )
--- a/python/paddle/v2/dataset/cifar.py
+++ b/python/paddle/v2/dataset/cifar.py
@@ -20,7 +20,7 @@ TODO(yuyang18): Complete the comments.
 import cPickle
 import itertools
 import numpy
-import paddle.v2.dataset.common
+from common import download
 import tarfile
 __all__ = ['train100', 'test100', 'train10', 'test10']
@@ -55,23 +55,23 @@ def reader_creator(filename, sub_name):
 def train100():
    return reader_creator(
-        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
+        download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'train')
-        'train')
 def test100():
-    return reader_creator(
+    return reader_creator(download(CIFAR100_URL, 'cifar', CIFAR100_MD5), 'test')
-        paddle.v2.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5),
-        'test')
 def train10():
    return reader_creator(
-        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+        download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'data_batch')
-        'data_batch')
 def test10():
    return reader_creator(
-        paddle.v2.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5),
+        download(CIFAR10_URL, 'cifar', CIFAR10_MD5), 'test_batch')
-        'test_batch')
+def fetch():
+    """
+    Call ``download`` for both the CIFAR-10 and CIFAR-100 archives (module
+    name 'cifar', each with its own MD5 constant). The results are discarded.
+    """
+    download(CIFAR10_URL, 'cifar', CIFAR10_MD5)
+    download(CIFAR100_URL, 'cifar', CIFAR100_MD5)
--- a/python/paddle/v2/dataset/common.py
+++ b/python/paddle/v2/dataset/common.py
@@ -17,6 +17,8 @@ import hashlib
 import os
 import shutil
 import sys
+import importlib
+import paddle.v2.dataset
 __all__ = ['DATA_HOME', 'download', 'md5file']
@@ -69,3 +71,13 @@ def dict_add(a_dict, ele):
        a_dict[ele] += 1
    else:
        a_dict[ele] = 1
+def fetch_all():
+    """
+    Invoke ``fetch()`` on every dataset module that defines one.
+
+    Iterates over the attribute names of the ``paddle.v2.dataset`` package,
+    skips names starting with a double underscore, imports
+    ``paddle.v2.dataset.<name>`` and calls its ``fetch`` attribute when present.
+    """
+    for module_name in dir(paddle.v2.dataset):
+        if module_name.startswith("__"):
+            continue
+        # Import once and reuse the module object for both the lookup and
+        # the call.
+        module = importlib.import_module("paddle.v2.dataset.%s" % module_name)
+        fetch = getattr(module, "fetch", None)
+        if fetch is not None:
+            fetch()
--- a/python/paddle/v2/dataset/conll05.py
+++ b/python/paddle/v2/dataset/conll05.py
@@ -196,3 +196,11 @@ def test():
        words_name='conll05st-release/test.wsj/words/test.wsj.words.gz',
        props_name='conll05st-release/test.wsj/props/test.wsj.props.gz')
    return reader_creator(reader, word_dict, verb_dict, label_dict)
+def fetch():
+    """
+    Call ``download`` for each of the five conll05st resources (the WORDDICT,
+    VERBDICT, TRGDICT, EMB and DATA URLs), each with its matching MD5 constant.
+    The results are discarded.
+    """
+    download(WORDDICT_URL, 'conll05st', WORDDICT_MD5)
+    download(VERBDICT_URL, 'conll05st', VERBDICT_MD5)
+    download(TRGDICT_URL, 'conll05st', TRGDICT_MD5)
+    download(EMB_URL, 'conll05st', EMB_MD5)
+    download(DATA_URL, 'conll05st', DATA_MD5)
--- a/python/paddle/v2/dataset/imdb.py
+++ b/python/paddle/v2/dataset/imdb.py
@@ -123,3 +123,7 @@ def test(word_idx):
 def word_dict():
    return build_dict(
        re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150)
+def fetch():
+    """
+    Call ``paddle.v2.dataset.common.download`` for the IMDB ``URL`` with module
+    name 'imdb' and ``MD5``. The result is discarded.
+    """
+    paddle.v2.dataset.common.download(URL, 'imdb', MD5)
--- a/python/paddle/v2/dataset/imikolov.py
+++ b/python/paddle/v2/dataset/imikolov.py
@@ -89,3 +89,7 @@ def train(word_idx, n):
 def test(word_idx, n):
    return reader_creator('./simple-examples/data/ptb.valid.txt', word_idx, n)
+def fetch():
+    """
+    Call ``paddle.v2.dataset.common.download`` for the imikolov ``URL`` with
+    module name "imikolov" and ``MD5``. The result is discarded.
+    """
+    paddle.v2.dataset.common.download(URL, "imikolov", MD5)
--- a/python/paddle/v2/dataset/mnist.py
+++ b/python/paddle/v2/dataset/mnist.py
@@ -106,3 +106,10 @@ def test():
                                          TEST_IMAGE_MD5),
        paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist',
                                          TEST_LABEL_MD5), 100)
+def fetch():
+    """
+    Call ``paddle.v2.dataset.common.download`` for the four MNIST files
+    (train/test images and labels) under module name 'mnist'.
+
+    Each file is paired with its own MD5 constant, matching the pairs used by
+    ``test()`` for the test files. The results are discarded.
+    """
+    paddle.v2.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5)
+    paddle.v2.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5)
+    paddle.v2.dataset.common.download(TEST_IMAGE_URL, 'mnist', TEST_IMAGE_MD5)
+    # The test-label file must be checked against the test-label checksum,
+    # not the train-label one.
+    paddle.v2.dataset.common.download(TEST_LABEL_URL, 'mnist', TEST_LABEL_MD5)
--- a/python/paddle/v2/dataset/movielens.py
+++ b/python/paddle/v2/dataset/movielens.py
@@ -30,6 +30,9 @@ __all__ = [
 age_table = [1, 18, 25, 35, 45, 50, 56]
+URL = 'http://files.grouplens.org/datasets/movielens/ml-1m.zip'
+MD5 = 'c4d9eecfca2ab87c1945afe126590906'
 class MovieInfo(object):
    def __init__(self, index, categories, title):
@@ -77,10 +80,7 @@ USER_INFO = None
 def __initialize_meta_info__():
-    fn = download(
+    fn = download(URL, "movielens", MD5)
-        url='http://files.grouplens.org/datasets/movielens/ml-1m.zip',
-        module_name='movielens',
-        md5sum='c4d9eecfca2ab87c1945afe126590906')
    global MOVIE_INFO
    if MOVIE_INFO is None:
        pattern = re.compile(r'^(.*)\((\d+)\)$')
@@ -205,5 +205,9 @@ def unittest():
    print train_count, test_count
+def fetch():
+    """
+    Call ``download`` for the MovieLens ``URL`` with module name "movielens"
+    and ``MD5``. The result is discarded.
+    """
+    download(URL, "movielens", MD5)
 if __name__ == '__main__':
    unittest()
--- a/python/paddle/v2/dataset/sentiment.py
+++ b/python/paddle/v2/dataset/sentiment.py
@@ -125,3 +125,7 @@ def test():
    """
    data_set = load_sentiment_data()
    return reader_creator(data_set[NUM_TRAINING_INSTANCES:])
+def fetch():
+    """
+    Call ``nltk.download`` for the 'movie_reviews' corpus, passing
+    ``common.DATA_HOME`` as the download directory.
+    """
+    nltk.download('movie_reviews', download_dir=common.DATA_HOME)
--- a/python/paddle/v2/dataset/uci_housing.py
+++ b/python/paddle/v2/dataset/uci_housing.py
@@ -89,3 +89,7 @@ def test():
            yield d[:-1], d[-1:]
    return reader
+def fetch():
+    """
+    Call ``download`` for the UCI housing ``URL`` with module name
+    'uci_housing' and ``MD5``. The result is discarded.
+    """
+    download(URL, 'uci_housing', MD5)
--- a/python/paddle/v2/dataset/wmt14.py
+++ b/python/paddle/v2/dataset/wmt14.py
@@ -16,7 +16,7 @@ wmt14 dataset
 """
 import tarfile
-import paddle.v2.dataset.common
+from paddle.v2.dataset.common import download
 __all__ = ['train', 'test', 'build_dict']
@@ -95,11 +95,13 @@ def reader_creator(tar_file, file_name, dict_size):
 def train(dict_size):
    return reader_creator(
-        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
+        download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'train/train', dict_size)
-        'train/train', dict_size)
 def test(dict_size):
    return reader_creator(
-        paddle.v2.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN),
+        download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'test/test', dict_size)
-        'test/test', dict_size)
+def fetch():
+    """
+    Call ``download`` for ``URL_TRAIN`` with module name 'wmt14' and
+    ``MD5_TRAIN`` -- the same arguments ``train`` and ``test`` pass. The result
+    is discarded.
+    """
+    download(URL_TRAIN, 'wmt14', MD5_TRAIN)
--- a/python/paddle/v2/tests/test_layer.py
+++ b/python/paddle/v2/tests/test_layer.py
@@ -22,7 +22,9 @@ import paddle.v2.networks as networks
 pixel = layer.data(name='pixel', type=data_type.dense_vector(128))
 label = layer.data(name='label', type=data_type.integer_value(10))
-weight = layer.data(name='weight', type=data_type.dense_vector(10))
+weight = layer.data(name='weight', type=data_type.dense_vector(1))
+combine_weight = layer.data(
+    name='weight_combine', type=data_type.dense_vector(10))
 score = layer.data(name='score', type=data_type.dense_vector(1))
 hidden = layer.fc(input=pixel,
@@ -81,7 +83,8 @@ class AggregateLayerTest(unittest.TestCase):
 class MathLayerTest(unittest.TestCase):
    def test_math_layer(self):
        addto = layer.addto(input=[pixel, pixel])
-        linear_comb = layer.linear_comb(weights=weight, vectors=hidden, size=10)
+        linear_comb = layer.linear_comb(
+            weights=combine_weight, vectors=hidden, size=10)
        interpolation = layer.interpolation(
            input=[hidden, hidden], weight=score)
        bilinear = layer.bilinear_interp(input=conv, out_size_x=4, out_size_y=4)