Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into backward

5f3bc2a4 · fengjiayi · 380227b5 · 477c8356 · 5f3bc2a4 · 5f3bc2a4
103 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,6 +36,8 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
+option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    OFF)
+option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      OFF)
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
@@ -74,6 +76,10 @@ if(ANDROID)
        "Disable PYTHON when cross-compiling for Android" FORCE)
    set(WITH_RDMA OFF CACHE STRING
        "Disable RDMA when cross-compiling for Android" FORCE)
+    set(WITH_MKLDNN OFF CACHE STRING
+        "Disable MKLDNN when cross-compiling for Android" FORCE)
+    set(WITH_MKLML OFF CACHE STRING
+        "Disable MKLML package when cross-compiling for Android" FORCE)
 endif(ANDROID)

 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
@@ -87,6 +93,7 @@ endif()

 ########################################################################################

+include(external/mklml)     # download mklml package
 include(external/zlib)      # download, build, install zlib
 include(external/gflags)    # download, build, install gflags
 include(external/glog)      # download, build, install glog
@@ -94,6 +101,7 @@ include(external/gtest)     # download, build, install gtest
 include(external/protobuf)  # download, build, install protobuf
 include(external/python)    # download, build, install python
 include(external/openblas)  # download, build, install openblas
+include(external/mkldnn)    # download, build, install mkldnn
 include(external/swig)      # download, build, install swig
 include(external/warpctc)   # download, build, install warpctc
 include(external/any)       # download libn::any
@@ -135,6 +143,10 @@ if(WITH_GPU)
    endif(NOT WITH_DSO)
 endif(WITH_GPU)

+if(WITH_MKLDNN)
+    list(APPEND EXTERNAL_LIBS ${MKLDNN_LIBRARY} ${MKLDNN_IOMP_LIB})
+endif()
+
 if(USE_NNPACK)
    include(external/nnpack)
    list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS})

--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -15,23 +15,44 @@

 set(CBLAS_FOUND OFF)

-## Find MKL First.
-set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs")
-set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL")
+## Find MKLML First.
+if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
+  set(CBLAS_FOUND ON)
+  set(CBLAS_PROVIDER MKLML)
+  set(CBLAS_INC_DIR ${MKLML_INC_DIR})
+  set(CBLAS_LIBRARIES ${MKLML_LIB})
+
+  add_definitions(-DPADDLE_USE_MKLML)
+  add_definitions(-DLAPACK_FOUND)
+
+  message(STATUS "Found cblas and lapack in MKLML "
+    "(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  return()
+endif()
+
+## Then find MKL.
+set(INTEL_MKL_ROOT "/opt/intel/mkl" CACHE PATH "Folder contains intel mkl libs")
+set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains env MKL")
+
+set(MKL_INCLUDE_SEARCH_PATHS
+  ${MKL_ROOT}/include
+  ${INTEL_MKL_ROOT}/include)
+set(MKL_LIB_SEARCH_PATHS
+  ${MKL_ROOT}/lib
+  ${MKL_ROOT}/lib/intel64
+  ${INTEL_MKL_ROOT}/lib
+  ${INTEL_MKL_ROOT}/lib/intel64)

 find_path(MKL_INC_DIR mkl.h PATHS
-  ${MKL_ROOT}/include)
+  ${MKL_INCLUDE_SEARCH_PATHS})
 find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS
-  ${MKL_ROOT}/include)
+  ${MKL_INCLUDE_SEARCH_PATHS})
 find_library(MKL_CORE_LIB NAMES mkl_core PATHS
-  ${MKL_ROOT}/lib
-  ${MKL_ROOT}/lib/intel64)
+  ${MKL_LIB_SEARCH_PATHS})
 find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS
-  ${MKL_ROOT}/lib
-  ${MKL_ROOT}/lib/intel64)
+  ${MKL_LIB_SEARCH_PATHS})
 find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
-  ${MKL_ROOT}/lib
-  ${MKL_ROOT}/lib/intel64)
+  ${MKL_LIB_SEARCH_PATHS})

 if(MKL_LAPACK_INC_DIR AND MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
  set(CBLAS_FOUND ON)

--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -67,6 +67,30 @@ else()
    include_directories(${CUDA_TOOLKIT_INCLUDE})
 endif(NOT WITH_GPU)

+if(WITH_MKLDNN)
+    add_definitions(-DPADDLE_USE_MKLDNN)
+    if (WITH_MKLML AND MKLDNN_IOMP_DIR)
+        message(STATUS "Enable Intel OpenMP at ${MKLDNN_IOMP_DIR}")
+        set(OPENMP_FLAGS "-fopenmp")
+        set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
+        set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
+        set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
+        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
+        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
+        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
+    else()
+        find_package(OpenMP)
+        if(OPENMP_FOUND)
+            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+        else()
+            message(WARNING "Can not find OpenMP."
+                 "Some performance features in MKLDNN may not be available")
+        endif()
+    endif()
+
+endif(WITH_MKLDNN)
+
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")


--- a/cmake/external/gtest.cmake
+++ b/cmake/external/gtest.cmake
@@ -34,9 +34,15 @@ IF(WITH_TESTING)
            "${GTEST_INSTALL_DIR}/lib/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE)
    ENDIF(WIN32)

+    IF(WITH_MKLML)
+        # wait for mklml downloading completed
+        SET(GTEST_DEPENDS   ${MKLML_PROJECT})
+    ENDIF()
+
    ExternalProject_Add(
        extern_gtest
        ${EXTERNAL_PROJECT_LOG_ARGS}
+        DEPENDS         ${GTEST_DEPENDS}
        GIT_REPOSITORY  "https://github.com/google/googletest.git"
        GIT_TAG         "release-1.8.0"
        PREFIX          ${GTEST_SOURCES_DIR}

--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+IF(NOT ${WITH_MKLDNN})
+  return()
+ENDIF(NOT ${WITH_MKLDNN})
+
+INCLUDE(ExternalProject)
+
+SET(MKLDNN_PROJECT        "extern_mkldnn")
+SET(MKLDNN_SOURCES_DIR    ${THIRD_PARTY_PATH}/mkldnn)
+SET(MKLDNN_INSTALL_ROOT   ${CMAKE_INSTALL_PREFIX})
+IF(NOT "$ENV{HOME}" STREQUAL "/root")
+    SET(MKLDNN_INSTALL_ROOT  "$ENV{HOME}")
+ENDIF()
+
+SET(MKLDNN_INSTALL_DIR    "${MKLDNN_INSTALL_ROOT}/opt/paddle/third_party/mkldnn")
+SET(MKLDNN_INCLUDE_DIR    "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
+
+IF(WIN32)
+    MESSAGE(WARNING "It is not supported compiling with mkldnn in windows Paddle yet."
+      "Force WITH_MKLDNN=OFF")
+    SET(WITH_MKLDNN OFF)
+    return()
+ELSE(WIN32)
+    SET(MKLDNN_LIBRARY "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
+    MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
+    SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
+    #SET(CMAKE_MACOSX_RPATH 1) # hold for MacOS
+    SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
+ENDIF(WIN32)
+
+INCLUDE_DIRECTORIES(${MKLDNN_INCLUDE_DIR})
+
+IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
+    SET(MKLDNN_DEPENDS   ${MKLML_PROJECT})
+    SET(MKLDNN_MKLROOT   ${MKLML_ROOT})
+    SET(MKLDNN_IOMP_LIB  ${MKLML_IOMP_LIB})
+    SET(MKLDNN_IOMP_DIR  ${MKLML_LIB_DIR})
+ENDIF()
+
+ExternalProject_Add(
+    ${MKLDNN_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    DEPENDS             ${MKLDNN_DEPENDS}
+    GIT_REPOSITORY      "https://github.com/01org/mkl-dnn.git"
+    GIT_TAG             "v0.9"
+    PREFIX              ${MKLDNN_SOURCES_DIR}
+    CONFIGURE_COMMAND   mkdir -p <SOURCE_DIR>/build
+    BUILD_COMMAND       cd <SOURCE_DIR>/build
+                        && cmake .. -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} -DMKLROOT=${MKLDNN_MKLROOT}
+                        && $(MAKE)
+    INSTALL_COMMAND     cd <SOURCE_DIR>/build && $(MAKE) install
+    UPDATE_COMMAND      ""
+)
+
+ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIBRARY})
+ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
+MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIBRARY}")
+LIST(APPEND external_project_dependencies mkldnn)
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+IF(NOT ${WITH_MKLML})
+  return()
+ENDIF(NOT ${WITH_MKLML})
+
+INCLUDE(ExternalProject)
+
+SET(MKLML_PROJECT       "extern_mklml")
+SET(MKLML_VER           "mklml_lnx_2018.0.20170425")
+SET(MKLML_URL           "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz")
+SET(MKLML_SOURCE_DIR    "${THIRD_PARTY_PATH}/mklml")
+SET(MKLML_DOWNLOAD_DIR  "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
+SET(MKLML_DST_DIR       "opt/paddle/third_party/mklml")
+SET(MKLML_INSTALL_ROOT  "${CMAKE_INSTALL_PREFIX}")
+IF(NOT "$ENV{HOME}" STREQUAL "/root")
+    SET(MKLML_INSTALL_ROOT  "$ENV{HOME}")
+ENDIF()
+
+SET(MKLML_INSTALL_DIR   ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
+SET(MKLML_ROOT          ${MKLML_INSTALL_DIR}/${MKLML_VER})
+SET(MKLML_INC_DIR       ${MKLML_ROOT}/include)
+SET(MKLML_LIB_DIR       ${MKLML_ROOT}/lib)
+SET(MKLML_LIB           ${MKLML_LIB_DIR}/libmklml_intel.so)
+SET(MKLML_IOMP_LIB      ${MKLML_LIB_DIR}/libiomp5.so)
+SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
+
+INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
+
+SET(mklml_cmakefile ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt)
+FILE(WRITE ${mklml_cmakefile} "PROJECT(MKLML)\n"
+                              "cmake_minimum_required(VERSION 3.0)\n"
+                              "install(DIRECTORY ${MKLML_VER}\n"
+                              "        DESTINATION ${MKLML_DST_DIR})\n")
+
+ExternalProject_Add(
+    ${MKLML_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    PREFIX                ${MKLML_SOURCE_DIR}
+    DOWNLOAD_DIR          ${MKLML_DOWNLOAD_DIR}
+    DOWNLOAD_COMMAND      wget --no-check-certificate -O ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz ${MKLML_URL}
+                          && tar -xzf ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz
+    DOWNLOAD_NO_PROGRESS  1
+    UPDATE_COMMAND        ""
+    CMAKE_ARGS            -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT} 
+    CMAKE_CACHE_ARGS      -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT}
+)
+
+ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
+ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
+LIST(APPEND external_project_dependencies mklml)
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -124,6 +124,7 @@ set(GPU_COMMON_FLAGS
    -Wno-error=literal-suffix
    -Wno-error=unused-local-typedefs
    -Wno-error=unused-function  # Warnings in Numpy Header.
+    -Wno-error=array-bounds # Warnings in Eigen::array
 )

 if (APPLE)

--- a/go/cmd/master/master.go
+++ b/go/cmd/master/master.go
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package main

 import (

--- a/go/cmd/pserver/pserver.go
+++ b/go/cmd/pserver/pserver.go
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package main

 import (

--- a/go/connection/conn.go
+++ b/go/connection/conn.go
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package connection

 import (

--- a/go/master/CMakeLists.txt
+++ b/go/master/CMakeLists.txt
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
 if(WITH_TESTING)
  go_test(master_test)
 endif()
--- a/go/master/c/CMakeLists.txt
+++ b/go/master/c/CMakeLists.txt
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
 go_library(paddle_master SHARED DEPS paddle_go_optimizer)
--- a/go/master/c/client.go
+++ b/go/master/c/client.go
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package main

 /*

--- a/go/master/client.go
+++ b/go/master/client.go
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package master

 import (

--- a/go/master/client_internal_test.go
+++ b/go/master/client_internal_test.go
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package master

 import (

--- a/go/master/client_test.go
+++ b/go/master/client_test.go
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package master_test

 import (

--- a/go/master/etcd_client.go
+++ b/go/master/etcd_client.go
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package master

 import (

--- a/go/master/inmem_store.go
+++ b/go/master/inmem_store.go
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package master

 import "sync"

--- a/go/master/service.go
+++ b/go/master/service.go
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package master

 import (

--- a/go/master/service_internal_test.go
+++ b/go/master/service_internal_test.go
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package master

 import "testing"

--- a/go/pserver/CMakeLists.txt
+++ b/go/pserver/CMakeLists.txt
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
 if(WITH_TESTING)
  go_test(pserver_test DEPS paddle_go_optimizer)
 endif()
--- a/go/pserver/client/CMakeLists.txt
+++ b/go/pserver/client/CMakeLists.txt
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
 if(WITH_TESTING)
  go_test(pserver_client_test DEPS paddle_go_optimizer)
 endif()
--- a/go/pserver/client/c/CMakeLists.txt
+++ b/go/pserver/client/c/CMakeLists.txt
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
 cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf)
 target_link_libraries(paddle_go_optimizer stdc++ m)


--- a/go/pserver/client/c/cclient.go
+++ b/go/pserver/client/c/cclient.go
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package main

 /*

--- a/go/pserver/client/c/test/CMakeLists.txt
+++ b/go/pserver/client/c/test/CMakeLists.txt
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
 cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer)
 add_style_check_target(test_cclient test_cclient.c)
--- a/go/pserver/client/c/test/test_cclient.c
+++ b/go/pserver/client/c/test/test_cclient.c
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
 #include <stdio.h>
 #include <stdlib.h>


--- a/go/pserver/client/client.go
+++ b/go/pserver/client/client.go
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package client

 import (

--- a/go/pserver/client/client_test.go
+++ b/go/pserver/client/client_test.go
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package client_test

 import (

--- a/go/pserver/client/etcd_client.go
+++ b/go/pserver/client/etcd_client.go
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package client

 import (
@@ -66,10 +80,10 @@ func (p *EtcdClient) List() []Server {
 	for {
 		for i := 0; i < psDesired; i++ {
 			ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
-			cancel()
 			psKey := pserver.PsPath + strconv.Itoa(i)
 			log.Debugf("checking %s", psKey)
 			resp, err := p.client.Get(ctx, psKey)
+			cancel()
 			if err != nil {
 				log.Infof("Get psKey= %s error, %v", psKey, err)
 				time.Sleep(p.timeout)

--- a/go/pserver/etcd_client.go
+++ b/go/pserver/etcd_client.go
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package pserver

 import (

--- a/go/pserver/optimizer.go
+++ b/go/pserver/optimizer.go
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package pserver

 // #cgo CFLAGS: -I ../../

--- a/go/pserver/optimizer_test.go
+++ b/go/pserver/optimizer_test.go
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package pserver

 import (

--- a/go/pserver/service.go
+++ b/go/pserver/service.go
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package pserver

 import (

--- a/go/pserver/service_test.go
+++ b/go/pserver/service_test.go
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package pserver_test

 import (

--- a/go/utils/networkhelper/CMakeLists.txt
+++ b/go/utils/networkhelper/CMakeLists.txt
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
 if(WITH_TESTING)
  go_test(network_helper_test)
 endif()
--- a/go/utils/networkhelper/helper.go
+++ b/go/utils/networkhelper/helper.go
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package networkhelper

 import (

--- a/go/utils/networkhelper/helper_test.go
+++ b/go/utils/networkhelper/helper_test.go
+// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+// http://www.apache.org/licenses/LICENSE-2.0
+
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
 package networkhelper

 import "testing"

--- a/paddle/framework/eigen.h
+++ b/paddle/framework/eigen.h
@@ -61,24 +61,24 @@ struct EigenTensor {
  }
 };

+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+struct EigenMatrix : public EigenTensor<T, 2, MajorType, IndexType> {};
+
 template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
 struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
-  // Flatten is to reshape a Tensor into a one dimension EigenVector
-  static typename EigenTensor<T, 1>::Type Flatten(Tensor& tensor) {
-    return EigenTensor<T, 1>::From(
+  // Flatten reshapes a Tensor into an EigenVector.
+  static typename EigenVector::Type Flatten(Tensor& tensor) {
+    return EigenVector::From(
        tensor, make_ddim({static_cast<int>(product(tensor.dims_))}));
  }

-  static typename EigenTensor<T, 1>::ConstType Flatten(const Tensor& tensor) {
-    return EigenTensor<T, 1>::From(
+  static typename EigenVector::ConstType Flatten(const Tensor& tensor) {
+    return EigenVector::From(
        tensor, make_ddim({static_cast<int>(product(tensor.dims_))}));
  }
 };

-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = EigenTensor<T, 2, MajorType, IndexType>;
-
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/net.cc
+++ b/paddle/framework/net.cc
@@ -49,19 +49,22 @@ void PlainNet::CompleteAddOp(bool calc) {
      output_set.insert(opt);
    }
  }
+
  inputs_.reserve(input_set.size());
  std::copy(input_set.begin(), input_set.end(), std::back_inserter(inputs_));
+  std::sort(inputs_.begin(), inputs_.end());

  outputs_.reserve(output_set.size());
+  std::copy(output_set.begin(), output_set.end(), std::back_inserter(outputs_));
+  std::sort(outputs_.begin(), outputs_.end());
+
  std::vector<int> tmp_index;
  tmp_index.reserve(temp_output.size());
-  int idx = 0;
-  for (auto& opt : output_set) {
-    if (Contains(temp_output, opt)) {
-      tmp_index.push_back(idx);
+  int output_len = static_cast<int>(outputs_.size());
+  for (int i = 0; i < output_len; ++i) {
+    if (Contains(temp_output, outputs_[i])) {
+      tmp_index.push_back(i);
    }
-    outputs_.push_back(opt);
-    ++idx;
  }

  attrs_["temporary_index"] = tmp_index;
@@ -69,9 +72,12 @@ void PlainNet::CompleteAddOp(bool calc) {

 std::string PlainNet::DebugString() const {
  std::ostringstream os;
-  os << this->type_ << ":" << std::endl;
+  os << OperatorBase::DebugString() << std::endl;
  for (auto& op : ops_) {
-    os << "\t" << op->DebugString() << std::endl;
+    std::istringstream is(op->DebugString());
+    for (std::string line; std::getline(is, line);) {
+      os << "    " << line << std::endl;
+    }
  }
  return os.str();
 }

--- a/paddle/framework/net.h
+++ b/paddle/framework/net.h
@@ -39,7 +39,7 @@ namespace framework {
 */
 class Net : public OperatorBase {
 public:
-  virtual void AddOp(const OperatorPtr& op) = 0;
+  virtual void AddOp(const std::shared_ptr<OperatorBase>& op) = 0;
  virtual void CompleteAddOp(bool calc) = 0;
 };

@@ -57,7 +57,7 @@ class PlainNet : public Net {
   * Infer all the operators' input and output variables' shapes, will be called
   * before every mini-batch
   */
-  void InferShape(const ScopePtr& scope) const override {
+  void InferShape(const std::shared_ptr<Scope>& scope) const override {
    for (auto& op : ops_) {
      op->InferShape(scope);
    }
@@ -70,7 +70,7 @@ class PlainNet : public Net {
   * scope will be used instead. If no OpContext is provicded, default context
   * will be used.
   */
-  void Run(const ScopePtr& scope,
+  void Run(const std::shared_ptr<Scope>& scope,
           const platform::DeviceContext& dev_ctx) const override {
    for (auto& op : ops_) {
      op->Run(scope, dev_ctx);
@@ -80,7 +80,7 @@ class PlainNet : public Net {
  /**
   * @brief Add an operator by ptr
   */
-  void AddOp(const OperatorPtr& op) override {
+  void AddOp(const std::shared_ptr<OperatorBase>& op) override {
    PADDLE_ENFORCE(!add_op_done_, "Cannot AddOp when this network is sealed");
    ops_.push_back(op);
  }
@@ -89,7 +89,7 @@ class PlainNet : public Net {

  std::string DebugString() const override;

-  std::vector<OperatorPtr> ops_;
+  std::vector<std::shared_ptr<OperatorBase>> ops_;

 private:
  bool add_op_done_{false};

--- a/paddle/framework/net_op_test.cc
+++ b/paddle/framework/net_op_test.cc
@@ -16,9 +16,11 @@ static int run_cnt = 0;

 class TestOp : public OperatorBase {
 public:
-  void InferShape(const ScopePtr& scope) const override { ++infer_shape_cnt; }
-  void Run(const ScopePtr& scope,
-           const platform::DeviceContext& dev_ctx) const override {
+  void InferShape(const std::shared_ptr<pd::Scope>& scope) const override {
+    ++infer_shape_cnt;
+  }
+  void Run(const std::shared_ptr<pd::Scope>& scope,
+           const paddle::platform::DeviceContext& dev_ctx) const override {
    ++run_cnt;
  }
 };

--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -250,7 +250,7 @@ class OpRegistry {
    grad_creators()[op_type] = [] { return new OpType; };
  }

-  static OperatorPtr CreateOp(const std::string& type,
+  static std::shared_ptr<OperatorBase> CreateOp(const std::string& type,
                                                const VarNameList& inputs,
                                                const VarNameList& outputs,
                                                const AttributeMap& attrs) {
@@ -276,10 +276,10 @@ class OpRegistry {
    }

    op->Init();
-    return OperatorPtr(op);
+    return std::shared_ptr<OperatorBase>(op);
  }

-  static OperatorPtr CreateOp(const OpDesc& op_desc) {
+  static std::shared_ptr<OperatorBase> CreateOp(const OpDesc& op_desc) {
    std::vector<std::string> inputs;
    inputs.reserve((size_t)op_desc.inputs_size());
    std::copy(op_desc.inputs().begin(), op_desc.inputs().end(),

--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
@@ -7,9 +7,9 @@ namespace paddle {
 namespace framework {
 class CosineOp : public OperatorBase {
 public:
-  void Run(const ScopePtr& scope,
+  void Run(const std::shared_ptr<Scope>& scope,
           const platform::DeviceContext& dev_ctx) const override {}
-  void InferShape(const ScopePtr& scope) const override {}
+  void InferShape(const std::shared_ptr<Scope>& scope) const override {}
 };

 class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
@@ -27,8 +27,8 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {

 class MyTestOp : public OperatorBase {
 public:
-  void InferShape(const ScopePtr& scope) const override {}
-  void Run(const ScopePtr& scope,
+  void InferShape(const std::shared_ptr<Scope>& scope) const override {}
+  void Run(const std::shared_ptr<Scope>& scope,
           const platform::DeviceContext& dev_ctx) const override {}
 };

@@ -67,7 +67,7 @@ TEST(OpRegistry, CreateOp) {
  attr->set_type(paddle::framework::AttrType::FLOAT);
  attr->set_f(scale);

-  paddle::framework::OperatorPtr op =
+  std::shared_ptr<paddle::framework::OperatorBase> op =
      paddle::framework::OpRegistry::CreateOp(op_desc);
  auto scope = std::make_shared<paddle::framework::Scope>();
  paddle::platform::CPUDeviceContext dev_ctx;
@@ -89,7 +89,6 @@ TEST(OpRegistry, IllegalAttr) {

  bool caught = false;
  try {
-    paddle::framework::OperatorPtr op __attribute__((unused)) =
    paddle::framework::OpRegistry::CreateOp(op_desc);
  } catch (std::runtime_error& err) {
    caught = true;
@@ -110,7 +109,7 @@ TEST(OpRegistry, DefaultValue) {

  ASSERT_TRUE(op_desc.IsInitialized());

-  paddle::framework::OperatorPtr op =
+  std::shared_ptr<paddle::framework::OperatorBase> op =
      paddle::framework::OpRegistry::CreateOp(op_desc);
  auto scope = std::make_shared<paddle::framework::Scope>();
  paddle::platform::CPUDeviceContext dev_ctx;
@@ -136,7 +135,6 @@ TEST(OpRegistry, CustomChecker) {
  // attr 'test_attr' is not set
  bool caught = false;
  try {
-    paddle::framework::OperatorPtr op __attribute__((unused)) =
    paddle::framework::OpRegistry::CreateOp(op_desc);
  } catch (std::runtime_error& err) {
    caught = true;
@@ -155,7 +153,6 @@ TEST(OpRegistry, CustomChecker) {
  attr->set_i(3);
  caught = false;
  try {
-    paddle::framework::OperatorPtr op __attribute__((unused)) =
    paddle::framework::OpRegistry::CreateOp(op_desc);
  } catch (std::runtime_error& err) {
    caught = true;
@@ -174,8 +171,7 @@ TEST(OpRegistry, CustomChecker) {
  attr->set_type(paddle::framework::AttrType::INT);
  attr->set_i(4);
  SetInputFormat(&op_desc);
-  paddle::framework::OperatorPtr op =
-      paddle::framework::OpRegistry::CreateOp(op_desc);
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
  paddle::platform::CPUDeviceContext dev_ctx;
  auto scope = std::make_shared<paddle::framework::Scope>();
  op->Run(scope, dev_ctx);

--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -47,7 +47,6 @@ struct EigenDeviceConverter<platform::GPUPlace> {
 #endif

 class OperatorBase;
-using OperatorPtr = std::shared_ptr<OperatorBase>;
 /**
 * OperatorBase has the basic element that Net will call to do computation.
 * Only CreateOperator from OpRegistry will new Operator directly. User
@@ -85,10 +84,10 @@ class OperatorBase {

  /// InferShape infer the size of Variables used by this Operator with
  /// information inside scope
-  virtual void InferShape(const ScopePtr& scope) const = 0;
+  virtual void InferShape(const std::shared_ptr<Scope>& scope) const = 0;

  /// Net will call this function to Run an op.
-  virtual void Run(const ScopePtr& scope,
+  virtual void Run(const std::shared_ptr<Scope>& scope,
                   const platform::DeviceContext& dev_ctx) const = 0;

  // Get a input with argument's name described in `op_proto`
@@ -213,7 +212,7 @@ class OperatorWithKernel : public OperatorBase {
  using OpKernelMap =
      std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;

-  void Run(const ScopePtr& scope,
+  void Run(const std::shared_ptr<Scope>& scope,
           const platform::DeviceContext& dev_ctx) const final {
    auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
    opKernel->Compute(KernelContext(this, scope, dev_ctx));

--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -24,8 +24,8 @@ static int op_run_num = 0;
 class OpWithoutKernelTest : public OperatorBase {
 public:
  void Init() override { x = 1; }
-  void InferShape(const ScopePtr& scope) const override {}
-  void Run(const ScopePtr& scope,
+  void InferShape(const std::shared_ptr<Scope>& scope) const override {}
+  void Run(const std::shared_ptr<Scope>& scope,
           const platform::DeviceContext& dev_ctx) const override {
    op_run_num++;
    ASSERT_EQ((int)inputs_.size(), 1);
@@ -70,8 +70,7 @@ TEST(OperatorBase, all) {
  paddle::platform::CPUDeviceContext device_context;
  auto scope = std::make_shared<paddle::framework::Scope>();

-  paddle::framework::OperatorPtr op =
-      paddle::framework::OpRegistry::CreateOp(op_desc);
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
  scope->CreateVariable("OUT1");
  ASSERT_EQ(paddle::framework::op_run_num, 0);
  op->Run(scope, device_context);
@@ -189,8 +188,7 @@ TEST(OpKernel, all) {
  paddle::platform::CPUDeviceContext cpu_device_context;
  auto scope = std::make_shared<paddle::framework::Scope>();

-  paddle::framework::OperatorPtr op =
-      paddle::framework::OpRegistry::CreateOp(op_desc);
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0);
  op->Run(scope, cpu_device_context);
  ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
@@ -236,6 +234,6 @@ TEST(OpKernel, multi_inputs) {
  paddle::platform::CPUDeviceContext cpu_device_context;
  auto scope = std::make_shared<Scope>();

-  OperatorPtr op(paddle::framework::OpRegistry::CreateOp(op_desc));
+  auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
  op->Run(scope, cpu_device_context);
 }
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -24,7 +24,6 @@ namespace paddle {
 namespace framework {

 class Scope;
-using ScopePtr = std::shared_ptr<Scope>;

 /**
 * @brief Scope that manage all variables.
@@ -44,7 +43,7 @@ class Scope {
  /**
   * @brief Initialize a Scope with parent.
   */
-  explicit Scope(const ScopePtr& parent) : parent_(parent) {}
+  explicit Scope(const std::shared_ptr<Scope>& parent) : parent_(parent) {}

  /**
   * @brief Create Variable
@@ -91,7 +90,7 @@ class Scope {

 private:
  std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
-  ScopePtr parent_{nullptr};
+  std::shared_ptr<Scope> parent_{nullptr};
 };

 }  // namespace framework

--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -48,25 +48,27 @@ class Tensor {

  template <typename T>
  const T* data() const {
-    CheckDims<T>();
+    EnforceSufficientMemory<T>();
    return reinterpret_cast<const T*>(
        reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
  }

  template <typename T>
  T* data() {
-    CheckDims<T>();
+    EnforceSufficientMemory<T>();
    return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                offset_);
  }

-  template <typename T>
+  template <typename T,  // must be POD types
+            typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
  T* mutable_data(DDim dims, platform::Place place) {
-    set_dims(dims);
+    Resize(dims);
    return mutable_data<T>(place);
  }

-  template <typename T>
+  template <typename T,  // must be POD types
+            typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
  T* mutable_data(platform::Place place) {
    PADDLE_ENFORCE(product(dims_) > 0,
                   "Tensor's numel must be larger than zero to call "
@@ -95,11 +97,9 @@ class Tensor {
  }

  template <typename T>
-  void ShareDataFrom(const Tensor& src) {
-    src.CheckDims<T>();
-    holder_ = src.holder_;
-    set_dims(src.dims());
-    offset_ = src.offset_;
+  void ShareDataWith(const Tensor& src) {
+    src.EnforceSufficientMemory<T>();
+    *this = src;
  }

  template <typename T>
@@ -107,9 +107,9 @@ class Tensor {
    PADDLE_ENFORCE(platform::is_cpu_place(src.holder_->place()) &&
                       platform::is_cpu_place(dst_place),
                   "Tensor::CopyFrom only support CPU now.");
-    src.CheckDims<T>();
+    src.EnforceSufficientMemory<T>();
    size_t size = product(src.dims_) * sizeof(T);
-    set_dims(src.dims());
+    Resize(src.dims());
    const void* src_ptr = static_cast<const void*>(src.data<T>());
    void* dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
    memcpy(dst_ptr, src_ptr, size);
@@ -117,34 +117,25 @@ class Tensor {

  template <typename T>
  Tensor Slice(const int& begin_idx, const int& end_idx) const {
-    CheckDims<T>();
-    PADDLE_ENFORCE(begin_idx >= 0 && end_idx <= dims_[0],
-                   "Slice index is less than zero or out of bound.");
+    EnforceSufficientMemory<T>();
+    PADDLE_ENFORCE(begin_idx >= 0, "Slice begin index is less than zero.");
+    PADDLE_ENFORCE(end_idx <= dims_[0], "Slice end index is out of bound.");
    PADDLE_ENFORCE(begin_idx < end_idx,
                   "Begin index must be less than end index.");
    PADDLE_ENFORCE(dims_[0] != 1, "Can not slice a tensor with dims_[0] = 1.");
-    std::vector<int> d = vectorize(dims_);
-    int base = 1;
-    for (size_t i = 1; i < d.size(); ++i) {
-      base *= d[i];
-    }
+    int base = product(dims_) / dims_[0];
    Tensor dst;
    dst.holder_ = holder_;
    DDim dst_dims = dims_;
    dst_dims[0] = end_idx - begin_idx;
-    dst.set_dims(dst_dims);
+    dst.Resize(dst_dims);
    dst.offset_ = offset_ + begin_idx * base * sizeof(T);
    return dst;
  }

-  void set_dims(const DDim& dims) {
-    if (dims == dims_) {
-      return;
-    }
-    dims_ = dims;
-  }
+  void Resize(const DDim& dims) { dims_ = dims; }

-  DDim dims() const { return dims_; }
+  const DDim& dims() const { return dims_; }

 private:
  // Placeholder hides type T, so it doesn't appear as a template
@@ -159,21 +150,9 @@ class Tensor {

  template <typename T, typename PlaceType>
  struct PlaceholderImpl : public Placeholder {
-   private:
-    template <typename PType>
-    class Deleter {
-     public:
-      Deleter(PType place) : place_(place) {}
-      void operator()(T* ptr) { memory::Free(place_, static_cast<void*>(ptr)); }
-
-     private:
-      PType place_;
-    };
-
-   public:
    PlaceholderImpl(PlaceType place, size_t size)
        : ptr_(static_cast<T*>(memory::Alloc(place, size)),
-               Deleter<PlaceType>(place)),
+               memory::PODDeleter<T, PlaceType>(place)),
          place_(place),
          size_(size) {}

@@ -182,13 +161,13 @@ class Tensor {
    virtual paddle::platform::Place place() const { return place_; }
    virtual std::type_index type() const { return std::type_index(typeid(T)); }

-    std::unique_ptr<T, Deleter<PlaceType>> ptr_;
+    std::unique_ptr<T, memory::PODDeleter<T, PlaceType>> ptr_;
    platform::Place place_;  // record the place of ptr_.
    size_t size_;            // size of the memory block.
  };

  template <typename T>
-  inline void CheckDims() const {
+  inline void EnforceSufficientMemory() const {
    PADDLE_ENFORCE(holder_ != nullptr,
                   "Tenosr holds no memory. Call Tensor::mutable_data first.");
    PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_,
@@ -198,7 +177,11 @@ class Tensor {

  std::shared_ptr<Placeholder> holder_;  // holds the memory block if allocated.
  DDim dims_;
-  size_t offset_;  // marks the begin of tensor data area.
+  // A PlaceHolder may be shared by more than one tensor. Some of them may be
+  // slices of the others. So the offset_ is introduced here to indicate the
+  // byte offset between PlaceHolder::ptr_ and where tensor's data really
+  // begins.
+  size_t offset_;
 };

 }  // namespace framework

--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -19,7 +19,7 @@ TEST(Tensor, Dims) {
  using namespace paddle::framework;
  using namespace paddle::platform;
  Tensor tt;
-  tt.set_dims(make_ddim({2, 3, 4}));
+  tt.Resize(make_ddim({2, 3, 4}));
  DDim dims = tt.dims();
  ASSERT_EQ(arity(dims), 3);
  for (int i = 0; i < 3; ++i) {
@@ -97,7 +97,7 @@ TEST(Tensor, MutableData) {
 #endif
 }

-TEST(Tensor, ShareDataFrom) {
+TEST(Tensor, ShareDataWith) {
  using namespace paddle::framework;
  using namespace paddle::platform;
  {
@@ -106,7 +106,7 @@ TEST(Tensor, ShareDataFrom) {
    // Try to share data form uninitialized tensor
    bool caught = false;
    try {
-      dst_tensor.ShareDataFrom<float>(src_tensor);
+      dst_tensor.ShareDataWith<float>(src_tensor);
    } catch (std::runtime_error& err) {
      caught = true;
      std::string msg =
@@ -119,7 +119,7 @@ TEST(Tensor, ShareDataFrom) {
    ASSERT_TRUE(caught);

    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CPUPlace());
-    dst_tensor.ShareDataFrom<int>(src_tensor);
+    dst_tensor.ShareDataWith<int>(src_tensor);
    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
  }

@@ -128,7 +128,7 @@ TEST(Tensor, ShareDataFrom) {
    Tensor src_tensor;
    Tensor dst_tensor;
    src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace());
-    dst_tensor.ShareDataFrom<int>(src_tensor);
+    dst_tensor.ShareDataWith<int>(src_tensor);
    ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
  }
 #endif

--- a/paddle/function/ConvOpTest.cpp
+++ b/paddle/function/ConvOpTest.cpp
@@ -31,13 +31,22 @@ public:
  ConvolutionTest(const std::string& conv1,
                  const std::string& conv2,
                  TestType type,
+                  bool useGroups = true,
                  std::string algo = "auto") {
    for (size_t batchSize : {1, 32}) {
      for (size_t inputSize : {7, 14, 54}) {
        for (size_t filterSize : {1, 3, 5}) {
          for (size_t inputChannels : {3, 64}) {
-            for (size_t outputChannels : {3, 64, 128}) {
-              if (inputChannels < outputChannels) break;
+            for (size_t outputChannels : {3, 64}) {
+              if (inputChannels > outputChannels) break;
+              size_t groups;
+              if (!useGroups) {
+                groups = 1;
+              } else {
+                if (outputChannels % inputChannels != 0) continue;
+                groups = inputChannels;
+              }
+
              for (size_t stride : {1, 2}) {
                for (size_t padding : {0, 1}) {
                  if (padding >= filterSize) break;
@@ -62,13 +71,24 @@ public:
                      FuncConfig()
                          .set("paddings", paddings)
                          .set("strides", strides)
-                          .set("groups", (size_t)1)
+                          .set("groups", groups)
                          .set("algo", algo));

                  TensorShape input{
                      batchSize, inputChannels, inputSize, inputSize};
-                  TensorShape filter{
-                      outputChannels, inputChannels, filterSize, filterSize};
+
+                  TensorShape filter;
+                  if (groups > 1)
+                    filter = TensorShape({groups,
+                                          outputChannels / groups,
+                                          inputChannels / groups,
+                                          filterSize,
+                                          filterSize});
+                  else
+                    filter = TensorShape({outputChannels,
+                                          inputChannels,
+                                          filterSize,
+                                          filterSize});
                  TensorShape output{
                      batchSize, outputChannels, outputSize, outputSize};

@@ -85,7 +105,8 @@ public:
                  } else if (type == kBackwardFilterTest) {
                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
-                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter),
+                                    ADD_TO);
                    test.run();
                  }
                }
@@ -106,6 +127,7 @@ public:
  ConvolutionTest2(const std::string& conv1,
                   const std::string& conv2,
                   TestType type,
+                   bool useGroups = true,
                   std::string algo = "auto") {
    for (size_t batchSize : {16}) {
      for (size_t inputHeight : {7, 31}) {
@@ -113,7 +135,15 @@ public:
          for (size_t filterHeight : {1, 5}) {
            for (size_t filterWidth : {3, 7}) {
              for (size_t inputChannels : {7}) {
-                for (size_t outputChannels : {32}) {
+                for (size_t outputChannels : {7}) {
+                  size_t groups;
+                  if (!useGroups) {
+                    groups = 1;
+                  } else {
+                    if (outputChannels % inputChannels != 0) continue;
+                    groups = inputChannels;
+                  }
+
                  size_t stride = 1;
                  size_t padding = 0;
                  size_t outputHeight =
@@ -141,13 +171,24 @@ public:
                      FuncConfig()
                          .set("paddings", paddings)
                          .set("strides", strides)
-                          .set("groups", (size_t)1)
+                          .set("groups", groups)
                          .set("algo", algo));

                  TensorShape input{
                      batchSize, inputChannels, inputHeight, inputWidth};
-                  TensorShape filter{
-                      outputChannels, inputChannels, filterHeight, filterWidth};
+
+                  TensorShape filter;
+                  if (groups > 1)
+                    filter = TensorShape({groups,
+                                          outputChannels / groups,
+                                          inputChannels / groups,
+                                          filterHeight,
+                                          filterWidth});
+                  else
+                    filter = TensorShape({outputChannels,
+                                          inputChannels,
+                                          filterHeight,
+                                          filterWidth});
                  TensorShape output{
                      batchSize, outputChannels, outputHeight, outputWidth};

@@ -164,7 +205,8 @@ public:
                  } else if (type == kBackwardFilterTest) {
                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
-                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter),
+                                    ADD_TO);
                    test.run();
                  }
                }
@@ -177,34 +219,88 @@ public:
  }
 };

+// ======Start Convolution TEST======
+
 TEST(Forward, GEMM) {
  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test(
-      "NaiveConv-CPU", "GemmConv-CPU", kForwardTest);
+      "NaiveConv-CPU", "GemmConv-CPU", kForwardTest, false);
  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test2(
-      "NaiveConv-CPU", "GemmConv-CPU", kForwardTest);
+      "NaiveConv-CPU", "GemmConv-CPU", kForwardTest, false);
 }

 #ifndef PADDLE_ONLY_CPU
 TEST(Forward, GEMM2) {
  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
-      "GemmConv-CPU", "GemmConv-GPU", kForwardTest);
+      "GemmConv-CPU", "GemmConv-GPU", kForwardTest, false);
  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
-      "GemmConv-CPU", "GemmConv-GPU", kForwardTest);
+      "GemmConv-CPU", "GemmConv-GPU", kForwardTest, false);
 }

 TEST(BackwardInput, GEMM) {
  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
-      "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest);
+      "GemmConvGradInput-CPU",
+      "GemmConvGradInput-GPU",
+      kBackwardInputTest,
+      false);
  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
-      "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest);
+      "GemmConvGradInput-CPU",
+      "GemmConvGradInput-GPU",
+      kBackwardInputTest,
+      false);
 }

 TEST(BackwardFilter, GEMM) {
  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
-      "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest);
+      "GemmConvGradFilter-CPU",
+      "GemmConvGradFilter-GPU",
+      kBackwardFilterTest,
+      false);
  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
-      "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest);
+      "GemmConvGradFilter-CPU",
+      "GemmConvGradFilter-GPU",
+      kBackwardFilterTest,
+      false);
 }
 #endif
+// ======End Convolution TEST======
+
+// ======Start DepthwiseConvolution TEST======
+
+// TODO(zhaolong) The depthwise convolution cpu test will be added when the cpu
+// version of depthwiseConv is implemented.
+
+#ifndef PADDLE_ONLY_CPU
+
+TEST(DepthwiseConvForward, GEMM2) {
+  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
+      "GemmConv-CPU", "DepthwiseConv-GPU", kForwardTest);
+  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
+      "GemmConv-CPU", "DepthwiseConv-GPU", kForwardTest);
+}
+
+TEST(DepthwiseConvBackwardInput, GEMM) {
+  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
+      "GemmConvGradInput-CPU",
+      "DepthwiseConvGradInput-GPU",
+      kBackwardInputTest);
+  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
+      "GemmConvGradInput-CPU",
+      "DepthwiseConvGradInput-GPU",
+      kBackwardInputTest);
+}
+
+TEST(DepthwiseConvBackwardFilter, GEMM) {
+  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
+      "GemmConvGradFilter-CPU",
+      "DepthwiseConvGradFilter-GPU",
+      kBackwardFilterTest);
+  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
+      "GemmConvGradFilter-CPU",
+      "DepthwiseConvGradFilter-GPU",
+      kBackwardFilterTest);
+}
+
+#endif
+// ======End DepthwiseConvolution TEST======

 }  // namespace paddle
--- a/paddle/function/CropOpGpu.cu
+++ b/paddle/function/CropOpGpu.cu
@@ -39,7 +39,8 @@ void Crop<DEVICE_TYPE_GPU>(real* outputs,
                          const TensorShape inShape,
                          const TensorShape outShape,
                          const FuncConfig& conf) {
-  std::vector<uint32_t> crop_corner = conf.get<std::vector<uint32_t>>("crop_corner");
+  std::vector<uint32_t> crop_corner =
+        conf.get<std::vector<uint32_t>>("crop_corner");
  int cropC = crop_corner[1];
  int cropH = crop_corner[2];
  int cropW = crop_corner[3];
@@ -74,7 +75,8 @@ __global__ void KeCropDiff(const real* inGrad, real* outGrad,
    const int c = (idx / inW / inH) % inC;
    const int n = idx / inW / inH / inC;

-    const int off = ((n * outC + c + cropC) * outH + h + cropH) * outW + cropW + w;
+    const int off =
+        ((n * outC + c + cropC) * outH + h + cropH) * outW + cropW + w;

    outGrad[off] += inGrad[idx];
  }
@@ -86,7 +88,8 @@ void CropGrad<DEVICE_TYPE_GPU>(const real* inGrad,
                              const TensorShape inShape,
                              const TensorShape outShape,
                              const FuncConfig& conf) {
-  std::vector<uint32_t> crop_corner = conf.get<std::vector<uint32_t>>("crop_corner");
+  std::vector<uint32_t> crop_corner =
+        conf.get<std::vector<uint32_t>>("crop_corner");
  int cropC = crop_corner[1];
  int cropH = crop_corner[2];
  int cropW = crop_corner[3];

--- a/paddle/function/DepthwiseConvOp.cpp
+++ b/paddle/function/DepthwiseConvOp.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "DepthwiseConvOp.h"
+#include "ConvOp.h"
+#include "GemmFunctor.h"
+
+namespace paddle {
+
+template <class T>
+class DepthwiseConvFunctor<DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(const T* inputData,
+                  const T* filterData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* outputData) {
+    // TODO(zhaolong) : cpu implementation of depthwise convolution
+  }
+};
+
+template <class T>
+class DepthwiseConvGradInputFunctor<DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(const T* outputGrad,
+                  const T* filterData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* inputGrad) {}
+  // TODO(zhaolong) : cpu implementation of depthwise convolution
+};
+
+template <class T>
+class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_CPU, T> {
+public:
+  void operator()(const T* outputGrad,
+                  const T* inputData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* colData,
+                  T* filterGrad) {}
+  // TODO(zhaolong) : cpu implementation of depthwise convolution
+};
+
+/*
+ * \brief Forward calculation of depthwise convolution.
+ */
+template <DeviceType Device>
+class DepthwiseConvFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    check(inputs, outputs);
+
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+    size_t filterMultiplier = outputChannels / groups_;
+    CHECK_EQ(inputChannels, groups_);
+
+    real* inputData = inputs[0].data<real>();
+    real* filterData = inputs[1].data<real>();
+    real* outputData = outputs[0].data<real>();
+
+    DepthwiseConvFunctor<Device, real> depthwiseConv;
+    depthwiseConv(inputData,
+                  filterData,
+                  batchSize,
+                  outputChannels,
+                  outputHeight,
+                  outputWidth,
+                  inputChannels,
+                  inputHeight,
+                  inputWidth,
+                  filterMultiplier,
+                  filterHeight,
+                  filterWidth,
+                  strideH(),
+                  strideW(),
+                  paddingH(),
+                  paddingW(),
+                  outputData);
+  }
+};
+
+/*
+ * \brief Backward input calculation of depthwise convolution.
+ */
+template <DeviceType Device>
+class DepthwiseConvGradInputFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& input = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    check(inputs, outputs);
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& input = outputs[0].shape();
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+    size_t filterMultiplier = outputChannels / groups_;
+    CHECK_EQ(inputChannels, groups_);
+
+    real* outputGrad = inputs[0].data<real>();
+    real* filterData = inputs[1].data<real>();
+    real* inputGrad = outputs[0].data<real>();
+
+    DepthwiseConvGradInputFunctor<Device, real> depthwiseConvGradInput;
+    depthwiseConvGradInput(outputGrad,
+                           filterData,
+                           batchSize,
+                           outputChannels,
+                           outputHeight,
+                           outputWidth,
+                           inputChannels,
+                           inputHeight,
+                           inputWidth,
+                           filterMultiplier,
+                           filterHeight,
+                           filterWidth,
+                           strideH(),
+                           strideW(),
+                           paddingH(),
+                           paddingW(),
+                           inputGrad);
+  }
+};
+
+/*
+ * \brief Backward filter calculation of depthwise convolution.
+ */
+template <DeviceType Device>
+class DepthwiseConvGradFilterFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+  }
+
+  void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& input = inputs[1].shape();
+    const TensorShape& filter = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    check(inputs, outputs);
+    const TensorShape& output = inputs[0].shape();
+    const TensorShape& input = inputs[1].shape();
+    const TensorShape& filter = outputs[0].shape();
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    size_t outputHeight = output[2];
+    size_t outputWidth = output[3];
+    size_t filterMultiplier = outputChannels / groups_;
+    CHECK_EQ(inputChannels, groups_);
+
+    real* outputGrad = inputs[0].data<real>();
+    real* inputData = inputs[1].data<real>();
+    real* filterGrad = outputs[0].data<real>();
+
+    int size = outputChannels * filterHeight * filterWidth * outputHeight *
+               outputWidth;
+    resizeBuffer<Device>(size);
+    real* colData = reinterpret_cast<real*>(memory_->getBuf());
+
+    DepthwiseConvGradFilterFunctor<Device, real> depthwiseConvGradFilter;
+
+    depthwiseConvGradFilter(outputGrad,
+                            inputData,
+                            batchSize,
+                            outputChannels,
+                            outputHeight,
+                            outputWidth,
+                            inputChannels,
+                            inputHeight,
+                            inputWidth,
+                            filterMultiplier,
+                            filterHeight,
+                            filterWidth,
+                            strideH(),
+                            strideW(),
+                            paddingH(),
+                            paddingW(),
+                            colData,
+                            filterGrad);
+  }
+};
+
+REGISTER_TYPED_FUNC(DepthwiseConv, CPU, DepthwiseConvFunction);
+REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
+                    CPU,
+                    DepthwiseConvGradInputFunction);
+REGISTER_TYPED_FUNC(DepthwiseConvGradFilter,
+                    CPU,
+                    DepthwiseConvGradFilterFunction);
+#ifndef PADDLE_ONLY_CPU
+REGISTER_TYPED_FUNC(DepthwiseConv, GPU, DepthwiseConvFunction);
+REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
+                    GPU,
+                    DepthwiseConvGradInputFunction);
+REGISTER_TYPED_FUNC(DepthwiseConvGradFilter,
+                    GPU,
+                    DepthwiseConvGradFilterFunction);
+#endif
+
+}  // namespace paddle
--- a/paddle/function/DepthwiseConvOp.h
+++ b/paddle/function/DepthwiseConvOp.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "TensorType.h"
+
+namespace paddle {
+
+/**
+ *\brief   Depthwise convolution forward. The outputData
+ *         of depthwise convolution is same with ExpandConvLayer
+ *         when groups equals inputChannels in ExpandConvLayer.
+ *
+ * \param[in]   inputData         input data.
+ * \param[in]   filterData        the Paramters of the depthwise conv layer..
+ * \param[in]   batchSize         batch size of input data.
+ * \param[in]   outputChannels    channels of outputData.
+ * \param[in]   outputHeight      height of outputData.
+ * \param[in]   outputWidth       width of outputData.
+ * \param[in]   inputChannels     channels of inputData.
+ * \param[in]   inputHeight       height of inputData.
+ * \param[in]   inputWidth        width of inputData..
+ * \param[in]   filterMultiplier  equals to outputChannels/groups_.
+ * \param[in]   filterHeight      height of filter.
+ * \param[in]   filterWidth       widht of filter.
+ * \param[in]   strideH           stride size in height direction.
+ * \param[in]   strideW           stride size in width direction.
+ * \param[in]   paddingH          padding size in height direction.
+ * \param[in]   paddingW          padding size in width direction.
+ * \param[out]  outputData        outputData.
+ *
+ */
+template <DeviceType Device, class T>
+class DepthwiseConvFunctor {
+public:
+  void operator()(const T* inputData,
+                  const T* filterData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* outputData);
+};
+
+/**
+ *\brief  Functor tot compute the depthwise convolution backprop w.r.t input.
+ *
+ *
+ * \param[in]   outputGradData    the grad data of output.
+ * \param[in]   filterData        the Paramters of the depthwise conv layer..
+ * \param[in]   batchSize         batch size of input data.
+ * \param[in]   outputChannels    channels of outputData.
+ * \param[in]   outputHeight      height of outputData.
+ * \param[in]   outputWidth       width of outputData.
+ * \param[in]   inputChannels     channels of input data.
+ * \param[in]   inputHeight       height of inputData.
+ * \param[in]   inputWidth        width of inputData.
+ * \param[in]   filterMultiplier  equals to outputChannels/groups_.
+ * \param[in]   filterHeight      height of filter.
+ * \param[in]   filterWidth       widht of filter.
+ * \param[in]   strideH           stride size in height direction.
+ * \param[in]   strideW           stride size in width direction.
+ * \param[in]   paddingH          padding size in height direction.
+ * \param[in]   paddingW          padding size in width direction.
+ * \param[out]  inputGrad         the grad data of input.
+ *
+ */
+template <DeviceType Device, class T>
+class DepthwiseConvGradInputFunctor {
+public:
+  void operator()(const T* outputGrad,
+                  const T* filterData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* inputGrad);
+};
+
+/**
+ *\brief  Functor tot compute the depthwise convolution backprop w.r.t filter.
+ *
+ * \param[in]   outputGradData    the grad data of output.
+ * \param[in]   inputData         inputData.
+ * \param[in]   batchSize         batch size of input data.
+ * \param[in]   outputChannels    channels of outputData.
+ * \param[in]   outputHeight      height of outputData.
+ * \param[in]   outputWidth       width of outputData.
+ * \param[in]   inputChannels     channels of input data.
+ * \param[in]   inputHeight       height of inputData.
+ * \param[in]   inputWidth        width of inputData.
+ * \param[in]   filterMultiplier  equals to outputChannels/groups_.
+ * \param[in]   filterHeight      height of filter.
+ * \param[in]   filterWidth       widht of filter.
+ * \param[in]   strideH           stride size in height direction.
+ * \param[in]   strideW           stride size in width direction.
+ * \param[in]   paddingH          padding size in height direction.
+ * \param[in]   paddingW          padding size in width direction.
+ * \param[in]   colData           Auxiliary data when calculating filterGrad.
+ * \param[in]   multiplierData    Auxiliary data when calculating filterGrad.
+ * \param[out]  filterGrad        the grad data of filter.
+ *
+ */
+template <DeviceType Device, class T>
+class DepthwiseConvGradFilterFunctor {
+public:
+  void operator()(const T* outputGrad,
+                  const T* inputData,
+                  int batchSize,
+                  int outputChannels,
+                  int outputHeight,
+                  int outputWidth,
+                  int inputChannels,
+                  int inputHeight,
+                  int inputWidth,
+                  int filterMultiplier,
+                  int filterHeight,
+                  int filterWidth,
+                  int strideH,
+                  int strideW,
+                  int paddingH,
+                  int paddingW,
+                  T* colData,
+                  T* filterGrad);
+};
+
+}  // namespace paddle
--- a/paddle/function/DepthwiseConvOpGpu.cu
+++ b/paddle/function/DepthwiseConvOpGpu.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "DepthwiseConvOp.h"
+#include "GemmFunctor.h"
+#include "paddle/math/BaseMatrix.h"
+
+namespace paddle {
+
+// CUDA kernel to compute the depthwise convolution forward pass
+template <class T>
+__global__
+void ConvolutionDepthwiseForward(const int nthreads,
+    const T* const inputData, const T* const filterData,
+    const int batchSize, const int outputChannels, const int outputHeight,
+    const int outputWidth, const int inputChannels, const int inputHeight,
+    const int inputWidth, const int filterMultiplier, const int filterHeight,
+    const int filterWidth, const int strideH, const int strideW,
+    const int paddingH, const int paddingW, T* const outputData) {
+
+  int index =
+    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+
+  if (index < nthreads) {
+    const int batch = index / outputChannels / outputHeight / outputWidth;
+    const int c_out = (index / outputHeight / outputWidth) % outputChannels;
+    const int h_out = (index / outputWidth) % outputHeight;
+    const int w_out = index % outputWidth;
+
+    const int c_in = c_out / filterMultiplier;
+    const T* weight = filterData + c_out * filterHeight * filterWidth;
+    T value = 0;
+    const int h_in_start = -paddingH + h_out * strideH;
+    const int w_in_start = -paddingW + w_out * strideW;
+    const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1;
+    const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1;
+    if ((h_in_start >= 0) && (h_in_end < inputHeight)
+       && (w_in_start >= 0) && (w_in_end < inputWidth)) {
+        for (int kh = 0; kh < filterHeight; ++kh) {
+            for (int kw = 0; kw < filterWidth; ++kw) {
+                const int h_in = -paddingH + h_out * strideH + kh;
+                const int w_in = -paddingW + w_out * strideW + kw;
+                const int offset = ((batch * inputChannels + c_in)
+                    * inputHeight + h_in) * inputWidth + w_in;
+                value += (*weight) * inputData[offset];
+                ++weight;
+            }
+        }
+    } else {
+        for (int kh = 0; kh < filterHeight; ++kh) {
+            for (int kw = 0; kw < filterWidth; ++kw) {
+                const int h_in = -paddingH + h_out * strideH + kh;
+                const int w_in = -paddingW + w_out * strideW + kw;
+                if ((h_in >= 0) && (h_in < inputHeight)
+                   && (w_in >= 0) && (w_in < inputWidth)) {
+                    const int offset = ((batch * inputChannels + c_in)
+                        * inputHeight + h_in) * inputWidth + w_in;
+                    value += (*weight) * inputData[offset];
+                }
+                ++weight;
+            }
+       }
+    }
+    outputData[index] = value;
+  }
+}
+
+// CUDA kernel to compute the depthwise convolution backprop w.r.t input.
+template <class T>
+__global__
+void ConvolutionDepthwiseInputBackward(const int nthreads,
+    const T* const top_diff, const T* const weight_data,
+    const int num, const int outputChannels, const int outputHeight,
+    const int outputWidth, const int inputChannels, const int inputHeight,
+    const int inputWidth, const int filterMultiplier, const int filterHeight,
+    const int filterWidth, const int strideH, const int strideW,
+    const int paddingH, const int paddingW, T* const bottom_diff) {
+  int index =
+    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < nthreads) {
+    const int batch = index / inputChannels / inputHeight / inputWidth;
+    const int c_in = (index / inputHeight / inputWidth) % inputChannels;
+    const int h_in = (index / inputWidth) % inputHeight;
+    const int w_in = index % inputWidth;
+
+    const int c_out_start = c_in * filterMultiplier;
+
+    int h_out_start = (h_in - filterHeight + paddingH + strideH)/strideH;
+    h_out_start = 0 > h_out_start ? 0 : h_out_start;
+    int h_out_end = (h_in + paddingH)/strideH;
+    h_out_end = outputHeight - 1 < h_out_end? outputHeight - 1 : h_out_end;
+    int w_out_start = (w_in - filterWidth + paddingW + strideW)/strideW;
+    w_out_start = 0 > w_out_start ? 0 : w_out_start;
+    int w_out_end = (w_in + paddingW)/strideW;
+    w_out_end = outputWidth - 1 < w_out_end? outputWidth - 1 : w_out_end;
+
+    T value = 0;
+
+    for (int c_out = c_out_start;
+         c_out < c_out_start + filterMultiplier; c_out ++) {
+        for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) {
+            const int filter_h = h_in + paddingH - h_out * strideH;
+            for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) {
+                const int filter_w = w_in + paddingW - w_out * strideW;
+                const int filter_offset = c_out * filterHeight * filterWidth
+                    + filter_h * filterWidth + filter_w;
+                const int top_diff_offset = ((batch * outputChannels + c_out) *
+                    outputHeight + h_out)* outputWidth + w_out;
+                value += top_diff[top_diff_offset] * weight_data[filter_offset];
+            }
+        }
+    }
+    bottom_diff[index] += value;
+   }
+}
+
+// CUDA kernel to compute the depthwise convolution backprop w.r.t filter.
+template <class T>
+__global__
+void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads,
+    const T* const top_diff, const T* const inputData,
+    const int num, const int outputChannels, const int outputHeight,
+    const int outputWidth, const int inputChannels, const int inputHeight,
+    const int inputWidth, const int filterMultiplier, const int filterHeight,
+    const int filterWidth, const int strideH, const int strideW,
+    const int paddingH, const int paddingW, T* const buffer_data) {
+  int index =
+    (blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
+  if (index < nthreads) {
+    const int h_out = (index / outputWidth) % outputHeight;
+    const int w_out = index % outputWidth;
+    const int kh = (index / filterWidth / outputHeight / outputWidth)
+          % filterHeight;
+    const int kw = (index / outputHeight / outputWidth) % filterWidth;
+    const int h_in = -paddingH + h_out * strideH + kh;
+    const int w_in = -paddingW + w_out * strideW + kw;
+    if ((h_in >= 0) && (h_in < inputHeight)
+          && (w_in >= 0) && (w_in < inputWidth)) {
+      const int c_out = index /
+            (filterHeight * filterWidth * outputHeight * outputWidth);
+      const int c_in = c_out / filterMultiplier;
+      const int batch = num_i;
+      const int top_offset = ((batch * outputChannels + c_out) *
+            outputHeight + h_out) * outputWidth + w_out;
+      const int bottom_offset = ((batch * inputChannels + c_in)
+            * inputHeight + h_in) * inputWidth + w_in;
+      buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset];
+    } else {
+      buffer_data[index] = 0;
+    }
+  }
+}
+
+template <class T>
+class DepthwiseConvFunctor<DEVICE_TYPE_GPU, T>{
+public:
+  void operator()(const T* inputData,
+            const T* filterData,
+            int batchSize,
+            int outputChannels,
+            int outputHeight,
+            int outputWidth,
+            int inputChannels,
+            int inputHeight,
+            int inputWidth,
+            int filterMultiplier,
+            int filterHeight,
+            int filterWidth,
+            int strideH,
+            int strideW,
+            int paddingH,
+            int paddingW,
+            T* outputData){
+    int outputSize = batchSize * outputChannels * outputHeight * outputWidth;
+
+    size_t blocks = (outputSize + 1024 -1) / 1024;
+    size_t blockX = 512;
+    size_t blockY = (blocks+512-1)/512;
+    dim3 threads(1024, 1);
+    dim3 grid(blockX, blockY);
+
+    ConvolutionDepthwiseForward<T>
+        <<< grid, threads, 0, STREAM_DEFAULT >>>(
+            outputSize,
+            inputData,
+            filterData,
+            batchSize,
+            outputChannels,
+            outputHeight,
+            outputWidth,
+            inputChannels,
+            inputHeight,
+            inputWidth,
+            filterMultiplier,
+            filterHeight,
+            filterWidth,
+            strideH,
+            strideW,
+            paddingH,
+            paddingW,
+            outputData);
+    }
+};
+
+template <class T>
+class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, T>{
+public:
+  void operator()(const T* outputGrad,
+            const T* filterData,
+            int batchSize,
+            int outputChannels,
+            int outputHeight,
+            int outputWidth,
+            int inputChannels,
+            int inputHeight,
+            int inputWidth,
+            int filterMultiplier,
+            int filterHeight,
+            int filterWidth,
+            int strideH,
+            int strideW,
+            int paddingH,
+            int paddingW,
+            T* inputGrad){
+    int inputSize = batchSize * inputChannels * inputHeight * inputWidth;
+
+    size_t blocks = (inputSize + 1024 -1) / 1024;
+    size_t blockX = 512;
+    size_t blockY = (blocks+512-1)/512;
+    dim3 threads(1024, 1);
+    dim3 grid(blockX, blockY);
+
+
+    ConvolutionDepthwiseInputBackward<T>
+          // NOLINT_NEXT_LINE(whitespace/operators)
+        <<< grid, threads, 0, STREAM_DEFAULT >>>(
+            inputSize,
+            outputGrad,
+            filterData,
+            batchSize,
+            outputChannels,
+            outputHeight,
+            outputWidth,
+            inputChannels,
+            inputHeight,
+            inputWidth,
+            filterMultiplier,
+            filterHeight,
+            filterWidth,
+            strideH,
+            strideW,
+            paddingH,
+            paddingW,
+            inputGrad);
+    }
+};
+
+template <class T>
+class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, T> {
+public:
+  void operator()(const T* outputGrad,
+                const T* inputData,
+                int batchSize,
+                int outputChannels,
+                int outputHeight,
+                int outputWidth,
+                int inputChannels,
+                int inputHeight,
+                int inputWidth,
+                int filterMultiplier,
+                int filterHeight,
+                int filterWidth,
+                int strideH,
+                int strideW,
+                int paddingH,
+                int paddingW,
+                T* colData,
+                T* filterGrad){
+        int colDataSize = outputChannels * filterHeight * filterWidth
+            * outputHeight * outputWidth;
+
+        size_t blocks = (colDataSize + 1024 -1) / 1024;
+        size_t blockX = 512;
+        size_t blockY = (blocks+512-1)/512;
+        dim3 threads(1024, 1);
+        dim3 grid(blockX, blockY);
+        BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth,
+            1, filterGrad, false, true);
+
+        for (int i = 0; i < batchSize; i++) {
+            ConvolutionDepthwiseFilterBackward<T>
+                <<< grid, threads, 0, STREAM_DEFAULT >>>(
+                    i,
+                    colDataSize,
+                    outputGrad,
+                    inputData,
+                    batchSize,
+                    outputChannels,
+                    outputHeight,
+                    outputWidth,
+                    inputChannels,
+                    inputHeight,
+                    inputWidth,
+                    filterMultiplier,
+                    filterHeight,
+                    filterWidth,
+                    strideH,
+                    strideW,
+                    paddingH,
+                    paddingW,
+                    colData);
+            int K = outputHeight * outputWidth;
+            int M = colDataSize / K;
+
+            BaseMatrix colMatrix(M, K, colData, false, true);
+            filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0);
+        }
+    }
+};
+
+#ifdef PADDLE_TYPE_DOUBLE
+template class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, double>;
+template class DepthwiseConvFunctor<DEVICE_TYPE_GPU, double>;
+template class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, double>;
+#else
+template class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, float>;
+template class DepthwiseConvFunctor<DEVICE_TYPE_GPU, float>;
+template class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, float>;
+#endif
+
+}  // namespace paddle
--- a/paddle/gserver/layers/ExpandConvLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvLayer.cpp
@@ -38,10 +38,25 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
  inputShape_.resize(numInputs);
  filterShape_.resize(numInputs);
  outputShape_.resize(numInputs);
+
+  std::string convType;
+  std::string convGradInputType;
+  std::string convGradFilterType;
+
  for (int i = 0; i < config_.inputs_size(); i++) {
    std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
    std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};

+    if (useGpu_ && (size_t)groups_[i] == (size_t)channels_[i] && !isDeconv_) {
+      convType = "DepthwiseConv";
+      convGradInputType = "DepthwiseConvGradInput";
+      convGradFilterType = "DepthwiseConvGradFilter";
+    } else {
+      convType = "GemmConv";
+      convGradInputType = "GemmConvGradInput";
+      convGradFilterType = "GemmConvGradFilter";
+    }
+
    if (FLAGS_use_nnpack) {
      CHECK_EQ(isDeconv_, false);
      createFunction(forward_,
@@ -53,21 +68,21 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
                         .set("algo", std::string("auto")));
    } else {
      createFunction(forward_,
-                     !isDeconv_ ? "GemmConv" : "GemmConvGradInput",
+                     !isDeconv_ ? convType : convGradInputType,
                     FuncConfig()
                         .set("paddings", paddings)
                         .set("strides", strides)
                         .set("groups", (size_t)groups_[i]));

      createFunction(backward_,
-                     !isDeconv_ ? "GemmConvGradInput" : "GemmConv",
+                     !isDeconv_ ? convGradInputType : convType,
                     FuncConfig()
                         .set("paddings", paddings)
                         .set("strides", strides)
                         .set("groups", (size_t)groups_[i]));

      createFunction(backward_,
-                     "GemmConvGradFilter",
+                     convGradFilterType,
                     FuncConfig()
                         .set("paddings", paddings)
                         .set("strides", strides)

--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -347,6 +347,55 @@ TEST(Layer, CosSimVecMatLayer) {
  }
 }

+void testDepthwiseConvLayer(const string& type, bool useGpu) {
+  TestConfig config;
+  config.biasSize = 32;
+  config.layerConfig.set_type(type);
+  config.layerConfig.set_num_filters(32);
+  config.layerConfig.set_partial_sum(1);
+  config.layerConfig.set_shared_biases(true);
+
+  config.inputDefs.push_back({INPUT_DATA, "layer_0", 2048, 192});
+  LayerInputConfig* input = config.layerConfig.add_inputs();
+  ConvConfig* conv = input->mutable_conv_conf();
+  conv->set_filter_size(2);
+  conv->set_filter_size_y(3);
+  conv->set_channels(16);
+  conv->set_padding(0);
+  conv->set_padding_y(1);
+  conv->set_stride(2);
+  conv->set_stride_y(2);
+  conv->set_groups(16);
+  conv->set_filter_channels(conv->channels() / conv->groups());
+  conv->set_img_size(16);
+  conv->set_img_size_y(8);
+  conv->set_output_x(outputSize(conv->img_size(),
+                                conv->filter_size(),
+                                conv->padding(),
+                                conv->stride(),
+                                /* caffeMode */ true));
+  conv->set_output_y(outputSize(conv->img_size_y(),
+                                conv->filter_size_y(),
+                                conv->padding_y(),
+                                conv->stride_y(),
+                                /* caffeMode */ true));
+  config.layerConfig.set_size(conv->output_x() * conv->output_y() *
+                              config.layerConfig.num_filters());
+
+  testLayerGrad(config, "depthwise_conv", 100, false, useGpu);
+  // Use small batch_size and useWeight=true to test biasGrad
+  testLayerGrad(config, "depthwise_conv", 2, false, useGpu, true, 0.02);
+}
+
+TEST(Layer, depthwiseConvLayer) {
+  //  'depthwise_conv' is a sepecial case of 'exconv' whose
+  //  groups size equals to the input channels size.
+  testDepthwiseConvLayer("exconv", /* useGpu= */ false);
+#ifndef PADDLE_ONLY_CPU
+  testDepthwiseConvLayer("exconv", /* useGpu= */ true);
+#endif
+}
+
 void testConvLayer(const string& type, bool trans, bool useGpu) {
  TestConfig config;
  config.biasSize = 16;

--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -202,7 +202,7 @@ double dotProduct<double>(const int n, const double* x, const double* y) {
  return cblas_ddot(n, x, 1, y, 1);
 }

-#ifdef PADDLE_USE_MKL
+#if defined(PADDLE_USE_MKL) || defined(PADDLE_USE_MKLML)

 template <>
 void vExp<float>(const int n, const float* a, float* r) {
@@ -243,7 +243,55 @@ template <>
 void vAdd<double>(const int n, const double* a, const double* b, double* r) {
  vdAdd(n, a, b, r);
 }
+#else
+
+DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a));
+template <class T>
+void vExp(const int n, const T* a, T* r) {
+  hl_cpu_apply_binary_op<T, binary::vExp<T>, 0, 0>(
+      binary::vExp<T>(), const_cast<T*>(a), r, 1, n, n, n);
+}
+
+DEFINE_MATRIX_BINARY_OP(vLog, b = std::log(a));
+template <class T>
+void vLog(const int n, const T* a, T* r) {
+  hl_cpu_apply_binary_op<T, binary::vLog<T>, 0, 0>(
+      binary::vLog<T>(), const_cast<T*>(a), r, 1, n, n, n);
+}
+
+DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p));
+template <class T>
+void vPow(const int n, const T* a, const T b, T* r) {
+  hl_cpu_apply_binary_op<T, binary::vPow<T>, 0, 0>(
+      binary::vPow<T>(b), const_cast<T*>(a), r, 1, n, n, n);
+}
+
+DEFINE_MATRIX_TERNARY_OP(vAdd, c = a + b);
+template <class T>
+void vAdd(const int n, const T* a, const T* b, T* r) {
+  hl_cpu_apply_ternary_op<T, ternary::vAdd<T>, 0, 0>(ternary::vAdd<T>(),
+                                                     const_cast<T*>(a),
+                                                     const_cast<T*>(b),
+                                                     r,
+                                                     1,
+                                                     n,
+                                                     n,
+                                                     n,
+                                                     n);
+}
+
+template void vExp(const int n, const float* a, float* r);
+template void vExp(const int n, const double* a, double* r);
+template void vLog(const int n, const float* a, float* r);
+template void vLog(const int n, const double* a, double* r);
+template void vPow(const int n, const float* a, const float b, float* r);
+template void vPow(const int n, const double* a, const double b, double* r);
+template void vAdd(const int n, const float* a, const float* b, float* r);
+template void vAdd(const int n, const double* a, const double* b, double* r);

+#endif
+
+#ifdef PADDLE_USE_MKL
 template <>
 void vInvSqrt<float>(const int n, const float* a, float* r) {
  vsInvSqrt(n, a, r);
@@ -275,20 +323,6 @@ void vTanh<double>(const int n, const double* a, double* r) {
 }
 #else

-DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a));
-template <class T>
-void vExp(const int n, const T* a, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vExp<T>, 0, 0>(
-      binary::vExp<T>(), const_cast<T*>(a), r, 1, n, n, n);
-}
-
-DEFINE_MATRIX_BINARY_OP(vLog, b = std::log(a));
-template <class T>
-void vLog(const int n, const T* a, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vLog<T>, 0, 0>(
-      binary::vLog<T>(), const_cast<T*>(a), r, 1, n, n, n);
-}
-
 DEFINE_MATRIX_BINARY_OP(vInvSqrt, b = 1.0f / std::sqrt(a));
 template <class T>
 void vInvSqrt(const int n, const T* a, T* r) {
@@ -312,41 +346,12 @@ void vTanh(const int n, const T* a, T* r) {
      binary::vTanh<T>(), const_cast<T*>(a), r, 1, n, n, n);
 }

-DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p));
-template <class T>
-void vPow(const int n, const T* a, const T b, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vPow<T>, 0, 0>(
-      binary::vPow<T>(b), const_cast<T*>(a), r, 1, n, n, n);
-}
-
-DEFINE_MATRIX_TERNARY_OP(vAdd, c = a + b);
-template <class T>
-void vAdd(const int n, const T* a, const T* b, T* r) {
-  hl_cpu_apply_ternary_op<T, ternary::vAdd<T>, 0, 0>(ternary::vAdd<T>(),
-                                                     const_cast<T*>(a),
-                                                     const_cast<T*>(b),
-                                                     r,
-                                                     1,
-                                                     n,
-                                                     n,
-                                                     n,
-                                                     n);
-}
-
-template void vExp(const int n, const float* a, float* r);
-template void vExp(const int n, const double* a, double* r);
-template void vLog(const int n, const float* a, float* r);
-template void vLog(const int n, const double* a, double* r);
 template void vInvSqrt(const int n, const double* a, double* r);
 template void vInvSqrt(const int n, const float* a, float* r);
 template void vLog1p(const int n, const float* a, float* r);
 template void vLog1p(const int n, const double* a, double* r);
 template void vTanh(const int n, const float* a, float* r);
 template void vTanh(const int n, const double* a, double* r);
-template void vPow(const int n, const float* a, const float b, float* r);
-template void vPow(const int n, const double* a, const double b, double* r);
-template void vAdd(const int n, const float* a, const float* b, float* r);
-template void vAdd(const int n, const double* a, const double* b, double* r);

 #endif


--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
@@ -15,6 +15,12 @@ limitations under the License. */
 #ifndef MATHFUNCTIONS_H_
 #define MATHFUNCTIONS_H_

+#ifdef PADDLE_USE_MKLML
+#include <mkl_cblas.h>
+#include <mkl_lapacke.h>
+#include <mkl_vml_functions.h>
+#endif
+
 #ifdef PADDLE_USE_MKL
 #include <mkl.h>
 #include <mkl_lapacke.h>

--- a/paddle/memory/CMakeLists.txt
+++ b/paddle/memory/CMakeLists.txt
 add_subdirectory(detail)

 cc_library(memory SRCS memory.cc)
+cc_library(memcpy SRCS memcpy.cc DEPS device_context)

 cc_library(paddle_memory
    DEPS
-    memory meta_data
-    meta_cache memory_block
-    buddy_allocator system_allocator)
+    memory
+    memcpy
+    meta_data
+    meta_cache
+    memory_block
+    buddy_allocator
+    system_allocator)

 cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory)
--- a/paddle/memory/README.md
+++ b/paddle/memory/README.md
-## Design
+# Region-based Heterogeneous Memory Management

-### Usage
-
-To allocate 4KB CPU memory:
-
-```cpp
-p = memory::Alloc(platform::CPUPlace(), 4*1024);
-```
-
-To allocate 4KB memory on the 3rd GPU:
-
-```cpp
-p = memory::Alloc(platform::GPUPlace(2), 4*1024);
-```
-
-To free memory and check the so-far used amount of memory on a place:
-
-```cpp
-auto pl = platform::GPUPlace(0);
-p = memory::Alloc(pl, 4*1024);
-cout << memory::Used(pl);
-memory::Free(pl, p);
-```
-
-### API
-
-In `paddle/memory/memory.h` we have:
-
-```cpp
-namespace memory {
-template <typename Place> void* Alloc(Place, size_t);
-template <typename Place> void Free(Place, void*);
-template <typename Place> size_t Used(Place);
-}  // namespace memory
-```
-
-These function templates have specializations on either `platform::CPUPlace` or `platform::GPUPlace`:
-
-```cpp
-template<>
-void* Alloc<CPUPlace>(CPUPlace p, size_t size) {
-  return GetCPUBuddyAllocator()->Alloc(size);
-}
-```
-
-and 
-
-```cpp
-template<>
-void Alloc<GPUPlace>(GPUPlace p, size_t size) {
-  return GetGPUBuddyAllocator(p.id)->Alloc(size);
-}
-```
-
-Similar specializations exist for `Free` and `Used`.
-
-### Implementation
-
-`GetCPUBuddyAllocator` and `GetGPUBuddyAllocator` are singletions.
-
-```cpp
-BuddyAllocator* GetCPUBuddyAllocator() {
-  static BuddyAllocator* a = NULL;
-  if (a == NULL) {
-    a = new BuddyAllocator(new CPUAllocator /*backup allocator*/, ...);
-  }
-  return a;
-}
-
-BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
-  static BuddyAllocator* as = NULL;
-  if (as == NULL) {
-    as = new BuddyAllocator*[platform::NumGPUs()];
-    for (int gpu = 0; gpu < platform::NumGPUs(); gpu++) {
-      as[gpu] = new BuddyAllocator(new GPUAllocator(gpu) /* backup allocator */, ...);
-    }
-  }
-  return as[gpu_id);
-```
-
-#### `BuddyAllocator`
-
-`BuddyAllocator` implements the buddy allocation algorithm.  Its constructor takes parameters only related with the algorithm:
-
-```cpp
-BuddyAllocator::BuddyAllocator(initial_pool_size, max_pool_size) {
-  ...
-}
-```
-
-Please be aware that **`BuddyAllocator` always allocate aligned memory**, aligned on 32-bytes, which can hold a `BuddyAllocator::Block` object:
-
-```cpp
-class BuddyAllocator {
- private:
-  struct Block {
-    size_t size;
-    Block* left, right;
-    size_t index; // allocator id
-  };
-  ...
-};
-```
-
-Because BuddyAllocator has the meta-data of each block, it can trace the used memory -- record the amount returned by `Alloc` freed in `Free`.  Instead, `CPUAllocator` and `GPUAllocator` doesn't know the size of freed memory block and cannot do the trace.
-
-#### System Allocators
-
-The `GPUAllocator` and `CPUAllocator` are calls *system allocators*.  They work as the fallback allocators of `BuddyAllocator`.
-
-## Justification
-
-I got inspiration from Majel and Caffe2, though above design look different from both.
-
-### Caffe2
-
-In Caffe2, `Tensor<Context>::mutable_data()` allocates the memroy.  In particular, [`Tensor<Context>::mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L523) calls [`Tensor<Context>::raw_mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L459), which in turn calls [`Context::New`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L479).
-
-There are two implementations of `Context`:
-
-1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory.
-
-1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202).  This looks very similar to class `majel::GPUPlace`, who also has an `int id_` data member.   `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.
-
-### Majel
-
-In Majel, there are basically two allocator types:
-
-1. `cpu::SystemAllocator`, which has similar functionality to `caffe2::CPUContext::New/Delete`.
-1. `gpu::SystemAllocator`, which has similar functionality to `caffe2::CUDAContext::New/Delete`.
-
-However, memory allocation is not via these two allocators.  Instead, these two allocators are defined in hidden namespaces.
-
-In Majel there are hidden global variables like:
-
-1. `cpu::SystemAllocator g_cpu_allocator`, and
-1. `vector<gpu::SystemAllocator*> g_gpu_allocators(NUM_GPUS)`.
-
-Programs allocate memory via a BuddyAllocator, which can take the `g_cpu_allocator` or a `g_gpu_allocators[gpu_id]` as its *fallback allocator*, so that if BuddyAllocator cannot find a block in its memory pool, it extends its memory pool by calling the fallback allocator's `New(size_t)`.
+Please check out the [design documentation](http://gangliao.me) to find out more details about
+buddy memory allocator for both CPU and GPU.
--- a/paddle/memory/memcpy.cc
+++ b/paddle/memory/memcpy.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/memory/memcpy.h"
+
+#include <cstring>  // for memcpy
+
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace memory {
+
+template <>
+void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
+                                                  platform::CPUPlace,
+                                                  const void* src, size_t num) {
+  std::memcpy(dst, src, num);
+}
+
+#ifndef PADDLE_ONLY_CPU
+template <>
+void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::GPUPlace src_place,
+                                                  const void* src, size_t num,
+                                                  cudaStream_t stream) {
+  platform::GPUPlaceGuard g(src_place.device);
+  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
+}
+
+template <>
+void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::CPUPlace src_place,
+                                                  const void* src, size_t num,
+                                                  cudaStream_t stream) {
+  platform::GPUPlaceGuard g(dst_place.device);
+  platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
+}
+
+template <>
+void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
+                                                  void* dst,
+                                                  platform::GPUPlace src_place,
+                                                  const void* src, size_t num,
+                                                  cudaStream_t stream) {
+  if (dst_place == src_place) {
+    platform::GPUPlaceGuard g(src_place.device);
+    platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
+  } else {
+    platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num,
+                            stream);
+  }
+}
+
+#endif  // PADDLE_ONLY_CPU
+
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/memory/memcpy.h
+++ b/paddle/memory/memcpy.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/platform/gpu_info.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace memory {
+
+template <typename DstPlace, typename SrcPlace>
+void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
+
+#ifndef PADDLE_ONLY_CPU
+template <typename DstPlace, typename SrcPlace>
+void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
+          cudaStream_t stream);
+#endif  // PADDLE_ONLY_CPU
+
+}  // namespace memory
+}  // namespace paddle
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -15,7 +15,8 @@ limitations under the License. */
 #include "paddle/memory/memory.h"
 #include "paddle/memory/detail/buddy_allocator.h"
 #include "paddle/memory/detail/system_allocator.h"
-#include "paddle/platform/assert.h"
+
+#include <cstring>  // for memcpy

 namespace paddle {
 namespace memory {

--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -14,19 +14,32 @@ limitations under the License. */

 #pragma once

+#include "paddle/platform/gpu_info.h"
 #include "paddle/platform/place.h"

 namespace paddle {
 namespace memory {

-template <class Place>
+template <typename Place>
 void* Alloc(Place, size_t);

-template <class Place>
+template <typename Place>
 void Free(Place, void*);

-template <class Place>
+template <typename Place>
 size_t Used(Place);

+template <typename T, /* must be POD types */
+          typename Place /* platform::GPUPlace or platform::CPUPlace */,
+          typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
+class PODDeleter {
+ public:
+  PODDeleter(Place place) : place_(place) {}
+  void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
+
+ private:
+  Place place_;
+};
+
 }  // namespace memory
 }  // namespace paddle
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -48,6 +48,7 @@ op_library(mul_op SRCS mul_op.cc mul_op.cu)
 op_library(rowwise_add_op SRCS rowwise_add_op.cu rowwise_add_op.cc)
 op_library(sigmoid_op SRCS sigmoid_op.cu sigmoid_op.cc)
 op_library(softmax_op SRCS softmax_op.cc softmax_op.cu)
+op_library(cross_entropy_op SRCS cross_entropy_op.cc cross_entropy_op.cu)

 op_library(fc_op SRCS fc_op.cc DEPS mul_op rowwise_add_op sigmoid_op
        softmax_op net)

--- a/paddle/operators/add_op.cc
+++ b/paddle/operators/add_op.cc
@@ -31,7 +31,7 @@ protected:
        "Inputs/Outputs of AddOp must all be set");
    PADDLE_ENFORCE(inputs[0]->dims() == inputs[1]->dims(),
                   "Two input of Add Op's dimension must be same.");
-    outputs[0]->set_dims(inputs[0]->dims());
+    outputs[0]->Resize(inputs[0]->dims());
  }
 };

@@ -66,8 +66,5 @@ protected:

 REGISTER_OP(add_two, paddle::operators::AddOp, paddle::operators::AddOpMaker);
 REGISTER_GRADIENT_OP(add_two, paddle::operators::AddOpGrad);
-
-typedef paddle::operators::AddKernel<::paddle::platform::CPUPlace, float>
-    AddKernel_CPU_float;
-REGISTER_OP_CPU_KERNEL(add_two, AddKernel_CPU_float);
-// REGISTER_OP_CPU_KERNEL(add_two, AddKernel_CPU_float);
+REGISTER_OP_CPU_KERNEL(
+    add_two, paddle::operators::AddKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/add_op.cu
+++ b/paddle/operators/add_op.cu
 #include "paddle/operators/add_op.h"
 #include "paddle/framework/op_registry.h"

-typedef paddle::operators::AddKernel<::paddle::platform::GPUPlace, float> AddKernel_GPU_float;
 REGISTER_OP_GPU_KERNEL(add_two,
-                       AddKernel_GPU_float);
\ No newline at end of file
+                       paddle::operators::AddKernel<paddle::platform::GPUPlace, float>);
\ No newline at end of file
--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/cross_entropy_op.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/tensor.h"
+
+namespace paddle {
+namespace operators {
+
+class OnehotCrossEntropyOp : public framework::OperatorWithKernel {
+protected:
+  void InferShape(
+      const std::vector<const framework::Tensor *> &inputs,
+      const std::vector<framework::Tensor *> &outputs) const override {
+    PADDLE_ENFORCE(inputs.size() == 2,
+                   "Input size of OnehotCrossEntropyOp must be two");
+    PADDLE_ENFORCE(outputs.size() == 1,
+                   "Output size of OnehotCrossEntropyOp must be one");
+    PADDLE_ENFORCE(inputs[0] != nullptr && inputs[1] != nullptr,
+                   "Inputs of OnehotCrossEntropyOp must all be set");
+    PADDLE_ENFORCE(outputs[0] != nullptr,
+                   "Outputs of OnehotCrossEntropyOp must all be set");
+    PADDLE_ENFORCE(inputs[0]->dims().size() == 2, "X's dimension must be 2.");
+    PADDLE_ENFORCE(outputs[0]->dims().size() == 1,
+                   "label's dimension must be 1.");
+    outputs[0]->Resize(framework::make_ddim({inputs[0]->dims()[0]}));
+  }
+};
+
+class OnehotCrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
+public:
+  OnehotCrossEntropyOpMaker(framework::OpProto *proto,
+                            framework::OpAttrChecker *op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X", "The first input of OnehotCrossEntropyOp");
+    AddInput("label", "The second input of OnehotCrossEntropyOp");
+    AddOutput("Y", "The output of OnehotCrossEntropyOp");
+    AddComment(R"DOC(
+OnehotCrossEntropy Operator.
+
+                Y[i] = -log(X[i][j])
+
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP(onehot_cross_entropy,
+            paddle::operators::OnehotCrossEntropyOp,
+            paddle::operators::OnehotCrossEntropyOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    onehot_cross_entropy,
+    paddle::operators::OnehotCrossEntropyOpKernel<::paddle::platform::CPUPlace,
+                                                  float>);
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
+#include "paddle/operators/cross_entropy_op.h"
+#include "paddle/framework/op_registry.h"
+
+REGISTER_OP_GPU_KERNEL(onehot_cross_entropy,
+                       paddle::operators::OnehotCrossEntropyOpKernel<
+                            ::paddle::platform::GPUPlace, float>);
\ No newline at end of file
--- a/paddle/operators/cross_entropy_op.h
+++ b/paddle/operators/cross_entropy_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include "glog/logging.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename Place, typename T>
+class OnehotCrossEntropyOpKernel : public framework::OpKernel {
+public:
+  constexpr T LOG_THRESHOLD() const { return static_cast<T>(1e-20); }
+
+  void Compute(const framework::KernelContext& context) const override {
+    auto X = context.Input(0)->Get<framework::Tensor>();
+    const T* X_data = X.data<T>();
+    const int* label_data =
+        context.Input(1)->Get<framework::Tensor>().data<int>();
+    auto* Y = context.Output(0)->GetMutable<framework::Tensor>();
+
+    Y->mutable_data<T>(context.GetPlace());
+
+    T* Y_data = Y->data<T>();
+
+    int batch_size = X.dims()[0];
+    int class_num = X.dims()[1];
+
+    // Y[i] = -log(X[i][j])
+    for (int i = 0; i < batch_size; ++i) {
+      Y_data[i] = -std::log(
+          std::max(X_data[i * class_num + label_data[i]], LOG_THRESHOLD()));
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -12,9 +12,9 @@
   See the License for the specific language governing permissions and
   limitations under the License. */

-#include <paddle/framework/op_registry.h>
-#include <paddle/framework/tensor.h>
-#include <paddle/operators/mul_op.h>
+#include "paddle/operators/mul_op.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/tensor.h"

 namespace paddle {
 namespace operators {
@@ -33,7 +33,7 @@ protected:
        dim0[1] == dim1[0],
        "First matrix's width must be equal with second matrix's height.");
    PADDLE_ENFORCE(outputs.size() == 1, "The mul op must take one output");
-    outputs[0]->set_dims({dim0[0], dim1[1]});
+    outputs[0]->Resize({dim0[0], dim1[1]});
  }
 };

@@ -70,4 +70,4 @@ REGISTER_OP(mul, paddle::operators::MulOp, paddle::operators::MulOpMaker);
 REGISTER_GRADIENT_OP(mul, paddle::operators::MulOpGrad);

 REGISTER_OP_CPU_KERNEL(
-    mul, paddle::operators::MulKernel<paddle::platform::CPUPlace>);
+    mul, paddle::operators::MulKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/mul_op.cu
+++ b/paddle/operators/mul_op.cu
@@ -12,9 +12,9 @@
   See the License for the specific language governing permissions and
   limitations under the License. */

-#include <paddle/operators/mul_op.h>
-#include <paddle/framework/op_registry.h>
+#include "paddle/operators/mul_op.h"
+#include "paddle/framework/op_registry.h"

 REGISTER_OP_GPU_KERNEL(mul,
                       paddle::operators::MulKernel<paddle::platform
-                       ::GPUPlace>);
\ No newline at end of file
+                       ::GPUPlace, float>);
\ No newline at end of file
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
@@ -14,17 +14,30 @@

 #pragma once

-#include <glog/logging.h>
-#include <paddle/framework/operator.h>
+#include "glog/logging.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/operator.h"

 namespace paddle {
 namespace operators {

-template <typename Place>
+template <typename Place, typename T>
 class MulKernel : public framework::OpKernel {
 public:
-  void Compute(const framework::KernelContext &context) const override {
-    LOG(INFO) << "Mul kernel in " << typeid(Place).name();
+  void Compute(const framework::KernelContext& context) const override {
+    Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair = {
+        {Eigen::IndexPair<Eigen::DenseIndex>(1, 0)}};
+
+    auto input0 = context.Input(0)->Get<framework::Tensor>();
+    auto input1 = context.Input(1)->Get<framework::Tensor>();
+    auto* output = context.Output(0)->GetMutable<framework::Tensor>();
+
+    output->mutable_data<T>(context.GetPlace());
+
+    framework::EigenMatrix<T>::From(*output).device(
+        *(context.GetEigenDevice<Place>())) =
+        framework::EigenMatrix<T>::From(input0).contract(
+            framework::EigenMatrix<T>::From(input1), dim_pair);
  }
 };
 }  // namespace operators

--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
@@ -12,8 +12,8 @@
   See the License for the specific language governing permissions and
   limitations under the License. */

-#include <paddle/framework/op_registry.h>
-#include <paddle/operators/rowwise_add_op.h>
+#include "paddle/operators/rowwise_add_op.h"
+#include "paddle/framework/op_registry.h"
 namespace paddle {
 namespace operators {

@@ -30,7 +30,7 @@ protected:
    PADDLE_ENFORCE(dim1.size() == 1, "The second input must be vector");
    PADDLE_ENFORCE(dim0[1] == dim1[0], "The width of two input must be same");
    PADDLE_ENFORCE(outputs.size() == 1, "The output size must be 1");
-    outputs[0]->set_dims(inputs[0]->dims());
+    outputs[0]->Resize(inputs[0]->dims());
  }
 };

@@ -58,4 +58,4 @@ REGISTER_OP(rowwise_add,
            paddle::operators::RowWiseAddOpMaker);
 REGISTER_OP_CPU_KERNEL(
    rowwise_add,
-    paddle::operators::RowWiseAddKernel<paddle::platform::CPUPlace>);
+    paddle::operators::RowWiseAddKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
-#include <paddle/framework/op_registry.h>
-#include <paddle/operators/rowwise_add_op.h>
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/rowwise_add_op.h"

 REGISTER_OP_GPU_KERNEL(
    rowwise_add,
-    paddle::operators::RowWiseAddKernel<paddle::platform ::GPUPlace>);
+    paddle::operators::RowWiseAddKernel<paddle::platform ::GPUPlace, float>);
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
@@ -13,17 +13,32 @@
   limitations under the License. */

 #pragma once
-#include <glog/logging.h>
-#include <paddle/framework/operator.h>
+#include "glog/logging.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/operator.h"

 namespace paddle {
 namespace operators {

-template <typename Place>
+template <typename Place, typename T>
 class RowWiseAddKernel : public framework::OpKernel {
 public:
-  void Compute(const framework::KernelContext &context) const override {
-    LOG(INFO) << "RowWiseAdd kernel in " << typeid(Place).name();
+  void Compute(const framework::KernelContext& context) const override {
+    auto in0 = context.Input(0)->Get<framework::Tensor>();
+    auto in1 = context.Input(1)->Get<framework::Tensor>();
+    auto* out = context.Output(0)->GetMutable<framework::Tensor>();
+    out->mutable_data<T>(context.GetPlace());
+
+    auto input = framework::EigenMatrix<T>::From(in0);
+    auto bias = framework::EigenVector<T>::From(in1);
+    auto output = framework::EigenMatrix<T>::From(*out);
+
+    const int bias_size = bias.dimension(0);
+    const int rest_size = input.size() / bias_size;
+    Eigen::DSizes<int, 1> one_d(input.size());
+    Eigen::DSizes<int, 1> bcast(rest_size);
+    output.reshape(one_d).device(*(context.GetEigenDevice<Place>())) =
+        input.reshape(one_d) + bias.broadcast(bcast).reshape(one_d);
  }
 };


--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
@@ -31,7 +31,7 @@ protected:
    PADDLE_ENFORCE(outputs[0] != nullptr, "outputs[0] mast be set");
    PADDLE_ENFORCE(inputs[0]->dims() == inputs[1]->dims(),
                   "Two input of SGD Op's dimension must be same.");
-    outputs[0]->set_dims(inputs[0]->dims());
+    outputs[0]->Resize(inputs[0]->dims());
  }
 };


--- a/paddle/operators/sigmoid_op.cc
+++ b/paddle/operators/sigmoid_op.cc
@@ -12,8 +12,8 @@
   See the License for the specific language governing permissions and
   limitations under the License. */

-#include <paddle/framework/op_registry.h>
-#include <paddle/operators/sigmoid_op.h>
+#include "paddle/operators/sigmoid_op.h"
+#include "paddle/framework/op_registry.h"
 namespace paddle {
 namespace operators {

@@ -24,7 +24,7 @@ protected:
      const std::vector<framework::Tensor *> &outputs) const override {
    PADDLE_ENFORCE(inputs.size() == 1, "Sigmoid Op only have one input");
    PADDLE_ENFORCE(outputs.size() == 1, "Sigmoid Op only have one output");
-    outputs[0]->set_dims(inputs[0]->dims());
+    outputs[0]->Resize(inputs[0]->dims());
  }
 };

@@ -34,7 +34,7 @@ public:
                 framework::OpAttrChecker *op_checker)
      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "sigmoid input");
-    AddInput("Y", "sigmoid output");
+    AddOutput("Y", "sigmoid output");
    AddComment("Sigmoid function");
  }
 };
@@ -59,4 +59,5 @@ REGISTER_OP(sigmoid,
 REGISTER_GRADIENT_OP(sigmoid, paddle::operators::SigmoidOpGrad);

 REGISTER_OP_CPU_KERNEL(
-    sigmoid, paddle::operators::SigmoidKernel<paddle::platform::CPUPlace>);
+    sigmoid,
+    paddle::operators::SigmoidKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/sigmoid_op.cu
+++ b/paddle/operators/sigmoid_op.cu
-#include <paddle/operators/sigmoid_op.h>
-#include <paddle/framework/op_registry.h>
+#include "paddle/operators/sigmoid_op.h"
+#include "paddle/framework/op_registry.h"

 REGISTER_OP_GPU_KERNEL(
-    sigmoid, paddle::operators::SigmoidKernel<paddle::platform::GPUPlace>);
+    sigmoid, paddle::operators::SigmoidKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/sigmoid_op.h
+++ b/paddle/operators/sigmoid_op.h
@@ -14,17 +14,25 @@

 #pragma once

-#include <glog/logging.h>
-#include <paddle/framework/operator.h>
+#include "glog/logging.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/operator.h"

 namespace paddle {
 namespace operators {

-template <typename Place>
+template <typename Place, typename T>
 class SigmoidKernel : public framework::OpKernel {
 public:
-  void Compute(const framework::KernelContext &context) const override {
-    LOG(INFO) << "Sigmoid kernel in " << typeid(Place).name();
+  void Compute(const framework::KernelContext& context) const override {
+    auto input = context.Input(0)->Get<framework::Tensor>();
+    auto* output = context.Output(0)->GetMutable<framework::Tensor>();
+
+    output->mutable_data<T>(context.GetPlace());
+
+    framework::EigenVector<T>::Flatten(*output).device(
+        *(context.GetEigenDevice<Place>())) =
+        1.0 / (1.0 + (-1.0 * framework::EigenVector<T>::Flatten(input)).exp());
  }
 };
 }  // namespace operators

--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -11,8 +11,8 @@
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License. */
-#include <paddle/framework/op_registry.h>
-#include <paddle/operators/softmax_op.h>
+#include "paddle/operators/softmax_op.h"
+#include "paddle/framework/op_registry.h"

 namespace paddle {
 namespace operators {
@@ -23,9 +23,11 @@ protected:
      const std::vector<const framework::Tensor *> &inputs,
      const std::vector<framework::Tensor *> &outputs) const override {
    PADDLE_ENFORCE(inputs.size() == 1, "Only one input is need for softmax");
+    PADDLE_ENFORCE(inputs[0]->dims().size() == 2,
+                   "The input of softmax op must be matrix");
    PADDLE_ENFORCE(outputs.size() == 1, "Only one output is need for softmax");

-    outputs[0]->set_dims(inputs[0]->dims());
+    outputs[0]->Resize(inputs[0]->dims());
  }
 };

@@ -58,5 +60,5 @@ namespace ops = paddle::operators;

 REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker);
 REGISTER_GRADIENT_OP(softmax, paddle::operators::SoftmaxOpGrad);
-
-REGISTER_OP_CPU_KERNEL(softmax, ops::SoftmaxKernel<paddle::platform::CPUPlace>);
+REGISTER_OP_CPU_KERNEL(softmax,
+                       ops::SoftmaxKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/softmax_op.cu
+++ b/paddle/operators/softmax_op.cu
-#include <paddle/framework/op_registry.h>
-#include <paddle/operators/softmax_op.h>
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/softmax_op.h"

 REGISTER_OP_GPU_KERNEL(
-    softmax, paddle::operators::SoftmaxKernel<paddle::platform::GPUPlace>);
+    softmax, paddle::operators::SoftmaxKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
@@ -14,17 +14,49 @@

 #pragma once

-#include <glog/logging.h>
-#include <paddle/framework/operator.h>
+#include "glog/logging.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/operator.h"

 namespace paddle {
 namespace operators {

-template <typename Place>
+template <typename Place, typename T>
 class SoftmaxKernel : public framework::OpKernel {
 public:
-  void Compute(const framework::KernelContext &context) const override {
-    LOG(INFO) << "Softmax kernel in " << typeid(Place).name();
+  void Compute(const framework::KernelContext& context) const override {
+    auto input = context.Input(0)->Get<framework::Tensor>();
+    auto* output = context.Output(0)->GetMutable<framework::Tensor>();
+    output->mutable_data<T>(context.GetPlace());
+
+    auto logits = framework::EigenMatrix<T>::From(input);
+    auto softmax = framework::EigenMatrix<T>::From(*output);
+
+    const int kBatchDim = 0;
+    const int kClassDim = 1;
+
+    const int batch_size = logits.dimension(kBatchDim);
+    const int num_classes = logits.dimension(kClassDim);
+
+    Eigen::DSizes<int, 1> along_class(kClassDim);
+    Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
+    Eigen::DSizes<int, 2> one_by_class(1, num_classes);
+
+    auto shifted_logits = (logits -
+                           logits.maximum(along_class)
+                               .eval()
+                               .reshape(batch_by_one)
+                               .broadcast(one_by_class));
+
+    softmax.device(*(context.GetEigenDevice<Place>())) = shifted_logits.exp();
+
+    softmax.device(*(context.GetEigenDevice<Place>())) =
+        (softmax *
+         softmax.sum(along_class)
+             .inverse()
+             .eval()
+             .reshape(batch_by_one)
+             .broadcast(one_by_class));
  }
 };
 }  // namespace operators

--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -43,10 +43,26 @@ namespace platform {
 // For more details, please check https://stackoverflow.com/a/43870188/724872.
 #define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)

+template <typename T>
+inline void throw_on_error(T e) {
+  throw_on_error(e, "");
+}
+
+template <typename... Args>
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    int stat, const Args&... args) {
+  if (UNLIKELY(!(stat))) {
+    throw std::runtime_error(
+        string::Sprintf(args...) +
+        string::Sprintf(" at [%s:%s];", __FILE__, __LINE__));
+  }
+}
+
 #ifndef PADDLE_ONLY_CPU

 template <typename... Args>
-inline void throw_on_error(cudaError_t e, const Args&... args) {
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    cudaError_t e, const Args&... args) {
  if (UNLIKELY(e)) {
    // clang-format off
    throw thrust::system_error(
@@ -58,7 +74,8 @@ inline void throw_on_error(cudaError_t e, const Args&... args) {
 }

 template <typename... Args>
-inline void throw_on_error(curandStatus_t stat, const Args&... args) {
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    curandStatus_t stat, const Args&... args) {
  if (stat != CURAND_STATUS_SUCCESS) {
    // clang-format off
    throw thrust::system_error(
@@ -70,7 +87,8 @@ inline void throw_on_error(curandStatus_t stat, const Args&... args) {
 }

 template <typename... Args>
-inline void throw_on_error(cudnnStatus_t stat, const Args&... args) {
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    cudnnStatus_t stat, const Args&... args) {
  if (stat == CUDNN_STATUS_SUCCESS) {
    return;
  } else {
@@ -84,7 +102,8 @@ inline void throw_on_error(cudnnStatus_t stat, const Args&... args) {
 }

 template <typename... Args>
-inline void throw_on_error(cublasStatus_t stat, const Args&... args) {
+inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
+    cublasStatus_t stat, const Args&... args) {
  std::string err;
  if (stat == CUBLAS_STATUS_SUCCESS) {
    return;
@@ -113,15 +132,6 @@ inline void throw_on_error(cublasStatus_t stat, const Args&... args) {

 #endif  // PADDLE_ONLY_CPU

-template <typename... Args>
-inline void throw_on_error(int stat, const Args&... args) {
-  if (UNLIKELY(!(stat))) {
-    throw std::runtime_error(
-        string::Sprintf(args...) +
-        string::Sprintf(" at [%s:%s];", __FILE__, __LINE__));
-  }
-}
-
 #define PADDLE_THROW(...)                                     \
  do {                                                        \
    throw std::runtime_error(                                 \
@@ -129,12 +139,9 @@ inline void throw_on_error(int stat, const Args&... args) {
        string::Sprintf(" at [%s:%s];", __FILE__, __LINE__)); \
  } while (0)

-/**
- * @brief Enforce a condition, otherwise throw an EnforceNotMet
- */
-#define PADDLE_ENFORCE(condition, ...)                          \
+#define PADDLE_ENFORCE(...)                          \
  do {                                               \
-    ::paddle::platform::throw_on_error(condition, __VA_ARGS__); \
+    ::paddle::platform::throw_on_error(__VA_ARGS__); \
  } while (0)

 }  // namespace platform

--- a/paddle/platform/gpu_info.cc
+++ b/paddle/platform/gpu_info.cc
@@ -44,7 +44,7 @@ void SetDeviceId(int id) {
                 "cudaSetDevice failed in paddle::platform::SetDeviceId");
 }

-void GpuMemoryUsage(size_t& available, size_t& total) {
+void GpuMemoryUsage(size_t &available, size_t &total) {
  PADDLE_ENFORCE(cudaMemGetInfo(&available, &total),
                 "cudaMemGetInfo failed in paddle::platform::GetMemoryUsage");
 }
@@ -82,5 +82,28 @@ size_t GpuMaxChunkSize() {
  return usable;
 }

+void GpuMemcpyAsync(void *dst, const void *src, size_t count,
+                    enum cudaMemcpyKind kind, cudaStream_t stream) {
+  PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream),
+                 "cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync");
+}
+
+void GpuMemcpySync(void *dst, const void *src, size_t count,
+                   enum cudaMemcpyKind kind) {
+  PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind),
+                 "cudaMemcpy failed in paddle::platform::GpuMemcpySync");
+  // note: cudaMemcpy may actually be asynchronous with respect to the caller,
+  //       block on stream 0 to make sure the copy has completed
+  PADDLE_ENFORCE(
+      cudaStreamSynchronize(0),
+      "cudaStreamSynchronize failed in paddle::platform::GpuMemcpySync");
+}
+
+void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
+                   size_t count, cudaStream_t stream) {
+  PADDLE_ENFORCE(
+      cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream),
+      "cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeer");
+}
 }  // namespace platform
 }  // namespace paddle
--- a/paddle/platform/gpu_info.h
+++ b/paddle/platform/gpu_info.h
@@ -16,6 +16,7 @@ limitations under the License. */

 #ifndef PADDLE_ONLY_CPU

+#include <cuda_runtime.h>
 #include <stddef.h>

 namespace paddle {
@@ -31,7 +32,7 @@ int GetCurrentDeviceId();
 void SetDeviceId(int device_id);

 //！Get the memory usage of current GPU device.
-void GpuMemoryUsage(size_t& available, size_t& total);
+void GpuMemoryUsage(size_t &available, size_t &total);

 //! Get the maximum allocation size of current GPU device.
 size_t GpuMaxAllocSize();
@@ -42,6 +43,18 @@ size_t GpuMinChunkSize();
 //! Get the maximum chunk size for GPU buddy allocator.
 size_t GpuMaxChunkSize();

+//! Copy memory from address src to dst asynchronously.
+void GpuMemcpyAsync(void *dst, const void *src, size_t count,
+                    enum cudaMemcpyKind kind, cudaStream_t stream);
+
+//! Copy memory from address src to dst synchronously.
+void GpuMemcpySync(void *dst, const void *src, size_t count,
+                   enum cudaMemcpyKind kind);
+
+//! Copy memory from one device to another device.
+void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
+                   size_t count, cudaStream_t stream);
+
 }  // namespace platform
 }  // namespace paddle


--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
 cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python
-        add_op fc_op sgd_op)
+        add_op fc_op sgd_op cross_entropy_op)
--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -13,25 +13,43 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include <Python.h>
-#include <paddle/framework/op_registry.h>
-#include <paddle/framework/operator.h>
-#include <paddle/framework/scope.h>
-#include <paddle/pybind/tensor_bind.h>
-#include <pybind11/numpy.h>
-#include <pybind11/pybind11.h>
-#include <pybind11/stl.h>
 #include <fstream>
 #include <vector>

+#include "paddle/framework/net.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+#include "paddle/framework/scope.h"
+#include "paddle/pybind/tensor_bind.h"
+#include "pybind11/numpy.h"
+#include "pybind11/pybind11.h"
+#include "pybind11/stl.h"
+
 namespace py = pybind11;
 namespace pd = paddle::framework;

 USE_OP(add_two);
+USE_OP(onehot_cross_entropy);
 USE_OP_WITHOUT_KERNEL(fc);
 USE_OP(sgd);
+USE_OP(mul);
+USE_OP(sigmoid);
+USE_OP(softmax);
+USE_OP(rowwise_add);
+
+template <typename ClassType>
+void ExposeOperator(ClassType& m) {
+  m.def("infer_shape", &ClassType::type::InferShape)
+      .def("run", &ClassType::type::Run)
+      .def("outputs",
+           [](const typename ClassType::type& op) -> std::vector<std::string> {
+             return op.outputs_;
+           })
+      .def("__str__", &ClassType::type::DebugString);
+}

 PYBIND11_PLUGIN(core) {
-  py::module m("core", "C++ core of Paddle Paddle");
+  py::module m("core", "C++ core of PaddlePaddle");

  py::class_<pd::Tensor>(m, "Tensor", py::buffer_protocol())
      .def_buffer([](pd::Tensor& self) -> py::buffer_info {
@@ -41,7 +59,7 @@ PYBIND11_PLUGIN(core) {
           [](const pd::Tensor& self) { return pd::vectorize(self.dims()); })
      .def("set_dims",
           [](pd::Tensor& self, const std::vector<int>& dim) {
-             self.set_dims(pd::make_ddim(dim));
+             self.Resize(pd::make_ddim(dim));
           })
      .def("alloc_float",
           [](pd::Tensor& self) {
@@ -108,10 +126,10 @@ All parameter, weight, gradient are variables in Paddle.
        return new paddle::platform::CPUDeviceContext();
      });

-  py::class_<pd::OperatorBase, pd::OperatorPtr>(m, "Operator")
-      .def("__str__", &pd::OperatorBase::DebugString)
-      .def_static("create",
-                  [](py::bytes protobin) {
+  py::class_<pd::OperatorBase, std::shared_ptr<pd::OperatorBase>> operator_base(
+      m, "Operator");
+
+  operator_base.def_static("create", [](py::bytes protobin) {
    pd::OpDesc desc;
    PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
                   "Cannot parse user input to OpDesc");
@@ -119,10 +137,27 @@ All parameter, weight, gradient are variables in Paddle.
                   "User OpDesc is not initialized, reason %s",
                   desc.InitializationErrorString());
    return pd::OpRegistry::CreateOp(desc);
+  });
+  ExposeOperator(operator_base);
+
+  using PlainNetPtr = std::shared_ptr<pd::PlainNet>;
+  py::class_<pd::PlainNet, PlainNetPtr> plain_net(m, "PlainNet");
+
+  plain_net
+      .def_static("create",
+                  []() -> std::shared_ptr<pd::PlainNet> {
+                    auto retv = std::make_shared<pd::PlainNet>();
+                    retv->type_ = "plain_net";
+                    return retv;
+                  })
+      .def("add_op", &pd::PlainNet::AddOp)
+      .def("add_op",
+           [](PlainNetPtr& self, const PlainNetPtr& plain_net) -> void {
+             self->AddOp(std::static_pointer_cast<pd::OperatorBase>(plain_net));
           })
-      .def("infer_shape", &pd::OperatorBase::InferShape)
-      .def("run", &pd::OperatorBase::Run)
-      .def("outputs", [](const pd::OperatorPtr& op) { return op->outputs_; });
+      .def("complete_add_op", &pd::PlainNet::CompleteAddOp)
+      .def("complete_add_op", [](PlainNetPtr& self) { self->CompleteAddOp(); });
+  ExposeOperator(plain_net);

  return m.ptr();
 }
--- a/paddle/pybind/tensor_bind.h
+++ b/paddle/pybind/tensor_bind.h
@@ -86,7 +86,7 @@ void PyTensorSetFromArray(
    dims.push_back((int)array.shape()[i]);
  }

-  self.set_dims(framework::make_ddim(dims));
+  self.Resize(framework::make_ddim(dims));
  auto *dst = self.mutable_data<T>(paddle::platform::CPUPlace());
  std::memcpy(dst, array.data(), sizeof(T) * array.size());
 }

--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -3219,6 +3219,10 @@ def ParameterHook(type, **kwargs):
        if sparsity_ratio is not None:
            hook.sparsity_ratio = sparsity_ratio
        return hook
+    elif type == 'dpruning':
+        hook = ParameterUpdaterHookConfig()
+        hook.type = type
+        return hook
    else:
        return None


--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -3173,11 +3173,11 @@ def memory(name,


 @wrap_bias_attr_default()
-@wrap_act_default(
-    param_names=['gate_act', 'state_act'], act=SigmoidActivation())
+@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
+@wrap_act_default(param_names=['state_act'], act=TanhActivation())
 @wrap_act_default(act=TanhActivation())
 @wrap_name_default('lstm_step')
-@layer_support()
+@layer_support(ERROR_CLIPPING, DROPOUT)
 def lstm_step_layer(input,
                    state,
                    size=None,
@@ -3531,12 +3531,7 @@ def SubsequenceInput(input):


 @wrap_name_default("recurrent_group")
-def recurrent_group(step,
-                    input,
-                    reverse=False,
-                    name=None,
-                    targetInlink=None,
-                    is_generating=False):
+def recurrent_group(step, input, reverse=False, name=None, targetInlink=None):
    """
    Recurrent layer group is an extremely flexible recurrent unit in
    PaddlePaddle. As long as the user defines the calculation done within a
@@ -3602,21 +3597,12 @@ def recurrent_group(step,

    :type targetInlink: LayerOutput|SubsequenceInput

-    :param is_generating: If is generating, none of input type should be LayerOutput;
-                          else, for training or testing, one of the input type must
-                          be LayerOutput.
-
-    :type is_generating: bool
-
    :return: LayerOutput object.
    :rtype: LayerOutput
    """
    model_type('recurrent_nn')

-    def is_single_input(x):
-        return isinstance(x, LayerOutput) or isinstance(x, StaticInput)
-
-    if is_single_input(input):
+    if isinstance(input, LayerOutput) or isinstance(input, StaticInput):
        input = [input]
    assert isinstance(input, collections.Sequence)

@@ -3630,13 +3616,8 @@ def recurrent_group(step,
        in_links=map(lambda x: x.name, in_links),
        seq_reversed=reverse)
    in_args = []
-    has_LayerOutput = False
    for each_input in input:
-        assert is_single_input(each_input)
-        if isinstance(each_input, LayerOutput):
-            in_args.append(each_input)
-            has_LayerOutput = True
-        else:  # StaticInput
+        if isinstance(each_input, StaticInput):  # StaticInput
            mem_name = "__%s_memory__" % each_input.input.name
            mem = memory(
                name=None,
@@ -3644,24 +3625,26 @@ def recurrent_group(step,
                boot_layer=each_input.input)
            mem.set_input(mem)
            in_args.append(mem)
-
-    assert (is_generating != has_LayerOutput)
+        else:
+            in_args.append(each_input)

    layer_outs = step(*in_args)

    if isinstance(layer_outs, LayerOutput):
        layer_outs = [layer_outs]

-    for ot in layer_outs:
-        assert isinstance(ot, LayerOutput)
-        ot.reverse = reverse
-        RecurrentLayerGroupSetOutLink(ot.name)
+    for layer_out in layer_outs:
+        assert isinstance(
+            layer_out, LayerOutput
+        ), "Type of step function's return value must be LayerOutput."
+        layer_out.reverse = reverse
+        RecurrentLayerGroupSetOutLink(layer_out.name)

    RecurrentLayerGroupEnd(name=name)

    for layer_out in layer_outs:
-        # Thee previous full_name is the name is the rnn group
-        # We need a full_name outside the rnn group
+        # The previous full_name is the name inside the recurrent group.
+        # We need a full_name outside the recurrent group.
        layer_out.full_name = MakeLayerNameInSubmodel(layer_out.name)

    if len(layer_outs) == 1:
@@ -3684,7 +3667,20 @@ class BaseGeneratedInput(object):

 class GeneratedInput(BaseGeneratedInput):
    def after_real_step(self, input):
-        return maxid_layer(input=input, name='__beam_search_predict__')
+        if isinstance(input, LayerOutput):
+            input = [input]
+        elif isinstance(input, collections.Sequence):
+            input = list(input)
+            if len(input) > 1:
+                logger.info(
+                    ("More than one layers inside the recurrent_group "
+                     "are returned as outputs of the entire recurrent_group "
+                     "PLEASE garantee the first output is probability of "
+                     "the predicted next word."))
+
+        return [maxid_layer(
+            input=input[0], name='__beam_search_predict__')] + (
+                input[1:] if len(input) > 1 else [])

    def before_real_step(self):
        predict_id = memory(
@@ -3871,6 +3867,7 @@ def beam_search(step,
    :type step: callable
    :param input: Input data for the recurrent unit, which should include the
                  previously generated words as a GeneratedInput object.
+                  In beam_search, none of the input's type should be LayerOutput.
    :type input: list
    :param bos_id: Index of the start symbol in the dictionary. The start symbol
                   is a special token for NLP task, which indicates the
@@ -3912,15 +3909,18 @@ def beam_search(step,

    real_input = []
    for i, each_input in enumerate(input):
-        assert isinstance(each_input, StaticInput) or isinstance(
-            each_input, BaseGeneratedInput)
+        assert not isinstance(each_input, LayerOutput), (
+            "in beam_search, "
+            "none of the input should has a type of LayerOutput.")
        if isinstance(each_input, BaseGeneratedInput):
-            assert generated_input_index == -1
+            assert generated_input_index == -1, ("recurrent_group accepts "
+                                                 "only one GeneratedInput.")
            generated_input_index = i
+
        else:
            real_input.append(each_input)

-    assert generated_input_index != -1
+    assert generated_input_index != -1, "No GeneratedInput is given."

    gipt = input[generated_input_index]

@@ -3941,17 +3941,11 @@ def beam_search(step,

        predict = gipt.after_real_step(step(*args))

-        eos_layer(input=predict, eos_id=eos_id, name=eos_name)
+        eos_layer(input=predict[0], eos_id=eos_id, name=eos_name)
        return predict

-    tmp = recurrent_group(
-        step=__real_step__,
-        input=real_input,
-        reverse=False,
-        name=name,
-        is_generating=True)
-
-    return tmp
+    return recurrent_group(
+        step=__real_step__, input=real_input, reverse=False, name=name)


 def __cost_input__(input, label, weight=None):

--- a/python/paddle/trainer_config_helpers/networks.py
+++ b/python/paddle/trainer_config_helpers/networks.py
@@ -614,18 +614,17 @@ def simple_lstm(input,

 @wrap_name_default('lstm_unit')
 def lstmemory_unit(input,
-                   memory_boot=None,
+                   out_memory=None,
                   name=None,
                   size=None,
                   param_attr=None,
                   act=None,
                   gate_act=None,
                   state_act=None,
-                   mixed_bias_attr=None,
+                   input_proj_bias_attr=None,
+                   input_proj_layer_attr=None,
                   lstm_bias_attr=None,
-                   mixed_layer_attr=None,
-                   lstm_layer_attr=None,
-                   get_output_layer_attr=None):
+                   lstm_layer_attr=None):
    """
    Define calculations that a LSTM unit performs during a single time step.
    This function itself is not a recurrent layer, so it can not be
@@ -662,8 +661,8 @@ def lstmemory_unit(input,

    :param input: input layer name.
    :type input: LayerOutput
-    :param memory_boot: the initialization state of the LSTM cell.
-    :type memory_boot: LayerOutput | None
+    :param out_memory: output of previous time step
+    :type out_memory: LayerOutput | None
    :param name: lstmemory unit name.
    :type name: basestring
    :param size: lstmemory unit size.
@@ -676,33 +675,35 @@ def lstmemory_unit(input,
    :type gate_act: BaseActivation
    :param state_act: lstm state activiation type.
    :type state_act: BaseActivation
-    :param mixed_bias_attr: bias parameter attribute of mixed layer.
+    :param input_proj_bias_attr: bias attribute for input-to-hidden projection.
                False means no bias, None means default bias.
-    :type mixed_bias_attr: ParameterAttribute|False
+    :type input_proj_bias_attr: ParameterAttribute|False|None
+    :param input_proj_layer_attr: extra layer attribute for input to hidden
+                projection of the LSTM unit, such as dropout, error clipping.
+    :type input_proj_layer_attr: ExtraLayerAttribute
    :param lstm_bias_attr: bias parameter attribute of lstm layer.
                False means no bias, None means default bias.
    :type lstm_bias_attr: ParameterAttribute|False
-    :param mixed_layer_attr: mixed layer's extra attribute.
-    :type mixed_layer_attr: ExtraLayerAttribute
    :param lstm_layer_attr: lstm layer's extra attribute.
    :type lstm_layer_attr: ExtraLayerAttribute
-    :param get_output_layer_attr: get output layer's extra attribute.
-    :type get_output_layer_attr: ExtraLayerAttribute
    :return: lstmemory unit name.
    :rtype: LayerOutput
    """
    if size is None:
        assert input.size % 4 == 0
        size = input.size / 4
+    if out_memory is None:
        out_mem = memory(name=name, size=size)
-    state_mem = memory(
-        name="%s_state" % name, size=size, boot_layer=memory_boot)
+    else:
+        out_mem = out_memory
+
+    state_mem = memory(name="%s_state" % name, size=size)

    with mixed_layer(
            name="%s_input_recurrent" % name,
            size=size * 4,
-            bias_attr=mixed_bias_attr,
-            layer_attr=mixed_layer_attr,
+            bias_attr=input_proj_bias_attr,
+            layer_attr=input_proj_layer_attr,
            act=IdentityActivation()) as m:
        m += identity_projection(input=input)
        m += full_matrix_projection(input=out_mem, param_attr=param_attr)
@@ -717,11 +718,7 @@ def lstmemory_unit(input,
        gate_act=gate_act,
        state_act=state_act,
        layer_attr=lstm_layer_attr)
-    get_output_layer(
-        name='%s_state' % name,
-        input=lstm_out,
-        arg_name='state',
-        layer_attr=get_output_layer_attr)
+    get_output_layer(name='%s_state' % name, input=lstm_out, arg_name='state')

    return lstm_out

@@ -730,17 +727,16 @@ def lstmemory_unit(input,
 def lstmemory_group(input,
                    size=None,
                    name=None,
-                    memory_boot=None,
+                    out_memory=None,
                    reverse=False,
                    param_attr=None,
                    act=None,
                    gate_act=None,
                    state_act=None,
-                    mixed_bias_attr=None,
+                    input_proj_bias_attr=None,
+                    input_proj_layer_attr=None,
                    lstm_bias_attr=None,
-                    mixed_layer_attr=None,
-                    lstm_layer_attr=None,
-                    get_output_layer_attr=None):
+                    lstm_layer_attr=None):
    """
    lstm_group is a recurrent_group version of Long Short Term Memory. It
    does exactly the same calculation as the lstmemory layer (see lstmemory in
@@ -774,8 +770,8 @@ def lstmemory_group(input,
    :type size: int
    :param name: name of the lstmemory group.
    :type name: basestring
-    :param memory_boot: the initialization state of LSTM cell.
-    :type memory_boot: LayerOutput | None
+    :param out_memory: output of previous time step
+    :type out_memory: LayerOutput | None
    :param reverse: is lstm reversed
    :type reverse: bool
    :param param_attr: Parameter config, None if use default.
@@ -786,18 +782,17 @@ def lstmemory_group(input,
    :type gate_act: BaseActivation
    :param state_act: lstm state activiation type.
    :type state_act: BaseActivation
-    :param mixed_bias_attr: bias parameter attribute of mixed layer.
-                            False means no bias, None means default bias.
-    :type mixed_bias_attr: ParameterAttribute|False
    :param lstm_bias_attr: bias parameter attribute of lstm layer.
                           False means no bias, None means default bias.
    :type lstm_bias_attr: ParameterAttribute|False
-    :param mixed_layer_attr: mixed layer's extra attribute.
-    :type mixed_layer_attr: ExtraLayerAttribute
+    :param input_proj_bias_attr: bias attribute for input-to-hidden projection.
+                False means no bias, None means default bias.
+    :type input_proj_bias_attr: ParameterAttribute|False|None
+    :param input_proj_layer_attr: extra layer attribute for input to hidden
+                projection of the LSTM unit, such as dropout, error clipping.
+    :type input_proj_layer_attr: ExtraLayerAttribute
    :param lstm_layer_attr: lstm layer's extra attribute.
    :type lstm_layer_attr: ExtraLayerAttribute
-    :param get_output_layer_attr: get output layer's extra attribute.
-    :type get_output_layer_attr: ExtraLayerAttribute
    :return: the lstmemory group.
    :rtype: LayerOutput
    """
@@ -805,18 +800,17 @@ def lstmemory_group(input,
    def __lstm_step__(ipt):
        return lstmemory_unit(
            input=ipt,
-            memory_boot=memory_boot,
            name=name,
            size=size,
-            mixed_bias_attr=mixed_bias_attr,
-            mixed_layer_attr=mixed_layer_attr,
-            param_attr=param_attr,
-            lstm_bias_attr=lstm_bias_attr,
            act=act,
            gate_act=gate_act,
            state_act=state_act,
+            out_memory=out_memory,
+            input_proj_bias_attr=input_proj_bias_attr,
+            input_proj_layer_attr=input_proj_layer_attr,
+            param_attr=param_attr,
            lstm_layer_attr=lstm_layer_attr,
-            get_output_layer_attr=get_output_layer_attr)
+            lstm_bias_attr=lstm_bias_attr)

    return recurrent_group(
        name='%s_recurrent_group' % name,

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/shared_lstm.protostr
@@ -104,7 +104,7 @@ layers {
  }
  bias_parameter_name: "lstm_bias"
  active_gate_type: "sigmoid"
-  active_state_type: "sigmoid"
+  active_state_type: "tanh"
 }
 layers {
  name: "__lstm_group_0___state@__lstm_group_0___recurrent_group"
@@ -183,7 +183,7 @@ layers {
  }
  bias_parameter_name: "lstm_bias"
  active_gate_type: "sigmoid"
-  active_state_type: "sigmoid"
+  active_state_type: "tanh"
 }
 layers {
  name: "__lstm_group_1___state@__lstm_group_1___recurrent_group"

--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_rnn_group.protostr
@@ -258,7 +258,7 @@ layers {
  }
  bias_parameter_name: "___lstm_group_0__@__lstm_group_0___recurrent_group.wbias"
  active_gate_type: "sigmoid"
-  active_state_type: "sigmoid"
+  active_state_type: "tanh"
 }
 layers {
  name: "__lstm_group_0___state@__lstm_group_0___recurrent_group"

--- a/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/shared_lstm.py
@@ -20,12 +20,13 @@ lstm1 = lstmemory_group(
    input=m1,
    param_attr=lstm_param,
    lstm_bias_attr=lstm_bias,
-    mixed_bias_attr=False)
+    input_proj_bias_attr=False)
+
 lstm2 = lstmemory_group(
    input=m2,
    param_attr=lstm_param,
    lstm_bias_attr=lstm_bias,
-    mixed_bias_attr=False)
+    input_proj_bias_attr=False)

 softmax_param = ParamAttr(name='softmax_param')


--- a/python/paddle/v2/dataset/flowers.py
+++ b/python/paddle/v2/dataset/flowers.py
@@ -116,7 +116,7 @@ def reader_creator(data_file,
            data = batch['data']
            labels = batch['label']
            for sample, label in itertools.izip(data, batch['label']):
-                yield sample, int(label)
+                yield sample, int(label) - 1

    if use_xmap:
        return xmap_readers(mapper, reader, cpu_count(), buffered_size)

--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
-add_python_test(test_framework test_protobuf.py test_scope.py
-    test_default_scope_funcs.py test_op_creation_methods.py
-    test_tensor.py test_fc_op.py test_add_two_op.py test_sgd_op.py)
+add_python_test(test_framework
+    test_protobuf.py
+    test_scope.py
+    test_default_scope_funcs.py
+    test_op_creation_methods.py
+    test_plain_net.py
+    test_tensor.py
+    test_fc_op.py
+    test_add_two_op.py
+    test_sgd_op.py
+    test_cross_entropy_op.py
+    test_mul_op.py
+    test_sigmoid_op.py
+    test_softmax_op.py
+    test_rowwise_add_op.py)
--- a/python/paddle/v2/framework/tests/op_test_util.py
+++ b/python/paddle/v2/framework/tests/op_test_util.py
@@ -56,7 +56,10 @@ class OpTestMeta(type):
            for out_name in func.all_output_args:
                actual = numpy.array(scope.get_var(out_name).get_tensor())
                expect = getattr(self, out_name)
-                numpy.testing.assert_almost_equal(actual, expect)
+                # TODO(qijun) The default decimal is 7, but numpy.dot and eigen.mul
+                # has some diff, and could not pass unittest. So I set decimal 3 here.
+                # And I will check this in future.
+                numpy.testing.assert_almost_equal(actual, expect, decimal=3)

        obj.test_all = test_all
        return obj
--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+import unittest
+import numpy
+from op_test_util import OpTestMeta
+
+
+class TestSGD(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = "onehot_cross_entropy"
+        batch_size = 100
+        class_num = 10
+        self.X = numpy.random.random((batch_size, class_num)).astype("float32")
+        self.label = 5 * numpy.ones(batch_size).astype("int32")
+        Y = []
+        for i in range(0, batch_size):
+            Y.append(-numpy.log(self.X[i][self.label[i]]))
+        self.Y = numpy.array(Y).astype("float32")
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/v2/framework/tests/test_mul_op.py
+++ b/python/paddle/v2/framework/tests/test_mul_op.py
+import unittest
+from op_test_util import OpTestMeta
+import numpy as np
+
+
+class TestMulOp(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = "mul"
+        self.X = np.random.random((32, 784)).astype("float32")
+        self.Y = np.random.random((784, 100)).astype("float32")
+        self.Out = np.dot(self.X, self.Y)
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/framework/tests/test_plain_net.py
+++ b/python/paddle/v2/framework/tests/test_plain_net.py
+import paddle.v2.framework.core as core
+from paddle.v2.framework.create_op_creation_methods import op_creations
+import unittest
+
+
+class TestNet(unittest.TestCase):
+    def test_net_all(self):
+        net = core.PlainNet.create()
+        op1 = op_creations.add_two(X="X", Y="Y", Out="Out")
+        net.add_op(op1)
+
+        net2 = core.PlainNet.create()
+        net2.add_op(op_creations.fc(X="X", W="w", Y="fc.out"))
+        net2.complete_add_op(True)
+        net.add_op(net2)
+        net.complete_add_op(True)
+
+        expected = '''
+Op(plain_net), inputs:(@EMPTY@, X, Y, w), outputs:(@TEMP@fc@0, Out, fc.out).
+    Op(add_two), inputs:(X, Y), outputs:(Out).
+    Op(plain_net), inputs:(@EMPTY@, X, w), outputs:(@TEMP@fc@0, fc.out).
+        Op(fc), inputs:(X, w, @EMPTY@), outputs:(fc.out, @TEMP@fc@0).
+            Op(mul), inputs:(X, w), outputs:(@TEMP@fc@0).
+            Op(sigmoid), inputs:(@TEMP@fc@0), outputs:(fc.out).
+'''
+        self.assertEqual(expected, "\n" + str(net))
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/framework/tests/test_rowwise_add_op.py
+++ b/python/paddle/v2/framework/tests/test_rowwise_add_op.py
+import unittest
+from op_test_util import OpTestMeta
+import numpy as np
+
+
+class TestRowwiseAddOp(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = "rowwise_add"
+        self.X = np.random.random((32, 784)).astype("float32")
+        self.b = np.random.random(784).astype("float32")
+        self.Out = np.add(self.X, self.b)
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/framework/tests/test_sigmoid_op.py
+++ b/python/paddle/v2/framework/tests/test_sigmoid_op.py
+import unittest
+from op_test_util import OpTestMeta
+import numpy as np
+
+
+class TestSigmoidOp(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = "sigmoid"
+        self.X = np.random.random((32, 100)).astype("float32")
+        self.Y = 1 / (1 + np.exp(-self.X))
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/framework/tests/test_softmax_op.py
+++ b/python/paddle/v2/framework/tests/test_softmax_op.py
+import unittest
+from op_test_util import OpTestMeta
+import numpy as np
+
+
+def stable_softmax(x):
+    """Compute the softmax of vector x in a numerically stable way."""
+    shiftx = x - np.max(x)
+    exps = np.exp(shiftx)
+    return exps / np.sum(exps)
+
+
+class TestSoftmaxOp(unittest.TestCase):
+    __metaclass__ = OpTestMeta
+
+    def setUp(self):
+        self.type = "softmax"
+        self.X = np.random.random((32, 100)).astype("float32")
+        self.Y = np.apply_along_axis(stable_softmax, 1, self.X)
+
+
+if __name__ == '__main__':
+    unittest.main()