提交 5f3bc2a4 编写于 作者: F fengjiayi

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into backward

......@@ -36,6 +36,8 @@ include(simd)
################################ Configurations #######################################
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." OFF)
option(WITH_MKLML "Compile PaddlePaddle with mklml package." OFF)
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
......@@ -74,6 +76,10 @@ if(ANDROID)
"Disable PYTHON when cross-compiling for Android" FORCE)
set(WITH_RDMA OFF CACHE STRING
"Disable RDMA when cross-compiling for Android" FORCE)
set(WITH_MKLDNN OFF CACHE STRING
"Disable MKLDNN when cross-compiling for Android" FORCE)
set(WITH_MKLML OFF CACHE STRING
"Disable MKLML package when cross-compiling for Android" FORCE)
endif(ANDROID)
set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
......@@ -87,6 +93,7 @@ endif()
########################################################################################
include(external/mklml) # download mklml package
include(external/zlib) # download, build, install zlib
include(external/gflags) # download, build, install gflags
include(external/glog) # download, build, install glog
......@@ -94,6 +101,7 @@ include(external/gtest) # download, build, install gtest
include(external/protobuf) # download, build, install protobuf
include(external/python) # download, build, install python
include(external/openblas) # download, build, install openblas
include(external/mkldnn) # download, build, install mkldnn
include(external/swig) # download, build, install swig
include(external/warpctc) # download, build, install warpctc
include(external/any) # download libn::any
......@@ -135,6 +143,10 @@ if(WITH_GPU)
endif(NOT WITH_DSO)
endif(WITH_GPU)
if(WITH_MKLDNN)
list(APPEND EXTERNAL_LIBS ${MKLDNN_LIBRARY} ${MKLDNN_IOMP_LIB})
endif()
if(USE_NNPACK)
include(external/nnpack)
list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS})
......
......@@ -15,23 +15,44 @@
set(CBLAS_FOUND OFF)
## Find MKL First.
set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs")
set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL")
## Find MKLML First.
if(WITH_MKLML AND MKLML_INC_DIR AND MKLML_LIB)
set(CBLAS_FOUND ON)
set(CBLAS_PROVIDER MKLML)
set(CBLAS_INC_DIR ${MKLML_INC_DIR})
set(CBLAS_LIBRARIES ${MKLML_LIB})
add_definitions(-DPADDLE_USE_MKLML)
add_definitions(-DLAPACK_FOUND)
message(STATUS "Found cblas and lapack in MKLML "
"(include: ${CBLAS_INC_DIR}, library: ${CBLAS_LIBRARIES})")
return()
endif()
## Then find MKL.
set(INTEL_MKL_ROOT "/opt/intel/mkl" CACHE PATH "Folder contains intel mkl libs")
set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains env MKL")
set(MKL_INCLUDE_SEARCH_PATHS
${MKL_ROOT}/include
${INTEL_MKL_ROOT}/include)
set(MKL_LIB_SEARCH_PATHS
${MKL_ROOT}/lib
${MKL_ROOT}/lib/intel64
${INTEL_MKL_ROOT}/lib
${INTEL_MKL_ROOT}/lib/intel64)
find_path(MKL_INC_DIR mkl.h PATHS
${MKL_ROOT}/include)
${MKL_INCLUDE_SEARCH_PATHS})
find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS
${MKL_ROOT}/include)
${MKL_INCLUDE_SEARCH_PATHS})
find_library(MKL_CORE_LIB NAMES mkl_core PATHS
${MKL_ROOT}/lib
${MKL_ROOT}/lib/intel64)
${MKL_LIB_SEARCH_PATHS})
find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS
${MKL_ROOT}/lib
${MKL_ROOT}/lib/intel64)
${MKL_LIB_SEARCH_PATHS})
find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
${MKL_ROOT}/lib
${MKL_ROOT}/lib/intel64)
${MKL_LIB_SEARCH_PATHS})
if(MKL_LAPACK_INC_DIR AND MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
set(CBLAS_FOUND ON)
......
......@@ -67,6 +67,30 @@ else()
include_directories(${CUDA_TOOLKIT_INCLUDE})
endif(NOT WITH_GPU)
if(WITH_MKLDNN)
add_definitions(-DPADDLE_USE_MKLDNN)
if (WITH_MKLML AND MKLDNN_IOMP_DIR)
message(STATUS "Enable Intel OpenMP at ${MKLDNN_IOMP_DIR}")
set(OPENMP_FLAGS "-fopenmp")
set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
else()
find_package(OpenMP)
if(OPENMP_FOUND)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
else()
message(WARNING "Can not find OpenMP."
"Some performance features in MKLDNN may not be available")
endif()
endif()
endif(WITH_MKLDNN)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
......
......@@ -34,9 +34,15 @@ IF(WITH_TESTING)
"${GTEST_INSTALL_DIR}/lib/libgtest_main.a" CACHE FILEPATH "gtest main libraries." FORCE)
ENDIF(WIN32)
IF(WITH_MKLML)
# wait for mklml downloading completed
SET(GTEST_DEPENDS ${MKLML_PROJECT})
ENDIF()
ExternalProject_Add(
extern_gtest
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${GTEST_DEPENDS}
GIT_REPOSITORY "https://github.com/google/googletest.git"
GIT_TAG "release-1.8.0"
PREFIX ${GTEST_SOURCES_DIR}
......
# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
IF(NOT ${WITH_MKLDNN})
return()
ENDIF(NOT ${WITH_MKLDNN})
INCLUDE(ExternalProject)
SET(MKLDNN_PROJECT "extern_mkldnn")
SET(MKLDNN_SOURCES_DIR ${THIRD_PARTY_PATH}/mkldnn)
SET(MKLDNN_INSTALL_ROOT ${CMAKE_INSTALL_PREFIX})
IF(NOT "$ENV{HOME}" STREQUAL "/root")
SET(MKLDNN_INSTALL_ROOT "$ENV{HOME}")
ENDIF()
SET(MKLDNN_INSTALL_DIR "${MKLDNN_INSTALL_ROOT}/opt/paddle/third_party/mkldnn")
SET(MKLDNN_INCLUDE_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
IF(WIN32)
MESSAGE(WARNING "It is not supported compiling with mkldnn in windows Paddle yet."
"Force WITH_MKLDNN=OFF")
SET(WITH_MKLDNN OFF)
return()
ELSE(WIN32)
SET(MKLDNN_LIBRARY "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
#SET(CMAKE_MACOSX_RPATH 1) # hold for MacOS
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
ENDIF(WIN32)
INCLUDE_DIRECTORIES(${MKLDNN_INCLUDE_DIR})
IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
SET(MKLDNN_DEPENDS ${MKLML_PROJECT})
SET(MKLDNN_MKLROOT ${MKLML_ROOT})
SET(MKLDNN_IOMP_LIB ${MKLML_IOMP_LIB})
SET(MKLDNN_IOMP_DIR ${MKLML_LIB_DIR})
ENDIF()
ExternalProject_Add(
${MKLDNN_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
DEPENDS ${MKLDNN_DEPENDS}
GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git"
GIT_TAG "v0.9"
PREFIX ${MKLDNN_SOURCES_DIR}
CONFIGURE_COMMAND mkdir -p <SOURCE_DIR>/build
BUILD_COMMAND cd <SOURCE_DIR>/build
&& cmake .. -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR} -DMKLROOT=${MKLDNN_MKLROOT}
&& $(MAKE)
INSTALL_COMMAND cd <SOURCE_DIR>/build && $(MAKE) install
UPDATE_COMMAND ""
)
ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIBRARY})
ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIBRARY}")
LIST(APPEND external_project_dependencies mkldnn)
# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
IF(NOT ${WITH_MKLML})
return()
ENDIF(NOT ${WITH_MKLML})
INCLUDE(ExternalProject)
SET(MKLML_PROJECT "extern_mklml")
SET(MKLML_VER "mklml_lnx_2018.0.20170425")
SET(MKLML_URL "https://github.com/01org/mkl-dnn/releases/download/v0.9/${MKLML_VER}.tgz")
SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR "opt/paddle/third_party/mklml")
SET(MKLML_INSTALL_ROOT "${CMAKE_INSTALL_PREFIX}")
IF(NOT "$ENV{HOME}" STREQUAL "/root")
SET(MKLML_INSTALL_ROOT "$ENV{HOME}")
ENDIF()
SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
SET(MKLML_ROOT ${MKLML_INSTALL_DIR}/${MKLML_VER})
SET(MKLML_INC_DIR ${MKLML_ROOT}/include)
SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib)
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLML_ROOT}/lib")
INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
SET(mklml_cmakefile ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt)
FILE(WRITE ${mklml_cmakefile} "PROJECT(MKLML)\n"
"cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY ${MKLML_VER}\n"
" DESTINATION ${MKLML_DST_DIR})\n")
ExternalProject_Add(
${MKLML_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${MKLML_SOURCE_DIR}
DOWNLOAD_DIR ${MKLML_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate -O ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz ${MKLML_URL}
&& tar -xzf ${MKLML_DOWNLOAD_DIR}/${MKLML_VER}.tgz
DOWNLOAD_NO_PROGRESS 1
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLML_INSTALL_ROOT}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLML_INSTALL_ROOT}
)
ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
LIST(APPEND external_project_dependencies mklml)
......@@ -124,6 +124,7 @@ set(GPU_COMMON_FLAGS
-Wno-error=literal-suffix
-Wno-error=unused-local-typedefs
-Wno-error=unused-function # Warnings in Numpy Header.
-Wno-error=array-bounds # Warnings in Eigen::array
)
if (APPLE)
......
// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import (
......
// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
import (
......
// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package connection
import (
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
if(WITH_TESTING)
go_test(master_test)
endif()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
go_library(paddle_master SHARED DEPS paddle_go_optimizer)
// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
/*
......
// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package master
import (
......
// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package master
import (
......
// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package master_test
import (
......
// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package master
import (
......
// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package master
import "sync"
......
// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package master
import (
......
// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package master
import "testing"
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
if(WITH_TESTING)
go_test(pserver_test DEPS paddle_go_optimizer)
endif()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
if(WITH_TESTING)
go_test(pserver_client_test DEPS paddle_go_optimizer)
endif()
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
cc_library(paddle_go_optimizer DEPS paddle_optimizer paddle_proto glog gflags protobuf)
target_link_libraries(paddle_go_optimizer stdc++ m)
......
// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package main
/*
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
cc_test(test_cclient SRCS test_cclient.c DEPS paddle_pserver_cclient paddle_go_optimizer)
add_style_check_target(test_cclient test_cclient.c)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <stdio.h>
#include <stdlib.h>
......
// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package client
import (
......
// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package client_test
import (
......
// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package client
import (
......@@ -66,10 +80,10 @@ func (p *EtcdClient) List() []Server {
for {
for i := 0; i < psDesired; i++ {
ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
cancel()
psKey := pserver.PsPath + strconv.Itoa(i)
log.Debugf("checking %s", psKey)
resp, err := p.client.Get(ctx, psKey)
cancel()
if err != nil {
log.Infof("Get psKey= %s error, %v", psKey, err)
time.Sleep(p.timeout)
......
// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package pserver
import (
......
// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package pserver
// #cgo CFLAGS: -I ../../
......
// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package pserver
import (
......
// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package pserver
import (
......
// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package pserver_test
import (
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
if(WITH_TESTING)
go_test(network_helper_test)
endif()
// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package networkhelper
import (
......
// Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package networkhelper
import "testing"
......
......@@ -61,24 +61,24 @@ struct EigenTensor {
}
};
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
struct EigenMatrix : public EigenTensor<T, 2, MajorType, IndexType> {};
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
struct EigenVector : public EigenTensor<T, 1, MajorType, IndexType> {
// Flatten is to reshape a Tensor into a one dimension EigenVector
static typename EigenTensor<T, 1>::Type Flatten(Tensor& tensor) {
return EigenTensor<T, 1>::From(
// Flatten reshapes a Tensor into an EigenVector.
static typename EigenVector::Type Flatten(Tensor& tensor) {
return EigenVector::From(
tensor, make_ddim({static_cast<int>(product(tensor.dims_))}));
}
static typename EigenTensor<T, 1>::ConstType Flatten(const Tensor& tensor) {
return EigenTensor<T, 1>::From(
static typename EigenVector::ConstType Flatten(const Tensor& tensor) {
return EigenVector::From(
tensor, make_ddim({static_cast<int>(product(tensor.dims_))}));
}
};
template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = EigenTensor<T, 2, MajorType, IndexType>;
} // namespace framework
} // namespace paddle
......@@ -49,19 +49,22 @@ void PlainNet::CompleteAddOp(bool calc) {
output_set.insert(opt);
}
}
inputs_.reserve(input_set.size());
std::copy(input_set.begin(), input_set.end(), std::back_inserter(inputs_));
std::sort(inputs_.begin(), inputs_.end());
outputs_.reserve(output_set.size());
std::copy(output_set.begin(), output_set.end(), std::back_inserter(outputs_));
std::sort(outputs_.begin(), outputs_.end());
std::vector<int> tmp_index;
tmp_index.reserve(temp_output.size());
int idx = 0;
for (auto& opt : output_set) {
if (Contains(temp_output, opt)) {
tmp_index.push_back(idx);
int output_len = static_cast<int>(outputs_.size());
for (int i = 0; i < output_len; ++i) {
if (Contains(temp_output, outputs_[i])) {
tmp_index.push_back(i);
}
outputs_.push_back(opt);
++idx;
}
attrs_["temporary_index"] = tmp_index;
......@@ -69,9 +72,12 @@ void PlainNet::CompleteAddOp(bool calc) {
std::string PlainNet::DebugString() const {
std::ostringstream os;
os << this->type_ << ":" << std::endl;
os << OperatorBase::DebugString() << std::endl;
for (auto& op : ops_) {
os << "\t" << op->DebugString() << std::endl;
std::istringstream is(op->DebugString());
for (std::string line; std::getline(is, line);) {
os << " " << line << std::endl;
}
}
return os.str();
}
......
......@@ -39,7 +39,7 @@ namespace framework {
*/
class Net : public OperatorBase {
public:
virtual void AddOp(const OperatorPtr& op) = 0;
virtual void AddOp(const std::shared_ptr<OperatorBase>& op) = 0;
virtual void CompleteAddOp(bool calc) = 0;
};
......@@ -57,7 +57,7 @@ class PlainNet : public Net {
* Infer all the operators' input and output variables' shapes, will be called
* before every mini-batch
*/
void InferShape(const ScopePtr& scope) const override {
void InferShape(const std::shared_ptr<Scope>& scope) const override {
for (auto& op : ops_) {
op->InferShape(scope);
}
......@@ -70,7 +70,7 @@ class PlainNet : public Net {
* scope will be used instead. If no OpContext is provicded, default context
* will be used.
*/
void Run(const ScopePtr& scope,
void Run(const std::shared_ptr<Scope>& scope,
const platform::DeviceContext& dev_ctx) const override {
for (auto& op : ops_) {
op->Run(scope, dev_ctx);
......@@ -80,7 +80,7 @@ class PlainNet : public Net {
/**
* @brief Add an operator by ptr
*/
void AddOp(const OperatorPtr& op) override {
void AddOp(const std::shared_ptr<OperatorBase>& op) override {
PADDLE_ENFORCE(!add_op_done_, "Cannot AddOp when this network is sealed");
ops_.push_back(op);
}
......@@ -89,7 +89,7 @@ class PlainNet : public Net {
std::string DebugString() const override;
std::vector<OperatorPtr> ops_;
std::vector<std::shared_ptr<OperatorBase>> ops_;
private:
bool add_op_done_{false};
......
......@@ -16,9 +16,11 @@ static int run_cnt = 0;
class TestOp : public OperatorBase {
public:
void InferShape(const ScopePtr& scope) const override { ++infer_shape_cnt; }
void Run(const ScopePtr& scope,
const platform::DeviceContext& dev_ctx) const override {
void InferShape(const std::shared_ptr<pd::Scope>& scope) const override {
++infer_shape_cnt;
}
void Run(const std::shared_ptr<pd::Scope>& scope,
const paddle::platform::DeviceContext& dev_ctx) const override {
++run_cnt;
}
};
......
......@@ -250,7 +250,7 @@ class OpRegistry {
grad_creators()[op_type] = [] { return new OpType; };
}
static OperatorPtr CreateOp(const std::string& type,
static std::shared_ptr<OperatorBase> CreateOp(const std::string& type,
const VarNameList& inputs,
const VarNameList& outputs,
const AttributeMap& attrs) {
......@@ -276,10 +276,10 @@ class OpRegistry {
}
op->Init();
return OperatorPtr(op);
return std::shared_ptr<OperatorBase>(op);
}
static OperatorPtr CreateOp(const OpDesc& op_desc) {
static std::shared_ptr<OperatorBase> CreateOp(const OpDesc& op_desc) {
std::vector<std::string> inputs;
inputs.reserve((size_t)op_desc.inputs_size());
std::copy(op_desc.inputs().begin(), op_desc.inputs().end(),
......
......@@ -7,9 +7,9 @@ namespace paddle {
namespace framework {
class CosineOp : public OperatorBase {
public:
void Run(const ScopePtr& scope,
void Run(const std::shared_ptr<Scope>& scope,
const platform::DeviceContext& dev_ctx) const override {}
void InferShape(const ScopePtr& scope) const override {}
void InferShape(const std::shared_ptr<Scope>& scope) const override {}
};
class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
......@@ -27,8 +27,8 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
class MyTestOp : public OperatorBase {
public:
void InferShape(const ScopePtr& scope) const override {}
void Run(const ScopePtr& scope,
void InferShape(const std::shared_ptr<Scope>& scope) const override {}
void Run(const std::shared_ptr<Scope>& scope,
const platform::DeviceContext& dev_ctx) const override {}
};
......@@ -67,7 +67,7 @@ TEST(OpRegistry, CreateOp) {
attr->set_type(paddle::framework::AttrType::FLOAT);
attr->set_f(scale);
paddle::framework::OperatorPtr op =
std::shared_ptr<paddle::framework::OperatorBase> op =
paddle::framework::OpRegistry::CreateOp(op_desc);
auto scope = std::make_shared<paddle::framework::Scope>();
paddle::platform::CPUDeviceContext dev_ctx;
......@@ -89,7 +89,6 @@ TEST(OpRegistry, IllegalAttr) {
bool caught = false;
try {
paddle::framework::OperatorPtr op __attribute__((unused)) =
paddle::framework::OpRegistry::CreateOp(op_desc);
} catch (std::runtime_error& err) {
caught = true;
......@@ -110,7 +109,7 @@ TEST(OpRegistry, DefaultValue) {
ASSERT_TRUE(op_desc.IsInitialized());
paddle::framework::OperatorPtr op =
std::shared_ptr<paddle::framework::OperatorBase> op =
paddle::framework::OpRegistry::CreateOp(op_desc);
auto scope = std::make_shared<paddle::framework::Scope>();
paddle::platform::CPUDeviceContext dev_ctx;
......@@ -136,7 +135,6 @@ TEST(OpRegistry, CustomChecker) {
// attr 'test_attr' is not set
bool caught = false;
try {
paddle::framework::OperatorPtr op __attribute__((unused)) =
paddle::framework::OpRegistry::CreateOp(op_desc);
} catch (std::runtime_error& err) {
caught = true;
......@@ -155,7 +153,6 @@ TEST(OpRegistry, CustomChecker) {
attr->set_i(3);
caught = false;
try {
paddle::framework::OperatorPtr op __attribute__((unused)) =
paddle::framework::OpRegistry::CreateOp(op_desc);
} catch (std::runtime_error& err) {
caught = true;
......@@ -174,8 +171,7 @@ TEST(OpRegistry, CustomChecker) {
attr->set_type(paddle::framework::AttrType::INT);
attr->set_i(4);
SetInputFormat(&op_desc);
paddle::framework::OperatorPtr op =
paddle::framework::OpRegistry::CreateOp(op_desc);
auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
paddle::platform::CPUDeviceContext dev_ctx;
auto scope = std::make_shared<paddle::framework::Scope>();
op->Run(scope, dev_ctx);
......
......@@ -47,7 +47,6 @@ struct EigenDeviceConverter<platform::GPUPlace> {
#endif
class OperatorBase;
using OperatorPtr = std::shared_ptr<OperatorBase>;
/**
* OperatorBase has the basic element that Net will call to do computation.
* Only CreateOperator from OpRegistry will new Operator directly. User
......@@ -85,10 +84,10 @@ class OperatorBase {
/// InferShape infer the size of Variables used by this Operator with
/// information inside scope
virtual void InferShape(const ScopePtr& scope) const = 0;
virtual void InferShape(const std::shared_ptr<Scope>& scope) const = 0;
/// Net will call this function to Run an op.
virtual void Run(const ScopePtr& scope,
virtual void Run(const std::shared_ptr<Scope>& scope,
const platform::DeviceContext& dev_ctx) const = 0;
// Get a input with argument's name described in `op_proto`
......@@ -213,7 +212,7 @@ class OperatorWithKernel : public OperatorBase {
using OpKernelMap =
std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;
void Run(const ScopePtr& scope,
void Run(const std::shared_ptr<Scope>& scope,
const platform::DeviceContext& dev_ctx) const final {
auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
opKernel->Compute(KernelContext(this, scope, dev_ctx));
......
......@@ -24,8 +24,8 @@ static int op_run_num = 0;
class OpWithoutKernelTest : public OperatorBase {
public:
void Init() override { x = 1; }
void InferShape(const ScopePtr& scope) const override {}
void Run(const ScopePtr& scope,
void InferShape(const std::shared_ptr<Scope>& scope) const override {}
void Run(const std::shared_ptr<Scope>& scope,
const platform::DeviceContext& dev_ctx) const override {
op_run_num++;
ASSERT_EQ((int)inputs_.size(), 1);
......@@ -70,8 +70,7 @@ TEST(OperatorBase, all) {
paddle::platform::CPUDeviceContext device_context;
auto scope = std::make_shared<paddle::framework::Scope>();
paddle::framework::OperatorPtr op =
paddle::framework::OpRegistry::CreateOp(op_desc);
auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
scope->CreateVariable("OUT1");
ASSERT_EQ(paddle::framework::op_run_num, 0);
op->Run(scope, device_context);
......@@ -189,8 +188,7 @@ TEST(OpKernel, all) {
paddle::platform::CPUDeviceContext cpu_device_context;
auto scope = std::make_shared<paddle::framework::Scope>();
paddle::framework::OperatorPtr op =
paddle::framework::OpRegistry::CreateOp(op_desc);
auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0);
op->Run(scope, cpu_device_context);
ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1);
......@@ -236,6 +234,6 @@ TEST(OpKernel, multi_inputs) {
paddle::platform::CPUDeviceContext cpu_device_context;
auto scope = std::make_shared<Scope>();
OperatorPtr op(paddle::framework::OpRegistry::CreateOp(op_desc));
auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
op->Run(scope, cpu_device_context);
}
......@@ -24,7 +24,6 @@ namespace paddle {
namespace framework {
class Scope;
using ScopePtr = std::shared_ptr<Scope>;
/**
* @brief Scope that manage all variables.
......@@ -44,7 +43,7 @@ class Scope {
/**
* @brief Initialize a Scope with parent.
*/
explicit Scope(const ScopePtr& parent) : parent_(parent) {}
explicit Scope(const std::shared_ptr<Scope>& parent) : parent_(parent) {}
/**
* @brief Create Variable
......@@ -91,7 +90,7 @@ class Scope {
private:
std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
ScopePtr parent_{nullptr};
std::shared_ptr<Scope> parent_{nullptr};
};
} // namespace framework
......
......@@ -48,25 +48,27 @@ class Tensor {
template <typename T>
const T* data() const {
CheckDims<T>();
EnforceSufficientMemory<T>();
return reinterpret_cast<const T*>(
reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
}
template <typename T>
T* data() {
CheckDims<T>();
EnforceSufficientMemory<T>();
return reinterpret_cast<T*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
offset_);
}
template <typename T>
template <typename T, // must be POD types
typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
T* mutable_data(DDim dims, platform::Place place) {
set_dims(dims);
Resize(dims);
return mutable_data<T>(place);
}
template <typename T>
template <typename T, // must be POD types
typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
T* mutable_data(platform::Place place) {
PADDLE_ENFORCE(product(dims_) > 0,
"Tensor's numel must be larger than zero to call "
......@@ -95,11 +97,9 @@ class Tensor {
}
template <typename T>
void ShareDataFrom(const Tensor& src) {
src.CheckDims<T>();
holder_ = src.holder_;
set_dims(src.dims());
offset_ = src.offset_;
void ShareDataWith(const Tensor& src) {
src.EnforceSufficientMemory<T>();
*this = src;
}
template <typename T>
......@@ -107,9 +107,9 @@ class Tensor {
PADDLE_ENFORCE(platform::is_cpu_place(src.holder_->place()) &&
platform::is_cpu_place(dst_place),
"Tensor::CopyFrom only support CPU now.");
src.CheckDims<T>();
src.EnforceSufficientMemory<T>();
size_t size = product(src.dims_) * sizeof(T);
set_dims(src.dims());
Resize(src.dims());
const void* src_ptr = static_cast<const void*>(src.data<T>());
void* dst_ptr = static_cast<void*>(mutable_data<T>(dst_place));
memcpy(dst_ptr, src_ptr, size);
......@@ -117,34 +117,25 @@ class Tensor {
template <typename T>
Tensor Slice(const int& begin_idx, const int& end_idx) const {
CheckDims<T>();
PADDLE_ENFORCE(begin_idx >= 0 && end_idx <= dims_[0],
"Slice index is less than zero or out of bound.");
EnforceSufficientMemory<T>();
PADDLE_ENFORCE(begin_idx >= 0, "Slice begin index is less than zero.");
PADDLE_ENFORCE(end_idx <= dims_[0], "Slice end index is out of bound.");
PADDLE_ENFORCE(begin_idx < end_idx,
"Begin index must be less than end index.");
PADDLE_ENFORCE(dims_[0] != 1, "Can not slice a tensor with dims_[0] = 1.");
std::vector<int> d = vectorize(dims_);
int base = 1;
for (size_t i = 1; i < d.size(); ++i) {
base *= d[i];
}
int base = product(dims_) / dims_[0];
Tensor dst;
dst.holder_ = holder_;
DDim dst_dims = dims_;
dst_dims[0] = end_idx - begin_idx;
dst.set_dims(dst_dims);
dst.Resize(dst_dims);
dst.offset_ = offset_ + begin_idx * base * sizeof(T);
return dst;
}
void set_dims(const DDim& dims) {
if (dims == dims_) {
return;
}
dims_ = dims;
}
void Resize(const DDim& dims) { dims_ = dims; }
DDim dims() const { return dims_; }
const DDim& dims() const { return dims_; }
private:
// Placeholder hides type T, so it doesn't appear as a template
......@@ -159,21 +150,9 @@ class Tensor {
template <typename T, typename PlaceType>
struct PlaceholderImpl : public Placeholder {
private:
template <typename PType>
class Deleter {
public:
Deleter(PType place) : place_(place) {}
void operator()(T* ptr) { memory::Free(place_, static_cast<void*>(ptr)); }
private:
PType place_;
};
public:
PlaceholderImpl(PlaceType place, size_t size)
: ptr_(static_cast<T*>(memory::Alloc(place, size)),
Deleter<PlaceType>(place)),
memory::PODDeleter<T, PlaceType>(place)),
place_(place),
size_(size) {}
......@@ -182,13 +161,13 @@ class Tensor {
virtual paddle::platform::Place place() const { return place_; }
virtual std::type_index type() const { return std::type_index(typeid(T)); }
std::unique_ptr<T, Deleter<PlaceType>> ptr_;
std::unique_ptr<T, memory::PODDeleter<T, PlaceType>> ptr_;
platform::Place place_; // record the place of ptr_.
size_t size_; // size of the memory block.
};
template <typename T>
inline void CheckDims() const {
inline void EnforceSufficientMemory() const {
PADDLE_ENFORCE(holder_ != nullptr,
"Tenosr holds no memory. Call Tensor::mutable_data first.");
PADDLE_ENFORCE(holder_->size() >= product(dims_) * sizeof(T) + offset_,
......@@ -198,7 +177,11 @@ class Tensor {
std::shared_ptr<Placeholder> holder_; // holds the memory block if allocated.
DDim dims_;
size_t offset_; // marks the begin of tensor data area.
// A PlaceHolder may be shared by more than one tensor. Some of them may be
// slices of the others. So the offset_ is introduced here to indicate the
// byte offset between PlaceHolder::ptr_ and where tensor's data really
// begins.
size_t offset_;
};
} // namespace framework
......
......@@ -19,7 +19,7 @@ TEST(Tensor, Dims) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor tt;
tt.set_dims(make_ddim({2, 3, 4}));
tt.Resize(make_ddim({2, 3, 4}));
DDim dims = tt.dims();
ASSERT_EQ(arity(dims), 3);
for (int i = 0; i < 3; ++i) {
......@@ -97,7 +97,7 @@ TEST(Tensor, MutableData) {
#endif
}
TEST(Tensor, ShareDataFrom) {
TEST(Tensor, ShareDataWith) {
using namespace paddle::framework;
using namespace paddle::platform;
{
......@@ -106,7 +106,7 @@ TEST(Tensor, ShareDataFrom) {
// Try to share data form uninitialized tensor
bool caught = false;
try {
dst_tensor.ShareDataFrom<float>(src_tensor);
dst_tensor.ShareDataWith<float>(src_tensor);
} catch (std::runtime_error& err) {
caught = true;
std::string msg =
......@@ -119,7 +119,7 @@ TEST(Tensor, ShareDataFrom) {
ASSERT_TRUE(caught);
src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), CPUPlace());
dst_tensor.ShareDataFrom<int>(src_tensor);
dst_tensor.ShareDataWith<int>(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
}
......@@ -128,7 +128,7 @@ TEST(Tensor, ShareDataFrom) {
Tensor src_tensor;
Tensor dst_tensor;
src_tensor.mutable_data<int>(make_ddim({2, 3, 4}), GPUPlace());
dst_tensor.ShareDataFrom<int>(src_tensor);
dst_tensor.ShareDataWith<int>(src_tensor);
ASSERT_EQ(src_tensor.data<int>(), dst_tensor.data<int>());
}
#endif
......
......@@ -31,13 +31,22 @@ public:
ConvolutionTest(const std::string& conv1,
const std::string& conv2,
TestType type,
bool useGroups = true,
std::string algo = "auto") {
for (size_t batchSize : {1, 32}) {
for (size_t inputSize : {7, 14, 54}) {
for (size_t filterSize : {1, 3, 5}) {
for (size_t inputChannels : {3, 64}) {
for (size_t outputChannels : {3, 64, 128}) {
if (inputChannels < outputChannels) break;
for (size_t outputChannels : {3, 64}) {
if (inputChannels > outputChannels) break;
size_t groups;
if (!useGroups) {
groups = 1;
} else {
if (outputChannels % inputChannels != 0) continue;
groups = inputChannels;
}
for (size_t stride : {1, 2}) {
for (size_t padding : {0, 1}) {
if (padding >= filterSize) break;
......@@ -62,13 +71,24 @@ public:
FuncConfig()
.set("paddings", paddings)
.set("strides", strides)
.set("groups", (size_t)1)
.set("groups", groups)
.set("algo", algo));
TensorShape input{
batchSize, inputChannels, inputSize, inputSize};
TensorShape filter{
outputChannels, inputChannels, filterSize, filterSize};
TensorShape filter;
if (groups > 1)
filter = TensorShape({groups,
outputChannels / groups,
inputChannels / groups,
filterSize,
filterSize});
else
filter = TensorShape({outputChannels,
inputChannels,
filterSize,
filterSize});
TensorShape output{
batchSize, outputChannels, outputSize, outputSize};
......@@ -85,7 +105,8 @@ public:
} else if (type == kBackwardFilterTest) {
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter));
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter),
ADD_TO);
test.run();
}
}
......@@ -106,6 +127,7 @@ public:
ConvolutionTest2(const std::string& conv1,
const std::string& conv2,
TestType type,
bool useGroups = true,
std::string algo = "auto") {
for (size_t batchSize : {16}) {
for (size_t inputHeight : {7, 31}) {
......@@ -113,7 +135,15 @@ public:
for (size_t filterHeight : {1, 5}) {
for (size_t filterWidth : {3, 7}) {
for (size_t inputChannels : {7}) {
for (size_t outputChannels : {32}) {
for (size_t outputChannels : {7}) {
size_t groups;
if (!useGroups) {
groups = 1;
} else {
if (outputChannels % inputChannels != 0) continue;
groups = inputChannels;
}
size_t stride = 1;
size_t padding = 0;
size_t outputHeight =
......@@ -141,13 +171,24 @@ public:
FuncConfig()
.set("paddings", paddings)
.set("strides", strides)
.set("groups", (size_t)1)
.set("groups", groups)
.set("algo", algo));
TensorShape input{
batchSize, inputChannels, inputHeight, inputWidth};
TensorShape filter{
outputChannels, inputChannels, filterHeight, filterWidth};
TensorShape filter;
if (groups > 1)
filter = TensorShape({groups,
outputChannels / groups,
inputChannels / groups,
filterHeight,
filterWidth});
else
filter = TensorShape({outputChannels,
inputChannels,
filterHeight,
filterWidth});
TensorShape output{
batchSize, outputChannels, outputHeight, outputWidth};
......@@ -164,7 +205,8 @@ public:
} else if (type == kBackwardFilterTest) {
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter));
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter),
ADD_TO);
test.run();
}
}
......@@ -177,34 +219,88 @@ public:
}
};
// ======Start Convolution TEST======
TEST(Forward, GEMM) {
ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test(
"NaiveConv-CPU", "GemmConv-CPU", kForwardTest);
"NaiveConv-CPU", "GemmConv-CPU", kForwardTest, false);
ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test2(
"NaiveConv-CPU", "GemmConv-CPU", kForwardTest);
"NaiveConv-CPU", "GemmConv-CPU", kForwardTest, false);
}
#ifndef PADDLE_ONLY_CPU
TEST(Forward, GEMM2) {
ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
"GemmConv-CPU", "GemmConv-GPU", kForwardTest);
"GemmConv-CPU", "GemmConv-GPU", kForwardTest, false);
ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
"GemmConv-CPU", "GemmConv-GPU", kForwardTest);
"GemmConv-CPU", "GemmConv-GPU", kForwardTest, false);
}
TEST(BackwardInput, GEMM) {
ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
"GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest);
"GemmConvGradInput-CPU",
"GemmConvGradInput-GPU",
kBackwardInputTest,
false);
ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
"GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest);
"GemmConvGradInput-CPU",
"GemmConvGradInput-GPU",
kBackwardInputTest,
false);
}
TEST(BackwardFilter, GEMM) {
ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
"GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest);
"GemmConvGradFilter-CPU",
"GemmConvGradFilter-GPU",
kBackwardFilterTest,
false);
ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
"GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest);
"GemmConvGradFilter-CPU",
"GemmConvGradFilter-GPU",
kBackwardFilterTest,
false);
}
#endif
// ======End Convolution TEST======
// ======Start DepthwiseConvolution TEST======
// TODO(zhaolong) The depthwise convolution cpu test will be added when the cpu
// version of depthwiseConv is implemented.
#ifndef PADDLE_ONLY_CPU
TEST(DepthwiseConvForward, GEMM2) {
ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
"GemmConv-CPU", "DepthwiseConv-GPU", kForwardTest);
ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
"GemmConv-CPU", "DepthwiseConv-GPU", kForwardTest);
}
TEST(DepthwiseConvBackwardInput, GEMM) {
ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
"GemmConvGradInput-CPU",
"DepthwiseConvGradInput-GPU",
kBackwardInputTest);
ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
"GemmConvGradInput-CPU",
"DepthwiseConvGradInput-GPU",
kBackwardInputTest);
}
TEST(DepthwiseConvBackwardFilter, GEMM) {
ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
"GemmConvGradFilter-CPU",
"DepthwiseConvGradFilter-GPU",
kBackwardFilterTest);
ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
"GemmConvGradFilter-CPU",
"DepthwiseConvGradFilter-GPU",
kBackwardFilterTest);
}
#endif
// ======End DepthwiseConvolution TEST======
} // namespace paddle
......@@ -39,7 +39,8 @@ void Crop<DEVICE_TYPE_GPU>(real* outputs,
const TensorShape inShape,
const TensorShape outShape,
const FuncConfig& conf) {
std::vector<uint32_t> crop_corner = conf.get<std::vector<uint32_t>>("crop_corner");
std::vector<uint32_t> crop_corner =
conf.get<std::vector<uint32_t>>("crop_corner");
int cropC = crop_corner[1];
int cropH = crop_corner[2];
int cropW = crop_corner[3];
......@@ -74,7 +75,8 @@ __global__ void KeCropDiff(const real* inGrad, real* outGrad,
const int c = (idx / inW / inH) % inC;
const int n = idx / inW / inH / inC;
const int off = ((n * outC + c + cropC) * outH + h + cropH) * outW + cropW + w;
const int off =
((n * outC + c + cropC) * outH + h + cropH) * outW + cropW + w;
outGrad[off] += inGrad[idx];
}
......@@ -86,7 +88,8 @@ void CropGrad<DEVICE_TYPE_GPU>(const real* inGrad,
const TensorShape inShape,
const TensorShape outShape,
const FuncConfig& conf) {
std::vector<uint32_t> crop_corner = conf.get<std::vector<uint32_t>>("crop_corner");
std::vector<uint32_t> crop_corner =
conf.get<std::vector<uint32_t>>("crop_corner");
int cropC = crop_corner[1];
int cropH = crop_corner[2];
int cropW = crop_corner[3];
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "DepthwiseConvOp.h"
#include "ConvOp.h"
#include "GemmFunctor.h"
namespace paddle {
template <class T>
class DepthwiseConvFunctor<DEVICE_TYPE_CPU, T> {
public:
void operator()(const T* inputData,
const T* filterData,
int batchSize,
int outputChannels,
int outputHeight,
int outputWidth,
int inputChannels,
int inputHeight,
int inputWidth,
int filterMultiplier,
int filterHeight,
int filterWidth,
int strideH,
int strideW,
int paddingH,
int paddingW,
T* outputData) {
// TODO(zhaolong) : cpu implementation of depthwise convolution
}
};
template <class T>
class DepthwiseConvGradInputFunctor<DEVICE_TYPE_CPU, T> {
public:
void operator()(const T* outputGrad,
const T* filterData,
int batchSize,
int outputChannels,
int outputHeight,
int outputWidth,
int inputChannels,
int inputHeight,
int inputWidth,
int filterMultiplier,
int filterHeight,
int filterWidth,
int strideH,
int strideW,
int paddingH,
int paddingW,
T* inputGrad) {}
// TODO(zhaolong) : cpu implementation of depthwise convolution
};
template <class T>
class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_CPU, T> {
public:
void operator()(const T* outputGrad,
const T* inputData,
int batchSize,
int outputChannels,
int outputHeight,
int outputWidth,
int inputChannels,
int inputHeight,
int inputWidth,
int filterMultiplier,
int filterHeight,
int filterWidth,
int strideH,
int strideW,
int paddingH,
int paddingW,
T* colData,
T* filterGrad) {}
// TODO(zhaolong) : cpu implementation of depthwise convolution
};
/*
* \brief Forward calculation of depthwise convolution.
*/
template <DeviceType Device>
class DepthwiseConvFunction : public ConvFunctionBase {
public:
void init(const FuncConfig& config) override {
ConvFunctionBase::init(config);
}
void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
const TensorShape& input = inputs[0].shape();
const TensorShape& filter = inputs[1].shape();
const TensorShape& output = outputs[0].shape();
checkShape(input, filter, output);
}
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ(numInputs_, inputs.size());
CHECK_EQ(numOutputs_, outputs.size());
check(inputs, outputs);
const TensorShape& input = inputs[0].shape();
const TensorShape& filter = inputs[1].shape();
const TensorShape& output = outputs[0].shape();
size_t batchSize = input[0];
size_t inputChannels = input[1];
size_t inputHeight = input[2];
size_t inputWidth = input[3];
size_t filterHeight = getFilterHeight(filter);
size_t filterWidth = getFilterWidth(filter);
size_t outputChannels = output[1];
size_t outputHeight = output[2];
size_t outputWidth = output[3];
size_t filterMultiplier = outputChannels / groups_;
CHECK_EQ(inputChannels, groups_);
real* inputData = inputs[0].data<real>();
real* filterData = inputs[1].data<real>();
real* outputData = outputs[0].data<real>();
DepthwiseConvFunctor<Device, real> depthwiseConv;
depthwiseConv(inputData,
filterData,
batchSize,
outputChannels,
outputHeight,
outputWidth,
inputChannels,
inputHeight,
inputWidth,
filterMultiplier,
filterHeight,
filterWidth,
strideH(),
strideW(),
paddingH(),
paddingW(),
outputData);
}
};
/*
* \brief Backward input calculation of depthwise convolution.
*/
template <DeviceType Device>
class DepthwiseConvGradInputFunction : public ConvFunctionBase {
public:
void init(const FuncConfig& config) override {
ConvFunctionBase::init(config);
}
void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
const TensorShape& output = inputs[0].shape();
const TensorShape& filter = inputs[1].shape();
const TensorShape& input = outputs[0].shape();
checkShape(input, filter, output);
}
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ(numInputs_, inputs.size());
CHECK_EQ(numOutputs_, outputs.size());
CHECK_EQ(outputs[0].getArgType(), ADD_TO);
check(inputs, outputs);
CHECK_EQ(outputs[0].getArgType(), ADD_TO);
const TensorShape& output = inputs[0].shape();
const TensorShape& filter = inputs[1].shape();
const TensorShape& input = outputs[0].shape();
size_t batchSize = input[0];
size_t inputChannels = input[1];
size_t inputHeight = input[2];
size_t inputWidth = input[3];
size_t filterHeight = getFilterHeight(filter);
size_t filterWidth = getFilterWidth(filter);
size_t outputChannels = output[1];
size_t outputHeight = output[2];
size_t outputWidth = output[3];
size_t filterMultiplier = outputChannels / groups_;
CHECK_EQ(inputChannels, groups_);
real* outputGrad = inputs[0].data<real>();
real* filterData = inputs[1].data<real>();
real* inputGrad = outputs[0].data<real>();
DepthwiseConvGradInputFunctor<Device, real> depthwiseConvGradInput;
depthwiseConvGradInput(outputGrad,
filterData,
batchSize,
outputChannels,
outputHeight,
outputWidth,
inputChannels,
inputHeight,
inputWidth,
filterMultiplier,
filterHeight,
filterWidth,
strideH(),
strideW(),
paddingH(),
paddingW(),
inputGrad);
}
};
/*
* \brief Backward filter calculation of depthwise convolution.
*/
template <DeviceType Device>
class DepthwiseConvGradFilterFunction : public ConvFunctionBase {
public:
void init(const FuncConfig& config) override {
ConvFunctionBase::init(config);
}
void check(const BufferArgs& inputs, const BufferArgs& outputs) override {
const TensorShape& output = inputs[0].shape();
const TensorShape& input = inputs[1].shape();
const TensorShape& filter = outputs[0].shape();
checkShape(input, filter, output);
}
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ(numInputs_, inputs.size());
CHECK_EQ(numOutputs_, outputs.size());
CHECK_EQ(outputs[0].getArgType(), ADD_TO);
check(inputs, outputs);
const TensorShape& output = inputs[0].shape();
const TensorShape& input = inputs[1].shape();
const TensorShape& filter = outputs[0].shape();
size_t batchSize = input[0];
size_t inputChannels = input[1];
size_t inputHeight = input[2];
size_t inputWidth = input[3];
size_t filterHeight = getFilterHeight(filter);
size_t filterWidth = getFilterWidth(filter);
size_t outputChannels = output[1];
size_t outputHeight = output[2];
size_t outputWidth = output[3];
size_t filterMultiplier = outputChannels / groups_;
CHECK_EQ(inputChannels, groups_);
real* outputGrad = inputs[0].data<real>();
real* inputData = inputs[1].data<real>();
real* filterGrad = outputs[0].data<real>();
int size = outputChannels * filterHeight * filterWidth * outputHeight *
outputWidth;
resizeBuffer<Device>(size);
real* colData = reinterpret_cast<real*>(memory_->getBuf());
DepthwiseConvGradFilterFunctor<Device, real> depthwiseConvGradFilter;
depthwiseConvGradFilter(outputGrad,
inputData,
batchSize,
outputChannels,
outputHeight,
outputWidth,
inputChannels,
inputHeight,
inputWidth,
filterMultiplier,
filterHeight,
filterWidth,
strideH(),
strideW(),
paddingH(),
paddingW(),
colData,
filterGrad);
}
};
REGISTER_TYPED_FUNC(DepthwiseConv, CPU, DepthwiseConvFunction);
REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
CPU,
DepthwiseConvGradInputFunction);
REGISTER_TYPED_FUNC(DepthwiseConvGradFilter,
CPU,
DepthwiseConvGradFilterFunction);
#ifndef PADDLE_ONLY_CPU
REGISTER_TYPED_FUNC(DepthwiseConv, GPU, DepthwiseConvFunction);
REGISTER_TYPED_FUNC(DepthwiseConvGradInput,
GPU,
DepthwiseConvGradInputFunction);
REGISTER_TYPED_FUNC(DepthwiseConvGradFilter,
GPU,
DepthwiseConvGradFilterFunction);
#endif
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "TensorType.h"
namespace paddle {
/**
*\brief Depthwise convolution forward. The outputData
* of depthwise convolution is same with ExpandConvLayer
* when groups equals inputChannels in ExpandConvLayer.
*
* \param[in] inputData input data.
* \param[in] filterData the Paramters of the depthwise conv layer..
* \param[in] batchSize batch size of input data.
* \param[in] outputChannels channels of outputData.
* \param[in] outputHeight height of outputData.
* \param[in] outputWidth width of outputData.
* \param[in] inputChannels channels of inputData.
* \param[in] inputHeight height of inputData.
* \param[in] inputWidth width of inputData..
* \param[in] filterMultiplier equals to outputChannels/groups_.
* \param[in] filterHeight height of filter.
* \param[in] filterWidth widht of filter.
* \param[in] strideH stride size in height direction.
* \param[in] strideW stride size in width direction.
* \param[in] paddingH padding size in height direction.
* \param[in] paddingW padding size in width direction.
* \param[out] outputData outputData.
*
*/
template <DeviceType Device, class T>
class DepthwiseConvFunctor {
public:
void operator()(const T* inputData,
const T* filterData,
int batchSize,
int outputChannels,
int outputHeight,
int outputWidth,
int inputChannels,
int inputHeight,
int inputWidth,
int filterMultiplier,
int filterHeight,
int filterWidth,
int strideH,
int strideW,
int paddingH,
int paddingW,
T* outputData);
};
/**
*\brief Functor tot compute the depthwise convolution backprop w.r.t input.
*
*
* \param[in] outputGradData the grad data of output.
* \param[in] filterData the Paramters of the depthwise conv layer..
* \param[in] batchSize batch size of input data.
* \param[in] outputChannels channels of outputData.
* \param[in] outputHeight height of outputData.
* \param[in] outputWidth width of outputData.
* \param[in] inputChannels channels of input data.
* \param[in] inputHeight height of inputData.
* \param[in] inputWidth width of inputData.
* \param[in] filterMultiplier equals to outputChannels/groups_.
* \param[in] filterHeight height of filter.
* \param[in] filterWidth widht of filter.
* \param[in] strideH stride size in height direction.
* \param[in] strideW stride size in width direction.
* \param[in] paddingH padding size in height direction.
* \param[in] paddingW padding size in width direction.
* \param[out] inputGrad the grad data of input.
*
*/
template <DeviceType Device, class T>
class DepthwiseConvGradInputFunctor {
public:
void operator()(const T* outputGrad,
const T* filterData,
int batchSize,
int outputChannels,
int outputHeight,
int outputWidth,
int inputChannels,
int inputHeight,
int inputWidth,
int filterMultiplier,
int filterHeight,
int filterWidth,
int strideH,
int strideW,
int paddingH,
int paddingW,
T* inputGrad);
};
/**
*\brief Functor tot compute the depthwise convolution backprop w.r.t filter.
*
* \param[in] outputGradData the grad data of output.
* \param[in] inputData inputData.
* \param[in] batchSize batch size of input data.
* \param[in] outputChannels channels of outputData.
* \param[in] outputHeight height of outputData.
* \param[in] outputWidth width of outputData.
* \param[in] inputChannels channels of input data.
* \param[in] inputHeight height of inputData.
* \param[in] inputWidth width of inputData.
* \param[in] filterMultiplier equals to outputChannels/groups_.
* \param[in] filterHeight height of filter.
* \param[in] filterWidth widht of filter.
* \param[in] strideH stride size in height direction.
* \param[in] strideW stride size in width direction.
* \param[in] paddingH padding size in height direction.
* \param[in] paddingW padding size in width direction.
* \param[in] colData Auxiliary data when calculating filterGrad.
* \param[in] multiplierData Auxiliary data when calculating filterGrad.
* \param[out] filterGrad the grad data of filter.
*
*/
template <DeviceType Device, class T>
class DepthwiseConvGradFilterFunctor {
public:
void operator()(const T* outputGrad,
const T* inputData,
int batchSize,
int outputChannels,
int outputHeight,
int outputWidth,
int inputChannels,
int inputHeight,
int inputWidth,
int filterMultiplier,
int filterHeight,
int filterWidth,
int strideH,
int strideW,
int paddingH,
int paddingW,
T* colData,
T* filterGrad);
};
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "DepthwiseConvOp.h"
#include "GemmFunctor.h"
#include "paddle/math/BaseMatrix.h"
namespace paddle {
// CUDA kernel to compute the depthwise convolution forward pass
template <class T>
__global__
void ConvolutionDepthwiseForward(const int nthreads,
const T* const inputData, const T* const filterData,
const int batchSize, const int outputChannels, const int outputHeight,
const int outputWidth, const int inputChannels, const int inputHeight,
const int inputWidth, const int filterMultiplier, const int filterHeight,
const int filterWidth, const int strideH, const int strideW,
const int paddingH, const int paddingW, T* const outputData) {
int index =
(blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < nthreads) {
const int batch = index / outputChannels / outputHeight / outputWidth;
const int c_out = (index / outputHeight / outputWidth) % outputChannels;
const int h_out = (index / outputWidth) % outputHeight;
const int w_out = index % outputWidth;
const int c_in = c_out / filterMultiplier;
const T* weight = filterData + c_out * filterHeight * filterWidth;
T value = 0;
const int h_in_start = -paddingH + h_out * strideH;
const int w_in_start = -paddingW + w_out * strideW;
const int h_in_end = -paddingH + h_out * strideH + filterHeight - 1;
const int w_in_end = -paddingW + w_out * strideW + filterWidth - 1;
if ((h_in_start >= 0) && (h_in_end < inputHeight)
&& (w_in_start >= 0) && (w_in_end < inputWidth)) {
for (int kh = 0; kh < filterHeight; ++kh) {
for (int kw = 0; kw < filterWidth; ++kw) {
const int h_in = -paddingH + h_out * strideH + kh;
const int w_in = -paddingW + w_out * strideW + kw;
const int offset = ((batch * inputChannels + c_in)
* inputHeight + h_in) * inputWidth + w_in;
value += (*weight) * inputData[offset];
++weight;
}
}
} else {
for (int kh = 0; kh < filterHeight; ++kh) {
for (int kw = 0; kw < filterWidth; ++kw) {
const int h_in = -paddingH + h_out * strideH + kh;
const int w_in = -paddingW + w_out * strideW + kw;
if ((h_in >= 0) && (h_in < inputHeight)
&& (w_in >= 0) && (w_in < inputWidth)) {
const int offset = ((batch * inputChannels + c_in)
* inputHeight + h_in) * inputWidth + w_in;
value += (*weight) * inputData[offset];
}
++weight;
}
}
}
outputData[index] = value;
}
}
// CUDA kernel to compute the depthwise convolution backprop w.r.t input.
template <class T>
__global__
void ConvolutionDepthwiseInputBackward(const int nthreads,
const T* const top_diff, const T* const weight_data,
const int num, const int outputChannels, const int outputHeight,
const int outputWidth, const int inputChannels, const int inputHeight,
const int inputWidth, const int filterMultiplier, const int filterHeight,
const int filterWidth, const int strideH, const int strideW,
const int paddingH, const int paddingW, T* const bottom_diff) {
int index =
(blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < nthreads) {
const int batch = index / inputChannels / inputHeight / inputWidth;
const int c_in = (index / inputHeight / inputWidth) % inputChannels;
const int h_in = (index / inputWidth) % inputHeight;
const int w_in = index % inputWidth;
const int c_out_start = c_in * filterMultiplier;
int h_out_start = (h_in - filterHeight + paddingH + strideH)/strideH;
h_out_start = 0 > h_out_start ? 0 : h_out_start;
int h_out_end = (h_in + paddingH)/strideH;
h_out_end = outputHeight - 1 < h_out_end? outputHeight - 1 : h_out_end;
int w_out_start = (w_in - filterWidth + paddingW + strideW)/strideW;
w_out_start = 0 > w_out_start ? 0 : w_out_start;
int w_out_end = (w_in + paddingW)/strideW;
w_out_end = outputWidth - 1 < w_out_end? outputWidth - 1 : w_out_end;
T value = 0;
for (int c_out = c_out_start;
c_out < c_out_start + filterMultiplier; c_out ++) {
for (int h_out = h_out_start; h_out <= h_out_end; ++h_out) {
const int filter_h = h_in + paddingH - h_out * strideH;
for (int w_out = w_out_start; w_out <= w_out_end; ++w_out) {
const int filter_w = w_in + paddingW - w_out * strideW;
const int filter_offset = c_out * filterHeight * filterWidth
+ filter_h * filterWidth + filter_w;
const int top_diff_offset = ((batch * outputChannels + c_out) *
outputHeight + h_out)* outputWidth + w_out;
value += top_diff[top_diff_offset] * weight_data[filter_offset];
}
}
}
bottom_diff[index] += value;
}
}
// CUDA kernel to compute the depthwise convolution backprop w.r.t filter.
template <class T>
__global__
void ConvolutionDepthwiseFilterBackward(const int num_i, const int nthreads,
const T* const top_diff, const T* const inputData,
const int num, const int outputChannels, const int outputHeight,
const int outputWidth, const int inputChannels, const int inputHeight,
const int inputWidth, const int filterMultiplier, const int filterHeight,
const int filterWidth, const int strideH, const int strideW,
const int paddingH, const int paddingW, T* const buffer_data) {
int index =
(blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < nthreads) {
const int h_out = (index / outputWidth) % outputHeight;
const int w_out = index % outputWidth;
const int kh = (index / filterWidth / outputHeight / outputWidth)
% filterHeight;
const int kw = (index / outputHeight / outputWidth) % filterWidth;
const int h_in = -paddingH + h_out * strideH + kh;
const int w_in = -paddingW + w_out * strideW + kw;
if ((h_in >= 0) && (h_in < inputHeight)
&& (w_in >= 0) && (w_in < inputWidth)) {
const int c_out = index /
(filterHeight * filterWidth * outputHeight * outputWidth);
const int c_in = c_out / filterMultiplier;
const int batch = num_i;
const int top_offset = ((batch * outputChannels + c_out) *
outputHeight + h_out) * outputWidth + w_out;
const int bottom_offset = ((batch * inputChannels + c_in)
* inputHeight + h_in) * inputWidth + w_in;
buffer_data[index] = top_diff[top_offset] * inputData[bottom_offset];
} else {
buffer_data[index] = 0;
}
}
}
template <class T>
class DepthwiseConvFunctor<DEVICE_TYPE_GPU, T>{
public:
void operator()(const T* inputData,
const T* filterData,
int batchSize,
int outputChannels,
int outputHeight,
int outputWidth,
int inputChannels,
int inputHeight,
int inputWidth,
int filterMultiplier,
int filterHeight,
int filterWidth,
int strideH,
int strideW,
int paddingH,
int paddingW,
T* outputData){
int outputSize = batchSize * outputChannels * outputHeight * outputWidth;
size_t blocks = (outputSize + 1024 -1) / 1024;
size_t blockX = 512;
size_t blockY = (blocks+512-1)/512;
dim3 threads(1024, 1);
dim3 grid(blockX, blockY);
ConvolutionDepthwiseForward<T>
<<< grid, threads, 0, STREAM_DEFAULT >>>(
outputSize,
inputData,
filterData,
batchSize,
outputChannels,
outputHeight,
outputWidth,
inputChannels,
inputHeight,
inputWidth,
filterMultiplier,
filterHeight,
filterWidth,
strideH,
strideW,
paddingH,
paddingW,
outputData);
}
};
template <class T>
class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, T>{
public:
void operator()(const T* outputGrad,
const T* filterData,
int batchSize,
int outputChannels,
int outputHeight,
int outputWidth,
int inputChannels,
int inputHeight,
int inputWidth,
int filterMultiplier,
int filterHeight,
int filterWidth,
int strideH,
int strideW,
int paddingH,
int paddingW,
T* inputGrad){
int inputSize = batchSize * inputChannels * inputHeight * inputWidth;
size_t blocks = (inputSize + 1024 -1) / 1024;
size_t blockX = 512;
size_t blockY = (blocks+512-1)/512;
dim3 threads(1024, 1);
dim3 grid(blockX, blockY);
ConvolutionDepthwiseInputBackward<T>
// NOLINT_NEXT_LINE(whitespace/operators)
<<< grid, threads, 0, STREAM_DEFAULT >>>(
inputSize,
outputGrad,
filterData,
batchSize,
outputChannels,
outputHeight,
outputWidth,
inputChannels,
inputHeight,
inputWidth,
filterMultiplier,
filterHeight,
filterWidth,
strideH,
strideW,
paddingH,
paddingW,
inputGrad);
}
};
template <class T>
class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, T> {
public:
void operator()(const T* outputGrad,
const T* inputData,
int batchSize,
int outputChannels,
int outputHeight,
int outputWidth,
int inputChannels,
int inputHeight,
int inputWidth,
int filterMultiplier,
int filterHeight,
int filterWidth,
int strideH,
int strideW,
int paddingH,
int paddingW,
T* colData,
T* filterGrad){
int colDataSize = outputChannels * filterHeight * filterWidth
* outputHeight * outputWidth;
size_t blocks = (colDataSize + 1024 -1) / 1024;
size_t blockX = 512;
size_t blockY = (blocks+512-1)/512;
dim3 threads(1024, 1);
dim3 grid(blockX, blockY);
BaseMatrix filterGradMatrix(outputChannels * filterHeight * filterWidth,
1, filterGrad, false, true);
for (int i = 0; i < batchSize; i++) {
ConvolutionDepthwiseFilterBackward<T>
<<< grid, threads, 0, STREAM_DEFAULT >>>(
i,
colDataSize,
outputGrad,
inputData,
batchSize,
outputChannels,
outputHeight,
outputWidth,
inputChannels,
inputHeight,
inputWidth,
filterMultiplier,
filterHeight,
filterWidth,
strideH,
strideW,
paddingH,
paddingW,
colData);
int K = outputHeight * outputWidth;
int M = colDataSize / K;
BaseMatrix colMatrix(M, K, colData, false, true);
filterGradMatrix.sumRows(colMatrix, (T)1.0, (T)1.0);
}
}
};
#ifdef PADDLE_TYPE_DOUBLE
template class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, double>;
template class DepthwiseConvFunctor<DEVICE_TYPE_GPU, double>;
template class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, double>;
#else
template class DepthwiseConvGradInputFunctor<DEVICE_TYPE_GPU, float>;
template class DepthwiseConvFunctor<DEVICE_TYPE_GPU, float>;
template class DepthwiseConvGradFilterFunctor<DEVICE_TYPE_GPU, float>;
#endif
} // namespace paddle
......@@ -38,10 +38,25 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
inputShape_.resize(numInputs);
filterShape_.resize(numInputs);
outputShape_.resize(numInputs);
std::string convType;
std::string convGradInputType;
std::string convGradFilterType;
for (int i = 0; i < config_.inputs_size(); i++) {
std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};
if (useGpu_ && (size_t)groups_[i] == (size_t)channels_[i] && !isDeconv_) {
convType = "DepthwiseConv";
convGradInputType = "DepthwiseConvGradInput";
convGradFilterType = "DepthwiseConvGradFilter";
} else {
convType = "GemmConv";
convGradInputType = "GemmConvGradInput";
convGradFilterType = "GemmConvGradFilter";
}
if (FLAGS_use_nnpack) {
CHECK_EQ(isDeconv_, false);
createFunction(forward_,
......@@ -53,21 +68,21 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
.set("algo", std::string("auto")));
} else {
createFunction(forward_,
!isDeconv_ ? "GemmConv" : "GemmConvGradInput",
!isDeconv_ ? convType : convGradInputType,
FuncConfig()
.set("paddings", paddings)
.set("strides", strides)
.set("groups", (size_t)groups_[i]));
createFunction(backward_,
!isDeconv_ ? "GemmConvGradInput" : "GemmConv",
!isDeconv_ ? convGradInputType : convType,
FuncConfig()
.set("paddings", paddings)
.set("strides", strides)
.set("groups", (size_t)groups_[i]));
createFunction(backward_,
"GemmConvGradFilter",
convGradFilterType,
FuncConfig()
.set("paddings", paddings)
.set("strides", strides)
......
......@@ -347,6 +347,55 @@ TEST(Layer, CosSimVecMatLayer) {
}
}
void testDepthwiseConvLayer(const string& type, bool useGpu) {
TestConfig config;
config.biasSize = 32;
config.layerConfig.set_type(type);
config.layerConfig.set_num_filters(32);
config.layerConfig.set_partial_sum(1);
config.layerConfig.set_shared_biases(true);
config.inputDefs.push_back({INPUT_DATA, "layer_0", 2048, 192});
LayerInputConfig* input = config.layerConfig.add_inputs();
ConvConfig* conv = input->mutable_conv_conf();
conv->set_filter_size(2);
conv->set_filter_size_y(3);
conv->set_channels(16);
conv->set_padding(0);
conv->set_padding_y(1);
conv->set_stride(2);
conv->set_stride_y(2);
conv->set_groups(16);
conv->set_filter_channels(conv->channels() / conv->groups());
conv->set_img_size(16);
conv->set_img_size_y(8);
conv->set_output_x(outputSize(conv->img_size(),
conv->filter_size(),
conv->padding(),
conv->stride(),
/* caffeMode */ true));
conv->set_output_y(outputSize(conv->img_size_y(),
conv->filter_size_y(),
conv->padding_y(),
conv->stride_y(),
/* caffeMode */ true));
config.layerConfig.set_size(conv->output_x() * conv->output_y() *
config.layerConfig.num_filters());
testLayerGrad(config, "depthwise_conv", 100, false, useGpu);
// Use small batch_size and useWeight=true to test biasGrad
testLayerGrad(config, "depthwise_conv", 2, false, useGpu, true, 0.02);
}
TEST(Layer, depthwiseConvLayer) {
// 'depthwise_conv' is a sepecial case of 'exconv' whose
// groups size equals to the input channels size.
testDepthwiseConvLayer("exconv", /* useGpu= */ false);
#ifndef PADDLE_ONLY_CPU
testDepthwiseConvLayer("exconv", /* useGpu= */ true);
#endif
}
void testConvLayer(const string& type, bool trans, bool useGpu) {
TestConfig config;
config.biasSize = 16;
......
......@@ -202,7 +202,7 @@ double dotProduct<double>(const int n, const double* x, const double* y) {
return cblas_ddot(n, x, 1, y, 1);
}
#ifdef PADDLE_USE_MKL
#if defined(PADDLE_USE_MKL) || defined(PADDLE_USE_MKLML)
template <>
void vExp<float>(const int n, const float* a, float* r) {
......@@ -243,7 +243,55 @@ template <>
void vAdd<double>(const int n, const double* a, const double* b, double* r) {
vdAdd(n, a, b, r);
}
#else
DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a));
template <class T>
void vExp(const int n, const T* a, T* r) {
hl_cpu_apply_binary_op<T, binary::vExp<T>, 0, 0>(
binary::vExp<T>(), const_cast<T*>(a), r, 1, n, n, n);
}
DEFINE_MATRIX_BINARY_OP(vLog, b = std::log(a));
template <class T>
void vLog(const int n, const T* a, T* r) {
hl_cpu_apply_binary_op<T, binary::vLog<T>, 0, 0>(
binary::vLog<T>(), const_cast<T*>(a), r, 1, n, n, n);
}
DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p));
template <class T>
void vPow(const int n, const T* a, const T b, T* r) {
hl_cpu_apply_binary_op<T, binary::vPow<T>, 0, 0>(
binary::vPow<T>(b), const_cast<T*>(a), r, 1, n, n, n);
}
DEFINE_MATRIX_TERNARY_OP(vAdd, c = a + b);
template <class T>
void vAdd(const int n, const T* a, const T* b, T* r) {
hl_cpu_apply_ternary_op<T, ternary::vAdd<T>, 0, 0>(ternary::vAdd<T>(),
const_cast<T*>(a),
const_cast<T*>(b),
r,
1,
n,
n,
n,
n);
}
template void vExp(const int n, const float* a, float* r);
template void vExp(const int n, const double* a, double* r);
template void vLog(const int n, const float* a, float* r);
template void vLog(const int n, const double* a, double* r);
template void vPow(const int n, const float* a, const float b, float* r);
template void vPow(const int n, const double* a, const double b, double* r);
template void vAdd(const int n, const float* a, const float* b, float* r);
template void vAdd(const int n, const double* a, const double* b, double* r);
#endif
#ifdef PADDLE_USE_MKL
template <>
void vInvSqrt<float>(const int n, const float* a, float* r) {
vsInvSqrt(n, a, r);
......@@ -275,20 +323,6 @@ void vTanh<double>(const int n, const double* a, double* r) {
}
#else
DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a));
template <class T>
void vExp(const int n, const T* a, T* r) {
hl_cpu_apply_binary_op<T, binary::vExp<T>, 0, 0>(
binary::vExp<T>(), const_cast<T*>(a), r, 1, n, n, n);
}
DEFINE_MATRIX_BINARY_OP(vLog, b = std::log(a));
template <class T>
void vLog(const int n, const T* a, T* r) {
hl_cpu_apply_binary_op<T, binary::vLog<T>, 0, 0>(
binary::vLog<T>(), const_cast<T*>(a), r, 1, n, n, n);
}
DEFINE_MATRIX_BINARY_OP(vInvSqrt, b = 1.0f / std::sqrt(a));
template <class T>
void vInvSqrt(const int n, const T* a, T* r) {
......@@ -312,41 +346,12 @@ void vTanh(const int n, const T* a, T* r) {
binary::vTanh<T>(), const_cast<T*>(a), r, 1, n, n, n);
}
DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p));
template <class T>
void vPow(const int n, const T* a, const T b, T* r) {
hl_cpu_apply_binary_op<T, binary::vPow<T>, 0, 0>(
binary::vPow<T>(b), const_cast<T*>(a), r, 1, n, n, n);
}
DEFINE_MATRIX_TERNARY_OP(vAdd, c = a + b);
template <class T>
void vAdd(const int n, const T* a, const T* b, T* r) {
hl_cpu_apply_ternary_op<T, ternary::vAdd<T>, 0, 0>(ternary::vAdd<T>(),
const_cast<T*>(a),
const_cast<T*>(b),
r,
1,
n,
n,
n,
n);
}
template void vExp(const int n, const float* a, float* r);
template void vExp(const int n, const double* a, double* r);
template void vLog(const int n, const float* a, float* r);
template void vLog(const int n, const double* a, double* r);
template void vInvSqrt(const int n, const double* a, double* r);
template void vInvSqrt(const int n, const float* a, float* r);
template void vLog1p(const int n, const float* a, float* r);
template void vLog1p(const int n, const double* a, double* r);
template void vTanh(const int n, const float* a, float* r);
template void vTanh(const int n, const double* a, double* r);
template void vPow(const int n, const float* a, const float b, float* r);
template void vPow(const int n, const double* a, const double b, double* r);
template void vAdd(const int n, const float* a, const float* b, float* r);
template void vAdd(const int n, const double* a, const double* b, double* r);
#endif
......
......@@ -15,6 +15,12 @@ limitations under the License. */
#ifndef MATHFUNCTIONS_H_
#define MATHFUNCTIONS_H_
#ifdef PADDLE_USE_MKLML
#include <mkl_cblas.h>
#include <mkl_lapacke.h>
#include <mkl_vml_functions.h>
#endif
#ifdef PADDLE_USE_MKL
#include <mkl.h>
#include <mkl_lapacke.h>
......
add_subdirectory(detail)
cc_library(memory SRCS memory.cc)
cc_library(memcpy SRCS memcpy.cc DEPS device_context)
cc_library(paddle_memory
DEPS
memory meta_data
meta_cache memory_block
buddy_allocator system_allocator)
memory
memcpy
meta_data
meta_cache
memory_block
buddy_allocator
system_allocator)
cc_test(memory_test SRCS memory_test.cc DEPS place paddle_memory)
## Design
# Region-based Heterogeneous Memory Management
### Usage
To allocate 4KB CPU memory:
```cpp
p = memory::Alloc(platform::CPUPlace(), 4*1024);
```
To allocate 4KB memory on the 3rd GPU:
```cpp
p = memory::Alloc(platform::GPUPlace(2), 4*1024);
```
To free memory and check the so-far used amount of memory on a place:
```cpp
auto pl = platform::GPUPlace(0);
p = memory::Alloc(pl, 4*1024);
cout << memory::Used(pl);
memory::Free(pl, p);
```
### API
In `paddle/memory/memory.h` we have:
```cpp
namespace memory {
template <typename Place> void* Alloc(Place, size_t);
template <typename Place> void Free(Place, void*);
template <typename Place> size_t Used(Place);
} // namespace memory
```
These function templates have specializations on either `platform::CPUPlace` or `platform::GPUPlace`:
```cpp
template<>
void* Alloc<CPUPlace>(CPUPlace p, size_t size) {
return GetCPUBuddyAllocator()->Alloc(size);
}
```
and
```cpp
template<>
void Alloc<GPUPlace>(GPUPlace p, size_t size) {
return GetGPUBuddyAllocator(p.id)->Alloc(size);
}
```
Similar specializations exist for `Free` and `Used`.
### Implementation
`GetCPUBuddyAllocator` and `GetGPUBuddyAllocator` are singletions.
```cpp
BuddyAllocator* GetCPUBuddyAllocator() {
static BuddyAllocator* a = NULL;
if (a == NULL) {
a = new BuddyAllocator(new CPUAllocator /*backup allocator*/, ...);
}
return a;
}
BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
static BuddyAllocator* as = NULL;
if (as == NULL) {
as = new BuddyAllocator*[platform::NumGPUs()];
for (int gpu = 0; gpu < platform::NumGPUs(); gpu++) {
as[gpu] = new BuddyAllocator(new GPUAllocator(gpu) /* backup allocator */, ...);
}
}
return as[gpu_id);
```
#### `BuddyAllocator`
`BuddyAllocator` implements the buddy allocation algorithm. Its constructor takes parameters only related with the algorithm:
```cpp
BuddyAllocator::BuddyAllocator(initial_pool_size, max_pool_size) {
...
}
```
Please be aware that **`BuddyAllocator` always allocate aligned memory**, aligned on 32-bytes, which can hold a `BuddyAllocator::Block` object:
```cpp
class BuddyAllocator {
private:
struct Block {
size_t size;
Block* left, right;
size_t index; // allocator id
};
...
};
```
Because BuddyAllocator has the meta-data of each block, it can trace the used memory -- record the amount returned by `Alloc` freed in `Free`. Instead, `CPUAllocator` and `GPUAllocator` doesn't know the size of freed memory block and cannot do the trace.
#### System Allocators
The `GPUAllocator` and `CPUAllocator` are calls *system allocators*. They work as the fallback allocators of `BuddyAllocator`.
## Justification
I got inspiration from Majel and Caffe2, though above design look different from both.
### Caffe2
In Caffe2, `Tensor<Context>::mutable_data()` allocates the memroy. In particular, [`Tensor<Context>::mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L523) calls [`Tensor<Context>::raw_mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L459), which in turn calls [`Context::New`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L479).
There are two implementations of `Context`:
1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory.
1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::GPUPlace`, who also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.
### Majel
In Majel, there are basically two allocator types:
1. `cpu::SystemAllocator`, which has similar functionality to `caffe2::CPUContext::New/Delete`.
1. `gpu::SystemAllocator`, which has similar functionality to `caffe2::CUDAContext::New/Delete`.
However, memory allocation is not via these two allocators. Instead, these two allocators are defined in hidden namespaces.
In Majel there are hidden global variables like:
1. `cpu::SystemAllocator g_cpu_allocator`, and
1. `vector<gpu::SystemAllocator*> g_gpu_allocators(NUM_GPUS)`.
Programs allocate memory via a BuddyAllocator, which can take the `g_cpu_allocator` or a `g_gpu_allocators[gpu_id]` as its *fallback allocator*, so that if BuddyAllocator cannot find a block in its memory pool, it extends its memory pool by calling the fallback allocator's `New(size_t)`.
Please check out the [design documentation](http://gangliao.me) to find out more details about
buddy memory allocator for both CPU and GPU.
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/memory/memcpy.h"
#include <cstring> // for memcpy
#include "paddle/platform/device_context.h"
namespace paddle {
namespace memory {
template <>
void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
platform::CPUPlace,
const void* src, size_t num) {
std::memcpy(dst, src, num);
}
#ifndef PADDLE_ONLY_CPU
template <>
void Copy<platform::CPUPlace, platform::GPUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::GPUPlace src_place,
const void* src, size_t num,
cudaStream_t stream) {
platform::GPUPlaceGuard g(src_place.device);
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream);
}
template <>
void Copy<platform::GPUPlace, platform::CPUPlace>(platform::GPUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src, size_t num,
cudaStream_t stream) {
platform::GPUPlaceGuard g(dst_place.device);
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream);
}
template <>
void Copy<platform::GPUPlace, platform::GPUPlace>(platform::GPUPlace dst_place,
void* dst,
platform::GPUPlace src_place,
const void* src, size_t num,
cudaStream_t stream) {
if (dst_place == src_place) {
platform::GPUPlaceGuard g(src_place.device);
platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToDevice, stream);
} else {
platform::GpuMemcpyPeer(dst, dst_place.device, src, src_place.device, num,
stream);
}
}
#endif // PADDLE_ONLY_CPU
} // namespace memory
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/platform/gpu_info.h"
#include "paddle/platform/place.h"
namespace paddle {
namespace memory {
template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num);
#ifndef PADDLE_ONLY_CPU
template <typename DstPlace, typename SrcPlace>
void Copy(DstPlace, void* dst, SrcPlace, const void* src, size_t num,
cudaStream_t stream);
#endif // PADDLE_ONLY_CPU
} // namespace memory
} // namespace paddle
......@@ -15,7 +15,8 @@ limitations under the License. */
#include "paddle/memory/memory.h"
#include "paddle/memory/detail/buddy_allocator.h"
#include "paddle/memory/detail/system_allocator.h"
#include "paddle/platform/assert.h"
#include <cstring> // for memcpy
namespace paddle {
namespace memory {
......
......@@ -14,19 +14,32 @@ limitations under the License. */
#pragma once
#include "paddle/platform/gpu_info.h"
#include "paddle/platform/place.h"
namespace paddle {
namespace memory {
template <class Place>
template <typename Place>
void* Alloc(Place, size_t);
template <class Place>
template <typename Place>
void Free(Place, void*);
template <class Place>
template <typename Place>
size_t Used(Place);
template <typename T, /* must be POD types */
typename Place /* platform::GPUPlace or platform::CPUPlace */,
typename std::enable_if<std::is_pod<T>::value>::type* = nullptr>
class PODDeleter {
public:
PODDeleter(Place place) : place_(place) {}
void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
private:
Place place_;
};
} // namespace memory
} // namespace paddle
......@@ -48,6 +48,7 @@ op_library(mul_op SRCS mul_op.cc mul_op.cu)
op_library(rowwise_add_op SRCS rowwise_add_op.cu rowwise_add_op.cc)
op_library(sigmoid_op SRCS sigmoid_op.cu sigmoid_op.cc)
op_library(softmax_op SRCS softmax_op.cc softmax_op.cu)
op_library(cross_entropy_op SRCS cross_entropy_op.cc cross_entropy_op.cu)
op_library(fc_op SRCS fc_op.cc DEPS mul_op rowwise_add_op sigmoid_op
softmax_op net)
......
......@@ -31,7 +31,7 @@ protected:
"Inputs/Outputs of AddOp must all be set");
PADDLE_ENFORCE(inputs[0]->dims() == inputs[1]->dims(),
"Two input of Add Op's dimension must be same.");
outputs[0]->set_dims(inputs[0]->dims());
outputs[0]->Resize(inputs[0]->dims());
}
};
......@@ -66,8 +66,5 @@ protected:
REGISTER_OP(add_two, paddle::operators::AddOp, paddle::operators::AddOpMaker);
REGISTER_GRADIENT_OP(add_two, paddle::operators::AddOpGrad);
typedef paddle::operators::AddKernel<::paddle::platform::CPUPlace, float>
AddKernel_CPU_float;
REGISTER_OP_CPU_KERNEL(add_two, AddKernel_CPU_float);
// REGISTER_OP_CPU_KERNEL(add_two, AddKernel_CPU_float);
REGISTER_OP_CPU_KERNEL(
add_two, paddle::operators::AddKernel<paddle::platform::CPUPlace, float>);
#include "paddle/operators/add_op.h"
#include "paddle/framework/op_registry.h"
typedef paddle::operators::AddKernel<::paddle::platform::GPUPlace, float> AddKernel_GPU_float;
REGISTER_OP_GPU_KERNEL(add_two,
AddKernel_GPU_float);
\ No newline at end of file
paddle::operators::AddKernel<paddle::platform::GPUPlace, float>);
\ No newline at end of file
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/cross_entropy_op.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/tensor.h"
namespace paddle {
namespace operators {
class OnehotCrossEntropyOp : public framework::OperatorWithKernel {
protected:
void InferShape(
const std::vector<const framework::Tensor *> &inputs,
const std::vector<framework::Tensor *> &outputs) const override {
PADDLE_ENFORCE(inputs.size() == 2,
"Input size of OnehotCrossEntropyOp must be two");
PADDLE_ENFORCE(outputs.size() == 1,
"Output size of OnehotCrossEntropyOp must be one");
PADDLE_ENFORCE(inputs[0] != nullptr && inputs[1] != nullptr,
"Inputs of OnehotCrossEntropyOp must all be set");
PADDLE_ENFORCE(outputs[0] != nullptr,
"Outputs of OnehotCrossEntropyOp must all be set");
PADDLE_ENFORCE(inputs[0]->dims().size() == 2, "X's dimension must be 2.");
PADDLE_ENFORCE(outputs[0]->dims().size() == 1,
"label's dimension must be 1.");
outputs[0]->Resize(framework::make_ddim({inputs[0]->dims()[0]}));
}
};
class OnehotCrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
public:
OnehotCrossEntropyOpMaker(framework::OpProto *proto,
framework::OpAttrChecker *op_checker)
: framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "The first input of OnehotCrossEntropyOp");
AddInput("label", "The second input of OnehotCrossEntropyOp");
AddOutput("Y", "The output of OnehotCrossEntropyOp");
AddComment(R"DOC(
OnehotCrossEntropy Operator.
Y[i] = -log(X[i][j])
)DOC");
}
};
} // namespace operators
} // namespace paddle
REGISTER_OP(onehot_cross_entropy,
paddle::operators::OnehotCrossEntropyOp,
paddle::operators::OnehotCrossEntropyOpMaker);
REGISTER_OP_CPU_KERNEL(
onehot_cross_entropy,
paddle::operators::OnehotCrossEntropyOpKernel<::paddle::platform::CPUPlace,
float>);
#include "paddle/operators/cross_entropy_op.h"
#include "paddle/framework/op_registry.h"
REGISTER_OP_GPU_KERNEL(onehot_cross_entropy,
paddle::operators::OnehotCrossEntropyOpKernel<
::paddle::platform::GPUPlace, float>);
\ No newline at end of file
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "glog/logging.h"
#include "paddle/framework/operator.h"
namespace paddle {
namespace operators {
template <typename Place, typename T>
class OnehotCrossEntropyOpKernel : public framework::OpKernel {
public:
constexpr T LOG_THRESHOLD() const { return static_cast<T>(1e-20); }
void Compute(const framework::KernelContext& context) const override {
auto X = context.Input(0)->Get<framework::Tensor>();
const T* X_data = X.data<T>();
const int* label_data =
context.Input(1)->Get<framework::Tensor>().data<int>();
auto* Y = context.Output(0)->GetMutable<framework::Tensor>();
Y->mutable_data<T>(context.GetPlace());
T* Y_data = Y->data<T>();
int batch_size = X.dims()[0];
int class_num = X.dims()[1];
// Y[i] = -log(X[i][j])
for (int i = 0; i < batch_size; ++i) {
Y_data[i] = -std::log(
std::max(X_data[i * class_num + label_data[i]], LOG_THRESHOLD()));
}
}
};
} // namespace operators
} // namespace paddle
......@@ -12,9 +12,9 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/framework/op_registry.h>
#include <paddle/framework/tensor.h>
#include <paddle/operators/mul_op.h>
#include "paddle/operators/mul_op.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/tensor.h"
namespace paddle {
namespace operators {
......@@ -33,7 +33,7 @@ protected:
dim0[1] == dim1[0],
"First matrix's width must be equal with second matrix's height.");
PADDLE_ENFORCE(outputs.size() == 1, "The mul op must take one output");
outputs[0]->set_dims({dim0[0], dim1[1]});
outputs[0]->Resize({dim0[0], dim1[1]});
}
};
......@@ -70,4 +70,4 @@ REGISTER_OP(mul, paddle::operators::MulOp, paddle::operators::MulOpMaker);
REGISTER_GRADIENT_OP(mul, paddle::operators::MulOpGrad);
REGISTER_OP_CPU_KERNEL(
mul, paddle::operators::MulKernel<paddle::platform::CPUPlace>);
mul, paddle::operators::MulKernel<paddle::platform::CPUPlace, float>);
......@@ -12,9 +12,9 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/operators/mul_op.h>
#include <paddle/framework/op_registry.h>
#include "paddle/operators/mul_op.h"
#include "paddle/framework/op_registry.h"
REGISTER_OP_GPU_KERNEL(mul,
paddle::operators::MulKernel<paddle::platform
::GPUPlace>);
\ No newline at end of file
::GPUPlace, float>);
\ No newline at end of file
......@@ -14,17 +14,30 @@
#pragma once
#include <glog/logging.h>
#include <paddle/framework/operator.h>
#include "glog/logging.h"
#include "paddle/framework/eigen.h"
#include "paddle/framework/operator.h"
namespace paddle {
namespace operators {
template <typename Place>
template <typename Place, typename T>
class MulKernel : public framework::OpKernel {
public:
void Compute(const framework::KernelContext &context) const override {
LOG(INFO) << "Mul kernel in " << typeid(Place).name();
void Compute(const framework::KernelContext& context) const override {
Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair = {
{Eigen::IndexPair<Eigen::DenseIndex>(1, 0)}};
auto input0 = context.Input(0)->Get<framework::Tensor>();
auto input1 = context.Input(1)->Get<framework::Tensor>();
auto* output = context.Output(0)->GetMutable<framework::Tensor>();
output->mutable_data<T>(context.GetPlace());
framework::EigenMatrix<T>::From(*output).device(
*(context.GetEigenDevice<Place>())) =
framework::EigenMatrix<T>::From(input0).contract(
framework::EigenMatrix<T>::From(input1), dim_pair);
}
};
} // namespace operators
......
......@@ -12,8 +12,8 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/framework/op_registry.h>
#include <paddle/operators/rowwise_add_op.h>
#include "paddle/operators/rowwise_add_op.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
......@@ -30,7 +30,7 @@ protected:
PADDLE_ENFORCE(dim1.size() == 1, "The second input must be vector");
PADDLE_ENFORCE(dim0[1] == dim1[0], "The width of two input must be same");
PADDLE_ENFORCE(outputs.size() == 1, "The output size must be 1");
outputs[0]->set_dims(inputs[0]->dims());
outputs[0]->Resize(inputs[0]->dims());
}
};
......@@ -58,4 +58,4 @@ REGISTER_OP(rowwise_add,
paddle::operators::RowWiseAddOpMaker);
REGISTER_OP_CPU_KERNEL(
rowwise_add,
paddle::operators::RowWiseAddKernel<paddle::platform::CPUPlace>);
paddle::operators::RowWiseAddKernel<paddle::platform::CPUPlace, float>);
#include <paddle/framework/op_registry.h>
#include <paddle/operators/rowwise_add_op.h>
#include "paddle/framework/op_registry.h"
#include "paddle/operators/rowwise_add_op.h"
REGISTER_OP_GPU_KERNEL(
rowwise_add,
paddle::operators::RowWiseAddKernel<paddle::platform ::GPUPlace>);
paddle::operators::RowWiseAddKernel<paddle::platform ::GPUPlace, float>);
......@@ -13,17 +13,32 @@
limitations under the License. */
#pragma once
#include <glog/logging.h>
#include <paddle/framework/operator.h>
#include "glog/logging.h"
#include "paddle/framework/eigen.h"
#include "paddle/framework/operator.h"
namespace paddle {
namespace operators {
template <typename Place>
template <typename Place, typename T>
class RowWiseAddKernel : public framework::OpKernel {
public:
void Compute(const framework::KernelContext &context) const override {
LOG(INFO) << "RowWiseAdd kernel in " << typeid(Place).name();
void Compute(const framework::KernelContext& context) const override {
auto in0 = context.Input(0)->Get<framework::Tensor>();
auto in1 = context.Input(1)->Get<framework::Tensor>();
auto* out = context.Output(0)->GetMutable<framework::Tensor>();
out->mutable_data<T>(context.GetPlace());
auto input = framework::EigenMatrix<T>::From(in0);
auto bias = framework::EigenVector<T>::From(in1);
auto output = framework::EigenMatrix<T>::From(*out);
const int bias_size = bias.dimension(0);
const int rest_size = input.size() / bias_size;
Eigen::DSizes<int, 1> one_d(input.size());
Eigen::DSizes<int, 1> bcast(rest_size);
output.reshape(one_d).device(*(context.GetEigenDevice<Place>())) =
input.reshape(one_d) + bias.broadcast(bcast).reshape(one_d);
}
};
......
......@@ -31,7 +31,7 @@ protected:
PADDLE_ENFORCE(outputs[0] != nullptr, "outputs[0] mast be set");
PADDLE_ENFORCE(inputs[0]->dims() == inputs[1]->dims(),
"Two input of SGD Op's dimension must be same.");
outputs[0]->set_dims(inputs[0]->dims());
outputs[0]->Resize(inputs[0]->dims());
}
};
......
......@@ -12,8 +12,8 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/framework/op_registry.h>
#include <paddle/operators/sigmoid_op.h>
#include "paddle/operators/sigmoid_op.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
......@@ -24,7 +24,7 @@ protected:
const std::vector<framework::Tensor *> &outputs) const override {
PADDLE_ENFORCE(inputs.size() == 1, "Sigmoid Op only have one input");
PADDLE_ENFORCE(outputs.size() == 1, "Sigmoid Op only have one output");
outputs[0]->set_dims(inputs[0]->dims());
outputs[0]->Resize(inputs[0]->dims());
}
};
......@@ -34,7 +34,7 @@ public:
framework::OpAttrChecker *op_checker)
: framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "sigmoid input");
AddInput("Y", "sigmoid output");
AddOutput("Y", "sigmoid output");
AddComment("Sigmoid function");
}
};
......@@ -59,4 +59,5 @@ REGISTER_OP(sigmoid,
REGISTER_GRADIENT_OP(sigmoid, paddle::operators::SigmoidOpGrad);
REGISTER_OP_CPU_KERNEL(
sigmoid, paddle::operators::SigmoidKernel<paddle::platform::CPUPlace>);
sigmoid,
paddle::operators::SigmoidKernel<paddle::platform::CPUPlace, float>);
#include <paddle/operators/sigmoid_op.h>
#include <paddle/framework/op_registry.h>
#include "paddle/operators/sigmoid_op.h"
#include "paddle/framework/op_registry.h"
REGISTER_OP_GPU_KERNEL(
sigmoid, paddle::operators::SigmoidKernel<paddle::platform::GPUPlace>);
sigmoid, paddle::operators::SigmoidKernel<paddle::platform::GPUPlace, float>);
......@@ -14,17 +14,25 @@
#pragma once
#include <glog/logging.h>
#include <paddle/framework/operator.h>
#include "glog/logging.h"
#include "paddle/framework/eigen.h"
#include "paddle/framework/operator.h"
namespace paddle {
namespace operators {
template <typename Place>
template <typename Place, typename T>
class SigmoidKernel : public framework::OpKernel {
public:
void Compute(const framework::KernelContext &context) const override {
LOG(INFO) << "Sigmoid kernel in " << typeid(Place).name();
void Compute(const framework::KernelContext& context) const override {
auto input = context.Input(0)->Get<framework::Tensor>();
auto* output = context.Output(0)->GetMutable<framework::Tensor>();
output->mutable_data<T>(context.GetPlace());
framework::EigenVector<T>::Flatten(*output).device(
*(context.GetEigenDevice<Place>())) =
1.0 / (1.0 + (-1.0 * framework::EigenVector<T>::Flatten(input)).exp());
}
};
} // namespace operators
......
......@@ -11,8 +11,8 @@
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/framework/op_registry.h>
#include <paddle/operators/softmax_op.h>
#include "paddle/operators/softmax_op.h"
#include "paddle/framework/op_registry.h"
namespace paddle {
namespace operators {
......@@ -23,9 +23,11 @@ protected:
const std::vector<const framework::Tensor *> &inputs,
const std::vector<framework::Tensor *> &outputs) const override {
PADDLE_ENFORCE(inputs.size() == 1, "Only one input is need for softmax");
PADDLE_ENFORCE(inputs[0]->dims().size() == 2,
"The input of softmax op must be matrix");
PADDLE_ENFORCE(outputs.size() == 1, "Only one output is need for softmax");
outputs[0]->set_dims(inputs[0]->dims());
outputs[0]->Resize(inputs[0]->dims());
}
};
......@@ -58,5 +60,5 @@ namespace ops = paddle::operators;
REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker);
REGISTER_GRADIENT_OP(softmax, paddle::operators::SoftmaxOpGrad);
REGISTER_OP_CPU_KERNEL(softmax, ops::SoftmaxKernel<paddle::platform::CPUPlace>);
REGISTER_OP_CPU_KERNEL(softmax,
ops::SoftmaxKernel<paddle::platform::CPUPlace, float>);
#include <paddle/framework/op_registry.h>
#include <paddle/operators/softmax_op.h>
#include "paddle/framework/op_registry.h"
#include "paddle/operators/softmax_op.h"
REGISTER_OP_GPU_KERNEL(
softmax, paddle::operators::SoftmaxKernel<paddle::platform::GPUPlace>);
softmax, paddle::operators::SoftmaxKernel<paddle::platform::GPUPlace, float>);
......@@ -14,17 +14,49 @@
#pragma once
#include <glog/logging.h>
#include <paddle/framework/operator.h>
#include "glog/logging.h"
#include "paddle/framework/eigen.h"
#include "paddle/framework/operator.h"
namespace paddle {
namespace operators {
template <typename Place>
template <typename Place, typename T>
class SoftmaxKernel : public framework::OpKernel {
public:
void Compute(const framework::KernelContext &context) const override {
LOG(INFO) << "Softmax kernel in " << typeid(Place).name();
void Compute(const framework::KernelContext& context) const override {
auto input = context.Input(0)->Get<framework::Tensor>();
auto* output = context.Output(0)->GetMutable<framework::Tensor>();
output->mutable_data<T>(context.GetPlace());
auto logits = framework::EigenMatrix<T>::From(input);
auto softmax = framework::EigenMatrix<T>::From(*output);
const int kBatchDim = 0;
const int kClassDim = 1;
const int batch_size = logits.dimension(kBatchDim);
const int num_classes = logits.dimension(kClassDim);
Eigen::DSizes<int, 1> along_class(kClassDim);
Eigen::DSizes<int, 2> batch_by_one(batch_size, 1);
Eigen::DSizes<int, 2> one_by_class(1, num_classes);
auto shifted_logits = (logits -
logits.maximum(along_class)
.eval()
.reshape(batch_by_one)
.broadcast(one_by_class));
softmax.device(*(context.GetEigenDevice<Place>())) = shifted_logits.exp();
softmax.device(*(context.GetEigenDevice<Place>())) =
(softmax *
softmax.sum(along_class)
.inverse()
.eval()
.reshape(batch_by_one)
.broadcast(one_by_class));
}
};
} // namespace operators
......
......@@ -43,10 +43,26 @@ namespace platform {
// For more details, please check https://stackoverflow.com/a/43870188/724872.
#define UNLIKELY(condition) __builtin_expect(static_cast<bool>(condition), 0)
template <typename T>
inline void throw_on_error(T e) {
throw_on_error(e, "");
}
template <typename... Args>
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
int stat, const Args&... args) {
if (UNLIKELY(!(stat))) {
throw std::runtime_error(
string::Sprintf(args...) +
string::Sprintf(" at [%s:%s];", __FILE__, __LINE__));
}
}
#ifndef PADDLE_ONLY_CPU
template <typename... Args>
inline void throw_on_error(cudaError_t e, const Args&... args) {
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
cudaError_t e, const Args&... args) {
if (UNLIKELY(e)) {
// clang-format off
throw thrust::system_error(
......@@ -58,7 +74,8 @@ inline void throw_on_error(cudaError_t e, const Args&... args) {
}
template <typename... Args>
inline void throw_on_error(curandStatus_t stat, const Args&... args) {
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
curandStatus_t stat, const Args&... args) {
if (stat != CURAND_STATUS_SUCCESS) {
// clang-format off
throw thrust::system_error(
......@@ -70,7 +87,8 @@ inline void throw_on_error(curandStatus_t stat, const Args&... args) {
}
template <typename... Args>
inline void throw_on_error(cudnnStatus_t stat, const Args&... args) {
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
cudnnStatus_t stat, const Args&... args) {
if (stat == CUDNN_STATUS_SUCCESS) {
return;
} else {
......@@ -84,7 +102,8 @@ inline void throw_on_error(cudnnStatus_t stat, const Args&... args) {
}
template <typename... Args>
inline void throw_on_error(cublasStatus_t stat, const Args&... args) {
inline typename std::enable_if<sizeof...(Args) != 0, void>::type throw_on_error(
cublasStatus_t stat, const Args&... args) {
std::string err;
if (stat == CUBLAS_STATUS_SUCCESS) {
return;
......@@ -113,15 +132,6 @@ inline void throw_on_error(cublasStatus_t stat, const Args&... args) {
#endif // PADDLE_ONLY_CPU
template <typename... Args>
inline void throw_on_error(int stat, const Args&... args) {
if (UNLIKELY(!(stat))) {
throw std::runtime_error(
string::Sprintf(args...) +
string::Sprintf(" at [%s:%s];", __FILE__, __LINE__));
}
}
#define PADDLE_THROW(...) \
do { \
throw std::runtime_error( \
......@@ -129,12 +139,9 @@ inline void throw_on_error(int stat, const Args&... args) {
string::Sprintf(" at [%s:%s];", __FILE__, __LINE__)); \
} while (0)
/**
* @brief Enforce a condition, otherwise throw an EnforceNotMet
*/
#define PADDLE_ENFORCE(condition, ...) \
#define PADDLE_ENFORCE(...) \
do { \
::paddle::platform::throw_on_error(condition, __VA_ARGS__); \
::paddle::platform::throw_on_error(__VA_ARGS__); \
} while (0)
} // namespace platform
......
......@@ -44,7 +44,7 @@ void SetDeviceId(int id) {
"cudaSetDevice failed in paddle::platform::SetDeviceId");
}
void GpuMemoryUsage(size_t& available, size_t& total) {
void GpuMemoryUsage(size_t &available, size_t &total) {
PADDLE_ENFORCE(cudaMemGetInfo(&available, &total),
"cudaMemGetInfo failed in paddle::platform::GetMemoryUsage");
}
......@@ -82,5 +82,28 @@ size_t GpuMaxChunkSize() {
return usable;
}
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind, cudaStream_t stream) {
PADDLE_ENFORCE(cudaMemcpyAsync(dst, src, count, kind, stream),
"cudaMemcpyAsync failed in paddle::platform::GpuMemcpyAsync");
}
void GpuMemcpySync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind) {
PADDLE_ENFORCE(cudaMemcpy(dst, src, count, kind),
"cudaMemcpy failed in paddle::platform::GpuMemcpySync");
// note: cudaMemcpy may actually be asynchronous with respect to the caller,
// block on stream 0 to make sure the copy has completed
PADDLE_ENFORCE(
cudaStreamSynchronize(0),
"cudaStreamSynchronize failed in paddle::platform::GpuMemcpySync");
}
void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
size_t count, cudaStream_t stream) {
PADDLE_ENFORCE(
cudaMemcpyPeerAsync(dst, dst_device, src, src_device, count, stream),
"cudaMemcpyPeerAsync failed in paddle::platform::GpuMemcpyPeer");
}
} // namespace platform
} // namespace paddle
......@@ -16,6 +16,7 @@ limitations under the License. */
#ifndef PADDLE_ONLY_CPU
#include <cuda_runtime.h>
#include <stddef.h>
namespace paddle {
......@@ -31,7 +32,7 @@ int GetCurrentDeviceId();
void SetDeviceId(int device_id);
//!Get the memory usage of current GPU device.
void GpuMemoryUsage(size_t& available, size_t& total);
void GpuMemoryUsage(size_t &available, size_t &total);
//! Get the maximum allocation size of current GPU device.
size_t GpuMaxAllocSize();
......@@ -42,6 +43,18 @@ size_t GpuMinChunkSize();
//! Get the maximum chunk size for GPU buddy allocator.
size_t GpuMaxChunkSize();
//! Copy memory from address src to dst asynchronously.
void GpuMemcpyAsync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind, cudaStream_t stream);
//! Copy memory from address src to dst synchronously.
void GpuMemcpySync(void *dst, const void *src, size_t count,
enum cudaMemcpyKind kind);
//! Copy memory from one device to another device.
void GpuMemcpyPeer(void *dst, int dst_device, const void *src, int src_device,
size_t count, cudaStream_t stream);
} // namespace platform
} // namespace paddle
......
cc_library(paddle_pybind SHARED SRCS pybind.cc DEPS pybind python
add_op fc_op sgd_op)
add_op fc_op sgd_op cross_entropy_op)
......@@ -13,25 +13,43 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <Python.h>
#include <paddle/framework/op_registry.h>
#include <paddle/framework/operator.h>
#include <paddle/framework/scope.h>
#include <paddle/pybind/tensor_bind.h>
#include <pybind11/numpy.h>
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include <fstream>
#include <vector>
#include "paddle/framework/net.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/operator.h"
#include "paddle/framework/scope.h"
#include "paddle/pybind/tensor_bind.h"
#include "pybind11/numpy.h"
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
namespace py = pybind11;
namespace pd = paddle::framework;
USE_OP(add_two);
USE_OP(onehot_cross_entropy);
USE_OP_WITHOUT_KERNEL(fc);
USE_OP(sgd);
USE_OP(mul);
USE_OP(sigmoid);
USE_OP(softmax);
USE_OP(rowwise_add);
template <typename ClassType>
void ExposeOperator(ClassType& m) {
m.def("infer_shape", &ClassType::type::InferShape)
.def("run", &ClassType::type::Run)
.def("outputs",
[](const typename ClassType::type& op) -> std::vector<std::string> {
return op.outputs_;
})
.def("__str__", &ClassType::type::DebugString);
}
PYBIND11_PLUGIN(core) {
py::module m("core", "C++ core of Paddle Paddle");
py::module m("core", "C++ core of PaddlePaddle");
py::class_<pd::Tensor>(m, "Tensor", py::buffer_protocol())
.def_buffer([](pd::Tensor& self) -> py::buffer_info {
......@@ -41,7 +59,7 @@ PYBIND11_PLUGIN(core) {
[](const pd::Tensor& self) { return pd::vectorize(self.dims()); })
.def("set_dims",
[](pd::Tensor& self, const std::vector<int>& dim) {
self.set_dims(pd::make_ddim(dim));
self.Resize(pd::make_ddim(dim));
})
.def("alloc_float",
[](pd::Tensor& self) {
......@@ -108,10 +126,10 @@ All parameter, weight, gradient are variables in Paddle.
return new paddle::platform::CPUDeviceContext();
});
py::class_<pd::OperatorBase, pd::OperatorPtr>(m, "Operator")
.def("__str__", &pd::OperatorBase::DebugString)
.def_static("create",
[](py::bytes protobin) {
py::class_<pd::OperatorBase, std::shared_ptr<pd::OperatorBase>> operator_base(
m, "Operator");
operator_base.def_static("create", [](py::bytes protobin) {
pd::OpDesc desc;
PADDLE_ENFORCE(desc.ParsePartialFromString(protobin),
"Cannot parse user input to OpDesc");
......@@ -119,10 +137,27 @@ All parameter, weight, gradient are variables in Paddle.
"User OpDesc is not initialized, reason %s",
desc.InitializationErrorString());
return pd::OpRegistry::CreateOp(desc);
});
ExposeOperator(operator_base);
using PlainNetPtr = std::shared_ptr<pd::PlainNet>;
py::class_<pd::PlainNet, PlainNetPtr> plain_net(m, "PlainNet");
plain_net
.def_static("create",
[]() -> std::shared_ptr<pd::PlainNet> {
auto retv = std::make_shared<pd::PlainNet>();
retv->type_ = "plain_net";
return retv;
})
.def("add_op", &pd::PlainNet::AddOp)
.def("add_op",
[](PlainNetPtr& self, const PlainNetPtr& plain_net) -> void {
self->AddOp(std::static_pointer_cast<pd::OperatorBase>(plain_net));
})
.def("infer_shape", &pd::OperatorBase::InferShape)
.def("run", &pd::OperatorBase::Run)
.def("outputs", [](const pd::OperatorPtr& op) { return op->outputs_; });
.def("complete_add_op", &pd::PlainNet::CompleteAddOp)
.def("complete_add_op", [](PlainNetPtr& self) { self->CompleteAddOp(); });
ExposeOperator(plain_net);
return m.ptr();
}
......@@ -86,7 +86,7 @@ void PyTensorSetFromArray(
dims.push_back((int)array.shape()[i]);
}
self.set_dims(framework::make_ddim(dims));
self.Resize(framework::make_ddim(dims));
auto *dst = self.mutable_data<T>(paddle::platform::CPUPlace());
std::memcpy(dst, array.data(), sizeof(T) * array.size());
}
......
......@@ -3219,6 +3219,10 @@ def ParameterHook(type, **kwargs):
if sparsity_ratio is not None:
hook.sparsity_ratio = sparsity_ratio
return hook
elif type == 'dpruning':
hook = ParameterUpdaterHookConfig()
hook.type = type
return hook
else:
return None
......
......@@ -3173,11 +3173,11 @@ def memory(name,
@wrap_bias_attr_default()
@wrap_act_default(
param_names=['gate_act', 'state_act'], act=SigmoidActivation())
@wrap_act_default(param_names=['gate_act'], act=SigmoidActivation())
@wrap_act_default(param_names=['state_act'], act=TanhActivation())
@wrap_act_default(act=TanhActivation())
@wrap_name_default('lstm_step')
@layer_support()
@layer_support(ERROR_CLIPPING, DROPOUT)
def lstm_step_layer(input,
state,
size=None,
......@@ -3531,12 +3531,7 @@ def SubsequenceInput(input):
@wrap_name_default("recurrent_group")
def recurrent_group(step,
input,
reverse=False,
name=None,
targetInlink=None,
is_generating=False):
def recurrent_group(step, input, reverse=False, name=None, targetInlink=None):
"""
Recurrent layer group is an extremely flexible recurrent unit in
PaddlePaddle. As long as the user defines the calculation done within a
......@@ -3602,21 +3597,12 @@ def recurrent_group(step,
:type targetInlink: LayerOutput|SubsequenceInput
:param is_generating: If is generating, none of input type should be LayerOutput;
else, for training or testing, one of the input type must
be LayerOutput.
:type is_generating: bool
:return: LayerOutput object.
:rtype: LayerOutput
"""
model_type('recurrent_nn')
def is_single_input(x):
return isinstance(x, LayerOutput) or isinstance(x, StaticInput)
if is_single_input(input):
if isinstance(input, LayerOutput) or isinstance(input, StaticInput):
input = [input]
assert isinstance(input, collections.Sequence)
......@@ -3630,13 +3616,8 @@ def recurrent_group(step,
in_links=map(lambda x: x.name, in_links),
seq_reversed=reverse)
in_args = []
has_LayerOutput = False
for each_input in input:
assert is_single_input(each_input)
if isinstance(each_input, LayerOutput):
in_args.append(each_input)
has_LayerOutput = True
else: # StaticInput
if isinstance(each_input, StaticInput): # StaticInput
mem_name = "__%s_memory__" % each_input.input.name
mem = memory(
name=None,
......@@ -3644,24 +3625,26 @@ def recurrent_group(step,
boot_layer=each_input.input)
mem.set_input(mem)
in_args.append(mem)
assert (is_generating != has_LayerOutput)
else:
in_args.append(each_input)
layer_outs = step(*in_args)
if isinstance(layer_outs, LayerOutput):
layer_outs = [layer_outs]
for ot in layer_outs:
assert isinstance(ot, LayerOutput)
ot.reverse = reverse
RecurrentLayerGroupSetOutLink(ot.name)
for layer_out in layer_outs:
assert isinstance(
layer_out, LayerOutput
), "Type of step function's return value must be LayerOutput."
layer_out.reverse = reverse
RecurrentLayerGroupSetOutLink(layer_out.name)
RecurrentLayerGroupEnd(name=name)
for layer_out in layer_outs:
# Thee previous full_name is the name is the rnn group
# We need a full_name outside the rnn group
# The previous full_name is the name inside the recurrent group.
# We need a full_name outside the recurrent group.
layer_out.full_name = MakeLayerNameInSubmodel(layer_out.name)
if len(layer_outs) == 1:
......@@ -3684,7 +3667,20 @@ class BaseGeneratedInput(object):
class GeneratedInput(BaseGeneratedInput):
def after_real_step(self, input):
return maxid_layer(input=input, name='__beam_search_predict__')
if isinstance(input, LayerOutput):
input = [input]
elif isinstance(input, collections.Sequence):
input = list(input)
if len(input) > 1:
logger.info(
("More than one layers inside the recurrent_group "
"are returned as outputs of the entire recurrent_group "
"PLEASE garantee the first output is probability of "
"the predicted next word."))
return [maxid_layer(
input=input[0], name='__beam_search_predict__')] + (
input[1:] if len(input) > 1 else [])
def before_real_step(self):
predict_id = memory(
......@@ -3871,6 +3867,7 @@ def beam_search(step,
:type step: callable
:param input: Input data for the recurrent unit, which should include the
previously generated words as a GeneratedInput object.
In beam_search, none of the input's type should be LayerOutput.
:type input: list
:param bos_id: Index of the start symbol in the dictionary. The start symbol
is a special token for NLP task, which indicates the
......@@ -3912,15 +3909,18 @@ def beam_search(step,
real_input = []
for i, each_input in enumerate(input):
assert isinstance(each_input, StaticInput) or isinstance(
each_input, BaseGeneratedInput)
assert not isinstance(each_input, LayerOutput), (
"in beam_search, "
"none of the input should has a type of LayerOutput.")
if isinstance(each_input, BaseGeneratedInput):
assert generated_input_index == -1
assert generated_input_index == -1, ("recurrent_group accepts "
"only one GeneratedInput.")
generated_input_index = i
else:
real_input.append(each_input)
assert generated_input_index != -1
assert generated_input_index != -1, "No GeneratedInput is given."
gipt = input[generated_input_index]
......@@ -3941,17 +3941,11 @@ def beam_search(step,
predict = gipt.after_real_step(step(*args))
eos_layer(input=predict, eos_id=eos_id, name=eos_name)
eos_layer(input=predict[0], eos_id=eos_id, name=eos_name)
return predict
tmp = recurrent_group(
step=__real_step__,
input=real_input,
reverse=False,
name=name,
is_generating=True)
return tmp
return recurrent_group(
step=__real_step__, input=real_input, reverse=False, name=name)
def __cost_input__(input, label, weight=None):
......
......@@ -614,18 +614,17 @@ def simple_lstm(input,
@wrap_name_default('lstm_unit')
def lstmemory_unit(input,
memory_boot=None,
out_memory=None,
name=None,
size=None,
param_attr=None,
act=None,
gate_act=None,
state_act=None,
mixed_bias_attr=None,
input_proj_bias_attr=None,
input_proj_layer_attr=None,
lstm_bias_attr=None,
mixed_layer_attr=None,
lstm_layer_attr=None,
get_output_layer_attr=None):
lstm_layer_attr=None):
"""
Define calculations that a LSTM unit performs during a single time step.
This function itself is not a recurrent layer, so it can not be
......@@ -662,8 +661,8 @@ def lstmemory_unit(input,
:param input: input layer name.
:type input: LayerOutput
:param memory_boot: the initialization state of the LSTM cell.
:type memory_boot: LayerOutput | None
:param out_memory: output of previous time step
:type out_memory: LayerOutput | None
:param name: lstmemory unit name.
:type name: basestring
:param size: lstmemory unit size.
......@@ -676,33 +675,35 @@ def lstmemory_unit(input,
:type gate_act: BaseActivation
:param state_act: lstm state activiation type.
:type state_act: BaseActivation
:param mixed_bias_attr: bias parameter attribute of mixed layer.
:param input_proj_bias_attr: bias attribute for input-to-hidden projection.
False means no bias, None means default bias.
:type mixed_bias_attr: ParameterAttribute|False
:type input_proj_bias_attr: ParameterAttribute|False|None
:param input_proj_layer_attr: extra layer attribute for input to hidden
projection of the LSTM unit, such as dropout, error clipping.
:type input_proj_layer_attr: ExtraLayerAttribute
:param lstm_bias_attr: bias parameter attribute of lstm layer.
False means no bias, None means default bias.
:type lstm_bias_attr: ParameterAttribute|False
:param mixed_layer_attr: mixed layer's extra attribute.
:type mixed_layer_attr: ExtraLayerAttribute
:param lstm_layer_attr: lstm layer's extra attribute.
:type lstm_layer_attr: ExtraLayerAttribute
:param get_output_layer_attr: get output layer's extra attribute.
:type get_output_layer_attr: ExtraLayerAttribute
:return: lstmemory unit name.
:rtype: LayerOutput
"""
if size is None:
assert input.size % 4 == 0
size = input.size / 4
if out_memory is None:
out_mem = memory(name=name, size=size)
state_mem = memory(
name="%s_state" % name, size=size, boot_layer=memory_boot)
else:
out_mem = out_memory
state_mem = memory(name="%s_state" % name, size=size)
with mixed_layer(
name="%s_input_recurrent" % name,
size=size * 4,
bias_attr=mixed_bias_attr,
layer_attr=mixed_layer_attr,
bias_attr=input_proj_bias_attr,
layer_attr=input_proj_layer_attr,
act=IdentityActivation()) as m:
m += identity_projection(input=input)
m += full_matrix_projection(input=out_mem, param_attr=param_attr)
......@@ -717,11 +718,7 @@ def lstmemory_unit(input,
gate_act=gate_act,
state_act=state_act,
layer_attr=lstm_layer_attr)
get_output_layer(
name='%s_state' % name,
input=lstm_out,
arg_name='state',
layer_attr=get_output_layer_attr)
get_output_layer(name='%s_state' % name, input=lstm_out, arg_name='state')
return lstm_out
......@@ -730,17 +727,16 @@ def lstmemory_unit(input,
def lstmemory_group(input,
size=None,
name=None,
memory_boot=None,
out_memory=None,
reverse=False,
param_attr=None,
act=None,
gate_act=None,
state_act=None,
mixed_bias_attr=None,
input_proj_bias_attr=None,
input_proj_layer_attr=None,
lstm_bias_attr=None,
mixed_layer_attr=None,
lstm_layer_attr=None,
get_output_layer_attr=None):
lstm_layer_attr=None):
"""
lstm_group is a recurrent_group version of Long Short Term Memory. It
does exactly the same calculation as the lstmemory layer (see lstmemory in
......@@ -774,8 +770,8 @@ def lstmemory_group(input,
:type size: int
:param name: name of the lstmemory group.
:type name: basestring
:param memory_boot: the initialization state of LSTM cell.
:type memory_boot: LayerOutput | None
:param out_memory: output of previous time step
:type out_memory: LayerOutput | None
:param reverse: is lstm reversed
:type reverse: bool
:param param_attr: Parameter config, None if use default.
......@@ -786,18 +782,17 @@ def lstmemory_group(input,
:type gate_act: BaseActivation
:param state_act: lstm state activiation type.
:type state_act: BaseActivation
:param mixed_bias_attr: bias parameter attribute of mixed layer.
False means no bias, None means default bias.
:type mixed_bias_attr: ParameterAttribute|False
:param lstm_bias_attr: bias parameter attribute of lstm layer.
False means no bias, None means default bias.
:type lstm_bias_attr: ParameterAttribute|False
:param mixed_layer_attr: mixed layer's extra attribute.
:type mixed_layer_attr: ExtraLayerAttribute
:param input_proj_bias_attr: bias attribute for input-to-hidden projection.
False means no bias, None means default bias.
:type input_proj_bias_attr: ParameterAttribute|False|None
:param input_proj_layer_attr: extra layer attribute for input to hidden
projection of the LSTM unit, such as dropout, error clipping.
:type input_proj_layer_attr: ExtraLayerAttribute
:param lstm_layer_attr: lstm layer's extra attribute.
:type lstm_layer_attr: ExtraLayerAttribute
:param get_output_layer_attr: get output layer's extra attribute.
:type get_output_layer_attr: ExtraLayerAttribute
:return: the lstmemory group.
:rtype: LayerOutput
"""
......@@ -805,18 +800,17 @@ def lstmemory_group(input,
def __lstm_step__(ipt):
return lstmemory_unit(
input=ipt,
memory_boot=memory_boot,
name=name,
size=size,
mixed_bias_attr=mixed_bias_attr,
mixed_layer_attr=mixed_layer_attr,
param_attr=param_attr,
lstm_bias_attr=lstm_bias_attr,
act=act,
gate_act=gate_act,
state_act=state_act,
out_memory=out_memory,
input_proj_bias_attr=input_proj_bias_attr,
input_proj_layer_attr=input_proj_layer_attr,
param_attr=param_attr,
lstm_layer_attr=lstm_layer_attr,
get_output_layer_attr=get_output_layer_attr)
lstm_bias_attr=lstm_bias_attr)
return recurrent_group(
name='%s_recurrent_group' % name,
......
......@@ -104,7 +104,7 @@ layers {
}
bias_parameter_name: "lstm_bias"
active_gate_type: "sigmoid"
active_state_type: "sigmoid"
active_state_type: "tanh"
}
layers {
name: "__lstm_group_0___state@__lstm_group_0___recurrent_group"
......@@ -183,7 +183,7 @@ layers {
}
bias_parameter_name: "lstm_bias"
active_gate_type: "sigmoid"
active_state_type: "sigmoid"
active_state_type: "tanh"
}
layers {
name: "__lstm_group_1___state@__lstm_group_1___recurrent_group"
......
......@@ -258,7 +258,7 @@ layers {
}
bias_parameter_name: "___lstm_group_0__@__lstm_group_0___recurrent_group.wbias"
active_gate_type: "sigmoid"
active_state_type: "sigmoid"
active_state_type: "tanh"
}
layers {
name: "__lstm_group_0___state@__lstm_group_0___recurrent_group"
......
......@@ -20,12 +20,13 @@ lstm1 = lstmemory_group(
input=m1,
param_attr=lstm_param,
lstm_bias_attr=lstm_bias,
mixed_bias_attr=False)
input_proj_bias_attr=False)
lstm2 = lstmemory_group(
input=m2,
param_attr=lstm_param,
lstm_bias_attr=lstm_bias,
mixed_bias_attr=False)
input_proj_bias_attr=False)
softmax_param = ParamAttr(name='softmax_param')
......
......@@ -116,7 +116,7 @@ def reader_creator(data_file,
data = batch['data']
labels = batch['label']
for sample, label in itertools.izip(data, batch['label']):
yield sample, int(label)
yield sample, int(label) - 1
if use_xmap:
return xmap_readers(mapper, reader, cpu_count(), buffered_size)
......
add_python_test(test_framework test_protobuf.py test_scope.py
test_default_scope_funcs.py test_op_creation_methods.py
test_tensor.py test_fc_op.py test_add_two_op.py test_sgd_op.py)
add_python_test(test_framework
test_protobuf.py
test_scope.py
test_default_scope_funcs.py
test_op_creation_methods.py
test_plain_net.py
test_tensor.py
test_fc_op.py
test_add_two_op.py
test_sgd_op.py
test_cross_entropy_op.py
test_mul_op.py
test_sigmoid_op.py
test_softmax_op.py
test_rowwise_add_op.py)
......@@ -56,7 +56,10 @@ class OpTestMeta(type):
for out_name in func.all_output_args:
actual = numpy.array(scope.get_var(out_name).get_tensor())
expect = getattr(self, out_name)
numpy.testing.assert_almost_equal(actual, expect)
# TODO(qijun) The default decimal is 7, but numpy.dot and eigen.mul
# has some diff, and could not pass unittest. So I set decimal 3 here.
# And I will check this in future.
numpy.testing.assert_almost_equal(actual, expect, decimal=3)
obj.test_all = test_all
return obj
import unittest
import numpy
from op_test_util import OpTestMeta
class TestSGD(unittest.TestCase):
__metaclass__ = OpTestMeta
def setUp(self):
self.type = "onehot_cross_entropy"
batch_size = 100
class_num = 10
self.X = numpy.random.random((batch_size, class_num)).astype("float32")
self.label = 5 * numpy.ones(batch_size).astype("int32")
Y = []
for i in range(0, batch_size):
Y.append(-numpy.log(self.X[i][self.label[i]]))
self.Y = numpy.array(Y).astype("float32")
if __name__ == "__main__":
unittest.main()
import unittest
from op_test_util import OpTestMeta
import numpy as np
class TestMulOp(unittest.TestCase):
__metaclass__ = OpTestMeta
def setUp(self):
self.type = "mul"
self.X = np.random.random((32, 784)).astype("float32")
self.Y = np.random.random((784, 100)).astype("float32")
self.Out = np.dot(self.X, self.Y)
if __name__ == '__main__':
unittest.main()
import paddle.v2.framework.core as core
from paddle.v2.framework.create_op_creation_methods import op_creations
import unittest
class TestNet(unittest.TestCase):
def test_net_all(self):
net = core.PlainNet.create()
op1 = op_creations.add_two(X="X", Y="Y", Out="Out")
net.add_op(op1)
net2 = core.PlainNet.create()
net2.add_op(op_creations.fc(X="X", W="w", Y="fc.out"))
net2.complete_add_op(True)
net.add_op(net2)
net.complete_add_op(True)
expected = '''
Op(plain_net), inputs:(@EMPTY@, X, Y, w), outputs:(@TEMP@fc@0, Out, fc.out).
Op(add_two), inputs:(X, Y), outputs:(Out).
Op(plain_net), inputs:(@EMPTY@, X, w), outputs:(@TEMP@fc@0, fc.out).
Op(fc), inputs:(X, w, @EMPTY@), outputs:(fc.out, @TEMP@fc@0).
Op(mul), inputs:(X, w), outputs:(@TEMP@fc@0).
Op(sigmoid), inputs:(@TEMP@fc@0), outputs:(fc.out).
'''
self.assertEqual(expected, "\n" + str(net))
if __name__ == '__main__':
unittest.main()
import unittest
from op_test_util import OpTestMeta
import numpy as np
class TestRowwiseAddOp(unittest.TestCase):
__metaclass__ = OpTestMeta
def setUp(self):
self.type = "rowwise_add"
self.X = np.random.random((32, 784)).astype("float32")
self.b = np.random.random(784).astype("float32")
self.Out = np.add(self.X, self.b)
if __name__ == '__main__':
unittest.main()
import unittest
from op_test_util import OpTestMeta
import numpy as np
class TestSigmoidOp(unittest.TestCase):
__metaclass__ = OpTestMeta
def setUp(self):
self.type = "sigmoid"
self.X = np.random.random((32, 100)).astype("float32")
self.Y = 1 / (1 + np.exp(-self.X))
if __name__ == '__main__':
unittest.main()
import unittest
from op_test_util import OpTestMeta
import numpy as np
def stable_softmax(x):
"""Compute the softmax of vector x in a numerically stable way."""
shiftx = x - np.max(x)
exps = np.exp(shiftx)
return exps / np.sum(exps)
class TestSoftmaxOp(unittest.TestCase):
__metaclass__ = OpTestMeta
def setUp(self):
self.type = "softmax"
self.X = np.random.random((32, 100)).astype("float32")
self.Y = np.apply_along_axis(stable_softmax, 1, self.X)
if __name__ == '__main__':
unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册