提交 2523410e 编写于 作者: D dangqingqing

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into op_debug

......@@ -36,8 +36,7 @@ include(simd)
################################ Configurations #######################################
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ${AVX_FOUND})
option(WITH_MKLML "Compile PaddlePaddle with mklml package." ${AVX_FOUND})
option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON)
option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
......@@ -82,10 +81,8 @@ if(ANDROID OR IOS)
"Disable PYTHON when cross-compiling for Android and iOS" FORCE)
set(WITH_RDMA OFF CACHE STRING
"Disable RDMA when cross-compiling for Android and iOS" FORCE)
set(WITH_MKLDNN OFF CACHE STRING
"Disable MKLDNN when cross-compiling for Android and iOS" FORCE)
set(WITH_MKLML OFF CACHE STRING
"Disable MKLML package when cross-compiling for Android and iOS" FORCE)
set(WITH_MKL OFF CACHE STRING
"Disable MKL when cross-compiling for Android and iOS" FORCE)
# Compile PaddlePaddle mobile inference library
if (NOT WITH_C_API)
......@@ -111,6 +108,14 @@ else()
set(THIRD_PARTY_BUILD_TYPE Release)
endif()
set(WITH_MKLML ${WITH_MKL})
if (WITH_MKL AND AVX2_FOUND)
set(WITH_MKLDNN ON)
else()
message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN")
set(WITH_MKLDNN OFF)
endif()
########################################################################################
include(external/mklml) # download mklml package
......@@ -158,14 +163,15 @@ set(EXTERNAL_LIBS
)
if(WITH_GPU)
list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO)
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
endif(NOT WITH_DSO)
include(cuda)
endif(WITH_GPU)
if(WITH_MKLML)
list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
endif()
if(WITH_MKLDNN)
list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB} ${MKLDNN_IOMP_LIB})
list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
endif()
if(USE_NNPACK)
......
......@@ -12,11 +12,11 @@ Machine:
System: CentOS release 6.3 (Final), Docker 1.12.1.
PaddlePaddle: paddlepaddle/paddle:latest (TODO: will rerun after 0.11.0)
- MKL-DNN tag v0.10
- MKLML 2018.0.20170720
PaddlePaddle: paddlepaddle/paddle:latest (for MKLML and MKL-DNN), paddlepaddle/paddle:latest-openblas (for OpenBLAS)
- MKL-DNN tag v0.11
- MKLML 2018.0.1.20171007
- OpenBLAS v0.2.20
(TODO: will rerun after 0.11.0)
On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively.
......@@ -31,17 +31,37 @@ Input image size - 3 * 224 * 224, Time: images/second
| BatchSize | 64 | 128 | 256 |
|--------------|-------| -----| --------|
| OpenBLAS | 7.82 | 8.62 | 10.34 |
| MKLML | 11.02 | 12.86 | 15.33 |
| MKL-DNN | 27.69 | 28.8 | 29.27 |
| OpenBLAS | 7.80 | 9.00 | 10.80 |
| MKLML | 12.12 | 13.70 | 16.18 |
| MKL-DNN | 28.46 | 29.83 | 30.44 |
chart on batch size 128
TBD
- ResNet-50
| BatchSize | 64 | 128 | 256 |
|--------------|-------| ------| -------|
| OpenBLAS | 25.22 | 25.68 | 27.12 |
| MKLML | 32.52 | 31.89 | 33.12 |
| MKL-DNN | 81.69 | 82.35 | 84.08 |
chart on batch size 128
TBD
- ResNet
- GoogLeNet
| BatchSize | 64 | 128 | 256 |
|--------------|-------| ------| -------|
| OpenBLAS | 89.52 | 96.97 | 108.25 |
| MKLML | 128.46| 137.89| 158.63 |
| MKL-DNN     | 250.46| 264.83| 269.50 |
chart on batch size 128
TBD
### Laptop
TBD
### Desktop
......
......@@ -5,6 +5,7 @@ height = 224
width = 224
num_class = 1000
batch_size = get_config_arg('batch_size', int, 128)
use_gpu = get_config_arg('use_gpu', bool, True)
args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
define_py_data_sources2(
......@@ -16,6 +17,8 @@ settings(
learning_method=MomentumOptimizer(0.9),
regularization=L2Regularization(0.0005 * batch_size))
conv_projection = conv_projection if use_gpu else img_conv_layer
def inception2(name, input, channels, \
filter1,
filter3R, filter3,
......@@ -138,7 +141,7 @@ def inception(name, input, channels, \
cat = concat_layer(
name=name,
input=[cov1, cov3, cov5, covprj],
bias_attr=True,
bias_attr=True if use_gpu else False,
act=ReluActivation())
return cat
......
set -e
function train() {
unset OMP_NUM_THREADS MKL_NUM_THREADS
export OMP_DYNAMIC="FALSE"
export KMP_AFFINITY="granularity=fine,compact,0,0"
unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
topology=$1
layer_num=$2
bs=$3
......@@ -14,8 +12,6 @@ function train() {
elif [ $4 == "False" ]; then
thread=`nproc`
# each trainer_count use only 1 core to avoid conflict
export OMP_NUM_THREADS=1
export MKL_NUM_THREADS=1
log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log"
else
echo "Wrong input $3, use True or False."
......@@ -44,6 +40,7 @@ fi
for use_mkldnn in True False; do
for batchsize in 64 128 256; do
train vgg 19 $batchsize $use_mkldnn
train resnet 50 $batchsize $use_mkldnn
train resnet 50 $batchsize $use_mkldnn
train googlenet v1 $batchsize $use_mkldnn
done
done
......@@ -76,27 +76,14 @@ else()
include_directories(${CUDA_TOOLKIT_INCLUDE})
endif(NOT WITH_GPU)
if(WITH_MKLDNN)
add_definitions(-DPADDLE_USE_MKLDNN)
if (WITH_MKLML AND MKLDNN_IOMP_DIR)
message(STATUS "Enable Intel OpenMP at ${MKLDNN_IOMP_DIR}")
set(OPENMP_FLAGS "-fopenmp")
set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
else()
find_package(OpenMP)
if(OPENMP_FOUND)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
else()
message(WARNING "Can not find OpenMP."
"Some performance features in MKLDNN may not be available")
endif()
endif()
endif(WITH_MKLDNN)
if (WITH_MKLML AND MKLML_IOMP_LIB)
message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
set(OPENMP_FLAGS "-fopenmp")
set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
endif()
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
......
......@@ -76,11 +76,9 @@ set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform")
# Set the architecture for iOS
if(NOT DEFINED IOS_ARCH)
if(IOS_PLATFORM STREQUAL "OS")
# FIXME(liuyiqun): support "armv7;armv7s;arm64" future
set(IOS_ARCH "arm64")
set(IOS_ARCH "armv7;armv7s;arm64")
elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
# FIXME(liuyiqun): support "i386;x86_64" future
set(IOS_ARCH "x86_64")
set(IOS_ARCH "i386;x86_64")
endif()
endif()
set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS")
......@@ -248,7 +246,7 @@ set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_
# Hidden visibilty is required for cxx on iOS
set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags")
set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first")
......
if(NOT WITH_GPU)
return()
endif()
set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs7 "30 35 50 52")
set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
######################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled)
# Usage:
# detect_installed_gpus(out_variable)
function(detect_installed_gpus out_variable)
if(NOT CUDA_gpu_detect_output)
set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)
file(WRITE ${cufile} ""
"#include <cstdio>\n"
"int main() {\n"
" int count = 0;\n"
" if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
" if (count == 0) return -1;\n"
" for (int device = 0; device < count; ++device) {\n"
" cudaDeviceProp prop;\n"
" if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
" std::printf(\"%d.%d \", prop.major, prop.minor);\n"
" }\n"
" return 0;\n"
"}\n")
execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin=${CUDA_HOST_COMPILER}"
"--run" "${cufile}"
WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(nvcc_res EQUAL 0)
# only keep the last line of nvcc_out
STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
list(GET nvcc_out -1 nvcc_out)
string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architetures from detect_installed_gpus tool" FORCE)
endif()
endif()
if(NOT CUDA_gpu_detect_output)
message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
else()
set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
endif()
endfunction()
########################################################################
# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME
# Usage:
# select_nvcc_arch_flags(out_variable)
function(select_nvcc_arch_flags out_variable)
# List of arch names
set(archs_names "Kepler" "Maxwell" "Pascal" "All" "Manual")
set(archs_name_default "All")
if(NOT CMAKE_CROSSCOMPILING)
list(APPEND archs_names "Auto")
endif()
# set CUDA_ARCH_NAME strings (so it will be seen as dropbox in CMake-Gui)
set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU achitecture.")
set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${archs_names} )
mark_as_advanced(CUDA_ARCH_NAME)
# verify CUDA_ARCH_NAME value
if(NOT ";${archs_names};" MATCHES ";${CUDA_ARCH_NAME};")
string(REPLACE ";" ", " archs_names "${archs_names}")
message(FATAL_ERROR "Only ${archs_names} architeture names are supported.")
endif()
if(${CUDA_ARCH_NAME} STREQUAL "Manual")
set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
else()
unset(CUDA_ARCH_BIN CACHE)
unset(CUDA_ARCH_PTX CACHE)
endif()
if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
set(cuda_arch_bin "30 35")
elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
set(cuda_arch_bin "50")
elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
set(cuda_arch_bin "60 61")
elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
set(cuda_arch_bin "70")
elseif(${CUDA_ARCH_NAME} STREQUAL "All")
set(cuda_arch_bin ${paddle_known_gpu_archs})
elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
detect_installed_gpus(cuda_arch_bin)
else() # (${CUDA_ARCH_NAME} STREQUAL "Manual")
set(cuda_arch_bin ${CUDA_ARCH_BIN})
endif()
# remove dots and convert to lists
string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}")
string(REGEX REPLACE "\\." "" cuda_arch_ptx "${CUDA_ARCH_PTX}")
string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}")
string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}")
list(REMOVE_DUPLICATES cuda_arch_bin)
list(REMOVE_DUPLICATES cuda_arch_ptx)
set(nvcc_flags "")
set(nvcc_archs_readable "")
# Tell NVCC to add binaries for the specified GPUs
foreach(arch ${cuda_arch_bin})
if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
# User explicitly specified PTX for the concrete BIN
list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1})
else()
# User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch})
list(APPEND nvcc_archs_readable sm_${arch})
endif()
endforeach()
# Tell NVCC to add PTX intermediate code for the specified architectures
foreach(arch ${cuda_arch_ptx})
list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch})
list(APPEND nvcc_archs_readable compute_${arch})
endforeach()
string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}")
set(${out_variable} ${nvcc_flags} PARENT_SCOPE)
set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE)
endfunction()
message(STATUS "CUDA detected: " ${CUDA_VERSION})
if (${CUDA_VERSION} LESS 7.0)
set(paddle_known_gpu_archs ${paddle_known_gpu_archs})
elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
# CUDA 8 may complain that sm_20 is no longer supported. Suppress the
# warning for now.
list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
endif()
include_directories(${CUDA_INCLUDE_DIRS})
list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO)
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
endif(NOT WITH_DSO)
# setting nvcc arch flags
select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}")
# Set C++11 support
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here.
list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
# Set :expt-relaxed-constexpr to suppress Eigen warnings
list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL})
endif()
mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
......@@ -40,10 +40,9 @@ INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR})
IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
SET(MKLDNN_DEPENDS ${MKLML_PROJECT})
SET(MKLDNN_MKLROOT ${MKLML_ROOT})
SET(MKLDNN_IOMP_LIB ${MKLML_IOMP_LIB})
SET(MKLDNN_IOMP_DIR ${MKLML_LIB_DIR})
MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}")
MESSAGE(STATUS "Build MKLDNN with MKLML ${MKLML_ROOT}")
ELSE()
MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
ENDIF()
SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow")
......@@ -57,15 +56,16 @@ ExternalProject_Add(
PREFIX ${MKLDNN_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
CMAKE_ARGS -DMKLROOT=${MKLDNN_MKLROOT}
CMAKE_ARGS -DMKLROOT=${MKLML_ROOT}
CMAKE_ARGS -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
CMAKE_ARGS -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
-DMKLROOT:PATH=${MKLDNN_MKLROOT}
-DMKLROOT:PATH=${MKLML_ROOT}
)
ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIB}")
MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
add_definitions(-DPADDLE_USE_MKLDNN)
LIST(APPEND external_project_dependencies mkldnn)
......@@ -29,7 +29,7 @@ IF(NOT ${CBLAS_FOUND})
"${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
CACHE FILEPATH "openblas library." FORCE)
SET(OPENBLAS_CC "${CMAKE_C_COMPILER}")
SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
IF(CMAKE_CROSSCOMPILING)
SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER})
......@@ -45,15 +45,14 @@ IF(NOT ${CBLAS_FOUND})
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
ENDIF()
ELSEIF(IOS)
# FIXME(liuyiqun): support multiple architectures
SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
IF(CMAKE_OSX_ARCHITECTURES MATCHES "armv7")
SET(OPENBLAS_CC "${OPENBLAS_CC} -arch armv7")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
ELSEIF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
ELSE()
MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. "
"You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.")
ENDIF()
ELSEIF(RPI)
# use hardfp
......
......@@ -12,6 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
IF(MOBILE_INFERENCE)
return()
ENDIF()
INCLUDE(ExternalProject)
SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc)
......
......@@ -149,58 +149,3 @@ endforeach()
foreach(flag ${GPU_COMMON_FLAGS})
safe_set_nvflag(${flag})
endforeach()
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here.
LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL})
endif()
function(specify_cuda_arch cuda_version cuda_arch)
if(${cuda_version} VERSION_GREATER "8.0")
foreach(capability 61 62)
if(${cuda_arch} STREQUAL ${capability})
list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
endif()
endforeach()
elseif(${cuda_version} VERSION_GREATER "7.0" and ${cuda_arch} STREQUAL "53")
list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
endif()
endfunction()
# Common gpu architectures: Kepler, Maxwell
foreach(capability 30 35 50)
list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}")
endforeach()
if (CUDA_VERSION VERSION_GREATER "7.0" OR CUDA_VERSION VERSION_EQUAL "7.0")
list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52")
endif()
# Modern gpu architectures: Pascal
if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0")
list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60")
list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr)
endif()
# Custom gpu architecture
set(CUDA_ARCH)
if(CUDA_ARCH)
specify_cuda_arch(${CUDA_VERSION} ${CUDA_ARCH})
endif()
set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS})
......@@ -115,8 +115,8 @@ function(link_paddle_exe TARGET_NAME)
target_link_libraries(${TARGET_NAME} log)
endif(ANDROID)
if(WITH_MKLDNN AND WITH_MKLML AND MKLDNN_IOMP_DIR)
target_link_libraries(${TARGET_NAME} "-L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
if(WITH_MKLML AND MKLML_LIB_DIR AND MKLML_IOMP_LIB)
target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
endif()
add_dependencies(${TARGET_NAME} ${external_project_dependencies})
......
......@@ -335,6 +335,16 @@ bilinear_interp
.. autoclass:: paddle.v2.layer.bilinear_interp
:noindex:
dot_prod
---------
.. autoclass:: paddle.v2.layer.dot_prod
:noindex:
out_prod
--------
.. autoclass:: paddle.v2.layer.out_prod
:noindex:
power
-----
.. autoclass:: paddle.v2.layer.power
......@@ -372,6 +382,11 @@ cos_sim
.. autoclass:: paddle.v2.layer.cos_sim
:noindex:
l2_distance
-----------
.. autoclass:: paddle.v2.layer.l2_distance
:noindex:
trans
-----
.. autoclass:: paddle.v2.layer.trans
......
......@@ -36,13 +36,13 @@ Figure 1. PaddlePaddle on IA.
我们把集成方案大致分为了如下几个方面。
### CMake
我们会在`CMakeLists.txt`中会添加`WITH_MKLDNN`的选项,当设置这个值为`ON`的时候会启用编译MKL-DNN功能。同时会自动开启OpenMP用于提高MKL-DNN的性能
我们会在`CMakeLists.txt`中会给用户添加一个`WITH_MKL`的开关,他是负责`WITH_MKLML``WITH_MKLDNN`的总开关
同时,我们会引入`WITH_MKLML`选项,用于选择是否使用MKL-DNN自带的MKLML安装包。这个安装包可以独立于MKL-DNN使用,但是建议在开启MKL-DNN的同时也打开MKLML的开关,这样才能发挥最好的性能。
当打开`WITH_MKL`时,会开启MKLML的功能,作为PaddlePaddle的CBLAS和LAPACK库,同时会开启Intel OpenMP用于提高MKLML的性能。 如果系统支持AVX2指令集及以上,同时会开启MKL-DNN功能。
所以,我们会在`cmake/external`目录新建`mkldnn.cmake``mklml.cmake`文件,它们会在编译PaddlePaddle的时候下载对应的软件包,并放到PaddlePaddle的third party目录中
当关闭`WITH_MKL`时,MKLML和MKL-DNN功能会同时关闭
**备注**:当`WITH_MKLML=ON`的时候,会优先使用这个包作为PaddlePaddle的CBLAS和LAPACK库,所以会稍微改动`cmake/cblas.cmake`中的逻辑
所以,我们会在`cmake/external`目录新建`mkldnn.cmake``mklml.cmake`文件,它们会在编译PaddlePaddle的时候下载对应的软件包,并放到PaddlePaddle的third party目录中
### Layers
所有MKL-DNN相关的C++ layers,都会按照PaddlePaddle的目录结构存放在
......
# Python Data Reader Design Doc
At training and testing time, PaddlePaddle programs need to read data. To ease the users' work to write data reading code, we define that
During the training and testing phases, PaddlePaddle programs need to read data. To help the users write code that performs reading input data, we define the following:
- A *reader* is a function that reads data (from file, network, random number generator, etc) and yields data items.
- A *reader creator* is a function that returns a reader function.
- A *reader decorator* is a function, which accepts one or more readers, and returns a reader.
- A *batch reader* is a function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items.
- A *reader*: A function that reads data (from file, network, random number generator, etc) and yields the data items.
- A *reader creator*: A function that returns a reader function.
- A *reader decorator*: A function, which takes in one or more readers, and returns a reader.
- A *batch reader*: A function that reads data (from *reader*, file, network, random number generator, etc) and yields a batch of data items.
and provide function which converts reader to batch reader, frequently used reader creators and reader decorators.
and also provide a function which can convert a reader to a batch reader, frequently used reader creators and reader decorators.
## Data Reader Interface
Indeed, *data reader* doesn't have to be a function that reads and yields data items. It can be any function with no parameter that creates a iterable (anything can be used in `for x in iterable`):
*Data reader* doesn't have to be a function that reads and yields data items. It can just be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`) as follows:
```
iterable = data_reader()
```
Element produced from the iterable should be a **single** entry of data, **not** a mini batch. That entry of data could be a single item, or a tuple of items. Item should be of [supported type](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int)
The item produced from the iterable should be a **single** entry of data and **not** a mini batch. The entry of data could be a single item or a tuple of items. Item should be of one of the [supported types](http://www.paddlepaddle.org/doc/ui/data_provider/pydataprovider2.html?highlight=dense_vector#input-types) (e.g., numpy 1d array of float32, int, list of int etc.)
An example implementation for single item data reader creator:
An example implementation for single item data reader creator is as follows:
```python
def reader_creator_random_image(width, height):
......@@ -29,7 +29,7 @@ def reader_creator_random_image(width, height):
return reader
```
An example implementation for multiple item data reader creator:
An example implementation for multiple item data reader creator is as follows:
```python
def reader_creator_random_image_and_label(width, height, label):
def reader():
......@@ -40,9 +40,10 @@ def reader_creator_random_image_and_label(width, height, label):
## Batch Reader Interface
*batch reader* can be any function with no parameter that creates a iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list must be a tuple.
*Batch reader* can be any function without any parameters that creates an iterable (anything can be used in `for x in iterable`). The output of the iterable should be a batch (list) of data items. Each item inside the list should be a tuple.
Here are some valid outputs:
Here are valid outputs:
```python
# a mini batch of three data items. Each data item consist three columns of data, each of which is 1.
[(1, 1, 1),
......@@ -58,20 +59,22 @@ Here are valid outputs:
Please note that each item inside the list must be a tuple, below is an invalid output:
```python
# wrong, [1,1,1] needs to be inside a tuple: ([1,1,1],).
# Otherwise it's ambiguous whether [1,1,1] means a single column of data [1, 1, 1],
# or three column of datas, each of which is 1.
# Otherwise it is ambiguous whether [1,1,1] means a single column of data [1, 1, 1],
# or three columns of data, each of which is 1.
[[1,1,1],
[2,2,2],
[3,3,3]]
```
It's easy to convert from reader to batch reader:
It is easy to convert from a reader to a batch reader:
```python
mnist_train = paddle.dataset.mnist.train()
mnist_train_batch_reader = paddle.batch(mnist_train, 128)
```
Also easy to create custom batch reader:
It is also straight forward to create a custom batch reader:
```python
def custom_batch_reader():
while True:
......@@ -85,7 +88,8 @@ mnist_random_image_batch_reader = custom_batch_reader
## Usage
batch reader, mapping from item(s) read to data layer, batch size and number of total pass will be passed into `paddle.train`:
Following is how we can use the reader with PaddlePaddle:
The batch reader, a mapping from item(s) to data layer, the batch size and the number of total passes will be passed into `paddle.train` as follows:
```python
# two data layer is created:
......@@ -99,13 +103,13 @@ paddle.train(batch_reader, {"image":0, "label":1}, 128, 10, ...)
## Data Reader Decorator
*Data reader decorator* takes a single or multiple data reader, returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` syntax.
The *Data reader decorator* takes in a single reader or multiple data readers and returns a new data reader. It is similar to a [python decorator](https://wiki.python.org/moin/PythonDecorators), but it does not use `@` in the syntax.
Since we have a strict interface for data readers (no parameter, return a single data item). Data reader can be used flexiable via data reader decorators. Following are a few examples:
Since we have a strict interface for data readers (no parameters and return a single data item), a data reader can be used in a flexible way using data reader decorators. Following are a few examples:
### Prefetch Data
Since reading data may take time and training can not proceed without data. It is generally a good idea to prefetch data.
Since reading data may take some time and training can not proceed without data, it is generally a good idea to prefetch the data.
Use `paddle.reader.buffered` to prefetch data:
......@@ -117,9 +121,9 @@ buffered_reader = paddle.reader.buffered(paddle.dataset.mnist.train(), 100)
### Compose Multiple Data Readers
For example, we want to use a source of real images (reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661).
For example, if we want to use a source of real images (say reusing mnist dataset), and a source of random images as input for [Generative Adversarial Networks](https://arxiv.org/abs/1406.2661).
We can do:
We can do the following :
```python
def reader_creator_random_image(width, height):
......@@ -139,13 +143,13 @@ false_reader = reader_creator_bool(False)
reader = paddle.reader.compose(paddle.dataset.mnist.train(), data_reader_creator_random_image(20, 20), true_reader, false_reader)
# Skipped 1 because paddle.dataset.mnist.train() produces two items per data entry.
# And we don't care second item at this time.
# And we don't care about the second item at this time.
paddle.train(paddle.batch(reader, 128), {"true_image":0, "fake_image": 2, "true_label": 3, "false_label": 4}, ...)
```
### Shuffle
Given shuffle buffer size `n`, `paddle.reader.shuffle` will return a data reader that buffers `n` data entries and shuffle them before a data entry is read.
Given the shuffle buffer size `n`, `paddle.reader.shuffle` returns a data reader that buffers `n` data entries and shuffles them before a data entry is read.
Example:
```python
......@@ -154,21 +158,21 @@ reader = paddle.reader.shuffle(paddle.dataset.mnist.train(), 512)
## Q & A
### Why reader return only a single entry, but not a mini batch?
### Why does a reader return only a single entry, and not a mini batch?
Always returning a single entry make reusing existing data readers much easier (e.g., if existing reader return not a single entry but 3 entries, training code will be more complex because it need to handle cases like batch size 2).
Returning a single entry makes reusing existing data readers much easier (for example, if an existing reader returns 3 entries instead if a single entry, the training code will be more complicated because it need to handle cases like a batch size 2).
We provide function `paddle.batch` to turn (single entry) reader into batch reader.
We provide a function: `paddle.batch` to turn (a single entry) reader into a batch reader.
### Why do we need batch reader, isn't train take reader and batch_size as arguments sufficient?
### Why do we need a batch reader, isn't is sufficient to give the reader and batch_size as arguments during training ?
In most of the case, train taking reader and batch_size as arguments would be sufficent. However sometimes user want to customize order of data entries inside a mini batch. Or even change batch size dynamically.
In most of the cases, it would be sufficient to give the reader and batch_size as arguments to the train method. However sometimes the user wants to customize the order of data entries inside a mini batch, or even change the batch size dynamically. For these cases using a batch reader is very efficient and helpful.
### Why use a dictionary but not a list to provide mapping?
### Why use a dictionary instead of a list to provide mapping?
We decided to use dictionary (`{"image":0, "label":1}`) instead of list (`["image", "label"]`) is because that user can easily resue item (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or skip item (e.g., using `{"image_a":0, "label":2}`).
Using a dictionary (`{"image":0, "label":1}`) instead of a list (`["image", "label"]`) gives the advantage that the user can easily reuse the items (e.g., using `{"image_a":0, "image_b":0, "label":1}`) or even skip an item (e.g., using `{"image_a":0, "label":2}`).
### How to create custom data reader creator
### How to create a custom data reader creator ?
```python
def image_reader_creator(image_path, label_path, n):
......@@ -192,7 +196,7 @@ paddle.train(paddle.batch(reader, 128), {"image":0, "label":1}, ...)
### How is `paddle.train` implemented
An example implementation of paddle.train could be:
An example implementation of paddle.train is:
```python
def train(batch_reader, mapping, batch_size, total_pass):
......
......@@ -34,7 +34,7 @@ PaddlePaddle的文档构建有两种方式。
cd TO_YOUR_PADDLE_CLONE_PATH
mkdir -p build
cd build
cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON
cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
make gen_proto_py
make paddle_docs paddle_docs_cn
......
# 构建Android平台上的PaddlePaddle库
# Android平台编译指南
用户可通过如下两种方式,交叉编译Android平台上适用的PaddlePaddle库:
- 基于Docker容器的编译方式
......
# 构建iOS平台上的PaddlePaddle库
# iOS平台编译指南
交叉编译iOS平台上适用的PaddlePaddle库,需要在MacOS系统上进行。本文的将介绍在MacOS上,从源码交叉编译iOS平台上适用的PaddlePaddle库。
## 准备交叉编译环境
......@@ -25,7 +25,7 @@ iOS平台可选配置参数:
- `IOS_PLATFORM`,可设置为`OS/SIMULATOR`,默认值为`OS`
- `OS`,构建目标为`arm`架构的iPhone或者iPad等物理设备。
- `SIMULATOR`,构建目标为`x86`架构的模拟器平台。
- `IOS_ARCH`,目标架构。针对不同的`IOS_PLATFORM`,可设置的目标架构如下表所示:
- `IOS_ARCH`,目标架构。针对不同的`IOS_PLATFORM`,可设置的目标架构如下表所示,默认编译所有架构
<table class="docutils">
<colgroup>
......@@ -41,11 +41,11 @@ iOS平台可选配置参数:
<tbody valign="top">
<tr class="row-even">
<td>OS</td>
<td>armv7, armv7s, arm64 (默认)</td>
<td>armv7, armv7s, arm64 </td>
</tr>
<tr class="row-odd">
<td>SIMULATOR</td>
<td>i386, x86_64 (默认)</td>
<td>i386, x86_64 </td>
</tr>
</tbody>
</table>
......@@ -66,7 +66,7 @@ iOS平台可选配置参数:
```bash
cmake -DCMAKE_SYSTEM_NAME=iOS \
-DIOS_PLATFORM=OS \
-DIOS_ARCH="arm64" \
-DIOS_ARCH="armv7;arm64" \
-DIOS_ENABLE_BITCODE=ON \
-DIOS_USE_VECLIB_FOR_BLAS=ON \
-DCMAKE_INSTALL_PREFIX=your/path/to/install \
......@@ -112,6 +112,6 @@ $ make install
- `lib`目录,其中包含PaddlePaddle的C-API静态库
- `third_party`目录,其中包含所依赖的所有第三方库
注意,不同架构的PaddlePaddle库建议安装到不同的目录下,然后使用`lipo`工具将多个静态库合并成一个支持多个架构的fat库。
注意,如果PaddlePaddle库需要同时支持真机和模拟器,则需要分别编译真机和模拟器版本,然后使用`lipo`工具合并fat库。
自此,PaddlePaddle库已经安装完成,用户可将合成的fat库用于深度学习相关的iOS App中,调用方法见C-API文档。
# 构建Raspberry Pi平台上的PaddlePaddle库
# Raspberry Pi平台编译指南
通常有两个方法来构建基于 Rasspberry Pi 的版本:
......
......@@ -29,6 +29,9 @@ static void initPaddle(int argc, char** argv) {
extern "C" {
paddle_error paddle_init(int argc, char** argv) {
static bool isInit = false;
if (isInit) return kPD_NO_ERROR;
std::vector<char*> realArgv;
realArgv.reserve(argc + 1);
realArgv.push_back(strdup(""));
......@@ -37,6 +40,7 @@ paddle_error paddle_init(int argc, char** argv) {
}
initPaddle(argc + 1, realArgv.data());
free(realArgv[0]);
isInit = true;
return kPD_NO_ERROR;
}
}
#include <paddle/capi.h>
#include <time.h>
#include "../common/common.h"
#define CONFIG_BIN "./trainer_config.bin"
......@@ -27,20 +28,19 @@ int main() {
CHECK(paddle_arguments_resize(in_args, 1));
// Create input matrix.
paddle_matrix mat = paddle_matrix_create(/* sample_num */ 10,
paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
/* size */ 784,
/* useGPU */ false);
srand(time(0));
std::vector<paddle_real> input;
input.resize(784 * 10);
paddle_real* array;
// Get First row.
CHECK(paddle_matrix_get_row(mat, 0, &array));
for (int i = 0; i < input.size(); ++i) {
input[i] = rand() / ((float)RAND_MAX);
for (int i = 0; i < 784; ++i) {
array[i] = rand() / ((float)RAND_MAX);
}
// Set value for the input matrix
CHECK(paddle_matrix_set_value(mat, input.data()));
CHECK(paddle_arguments_set_value(in_args, 0, mat));
......@@ -53,17 +53,18 @@ int main() {
CHECK(paddle_arguments_get_value(out_args, 0, prob));
std::std::vector<paddle_real> result;
int height;
int width;
uint64_t height;
uint64_t width;
CHECK(paddle_matrix_get_shape(prob, &height, &width);
result.resize(height * width);
CHECK(paddle_matrix_get_value(prob, result.data()));
CHECK(paddle_matrix_get_shape(prob, &height, &width));
CHECK(paddle_matrix_get_row(prob, 0, &array));
printf("Prob: ");
printf("Prob: \n");
for (int i = 0; i < height * width; ++i) {
printf("%.2f ", result[i]);
printf("%.4f ", array[i]);
if ((i + 1) % width == 0) {
printf("\n");
}
}
printf("\n");
......
......@@ -25,7 +25,9 @@ limitations under the License. */
#include "hl_matrix.h"
#include "hl_sequence.h"
#include "hl_sparse.h"
#ifndef PADDLE_MOBILE_INFERENCE
#include "hl_warpctc_wrap.h"
#endif
#ifdef HPPL_STUB_FUNC
#include "stub/hl_aggregate_stub.h"
......
......@@ -270,6 +270,19 @@ static bool AllGradInSet(const std::vector<std::string>& names,
return false;
}
}
if (VLOG_IS_ON(10)) {
std::ostringstream sout;
sout << "All input {";
for (auto& name : names) {
sout << name << ",";
}
sout << "} is in {";
for (auto& name : set) {
sout << name << ",";
}
sout << "}";
VLOG(10) << sout.str();
}
return true;
}
......@@ -290,14 +303,12 @@ static void CreateGradVarInBlock(
auto ops = block_desc->AllOps();
for (size_t op_index = grad_op_start_index; op_index < ops.size();
++op_index) {
bool need_infer_shape = false;
std::unordered_set<std::string> new_vars;
ForEachVarName(ops[op_index]->Outputs(),
[&](const std::string& grad_var_name) {
if (block_desc->HasVar(grad_var_name)) {
return false;
}
need_infer_shape = true;
auto var = block_desc->Var(grad_var_name);
new_vars.insert(var->Name());
auto it = param_name_map.find(grad_var_name);
......@@ -311,23 +322,21 @@ static void CreateGradVarInBlock(
grad_record.op_idx_ = static_cast<int>(op_index);
return false; /* not break */
});
if (need_infer_shape) {
ops[op_index]->InferVarType(block_desc);
for (auto& arg : ops[op_index]->OutputArgumentNames()) {
if (new_vars.find(arg) == new_vars.end()) {
continue;
}
auto pname = FwdName(arg);
auto* param = block_desc->FindVarRecursive(pname);
auto* grad = block_desc->FindVar(arg);
if (param == nullptr) {
grad->SetDataType(DataType::FP32);
} else {
grad->SetDataType(param->GetDataType());
}
ops[op_index]->InferVarType(block_desc);
for (auto& arg : ops[op_index]->OutputArgumentNames()) {
if (new_vars.find(arg) == new_vars.end()) {
continue;
}
auto pname = FwdName(arg);
auto* param = block_desc->FindVarRecursive(pname);
auto* grad = block_desc->FindVar(arg);
if (param == nullptr) {
grad->SetDataType(DataType::FP32);
} else {
grad->SetDataType(param->GetDataType());
}
ops[op_index]->InferShape(*block_desc);
}
ops[op_index]->InferShape(*block_desc);
}
}
......@@ -387,6 +396,7 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
ProgramDescBind& program_desc, int block_idx,
std::unordered_set<std::string>* no_grad_vars,
std::unordered_map<std::string, std::string>* grad_to_var) {
VLOG(5) << "MakeBlockBackward";
BlockDescBind* cur_block = program_desc.MutableBlock(block_idx);
std::vector<OpDescBind*> op_descs = cur_block->AllOps();
std::unordered_map<std::string, std::vector<size_t>> dup_out_ops;
......@@ -394,9 +404,10 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
std::vector<std::unique_ptr<OpDescBind>> backward_descs;
for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) {
VLOG(5) << "Making backward " << (*it)->Type() << " op";
std::vector<std::unique_ptr<OpDescBind>> op_grads;
if ((*it)->Type() == "recurrent") {
if ((*it)->Type() == "recurrent" || (*it)->Type() == "while") {
int step_block_idx = (*it)->GetBlockAttr("step_block");
BlockDescBind* backward_block = CreateStepBlock(
program_desc, no_grad_vars, grad_to_var, step_block_idx);
......@@ -410,6 +421,15 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var);
}
if (VLOG_IS_ON(10)) {
std::ostringstream sout;
sout << "Made ";
for (auto& op_grad : op_grads) {
sout << op_grad->Type() << " ";
}
VLOG(10) << sout.str();
}
for (const auto& desc : op_grads) {
for (const std::string& out_name : desc->OutputArgumentNames()) {
if (out_name.find("@GRAD") == std::string::npos) {
......@@ -425,6 +445,8 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
op_grads.begin(), op_grads.end(), std::back_inserter(backward_descs),
[](std::unique_ptr<OpDescBind>& ptr) { return std::move(ptr); });
}
VLOG(5) << "Appending Sums";
// Check whether some variables are written more than once
std::list<std::pair<size_t, std::unique_ptr<OpDescBind>>> pending_sum_ops;
for (const auto& dup : dup_out_ops) {
......@@ -432,16 +454,22 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
const std::vector<size_t> dup_op = dup.second;
if (out_name != kEmptyVarName && dup_op.size() > 1) {
std::vector<std::string> sum_op_inputs;
std::string next_g_name = out_name;
for (size_t i = 0; i < dup_op.size(); ++i) {
VLOG(10) << backward_descs[dup_op[i]]->Type() << " has " << out_name
<< " duplicated";
std::string new_name = out_name + "@RENAME@" + std::to_string(i);
backward_descs[dup_op[i]]->Rename(out_name, new_name);
backward_descs[dup_op[i]]->RenameOutput(out_name, new_name);
backward_descs[dup_op[i]]->RenameInput(out_name, next_g_name);
sum_op_inputs.emplace_back(new_name);
next_g_name = sum_op_inputs.back();
}
std::unique_ptr<OpDescBind> sum_op(new OpDescBind(
"sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {}));
pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
}
}
pending_sum_ops.sort(
[](const std::pair<size_t, std::unique_ptr<OpDescBind>>& a,
const std::pair<size_t, std::unique_ptr<OpDescBind>>& b) {
......@@ -452,6 +480,8 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
std::move(p.second));
}
VLOG(5) << "MakeBlockBackward Finished";
return backward_descs;
}
......@@ -483,19 +513,14 @@ ParamGradInfoMap AppendBackward(
const int root_block_idx = 0;
auto root_block = program_desc.MutableBlock(root_block_idx);
// insert fill one op for target
// TODO(qiao) add some check to the target.
std::string fill_one_op_out = GradVarName(target.Name());
std::vector<int64_t> target_shape_desc = target.Shape();
std::vector<int> target_shape;
std::transform(target_shape_desc.begin(), target_shape_desc.end(),
std::back_inserter(target_shape),
[](int64_t dim) { return static_cast<int>(dim); });
bool is_scalar = target.Shape() == std::vector<int64_t>{1};
PADDLE_ENFORCE(is_scalar, "target should be scalar");
VLOG(3) << "backward from loss=" << target.Name()
<< " data_type=" << target.GetDataType();
std::unique_ptr<OpDescBind> fill_one_op(
new OpDescBind("fill_constant", {}, {{"Out", {fill_one_op_out}}},
{{"shape", target_shape},
{{"shape", std::vector<int>{1}},
{"value", static_cast<float>(1.0)},
{"data_type", target.GetDataType()}}));
// infer var type of fill_one_op
......
......@@ -508,6 +508,7 @@ TEST(Backward, simple_single_op) {
op->SetOutput("Out", {"out"});
auto target = f::VarDescBind("out");
target.SetShape({1});
auto var_to_grad = AppendBackward(program, target, {});
ASSERT_EQ(block->AllOps().size(), 3UL);
......@@ -544,6 +545,7 @@ TEST(Backward, default_attribute) {
op->CheckAttrs();
auto target = f::VarDescBind("out");
target.SetShape({1});
AppendBackward(program, target, {});
ASSERT_EQ(block->AllOps().size(), 3UL);
......@@ -581,6 +583,7 @@ TEST(Backward, simple_mult_op) {
op3->SetOutput("Out", {"out3"});
auto target = f::VarDescBind("out3");
target.SetShape({1});
size_t forward_len = block->AllOps().size();
auto var_to_grad = AppendBackward(program, target, {});
......@@ -670,6 +673,7 @@ TEST(Backward, intermedia_var_no_grad) {
op4->SetOutput("Out", {"out4"});
auto target = f::VarDescBind("out4");
target.SetShape({1});
size_t forward_len = block->AllOps().size();
auto var_to_grad = AppendBackward(program, target, {"out3"});
......@@ -730,6 +734,7 @@ TEST(Backward, var_no_grad) {
op2->SetOutput("Z", {"z2"});
auto target = f::VarDescBind("z2");
target.SetShape({1});
size_t forward_len = block->AllOps().size();
auto var_to_grad = AppendBackward(program, target, {"z1"});
......@@ -810,6 +815,7 @@ TEST(Backward, shared_var) {
op3->SetOutput("Out", {"out3"});
auto target = f::VarDescBind("out3");
target.SetShape({1});
size_t forward_len = block->AllOps().size();
auto var_to_grad = AppendBackward(program, target, {});
......@@ -888,6 +894,7 @@ TEST(Backward, half_backward) {
op1->SetOutput("Out", {"out"});
auto target = f::VarDescBind("out");
target.SetShape({1});
size_t forward_len = block->AllOps().size();
auto var_to_grad = AppendBackward(program, target, {"b"});
f::OpDescBind *fill_op = block->AllOps()[forward_len];
......
......@@ -29,6 +29,8 @@ inline DataType ToDataType(std::type_index type) {
return DataType::INT32;
} else if (typeid(int64_t).hash_code() == type.hash_code()) {
return DataType::INT64;
} else if (typeid(bool).hash_code() == type.hash_code()) {
return DataType::BOOL;
} else {
PADDLE_THROW("Not supported");
}
......@@ -44,6 +46,8 @@ inline std::type_index ToTypeIndex(DataType type) {
return typeid(int);
case DataType::INT64:
return typeid(int64_t);
case DataType::BOOL:
return typeid(bool);
default:
PADDLE_THROW("Not support type %d", type);
}
......@@ -64,6 +68,9 @@ inline void VisitDataType(DataType type, Visitor visitor) {
case DataType::INT64:
visitor.template operator()<int64_t>();
break;
case DataType::BOOL:
visitor.template operator()<bool>();
break;
default:
PADDLE_THROW("Not supported");
}
......
......@@ -60,8 +60,7 @@ void make_ddim(DDim& ddim, const int64_t* dims, int n) {
ddim = make_dim<9>(dims);
break;
default:
throw std::invalid_argument(
"Dynamic dimensions must have between [1, 9] dimensions.");
PADDLE_THROW("Dynamic dimensions must have between [1, 9] dimensions.");
}
}
......
......@@ -120,6 +120,7 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,
for (auto& op_desc : block.AllOps()) {
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
VLOG(10) << op->DebugString();
op->Run(*local_scope, *device);
}
if (create_local_scope) {
......
......@@ -235,6 +235,23 @@ void OpDescBind::Rename(const std::string &old_name,
need_update_ = true;
}
void OpDescBind::RenameOutput(const std::string &old_name,
const std::string &new_name) {
for (auto &output : outputs_) {
std::replace(output.second.begin(), output.second.end(), old_name,
new_name);
}
need_update_ = true;
}
void OpDescBind::RenameInput(const std::string &old_name,
const std::string &new_name) {
for (auto &input : inputs_) {
std::replace(input.second.begin(), input.second.end(), old_name, new_name);
}
need_update_ = true;
}
struct SetAttrDescVisitor : public boost::static_visitor<void> {
explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {}
mutable OpDesc::Attr *attr_;
......@@ -448,7 +465,12 @@ const std::vector<std::string> &CompileTimeInferShapeContext::Outputs(
DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
auto var = block_.FindVarRecursive(name);
PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
return framework::make_ddim(var->Shape());
try {
return framework::make_ddim(var->Shape());
} catch (...) {
VLOG(5) << "GetDim of variable " << name << " error";
std::rethrow_exception(std::current_exception());
}
}
void CompileTimeInferShapeContext::SetDim(const std::string &name,
......
......@@ -73,6 +73,10 @@ class OpDescBind {
void Rename(const std::string &old_name, const std::string &new_name);
void RenameOutput(const std::string &old_name, const std::string &new_name);
void RenameInput(const std::string &old_name, const std::string &new_name);
// Only be used in C++
const AttributeMap &GetAttrMap() const;
......
......@@ -403,19 +403,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
void OperatorWithKernel::Run(const Scope& scope,
const platform::DeviceContext& dev_ctx) const {
if (VLOG_IS_ON(1)) {
auto inputs = this->InputVars();
auto outputs = this->OutputVars(true);
std::ostringstream sout;
sout << "Run operator " << this->Type() << " From [";
std::ostream_iterator<std::string> out_it(sout, ",");
std::copy(inputs.begin(), inputs.end(), out_it);
sout << "] to [";
std::copy(outputs.begin(), outputs.end(), out_it);
sout << "]";
VLOG(1) << sout.str();
}
RuntimeInferShapeContext infer_shape_ctx(*this, scope);
this->InferShape(&infer_shape_ctx);
......
......@@ -38,11 +38,12 @@ Scope& Scope::NewScope() const {
Variable* Scope::Var(const std::string& name) {
auto iter = vars_.find(name);
if (iter != vars_.end()) {
VLOG(3) << "Get existing variable " << name;
return iter->second;
}
Variable* v = new Variable();
vars_[name] = v;
VLOG(3) << "Create variable " << name << " on scope";
VLOG(3) << "Create variable " << name;
v->name_ = &(vars_.find(name)->first);
return v;
}
......
......@@ -53,6 +53,10 @@ class InferShapeContext {
virtual bool IsRuntime() const = 0;
// Note: In while op, we need this to be public
void SetDims(const std::vector<std::string> &names,
const std::vector<framework::DDim> &dims);
protected:
virtual framework::DDim GetDim(const std::string &name) const = 0;
virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
......@@ -60,9 +64,6 @@ class InferShapeContext {
std::vector<framework::DDim> GetDims(
const std::vector<std::string> &names) const;
void SetDims(const std::vector<std::string> &names,
const std::vector<framework::DDim> &dims);
std::vector<VarDesc::VarType> GetVarTypes(
const std::vector<std::string> &names) const;
......
......@@ -73,7 +73,6 @@ if(MOBILE_INFERENCE)
list(REMOVE_ITEM GSERVER_SOURCES
dataproviders/DataProvider.cpp
dataproviders/MultiDataProvider.cpp
dataproviders/ProtoDataProvider.cpp
dataproviders/PyDataProvider2.cpp
dataproviders/PyDataProvider.cpp)
......
......@@ -212,6 +212,37 @@ Error __must_check backward(Argument& act) {
}
END_DEFINE_ACTIVATION(sequence_softmax)
/*
* @brief SoftSign Activation.
* \f[
* f(z) = \frac{z}{1 + |z|}
* \f]
*/
BEGIN_DEFINE_ACTIVATION(softsign)
private:
MatrixPtr denominator_;
Error __must_check forward(Argument& act) {
size_t height = act.value->getHeight();
size_t width = act.value->getWidth();
Matrix::resizeOrCreate(
denominator_, height, width, false, useGpu(act.deviceId));
denominator_->assign(*act.value);
denominator_->abs2();
denominator_->add(1.);
act.value->dotDiv(*act.value, *denominator_);
return Error();
}
Error __must_check backward(Argument& act) {
denominator_->square2();
denominator_->scalarDiv(*denominator_, 1.);
act.grad->dotMul(*act.grad, *denominator_);
return Error();
}
END_DEFINE_ACTIVATION(softsign)
/**
* @brief Relu Activation.
* forward. y = max(0, z)
......
......@@ -16,8 +16,8 @@ limitations under the License. */
#include <unistd.h>
#include <algorithm>
#include "ProtoDataProvider.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
#include "paddle/utils/StringUtil.h"
#include "paddle/utils/Util.h"
......@@ -164,8 +164,6 @@ DataProvider* DataProvider::create(const DataConfig& config,
REGISTER_DATA_PROVIDER(simple, SimpleDataProvider);
REGISTER_DATA_PROVIDER(dummy, DummyDataProvider);
REGISTER_DATA_PROVIDER(proto, ProtoDataProvider);
REGISTER_DATA_PROVIDER(proto_sequence, ProtoSequenceDataProvider);
int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) {
int64_t batchSize = doubleBuffer_ ? getNextBatchFromBuffer(size, batch)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "DataFormat.pb.h"
#include "paddle/utils/Stat.h"
#include "DataProvider.h"
#include "ProtoReader.h"
namespace paddle {
/**
* @brief Provider data from protobuf data file with each sample
* specified by proto message
*
* DataSample defined in DataFormat.proto.
*
* The file format is
*
* header
*
* sample1
*
* sample2
*
* ...
*
* sampleN
*
* @note: In the data file, each message is prefixed with its length.
* The read/write of the protbuf are implemented in ProtoReader.h
*/
class ProtoDataProvider : public DataProvider {
public:
ProtoDataProvider(const DataConfig& config,
bool useGpu,
bool loadDataAll = true);
virtual void reset();
/**
* @note this size includes the sequences which are skipped because they
* are longer than the batch size.
*/
virtual int64_t getSize() {
int64_t size = sampleNums_;
if (usageRatio_ < 1.0f) {
size = static_cast<int64_t>(size * usageRatio_);
}
return size;
}
virtual void shuffle();
void loadData(const std::vector<std::string>& fileList);
virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
protected:
/**
* @brief load protobuf data from a list of file
* @param[in] fileName file name of a file which contains
* a list of file names
*/
void loadData(const std::string& fileName);
/**
* @brief load protobuf data from file
* @param[in] fileName data file name
*/
void loadDataFile(const std::string& fileName);
/** @brief check data header of each data sample
* @param[in] header data header read from protobuf data
*/
void checkDataHeader(const DataHeader& header);
/**
* @brief fill protobuf data into slot_,
* slot_ is a vector of ProtoSlot in memory.
* @param[in] sample data sample read from protobuf data
*/
void fillSlots(const DataSample& sample);
/**
* @brief return true if each sample is one sequence, i.e., independent
* of other samples.
*/
inline bool iidData() const { return sequenceStartPositions_.empty(); }
/**
* @brief check that sample is consistent with header_
*/
void checkSample(const DataSample& sample);
template <class Op>
int64_t sequenceLoop(Op op, int64_t size);
template <class Op>
int64_t sampleLoop(Op op, int64_t size);
template <class Op>
int64_t subSampleLoop(Op op, int64_t size, int slot);
void showDataStats();
protected:
struct ProtoVarSlot {
std::vector<real> data;
std::vector<int> dims;
};
struct ProtoSlot {
SlotDef::SlotType type;
int dim;
std::vector<int> indexData;
std::vector<real> denseData;
std::vector<sparse_non_value_t> sparseNonValueData;
std::vector<sparse_float_value_t> sparseFloatValueData;
std::vector<int64_t> indices;
std::vector<int64_t> subIndices;
std::vector<ProtoVarSlot> varDenseData;
std::vector<std::vector<int>> varIndices;
std::vector<std::string> strData;
};
DataHeader header_;
int numVecSlots_;
std::vector<ProtoSlot> slots_;
size_t sampleNums_;
/**
* The starting position of each sequence in samples.
* The last element should be num of samples.
* If empty, each sample is one sequence.
*/
std::vector<size_t> sequenceStartPositions_;
int64_t currentSequenceIndex_;
// The size should be the number of sequences.
std::vector<size_t> shuffledSequenceIds_;
ThreadLocalD<DataBatch> cpuBatch_;
ThreadLocalD<DataBatch> gpuBatch_;
RWLock lock_;
std::vector<StatPtr> nnzStats_; // stats for number of none-zeros entries
};
/**
* @brief Special use for Proto data: instances should contain sparse-non-value
* slots
* and label.
*
* @note ProtoSequenceDataProvider treats each SPARSE SLOT as a SEQUENCE
*/
class ProtoSequenceDataProvider : public ProtoDataProvider {
public:
ProtoSequenceDataProvider(const DataConfig& config,
bool useGpu,
bool loadDataAll = true);
~ProtoSequenceDataProvider() {}
virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
};
} // namespace paddle
......@@ -41,6 +41,7 @@ bool BatchNormBaseLayer::init(const LayerMap& layerMap,
useGlobalStats_ = config_.use_global_stats();
}
movingAvgFraction_ = config_.moving_average_fraction();
epsilon_ = config_.epsilon();
weight_.reset(new Weight(1, channels_, parameters_[0]));
movingMean_.reset(new Weight(1, channels_, parameters_[1]));
......
......@@ -94,6 +94,8 @@ protected:
bool useGlobalStats_;
// use to compute moving mean and variance.
real movingAvgFraction_;
// Epsilon is a small random noise used in batch normalization for stability.
real epsilon_;
};
} // namespace paddle
......@@ -22,8 +22,6 @@ namespace paddle {
REGISTER_LAYER(batch_norm, BatchNormalizationLayer);
const real BatchNormalizationLayer::EPS = 1E-5;
bool BatchNormalizationLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
/* Initialize the basic parent class */
......@@ -53,7 +51,7 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) {
calMovingMeanAndVar();
savedInvVar_->subScalar(-EPS);
savedInvVar_->subScalar(-epsilon_);
savedInvVar_->sqrt2(*savedInvVar_);
}
......@@ -74,7 +72,7 @@ void BatchNormalizationLayer::setMeanAndStd() {
savedInvVar_->copyFrom(*(movingVar_->getW()));
savedInvVar_->downClip(real(0.0));
savedInvVar_->subScalar(-EPS);
savedInvVar_->subScalar(-epsilon_);
savedInvVar_->sqrt2(*savedInvVar_);
}
......
......@@ -39,9 +39,6 @@ public:
void backward(const UpdateCallback& callback = nullptr) override;
protected:
/// Epsilon value used in the batch normalization formula.
static const real EPS;
/// Load pre-calculated mean and std.
void setMeanAndStd();
......
......@@ -21,8 +21,6 @@ namespace paddle {
REGISTER_LAYER(cudnn_batch_norm, CudnnBatchNormLayer);
const double CudnnBatchNormLayer::EPS = 1E-5;
bool CudnnBatchNormLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
/* Initialize the basic parent class */
......@@ -61,6 +59,9 @@ void CudnnBatchNormLayer::forward(PassType passType) {
real* movingMean = movingMean_->getW()->getData();
real* movingVar = movingVar_->getW()->getData();
// cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON.
eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast<double>(epsilon_));
if (!useGlobalStats_) {
REGISTER_TIMER_INFO("CudnnBatchFwTimer", getName().c_str());
real* savedMean = savedMean_->getData();
......@@ -75,7 +76,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
1.0 - movingAvgFraction_,
movingMean,
movingVar,
EPS,
eps_,
savedMean,
savedInvVar);
} else {
......@@ -90,7 +91,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
beta,
movingMean,
movingVar,
EPS);
eps_);
} else {
// There is a limitation in cudnn library.
// When the batch size is larger than 1024 in cuDNN v5.1,
......@@ -101,7 +102,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
beta,
movingMean,
movingVar,
EPS,
eps_,
batchSize,
channels_,
imageH_ * imageD_,
......@@ -128,6 +129,9 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) {
real* savedMean = savedMean_->getData();
real* savedInvVar = savedInvVar_->getData();
// cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON.
eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast<double>(epsilon_));
auto create = [](MatrixPtr& m, size_t h, size_t w, real** p) {
Matrix::resizeOrCreate(m, h, w, false, true);
m->zeroMem();
......@@ -157,7 +161,7 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) {
gamma,
gammaGrad,
betaGrad,
EPS,
eps_,
savedMean,
savedInvVar);
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#pragma once
#include <cudnn.h>
#include "BatchNormBaseLayer.h"
#include "Layer.h"
#include "paddle/utils/Stat.h"
......@@ -46,12 +47,9 @@ public:
void backward(const UpdateCallback& callback = nullptr) override;
protected:
/**
* Epsilon value used in the batch normalization formula.
* Minimum allowed value is CUDNN_BN_MIN_EPSILON defined in cudnn.h.
* Same epsilon value should be used in forward and backward functions.
*/
static const double EPS;
/// Epsilon value used in the batch normalization formula.
/// Same epsilon value should be used in forward and backward functions.
double eps_;
/// Input/output tensor descriptor desc
hl_tensor_descriptor ioDesc_;
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "Layer.h"
#include "paddle/math/Matrix.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
namespace paddle {
/**
* @brief A layer for computing the dot product of two vectors.
* Input1: vector (batchSize * dim)
* Input2: vector (batchSize * dim)
* Output: a matrix: (batchSize * 1)
*/
class DotProdLayer : public Layer {
public:
explicit DotProdLayer(const LayerConfig& config) : Layer(config) {}
~DotProdLayer() {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void forward(PassType passType) override;
void backward(const UpdateCallback& callback = nullptr) override;
};
REGISTER_LAYER(dot_prod, DotProdLayer);
bool DotProdLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
Layer::init(layerMap, parameterMap);
CHECK_EQ(inputLayers_.size(), 2U);
CHECK_EQ(1UL, getSize())
<< "The output dimensionality of this layer should be fixed to 1.";
return true;
}
void DotProdLayer::forward(PassType passType) {
Layer::forward(passType);
MatrixPtr inV0 = getInputValue(0);
MatrixPtr inV1 = getInputValue(1);
size_t batchSize = inV0->getHeight();
CHECK_EQ(inV1->getHeight(), batchSize);
CHECK_EQ(inV0->getWidth(), inV1->getWidth());
{
REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
reserveOutput(batchSize, 1);
}
MatrixPtr outV = getOutputValue();
{
REGISTER_TIMER_INFO("FwDotProdTimer", getName().c_str());
outV->sumOfProducts(*inV0, *inV1, 1, 0);
}
}
void DotProdLayer::backward(const UpdateCallback& callback) {
MatrixPtr inV0 = getInputValue(0);
MatrixPtr inV1 = getInputValue(1);
MatrixPtr outG = getOutputGrad();
MatrixPtr inG0 = getInputGrad(0);
MatrixPtr inG1 = getInputGrad(1);
{
REGISTER_TIMER_INFO("BwDotProdTimer", getName().c_str());
if (inG0) {
inG0->addRowScale(0, *inV1, *outG);
}
if (inG1) {
inG1->addRowScale(0, *inV0, *outG);
}
}
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "L2DistanceLayer.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
namespace paddle {
REGISTER_LAYER(l2_distance, L2DistanceLayer);
bool L2DistanceLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
/* Initialize the basic parent class */
Layer::init(layerMap, parameterMap);
CHECK_EQ(inputLayers_.size(), 2UL) << "The L2DistanceLayer accepts two and "
<< "only two inputs.";
CHECK_EQ(getSize(), 1UL) << "The output dimensionality of L2DistanceLayer "
<< "is fixed to be 1.";
return true;
}
void L2DistanceLayer::forward(PassType passType) {
Layer::forward(passType);
const auto inV1 = getInputValue(0);
const auto inV2 = getInputValue(1);
CHECK(inV1 && inV2);
CHECK_EQ(inV1->getHeight(), inV2->getHeight())
<< "The height of two inputs of this layer must be the same.";
CHECK_EQ(inV1->getWidth(), inV2->getWidth())
<< "The width of two inputs of this layer must be the same.";
int batchSize = inV1->getHeight();
int output_dim = getSize();
{
REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str());
reserveOutput(batchSize, output_dim);
auto outV = getOutputValue();
CHECK(outV) << "The output matrix should not be null.";
Matrix::resizeOrCreate(
inputSub_, inV1->getHeight(), inV1->getWidth(), false, useGpu_);
inputSub_->assign(*inV1);
inputSub_->sub(*inV2);
outV->sumOfProducts(*inputSub_, *inputSub_, 1, 0);
outV->sqrt2(*outV);
}
}
void L2DistanceLayer::backward(const UpdateCallback& callback) {
const auto outG = getOutputGrad();
const auto outV = getOutputValue();
CHECK(outG && outV);
auto inGrad1 = getInputGrad(0);
auto inGrad2 = getInputGrad(1);
{
REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str());
if (inGrad1 || inGrad2) {
outV->scalarDiv(*outV, 1.);
outV->dotMul(*outG, *outV);
}
if (inGrad1) inGrad1->addRowScale(0, *inputSub_, *outV);
if (inGrad2) {
inputSub_->mulScalar(-1.);
inGrad2->addRowScale(0, *inputSub_, *outV);
}
}
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "Layer.h"
#include "paddle/math/Matrix.h"
namespace paddle {
/**
* @brief The layer calculates the l2 distance between two input vectors.
* \f[
* f(\bf{x}, \bf{y}) = \sqrt{\sum_{i=1}^D(x_i - y_i)}
* \f]
*
* - Input1: A vector (batchSize * dataDim)
* - Input2: A vector (batchSize * dataDim)
* - Output: A vector (batchSize * 1)
*
* The configuration api is: l2_distance_layer.
*/
class L2DistanceLayer : public Layer {
public:
explicit L2DistanceLayer(const LayerConfig& config) : Layer(config) {}
~L2DistanceLayer() {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void forward(PassType passType) override;
void backward(const UpdateCallback& callback = nullptr) override;
private:
// Store the result of subtracting Input2 from Input1 in forward computation,
// which will be reused in backward computation.
MatrixPtr inputSub_;
};
} // namespace paddle
......@@ -38,12 +38,13 @@ bool MKLDNNAddtoLayer::init(const LayerMap& layerMap,
}
void MKLDNNAddtoLayer::reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
CHECK_EQ(layerSize_, getSize()) << "this layer size can not be changed";
reshapeInput(bs, ih, iw);
ic = inputLayers_[0]->getSize() / ih / iw;
CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
CHECK_EQ(inputElemenCnt_, (size_t)bs * ic * ih * iw);
CHECK_EQ(inputLayers_[0]->getOutputValue()->getElementCnt(),
(size_t)bs * ic * ih * iw);
for (size_t i = 0; i < inputLayers_.size(); i++) {
CHECK_EQ(int64_t(bs), inputLayers_[i]->getOutput().getBatchSize());
CHECK_EQ(layerSize_, inputLayers_[i]->getSize());
......@@ -57,47 +58,43 @@ void MKLDNNAddtoLayer::reshape(
}
void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
resetFwdBuffers(inVals_, bias, out);
in = inVals_[0];
resetFwdBuffers(inputs, biasVal_, out);
std::shared_ptr<sum::primitive_desc> fwdPD;
std::shared_ptr<sum::primitive_desc> biasPD;
resetFwdPD(fwdPD, biasPD, inVals_, bias, out);
resetFwdPD(fwdPD, biasPD, inputs, biasVal_, out);
resetFwdPipeline(pipeline, fwdPD, biasPD, inVals_, bias, out);
resetFwdPipeline(pipeline, fwdPD, biasPD, inputs, biasVal_, out);
}
void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
resetBwdBuffers(inGrads_, bias, out);
in = inGrads_[0];
resetBwdBuffers(inputs, biasGrad_, out);
// backward only need share output grad to input grad
for (size_t i = 0; i < inGrads_.size(); i++) {
if (inGrads_[i] != nullptr) {
inGrads_[i] = out;
inputLayers_[i]->getOutputGrad()->setData(inGrads_[i]->getData());
for (size_t i = 0; i < inputs.size(); i++) {
if (inputs[i] != nullptr) {
inputs[i] = out;
inputLayers_[i]->getOutputGrad()->setData(inputs[i]->getData());
}
}
// backward bias
bwdBias_ = nullptr;
if (bias) {
if (biasGrad_) {
std::vector<float> scales(bs_, 1.0);
std::vector<memory::primitive_desc> srcPDs(bs_, bias->getPrimitiveDesc());
auto biasPD = sum::primitive_desc(bias->getMemoryDesc(), scales, srcPDs);
std::vector<memory::primitive_desc> srcPDs(bs_,
biasGrad_->getPrimitiveDesc());
auto biasPD =
sum::primitive_desc(biasGrad_->getMemoryDesc(), scales, srcPDs);
std::vector<primitive::at> srcs;
for (size_t i = 0; i < grads_.size(); ++i) {
srcs.push_back(*(grads_[i]));
}
bwdBias_.reset(new sum(biasPD, srcs, *bias));
bwdBias_.reset(new sum(biasPD, srcs, *biasGrad_));
pipeline.push_back(*bwdBias_);
}
}
......@@ -208,7 +205,7 @@ void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
inputs.resize(inputLayers_.size());
for (size_t i = 0; i < inputs.size(); i++) {
resetInGrad(inputs[i], inVal_->getPrimitiveDesc(), i);
resetInGrad(inputs[i], inVals_[i]->getPrimitiveDesc(), i);
CHECK_PRIMITIVE_DESC_EQ(inputs[i], out->getPrimitiveDesc());
}
......
......@@ -26,9 +26,6 @@ namespace paddle {
*/
class MKLDNNAddtoLayer : public MKLDNNLayer {
protected:
std::vector<MKLDNNMatrixPtr> inVals_;
std::vector<MKLDNNMatrixPtr> inGrads_;
// layer size == ic * ih * iw == oc * oh *ow, and can not be changed
size_t layerSize_;
......@@ -50,52 +47,19 @@ public:
const ParameterMap& parameterMap) override;
void reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
void resetFwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) override;
void resetBwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) override;
void updateWeights(const UpdateCallback& callback) override;
void printValueFormat() override {
for (size_t i = 0; i < inVals_.size(); ++i) {
VLOG(MKLDNN_FMTS) << i << " input: " << inVals_[i]->getFormat() << " >>>";
}
if (outVal_) {
VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
}
if (extOutVal_) {
VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
}
}
void printGradFormat() override {
if (extOutGrad_) {
VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
}
if (outGrad_) {
VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
}
for (size_t i = 0; i < inGrads_.size(); ++i) {
VLOG(MKLDNN_FMTS) << i << " input: " << inGrads_[i]->getFormat() << "<<<";
}
}
protected:
/**
* Forward functions: reset buffers(inputs, output, bias),
* reset primitive descriptor,
* reset pipeline.
*/
void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
......@@ -110,17 +74,10 @@ protected:
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
/**
* Backward functions: reset buffers(inputs, output, bias)
*/
void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
/**
* prepare for bias
*/
void prepareBias(MKLDNNMatrixPtr& bias,
const MatrixPtr& biasMat,
const MKLDNNMatrixPtr& out,
......
......@@ -21,8 +21,6 @@ namespace paddle {
REGISTER_LAYER(mkldnn_batch_norm, MKLDNNBatchNormLayer);
const real MKLDNNBatchNormLayer::EPS = 1E-5;
bool MKLDNNBatchNormLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
if (!MKLDNNLayer::init(layerMap, parameterMap)) {
......@@ -50,6 +48,8 @@ bool MKLDNNBatchNormLayer::init(const LayerMap& layerMap,
useGlobalStats_ = config_.use_global_stats();
}
movingAvgFraction_ = config_.moving_average_fraction();
epsilon_ = config_.epsilon();
VLOG(MKLDNN_BASE) << "--- " << (useGlobalStats_ ? "use" : "do not use")
<< " --- global stats";
VLOG(MKLDNN_BASE) << "Moving average fraction: " << movingAvgFraction_;
......@@ -116,21 +116,20 @@ void MKLDNNBatchNormLayer::calMovingMeanAndVar() {
}
void MKLDNNBatchNormLayer::reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
reshapeInput(bs, ih, iw);
oh = ih;
ow = iw;
// ic_ and oc can not be changed
CHECK_EQ(inputElemenCnt_ / bs / ih / iw, (size_t)ic)
CHECK_EQ((size_t)ic,
inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
<< "Input channel can not be changed";
reshapeOutput(oh, ow);
resizeOutput(bs, oc * oh * ow);
}
void MKLDNNBatchNormLayer::resetFwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
// In training phase, it will always calculate mean and var,
// so useGlobalStats must be false.
......@@ -140,25 +139,23 @@ void MKLDNNBatchNormLayer::resetFwd(std::vector<primitive>& pipeline,
useGlobalStats_ = false;
}
resetFwdBuffers(in, wgt, out);
resetFwdBuffers(inputs[0], wgtVal_, out);
resetFwdPD(fwdPD_, in, wgt, out);
resetFwdPD(fwdPD_, inputs[0], wgtVal_, out);
resetFwdPipeline(pipeline, fwdPD_, in, wgt, out);
resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, out);
}
void MKLDNNBatchNormLayer::resetBwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
std::shared_ptr<bn_bwd::primitive_desc> pd;
resetBwdBuffers(in, wgt, out);
resetBwdBuffers(inputs[0], wgtGrad_, out);
resetBwdPD(pd, in, wgt, out);
resetBwdPD(pd, inputs[0], wgtGrad_, out);
resetBwdPipeline(pipeline, pd, in, wgt, out);
resetBwdPipeline(pipeline, pd, inputs[0], wgtGrad_, out);
}
void MKLDNNBatchNormLayer::forward(PassType passType) {
......@@ -213,7 +210,7 @@ void MKLDNNBatchNormLayer::resetFwdPD(
if (wgt) {
flags_ = (flags_ | batch_normalization_flag::use_scale_shift);
}
auto fwdDesc = bn_fwd::desc(pk, in->getMemoryDesc(), EPS, flags_);
auto fwdDesc = bn_fwd::desc(pk, in->getMemoryDesc(), epsilon_, flags_);
pd.reset(new bn_fwd::primitive_desc(fwdDesc, engine_));
CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
if (wgt) {
......@@ -260,9 +257,9 @@ void MKLDNNBatchNormLayer::resetFwdPipeline(
void MKLDNNBatchNormLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& out) {
CHECK(inVal_ && outVal_);
CHECK(inVals_[0] && outVal_);
resetOutGrad(out, outVal_->getPrimitiveDesc());
resetInGrad(in, inVal_->getPrimitiveDesc());
resetInGrad(in, inVals_[0]->getPrimitiveDesc());
if (gradScaleShift_) {
CHECK(wgtVal_);
resetWithMatrix(wgt, gradScaleShift_, wgtVal_->getPrimitiveDesc());
......@@ -280,7 +277,7 @@ void MKLDNNBatchNormLayer::resetBwdPD(
}
CHECK_PRIMITIVE_DESC_EQ(out, in->getPrimitiveDesc());
auto md = in->getMemoryDesc();
auto bwdDesc = bn_bwd::desc(prop_kind::backward, md, md, EPS, flags_);
auto bwdDesc = bn_bwd::desc(prop_kind::backward, md, md, epsilon_, flags_);
pd.reset(new bn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
CHECK(pd->weights_primitive_desc() == fwdPD_->weights_primitive_desc());
CHECK_PRIMITIVE_DESC_EQ(wgt, pd->diff_weights_primitive_desc());
......@@ -297,11 +294,12 @@ void MKLDNNBatchNormLayer::resetBwdPipeline(
if (pd == nullptr) {
return;
}
CHECK(inVal_);
CHECK(inVals_[0]);
bwdData_.reset(
wgt && wgtVal_
? new bn_bwd(*pd, *inVal_, *mean_, *var_, *out, *wgtVal_, *in, *wgt)
: new bn_bwd(*pd, *inVal_, *mean_, *var_, *out, *in));
? new bn_bwd(
*pd, *inVals_[0], *mean_, *var_, *out, *wgtVal_, *in, *wgt)
: new bn_bwd(*pd, *inVals_[0], *mean_, *var_, *out, *in));
pipeline.push_back(*bwdData_);
}
......
......@@ -32,7 +32,8 @@ protected:
std::shared_ptr<bn_fwd::primitive_desc> fwdPD_;
// Epsilon value used in the batch normalization formula.
static const real EPS;
real epsilon_;
// weight and bias in paddle
std::unique_ptr<Weight> weight_;
std::unique_ptr<Weight> biases_;
......@@ -73,18 +74,14 @@ public:
void forward(PassType passType) override;
void reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
void resetFwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) override;
void resetBwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) override;
void updateWeights(const UpdateCallback& callback) override;
......@@ -98,11 +95,7 @@ protected:
* moving = moving * AvgFraction + local * (1 - AvgFraction)
*/
void calMovingMeanAndVar();
/**
* Forward functions: reset buffers(input, weight, output),
* reset primitive descriptor,
* reset pipeline.
*/
void resetFwdBuffers(MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& out);
......@@ -115,12 +108,6 @@ protected:
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& out);
/**
* Backward functions: reset buffers(input, weight, output),
* reset primitive descriptor,
* reset pipeline.
*/
void resetBwdBuffers(MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& out);
......
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "MKLDNNConcatLayer.h"
using namespace mkldnn; // NOLINT
typedef memory::format format;
namespace paddle {
REGISTER_LAYER(mkldnn_concat, MKLDNNConcatLayer);
bool MKLDNNConcatLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
if (!MKLDNNLayer::init(layerMap, parameterMap)) {
return false;
}
CHECK_GT(inputLayers_.size(), 1UL);
CHECK(!biasParameter_);
return true;
}
void MKLDNNConcatLayer::reshape(
int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
reshapeInput(bs, ih, iw);
ic = inputLayers_[0]->getSize() / ih / iw;
CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
CHECK_EQ(inputLayers_[0]->getOutputValue()->getElementCnt(),
(size_t)bs * ic * ih * iw);
CHECK_GT(inputLayers_.size(), 1UL);
channels_.resize(inputLayers_.size());
channels_[0] = ic;
oc = ic;
for (size_t i = 1; i < inputLayers_.size(); i++) {
int batchsize, height, witdh;
reshapeInput(batchsize, height, witdh, i);
CHECK_EQ(bs, batchsize);
CHECK_EQ(ih, height);
CHECK_EQ(iw, witdh);
channels_[i] = inputLayers_[i]->getSize() / height / witdh;
CHECK_EQ((size_t)channels_[i] * height * witdh, inputLayers_[i]->getSize());
oc += channels_[i];
}
oh = ih;
ow = iw;
reshapeOutput(oh, ow);
resizeOutput(bs, oc * oh * ow);
}
void MKLDNNConcatLayer::resetFwd(std::vector<primitive>& pipeline,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
resetFwdBuffers(inputs, out);
std::shared_ptr<concat::primitive_desc> fwdPD;
resetFwdPD(fwdPD, inputs, out);
resetFwdPipeline(pipeline, fwdPD, inputs, out);
}
void MKLDNNConcatLayer::resetBwd(std::vector<primitive>& pipeline,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
resetBwdBuffers(inputs, out);
resetBwdPipeline(pipeline, bwds_, inputs, out);
}
void MKLDNNConcatLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
inputs.resize(inputLayers_.size());
bool has8c = false, has16c = false, hasnc = false;
for (size_t i = 0; i < inputs.size(); i++) {
resetInValue(inputs[i], nullptr, i, channels_[i]);
CHECK(inputs[i]);
auto dm = inputs[i]->getDims();
// inputs format can be different, but ndims must equal
CHECK(i == 0 || dm.size() == inputs[0]->getDims().size());
CHECK_EQ(bs_, dm[0]);
CHECK_EQ(channels_[i], dm[1]);
if (dm.size() > 2) {
CHECK_EQ(ih_, dm[2]);
CHECK_EQ(iw_, dm[3]);
}
if (inputs[i]->getFormat() == format::nc) {
hasnc = true;
}
if (inputs[i]->getFormat() == format::nChw8c) {
has8c = true;
}
if (inputs[i]->getFormat() == format::nChw16c) {
has16c = true;
}
}
format outFmt;
if (has16c && oc_ % 16 == 0) {
outFmt = format::nChw16c;
} else if (has8c && oc_ % 8 == 0) {
outFmt = format::nChw8c;
} else if (hasnc) {
CHECK(oh_ == 1 && ow_ == 1);
outFmt = format::nc;
} else {
outFmt = format::nchw;
}
memory::dims outDims =
hasnc ? memory::dims{bs_, oc_} : memory::dims{bs_, oc_, oh_, ow_};
auto outPD = MKLDNNMatrix::createPrimitiveDesc(outDims, outFmt, engine_);
resetOutValue(out, outPD);
}
void MKLDNNConcatLayer::resetFwdPD(std::shared_ptr<concat::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr out) {
std::vector<memory::primitive_desc> srcPDs;
for (size_t i = 0; i < inputs.size(); i++) {
srcPDs.push_back(inputs[i]->getPrimitiveDesc());
}
CHECK(out);
pd.reset(new concat::primitive_desc(out->getMemoryDesc(), axis_, srcPDs));
CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
}
void MKLDNNConcatLayer::resetFwdPipeline(
std::vector<primitive>& pipeline,
std::shared_ptr<concat::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
std::vector<primitive::at> srcs;
for (size_t i = 0; i < inputs.size(); i++) {
srcs.push_back(*(inputs[i]));
}
fwd_.reset(new concat(*pd, srcs, *out));
pipeline.push_back(*fwd_);
}
void MKLDNNConcatLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
CHECK(outVal_);
resetOutGrad(out, outVal_->getPrimitiveDesc());
CHECK(out);
inputs.resize(inputLayers_.size());
for (size_t i = 0; i < inputs.size(); i++) {
CHECK(inVals_[i]);
resetInGrad(inputs[i], inVals_[i]->getPrimitiveDesc(), i);
CHECK_PRIMITIVE_DESC_EQ(inputs[i], inVals_[i]->getPrimitiveDesc());
}
}
void MKLDNNConcatLayer::resetBwdPipeline(
std::vector<mkldnn::primitive>& pipeline,
std::vector<std::shared_ptr<mkldnn::primitive>>& prims,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
// reset the backward primitives
memory::dims offsets = {0, 0, 0, 0};
prims.resize(inputs.size());
CHECK_EQ(inputs.size(), channels_.size());
for (size_t i = 0; i < inputs.size(); i++) {
auto viewPD = view::primitive_desc(
out->getPrimitiveDesc(), inputs[i]->getDims(), offsets);
auto bwdPD = reorder::primitive_desc(viewPD.dst_primitive_desc(),
inputs[i]->getPrimitiveDesc());
prims[i].reset(new reorder(bwdPD, *out, *(inputs[i])));
offsets[axis_] += channels_[i];
// push to pipeline
pipeline.push_back(*prims[i]);
}
}
} // namespace paddle
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "MKLDNNLayer.h"
#include "mkldnn.hpp"
namespace paddle {
/**
* @brief A subclass of MKLDNNLayer Concatenate layer.
*
* The config file api is mkldnn_concat
*/
class MKLDNNConcatLayer : public MKLDNNLayer {
protected:
std::vector<std::shared_ptr<mkldnn::primitive>> bwds_;
// input channel numbers
std::vector<int> channels_;
// concat_dimension in MKLDNN
// if axis_ == 0, concat batchsize
// if axis_ == 1, concat channel (default)
int axis_;
public:
explicit MKLDNNConcatLayer(const LayerConfig& config)
: MKLDNNLayer(config), axis_(1) {}
~MKLDNNConcatLayer() {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void reshape(
int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
void resetFwd(std::vector<mkldnn::primitive>& pipeline,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) override;
void resetBwd(std::vector<mkldnn::primitive>& pipeline,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) override;
void printSizeInfo() override {
CHECK_EQ(channels_.size(), inputLayers_.size());
for (size_t i = 0; i < channels_.size(); ++i) {
VLOG(MKLDNN_SIZES) << "Input " << i << ", " << inputLayers_[i]->getName()
<< ": " << bs_ << ", " << channels_[i] << ", " << ih_
<< ", " << iw_;
}
VLOG(MKLDNN_SIZES) << "Output: " << bs_ << ", " << oc_ << ", " << oh_
<< ", " << ow_;
}
size_t keepCondition() {
// reset when the total element size of all inputs changed
size_t totalSize = inputLayers_[0]->getOutputValue()->getElementCnt();
for (size_t i = 1; i < inputLayers_.size(); ++i) {
totalSize += inputLayers_[i]->getOutputValue()->getElementCnt();
}
return totalSize;
}
protected:
void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
void resetFwdPD(std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr out);
void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
std::vector<std::shared_ptr<mkldnn::primitive>>& prims,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
};
} // namespace paddle
......@@ -90,7 +90,7 @@ void MKLDNNConvLayer::convertWeightsToPaddle() {
}
void MKLDNNConvLayer::reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
reshapeInput(bs, ih, iw);
// cal output sizes
......@@ -105,21 +105,17 @@ void MKLDNNConvLayer::reshape(
}
void MKLDNNConvLayer::resetFwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
resetFwdPD(fwdPD_);
resetFwdBuffers(fwdPD_, in, wgt, bias, out);
resetFwdBuffers(fwdPD_, inputs[0], wgtVal_, biasVal_, out);
resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out);
resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, biasVal_, out);
}
void MKLDNNConvLayer::resetBwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
std::shared_ptr<conv_bwdWgt::primitive_desc> bwdWgtPD;
std::shared_ptr<conv_bwdData::primitive_desc> bwdDataPD;
......@@ -128,9 +124,10 @@ void MKLDNNConvLayer::resetBwd(std::vector<primitive>& pipeline,
resetBwdDataPD(bwdDataPD);
resetBwdBuffers(bwdWgtPD, bwdDataPD, in, wgt, bias, out);
resetBwdBuffers(bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out);
resetBwdPipeline(
pipeline, bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
}
void MKLDNNConvLayer::updateWeights(const UpdateCallback& callback) {
......@@ -236,14 +233,14 @@ void MKLDNNConvLayer::resetBwdWgtPD(
loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
// create backward weight using input, output and weight value memory desc
CHECK(inVal_) << "Should have internal input value";
CHECK(inVals_[0]) << "Should have internal input value";
CHECK(outVal_) << "Should have internal output value";
CHECK(wgtVal_) << "Should have weight value";
algorithm algo = algorithm::convolution_direct;
padding_kind padKind = padding_kind::zero;
auto bwdWgtDesc = biasVal_ != nullptr
? conv_bwdWgt::desc(algo,
inVal_->getMemoryDesc(),
inVals_[0]->getMemoryDesc(),
wgtVal_->getMemoryDesc(),
biasVal_->getMemoryDesc(),
outVal_->getMemoryDesc(),
......@@ -252,7 +249,7 @@ void MKLDNNConvLayer::resetBwdWgtPD(
padR,
padKind)
: conv_bwdWgt::desc(algo,
inVal_->getMemoryDesc(),
inVals_[0]->getMemoryDesc(),
wgtVal_->getMemoryDesc(),
outVal_->getMemoryDesc(),
strides,
......@@ -260,7 +257,7 @@ void MKLDNNConvLayer::resetBwdWgtPD(
padR,
padKind);
pd.reset(new conv_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_));
CHECK_PRIMITIVE_DESC_EQ(inVal_, pd->src_primitive_desc());
CHECK_PRIMITIVE_DESC_EQ(inVals_[0], pd->src_primitive_desc());
CHECK_PRIMITIVE_DESC_EQ(
outVal_,
pd->diff_dst_primitive_desc(),
......@@ -280,12 +277,12 @@ void MKLDNNConvLayer::resetBwdDataPD(
memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);
CHECK(inVal_) << "Should have internal input value";
CHECK(inVals_[0]) << "Should have internal input value";
CHECK(outVal_) << "Should have internal output value";
// create backward data using input and output value memory desc
// but using weight memory desc with any format
auto bwdDataDesc = conv_bwdData::desc(algorithm::convolution_direct,
inVal_->getMemoryDesc(),
inVals_[0]->getMemoryDesc(),
MKLDNNMatrix::createMemoryDesc(wgtDims),
outVal_->getMemoryDesc(),
strides,
......@@ -294,7 +291,7 @@ void MKLDNNConvLayer::resetBwdDataPD(
padding_kind::zero);
pd.reset(new conv_bwdData::primitive_desc(bwdDataDesc, engine_, *fwdPD_));
CHECK_PRIMITIVE_DESC_EQ(
inVal_,
inVals_[0],
pd->diff_src_primitive_desc(),
"primitive desc of in value and grad should be equal");
CHECK_PRIMITIVE_DESC_EQ(
......@@ -346,12 +343,12 @@ void MKLDNNConvLayer::resetBwdPipeline(
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
CHECK(inVal_);
CHECK(inVals_[0]);
// add bwdWgt handle
if (bias) {
bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVal_, *out, *wgt, *bias));
bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt, *bias));
} else {
bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVal_, *out, *wgt));
bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt));
}
pipeline.push_back(*bwdWgt_);
......
......@@ -69,18 +69,14 @@ public:
const ParameterMap& parameterMap) override;
void reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
void resetFwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) override;
void resetBwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) override;
void updateWeights(const UpdateCallback& callback) override;
......@@ -107,48 +103,26 @@ protected:
mkldnn::memory::dims& padL,
mkldnn::memory::dims& padR);
/**
* reset the forward primitive descriptor.
*/
void resetFwdPD(std::shared_ptr<conv_fwd::primitive_desc>& pd);
/**
* reset the MKLDNNMatrix buffers used in forward.
*/
void resetFwdBuffers(std::shared_ptr<conv_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
/**
* reset the forward pipeline.
*/
void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
std::shared_ptr<conv_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
/**
* reset the backward weight primitive descriptor.
*/
void resetBwdWgtPD(std::shared_ptr<conv_bwdWgt::primitive_desc>& pd);
/**
* reset the backward data primitive descriptor.
*/
void resetBwdDataPD(std::shared_ptr<conv_bwdData::primitive_desc>& pd);
/**
* reset the MKLDNNMatrix buffers used in backward.
*/
void resetBwdBuffers(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
/**
* reset the backward pipeline.
*/
void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
......
......@@ -74,7 +74,7 @@ void MKLDNNFcLayer::convertWeightsToPaddle() {
}
void MKLDNNFcLayer::reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
reshapeInput(bs, ih, iw);
CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize());
......@@ -87,32 +87,29 @@ void MKLDNNFcLayer::reshape(
}
void MKLDNNFcLayer::resetFwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
resetFwdBuffers(in, wgt, bias, out);
resetFwdBuffers(inputs[0], wgtVal_, biasVal_, out);
resetFwdPD(fwdPD_, in, wgt, bias, out);
resetFwdPD(fwdPD_, inputs[0], wgtVal_, biasVal_, out);
resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out);
resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, biasVal_, out);
}
void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
std::shared_ptr<fc_bwdWgt::primitive_desc> bwdWgtPD;
std::shared_ptr<fc_bwdData::primitive_desc> bwdDataPD;
resetBwdBuffers(in, wgt, bias, out);
resetBwdBuffers(inputs[0], wgtGrad_, biasGrad_, out);
resetBwdWgtPD(bwdWgtPD, wgt, bias, out);
resetBwdWgtPD(bwdWgtPD, wgtGrad_, biasGrad_, out);
resetBwdDataPD(bwdDataPD, in, out);
resetBwdDataPD(bwdDataPD, inputs[0], out);
resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out);
resetBwdPipeline(
pipeline, bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
}
void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) {
......@@ -193,9 +190,9 @@ void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
CHECK(inVal_ && outVal_);
CHECK(inVals_[0] && outVal_);
resetOutGrad(out, outVal_->getPrimitiveDesc());
resetInGrad(in, inVal_->getPrimitiveDesc());
resetInGrad(in, inVals_[0]->getPrimitiveDesc());
CHECK(wgtVal_);
resetWithMatrix(wgt, weight_->getWGrad(), wgtVal_->getPrimitiveDesc());
......@@ -212,14 +209,15 @@ void MKLDNNFcLayer::resetBwdWgtPD(
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
CHECK(inVal_);
fc_bwdWgt::desc bwdWgtDesc = bias ? fc_bwdWgt::desc(inVal_->getMemoryDesc(),
wgt->getMemoryDesc(),
bias->getMemoryDesc(),
out->getMemoryDesc())
: fc_bwdWgt::desc(inVal_->getMemoryDesc(),
wgt->getMemoryDesc(),
out->getMemoryDesc());
CHECK(inVals_[0]);
fc_bwdWgt::desc bwdWgtDesc =
bias ? fc_bwdWgt::desc(inVals_[0]->getMemoryDesc(),
wgt->getMemoryDesc(),
bias->getMemoryDesc(),
out->getMemoryDesc())
: fc_bwdWgt::desc(inVals_[0]->getMemoryDesc(),
wgt->getMemoryDesc(),
out->getMemoryDesc());
pd.reset(new fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_));
}
......@@ -245,11 +243,11 @@ void MKLDNNFcLayer::resetBwdPipeline(
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) {
CHECK(inVal_);
CHECK(inVals_[0]);
if (bias) {
bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVal_, *out, *wgt, *bias));
bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVals_[0], *out, *wgt, *bias));
} else {
bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVal_, *out, *wgt));
bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVals_[0], *out, *wgt));
}
pipeline.push_back(*bwdWgt_);
......
......@@ -52,18 +52,14 @@ public:
const ParameterMap& parameterMap) override;
void reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
void resetFwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) override;
void resetBwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) override;
void updateWeights(const UpdateCallback& callback) override;
......@@ -73,11 +69,6 @@ public:
void convertWeightsToPaddle() override;
protected:
/**
* Forward functions: reset buffers(input, output, weight and bias),
* reset primitive descriptor,
* reset pipeline.
*/
void resetFwdBuffers(MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
......@@ -93,13 +84,6 @@ protected:
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out);
/**
* Backward functions: reset buffers(input, output, weight and bias),
* reset primitive descriptor for backward weight,
* reset primitive descriptor for backward data,
* reset pipeline.
*/
void resetBwdBuffers(MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
......
......@@ -21,8 +21,8 @@ namespace paddle {
bool MKLDNNLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
<< "Please set WITH_MKLDNN=ON "
CHECK(FLAGS_use_mkldnn) << "MKLDNNLayers only support use_mkldnn."
<< "Please set WITH_MKL=ON "
<< "and set use_mkldnn=True";
CHECK(!useGpu_) << "Do not support GPU yet";
......@@ -48,31 +48,20 @@ void MKLDNNLayer::forward(PassType passType) {
REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
CHECK(!inputLayers_.empty());
copySeqInfoToOutputs();
size_t elemenCnt = inputLayers_[0]->getOutputValue()->getElementCnt();
if (inputElemenCnt_ != elemenCnt) {
if (condition_ != keepCondition()) {
VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
// reset when input total sizes changed, not only the batchsize
inputElemenCnt_ = elemenCnt;
pipelineFwd_.clear();
condition_ = keepCondition();
reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
// all cpu device output grad or value share output's
printSizeInfo();
// the output_.value and output_.grad are shared with CPU device
shareCPUDevice();
resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_);
// MKLDNNLayer output value should be MKLDNNMatrix
// so external output value is necessary.
// Then external input value is not necessary,
// since input may be mkldnn internal buffer.
CHECK(extOutVal_) << "external output value is necessary";
output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
CHECK(inVal_ && outVal_) << "internal memories are necessary";
if (cvtInVal_) {
pipelineFwd_.insert(pipelineFwd_.begin(), *cvtInVal_);
}
if (cvtOutVal_) {
pipelineFwd_.push_back(*cvtOutVal_);
}
pipelineFwd_.clear();
inVals_.resize(inputLayers_.size(), nullptr);
extInVals_.resize(inputLayers_.size(), nullptr);
cvtInVals_.resize(inputLayers_.size(), nullptr);
resetFwd(pipelineFwd_, inVals_, outVal_);
prepareValueConversions(pipelineFwd_);
convertWeightsFromPaddle();
printSizeInfo();
printValueFormat();
needResetBwd_ = true;
}
......@@ -80,8 +69,8 @@ void MKLDNNLayer::forward(PassType passType) {
if (inputLayers_[0]->getType() == "data" && inputLayers_.size() == 1) {
// Update input value data when input layer is "data" type,
// since the input value data address might be changed.
CHECK(extInVal_);
extInVal_->setData(getInputValue(0, CPU_DEVICE)->getData());
CHECK(extInVals_[0]);
extInVals_[0]->setData(getInputValue(0, CPU_DEVICE)->getData());
}
if (!outputOnlyMKLDNN_) {
......@@ -99,22 +88,13 @@ void MKLDNNLayer::backward(const UpdateCallback& callback) {
if (needResetBwd_) {
VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
pipelineBwd_.clear();
inGrads_.resize(inputLayers_.size(), nullptr);
extInGrads_.resize(inputLayers_.size(), nullptr);
cvtInGrads_.resize(inputLayers_.size(), nullptr);
pipelineMergeGrad_.clear();
mergeGrad_ = nullptr;
resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
// external output grad is not necessary
// since output may be mkldnn internal buffer or merge them directly.
CHECK(outGrad_) << "internal output grad is necessary";
if (extOutGrad_) {
CHECK_EQ(extOutGrad_->getData(), output_.grad->getData())
<< "the external buffer should share the same data with output_.grad";
}
if (cvtOutGrad_) {
pipelineBwd_.insert(pipelineBwd_.begin(), *cvtOutGrad_);
}
if (cvtInGrad_) {
pipelineBwd_.push_back(*cvtInGrad_);
}
resetBwd(pipelineBwd_, inGrads_, outGrad_);
prepareGradConversions(pipelineBwd_);
printGradFormat();
needResetBwd_ = false;
}
......@@ -138,8 +118,11 @@ void MKLDNNLayer::backward(const UpdateCallback& callback) {
}
}
void MKLDNNLayer::reshapeInput(int& batchsize, int& height, int& width) {
const Argument& input = inputLayers_[0]->getOutput();
void MKLDNNLayer::reshapeInput(int& batchsize,
int& height,
int& width,
size_t idx) {
const Argument& input = inputLayers_[idx]->getOutput();
batchsize = input.getBatchSize();
int h = input.getFrameHeight();
int w = input.getFrameWidth();
......@@ -173,27 +156,30 @@ void MKLDNNLayer::resetWithMatrix(MKLDNNMatrixPtr& dnn,
void MKLDNNLayer::resetInValue(
MKLDNNMatrixPtr& in,
const std::shared_ptr<memory::primitive_desc>& intPD,
size_t inputIdx) {
cvtInVal_ = nullptr;
extInVal_ = nullptr;
size_t idx,
int inputChannel) {
cvtInVals_[idx] = nullptr;
extInVals_[idx] = nullptr;
in = nullptr;
CHECK_GT(bs_ * ic_ * ih_ * iw_, 0);
inputChannel = inputChannel == 0 ? ic_ : inputChannel;
CHECK_GT(bs_ * inputChannel * ih_ * iw_, 0);
auto extPD = MKLDNNMatrix::createPrimitiveDesc(
{bs_, ic_, ih_, iw_}, format::nchw, engine_);
const MatrixPtr& inMat = inputLayers_[inputIdx]->getOutputValue();
extInVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
CHECK_EQ(inputIsOnlyMKLDNN(), extInVal_ != nullptr);
if (extInVal_ == nullptr || extInVal_->getFormat() == format::nc) {
extInVal_ = MKLDNNMatrix::create(extPD, inMat);
{bs_, inputChannel, ih_, iw_}, format::nchw, engine_);
const MatrixPtr& inMat = inputLayers_[idx]->getOutputValue();
extInVals_[idx] = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
CHECK_EQ(inputIsOnlyMKLDNN(), extInVals_[idx] != nullptr);
if (extInVals_[idx] == nullptr ||
extInVals_[idx]->getFormat() == format::nc) {
extInVals_[idx] = MKLDNNMatrix::create(extPD, inMat);
}
in = extInVal_;
in = extInVals_[idx];
if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) {
return;
}
// need create reorder
in = MKLDNNMatrix::create(*intPD);
cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in);
CHECK(cvtInVal_) << "should not be emptry";
cvtInVals_[idx] = MKLDNNMatrix::createReorder(extInVals_[idx], in);
CHECK(cvtInVals_[idx]) << "should not be emptry";
}
void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out,
......@@ -215,11 +201,11 @@ void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out,
void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
memory::primitive_desc intPD,
size_t inputIdx) {
cvtInGrad_ = nullptr;
extInGrad_ = nullptr;
size_t idx) {
cvtInGrads_[idx] = nullptr;
extInGrads_[idx] = nullptr;
in = nullptr;
LayerPtr& input = inputLayers_[inputIdx];
LayerPtr& input = inputLayers_[idx];
if (input->getOutputGrad() == nullptr) {
// no need input grad
return;
......@@ -234,23 +220,25 @@ void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
in = MKLDNNMatrix::create(intPD, inMat);
Argument& arg = input->getOutput(this->getName());
arg.grad = std::dynamic_pointer_cast<Matrix>(in);
CHECK_PRIMITIVE_DESC_EQ(inVal_, intPD);
CHECK_PRIMITIVE_DESC_EQ(inVals_[idx], intPD);
if (inputIsOnlyMKLDNN()) {
return;
}
extInGrad_ = in;
if (isPaddleFormat(extInGrad_->getFormat())) {
extInGrads_[idx] = in;
if (isPaddleFormat(extInGrads_[idx]->getFormat())) {
return;
}
// need create reorder
CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat()))
CHECK(extInVals_[idx] != nullptr &&
isPaddleFormat(extInVals_[idx]->getFormat()))
<< "should have external input value and the format must be nchw(nc)";
extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat);
CHECK_PRIMITIVE_DESC_EQ(inVal_, intPD);
extInGrads_[idx] =
MKLDNNMatrix::create(extInVals_[idx]->getPrimitiveDesc(), inMat);
CHECK_PRIMITIVE_DESC_EQ(inVals_[idx], intPD);
in = MKLDNNMatrix::create(intPD);
cvtInGrad_ = MKLDNNMatrix::createReorder(in, extInGrad_);
CHECK(cvtInGrad_);
cvtInGrads_[idx] = MKLDNNMatrix::createReorder(in, extInGrads_[idx]);
CHECK(cvtInGrads_[idx]);
}
void MKLDNNLayer::resetOutGrad(MKLDNNMatrixPtr& out,
......@@ -306,22 +294,8 @@ void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) {
srcs.push_back(*src);
}
// TODO(TJ): remove me when mkldnn sum support different formats
for (size_t i = 1; i < srcPDs.size(); ++i) {
CHECK(srcPDs[0] == srcPDs[i]);
}
tmpOutGrad_ = out;
tmpCvt_ = nullptr;
if (out->getPrimitiveDesc() != srcPDs[0]) {
tmpOutGrad_ = MKLDNNMatrix::create(srcPDs[0]);
tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out);
CHECK(tmpCvt_);
pipelineMergeGrad_.push_back(*tmpCvt_);
}
auto sumPD =
sum::primitive_desc(tmpOutGrad_->getMemoryDesc(), scales, srcPDs);
mergeGrad_.reset(new sum(sumPD, srcs, *tmpOutGrad_));
auto sumPD = sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs);
mergeGrad_.reset(new sum(sumPD, srcs, *out));
pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
}
......
......@@ -34,15 +34,16 @@ typedef std::shared_ptr<MKLDNNLayer> MKLDNNLayerPtr;
*/
class MKLDNNLayer : public Layer {
protected:
// input value element count
size_t inputElemenCnt_;
// batch size
int bs_;
// their sizes are always from the first input layer
// input image channel, height and width
int ic_, ih_, iw_;
// output image channel, height and width
int oc_, oh_, ow_;
// the condition that forward need be reset
size_t condition_;
// backward also need reset after reset forward handle
bool needResetBwd_;
......@@ -67,18 +68,18 @@ protected:
* When all layers are mkldnn layers, they could save internal data.
*/
// below MKLDNNMatrix buffers are all internal buffers
MKLDNNMatrixPtr inVal_;
MKLDNNMatrixPtr inGrad_;
std::vector<MKLDNNMatrixPtr> inVals_;
std::vector<MKLDNNMatrixPtr> inGrads_;
MKLDNNMatrixPtr outVal_;
MKLDNNMatrixPtr outGrad_;
// below are external value and grad
MKLDNNMatrixPtr extInVal_;
MKLDNNMatrixPtr extInGrad_;
std::vector<MKLDNNMatrixPtr> extInVals_;
std::vector<MKLDNNMatrixPtr> extInGrads_;
MKLDNNMatrixPtr extOutVal_;
MKLDNNMatrixPtr extOutGrad_;
// convert handle between external and internal buffers
std::shared_ptr<mkldnn::reorder> cvtInVal_;
std::shared_ptr<mkldnn::reorder> cvtInGrad_;
std::vector<std::shared_ptr<mkldnn::reorder>> cvtInVals_;
std::vector<std::shared_ptr<mkldnn::reorder>> cvtInGrads_;
std::shared_ptr<mkldnn::reorder> cvtOutVal_;
std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
......@@ -93,23 +94,11 @@ protected:
std::vector<mkldnn::primitive> pipelineMergeGrad_;
// tmp input argument to save input grad, only used to merge grad
Argument tmpInArg_;
// since mkldnn sum do not support different formats:
// can refer to https://github.com/01org/mkl-dnn/issues/134
// so need create reorder manually and save tmp MKLDNNMatrix
MKLDNNMatrixPtr tmpOutGrad_;
std::shared_ptr<mkldnn::primitive> tmpCvt_;
public:
explicit MKLDNNLayer(const LayerConfig& config)
: Layer(config),
inputElemenCnt_(0),
bs_(0),
ic_(0),
ih_(0),
iw_(0),
oc_(0),
oh_(0),
ow_(0),
condition_(0),
needResetBwd_(true),
outputOnlyMKLDNN_(false),
engine_(mkldnn::engine::cpu, 0),
......@@ -125,31 +114,28 @@ public:
virtual void backward(const UpdateCallback& callback);
/**
* reshape the input image sizes
* and reset output image and buffer size
* output channel can not be changed
* reshape the input and output channels and image sizes
* and reset output buffer size
*/
virtual void reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) = 0;
int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) = 0;
/**
* reset the mkldnn forward primitve and memories
* only would be called when input size changes
* weight and bias buffers should be coverd by child class itself
*/
virtual void resetFwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) = 0;
/**
* reset the mkldnn backward primitve and memories
* only would be called when needed
* weight and bias buffers should be coverd by child class itself
*/
virtual void resetBwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) = 0;
/**
......@@ -175,10 +161,19 @@ public:
void addOutputArgument(int deviceId) { Layer::addOutputArgument(deviceId); }
protected:
/**
* Some layers may have different condition to reset the forward.
* The function returns the condition that do not need reset forward.
*/
inline virtual size_t keepCondition() {
// reset when the first input element size changed, not only the batchsize
return inputLayers_[0]->getOutputValue()->getElementCnt();
}
/**
* reshape the input image sizes and input batchsize
*/
void reshapeInput(int& batchsize, int& height, int& width);
void reshapeInput(int& batchsize, int& height, int& width, size_t idx = 0);
/**
* reshape output image sizes
......@@ -196,11 +191,13 @@ protected:
/**
* reset input value from input MKLDNNMatrix and internal primitive desc.
* reset both internal and external buffer and create reorder if necessary.
* input channel may be different in concat.
*/
void resetInValue(
MKLDNNMatrixPtr& in,
const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr,
size_t inputIdx = 0);
size_t idx = 0,
int inputChannel = 0);
/**
* reset output value from internal primitive desc.
......@@ -215,7 +212,7 @@ protected:
*/
void resetInGrad(MKLDNNMatrixPtr& in,
mkldnn::memory::primitive_desc intPD,
size_t inputIdx = 0);
size_t idx = 0);
/**
* reset output grad from internal primitive desc.
......@@ -293,17 +290,19 @@ protected:
* print the mkldnn memory format of value
*/
virtual void printValueFormat() {
if (extInVal_) {
VLOG(MKLDNN_FMTS) << extInVal_->getFormat() << " >>> ";
}
if (inVal_) {
VLOG(MKLDNN_FMTS) << inVal_->getFormat() << " >>>";
for (size_t i = 0; i < inVals_.size(); ++i) {
if (!inVals_[i]) {
continue;
}
VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
<< ": " << (extInVals_[i] ? extInVals_[i]->getFormat()
: inVals_[i]->getFormat())
<< " >>> " << inVals_[i]->getFormat() << " >>>";
}
if (outVal_) {
VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
}
if (extOutVal_) {
VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> "
<< (extOutVal_ ? extOutVal_->getFormat()
: outVal_->getFormat());
}
if (wgtVal_) {
VLOG(MKLDNN_FMTS) << "Weight value format: " << wgtVal_->getFormat();
......@@ -317,17 +316,19 @@ protected:
* print the mkldnn memory format of grad
*/
virtual void printGradFormat() {
if (extOutGrad_) {
VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
}
if (outGrad_) {
VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< "
<< (extOutGrad_ ? extOutGrad_->getFormat()
: outGrad_->getFormat());
}
if (inGrad_) {
VLOG(MKLDNN_FMTS) << inGrad_->getFormat() << " <<<";
}
if (extInGrad_) {
VLOG(MKLDNN_FMTS) << extInGrad_->getFormat() << " <<< ";
for (size_t i = 0; i < inGrads_.size(); ++i) {
if (!inGrads_[i]) {
continue;
}
VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
<< ": " << (extInGrads_[i] ? extInGrads_[i]->getFormat()
: inGrads_[i]->getFormat())
<< " <<< " << inGrads_[i]->getFormat() << " <<<";
}
if (wgtGrad_) {
VLOG(MKLDNN_FMTS) << "Weight grad format: " << wgtGrad_->getFormat();
......@@ -434,6 +435,41 @@ private:
outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
}
}
void prepareValueConversions(std::vector<mkldnn::primitive>& pipeline) {
// MKLDNNLayer output value should be MKLDNNMatrix
// so external output value is necessary.
// Then external input value is not necessary,
// since input may be mkldnn internal buffer.
CHECK(extOutVal_) << "external output value is necessary";
output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
CHECK(inVals_[0] && outVal_) << "internal memories are necessary";
for (size_t i = 0; i < cvtInVals_.size(); ++i) {
if (cvtInVals_[i]) {
pipeline.insert(pipeline.begin(), *cvtInVals_[i]);
}
}
if (cvtOutVal_) {
pipeline.push_back(*cvtOutVal_);
}
}
void prepareGradConversions(std::vector<mkldnn::primitive>& pipeline) {
// external output grad is not necessary
// since output may be mkldnn internal buffer or merge them directly.
CHECK(outGrad_) << "internal output grad is necessary";
if (extOutGrad_) {
CHECK_EQ(extOutGrad_->getData(), output_.grad->getData())
<< "the external buffer should share the same data with output_.grad";
}
if (cvtOutGrad_) {
pipeline.insert(pipeline.begin(), *cvtOutGrad_);
}
for (size_t i = 0; i < cvtInGrads_.size(); ++i) {
if (cvtInGrads_[i]) {
pipeline.push_back(*cvtInGrads_[i]);
}
}
}
};
} // namespace paddle
......@@ -58,10 +58,11 @@ bool MKLDNNPoolLayer::init(const LayerMap& layerMap,
}
void MKLDNNPoolLayer::reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
reshapeInput(bs, ih, iw);
// ic_ and oc can not be changed
CHECK_EQ(inputElemenCnt_ / bs / ih / iw, (size_t)ic)
CHECK_EQ((size_t)ic,
inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
<< "Input channel can not be changed";
// cal output sizes
......@@ -74,29 +75,25 @@ void MKLDNNPoolLayer::reshape(
}
void MKLDNNPoolLayer::resetFwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
resetFwdBuffers(in, out);
resetFwdBuffers(inputs[0], out);
resetFwdPD(fwdPD_, in, out);
resetFwdPD(fwdPD_, inputs[0], out);
resetFwdPipeline(pipeline, fwdPD_, in, out);
resetFwdPipeline(pipeline, fwdPD_, inputs[0], out);
}
void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
std::shared_ptr<pool_bwd::primitive_desc> pd;
resetBwdBuffers(in, out);
resetBwdBuffers(inputs[0], out);
resetBwdPD(pd, in, out);
resetBwdPD(pd, inputs[0], out);
resetBwdPipeline(pipeline, pd, in, out);
resetBwdPipeline(pipeline, pd, inputs[0], out);
}
void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
......@@ -151,9 +148,9 @@ void MKLDNNPoolLayer::resetFwdPipeline(
void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out) {
CHECK(inVal_ && outVal_);
CHECK(inVals_[0] && outVal_);
resetOutGrad(out, outVal_->getPrimitiveDesc());
resetInGrad(in, inVal_->getPrimitiveDesc());
resetInGrad(in, inVals_[0]->getPrimitiveDesc());
}
void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
......
......@@ -53,18 +53,14 @@ public:
const ParameterMap& parameterMap) override;
void reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
void resetFwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) override;
void resetBwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) override;
void printSizeInfo() override {
......@@ -75,11 +71,6 @@ public:
}
protected:
/**
* Forward functions: reset buffers(input, output),
* reset primitive descriptor,
* reset pipeline.
*/
void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
void resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr in,
......@@ -88,12 +79,6 @@ protected:
std::shared_ptr<pool_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out);
/**
* Backward functions: reset buffers(input, output),
* reset primitive descriptor,
* reset pipeline.
*/
void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
void resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in,
......
......@@ -29,7 +29,7 @@ gserver_test(test_KmaxSeqScore)
gserver_test(test_Expand)
gserver_test(test_MaxPoolingWithMaskOutput)
########## test_Mkldnn layers and activations ##########
########## test_MKLDNN layers and activations ##########
if(WITH_MKLDNN)
add_unittest_without_exec(test_MKLDNN
test_MKLDNN.cpp
......@@ -62,17 +62,6 @@ if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
endif()
if(NOT MOBILE_INFERENCE)
################### test_ProtoDataProvider ############
add_unittest_without_exec(test_ProtoDataProvider
test_ProtoDataProvider.cpp)
# test_ProtoDataProvider will mkdir as same name,
# so if WORKING_DIRECTORY is default directory, then
# mkdir will get error.
add_test(NAME test_ProtoDataProvider
COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoDataProvider
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
################## test_Evaluator #######################
add_unittest(test_Evaluator
test_Evaluator.cpp)
......@@ -110,3 +99,24 @@ add_test(NAME test_PyDataProvider2
COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
)
################# test_CompareSparse ##################
add_unittest_without_exec(test_CompareSparse
test_CompareSparse.cpp)
if(NOT ON_TRAVIS)
add_test(NAME test_CompareSparse
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
./.set_port.sh -p port -n 6
${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
endif()
################ test_CompareTwoNets ######################
add_unittest_without_exec(test_CompareTwoNets
test_CompareTwoNets.cpp)
add_test(NAME test_CompareTwoNets
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
......@@ -23,7 +23,7 @@ limitations under the License. */
namespace paddle {
/**
* @brief test the functionality of Mkldnnlayers
* @brief test the functionality of MKLDNNlayers and MKLDNNActivations
* refer to paddle original function
*/
class MKLDNNTester {
......
./test_ProtoDataProvider/data1.bin
./test_ProtoDataProvider/data2.bin
./test_ProtoDataProvider/data1.bin.gz
./test_ProtoDataProvider/data2.bin.gz
#!/usr/bin/env python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
......@@ -14,27 +15,50 @@
from paddle.trainer_config_helpers import *
################################### Data Configuration ###################################
TrainData(ProtoData(files = "trainer/tests/mnist.list"))
################################### Algorithm Configuration ###################################
settings(batch_size = 1000,
learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
################################### Network Configuration ###################################
data = data_layer(name ="input", size=784)
######################## data source ################################
dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
dict_file = dict()
for line_count, line in enumerate(open(dict_path, "r")):
dict_file[line.strip()] = line_count
fc1 = fc_layer(input=data, size=800,
bias_attr=True,
act=SigmoidActivation())
define_py_data_sources2(
train_list='gserver/tests/Sequence/train.list',
test_list=None,
module='sequenceGen',
obj='process',
args={"dict_file": dict_file})
fc2 = fc_layer(input=fc1, size=800,
bias_attr=True,
act=SigmoidActivation())
settings(batch_size=5)
######################## network configure ################################
dict_dim = len(open(dict_path, 'r').readlines())
word_dim = 128
hidden_dim = 256
label_dim = 3
sparse_update = get_config_arg("sparse_update", bool, False)
output = fc_layer(input=[fc1, fc2], size=10,
bias_attr=True,
act=SoftmaxActivation())
data = data_layer(name="word", size=dict_dim)
lbl = data_layer(name ="label", size=1)
emb = embedding_layer(
input=data,
size=word_dim,
param_attr=ParamAttr(sparse_update=sparse_update))
cost = classification_cost(input=output, label=lbl)
outputs(cost)
with mixed_layer(size=hidden_dim * 4) as lstm_input:
lstm_input += full_matrix_projection(input=emb)
lstm = lstmemory(
input=lstm_input,
act=TanhActivation(),
gate_act=SigmoidActivation(),
state_act=TanhActivation())
lstm_last = last_seq(input=lstm)
with mixed_layer(
size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
output += full_matrix_projection(input=lstm_last)
outputs(
classification_cost(
input=output, label=data_layer(
name="label", size=1)))
#!/usr/bin/env python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
......@@ -14,27 +15,42 @@
from paddle.trainer_config_helpers import *
################################### Data Configuration ###################################
TrainData(ProtoData(files = "trainer/tests/mnist.list"))
################################### Algorithm Configuration ###################################
settings(batch_size = 1000,
learning_method = MomentumOptimizer(momentum=0.5, sparse=False))
################################### Network Configuration ###################################
data = data_layer(name ="input", size=784)
######################## data source ################################
dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
dict_file = dict()
for line_count, line in enumerate(open(dict_path, "r")):
dict_file[line.strip()] = line_count
fc1 = fc_layer(input=data, size=800,
bias_attr=True,
act=SigmoidActivation())
define_py_data_sources2(
train_list='gserver/tests/Sequence/train.list',
test_list=None,
module='sequenceGen',
obj='process',
args={"dict_file": dict_file})
fc2 = fc_layer(input=fc1, size=800,
bias_attr=True,
act=SigmoidActivation())
settings(batch_size=5)
######################## network configure ################################
dict_dim = len(open(dict_path, 'r').readlines())
word_dim = 128
hidden_dim = 128
label_dim = 3
output = fc_layer(input=[fc1, fc2], size=10,
bias_attr=True,
act=SoftmaxActivation())
# This config is designed to be equivalent with sequence_recurrent_group.py
lbl = data_layer(name ="label", size=1)
data = data_layer(name="word", size=dict_dim)
cost = classification_cost(input=output, label=lbl)
outputs(cost)
emb = embedding_layer(
input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
recurrent = recurrent_layer(input=emb, bias_attr=False, act=SoftmaxActivation())
recurrent_last = last_seq(input=recurrent)
with mixed_layer(
size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
output += full_matrix_projection(input=recurrent_last)
outputs(
classification_cost(
input=output, label=data_layer(
name="label", size=1)))
#!/usr/bin/env python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
######################## data source ################################
dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
dict_file = dict()
for line_count, line in enumerate(open(dict_path, "r")):
dict_file[line.strip()] = line_count
define_py_data_sources2(
train_list='gserver/tests/Sequence/train.list',
test_list=None,
module='sequenceGen',
obj='process',
args={"dict_file": dict_file})
settings(batch_size=5)
######################## network configure ################################
dict_dim = len(open(dict_path, 'r').readlines())
word_dim = 128
hidden_dim = 128
label_dim = 3
# This config is designed to be equivalent with sequence_recurrent.py
data = data_layer(name="word", size=dict_dim)
emb = embedding_layer(
input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
def step(y):
mem = memory(name="rnn_state", size=hidden_dim)
with mixed_layer(
name="rnn_state",
size=hidden_dim,
bias_attr=False,
act=SoftmaxActivation()) as out:
out += identity_projection(input=y)
out += full_matrix_projection(
input=mem, param_attr=ParamAttr(name="___recurrent_layer_0__"))
return out
recurrent = recurrent_group(name="rnn", step=step, input=emb)
recurrent_last = last_seq(input=recurrent)
with mixed_layer(
size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
output += full_matrix_projection(input=recurrent_last)
outputs(
classification_cost(
input=output, label=data_layer(
name="label", size=1)))
......@@ -22,8 +22,7 @@ limitations under the License. */
using namespace paddle; // NOLINT
using namespace std; // NOLINT
static const string& configFile1 =
"trainer/tests/sample_trainer_config_compare_sparse.conf";
static const string& configFile1 = "gserver/tests/sequence_lstm.conf";
DECLARE_bool(use_gpu);
DECLARE_string(config);
......
......@@ -30,8 +30,6 @@ DECLARE_bool(use_gpu);
DECLARE_string(config);
DECLARE_string(nics);
DEFINE_string(config_file_a, "", "config of one network to compare");
DEFINE_string(config_file_b, "", "config of another network to compare");
DEFINE_bool(need_high_accuracy,
false,
"whether need to run in double accuracy");
......@@ -42,6 +40,10 @@ DEFINE_double(
DECLARE_bool(thread_local_rand_use_global_seed);
DECLARE_int32(seed);
static const string& config_file_a = "gserver/tests/sequence_recurrent.py";
static const string& config_file_b =
"gserver/tests/sequence_recurrent_group.py";
struct ComData {
vector<Argument> outArgs;
vector<ParameterPtr> parameters;
......@@ -66,6 +68,7 @@ void calcGradient(ComData& data, const string configFile) {
DataBatch dataBatch;
int32_t batchSize = trainer.getConfig().opt_config().batch_size();
trainer.getDataProvider()->reset();
trainer.getDataProvider()->setSkipShuffle();
trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch);
......@@ -167,11 +170,11 @@ void compareGradient(ComData& comDataA, ComData& comDataB) {
TEST(Trainer, create) {
ComData dataA;
calcGradient(dataA, FLAGS_config_file_a);
calcGradient(dataA, config_file_a);
LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n";
ComData dataB;
calcGradient(dataB, FLAGS_config_file_b);
calcGradient(dataB, config_file_b);
LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n";
compareGradient(dataA, dataB);
......
......@@ -583,6 +583,7 @@ TEST(Layer, maxoutLayer) {
testLayerGrad(config, "maxout", 10, false, useGpu);
}
}
void testFcLayer(string format, size_t nnz) {
TestConfig config;
config.biasSize = 1024;
......@@ -1081,6 +1082,21 @@ TEST(Layer, InterpolationLayer) {
}
}
TEST(Layer, DotProdLayer) {
TestConfig config;
config.layerConfig.set_type("dot_prod");
config.layerConfig.set_size(1);
config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
config.layerConfig.add_inputs();
config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
config.layerConfig.add_inputs();
for (auto useGpu : {false, true}) {
testLayerGrad(config, "dot_prod", 10, false, useGpu);
}
}
TEST(Layer, OuterProdLayer) {
TestConfig config;
config.layerConfig.set_type("out_prod");
......@@ -2429,6 +2445,25 @@ TEST(Layer, ScaleSubRegionLayer) {
}
}
TEST(Layer, L2DistanceLayer) {
TestConfig config;
config.layerConfig.set_type("l2_distance");
config.layerConfig.set_size(1);
config.biasSize = 0;
const size_t input_dim = 27;
const size_t batch_size = 11;
config.inputDefs.push_back({INPUT_DATA, "layer_0", input_dim, 0});
config.inputDefs.push_back({INPUT_DATA, "layer_1", input_dim, 0});
config.layerConfig.add_inputs();
config.layerConfig.add_inputs();
for (auto useGpu : {false, true}) {
testLayerGrad(config, "l2_distance", batch_size, false, useGpu);
}
}
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
......
......@@ -313,6 +313,47 @@ TEST(MKLDNNLayer, AddtoLayer) {
testAddtoLayer({4, 12, 1, 1}, 3);
}
static void getMKLDNNConcatConfig(TestConfig& cfg,
const std::vector<testImageDesc>& inputs) {
CHECK_GE(inputs.size(), 2UL) << "at least two inputs";
int oc = inputs[0].ic;
for (size_t i = 1; i < inputs.size(); ++i) {
CHECK_EQ(inputs[i].bs, inputs[0].bs);
CHECK_EQ(inputs[i].ih, inputs[0].ih);
CHECK_EQ(inputs[i].iw, inputs[0].iw);
oc += inputs[i].ic;
}
cfg.biasSize = 0;
cfg.layerConfig.set_type("mkldnn_concat");
cfg.layerConfig.set_size(oc * inputs[0].ih * inputs[0].iw);
cfg.layerConfig.set_active_type("relu");
for (size_t i = 0; i < inputs.size(); ++i) {
std::stringstream ss;
ss << "layer_" << i;
cfg.inputDefs.push_back(
{INPUT_DATA,
ss.str(),
(size_t)(inputs[i].ic) * inputs[i].ih * inputs[i].iw,
0});
LayerInputConfig* input = cfg.layerConfig.add_inputs();
ImageConfig* img_conf = input->mutable_image_conf();
img_conf->set_channels(inputs[i].ic);
img_conf->set_img_size_y(inputs[i].ih);
img_conf->set_img_size(inputs[i].iw);
}
}
void testConcatLayer(const std::vector<testImageDesc>& inputs) {
TestConfig dnnConfig;
getMKLDNNConcatConfig(dnnConfig, inputs);
RUN_MKLDNN_TEST_LAYER(dnnConfig, "concat", inputs[0])
}
TEST(MKLDNNLayer, ConcatLayer) {
testConcatLayer({{64, 128, 1, 1}, {64, 32, 1, 1}, {64, 64, 1, 1}});
testConcatLayer({{32, 100, 8, 8}, {32, 10, 8, 8}});
}
void testActivation(std::string actType, const testImageDesc& pm) {
// TODO(TJ): remove me when paddle support elu activation
if (actType == "mkldnn_elu") {
......
......@@ -17,9 +17,13 @@ limitations under the License. */
#include "paddle/utils/StringUtil.h"
#include "paddle/utils/Util.h"
#ifndef PADDLE_MOBILE_INFERENCE
DEFINE_int32(pool_limit_size,
536870912,
"maximum memory size managed by a memory pool, default is 512M");
#else
DEFINE_int32(pool_limit_size, 0, "default is 0");
#endif
namespace paddle {
......
......@@ -61,6 +61,18 @@ function(op_library TARGET)
set(pybind_flag 1)
endif()
if ("${TARGET}" STREQUAL "compare_op")
set(pybind_flag 1)
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n")
endif()
# conv_op contains several operators
if ("${TARGET}" STREQUAL "conv_op")
set(pybind_flag 1)
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
endif()
# pool_op contains several operators
if ("${TARGET}" STREQUAL "pool_op")
set(pybind_flag 1)
......@@ -68,23 +80,23 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP(pool2d);\n")
endif()
if ("${TARGET}" STREQUAL "compare_op")
# pool_cudnn_op contains several operators
if ("${TARGET}" STREQUAL "pool_cudnn_op")
set(pybind_flag 1)
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n")
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n")
endif()
# pool_with_index_op contains several operators
if ("${TARGET}" STREQUAL "pool_with_index_op")
if ("${TARGET}" STREQUAL "logical_op")
set(pybind_flag 1)
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
file(APPEND ${pybind_file} "USE_OP(logical_and);\n")
endif()
# conv_op contains several operators
if ("${TARGET}" STREQUAL "conv_op")
# pool_with_index_op contains several operators
if ("${TARGET}" STREQUAL "pool_with_index_op")
set(pybind_flag 1)
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
endif()
# conv_transpose_op contains several operators
......@@ -93,12 +105,12 @@ function(op_library TARGET)
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(conv2d_transpose);\n")
endif()
# pool_cudnn_op contains several operators
if ("${TARGET}" STREQUAL "pool_cudnn_op")
# conv_transpose_cudnn_op contains two operators
if ("${TARGET}" STREQUAL "conv_transpose_cudnn_op")
set(pybind_flag 1)
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n")
file(APPEND ${pybind_file} "USE_OP(conv2d_transpose_cudnn);\n")
endif()
# save_restore_op contains several operators
......@@ -172,6 +184,7 @@ set(DEPS_OPS
sequence_softmax_op
sum_op
pool_op
maxout_op
pool_with_index_op
conv_op
conv_transpose_op
......@@ -198,6 +211,7 @@ op_library(sgd_op DEPS selected_rows_functor)
op_library(adagrad_op DEPS selected_rows_functor)
op_library(conv_op DEPS vol2col)
op_library(pool_op DEPS pooling)
op_library(maxout_op DEPS maxouting)
op_library(pool_with_index_op DEPS pooling)
op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op)
......
此差异已折叠。
......@@ -109,4 +109,5 @@ paramOut = param + paramUpdate$$
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker);
REGISTER_OP_CPU_KERNEL(
adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUPlace, float>);
adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUPlace, float>,
ops::AdadeltaOpKernel<paddle::platform::CPUPlace, double>);
......@@ -17,4 +17,5 @@
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(
adadelta, ops::AdadeltaOpKernel<paddle::platform::GPUPlace, float>);
adadelta, ops::AdadeltaOpKernel<paddle::platform::GPUPlace, float>,
ops::AdadeltaOpKernel<paddle::platform::GPUPlace, double>);
......@@ -33,8 +33,8 @@ class AdadeltaOpKernel : public framework::OpKernel<T> {
avg_squared_grad_out_tensor->mutable_data<T>(ctx.GetPlace());
avg_squared_update_out_tensor->mutable_data<T>(ctx.GetPlace());
float rho = ctx.Attr<float>("rho");
float epsilon = ctx.Attr<float>("epsilon");
T rho = static_cast<T>(ctx.Attr<float>("rho"));
T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
auto param = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("Param"));
......
......@@ -14,8 +14,8 @@
#define EIGEN_USE_GPU
#include "paddle/operators/adagrad_op.h"
#include "paddle/operators/math/selected_rows_functor.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/selected_rows_functor.h"
#include "paddle/platform/cuda_helper.h"
namespace paddle {
......@@ -134,8 +134,8 @@ struct SparseAdagradFunctor<platform::GPUPlace, T> {
T, 256><<<grid2, threads, 0,
reinterpret_cast<const platform::CUDADeviceContext&>(context)
.stream()>>>(grad_merge_data, grad_merge->rows().data(),
lr, param_data,
moment_data, grad_width, epsilon);
lr, param_data, moment_data, grad_width,
epsilon);
}
};
......
......@@ -127,4 +127,5 @@ paramOut = param - learningRate * moment_1/ ($\sqrt{(moment_2)} + \epsilon)$$
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(adam, ops::AdamOp, ops::AdamOpMaker);
REGISTER_OP_CPU_KERNEL(adam,
ops::AdamOpKernel<paddle::platform::CPUPlace, float>);
ops::AdamOpKernel<paddle::platform::CPUPlace, float>,
ops::AdamOpKernel<paddle::platform::CPUPlace, double>);
......@@ -17,4 +17,5 @@
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(adam,
ops::AdamOpKernel<paddle::platform::GPUPlace, float>);
ops::AdamOpKernel<paddle::platform::GPUPlace, float>,
ops::AdamOpKernel<paddle::platform::GPUPlace, double>);
......@@ -31,9 +31,9 @@ class AdamOpKernel : public framework::OpKernel<T> {
moment1_out_tensor->mutable_data<T>(ctx.GetPlace());
moment2_out_tensor->mutable_data<T>(ctx.GetPlace());
float beta1 = ctx.Attr<float>("beta1");
float beta2 = ctx.Attr<float>("beta2");
float epsilon = ctx.Attr<float>("epsilon");
T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
auto param = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("Param"));
......
......@@ -126,4 +126,5 @@ division by 0 error.
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(adamax, ops::AdamaxOp, ops::AdamaxOpMaker);
REGISTER_OP_CPU_KERNEL(adamax,
ops::AdamaxOpKernel<paddle::platform::CPUPlace, float>);
ops::AdamaxOpKernel<paddle::platform::CPUPlace, float>,
ops::AdamaxOpKernel<paddle::platform::CPUPlace, double>);
......@@ -17,4 +17,5 @@
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(adamax,
ops::AdamaxOpKernel<paddle::platform::GPUPlace, float>);
ops::AdamaxOpKernel<paddle::platform::GPUPlace, float>,
ops::AdamaxOpKernel<paddle::platform::GPUPlace, double>);
......@@ -31,9 +31,9 @@ class AdamaxOpKernel : public framework::OpKernel<T> {
moment_out_tensor->mutable_data<T>(ctx.GetPlace());
inf_norm_out_tensor->mutable_data<T>(ctx.GetPlace());
float beta1 = ctx.Attr<float>("beta1");
float beta2 = ctx.Attr<float>("beta2");
float epsilon = ctx.Attr<float>("epsilon");
T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
auto param = framework::EigenVector<T>::Flatten(
*ctx.Input<framework::Tensor>("Param"));
......
......@@ -42,6 +42,7 @@ class ArrayOp : public framework::OperatorBase {
} else {
offset = static_cast<size_t>(*i_tensor.data<int64_t>());
}
VLOG(10) << " Offset = " << offset;
return offset;
}
};
......
......@@ -139,7 +139,7 @@ bool BeamSearch::NextItemSet(std::vector<BeamSearch::Item> *items) {
items->reserve(framework::product(ids.dims()));
for (size_t offset = abs_lod[lod_level_][sent_offset_];
offset < abs_lod[lod_level_][sent_offset_ + 1]; offset++) {
for (int d = 0; d < instance_dim; d++) {
for (size_t d = 0; d < instance_dim; d++) {
const size_t dim_offset = offset * instance_dim + d;
items->emplace_back(offset, ids_data[dim_offset],
scores_data[dim_offset]);
......
......@@ -63,7 +63,6 @@ class BilinearTensorProductKernel : public framework::OpKernel<T> {
math::gemm<Place, T>(ctx.device_context(), CblasNoTrans, CblasNoTrans,
batch_size, y_dim, x_dim, 1, x->data<T>(),
weight_mat.data<T>(), 0, left_mul.data<T>());
Eigen::array<int, 2> shape({{static_cast<int>(out->dims()[0]), 1}});
output_col_vec.device(place) =
(left_mul_mat * y_mat).sum(Eigen::DSizes<int, 1>(1));
}
......
......@@ -40,7 +40,8 @@ REGISTER_OP(conv_cudnn, ops::ConvOp, ops::CudnnConvOpMaker, conv_cudnn_grad,
ops::ConvOpGrad);
REGISTER_OP_CPU_KERNEL(conv_cudnn,
ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(
conv_cudnn_grad,
ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
conv_cudnn_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
......@@ -226,9 +226,8 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
T alpha = 1.0f, beta = 0.0f;
if (input_grad) {
T* input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*input_grad);
t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
t.constant(static_cast<T>(0));
// Because beta is zero, it is unnecessary to reset input_grad.
for (int i = 0; i < groups; i++) {
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardData(
handle, &alpha, cudnn_filter_desc,
......@@ -241,9 +240,8 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
// ------------------- cudnn conv backward filter ---------------------
if (filter_grad) {
T* filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
auto t = framework::EigenVector<T>::Flatten(*filter_grad);
t.device(ctx.GetEigenDevice<platform::GPUPlace>()) =
t.constant(static_cast<T>(0));
// Because beta is zero, it is unnecessary to reset filter_grad.
for (int i = 0; i < groups; i++) {
PADDLE_ENFORCE(platform::dynload::cudnnConvolutionBackwardFilter(
handle, &alpha, cudnn_input_desc, input_data + i * group_offset_in,
......@@ -261,6 +259,8 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
} // namespace operators
} // namespace paddle
REGISTER_OP_GPU_KERNEL(conv_cudnn, paddle::operators::CudnnConvOpKernel<float>);
REGISTER_OP_GPU_KERNEL(conv_cudnn, paddle::operators::CudnnConvOpKernel<float>,
paddle::operators::CudnnConvOpKernel<double>);
REGISTER_OP_GPU_KERNEL(conv_cudnn_grad,
paddle::operators::CudnnConvGradOpKernel<float>);
paddle::operators::CudnnConvGradOpKernel<float>,
paddle::operators::CudnnConvGradOpKernel<double>);
......@@ -225,11 +225,15 @@ REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
ops::ConvOpGrad);
REGISTER_OP_CPU_KERNEL(conv2d,
ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(
conv2d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
conv2d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(conv3d,
ops::GemmConvKernel<paddle::platform::CPUPlace, float>);
ops::GemmConvKernel<paddle::platform::CPUPlace, float>,
ops::GemmConvKernel<paddle::platform::CPUPlace, double>);
REGISTER_OP_CPU_KERNEL(
conv3d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>);
conv3d_grad, ops::GemmConvGradKernel<paddle::platform::CPUPlace, float>,
ops::GemmConvGradKernel<paddle::platform::CPUPlace, double>);
......@@ -17,11 +17,15 @@
namespace ops = paddle::operators;
REGISTER_OP_GPU_KERNEL(conv2d,
ops::GemmConvKernel<paddle::platform::GPUPlace, float>);
ops::GemmConvKernel<paddle::platform::GPUPlace, float>,
ops::GemmConvKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL(
conv2d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>);
conv2d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>,
ops::GemmConvGradKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL(conv3d,
ops::GemmConvKernel<paddle::platform::GPUPlace, float>);
ops::GemmConvKernel<paddle::platform::GPUPlace, float>,
ops::GemmConvKernel<paddle::platform::GPUPlace, double>);
REGISTER_OP_GPU_KERNEL(
conv3d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>);
conv3d_grad, ops::GemmConvGradKernel<paddle::platform::GPUPlace, float>,
ops::GemmConvGradKernel<paddle::platform::GPUPlace, double>);
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册