Commit 989ee8b9 authored by Kavya Srinet

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into design_doc_edit

@@ -28,4 +28,3 @@ cmake_install.cmake
 paddle/.timestamp
 python/paddlepaddle.egg-info/
 paddle/pybind/pybind.h
-python/paddle/v2/framework/tests/tmp/*
@@ -36,8 +36,7 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
-option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    ${AVX_FOUND})
-option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      ${AVX_FOUND})
+option(WITH_MKL         "Compile PaddlePaddle with MKL support."        ${AVX_FOUND})
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
@@ -82,10 +81,8 @@ if(ANDROID OR IOS)
         "Disable PYTHON when cross-compiling for Android and iOS" FORCE)
     set(WITH_RDMA OFF CACHE STRING
         "Disable RDMA when cross-compiling for Android and iOS" FORCE)
-    set(WITH_MKLDNN OFF CACHE STRING
-        "Disable MKLDNN when cross-compiling for Android and iOS" FORCE)
-    set(WITH_MKLML OFF CACHE STRING
-        "Disable MKLML package when cross-compiling for Android and iOS" FORCE)
+    set(WITH_MKL OFF CACHE STRING
+        "Disable MKL when cross-compiling for Android and iOS" FORCE)

     # Compile PaddlePaddle mobile inference library
     if (NOT WITH_C_API)
@@ -111,6 +108,14 @@ else()
     set(THIRD_PARTY_BUILD_TYPE Release)
 endif()

+set(WITH_MKLML ${WITH_MKL})
+if (WITH_MKL AND AVX2_FOUND)
+    set(WITH_MKLDNN ON)
+else()
+    message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN")
+    set(WITH_MKLDNN OFF)
+endif()
+
 ########################################################################################

 include(external/mklml)     # download mklml package
@@ -158,14 +163,15 @@ set(EXTERNAL_LIBS
 )

 if(WITH_GPU)
-    list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
-    if(NOT WITH_DSO)
-        list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
-    endif(NOT WITH_DSO)
+    include(cuda)
 endif(WITH_GPU)

+if(WITH_MKLML)
+    list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
+endif()
+
 if(WITH_MKLDNN)
-    list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB} ${MKLDNN_IOMP_LIB})
+    list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
 endif()

 if(USE_NNPACK)
...
@@ -12,11 +12,11 @@ Machine:

 System: CentOS release 6.3 (Final), Docker 1.12.1.

-PaddlePaddle: paddlepaddle/paddle:latest (TODO: will rerun after 0.11.0)
-- MKL-DNN tag v0.10
-- MKLML 2018.0.20170720
+PaddlePaddle: paddlepaddle/paddle:latest (for MKLML and MKL-DNN), paddlepaddle/paddle:latest-openblas (for OpenBLAS)
+- MKL-DNN tag v0.11
+- MKLML 2018.0.1.20171007
 - OpenBLAS v0.2.20
+(TODO: will rerun after 0.11.0)

 On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively.
@@ -31,17 +31,37 @@ Input image size - 3 * 224 * 224, Time: images/second

 | BatchSize | 64    | 128   | 256    |
 |-----------|-------|-------|--------|
-| OpenBLAS  | 7.82  | 8.62  | 10.34  |
-| MKLML     | 11.02 | 12.86 | 15.33  |
-| MKL-DNN   | 27.69 | 28.8  | 29.27  |
+| OpenBLAS  | 7.80  | 9.00  | 10.80  |
+| MKLML     | 12.12 | 13.70 | 16.18  |
+| MKL-DNN   | 28.46 | 29.83 | 30.44  |
+
+chart on batch size 128
+TBD
+
+- ResNet-50
+
+| BatchSize | 64    | 128   | 256    |
+|-----------|-------|-------|--------|
+| OpenBLAS  | 25.22 | 25.68 | 27.12  |
+| MKLML     | 32.52 | 31.89 | 33.12  |
+| MKL-DNN   | 81.69 | 82.35 | 84.08  |

 chart on batch size 128
 TBD

-- ResNet
 - GoogLeNet

+| BatchSize | 64    | 128   | 256    |
+|-----------|-------|-------|--------|
+| OpenBLAS  | 89.52 | 96.97 | 108.25 |
+| MKLML     | 128.46| 137.89| 158.63 |
+| MKL-DNN   | 250.46| 264.83| 269.50 |
+
+chart on batch size 128
+TBD
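For a quick sense of scale, here is a small Python sketch (an editorial aside, not part of the benchmark suite) that derives the speedups implied by the batch-size-128 column of the updated tables above:

```python
# Throughput (images/second) at batch size 128, copied from the tables above.
results = {
    "VGG-19":    {"OpenBLAS": 9.00,  "MKLML": 13.70,  "MKL-DNN": 29.83},
    "ResNet-50": {"OpenBLAS": 25.68, "MKLML": 31.89,  "MKL-DNN": 82.35},
    "GoogLeNet": {"OpenBLAS": 96.97, "MKLML": 137.89, "MKL-DNN": 264.83},
}

for model, r in results.items():
    # Speedup is simply the ratio of throughputs.
    print("%-10s MKL-DNN is %.1fx OpenBLAS and %.1fx MKLML"
          % (model, r["MKL-DNN"] / r["OpenBLAS"], r["MKL-DNN"] / r["MKLML"]))
# Roughly 2.7-3.3x over OpenBLAS and 1.9-2.6x over MKLML on these models.
```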
 ### Laptop
 TBD
 ### Desktop
...
@@ -5,6 +5,7 @@ height = 224
 width = 224
 num_class = 1000
 batch_size = get_config_arg('batch_size', int, 128)
+use_gpu = get_config_arg('use_gpu', bool, True)

 args = {'height': height, 'width': width, 'color': True, 'num_class': num_class}
 define_py_data_sources2(
@@ -16,6 +17,8 @@ settings(
     learning_method=MomentumOptimizer(0.9),
     regularization=L2Regularization(0.0005 * batch_size))

+conv_projection = conv_projection if use_gpu else img_conv_layer
+
 def inception2(name, input, channels, \
     filter1,
     filter3R, filter3,
@@ -138,7 +141,7 @@ def inception(name, input, channels, \
     cat = concat_layer(
         name=name,
         input=[cov1, cov3, cov5, covprj],
-        bias_attr=True,
+        bias_attr=True if use_gpu else False,
         act=ReluActivation())
     return cat
...
 set -e

 function train() {
-  unset OMP_NUM_THREADS MKL_NUM_THREADS
-  export OMP_DYNAMIC="FALSE"
-  export KMP_AFFINITY="granularity=fine,compact,0,0"
+  unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
   topology=$1
   layer_num=$2
   bs=$3
@@ -14,8 +12,6 @@ function train() {
   elif [ $4 == "False" ]; then
     thread=`nproc`
     # each trainer_count use only 1 core to avoid conflict
-    export OMP_NUM_THREADS=1
-    export MKL_NUM_THREADS=1
     log="logs/${topology}-${layer_num}-${thread}mklml-${bs}.log"
   else
     echo "Wrong input $3, use True or False."
@@ -44,6 +40,7 @@ fi
 for use_mkldnn in True False; do
   for batchsize in 64 128 256; do
     train vgg 19 $batchsize $use_mkldnn
     train resnet 50 $batchsize $use_mkldnn
+    train googlenet v1 $batchsize $use_mkldnn
   done
 done
@@ -76,27 +76,14 @@ else()
     include_directories(${CUDA_TOOLKIT_INCLUDE})
 endif(NOT WITH_GPU)

-if(WITH_MKLDNN)
-    add_definitions(-DPADDLE_USE_MKLDNN)
-    if (WITH_MKLML AND MKLDNN_IOMP_DIR)
-        message(STATUS "Enable Intel OpenMP at ${MKLDNN_IOMP_DIR}")
-        set(OPENMP_FLAGS "-fopenmp")
-        set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
-        set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
-        set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
-        set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
-    else()
-        find_package(OpenMP)
-        if(OPENMP_FOUND)
-            set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
-            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
-        else()
-            message(WARNING "Can not find OpenMP."
-                "Some performance features in MKLDNN may not be available")
-        endif()
-    endif()
-endif(WITH_MKLDNN)
+if (WITH_MKLML AND MKLML_IOMP_LIB)
+    message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
+    set(OPENMP_FLAGS "-fopenmp")
+    set(CMAKE_C_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
+    set(CMAKE_CXX_CREATE_SHARED_LIBRARY_FORBIDDEN_FLAGS ${OPENMP_FLAGS})
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OPENMP_FLAGS}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OPENMP_FLAGS}")
+endif()

 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
...
@@ -76,11 +76,9 @@ set(IOS_PLATFORM ${IOS_PLATFORM} CACHE STRING "Type of iOS Platform")
 # Set the architecture for iOS
 if(NOT DEFINED IOS_ARCH)
   if(IOS_PLATFORM STREQUAL "OS")
-    # FIXME(liuyiqun): support "armv7;armv7s;arm64" future
-    set(IOS_ARCH "arm64")
+    set(IOS_ARCH "armv7;armv7s;arm64")
   elseif(IOS_PLATFORM STREQUAL "SIMULATOR")
-    # FIXME(liuyiqun): support "i386;x86_64" future
-    set(IOS_ARCH "x86_64")
+    set(IOS_ARCH "i386;x86_64")
   endif()
 endif()
 set(CMAKE_OSX_ARCHITECTURES ${IOS_ARCH} CACHE string "Build architecture for iOS")
@@ -248,7 +246,7 @@ set(IOS_COMPILER_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} ${XCODE_IOS_BITCODE_
 # Hidden visibilty is required for cxx on iOS
 set(CMAKE_C_FLAGS "${IOS_COMPILER_FLAGS} ${CMAKE_C_FLAGS}" CACHE STRING "C flags")
-set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")
+set(CMAKE_CXX_FLAGS "${IOS_COMPILER_FLAGS} -fvisibility=hidden -fvisibility-inlines-hidden ${CMAKE_CXX_FLAGS}" CACHE STRING "CXX flags")

 set(IOS_LINK_FLAGS "${XCODE_IOS_PLATFORM_VERSION_FLAGS} -Wl,-search_paths_first")
...
if(NOT WITH_GPU)
return()
endif()
set(paddle_known_gpu_archs "30 35 50 52 60 61 70")
set(paddle_known_gpu_archs7 "30 35 50 52")
set(paddle_known_gpu_archs8 "30 35 50 52 60 61")
######################################################################################
# A function for automatic detection of GPUs installed (if autodetection is enabled)
# Usage:
# detect_installed_gpus(out_variable)
function(detect_installed_gpus out_variable)
if(NOT CUDA_gpu_detect_output)
set(cufile ${PROJECT_BINARY_DIR}/detect_cuda_archs.cu)
file(WRITE ${cufile} ""
"#include <cstdio>\n"
"int main() {\n"
" int count = 0;\n"
" if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;\n"
" if (count == 0) return -1;\n"
" for (int device = 0; device < count; ++device) {\n"
" cudaDeviceProp prop;\n"
" if (cudaSuccess == cudaGetDeviceProperties(&prop, device))\n"
" std::printf(\"%d.%d \", prop.major, prop.minor);\n"
" }\n"
" return 0;\n"
"}\n")
execute_process(COMMAND "${CUDA_NVCC_EXECUTABLE}" "-ccbin=${CUDA_HOST_COMPILER}"
"--run" "${cufile}"
WORKING_DIRECTORY "${PROJECT_BINARY_DIR}/CMakeFiles/"
RESULT_VARIABLE nvcc_res OUTPUT_VARIABLE nvcc_out
ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
if(nvcc_res EQUAL 0)
# only keep the last line of nvcc_out
STRING(REGEX REPLACE ";" "\\\\;" nvcc_out "${nvcc_out}")
STRING(REGEX REPLACE "\n" ";" nvcc_out "${nvcc_out}")
list(GET nvcc_out -1 nvcc_out)
string(REPLACE "2.1" "2.1(2.0)" nvcc_out "${nvcc_out}")
      set(CUDA_gpu_detect_output ${nvcc_out} CACHE INTERNAL "Returned GPU architectures from detect_installed_gpus tool" FORCE)
endif()
endif()
if(NOT CUDA_gpu_detect_output)
message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
else()
set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
endif()
endfunction()
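For readers less fluent in CMake, here is the same detection idea as a hedged Python sketch (it assumes only that `nvcc` is on the PATH; error handling is simplified):

```python
import os
import subprocess
import tempfile

# Same .cu probe as detect_installed_gpus above: print the compute
# capability of every visible device, one "major.minor" pair per device.
CU_SOURCE = """#include <cstdio>
int main() {
  int count = 0;
  if (cudaSuccess != cudaGetDeviceCount(&count)) return -1;
  if (count == 0) return -1;
  for (int device = 0; device < count; ++device) {
    cudaDeviceProp prop;
    if (cudaSuccess == cudaGetDeviceProperties(&prop, device))
      std::printf("%d.%d ", prop.major, prop.minor);
  }
  return 0;
}
"""

def detect_installed_gpus(fallback="30 35 50 52 60 61 70"):
    with tempfile.TemporaryDirectory() as tmp:
        cufile = os.path.join(tmp, "detect_cuda_archs.cu")
        with open(cufile, "w") as f:
            f.write(CU_SOURCE)
        try:
            out = subprocess.run(["nvcc", "--run", cufile], cwd=tmp,
                                 capture_output=True, text=True, check=True)
            lines = out.stdout.strip().splitlines()
            return lines[-1] if lines else fallback
        except (OSError, subprocess.CalledProcessError):
            # Like the CMake version: fall back to all known architectures.
            return fallback

print(detect_installed_gpus())
```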
########################################################################
# Function for selecting GPU arch flags for nvcc based on CUDA_ARCH_NAME
# Usage:
# select_nvcc_arch_flags(out_variable)
function(select_nvcc_arch_flags out_variable)
# List of arch names
  set(archs_names "Kepler" "Maxwell" "Pascal" "Volta" "All" "Manual")
set(archs_name_default "All")
if(NOT CMAKE_CROSSCOMPILING)
list(APPEND archs_names "Auto")
endif()
  # set CUDA_ARCH_NAME strings (so it will be seen as a drop-down list in CMake GUI)
  set(CUDA_ARCH_NAME ${archs_name_default} CACHE STRING "Select target NVIDIA GPU architecture.")
set_property( CACHE CUDA_ARCH_NAME PROPERTY STRINGS "" ${archs_names} )
mark_as_advanced(CUDA_ARCH_NAME)
# verify CUDA_ARCH_NAME value
if(NOT ";${archs_names};" MATCHES ";${CUDA_ARCH_NAME};")
string(REPLACE ";" ", " archs_names "${archs_names}")
    message(FATAL_ERROR "Only ${archs_names} architecture names are supported.")
endif()
if(${CUDA_ARCH_NAME} STREQUAL "Manual")
set(CUDA_ARCH_BIN ${paddle_known_gpu_archs} CACHE STRING "Specify 'real' GPU architectures to build binaries for, BIN(PTX) format is supported")
set(CUDA_ARCH_PTX "50" CACHE STRING "Specify 'virtual' PTX architectures to build PTX intermediate code for")
mark_as_advanced(CUDA_ARCH_BIN CUDA_ARCH_PTX)
else()
unset(CUDA_ARCH_BIN CACHE)
unset(CUDA_ARCH_PTX CACHE)
endif()
if(${CUDA_ARCH_NAME} STREQUAL "Kepler")
set(cuda_arch_bin "30 35")
elseif(${CUDA_ARCH_NAME} STREQUAL "Maxwell")
set(cuda_arch_bin "50")
elseif(${CUDA_ARCH_NAME} STREQUAL "Pascal")
set(cuda_arch_bin "60 61")
elseif(${CUDA_ARCH_NAME} STREQUAL "Volta")
set(cuda_arch_bin "70")
elseif(${CUDA_ARCH_NAME} STREQUAL "All")
set(cuda_arch_bin ${paddle_known_gpu_archs})
elseif(${CUDA_ARCH_NAME} STREQUAL "Auto")
detect_installed_gpus(cuda_arch_bin)
else() # (${CUDA_ARCH_NAME} STREQUAL "Manual")
set(cuda_arch_bin ${CUDA_ARCH_BIN})
endif()
# remove dots and convert to lists
string(REGEX REPLACE "\\." "" cuda_arch_bin "${cuda_arch_bin}")
string(REGEX REPLACE "\\." "" cuda_arch_ptx "${CUDA_ARCH_PTX}")
string(REGEX MATCHALL "[0-9()]+" cuda_arch_bin "${cuda_arch_bin}")
string(REGEX MATCHALL "[0-9]+" cuda_arch_ptx "${cuda_arch_ptx}")
list(REMOVE_DUPLICATES cuda_arch_bin)
list(REMOVE_DUPLICATES cuda_arch_ptx)
set(nvcc_flags "")
set(nvcc_archs_readable "")
# Tell NVCC to add binaries for the specified GPUs
foreach(arch ${cuda_arch_bin})
if(arch MATCHES "([0-9]+)\\(([0-9]+)\\)")
# User explicitly specified PTX for the concrete BIN
list(APPEND nvcc_flags -gencode arch=compute_${CMAKE_MATCH_2},code=sm_${CMAKE_MATCH_1})
list(APPEND nvcc_archs_readable sm_${CMAKE_MATCH_1})
else()
# User didn't explicitly specify PTX for the concrete BIN, we assume PTX=BIN
list(APPEND nvcc_flags -gencode arch=compute_${arch},code=sm_${arch})
list(APPEND nvcc_archs_readable sm_${arch})
endif()
endforeach()
# Tell NVCC to add PTX intermediate code for the specified architectures
foreach(arch ${cuda_arch_ptx})
list(APPEND nvcc_flags -gencode arch=compute_${arch},code=compute_${arch})
list(APPEND nvcc_archs_readable compute_${arch})
endforeach()
string(REPLACE ";" " " nvcc_archs_readable "${nvcc_archs_readable}")
set(${out_variable} ${nvcc_flags} PARENT_SCOPE)
set(${out_variable}_readable ${nvcc_archs_readable} PARENT_SCOPE)
endfunction()
message(STATUS "CUDA detected: " ${CUDA_VERSION})
if (${CUDA_VERSION} LESS 7.0)
set(paddle_known_gpu_archs ${paddle_known_gpu_archs})
elseif (${CUDA_VERSION} LESS 8.0) # CUDA 7.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs7})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
elseif (${CUDA_VERSION} LESS 9.0) # CUDA 8.x
set(paddle_known_gpu_archs ${paddle_known_gpu_archs8})
list(APPEND CUDA_NVCC_FLAGS "-D_MWAITXINTRIN_H_INCLUDED")
list(APPEND CUDA_NVCC_FLAGS "-D__STRICT_ANSI__")
# CUDA 8 may complain that sm_20 is no longer supported. Suppress the
# warning for now.
list(APPEND CUDA_NVCC_FLAGS "-Wno-deprecated-gpu-targets")
endif()
include_directories(${CUDA_INCLUDE_DIRS})
list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO)
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
endif(NOT WITH_DSO)
# setting nvcc arch flags
select_nvcc_arch_flags(NVCC_FLAGS_EXTRA)
list(APPEND CUDA_NVCC_FLAGS ${NVCC_FLAGS_EXTRA})
message(STATUS "Added CUDA NVCC flags for: ${NVCC_FLAGS_EXTRA_readable}")
# Set C++11 support
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
# So, don't set these flags here.
list(APPEND CUDA_NVCC_FLAGS "-std=c++11")
list(APPEND CUDA_NVCC_FLAGS "--use_fast_math")
list(APPEND CUDA_NVCC_FLAGS "-Xcompiler -fPIC")
# Set :expt-relaxed-constexpr to suppress Eigen warnings
list(APPEND CUDA_NVCC_FLAGS "--expt-relaxed-constexpr")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
list(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL})
endif()
mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD)
mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION)
@@ -40,10 +40,9 @@ INCLUDE_DIRECTORIES(${MKLDNN_INC_DIR})

 IF(${CBLAS_PROVIDER} STREQUAL "MKLML")
     SET(MKLDNN_DEPENDS   ${MKLML_PROJECT})
-    SET(MKLDNN_MKLROOT   ${MKLML_ROOT})
-    SET(MKLDNN_IOMP_LIB  ${MKLML_IOMP_LIB})
-    SET(MKLDNN_IOMP_DIR  ${MKLML_LIB_DIR})
-    MESSAGE(STATUS "Build MKLDNN with ${MKLDNN_MKLROOT}")
+    MESSAGE(STATUS "Build MKLDNN with MKLML ${MKLML_ROOT}")
+ELSE()
+    MESSAGE(FATAL_ERROR "Should enable MKLML when build MKLDNN")
 ENDIF()

 SET(MKLDNN_CFLAG "${CMAKE_C_FLAGS} -Wno-error=strict-overflow")
@@ -57,15 +56,16 @@ ExternalProject_Add(
     PREFIX              ${MKLDNN_SOURCES_DIR}
     UPDATE_COMMAND      ""
     CMAKE_ARGS          -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
-    CMAKE_ARGS          -DMKLROOT=${MKLDNN_MKLROOT}
+    CMAKE_ARGS          -DMKLROOT=${MKLML_ROOT}
     CMAKE_ARGS          -DCMAKE_C_FLAGS=${MKLDNN_CFLAG}
     CMAKE_ARGS          -DCMAKE_CXX_FLAGS=${MKLDNN_CXXFLAG}
     CMAKE_CACHE_ARGS    -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
-                        -DMKLROOT:PATH=${MKLDNN_MKLROOT}
+                        -DMKLROOT:PATH=${MKLML_ROOT}
 )

 ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
 ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
-MESSAGE(STATUS "Mkldnn library: ${MKLDNN_LIB}")
+MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
+add_definitions(-DPADDLE_USE_MKLDNN)
 LIST(APPEND external_project_dependencies mkldnn)
@@ -29,7 +29,7 @@ IF(NOT ${CBLAS_FOUND})
         "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
         CACHE FILEPATH "openblas library." FORCE)

-    SET(OPENBLAS_CC "${CMAKE_C_COMPILER}")
+    SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")

     IF(CMAKE_CROSSCOMPILING)
         SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER})
@@ -45,15 +45,14 @@ IF(NOT ${CBLAS_FOUND})
             SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
         ENDIF()
     ELSEIF(IOS)
-        # FIXME(liuyiqun): support multiple architectures
+        IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
         SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
         SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
-        IF(CMAKE_OSX_ARCHITECTURES MATCHES "armv7")
-            SET(OPENBLAS_CC "${OPENBLAS_CC} -arch armv7")
-            SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
-        ELSEIF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
             SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
             SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
+        ELSE()
+            MESSAGE(FATAL_ERROR "OpenBLAS only support arm64 architectures on iOS. "
+                    "You can set IOS_USE_VECLIB_FOR_BLAS=ON or USE_EIGEN_FOR_BLAS=ON to use other blas library instead.")
         ENDIF()
     ELSEIF(RPI)
         # use hardfp
@@ -98,7 +97,7 @@ IF(NOT ${CBLAS_FOUND})
 ENDIF()

 INSTALL(CODE "execute_process(
     COMMAND ${CMAKE_COMMAND} -E copy_directory ${CBLAS_INSTALL_DIR}/lib
-        destination ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}
+        ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}
     )"
 )
 INSTALL(CODE "MESSAGE(STATUS \"Installing: \"
...
@@ -12,6 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+IF(MOBILE_INFERENCE)
+    return()
+ENDIF()
+
 INCLUDE(ExternalProject)

 SET(WARPCTC_SOURCES_DIR ${THIRD_PARTY_PATH}/warpctc)
...
@@ -149,58 +149,3 @@ endforeach()
 foreach(flag ${GPU_COMMON_FLAGS})
     safe_set_nvflag(${flag})
 endforeach()
-
-set(CUDA_PROPAGATE_HOST_FLAGS OFF)
-
-# Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
-# So, don't set these flags here.
-LIST(APPEND CUDA_NVCC_FLAGS -std=c++11)
-LIST(APPEND CUDA_NVCC_FLAGS --use_fast_math)
-
-if(CMAKE_BUILD_TYPE STREQUAL "Debug")
-    LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
-elseif(CMAKE_BUILD_TYPE STREQUAL "Release")
-    LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELEASE})
-elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
-    LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
-elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
-    LIST(APPEND CUDA_NVCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL})
-endif()
-
-function(specify_cuda_arch cuda_version cuda_arch)
-    if(${cuda_version} VERSION_GREATER "8.0")
-        foreach(capability 61 62)
-            if(${cuda_arch} STREQUAL ${capability})
-                list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
-            endif()
-        endforeach()
-    elseif(${cuda_version} VERSION_GREATER "7.0" and ${cuda_arch} STREQUAL "53")
-        list(APPEND __arch_flags " -gencode arch=compute_${cuda_arch},code=sm_${cuda_arch}")
-    endif()
-endfunction()
-
-# Common gpu architectures: Kepler, Maxwell
-foreach(capability 30 35 50)
-    list(APPEND __arch_flags " -gencode arch=compute_${capability},code=sm_${capability}")
-endforeach()
-
-if (CUDA_VERSION VERSION_GREATER "7.0" OR CUDA_VERSION VERSION_EQUAL "7.0")
-    list(APPEND __arch_flags " -gencode arch=compute_52,code=sm_52")
-endif()
-
-# Modern gpu architectures: Pascal
-if (CUDA_VERSION VERSION_GREATER "8.0" OR CUDA_VERSION VERSION_EQUAL "8.0")
-    list(APPEND __arch_flags " -gencode arch=compute_60,code=sm_60")
-    list(APPEND CUDA_NVCC_FLAGS --expt-relaxed-constexpr)
-endif()
-
-# Custom gpu architecture
-set(CUDA_ARCH)
-
-if(CUDA_ARCH)
-    specify_cuda_arch(${CUDA_VERSION} ${CUDA_ARCH})
-endif()
-
-set(CUDA_NVCC_FLAGS ${__arch_flags} ${CUDA_NVCC_FLAGS})
@@ -115,8 +115,8 @@ function(link_paddle_exe TARGET_NAME)
         target_link_libraries(${TARGET_NAME} log)
     endif(ANDROID)

-    if(WITH_MKLDNN AND WITH_MKLML AND MKLDNN_IOMP_DIR)
-        target_link_libraries(${TARGET_NAME} "-L${MKLDNN_IOMP_DIR} -liomp5 -Wl,--as-needed")
+    if(WITH_MKLML AND MKLML_LIB_DIR AND MKLML_IOMP_LIB)
+        target_link_libraries(${TARGET_NAME} "-L${MKLML_LIB_DIR} -liomp5 -Wl,--as-needed")
     endif()

     add_dependencies(${TARGET_NAME} ${external_project_dependencies})
...
@@ -335,6 +335,16 @@ bilinear_interp
 .. autoclass:: paddle.v2.layer.bilinear_interp
     :noindex:

+dot_prod
+---------
+.. autoclass:: paddle.v2.layer.dot_prod
+    :noindex:
+
+out_prod
+--------
+.. autoclass:: paddle.v2.layer.out_prod
+    :noindex:
+
 power
 -----
 .. autoclass:: paddle.v2.layer.power
@@ -372,6 +382,11 @@ cos_sim
 .. autoclass:: paddle.v2.layer.cos_sim
     :noindex:

+l2_distance
+-----------
+.. autoclass:: paddle.v2.layer.l2_distance
+    :noindex:
+
 trans
 -----
 .. autoclass:: paddle.v2.layer.trans
...
## Evaluator Design

### The Problem

During training or serving, we provide an evaluation function to measure the model performance, e.g., accuracy or precision. In the operator-based framework design, data goes through the network pipeline batch by batch, so inside an operator we can only compute the metrics of one minibatch. We need a mechanism to aggregate the metrics over every N batches or passes, as the user requests.

### Evaluator Design

Currently, every operation is expressed in the graph. We divide the evaluation process into three steps.

1. Initialize the metric state and add it into the block.

2. Calculate the statistics of the metric state in every mini-batch. A single operator is only responsible for calculating the necessary statistics of one mini-batch; for example, each run of the accuracy operator computes the accuracy of one minibatch only.

3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. For distributed or multi-GPU training, also aggregate the statistics from the different devices.

### Implementation

This design is illustrated with the Python API below. Each metric operator computes the statistics of one batch and returns the batch-aware states; the Python side is responsible for accumulating the states over a pass. A concrete usage sketch follows the base class.
```python
class Evaluator(object):
    """
    Evaluator base class.
    """

    def __init__(self, name, **kwargs):
        """
        Different evaluators may have different metric states. E.g., Accuracy
        needs two variables, the total and correct sample counts; AUC needs
        four variables: `true_positives`, `true_negatives`, `false_positives`
        and `false_negatives`.

        The initialization of an Evaluator is responsible for creating the
        metric state variables and appending them to the main_program.
        """
        pass

    def _update_ops(self, input, label, **kwargs):
        """
        Add the mini-batch evaluation operators to the main_program.
        Add increment operators to accumulate the metric states.
        """
        pass

    def reset(self, executor, reset_program=None):
        """
        Reset the metric states at the beginning of each pass, or after a
        user-specified number of batches, by executing reset_program.
        """
        pass

    def eval(self, executor, eval_program=None):
        """
        Merge the mini-batch statistics to form the evaluation result for
        multiple mini-batches: execute eval_program and return the result.
        """
        return eval_result
```
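To make the protocol concrete, here is a framework-free usage sketch in plain Python (using NumPy). It only illustrates the three steps above; it is not the actual Paddle API, and the class and method names are invented for exposition:

```python
import numpy as np


class AccuracyEvaluator(object):
    """Illustration only: keeps two metric states (total and correct
    sample counts), updates them per mini-batch, merges them on eval()."""

    def __init__(self):
        # Step 1: create the metric states.
        self.total = 0
        self.correct = 0

    def update(self, predictions, labels):
        # Step 2: accumulate the statistics of one mini-batch only.
        self.correct += int(np.sum(np.argmax(predictions, axis=1) == labels))
        self.total += len(labels)

    def reset(self):
        # Reset the states at the beginning of a pass.
        self.total = 0
        self.correct = 0

    def eval(self):
        # Step 3: merge the mini-batch statistics into the final metric.
        return float(self.correct) / max(self.total, 1)


# Two mini-batches of three samples each; five predictions are correct.
ev = AccuracyEvaluator()
ev.update(np.array([[0.9, 0.1], [0.2, 0.8], [0.7, 0.3]]), np.array([0, 1, 1]))
ev.update(np.array([[0.6, 0.4], [0.1, 0.9], [0.8, 0.2]]), np.array([0, 1, 0]))
print(ev.eval())  # 5 / 6 = 0.833...
```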
@@ -36,13 +36,13 @@ Figure 1. PaddlePaddle on IA.

 We roughly divide the integration plan into the following aspects.

 ### CMake
-We will add a `WITH_MKLDNN` option to `CMakeLists.txt`; setting it to `ON` enables the MKL-DNN build and automatically turns on OpenMP to improve MKL-DNN performance.
+We will add a `WITH_MKL` switch for users in `CMakeLists.txt`; it is the master switch that controls both `WITH_MKLML` and `WITH_MKLDNN`.

-We will also introduce a `WITH_MKLML` option for choosing whether to use the MKLML package shipped with MKL-DNN. The package can be used independently of MKL-DNN, but enabling MKLML together with MKL-DNN is recommended for the best performance.
+When `WITH_MKL` is on, MKLML is enabled as PaddlePaddle's CBLAS and LAPACK library, and Intel OpenMP is turned on to improve MKLML performance. If the system supports the AVX2 instruction set or above, MKL-DNN is enabled as well.

-Therefore, we will create `mkldnn.cmake` and `mklml.cmake` files under the `cmake/external` directory; they download the corresponding packages while building PaddlePaddle and place them in PaddlePaddle's third party directory.
+When `WITH_MKL` is off, both MKLML and MKL-DNN are disabled.

-**Note**: when `WITH_MKLML=ON`, this package is preferred as PaddlePaddle's CBLAS and LAPACK library, so the logic in `cmake/cblas.cmake` is adjusted slightly.
+Therefore, we will create `mkldnn.cmake` and `mklml.cmake` files under the `cmake/external` directory; they download the corresponding packages while building PaddlePaddle and place them in PaddlePaddle's third party directory.

 ### Layers
 All MKL-DNN related C++ layers will be placed following PaddlePaddle's directory structure in
...
@@ -34,7 +34,7 @@ PaddlePaddle documentation can be built in two ways.
 cd TO_YOUR_PADDLE_CLONE_PATH
 mkdir -p build
 cd build
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKL=OFF -DWITH_DOC=ON
 make gen_proto_py
 make paddle_docs paddle_docs_cn
...
-# Building the PaddlePaddle Library for Android
+# Android Platform Compilation Guide

 Users can cross-compile a PaddlePaddle library for Android in either of the following two ways:

 - a Docker-container-based build
...
-# Building the PaddlePaddle Library for iOS
+# iOS Platform Compilation Guide

 Cross-compiling the PaddlePaddle library for iOS must be done on macOS. This document describes how to cross-compile a PaddlePaddle library suitable for iOS from source on macOS.

 ## Preparing the cross-compilation environment
@@ -25,7 +25,7 @@ Optional iOS configuration parameters:

 - `IOS_PLATFORM`, either `OS` or `SIMULATOR`; defaults to `OS`.
   - `OS`, build for `arm`-architecture physical devices such as iPhone or iPad.
   - `SIMULATOR`, build for the `x86`-architecture simulator platform.
-- `IOS_ARCH`, the target architecture. The architectures available for each `IOS_PLATFORM` are listed in the table below:
+- `IOS_ARCH`, the target architecture. The architectures available for each `IOS_PLATFORM` are listed in the table below; by default all of them are built:

 <table class="docutils">
 <colgroup>
@@ -41,11 +41,11 @@ Optional iOS configuration parameters:
 <tbody valign="top">
 <tr class="row-even">
 <td>OS</td>
-<td>armv7, armv7s, arm64 (default)</td>
+<td>armv7, armv7s, arm64 </td>
 </tr>
 <tr class="row-odd">
 <td>SIMULATOR</td>
-<td>i386, x86_64 (default)</td>
+<td>i386, x86_64 </td>
 </tr>
 </tbody>
 </table>
@@ -66,7 +66,7 @@ Optional iOS configuration parameters:
 ```bash
 cmake -DCMAKE_SYSTEM_NAME=iOS \
       -DIOS_PLATFORM=OS \
-      -DIOS_ARCH="arm64" \
+      -DIOS_ARCH="armv7;arm64" \
       -DIOS_ENABLE_BITCODE=ON \
       -DIOS_USE_VECLIB_FOR_BLAS=ON \
       -DCMAKE_INSTALL_PREFIX=your/path/to/install \
@@ -112,6 +112,6 @@ $ make install
 - the `lib` directory, containing the PaddlePaddle C-API static library
 - the `third_party` directory, containing all third-party libraries it depends on

-Note that PaddlePaddle libraries for different architectures are best installed into separate directories and then merged with the `lipo` tool into a single fat library supporting multiple architectures.
+Note that if the PaddlePaddle library needs to support both physical devices and the simulator, the device and simulator versions must be built separately and then merged into a fat library with the `lipo` tool.

 At this point the PaddlePaddle library installation is complete. The merged fat library can be used in deep-learning-related iOS apps; see the C-API documentation for how to call it.
-# Building the PaddlePaddle Library for Raspberry Pi
+# Raspberry Pi Platform Compilation Guide

 There are generally two ways to build a Raspberry Pi based version:
...
@@ -29,6 +29,9 @@ static void initPaddle(int argc, char** argv) {

 extern "C" {
 paddle_error paddle_init(int argc, char** argv) {
+  static bool isInit = false;
+  if (isInit) return kPD_NO_ERROR;
+
   std::vector<char*> realArgv;
   realArgv.reserve(argc + 1);
   realArgv.push_back(strdup(""));
@@ -37,6 +40,7 @@ paddle_error paddle_init(int argc, char** argv) {
   }
   initPaddle(argc + 1, realArgv.data());
   free(realArgv[0]);
+  isInit = true;
   return kPD_NO_ERROR;
 }
 }
 #include <paddle/capi.h>
+#include <time.h>
 #include "../common/common.h"

 #define CONFIG_BIN "./trainer_config.bin"
@@ -27,20 +28,19 @@ int main() {
   CHECK(paddle_arguments_resize(in_args, 1));

   // Create input matrix.
-  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 10,
+  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
                                            /* size */ 784,
                                            /* useGPU */ false);
   srand(time(0));

-  std::vector<paddle_real> input;
-  input.resize(784 * 10);
+  paddle_real* array;
+
+  // Get First row.
+  CHECK(paddle_matrix_get_row(mat, 0, &array));

-  for (int i = 0; i < input.size(); ++i) {
-    input[i] = rand() / ((float)RAND_MAX);
+  for (int i = 0; i < 784; ++i) {
+    array[i] = rand() / ((float)RAND_MAX);
   }
-  // Set value for the input matrix
-  CHECK(paddle_matrix_set_value(mat, input.data()));

   CHECK(paddle_arguments_set_value(in_args, 0, mat));
@@ -53,17 +53,18 @@ int main() {
   CHECK(paddle_arguments_get_value(out_args, 0, prob));

-  std::std::vector<paddle_real> result;
-  int height;
-  int width;
+  uint64_t height;
+  uint64_t width;

-  CHECK(paddle_matrix_get_shape(prob, &height, &width);
-  result.resize(height * width);
-  CHECK(paddle_matrix_get_value(prob, result.data()));
+  CHECK(paddle_matrix_get_shape(prob, &height, &width));
+  CHECK(paddle_matrix_get_row(prob, 0, &array));

-  printf("Prob: ");
+  printf("Prob: \n");
   for (int i = 0; i < height * width; ++i) {
-    printf("%.2f ", result[i]);
+    printf("%.4f ", array[i]);
+    if ((i + 1) % width == 0) {
+      printf("\n");
+    }
   }
   printf("\n");
...
@@ -25,7 +25,9 @@ limitations under the License. */
 #include "hl_matrix.h"
 #include "hl_sequence.h"
 #include "hl_sparse.h"
+#ifndef PADDLE_MOBILE_INFERENCE
 #include "hl_warpctc_wrap.h"
+#endif

 #ifdef HPPL_STUB_FUNC
 #include "stub/hl_aggregate_stub.h"
...
@@ -38,9 +38,9 @@ py_proto_compile(framework_py_proto SRCS framework.proto)
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)
 add_custom_command(TARGET framework_py_proto POST_BUILD
-    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/proto
-    COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/proto/
-    COMMENT "Copy generated python proto into directory paddle/v2/framework/proto."
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/proto
+    COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/v2/fluid/proto/
+    COMMENT "Copy generated python proto into directory paddle/v2/fluid/proto."
     WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})

 cc_library(backward SRCS backward.cc DEPS net_op)
...
@@ -270,6 +270,19 @@ static bool AllGradInSet(const std::vector<std::string>& names,
       return false;
     }
   }
+  if (VLOG_IS_ON(10)) {
+    std::ostringstream sout;
+    sout << "All input {";
+    for (auto& name : names) {
+      sout << name << ",";
+    }
+    sout << "} is in {";
+    for (auto& name : set) {
+      sout << name << ",";
+    }
+    sout << "}";
+    VLOG(10) << sout.str();
+  }
   return true;
 }
@@ -290,14 +303,12 @@ static void CreateGradVarInBlock(
   auto ops = block_desc->AllOps();
   for (size_t op_index = grad_op_start_index; op_index < ops.size();
        ++op_index) {
-    bool need_infer_shape = false;
     std::unordered_set<std::string> new_vars;
     ForEachVarName(ops[op_index]->Outputs(),
                    [&](const std::string& grad_var_name) {
                      if (block_desc->HasVar(grad_var_name)) {
                        return false;
                      }
-                     need_infer_shape = true;
                      auto var = block_desc->Var(grad_var_name);
                      new_vars.insert(var->Name());
                      auto it = param_name_map.find(grad_var_name);
@@ -311,23 +322,21 @@ static void CreateGradVarInBlock(
                      grad_record.op_idx_ = static_cast<int>(op_index);
                      return false; /* not break */
                    });
-    if (need_infer_shape) {
-      ops[op_index]->InferVarType(block_desc);
-      for (auto& arg : ops[op_index]->OutputArgumentNames()) {
-        if (new_vars.find(arg) == new_vars.end()) {
-          continue;
-        }
-        auto pname = FwdName(arg);
-        auto* param = block_desc->FindVarRecursive(pname);
-        auto* grad = block_desc->FindVar(arg);
-        if (param == nullptr) {
-          grad->SetDataType(DataType::FP32);
-        } else {
-          grad->SetDataType(param->GetDataType());
-        }
+    ops[op_index]->InferVarType(block_desc);
+    for (auto& arg : ops[op_index]->OutputArgumentNames()) {
+      if (new_vars.find(arg) == new_vars.end()) {
+        continue;
       }
-      ops[op_index]->InferShape(*block_desc);
+      auto pname = FwdName(arg);
+      auto* param = block_desc->FindVarRecursive(pname);
+      auto* grad = block_desc->FindVar(arg);
+      if (param == nullptr) {
+        grad->SetDataType(DataType::FP32);
+      } else {
+        grad->SetDataType(param->GetDataType());
+      }
     }
+    ops[op_index]->InferShape(*block_desc);
   }
 }
@@ -387,6 +396,7 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
     ProgramDescBind& program_desc, int block_idx,
     std::unordered_set<std::string>* no_grad_vars,
     std::unordered_map<std::string, std::string>* grad_to_var) {
+  VLOG(5) << "MakeBlockBackward";
   BlockDescBind* cur_block = program_desc.MutableBlock(block_idx);
   std::vector<OpDescBind*> op_descs = cur_block->AllOps();
   std::unordered_map<std::string, std::vector<size_t>> dup_out_ops;
@@ -394,9 +404,10 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(

   std::vector<std::unique_ptr<OpDescBind>> backward_descs;
   for (auto it = op_descs.rbegin(); it != op_descs.rend(); ++it) {
+    VLOG(5) << "Making backward " << (*it)->Type() << " op";
     std::vector<std::unique_ptr<OpDescBind>> op_grads;

-    if ((*it)->Type() == "recurrent") {
+    if ((*it)->Type() == "recurrent" || (*it)->Type() == "while") {
       int step_block_idx = (*it)->GetBlockAttr("step_block");
       BlockDescBind* backward_block = CreateStepBlock(
           program_desc, no_grad_vars, grad_to_var, step_block_idx);
@@ -410,6 +421,15 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
       op_grads = MakeOpGrad(*it, no_grad_vars, grad_to_var);
     }

+    if (VLOG_IS_ON(10)) {
+      std::ostringstream sout;
+      sout << "Made ";
+      for (auto& op_grad : op_grads) {
+        sout << op_grad->Type() << " ";
+      }
+      VLOG(10) << sout.str();
+    }
+
     for (const auto& desc : op_grads) {
       for (const std::string& out_name : desc->OutputArgumentNames()) {
         if (out_name.find("@GRAD") == std::string::npos) {
@@ -425,6 +445,8 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
         op_grads.begin(), op_grads.end(), std::back_inserter(backward_descs),
         [](std::unique_ptr<OpDescBind>& ptr) { return std::move(ptr); });
   }

+  VLOG(5) << "Appending Sums";
+
   // Check whether some variables are written more than once
   std::list<std::pair<size_t, std::unique_ptr<OpDescBind>>> pending_sum_ops;
   for (const auto& dup : dup_out_ops) {
@@ -432,16 +454,22 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
     const std::vector<size_t> dup_op = dup.second;
     if (out_name != kEmptyVarName && dup_op.size() > 1) {
       std::vector<std::string> sum_op_inputs;
+      std::string next_g_name = out_name;
       for (size_t i = 0; i < dup_op.size(); ++i) {
+        VLOG(10) << backward_descs[dup_op[i]]->Type() << " has " << out_name
+                 << " duplicated";
         std::string new_name = out_name + "@RENAME@" + std::to_string(i);
-        backward_descs[dup_op[i]]->Rename(out_name, new_name);
+        backward_descs[dup_op[i]]->RenameOutput(out_name, new_name);
+        backward_descs[dup_op[i]]->RenameInput(out_name, next_g_name);
         sum_op_inputs.emplace_back(new_name);
+        next_g_name = sum_op_inputs.back();
       }
       std::unique_ptr<OpDescBind> sum_op(new OpDescBind(
           "sum", {{"X", sum_op_inputs}}, {{"Out", {out_name}}}, {}));
       pending_sum_ops.push_back({dup_op.back(), std::move(sum_op)});
     }
   }

   pending_sum_ops.sort(
       [](const std::pair<size_t, std::unique_ptr<OpDescBind>>& a,
          const std::pair<size_t, std::unique_ptr<OpDescBind>>& b) {
@@ -452,6 +480,8 @@ std::vector<std::unique_ptr<OpDescBind>> MakeBlockBackward(
                          std::move(p.second));
   }

+  VLOG(5) << "MakeBlockBackward Finished";
+
   return backward_descs;
 }
@@ -483,19 +513,14 @@ ParamGradInfoMap AppendBackward(
   const int root_block_idx = 0;
   auto root_block = program_desc.MutableBlock(root_block_idx);

-  // insert fill one op for target
-  // TODO(qiao) add some check to the target.
   std::string fill_one_op_out = GradVarName(target.Name());
-  std::vector<int64_t> target_shape_desc = target.Shape();
-  std::vector<int> target_shape;
-  std::transform(target_shape_desc.begin(), target_shape_desc.end(),
-                 std::back_inserter(target_shape),
-                 [](int64_t dim) { return static_cast<int>(dim); });
+  bool is_scalar = target.Shape() == std::vector<int64_t>{1};
+  PADDLE_ENFORCE(is_scalar, "target should be scalar");
   VLOG(3) << "backward from loss=" << target.Name()
           << " data_type=" << target.GetDataType();
   std::unique_ptr<OpDescBind> fill_one_op(
       new OpDescBind("fill_constant", {}, {{"Out", {fill_one_op_out}}},
-                     {{"shape", target_shape},
+                     {{"shape", std::vector<int>{1}},
                       {"value", static_cast<float>(1.0)},
                       {"data_type", target.GetDataType()}}));
   // infer var type of fill_one_op
...
@@ -508,6 +508,7 @@ TEST(Backward, simple_single_op) {
   op->SetOutput("Out", {"out"});

   auto target = f::VarDescBind("out");
+  target.SetShape({1});
   auto var_to_grad = AppendBackward(program, target, {});

   ASSERT_EQ(block->AllOps().size(), 3UL);
@@ -544,6 +545,7 @@ TEST(Backward, default_attribute) {
   op->CheckAttrs();

   auto target = f::VarDescBind("out");
+  target.SetShape({1});
   AppendBackward(program, target, {});

   ASSERT_EQ(block->AllOps().size(), 3UL);
@@ -581,6 +583,7 @@ TEST(Backward, simple_mult_op) {
   op3->SetOutput("Out", {"out3"});

   auto target = f::VarDescBind("out3");
+  target.SetShape({1});
   size_t forward_len = block->AllOps().size();
   auto var_to_grad = AppendBackward(program, target, {});
@@ -670,6 +673,7 @@ TEST(Backward, intermedia_var_no_grad) {
   op4->SetOutput("Out", {"out4"});

   auto target = f::VarDescBind("out4");
+  target.SetShape({1});
   size_t forward_len = block->AllOps().size();
   auto var_to_grad = AppendBackward(program, target, {"out3"});
@@ -730,6 +734,7 @@ TEST(Backward, var_no_grad) {
   op2->SetOutput("Z", {"z2"});

   auto target = f::VarDescBind("z2");
+  target.SetShape({1});
   size_t forward_len = block->AllOps().size();
   auto var_to_grad = AppendBackward(program, target, {"z1"});
@@ -810,6 +815,7 @@ TEST(Backward, shared_var) {
   op3->SetOutput("Out", {"out3"});

   auto target = f::VarDescBind("out3");
+  target.SetShape({1});
   size_t forward_len = block->AllOps().size();
   auto var_to_grad = AppendBackward(program, target, {});
@@ -888,6 +894,7 @@ TEST(Backward, half_backward) {
   op1->SetOutput("Out", {"out"});

   auto target = f::VarDescBind("out");
+  target.SetShape({1});
   size_t forward_len = block->AllOps().size();
   auto var_to_grad = AppendBackward(program, target, {"b"});
   f::OpDescBind *fill_op = block->AllOps()[forward_len];
...
@@ -29,6 +29,8 @@ inline DataType ToDataType(std::type_index type) {
     return DataType::INT32;
   } else if (typeid(int64_t).hash_code() == type.hash_code()) {
     return DataType::INT64;
+  } else if (typeid(bool).hash_code() == type.hash_code()) {
+    return DataType::BOOL;
   } else {
     PADDLE_THROW("Not supported");
   }
@@ -44,6 +46,8 @@ inline std::type_index ToTypeIndex(DataType type) {
       return typeid(int);
     case DataType::INT64:
       return typeid(int64_t);
+    case DataType::BOOL:
+      return typeid(bool);
     default:
       PADDLE_THROW("Not support type %d", type);
   }
@@ -64,6 +68,9 @@ inline void VisitDataType(DataType type, Visitor visitor) {
     case DataType::INT64:
       visitor.template operator()<int64_t>();
       break;
+    case DataType::BOOL:
+      visitor.template operator()<bool>();
+      break;
     default:
       PADDLE_THROW("Not supported");
   }
...
@@ -60,8 +60,7 @@ void make_ddim(DDim& ddim, const int64_t* dims, int n) {
       ddim = make_dim<9>(dims);
       break;
     default:
-      throw std::invalid_argument(
-          "Dynamic dimensions must have between [1, 9] dimensions.");
+      PADDLE_THROW("Dynamic dimensions must have between [1, 9] dimensions.");
   }
 }
...
@@ -120,6 +120,7 @@ void Executor::Run(const ProgramDescBind& pdesc, Scope* scope, int block_id,

   for (auto& op_desc : block.AllOps()) {
     auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
+    VLOG(10) << op->DebugString();
     op->Run(*local_scope, *device);
   }
   if (create_local_scope) {
...
@@ -235,6 +235,23 @@ void OpDescBind::Rename(const std::string &old_name,
   need_update_ = true;
 }

+void OpDescBind::RenameOutput(const std::string &old_name,
+                              const std::string &new_name) {
+  for (auto &output : outputs_) {
+    std::replace(output.second.begin(), output.second.end(), old_name,
+                 new_name);
+  }
+  need_update_ = true;
+}
+
+void OpDescBind::RenameInput(const std::string &old_name,
+                             const std::string &new_name) {
+  for (auto &input : inputs_) {
+    std::replace(input.second.begin(), input.second.end(), old_name, new_name);
+  }
+  need_update_ = true;
+}
+
 struct SetAttrDescVisitor : public boost::static_visitor<void> {
   explicit SetAttrDescVisitor(OpDesc::Attr *attr) : attr_(attr) {}
   mutable OpDesc::Attr *attr_;
@@ -448,7 +465,12 @@ const std::vector<std::string> &CompileTimeInferShapeContext::Outputs(
 DDim CompileTimeInferShapeContext::GetDim(const std::string &name) const {
   auto var = block_.FindVarRecursive(name);
   PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
-  return framework::make_ddim(var->Shape());
+  try {
+    return framework::make_ddim(var->Shape());
+  } catch (...) {
+    VLOG(5) << "GetDim of variable " << name << " error";
+    std::rethrow_exception(std::current_exception());
+  }
 }

 void CompileTimeInferShapeContext::SetDim(const std::string &name,
...
@@ -73,6 +73,10 @@ class OpDescBind {

   void Rename(const std::string &old_name, const std::string &new_name);

+  void RenameOutput(const std::string &old_name, const std::string &new_name);
+
+  void RenameInput(const std::string &old_name, const std::string &new_name);
+
   // Only be used in C++
   const AttributeMap &GetAttrMap() const;
...
@@ -403,19 +403,6 @@ class RuntimeInferShapeContext : public InferShapeContext {

 void OperatorWithKernel::Run(const Scope& scope,
                              const platform::DeviceContext& dev_ctx) const {
-  if (VLOG_IS_ON(1)) {
-    auto inputs = this->InputVars();
-    auto outputs = this->OutputVars(true);
-    std::ostringstream sout;
-    sout << "Run operator " << this->Type() << " From [";
-    std::ostream_iterator<std::string> out_it(sout, ",");
-    std::copy(inputs.begin(), inputs.end(), out_it);
-    sout << "] to [";
-    std::copy(outputs.begin(), outputs.end(), out_it);
-    sout << "]";
-    VLOG(1) << sout.str();
-  }
   RuntimeInferShapeContext infer_shape_ctx(*this, scope);
   this->InferShape(&infer_shape_ctx);
...
@@ -38,11 +38,12 @@ Scope& Scope::NewScope() const {
 Variable* Scope::Var(const std::string& name) {
   auto iter = vars_.find(name);
   if (iter != vars_.end()) {
+    VLOG(3) << "Get existing variable " << name;
     return iter->second;
   }
   Variable* v = new Variable();
   vars_[name] = v;
-  VLOG(3) << "Create variable " << name << " on scope";
+  VLOG(3) << "Create variable " << name;
   v->name_ = &(vars_.find(name)->first);
   return v;
 }
...
@@ -53,6 +53,10 @@ class InferShapeContext {
   virtual bool IsRuntime() const = 0;

+  // Note: In while op, we need this to be public
+  void SetDims(const std::vector<std::string> &names,
+               const std::vector<framework::DDim> &dims);
+
  protected:
   virtual framework::DDim GetDim(const std::string &name) const = 0;
   virtual void SetDim(const std::string &name, const framework::DDim &dim) = 0;
@@ -60,9 +64,6 @@ class InferShapeContext {
   std::vector<framework::DDim> GetDims(
       const std::vector<std::string> &names) const;

-  void SetDims(const std::vector<std::string> &names,
-               const std::vector<framework::DDim> &dims);
-
   std::vector<VarDesc::VarType> GetVarTypes(
       const std::vector<std::string> &names) const;
......
@@ -73,7 +73,6 @@ if(MOBILE_INFERENCE)
     list(REMOVE_ITEM GSERVER_SOURCES
         dataproviders/DataProvider.cpp
         dataproviders/MultiDataProvider.cpp
-        dataproviders/ProtoDataProvider.cpp
        dataproviders/PyDataProvider2.cpp
        dataproviders/PyDataProvider.cpp)
......
@@ -212,6 +212,37 @@ Error __must_check backward(Argument& act) {
 }
 END_DEFINE_ACTIVATION(sequence_softmax)

+/*
+ * @brief SoftSign Activation.
+ * \f[
+ *    f(z) = \frac{z}{1 + |z|}
+ * \f]
+ */
+BEGIN_DEFINE_ACTIVATION(softsign)
+private:
+MatrixPtr denominator_;
+
+Error __must_check forward(Argument& act) {
+  size_t height = act.value->getHeight();
+  size_t width = act.value->getWidth();
+  Matrix::resizeOrCreate(
+      denominator_, height, width, false, useGpu(act.deviceId));
+  denominator_->assign(*act.value);
+  denominator_->abs2();
+  denominator_->add(1.);
+
+  act.value->dotDiv(*act.value, *denominator_);
+  return Error();
+}
+
+Error __must_check backward(Argument& act) {
+  denominator_->square2();
+  denominator_->scalarDiv(*denominator_, 1.);
+  act.grad->dotMul(*act.grad, *denominator_);
+  return Error();
+}
+END_DEFINE_ACTIVATION(softsign)
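The backward pass above is terse because the derivative of softsign can be written entirely in terms of denominator_, which still holds 1 + |z| from the forward pass:

\[
f(z) = \frac{z}{1 + |z|}, \qquad
f'(z) = \frac{(1 + |z|) - z\,\operatorname{sgn}(z)}{(1 + |z|)^2}
      = \frac{1}{(1 + |z|)^2},
\]

using z \cdot \operatorname{sgn}(z) = |z|. So square2() squares the saved denominator, scalarDiv(*denominator_, 1.) takes its elementwise reciprocal, and dotMul scales the incoming gradient by exactly 1/(1 + |z|)^2.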
 /**
  * @brief Relu Activation.
  * forward. y = max(0, z)
......
@@ -16,8 +16,8 @@ limitations under the License. */
 #include <unistd.h>
 #include <algorithm>
-#include "ProtoDataProvider.h"
 #include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
 #include "paddle/utils/StringUtil.h"
 #include "paddle/utils/Util.h"
@@ -164,8 +164,6 @@ DataProvider* DataProvider::create(const DataConfig& config,
 REGISTER_DATA_PROVIDER(simple, SimpleDataProvider);
 REGISTER_DATA_PROVIDER(dummy, DummyDataProvider);
-REGISTER_DATA_PROVIDER(proto, ProtoDataProvider);
-REGISTER_DATA_PROVIDER(proto_sequence, ProtoSequenceDataProvider);
 int64_t DataProvider::getNextBatch(int64_t size, DataBatch* batch) {
   int64_t batchSize = doubleBuffer_ ? getNextBatchFromBuffer(size, batch)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "DataFormat.pb.h"
#include "paddle/utils/Stat.h"
#include "DataProvider.h"
#include "ProtoReader.h"
namespace paddle {
/**
* @brief Provides data from a protobuf data file, with each sample
* specified by a proto message
*
* DataSample defined in DataFormat.proto.
*
* The file format is
*
* header
*
* sample1
*
* sample2
*
* ...
*
* sampleN
*
* @note: In the data file, each message is prefixed with its length.
* The read/write of the protobuf messages is implemented in ProtoReader.h
*/
class ProtoDataProvider : public DataProvider {
public:
ProtoDataProvider(const DataConfig& config,
bool useGpu,
bool loadDataAll = true);
virtual void reset();
/**
* @note this size includes the sequences which are skipped because they
* are longer than the batch size.
*/
virtual int64_t getSize() {
int64_t size = sampleNums_;
if (usageRatio_ < 1.0f) {
size = static_cast<int64_t>(size * usageRatio_);
}
return size;
}
virtual void shuffle();
void loadData(const std::vector<std::string>& fileList);
virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
protected:
/**
* @brief load protobuf data from a list of file
* @param[in] fileName file name of a file which contains
* a list of file names
*/
void loadData(const std::string& fileName);
/**
* @brief load protobuf data from file
* @param[in] fileName data file name
*/
void loadDataFile(const std::string& fileName);
/** @brief check data header of each data sample
* @param[in] header data header read from protobuf data
*/
void checkDataHeader(const DataHeader& header);
/**
* @brief fill protobuf data into slot_,
* slot_ is a vector of ProtoSlot in memory.
* @param[in] sample data sample read from protobuf data
*/
void fillSlots(const DataSample& sample);
/**
* @brief return true if each sample is one sequence, i.e., independent
* of other samples.
*/
inline bool iidData() const { return sequenceStartPositions_.empty(); }
/**
* @brief check that sample is consistent with header_
*/
void checkSample(const DataSample& sample);
template <class Op>
int64_t sequenceLoop(Op op, int64_t size);
template <class Op>
int64_t sampleLoop(Op op, int64_t size);
template <class Op>
int64_t subSampleLoop(Op op, int64_t size, int slot);
void showDataStats();
protected:
struct ProtoVarSlot {
std::vector<real> data;
std::vector<int> dims;
};
struct ProtoSlot {
SlotDef::SlotType type;
int dim;
std::vector<int> indexData;
std::vector<real> denseData;
std::vector<sparse_non_value_t> sparseNonValueData;
std::vector<sparse_float_value_t> sparseFloatValueData;
std::vector<int64_t> indices;
std::vector<int64_t> subIndices;
std::vector<ProtoVarSlot> varDenseData;
std::vector<std::vector<int>> varIndices;
std::vector<std::string> strData;
};
DataHeader header_;
int numVecSlots_;
std::vector<ProtoSlot> slots_;
size_t sampleNums_;
/**
* The starting position of each sequence in samples.
* The last element should be num of samples.
* If empty, each sample is one sequence.
*/
std::vector<size_t> sequenceStartPositions_;
int64_t currentSequenceIndex_;
// The size should be the number of sequences.
std::vector<size_t> shuffledSequenceIds_;
ThreadLocalD<DataBatch> cpuBatch_;
ThreadLocalD<DataBatch> gpuBatch_;
RWLock lock_;
std::vector<StatPtr> nnzStats_;  // stats for the number of non-zero entries
};
/**
* @brief Special use for Proto data: instances should contain sparse-non-value
* slots
* and label.
*
* @note ProtoSequenceDataProvider treats each SPARSE SLOT as a SEQUENCE
*/
class ProtoSequenceDataProvider : public ProtoDataProvider {
public:
ProtoSequenceDataProvider(const DataConfig& config,
bool useGpu,
bool loadDataAll = true);
~ProtoSequenceDataProvider() {}
virtual int64_t getNextBatchInternal(int64_t size, DataBatch* batch);
};
} // namespace paddle
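For reference, the length-prefixed framing described in the class comment above can be read with protobuf's CodedInputStream. This is a minimal sketch, assuming a varint length prefix; the actual framing used by Paddle lives in ProtoReader.h (not shown here), and ReadOneMessage is an illustrative name:

#include <cstdint>
#include <google/protobuf/io/coded_stream.h>
#include <google/protobuf/message.h>

// Reads one length-prefixed message; returns false on a clean end of stream
// or a malformed record.
bool ReadOneMessage(google::protobuf::io::CodedInputStream* in,
                    google::protobuf::Message* msg) {
  uint32_t size;
  if (!in->ReadVarint32(&size)) return false;  // end of stream
  auto limit = in->PushLimit(size);  // confine the parse to `size` bytes
  bool ok = msg->ParseFromCodedStream(in) && in->ConsumedEntireMessage();
  in->PopLimit(limit);
  return ok;
}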
@@ -41,6 +41,7 @@ bool BatchNormBaseLayer::init(const LayerMap& layerMap,
     useGlobalStats_ = config_.use_global_stats();
   }
   movingAvgFraction_ = config_.moving_average_fraction();
+  epsilon_ = config_.epsilon();

   weight_.reset(new Weight(1, channels_, parameters_[0]));
   movingMean_.reset(new Weight(1, channels_, parameters_[1]));
......
@@ -94,6 +94,8 @@ protected:
   bool useGlobalStats_;
   // use to compute moving mean and variance.
   real movingAvgFraction_;
+  // Epsilon is a small constant added to the variance for numerical stability.
+  real epsilon_;
 };

 }  // namespace paddle
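For context, epsilon enters batch normalization as an offset on the variance, keeping the inverse standard deviation finite when a channel's variance is near zero:

\[
\hat{x} = \frac{x - \mu}{\sqrt{\sigma^2 + \varepsilon}}, \qquad
y = \gamma\,\hat{x} + \beta
\]

In the CPU implementation below, subScalar(-epsilon_) followed by sqrt2 computes \sqrt{\sigma^2 + \varepsilon} in place.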
@@ -22,8 +22,6 @@ namespace paddle {
 REGISTER_LAYER(batch_norm, BatchNormalizationLayer);

-const real BatchNormalizationLayer::EPS = 1E-5;
-
 bool BatchNormalizationLayer::init(const LayerMap& layerMap,
                                    const ParameterMap& parameterMap) {
   /* Initialize the basic parent class */
@@ -53,7 +51,7 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) {
   calMovingMeanAndVar();

-  savedInvVar_->subScalar(-EPS);
+  savedInvVar_->subScalar(-epsilon_);
   savedInvVar_->sqrt2(*savedInvVar_);
 }
@@ -74,7 +72,7 @@ void BatchNormalizationLayer::setMeanAndStd() {
   savedInvVar_->copyFrom(*(movingVar_->getW()));
   savedInvVar_->downClip(real(0.0));

-  savedInvVar_->subScalar(-EPS);
+  savedInvVar_->subScalar(-epsilon_);
   savedInvVar_->sqrt2(*savedInvVar_);
 }
......
@@ -39,9 +39,6 @@ public:
   void backward(const UpdateCallback& callback = nullptr) override;

 protected:
-  /// Epsilon value used in the batch normalization formula.
-  static const real EPS;
-
   /// Load pre-calculated mean and std.
   void setMeanAndStd();
......
@@ -21,8 +21,6 @@ namespace paddle {
 REGISTER_LAYER(cudnn_batch_norm, CudnnBatchNormLayer);

-const double CudnnBatchNormLayer::EPS = 1E-5;
-
 bool CudnnBatchNormLayer::init(const LayerMap& layerMap,
                                const ParameterMap& parameterMap) {
   /* Initialize the basic parent class */
@@ -61,6 +59,9 @@ void CudnnBatchNormLayer::forward(PassType passType) {
   real* movingMean = movingMean_->getW()->getData();
   real* movingVar = movingVar_->getW()->getData();

+  // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON.
+  eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast<double>(epsilon_));
+
   if (!useGlobalStats_) {
     REGISTER_TIMER_INFO("CudnnBatchFwTimer", getName().c_str());
     real* savedMean = savedMean_->getData();
@@ -75,7 +76,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
                                 1.0 - movingAvgFraction_,
                                 movingMean,
                                 movingVar,
-                                EPS,
+                                eps_,
                                 savedMean,
                                 savedInvVar);
   } else {
@@ -90,7 +91,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
                                  beta,
                                  movingMean,
                                  movingVar,
-                                 EPS);
+                                 eps_);
     } else {
       // There is a limitation in cudnn library.
       // When the batch size is larger than 1024 in cuDNN v5.1,
@@ -101,7 +102,7 @@ void CudnnBatchNormLayer::forward(PassType passType) {
                                 beta,
                                 movingMean,
                                 movingVar,
-                                EPS,
+                                eps_,
                                 batchSize,
                                 channels_,
                                 imageH_ * imageD_,
@@ -128,6 +129,9 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) {
   real* savedMean = savedMean_->getData();
   real* savedInvVar = savedInvVar_->getData();

+  // cuDNN does not allow an epsilon value less than CUDNN_BN_MIN_EPSILON.
+  eps_ = std::max(CUDNN_BN_MIN_EPSILON, static_cast<double>(epsilon_));
+
   auto create = [](MatrixPtr& m, size_t h, size_t w, real** p) {
     Matrix::resizeOrCreate(m, h, w, false, true);
     m->zeroMem();
@@ -157,7 +161,7 @@ void CudnnBatchNormLayer::backward(const UpdateCallback& callback) {
                     gamma,
                     gammaGrad,
                     betaGrad,
-                    EPS,
+                    eps_,
                     savedMean,
                     savedInvVar);
......
@@ -14,6 +14,7 @@ limitations under the License. */

 #pragma once

+#include <cudnn.h>
 #include "BatchNormBaseLayer.h"
 #include "Layer.h"
 #include "paddle/utils/Stat.h"
@@ -46,12 +47,9 @@ public:
   void backward(const UpdateCallback& callback = nullptr) override;

 protected:
-  /**
-   * Epsilon value used in the batch normalization formula.
-   * Minimum allowed value is CUDNN_BN_MIN_EPSILON defined in cudnn.h.
-   * Same epsilon value should be used in forward and backward functions.
-   */
-  static const double EPS;
+  /// Epsilon value used in the batch normalization formula.
+  /// Same epsilon value should be used in forward and backward functions.
+  double eps_;

   /// Input/output tensor descriptor desc
   hl_tensor_descriptor ioDesc_;
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "Layer.h"
#include "paddle/math/Matrix.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
namespace paddle {
/**
* @brief A layer for computing the dot product of two vectors.
* Input1: vector (batchSize * dim)
* Input2: vector (batchSize * dim)
* Output: a matrix: (batchSize * 1)
*/
class DotProdLayer : public Layer {
public:
explicit DotProdLayer(const LayerConfig& config) : Layer(config) {}
~DotProdLayer() {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void forward(PassType passType) override;
void backward(const UpdateCallback& callback = nullptr) override;
};
REGISTER_LAYER(dot_prod, DotProdLayer);
bool DotProdLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
Layer::init(layerMap, parameterMap);
CHECK_EQ(inputLayers_.size(), 2U);
CHECK_EQ(1UL, getSize())
<< "The output dimensionality of this layer should be fixed to 1.";
return true;
}
void DotProdLayer::forward(PassType passType) {
Layer::forward(passType);
MatrixPtr inV0 = getInputValue(0);
MatrixPtr inV1 = getInputValue(1);
size_t batchSize = inV0->getHeight();
CHECK_EQ(inV1->getHeight(), batchSize);
CHECK_EQ(inV0->getWidth(), inV1->getWidth());
{
REGISTER_TIMER_INFO("FwResetTimer", getName().c_str());
reserveOutput(batchSize, 1);
}
MatrixPtr outV = getOutputValue();
{
REGISTER_TIMER_INFO("FwDotProdTimer", getName().c_str());
outV->sumOfProducts(*inV0, *inV1, 1, 0);
}
}
void DotProdLayer::backward(const UpdateCallback& callback) {
MatrixPtr inV0 = getInputValue(0);
MatrixPtr inV1 = getInputValue(1);
MatrixPtr outG = getOutputGrad();
MatrixPtr inG0 = getInputGrad(0);
MatrixPtr inG1 = getInputGrad(1);
{
REGISTER_TIMER_INFO("BwDotProdTimer", getName().c_str());
if (inG0) {
inG0->addRowScale(0, *inV1, *outG);
}
if (inG1) {
inG1->addRowScale(0, *inV0, *outG);
}
}
}
} // namespace paddle
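Per batch row b, DotProdLayer computes

\[
o_b = \sum_{i=1}^{D} x_{bi}\,y_{bi}, \qquad
\frac{\partial L}{\partial x_{bi}} = \frac{\partial L}{\partial o_b}\,y_{bi}, \qquad
\frac{\partial L}{\partial y_{bi}} = \frac{\partial L}{\partial o_b}\,x_{bi},
\]

which is exactly what sumOfProducts evaluates in forward and what addRowScale accumulates in backward: each row of the other input scaled by that row's scalar output gradient.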
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "L2DistanceLayer.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
namespace paddle {
REGISTER_LAYER(l2_distance, L2DistanceLayer);
bool L2DistanceLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
/* Initialize the basic parent class */
Layer::init(layerMap, parameterMap);
CHECK_EQ(inputLayers_.size(), 2UL) << "The L2DistanceLayer accepts two and "
<< "only two inputs.";
CHECK_EQ(getSize(), 1UL) << "The output dimensionality of L2DistanceLayer "
<< "is fixed to be 1.";
return true;
}
void L2DistanceLayer::forward(PassType passType) {
Layer::forward(passType);
const auto inV1 = getInputValue(0);
const auto inV2 = getInputValue(1);
CHECK(inV1 && inV2);
CHECK_EQ(inV1->getHeight(), inV2->getHeight())
<< "The height of two inputs of this layer must be the same.";
CHECK_EQ(inV1->getWidth(), inV2->getWidth())
<< "The width of two inputs of this layer must be the same.";
int batchSize = inV1->getHeight();
int output_dim = getSize();
{
REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str());
reserveOutput(batchSize, output_dim);
auto outV = getOutputValue();
CHECK(outV) << "The output matrix should not be null.";
Matrix::resizeOrCreate(
inputSub_, inV1->getHeight(), inV1->getWidth(), false, useGpu_);
inputSub_->assign(*inV1);
inputSub_->sub(*inV2);
outV->sumOfProducts(*inputSub_, *inputSub_, 1, 0);
outV->sqrt2(*outV);
}
}
void L2DistanceLayer::backward(const UpdateCallback& callback) {
const auto outG = getOutputGrad();
const auto outV = getOutputValue();
CHECK(outG && outV);
auto inGrad1 = getInputGrad(0);
auto inGrad2 = getInputGrad(1);
{
REGISTER_TIMER_INFO("L2DistanceBpAtvTimer", getName().c_str());
if (inGrad1 || inGrad2) {
outV->scalarDiv(*outV, 1.);
outV->dotMul(*outG, *outV);
}
if (inGrad1) inGrad1->addRowScale(0, *inputSub_, *outV);
if (inGrad2) {
inputSub_->mulScalar(-1.);
inGrad2->addRowScale(0, *inputSub_, *outV);
}
}
}
} // namespace paddle
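The backward pass follows from differentiating the distance. Writing f_b for row b of the output,

\[
f_b = \sqrt{\sum_{i=1}^{D} (x_{bi} - y_{bi})^2}, \qquad
\frac{\partial f_b}{\partial x_{bi}} = \frac{x_{bi} - y_{bi}}{f_b}, \qquad
\frac{\partial f_b}{\partial y_{bi}} = -\,\frac{x_{bi} - y_{bi}}{f_b},
\]

so the code first forms outG / f (scalarDiv for the reciprocal, dotMul by the output gradient), then scales the cached difference inputSub_ row-wise, negating it for the second input. Note the gradient is undefined where f_b = 0.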
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "Layer.h"
#include "paddle/math/Matrix.h"
namespace paddle {
/**
* @brief The layer calculates the l2 distance between two input vectors.
* \f[
* f(\bf{x}, \bf{y}) = \sqrt{\sum_{i=1}^D(x_i - y_i)^2}
* \f]
*
* - Input1: A vector (batchSize * dataDim)
* - Input2: A vector (batchSize * dataDim)
* - Output: A vector (batchSize * 1)
*
* The configuration api is: l2_distance_layer.
*/
class L2DistanceLayer : public Layer {
public:
explicit L2DistanceLayer(const LayerConfig& config) : Layer(config) {}
~L2DistanceLayer() {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void forward(PassType passType) override;
void backward(const UpdateCallback& callback = nullptr) override;
private:
// Store the result of subtracting Input2 from Input1 in forward computation,
// which will be reused in backward computation.
MatrixPtr inputSub_;
};
} // namespace paddle
@@ -38,12 +38,13 @@ bool MKLDNNAddtoLayer::init(const LayerMap& layerMap,
 }

 void MKLDNNAddtoLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
   CHECK_EQ(layerSize_, getSize()) << "this layer size can not be changed";
   reshapeInput(bs, ih, iw);
   ic = inputLayers_[0]->getSize() / ih / iw;
   CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
-  CHECK_EQ(inputElemenCnt_, (size_t)bs * ic * ih * iw);
+  CHECK_EQ(inputLayers_[0]->getOutputValue()->getElementCnt(),
+           (size_t)bs * ic * ih * iw);

   for (size_t i = 0; i < inputLayers_.size(); i++) {
     CHECK_EQ(int64_t(bs), inputLayers_[i]->getOutput().getBatchSize());
     CHECK_EQ(layerSize_, inputLayers_[i]->getSize());
@@ -57,47 +58,43 @@ void MKLDNNAddtoLayer::reshape(
 }

 void MKLDNNAddtoLayer::resetFwd(std::vector<primitive>& pipeline,
-                                MKLDNNMatrixPtr& in,
-                                MKLDNNMatrixPtr& wgt,
-                                MKLDNNMatrixPtr& bias,
+                                std::vector<MKLDNNMatrixPtr>& inputs,
                                 MKLDNNMatrixPtr& out) {
-  resetFwdBuffers(inVals_, bias, out);
-  in = inVals_[0];
+  resetFwdBuffers(inputs, biasVal_, out);

   std::shared_ptr<sum::primitive_desc> fwdPD;
   std::shared_ptr<sum::primitive_desc> biasPD;
-  resetFwdPD(fwdPD, biasPD, inVals_, bias, out);
+  resetFwdPD(fwdPD, biasPD, inputs, biasVal_, out);

-  resetFwdPipeline(pipeline, fwdPD, biasPD, inVals_, bias, out);
+  resetFwdPipeline(pipeline, fwdPD, biasPD, inputs, biasVal_, out);
 }

 void MKLDNNAddtoLayer::resetBwd(std::vector<primitive>& pipeline,
-                                MKLDNNMatrixPtr& in,
-                                MKLDNNMatrixPtr& wgt,
-                                MKLDNNMatrixPtr& bias,
+                                std::vector<MKLDNNMatrixPtr>& inputs,
                                 MKLDNNMatrixPtr& out) {
-  resetBwdBuffers(inGrads_, bias, out);
-  in = inGrads_[0];
+  resetBwdBuffers(inputs, biasGrad_, out);

   // backward only need share output grad to input grad
-  for (size_t i = 0; i < inGrads_.size(); i++) {
-    if (inGrads_[i] != nullptr) {
-      inGrads_[i] = out;
-      inputLayers_[i]->getOutputGrad()->setData(inGrads_[i]->getData());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    if (inputs[i] != nullptr) {
+      inputs[i] = out;
+      inputLayers_[i]->getOutputGrad()->setData(inputs[i]->getData());
     }
   }

   // backward bias
   bwdBias_ = nullptr;
-  if (bias) {
+  if (biasGrad_) {
     std::vector<float> scales(bs_, 1.0);
-    std::vector<memory::primitive_desc> srcPDs(bs_, bias->getPrimitiveDesc());
-    auto biasPD = sum::primitive_desc(bias->getMemoryDesc(), scales, srcPDs);
+    std::vector<memory::primitive_desc> srcPDs(bs_,
+                                               biasGrad_->getPrimitiveDesc());
+    auto biasPD =
+        sum::primitive_desc(biasGrad_->getMemoryDesc(), scales, srcPDs);
     std::vector<primitive::at> srcs;
     for (size_t i = 0; i < grads_.size(); ++i) {
      srcs.push_back(*(grads_[i]));
     }
-    bwdBias_.reset(new sum(biasPD, srcs, *bias));
+    bwdBias_.reset(new sum(biasPD, srcs, *biasGrad_));
     pipeline.push_back(*bwdBias_);
   }
 }
@@ -208,7 +205,7 @@ void MKLDNNAddtoLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
   inputs.resize(inputLayers_.size());
   for (size_t i = 0; i < inputs.size(); i++) {
-    resetInGrad(inputs[i], inVal_->getPrimitiveDesc(), i);
+    resetInGrad(inputs[i], inVals_[i]->getPrimitiveDesc(), i);
     CHECK_PRIMITIVE_DESC_EQ(inputs[i], out->getPrimitiveDesc());
   }
......
@@ -26,9 +26,6 @@ namespace paddle {
  */
 class MKLDNNAddtoLayer : public MKLDNNLayer {
 protected:
-  std::vector<MKLDNNMatrixPtr> inVals_;
-  std::vector<MKLDNNMatrixPtr> inGrads_;
-
   // layer size == ic * ih * iw == oc * oh *ow, and can not be changed
   size_t layerSize_;
@@ -50,52 +47,19 @@ public:
             const ParameterMap& parameterMap) override;

   void reshape(
-      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;

   void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;

   void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;

   void updateWeights(const UpdateCallback& callback) override;

-  void printValueFormat() override {
-    for (size_t i = 0; i < inVals_.size(); ++i) {
-      VLOG(MKLDNN_FMTS) << i << " input: " << inVals_[i]->getFormat() << " >>>";
-    }
-    if (outVal_) {
-      VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> ";
-    }
-    if (extOutVal_) {
-      VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
-    }
-  }
-
-  void printGradFormat() override {
-    if (extOutGrad_) {
-      VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
-    }
-    if (outGrad_) {
-      VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< ";
-    }
-    for (size_t i = 0; i < inGrads_.size(); ++i) {
-      VLOG(MKLDNN_FMTS) << i << " input: " << inGrads_[i]->getFormat() << "<<<";
-    }
-  }
-
 protected:
-  /**
-   * Forward functions: reset buffers(inputs, output, bias),
-   *                    reset primitive descriptor,
-   *                    reset pipeline.
-   */
   void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
@@ -110,17 +74,10 @@ protected:
                   std::vector<MKLDNNMatrixPtr>& inputs,
                   MKLDNNMatrixPtr& bias,
                   MKLDNNMatrixPtr& out);
-  /**
-   * Backward functions: reset buffers(inputs, output, bias)
-   */
   void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
-  /**
-   * prepare for bias
-   */
   void prepareBias(MKLDNNMatrixPtr& bias,
                    const MatrixPtr& biasMat,
                    const MKLDNNMatrixPtr& out,
......
@@ -21,8 +21,6 @@ namespace paddle {
 REGISTER_LAYER(mkldnn_batch_norm, MKLDNNBatchNormLayer);

-const real MKLDNNBatchNormLayer::EPS = 1E-5;
-
 bool MKLDNNBatchNormLayer::init(const LayerMap& layerMap,
                                 const ParameterMap& parameterMap) {
   if (!MKLDNNLayer::init(layerMap, parameterMap)) {
@@ -50,6 +48,8 @@ bool MKLDNNBatchNormLayer::init(const LayerMap& layerMap,
     useGlobalStats_ = config_.use_global_stats();
   }
   movingAvgFraction_ = config_.moving_average_fraction();
+  epsilon_ = config_.epsilon();
+
   VLOG(MKLDNN_BASE) << "--- " << (useGlobalStats_ ? "use" : "do not use")
                     << " --- global stats";
   VLOG(MKLDNN_BASE) << "Moving average fraction: " << movingAvgFraction_;
@@ -116,21 +116,20 @@ void MKLDNNBatchNormLayer::calMovingMeanAndVar() {
 }

 void MKLDNNBatchNormLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
   reshapeInput(bs, ih, iw);
   oh = ih;
   ow = iw;
   // ic_ and oc can not be changed
-  CHECK_EQ(inputElemenCnt_ / bs / ih / iw, (size_t)ic)
+  CHECK_EQ((size_t)ic,
+           inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
       << "Input channel can not be changed";
   reshapeOutput(oh, ow);
   resizeOutput(bs, oc * oh * ow);
 }
 void MKLDNNBatchNormLayer::resetFwd(std::vector<primitive>& pipeline,
-                                    MKLDNNMatrixPtr& in,
-                                    MKLDNNMatrixPtr& wgt,
-                                    MKLDNNMatrixPtr& bias,
+                                    std::vector<MKLDNNMatrixPtr>& inputs,
                                     MKLDNNMatrixPtr& out) {
   // In training phase, it will always calculate mean and var,
   // so useGlobalStats must be false.
@@ -140,25 +139,23 @@ void MKLDNNBatchNormLayer::resetFwd(std::vector<primitive>& pipeline,
     useGlobalStats_ = false;
   }

-  resetFwdBuffers(in, wgt, out);
+  resetFwdBuffers(inputs[0], wgtVal_, out);

-  resetFwdPD(fwdPD_, in, wgt, out);
+  resetFwdPD(fwdPD_, inputs[0], wgtVal_, out);

-  resetFwdPipeline(pipeline, fwdPD_, in, wgt, out);
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, out);
 }

 void MKLDNNBatchNormLayer::resetBwd(std::vector<primitive>& pipeline,
-                                    MKLDNNMatrixPtr& in,
-                                    MKLDNNMatrixPtr& wgt,
-                                    MKLDNNMatrixPtr& bias,
+                                    std::vector<MKLDNNMatrixPtr>& inputs,
                                     MKLDNNMatrixPtr& out) {
   std::shared_ptr<bn_bwd::primitive_desc> pd;

-  resetBwdBuffers(in, wgt, out);
+  resetBwdBuffers(inputs[0], wgtGrad_, out);

-  resetBwdPD(pd, in, wgt, out);
+  resetBwdPD(pd, inputs[0], wgtGrad_, out);

-  resetBwdPipeline(pipeline, pd, in, wgt, out);
+  resetBwdPipeline(pipeline, pd, inputs[0], wgtGrad_, out);
 }
 void MKLDNNBatchNormLayer::forward(PassType passType) {
@@ -213,7 +210,7 @@ void MKLDNNBatchNormLayer::resetFwdPD(
   if (wgt) {
     flags_ = (flags_ | batch_normalization_flag::use_scale_shift);
   }
-  auto fwdDesc = bn_fwd::desc(pk, in->getMemoryDesc(), EPS, flags_);
+  auto fwdDesc = bn_fwd::desc(pk, in->getMemoryDesc(), epsilon_, flags_);
   pd.reset(new bn_fwd::primitive_desc(fwdDesc, engine_));
   CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
   if (wgt) {
@@ -260,9 +257,9 @@ void MKLDNNBatchNormLayer::resetFwdPipeline(
 void MKLDNNBatchNormLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
                                            MKLDNNMatrixPtr& wgt,
                                            MKLDNNMatrixPtr& out) {
-  CHECK(inVal_ && outVal_);
+  CHECK(inVals_[0] && outVal_);
   resetOutGrad(out, outVal_->getPrimitiveDesc());
-  resetInGrad(in, inVal_->getPrimitiveDesc());
+  resetInGrad(in, inVals_[0]->getPrimitiveDesc());
   if (gradScaleShift_) {
     CHECK(wgtVal_);
     resetWithMatrix(wgt, gradScaleShift_, wgtVal_->getPrimitiveDesc());
@@ -280,7 +277,7 @@ void MKLDNNBatchNormLayer::resetBwdPD(
   }
   CHECK_PRIMITIVE_DESC_EQ(out, in->getPrimitiveDesc());
   auto md = in->getMemoryDesc();
-  auto bwdDesc = bn_bwd::desc(prop_kind::backward, md, md, EPS, flags_);
+  auto bwdDesc = bn_bwd::desc(prop_kind::backward, md, md, epsilon_, flags_);
   pd.reset(new bn_bwd::primitive_desc(bwdDesc, engine_, *fwdPD_));
   CHECK(pd->weights_primitive_desc() == fwdPD_->weights_primitive_desc());
   CHECK_PRIMITIVE_DESC_EQ(wgt, pd->diff_weights_primitive_desc());
@@ -297,11 +294,12 @@ void MKLDNNBatchNormLayer::resetBwdPipeline(
   if (pd == nullptr) {
     return;
   }
-  CHECK(inVal_);
+  CHECK(inVals_[0]);
   bwdData_.reset(
       wgt && wgtVal_
-          ? new bn_bwd(*pd, *inVal_, *mean_, *var_, *out, *wgtVal_, *in, *wgt)
-          : new bn_bwd(*pd, *inVal_, *mean_, *var_, *out, *in));
+          ? new bn_bwd(
+                *pd, *inVals_[0], *mean_, *var_, *out, *wgtVal_, *in, *wgt)
+          : new bn_bwd(*pd, *inVals_[0], *mean_, *var_, *out, *in));
   pipeline.push_back(*bwdData_);
 }
......
@@ -32,7 +32,8 @@ protected:
   std::shared_ptr<bn_fwd::primitive_desc> fwdPD_;

   // Epsilon value used in the batch normalization formula.
-  static const real EPS;
+  real epsilon_;
+
   // weight and bias in paddle
   std::unique_ptr<Weight> weight_;
   std::unique_ptr<Weight> biases_;
@@ -73,18 +74,14 @@ public:
   void forward(PassType passType) override;

   void reshape(
-      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;

   void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;

   void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;

   void updateWeights(const UpdateCallback& callback) override;
@@ -98,11 +95,7 @@ protected:
    *   moving = moving * AvgFraction + local * (1 - AvgFraction)
    */
   void calMovingMeanAndVar();
-  /**
-   * Forward functions: reset buffers(input, weight, output),
-   *                    reset primitive descriptor,
-   *                    reset pipeline.
-   */
   void resetFwdBuffers(MKLDNNMatrixPtr& in,
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& out);
@@ -115,12 +108,6 @@ protected:
                         MKLDNNMatrixPtr& in,
                         MKLDNNMatrixPtr& wgt,
                         MKLDNNMatrixPtr& out);
-  /**
-   * Backward functions: reset buffers(input, weight, output),
-   *                     reset primitive descriptor,
-   *                     reset pipeline.
-   */
   void resetBwdBuffers(MKLDNNMatrixPtr& in,
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& out);
......
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "MKLDNNConcatLayer.h"
using namespace mkldnn; // NOLINT
typedef memory::format format;
namespace paddle {
REGISTER_LAYER(mkldnn_concat, MKLDNNConcatLayer);
bool MKLDNNConcatLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
if (!MKLDNNLayer::init(layerMap, parameterMap)) {
return false;
}
CHECK_GT(inputLayers_.size(), 1UL);
CHECK(!biasParameter_);
return true;
}
void MKLDNNConcatLayer::reshape(
int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
reshapeInput(bs, ih, iw);
ic = inputLayers_[0]->getSize() / ih / iw;
CHECK_EQ((size_t)ic * ih * iw, inputLayers_[0]->getSize());
CHECK_EQ(inputLayers_[0]->getOutputValue()->getElementCnt(),
(size_t)bs * ic * ih * iw);
CHECK_GT(inputLayers_.size(), 1UL);
channels_.resize(inputLayers_.size());
channels_[0] = ic;
oc = ic;
for (size_t i = 1; i < inputLayers_.size(); i++) {
int batchsize, height, width;
reshapeInput(batchsize, height, width, i);
CHECK_EQ(bs, batchsize);
CHECK_EQ(ih, height);
CHECK_EQ(iw, width);
channels_[i] = inputLayers_[i]->getSize() / height / width;
CHECK_EQ((size_t)channels_[i] * height * width, inputLayers_[i]->getSize());
oc += channels_[i];
}
oh = ih;
ow = iw;
reshapeOutput(oh, ow);
resizeOutput(bs, oc * oh * ow);
}
void MKLDNNConcatLayer::resetFwd(std::vector<primitive>& pipeline,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
resetFwdBuffers(inputs, out);
std::shared_ptr<concat::primitive_desc> fwdPD;
resetFwdPD(fwdPD, inputs, out);
resetFwdPipeline(pipeline, fwdPD, inputs, out);
}
void MKLDNNConcatLayer::resetBwd(std::vector<primitive>& pipeline,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
resetBwdBuffers(inputs, out);
resetBwdPipeline(pipeline, bwds_, inputs, out);
}
void MKLDNNConcatLayer::resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
inputs.resize(inputLayers_.size());
bool has8c = false, has16c = false, hasnc = false;
for (size_t i = 0; i < inputs.size(); i++) {
resetInValue(inputs[i], nullptr, i, channels_[i]);
CHECK(inputs[i]);
auto dm = inputs[i]->getDims();
// input formats can differ, but their ndims must be equal
CHECK(i == 0 || dm.size() == inputs[0]->getDims().size());
CHECK_EQ(bs_, dm[0]);
CHECK_EQ(channels_[i], dm[1]);
if (dm.size() > 2) {
CHECK_EQ(ih_, dm[2]);
CHECK_EQ(iw_, dm[3]);
}
if (inputs[i]->getFormat() == format::nc) {
hasnc = true;
}
if (inputs[i]->getFormat() == format::nChw8c) {
has8c = true;
}
if (inputs[i]->getFormat() == format::nChw16c) {
has16c = true;
}
}
format outFmt;
if (has16c && oc_ % 16 == 0) {
outFmt = format::nChw16c;
} else if (has8c && oc_ % 8 == 0) {
outFmt = format::nChw8c;
} else if (hasnc) {
CHECK(oh_ == 1 && ow_ == 1);
outFmt = format::nc;
} else {
outFmt = format::nchw;
}
memory::dims outDims =
hasnc ? memory::dims{bs_, oc_} : memory::dims{bs_, oc_, oh_, ow_};
auto outPD = MKLDNNMatrix::createPrimitiveDesc(outDims, outFmt, engine_);
resetOutValue(out, outPD);
}
void MKLDNNConcatLayer::resetFwdPD(std::shared_ptr<concat::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr out) {
std::vector<memory::primitive_desc> srcPDs;
for (size_t i = 0; i < inputs.size(); i++) {
srcPDs.push_back(inputs[i]->getPrimitiveDesc());
}
CHECK(out);
pd.reset(new concat::primitive_desc(out->getMemoryDesc(), axis_, srcPDs));
CHECK_PRIMITIVE_DESC_EQ(out, pd->dst_primitive_desc());
}
void MKLDNNConcatLayer::resetFwdPipeline(
std::vector<primitive>& pipeline,
std::shared_ptr<concat::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
std::vector<primitive::at> srcs;
for (size_t i = 0; i < inputs.size(); i++) {
srcs.push_back(*(inputs[i]));
}
fwd_.reset(new concat(*pd, srcs, *out));
pipeline.push_back(*fwd_);
}
void MKLDNNConcatLayer::resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
CHECK(outVal_);
resetOutGrad(out, outVal_->getPrimitiveDesc());
CHECK(out);
inputs.resize(inputLayers_.size());
for (size_t i = 0; i < inputs.size(); i++) {
CHECK(inVals_[i]);
resetInGrad(inputs[i], inVals_[i]->getPrimitiveDesc(), i);
CHECK_PRIMITIVE_DESC_EQ(inputs[i], inVals_[i]->getPrimitiveDesc());
}
}
void MKLDNNConcatLayer::resetBwdPipeline(
std::vector<mkldnn::primitive>& pipeline,
std::vector<std::shared_ptr<mkldnn::primitive>>& prims,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) {
// reset the backward primitives
memory::dims offsets = {0, 0, 0, 0};
prims.resize(inputs.size());
CHECK_EQ(inputs.size(), channels_.size());
for (size_t i = 0; i < inputs.size(); i++) {
auto viewPD = view::primitive_desc(
out->getPrimitiveDesc(), inputs[i]->getDims(), offsets);
auto bwdPD = reorder::primitive_desc(viewPD.dst_primitive_desc(),
inputs[i]->getPrimitiveDesc());
prims[i].reset(new reorder(bwdPD, *out, *(inputs[i])));
offsets[axis_] += channels_[i];
// push to pipeline
pipeline.push_back(*prims[i]);
}
}
} // namespace paddle
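A worked example of the backward slicing in resetBwdPipeline above, assuming two inputs of 8 and 16 channels concatenated along axis_ == 1 into a 24-channel output: the first reorder reads the sub-tensor out[:, 0:8, :, :] through a view with offsets {0, 0, 0, 0} and writes it to input 0's gradient; offsets[1] then advances by channels_[0] to {0, 8, 0, 0}, so the second reorder copies out[:, 8:24, :, :] into input 1's gradient.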
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "MKLDNNLayer.h"
#include "mkldnn.hpp"
namespace paddle {
/**
* @brief A subclass of MKLDNNLayer that implements the concatenation layer.
*
* The config file api is mkldnn_concat
*/
class MKLDNNConcatLayer : public MKLDNNLayer {
protected:
std::vector<std::shared_ptr<mkldnn::primitive>> bwds_;
// input channel numbers
std::vector<int> channels_;
// concat_dimension in MKLDNN
// if axis_ == 0, concat batchsize
// if axis_ == 1, concat channel (default)
int axis_;
public:
explicit MKLDNNConcatLayer(const LayerConfig& config)
: MKLDNNLayer(config), axis_(1) {}
~MKLDNNConcatLayer() {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void reshape(
int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
void resetFwd(std::vector<mkldnn::primitive>& pipeline,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) override;
void resetBwd(std::vector<mkldnn::primitive>& pipeline,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out) override;
void printSizeInfo() override {
CHECK_EQ(channels_.size(), inputLayers_.size());
for (size_t i = 0; i < channels_.size(); ++i) {
VLOG(MKLDNN_SIZES) << "Input " << i << ", " << inputLayers_[i]->getName()
<< ": " << bs_ << ", " << channels_[i] << ", " << ih_
<< ", " << iw_;
}
VLOG(MKLDNN_SIZES) << "Output: " << bs_ << ", " << oc_ << ", " << oh_
<< ", " << ow_;
}
size_t keepCondition() {
// reset when the total element size of all inputs changed
size_t totalSize = inputLayers_[0]->getOutputValue()->getElementCnt();
for (size_t i = 1; i < inputLayers_.size(); ++i) {
totalSize += inputLayers_[i]->getOutputValue()->getElementCnt();
}
return totalSize;
}
protected:
void resetFwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
void resetFwdPD(std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr out);
void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
std::shared_ptr<mkldnn::concat::primitive_desc>& pd,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
void resetBwdBuffers(std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
std::vector<std::shared_ptr<mkldnn::primitive>>& prims,
std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& out);
};
} // namespace paddle
@@ -90,7 +90,7 @@ void MKLDNNConvLayer::convertWeightsToPaddle() {
 }

 void MKLDNNConvLayer::reshape(
-    int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) {
+    int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
   reshapeInput(bs, ih, iw);

   // cal output sizes
@@ -105,21 +105,17 @@ void MKLDNNConvLayer::reshape(
 }

 void MKLDNNConvLayer::resetFwd(std::vector<primitive>& pipeline,
-                               MKLDNNMatrixPtr& in,
-                               MKLDNNMatrixPtr& wgt,
-                               MKLDNNMatrixPtr& bias,
+                               std::vector<MKLDNNMatrixPtr>& inputs,
                                MKLDNNMatrixPtr& out) {
   resetFwdPD(fwdPD_);

-  resetFwdBuffers(fwdPD_, in, wgt, bias, out);
+  resetFwdBuffers(fwdPD_, inputs[0], wgtVal_, biasVal_, out);

-  resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out);
+  resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, biasVal_, out);
 }

 void MKLDNNConvLayer::resetBwd(std::vector<primitive>& pipeline,
-                               MKLDNNMatrixPtr& in,
-                               MKLDNNMatrixPtr& wgt,
-                               MKLDNNMatrixPtr& bias,
+                               std::vector<MKLDNNMatrixPtr>& inputs,
                                MKLDNNMatrixPtr& out) {
   std::shared_ptr<conv_bwdWgt::primitive_desc> bwdWgtPD;
   std::shared_ptr<conv_bwdData::primitive_desc> bwdDataPD;
@@ -128,9 +124,10 @@ void MKLDNNConvLayer::resetBwd(std::vector<primitive>& pipeline,
   resetBwdDataPD(bwdDataPD);

-  resetBwdBuffers(bwdWgtPD, bwdDataPD, in, wgt, bias, out);
+  resetBwdBuffers(bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);

-  resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out);
+  resetBwdPipeline(
+      pipeline, bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
 }

 void MKLDNNConvLayer::updateWeights(const UpdateCallback& callback) {
@@ -236,14 +233,14 @@ void MKLDNNConvLayer::resetBwdWgtPD(
   loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);

   // create backward weight using input, output and weight value memory desc
-  CHECK(inVal_) << "Should have internal input value";
+  CHECK(inVals_[0]) << "Should have internal input value";
   CHECK(outVal_) << "Should have internal output value";
   CHECK(wgtVal_) << "Should have weight value";
   algorithm algo = algorithm::convolution_direct;
   padding_kind padKind = padding_kind::zero;
   auto bwdWgtDesc = biasVal_ != nullptr
                         ? conv_bwdWgt::desc(algo,
-                                            inVal_->getMemoryDesc(),
+                                            inVals_[0]->getMemoryDesc(),
                                             wgtVal_->getMemoryDesc(),
                                             biasVal_->getMemoryDesc(),
                                             outVal_->getMemoryDesc(),
@@ -252,7 +249,7 @@ void MKLDNNConvLayer::resetBwdWgtPD(
                                             padR,
                                             padKind)
                         : conv_bwdWgt::desc(algo,
-                                            inVal_->getMemoryDesc(),
+                                            inVals_[0]->getMemoryDesc(),
                                             wgtVal_->getMemoryDesc(),
                                             outVal_->getMemoryDesc(),
                                             strides,
@@ -260,7 +257,7 @@ void MKLDNNConvLayer::resetBwdWgtPD(
                                             padR,
                                             padKind);
   pd.reset(new conv_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_));
-  CHECK_PRIMITIVE_DESC_EQ(inVal_, pd->src_primitive_desc());
+  CHECK_PRIMITIVE_DESC_EQ(inVals_[0], pd->src_primitive_desc());
   CHECK_PRIMITIVE_DESC_EQ(
       outVal_,
       pd->diff_dst_primitive_desc(),
@@ -280,12 +277,12 @@ void MKLDNNConvLayer::resetBwdDataPD(
   memory::dims wgtDims, biasDims, strides, dilations, padL, padR;
   loadConvSettings(wgtDims, biasDims, strides, dilations, padL, padR);

-  CHECK(inVal_) << "Should have internal input value";
+  CHECK(inVals_[0]) << "Should have internal input value";
   CHECK(outVal_) << "Should have internal output value";
   // create backward data using input and output value memory desc
   // but using weight memory desc with any format
   auto bwdDataDesc = conv_bwdData::desc(algorithm::convolution_direct,
-                                        inVal_->getMemoryDesc(),
+                                        inVals_[0]->getMemoryDesc(),
                                         MKLDNNMatrix::createMemoryDesc(wgtDims),
                                         outVal_->getMemoryDesc(),
                                         strides,
@@ -294,7 +291,7 @@ void MKLDNNConvLayer::resetBwdDataPD(
                                         padding_kind::zero);
   pd.reset(new conv_bwdData::primitive_desc(bwdDataDesc, engine_, *fwdPD_));
   CHECK_PRIMITIVE_DESC_EQ(
-      inVal_,
+      inVals_[0],
       pd->diff_src_primitive_desc(),
       "primitive desc of in value and grad should be equal");
   CHECK_PRIMITIVE_DESC_EQ(
@@ -346,12 +343,12 @@ void MKLDNNConvLayer::resetBwdPipeline(
     MKLDNNMatrixPtr& wgt,
     MKLDNNMatrixPtr& bias,
     MKLDNNMatrixPtr& out) {
-  CHECK(inVal_);
+  CHECK(inVals_[0]);

   // add bwdWgt handle
   if (bias) {
-    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVal_, *out, *wgt, *bias));
+    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt, *bias));
   } else {
-    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVal_, *out, *wgt));
+    bwdWgt_.reset(new conv_bwdWgt(*wgtPD, *inVals_[0], *out, *wgt));
   }
   pipeline.push_back(*bwdWgt_);
......
@@ -69,18 +69,14 @@ public:
             const ParameterMap& parameterMap) override;

   void reshape(
-      int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override;
+      int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;

   void resetFwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;

   void resetBwd(std::vector<mkldnn::primitive>& pipeline,
-                MKLDNNMatrixPtr& in,
-                MKLDNNMatrixPtr& wgt,
-                MKLDNNMatrixPtr& bias,
+                std::vector<MKLDNNMatrixPtr>& inputs,
                 MKLDNNMatrixPtr& out) override;

   void updateWeights(const UpdateCallback& callback) override;
@@ -107,48 +103,26 @@ protected:
                         mkldnn::memory::dims& padL,
                         mkldnn::memory::dims& padR);

-  /**
-   * reset the forward primitive descriptor.
-   */
   void resetFwdPD(std::shared_ptr<conv_fwd::primitive_desc>& pd);
-  /**
-   * reset the MKLDNNMatrix buffers used in forward.
-   */
   void resetFwdBuffers(std::shared_ptr<conv_fwd::primitive_desc>& pd,
                        MKLDNNMatrixPtr& in,
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
-  /**
-   * reset the forward pipeline.
-   */
   void resetFwdPipeline(std::vector<mkldnn::primitive>& pipeline,
                         std::shared_ptr<conv_fwd::primitive_desc>& pd,
                         MKLDNNMatrixPtr& in,
                         MKLDNNMatrixPtr& wgt,
                         MKLDNNMatrixPtr& bias,
                         MKLDNNMatrixPtr& out);
-  /**
-   * reset the backward weight primitive descriptor.
-   */
   void resetBwdWgtPD(std::shared_ptr<conv_bwdWgt::primitive_desc>& pd);
-  /**
-   * reset the backward data primitive descriptor.
-   */
   void resetBwdDataPD(std::shared_ptr<conv_bwdData::primitive_desc>& pd);
-  /**
-   * reset the MKLDNNMatrix buffers used in backward.
-   */
   void resetBwdBuffers(std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
                        std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
                        MKLDNNMatrixPtr& in,
                        MKLDNNMatrixPtr& wgt,
                        MKLDNNMatrixPtr& bias,
                        MKLDNNMatrixPtr& out);
-  /**
-   * reset the backward pipeline.
-   */
   void resetBwdPipeline(std::vector<mkldnn::primitive>& pipeline,
                         std::shared_ptr<conv_bwdWgt::primitive_desc>& wgtPD,
                         std::shared_ptr<conv_bwdData::primitive_desc>& dataPD,
......
...@@ -74,7 +74,7 @@ void MKLDNNFcLayer::convertWeightsToPaddle() { ...@@ -74,7 +74,7 @@ void MKLDNNFcLayer::convertWeightsToPaddle() {
} }
void MKLDNNFcLayer::reshape( void MKLDNNFcLayer::reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) { int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
reshapeInput(bs, ih, iw); reshapeInput(bs, ih, iw);
CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize()); CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize());
...@@ -87,32 +87,29 @@ void MKLDNNFcLayer::reshape( ...@@ -87,32 +87,29 @@ void MKLDNNFcLayer::reshape(
} }
void MKLDNNFcLayer::resetFwd(std::vector<primitive>& pipeline, void MKLDNNFcLayer::resetFwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in, std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) { MKLDNNMatrixPtr& out) {
resetFwdBuffers(in, wgt, bias, out); resetFwdBuffers(inputs[0], wgtVal_, biasVal_, out);
resetFwdPD(fwdPD_, in, wgt, bias, out); resetFwdPD(fwdPD_, inputs[0], wgtVal_, biasVal_, out);
resetFwdPipeline(pipeline, fwdPD_, in, wgt, bias, out); resetFwdPipeline(pipeline, fwdPD_, inputs[0], wgtVal_, biasVal_, out);
} }
void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline, void MKLDNNFcLayer::resetBwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in, std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) { MKLDNNMatrixPtr& out) {
std::shared_ptr<fc_bwdWgt::primitive_desc> bwdWgtPD; std::shared_ptr<fc_bwdWgt::primitive_desc> bwdWgtPD;
std::shared_ptr<fc_bwdData::primitive_desc> bwdDataPD; std::shared_ptr<fc_bwdData::primitive_desc> bwdDataPD;
resetBwdBuffers(in, wgt, bias, out); resetBwdBuffers(inputs[0], wgtGrad_, biasGrad_, out);
resetBwdWgtPD(bwdWgtPD, wgt, bias, out); resetBwdWgtPD(bwdWgtPD, wgtGrad_, biasGrad_, out);
resetBwdDataPD(bwdDataPD, in, out); resetBwdDataPD(bwdDataPD, inputs[0], out);
resetBwdPipeline(pipeline, bwdWgtPD, bwdDataPD, in, wgt, bias, out); resetBwdPipeline(
pipeline, bwdWgtPD, bwdDataPD, inputs[0], wgtGrad_, biasGrad_, out);
} }
void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) { void MKLDNNFcLayer::updateWeights(const UpdateCallback& callback) {
...@@ -193,9 +190,9 @@ void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in, ...@@ -193,9 +190,9 @@ void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) { MKLDNNMatrixPtr& out) {
CHECK(inVal_ && outVal_); CHECK(inVals_[0] && outVal_);
resetOutGrad(out, outVal_->getPrimitiveDesc()); resetOutGrad(out, outVal_->getPrimitiveDesc());
resetInGrad(in, inVal_->getPrimitiveDesc()); resetInGrad(in, inVals_[0]->getPrimitiveDesc());
CHECK(wgtVal_); CHECK(wgtVal_);
resetWithMatrix(wgt, weight_->getWGrad(), wgtVal_->getPrimitiveDesc()); resetWithMatrix(wgt, weight_->getWGrad(), wgtVal_->getPrimitiveDesc());
...@@ -212,14 +209,15 @@ void MKLDNNFcLayer::resetBwdWgtPD( ...@@ -212,14 +209,15 @@ void MKLDNNFcLayer::resetBwdWgtPD(
MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) { MKLDNNMatrixPtr& out) {
CHECK(inVal_); CHECK(inVals_[0]);
fc_bwdWgt::desc bwdWgtDesc = bias ? fc_bwdWgt::desc(inVal_->getMemoryDesc(), fc_bwdWgt::desc bwdWgtDesc =
wgt->getMemoryDesc(), bias ? fc_bwdWgt::desc(inVals_[0]->getMemoryDesc(),
bias->getMemoryDesc(), wgt->getMemoryDesc(),
out->getMemoryDesc()) bias->getMemoryDesc(),
: fc_bwdWgt::desc(inVal_->getMemoryDesc(), out->getMemoryDesc())
wgt->getMemoryDesc(), : fc_bwdWgt::desc(inVals_[0]->getMemoryDesc(),
out->getMemoryDesc()); wgt->getMemoryDesc(),
out->getMemoryDesc());
pd.reset(new fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_)); pd.reset(new fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, *fwdPD_));
} }
...@@ -245,11 +243,11 @@ void MKLDNNFcLayer::resetBwdPipeline( ...@@ -245,11 +243,11 @@ void MKLDNNFcLayer::resetBwdPipeline(
MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) { MKLDNNMatrixPtr& out) {
CHECK(inVal_); CHECK(inVals_[0]);
if (bias) { if (bias) {
bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVal_, *out, *wgt, *bias)); bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVals_[0], *out, *wgt, *bias));
} else { } else {
bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVal_, *out, *wgt)); bwdWgt_.reset(new fc_bwdWgt(*bwdWgtPD, *inVals_[0], *out, *wgt));
} }
pipeline.push_back(*bwdWgt_); pipeline.push_back(*bwdWgt_);
......
...@@ -52,18 +52,14 @@ public: ...@@ -52,18 +52,14 @@ public:
const ParameterMap& parameterMap) override; const ParameterMap& parameterMap) override;
void reshape( void reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override; int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
void resetFwd(std::vector<mkldnn::primitive>& pipeline, void resetFwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in, std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override; MKLDNNMatrixPtr& out) override;
void resetBwd(std::vector<mkldnn::primitive>& pipeline, void resetBwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in, std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override; MKLDNNMatrixPtr& out) override;
void updateWeights(const UpdateCallback& callback) override; void updateWeights(const UpdateCallback& callback) override;
...@@ -73,11 +69,6 @@ public: ...@@ -73,11 +69,6 @@ public:
void convertWeightsToPaddle() override; void convertWeightsToPaddle() override;
protected: protected:
/**
* Forward functions: reset buffers(input, output, weight and bias),
* reset primitive descriptor,
* reset pipeline.
*/
void resetFwdBuffers(MKLDNNMatrixPtr& in, void resetFwdBuffers(MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& bias,
...@@ -93,13 +84,6 @@ protected: ...@@ -93,13 +84,6 @@ protected:
MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out); MKLDNNMatrixPtr& out);
/**
* Backward functions: reset buffers(input, output, weight and bias),
* reset primitive descriptor for backward weight,
* reset primitive descriptor for backward data,
* reset pipeline.
*/
void resetBwdBuffers(MKLDNNMatrixPtr& in, void resetBwdBuffers(MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& wgt, MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias, MKLDNNMatrixPtr& bias,
......
...@@ -21,8 +21,8 @@ namespace paddle { ...@@ -21,8 +21,8 @@ namespace paddle {
bool MKLDNNLayer::init(const LayerMap& layerMap, bool MKLDNNLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) { const ParameterMap& parameterMap) {
CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn." CHECK(FLAGS_use_mkldnn) << "MKLDNNLayers only support use_mkldnn."
<< "Please set WITH_MKLDNN=ON " << "Please set WITH_MKL=ON "
<< "and set use_mkldnn=True"; << "and set use_mkldnn=True";
CHECK(!useGpu_) << "Do not support GPU yet"; CHECK(!useGpu_) << "Do not support GPU yet";
...@@ -48,31 +48,20 @@ void MKLDNNLayer::forward(PassType passType) { ...@@ -48,31 +48,20 @@ void MKLDNNLayer::forward(PassType passType) {
REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str()); REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
CHECK(!inputLayers_.empty()); CHECK(!inputLayers_.empty());
copySeqInfoToOutputs(); copySeqInfoToOutputs();
size_t elemenCnt = inputLayers_[0]->getOutputValue()->getElementCnt(); if (condition_ != keepCondition()) {
if (inputElemenCnt_ != elemenCnt) {
VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward"; VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
// reset when input total sizes changed, not only the batchsize condition_ = keepCondition();
inputElemenCnt_ = elemenCnt;
pipelineFwd_.clear();
reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_); reshape(bs_, ic_, ih_, iw_, oc_, oh_, ow_);
// all cpu device output grad or value share output's printSizeInfo();
// the output_.value and output_.grad are shared with CPU device
shareCPUDevice(); shareCPUDevice();
resetFwd(pipelineFwd_, inVal_, wgtVal_, biasVal_, outVal_); pipelineFwd_.clear();
// MKLDNNLayer output value should be MKLDNNMatrix inVals_.resize(inputLayers_.size(), nullptr);
// so external output value is necessary. extInVals_.resize(inputLayers_.size(), nullptr);
// Then external input value is not necessary, cvtInVals_.resize(inputLayers_.size(), nullptr);
// since input may be mkldnn internal buffer. resetFwd(pipelineFwd_, inVals_, outVal_);
CHECK(extOutVal_) << "external output value is necessary"; prepareValueConversions(pipelineFwd_);
output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
CHECK(inVal_ && outVal_) << "internal memories are necessary";
if (cvtInVal_) {
pipelineFwd_.insert(pipelineFwd_.begin(), *cvtInVal_);
}
if (cvtOutVal_) {
pipelineFwd_.push_back(*cvtOutVal_);
}
convertWeightsFromPaddle(); convertWeightsFromPaddle();
printSizeInfo();
printValueFormat(); printValueFormat();
needResetBwd_ = true; needResetBwd_ = true;
} }
...@@ -80,8 +69,8 @@ void MKLDNNLayer::forward(PassType passType) { ...@@ -80,8 +69,8 @@ void MKLDNNLayer::forward(PassType passType) {
if (inputLayers_[0]->getType() == "data" && inputLayers_.size() == 1) { if (inputLayers_[0]->getType() == "data" && inputLayers_.size() == 1) {
// Update input value data when input layer is "data" type, // Update input value data when input layer is "data" type,
// since the input value data address might be changed. // since the input value data address might be changed.
CHECK(extInVal_); CHECK(extInVals_[0]);
extInVal_->setData(getInputValue(0, CPU_DEVICE)->getData()); extInVals_[0]->setData(getInputValue(0, CPU_DEVICE)->getData());
} }
if (!outputOnlyMKLDNN_) { if (!outputOnlyMKLDNN_) {
...@@ -99,22 +88,13 @@ void MKLDNNLayer::backward(const UpdateCallback& callback) { ...@@ -99,22 +88,13 @@ void MKLDNNLayer::backward(const UpdateCallback& callback) {
if (needResetBwd_) { if (needResetBwd_) {
VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward"; VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
pipelineBwd_.clear(); pipelineBwd_.clear();
inGrads_.resize(inputLayers_.size(), nullptr);
extInGrads_.resize(inputLayers_.size(), nullptr);
cvtInGrads_.resize(inputLayers_.size(), nullptr);
pipelineMergeGrad_.clear(); pipelineMergeGrad_.clear();
mergeGrad_ = nullptr; mergeGrad_ = nullptr;
resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_); resetBwd(pipelineBwd_, inGrads_, outGrad_);
// external output grad is not necessary prepareGradConversions(pipelineBwd_);
// since output may be mkldnn internal buffer or merge them directly.
CHECK(outGrad_) << "internal output grad is necessary";
if (extOutGrad_) {
CHECK_EQ(extOutGrad_->getData(), output_.grad->getData())
<< "the external buffer should share the same data with output_.grad";
}
if (cvtOutGrad_) {
pipelineBwd_.insert(pipelineBwd_.begin(), *cvtOutGrad_);
}
if (cvtInGrad_) {
pipelineBwd_.push_back(*cvtInGrad_);
}
printGradFormat(); printGradFormat();
needResetBwd_ = false; needResetBwd_ = false;
} }
...@@ -138,8 +118,11 @@ void MKLDNNLayer::backward(const UpdateCallback& callback) { ...@@ -138,8 +118,11 @@ void MKLDNNLayer::backward(const UpdateCallback& callback) {
} }
} }
void MKLDNNLayer::reshapeInput(int& batchsize, int& height, int& width) { void MKLDNNLayer::reshapeInput(int& batchsize,
const Argument& input = inputLayers_[0]->getOutput(); int& height,
int& width,
size_t idx) {
const Argument& input = inputLayers_[idx]->getOutput();
batchsize = input.getBatchSize(); batchsize = input.getBatchSize();
int h = input.getFrameHeight(); int h = input.getFrameHeight();
int w = input.getFrameWidth(); int w = input.getFrameWidth();
...@@ -173,27 +156,30 @@ void MKLDNNLayer::resetWithMatrix(MKLDNNMatrixPtr& dnn, ...@@ -173,27 +156,30 @@ void MKLDNNLayer::resetWithMatrix(MKLDNNMatrixPtr& dnn,
void MKLDNNLayer::resetInValue( void MKLDNNLayer::resetInValue(
MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& in,
const std::shared_ptr<memory::primitive_desc>& intPD, const std::shared_ptr<memory::primitive_desc>& intPD,
size_t inputIdx) { size_t idx,
cvtInVal_ = nullptr; int inputChannel) {
extInVal_ = nullptr; cvtInVals_[idx] = nullptr;
extInVals_[idx] = nullptr;
in = nullptr; in = nullptr;
CHECK_GT(bs_ * ic_ * ih_ * iw_, 0); inputChannel = inputChannel == 0 ? ic_ : inputChannel;
CHECK_GT(bs_ * inputChannel * ih_ * iw_, 0);
auto extPD = MKLDNNMatrix::createPrimitiveDesc( auto extPD = MKLDNNMatrix::createPrimitiveDesc(
{bs_, ic_, ih_, iw_}, format::nchw, engine_); {bs_, inputChannel, ih_, iw_}, format::nchw, engine_);
const MatrixPtr& inMat = inputLayers_[inputIdx]->getOutputValue(); const MatrixPtr& inMat = inputLayers_[idx]->getOutputValue();
extInVal_ = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat); extInVals_[idx] = std::dynamic_pointer_cast<MKLDNNMatrix>(inMat);
CHECK_EQ(inputIsOnlyMKLDNN(), extInVal_ != nullptr); CHECK_EQ(inputIsOnlyMKLDNN(), extInVals_[idx] != nullptr);
if (extInVal_ == nullptr || extInVal_->getFormat() == format::nc) { if (extInVals_[idx] == nullptr ||
extInVal_ = MKLDNNMatrix::create(extPD, inMat); extInVals_[idx]->getFormat() == format::nc) {
extInVals_[idx] = MKLDNNMatrix::create(extPD, inMat);
} }
in = extInVal_; in = extInVals_[idx];
if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) { if (nullptr == intPD || in->getPrimitiveDesc() == *intPD) {
return; return;
} }
// need create reorder // need create reorder
in = MKLDNNMatrix::create(*intPD); in = MKLDNNMatrix::create(*intPD);
cvtInVal_ = MKLDNNMatrix::createReorder(extInVal_, in); cvtInVals_[idx] = MKLDNNMatrix::createReorder(extInVals_[idx], in);
CHECK(cvtInVal_) << "should not be emptry"; CHECK(cvtInVals_[idx]) << "should not be emptry";
} }
void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out, void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out,
...@@ -215,11 +201,11 @@ void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out, ...@@ -215,11 +201,11 @@ void MKLDNNLayer::resetOutValue(MKLDNNMatrixPtr& out,
void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in, void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
memory::primitive_desc intPD, memory::primitive_desc intPD,
size_t inputIdx) { size_t idx) {
cvtInGrad_ = nullptr; cvtInGrads_[idx] = nullptr;
extInGrad_ = nullptr; extInGrads_[idx] = nullptr;
in = nullptr; in = nullptr;
LayerPtr& input = inputLayers_[inputIdx]; LayerPtr& input = inputLayers_[idx];
if (input->getOutputGrad() == nullptr) { if (input->getOutputGrad() == nullptr) {
// no need to compute the input grad // no need to compute the input grad
return; return;
...@@ -234,23 +220,25 @@ void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in, ...@@ -234,23 +220,25 @@ void MKLDNNLayer::resetInGrad(MKLDNNMatrixPtr& in,
in = MKLDNNMatrix::create(intPD, inMat); in = MKLDNNMatrix::create(intPD, inMat);
Argument& arg = input->getOutput(this->getName()); Argument& arg = input->getOutput(this->getName());
arg.grad = std::dynamic_pointer_cast<Matrix>(in); arg.grad = std::dynamic_pointer_cast<Matrix>(in);
CHECK_PRIMITIVE_DESC_EQ(inVal_, intPD); CHECK_PRIMITIVE_DESC_EQ(inVals_[idx], intPD);
if (inputIsOnlyMKLDNN()) { if (inputIsOnlyMKLDNN()) {
return; return;
} }
extInGrad_ = in; extInGrads_[idx] = in;
if (isPaddleFormat(extInGrad_->getFormat())) { if (isPaddleFormat(extInGrads_[idx]->getFormat())) {
return; return;
} }
// need create reorder // need create reorder
CHECK(extInVal_ != nullptr && isPaddleFormat(extInVal_->getFormat())) CHECK(extInVals_[idx] != nullptr &&
isPaddleFormat(extInVals_[idx]->getFormat()))
<< "should have external input value and the format must be nchw(nc)"; << "should have external input value and the format must be nchw(nc)";
extInGrad_ = MKLDNNMatrix::create(extInVal_->getPrimitiveDesc(), inMat); extInGrads_[idx] =
CHECK_PRIMITIVE_DESC_EQ(inVal_, intPD); MKLDNNMatrix::create(extInVals_[idx]->getPrimitiveDesc(), inMat);
CHECK_PRIMITIVE_DESC_EQ(inVals_[idx], intPD);
in = MKLDNNMatrix::create(intPD); in = MKLDNNMatrix::create(intPD);
cvtInGrad_ = MKLDNNMatrix::createReorder(in, extInGrad_); cvtInGrads_[idx] = MKLDNNMatrix::createReorder(in, extInGrads_[idx]);
CHECK(cvtInGrad_); CHECK(cvtInGrads_[idx]);
} }
void MKLDNNLayer::resetOutGrad(MKLDNNMatrixPtr& out, void MKLDNNLayer::resetOutGrad(MKLDNNMatrixPtr& out,
...@@ -306,22 +294,8 @@ void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) { ...@@ -306,22 +294,8 @@ void MKLDNNLayer::resetMergeGrad(MKLDNNMatrixPtr& out) {
srcs.push_back(*src); srcs.push_back(*src);
} }
// TODO(TJ): remove me when mkldnn sum support different formats auto sumPD = sum::primitive_desc(out->getMemoryDesc(), scales, srcPDs);
for (size_t i = 1; i < srcPDs.size(); ++i) { mergeGrad_.reset(new sum(sumPD, srcs, *out));
CHECK(srcPDs[0] == srcPDs[i]);
}
tmpOutGrad_ = out;
tmpCvt_ = nullptr;
if (out->getPrimitiveDesc() != srcPDs[0]) {
tmpOutGrad_ = MKLDNNMatrix::create(srcPDs[0]);
tmpCvt_ = MKLDNNMatrix::createReorder(tmpOutGrad_, out);
CHECK(tmpCvt_);
pipelineMergeGrad_.push_back(*tmpCvt_);
}
auto sumPD =
sum::primitive_desc(tmpOutGrad_->getMemoryDesc(), scales, srcPDs);
mergeGrad_.reset(new sum(sumPD, srcs, *tmpOutGrad_));
pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_); pipelineMergeGrad_.insert(pipelineMergeGrad_.begin(), *mergeGrad_);
} }
......
...@@ -34,15 +34,16 @@ typedef std::shared_ptr<MKLDNNLayer> MKLDNNLayerPtr; ...@@ -34,15 +34,16 @@ typedef std::shared_ptr<MKLDNNLayer> MKLDNNLayerPtr;
*/ */
class MKLDNNLayer : public Layer { class MKLDNNLayer : public Layer {
protected: protected:
// input value element count
size_t inputElemenCnt_;
// batch size // batch size
int bs_; int bs_;
// their sizes are always from the first input layer
// input image channel, height and width // input image channel, height and width
int ic_, ih_, iw_; int ic_, ih_, iw_;
// output image channel, height and width // output image channel, height and width
int oc_, oh_, ow_; int oc_, oh_, ow_;
// the condition that forward need be reset
size_t condition_;
// backward also needs to be reset after the forward handle is reset // backward also needs to be reset after the forward handle is reset
bool needResetBwd_; bool needResetBwd_;
...@@ -67,18 +68,18 @@ protected: ...@@ -67,18 +68,18 @@ protected:
* When all layers are mkldnn layers, they could save internal data. * When all layers are mkldnn layers, they could save internal data.
*/ */
// below MKLDNNMatrix buffers are all internal buffers // below MKLDNNMatrix buffers are all internal buffers
MKLDNNMatrixPtr inVal_; std::vector<MKLDNNMatrixPtr> inVals_;
MKLDNNMatrixPtr inGrad_; std::vector<MKLDNNMatrixPtr> inGrads_;
MKLDNNMatrixPtr outVal_; MKLDNNMatrixPtr outVal_;
MKLDNNMatrixPtr outGrad_; MKLDNNMatrixPtr outGrad_;
// below are external value and grad // below are external value and grad
MKLDNNMatrixPtr extInVal_; std::vector<MKLDNNMatrixPtr> extInVals_;
MKLDNNMatrixPtr extInGrad_; std::vector<MKLDNNMatrixPtr> extInGrads_;
MKLDNNMatrixPtr extOutVal_; MKLDNNMatrixPtr extOutVal_;
MKLDNNMatrixPtr extOutGrad_; MKLDNNMatrixPtr extOutGrad_;
// convert handle between external and internal buffers // convert handle between external and internal buffers
std::shared_ptr<mkldnn::reorder> cvtInVal_; std::vector<std::shared_ptr<mkldnn::reorder>> cvtInVals_;
std::shared_ptr<mkldnn::reorder> cvtInGrad_; std::vector<std::shared_ptr<mkldnn::reorder>> cvtInGrads_;
std::shared_ptr<mkldnn::reorder> cvtOutVal_; std::shared_ptr<mkldnn::reorder> cvtOutVal_;
std::shared_ptr<mkldnn::reorder> cvtOutGrad_; std::shared_ptr<mkldnn::reorder> cvtOutGrad_;
...@@ -93,23 +94,11 @@ protected: ...@@ -93,23 +94,11 @@ protected:
std::vector<mkldnn::primitive> pipelineMergeGrad_; std::vector<mkldnn::primitive> pipelineMergeGrad_;
// tmp input argument to save input grad, only used to merge grad // tmp input argument to save input grad, only used to merge grad
Argument tmpInArg_; Argument tmpInArg_;
// since mkldnn sum do not support different formats:
// can refer to https://github.com/01org/mkl-dnn/issues/134
// so need create reorder manually and save tmp MKLDNNMatrix
MKLDNNMatrixPtr tmpOutGrad_;
std::shared_ptr<mkldnn::primitive> tmpCvt_;
public: public:
explicit MKLDNNLayer(const LayerConfig& config) explicit MKLDNNLayer(const LayerConfig& config)
: Layer(config), : Layer(config),
inputElemenCnt_(0), condition_(0),
bs_(0),
ic_(0),
ih_(0),
iw_(0),
oc_(0),
oh_(0),
ow_(0),
needResetBwd_(true), needResetBwd_(true),
outputOnlyMKLDNN_(false), outputOnlyMKLDNN_(false),
engine_(mkldnn::engine::cpu, 0), engine_(mkldnn::engine::cpu, 0),
...@@ -125,31 +114,28 @@ public: ...@@ -125,31 +114,28 @@ public:
virtual void backward(const UpdateCallback& callback); virtual void backward(const UpdateCallback& callback);
/** /**
* reshape the input image sizes * reshape the input and output channels and image sizes
* and reset output image and buffer size * and reset output buffer size
* output channel can not be changed
*/ */
virtual void reshape( virtual void reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) = 0; int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) = 0;
/** /**
* reset the mkldnn forward primitive and memories * reset the mkldnn forward primitive and memories
* would only be called when the input size changes * would only be called when the input size changes
* weight and bias buffers should be covered by the child class itself
*/ */
virtual void resetFwd(std::vector<mkldnn::primitive>& pipeline, virtual void resetFwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in, std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) = 0; MKLDNNMatrixPtr& out) = 0;
/** /**
* reset the mkldnn backward primitive and memories * reset the mkldnn backward primitive and memories
* would only be called when needed * would only be called when needed
* weight and bias buffers should be covered by the child class itself
*/ */
virtual void resetBwd(std::vector<mkldnn::primitive>& pipeline, virtual void resetBwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in, std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) = 0; MKLDNNMatrixPtr& out) = 0;
/** /**
...@@ -175,10 +161,19 @@ public: ...@@ -175,10 +161,19 @@ public:
void addOutputArgument(int deviceId) { Layer::addOutputArgument(deviceId); } void addOutputArgument(int deviceId) { Layer::addOutputArgument(deviceId); }
protected: protected:
/**
* Some layers may have different conditions for resetting the forward pass.
* The function returns the condition value; forward is kept (not reset) as long as this value stays unchanged.
*/
inline virtual size_t keepCondition() {
// reset when the element count of the first input changes, not only the batch size
return inputLayers_[0]->getOutputValue()->getElementCnt();
}
/** /**
* reshape the input image sizes and input batchsize * reshape the input image sizes and input batchsize
*/ */
void reshapeInput(int& batchsize, int& height, int& width); void reshapeInput(int& batchsize, int& height, int& width, size_t idx = 0);
/** /**
* reshape output image sizes * reshape output image sizes
...@@ -196,11 +191,13 @@ protected: ...@@ -196,11 +191,13 @@ protected:
/** /**
* reset input value from input MKLDNNMatrix and internal primitive desc. * reset input value from input MKLDNNMatrix and internal primitive desc.
* reset both internal and external buffer and create reorder if necessary. * reset both internal and external buffer and create reorder if necessary.
* the input channel count may differ between inputs, e.g. in concat.
*/ */
void resetInValue( void resetInValue(
MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& in,
const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr, const std::shared_ptr<mkldnn::memory::primitive_desc>& intPD = nullptr,
size_t inputIdx = 0); size_t idx = 0,
int inputChannel = 0);
/** /**
* reset output value from internal primitive desc. * reset output value from internal primitive desc.
...@@ -215,7 +212,7 @@ protected: ...@@ -215,7 +212,7 @@ protected:
*/ */
void resetInGrad(MKLDNNMatrixPtr& in, void resetInGrad(MKLDNNMatrixPtr& in,
mkldnn::memory::primitive_desc intPD, mkldnn::memory::primitive_desc intPD,
size_t inputIdx = 0); size_t idx = 0);
/** /**
* reset output grad from internal primitive desc. * reset output grad from internal primitive desc.
...@@ -293,17 +290,19 @@ protected: ...@@ -293,17 +290,19 @@ protected:
* print the mkldnn memory format of value * print the mkldnn memory format of value
*/ */
virtual void printValueFormat() { virtual void printValueFormat() {
if (extInVal_) { for (size_t i = 0; i < inVals_.size(); ++i) {
VLOG(MKLDNN_FMTS) << extInVal_->getFormat() << " >>> "; if (!inVals_[i]) {
} continue;
if (inVal_) { }
VLOG(MKLDNN_FMTS) << inVal_->getFormat() << " >>>"; VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
<< ": " << (extInVals_[i] ? extInVals_[i]->getFormat()
: inVals_[i]->getFormat())
<< " >>> " << inVals_[i]->getFormat() << " >>>";
} }
if (outVal_) { if (outVal_) {
VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> "; VLOG(MKLDNN_FMTS) << outVal_->getFormat() << " >>> "
} << (extOutVal_ ? extOutVal_->getFormat()
if (extOutVal_) { : outVal_->getFormat());
VLOG(MKLDNN_FMTS) << extOutVal_->getFormat();
} }
if (wgtVal_) { if (wgtVal_) {
VLOG(MKLDNN_FMTS) << "Weight value format: " << wgtVal_->getFormat(); VLOG(MKLDNN_FMTS) << "Weight value format: " << wgtVal_->getFormat();
...@@ -317,17 +316,19 @@ protected: ...@@ -317,17 +316,19 @@ protected:
* print the mkldnn memory format of grad * print the mkldnn memory format of grad
*/ */
virtual void printGradFormat() { virtual void printGradFormat() {
if (extOutGrad_) {
VLOG(MKLDNN_FMTS) << extOutGrad_->getFormat();
}
if (outGrad_) { if (outGrad_) {
VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< "; VLOG(MKLDNN_FMTS) << outGrad_->getFormat() << " <<< "
<< (extOutGrad_ ? extOutGrad_->getFormat()
: outGrad_->getFormat());
} }
if (inGrad_) { for (size_t i = 0; i < inGrads_.size(); ++i) {
VLOG(MKLDNN_FMTS) << inGrad_->getFormat() << " <<<"; if (!inGrads_[i]) {
} continue;
if (extInGrad_) { }
VLOG(MKLDNN_FMTS) << extInGrad_->getFormat() << " <<< "; VLOG(MKLDNN_FMTS) << "Input " << i << ", " << inputLayers_[i]->getName()
<< ": " << (extInGrads_[i] ? extInGrads_[i]->getFormat()
: inGrads_[i]->getFormat())
<< " <<< " << inGrads_[i]->getFormat() << " <<<";
} }
if (wgtGrad_) { if (wgtGrad_) {
VLOG(MKLDNN_FMTS) << "Weight grad format: " << wgtGrad_->getFormat(); VLOG(MKLDNN_FMTS) << "Weight grad format: " << wgtGrad_->getFormat();
...@@ -434,6 +435,41 @@ private: ...@@ -434,6 +435,41 @@ private:
outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims; outputOtherDevice_[i].cpuSequenceDims = output_.cpuSequenceDims;
} }
} }
void prepareValueConversions(std::vector<mkldnn::primitive>& pipeline) {
// MKLDNNLayer output value should be MKLDNNMatrix
// so external output value is necessary.
// Then external input value is not necessary,
// since input may be mkldnn internal buffer.
CHECK(extOutVal_) << "external output value is necessary";
output_.value = std::dynamic_pointer_cast<Matrix>(extOutVal_);
CHECK(inVals_[0] && outVal_) << "internal memories are necessary";
for (size_t i = 0; i < cvtInVals_.size(); ++i) {
if (cvtInVals_[i]) {
pipeline.insert(pipeline.begin(), *cvtInVals_[i]);
}
}
if (cvtOutVal_) {
pipeline.push_back(*cvtOutVal_);
}
}
void prepareGradConversions(std::vector<mkldnn::primitive>& pipeline) {
// external output grad is not necessary
// since output may be mkldnn internal buffer or merge them directly.
CHECK(outGrad_) << "internal output grad is necessary";
if (extOutGrad_) {
CHECK_EQ(extOutGrad_->getData(), output_.grad->getData())
<< "the external buffer should share the same data with output_.grad";
}
if (cvtOutGrad_) {
pipeline.insert(pipeline.begin(), *cvtOutGrad_);
}
for (size_t i = 0; i < cvtInGrads_.size(); ++i) {
if (cvtInGrads_[i]) {
pipeline.push_back(*cvtInGrads_[i]);
}
}
}
}; };
} // namespace paddle } // namespace paddle
...@@ -58,10 +58,11 @@ bool MKLDNNPoolLayer::init(const LayerMap& layerMap, ...@@ -58,10 +58,11 @@ bool MKLDNNPoolLayer::init(const LayerMap& layerMap,
} }
void MKLDNNPoolLayer::reshape( void MKLDNNPoolLayer::reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) { int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) {
reshapeInput(bs, ih, iw); reshapeInput(bs, ih, iw);
// ic_ and oc can not be changed // ic_ and oc can not be changed
CHECK_EQ(inputElemenCnt_ / bs / ih / iw, (size_t)ic) CHECK_EQ((size_t)ic,
inputLayers_[0]->getOutputValue()->getElementCnt() / bs / ih / iw)
<< "Input channel can not be changed"; << "Input channel can not be changed";
// cal output sizes // cal output sizes
...@@ -74,29 +75,25 @@ void MKLDNNPoolLayer::reshape( ...@@ -74,29 +75,25 @@ void MKLDNNPoolLayer::reshape(
} }
void MKLDNNPoolLayer::resetFwd(std::vector<primitive>& pipeline, void MKLDNNPoolLayer::resetFwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in, std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) { MKLDNNMatrixPtr& out) {
resetFwdBuffers(in, out); resetFwdBuffers(inputs[0], out);
resetFwdPD(fwdPD_, in, out); resetFwdPD(fwdPD_, inputs[0], out);
resetFwdPipeline(pipeline, fwdPD_, in, out); resetFwdPipeline(pipeline, fwdPD_, inputs[0], out);
} }
void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline, void MKLDNNPoolLayer::resetBwd(std::vector<primitive>& pipeline,
MKLDNNMatrixPtr& in, std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) { MKLDNNMatrixPtr& out) {
std::shared_ptr<pool_bwd::primitive_desc> pd; std::shared_ptr<pool_bwd::primitive_desc> pd;
resetBwdBuffers(in, out); resetBwdBuffers(inputs[0], out);
resetBwdPD(pd, in, out); resetBwdPD(pd, inputs[0], out);
resetBwdPipeline(pipeline, pd, in, out); resetBwdPipeline(pipeline, pd, inputs[0], out);
} }
void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in, void MKLDNNPoolLayer::resetFwdBuffers(MKLDNNMatrixPtr& in,
...@@ -151,9 +148,9 @@ void MKLDNNPoolLayer::resetFwdPipeline( ...@@ -151,9 +148,9 @@ void MKLDNNPoolLayer::resetFwdPipeline(
void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in, void MKLDNNPoolLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out) { MKLDNNMatrixPtr& out) {
CHECK(inVal_ && outVal_); CHECK(inVals_[0] && outVal_);
resetOutGrad(out, outVal_->getPrimitiveDesc()); resetOutGrad(out, outVal_->getPrimitiveDesc());
resetInGrad(in, inVal_->getPrimitiveDesc()); resetInGrad(in, inVals_[0]->getPrimitiveDesc());
} }
void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd, void MKLDNNPoolLayer::resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
......
...@@ -53,18 +53,14 @@ public: ...@@ -53,18 +53,14 @@ public:
const ParameterMap& parameterMap) override; const ParameterMap& parameterMap) override;
void reshape( void reshape(
int& bs, int& ic, int& ih, int& iw, int oc, int& oh, int& ow) override; int& bs, int& ic, int& ih, int& iw, int& oc, int& oh, int& ow) override;
void resetFwd(std::vector<mkldnn::primitive>& pipeline, void resetFwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in, std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override; MKLDNNMatrixPtr& out) override;
void resetBwd(std::vector<mkldnn::primitive>& pipeline, void resetBwd(std::vector<mkldnn::primitive>& pipeline,
MKLDNNMatrixPtr& in, std::vector<MKLDNNMatrixPtr>& inputs,
MKLDNNMatrixPtr& wgt,
MKLDNNMatrixPtr& bias,
MKLDNNMatrixPtr& out) override; MKLDNNMatrixPtr& out) override;
void printSizeInfo() override { void printSizeInfo() override {
...@@ -75,11 +71,6 @@ public: ...@@ -75,11 +71,6 @@ public:
} }
protected: protected:
/**
* Forward functions: reset buffers(input, output),
* reset primitive descriptor,
* reset pipeline.
*/
void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out); void resetFwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
void resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd, void resetFwdPD(std::shared_ptr<pool_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr in, MKLDNNMatrixPtr in,
...@@ -88,12 +79,6 @@ protected: ...@@ -88,12 +79,6 @@ protected:
std::shared_ptr<pool_fwd::primitive_desc>& pd, std::shared_ptr<pool_fwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& in,
MKLDNNMatrixPtr& out); MKLDNNMatrixPtr& out);
/**
* Backward functions: reset buffers(input, output),
* reset primitive descriptor,
* reset pipeline.
*/
void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out); void resetBwdBuffers(MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& out);
void resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd, void resetBwdPD(std::shared_ptr<pool_bwd::primitive_desc>& pd,
MKLDNNMatrixPtr& in, MKLDNNMatrixPtr& in,
......
...@@ -100,8 +100,9 @@ void ROIPoolLayer::forward(PassType passType) { ...@@ -100,8 +100,9 @@ void ROIPoolLayer::forward(PassType passType) {
size_t roiEndH = round(bottomROIs[4] * spatialScale_); size_t roiEndH = round(bottomROIs[4] * spatialScale_);
CHECK_GE(roiBatchIdx, 0UL); CHECK_GE(roiBatchIdx, 0UL);
CHECK_LT(roiBatchIdx, batchSize); CHECK_LT(roiBatchIdx, batchSize);
size_t roiHeight = std::max(roiEndH - roiStartH + 1, 1UL); size_t roiHeight =
size_t roiWidth = std::max(roiEndW - roiStartW + 1, 1UL); std::max(roiEndH - roiStartH + 1, static_cast<size_t>(1));
size_t roiWidth = std::max(roiEndW - roiStartW + 1, static_cast<size_t>(1));
real binSizeH = real binSizeH =
static_cast<real>(roiHeight) / static_cast<real>(pooledHeight_); static_cast<real>(roiHeight) / static_cast<real>(pooledHeight_);
real binSizeW = real binSizeW =
...@@ -114,10 +115,14 @@ void ROIPoolLayer::forward(PassType passType) { ...@@ -114,10 +115,14 @@ void ROIPoolLayer::forward(PassType passType) {
size_t wstart = static_cast<size_t>(std::floor(pw * binSizeW)); size_t wstart = static_cast<size_t>(std::floor(pw * binSizeW));
size_t hend = static_cast<size_t>(std::ceil((ph + 1) * binSizeH)); size_t hend = static_cast<size_t>(std::ceil((ph + 1) * binSizeH));
size_t wend = static_cast<size_t>(std::ceil((pw + 1) * binSizeW)); size_t wend = static_cast<size_t>(std::ceil((pw + 1) * binSizeW));
hstart = std::min(std::max(hstart + roiStartH, 0UL), height_); hstart = std::min(
wstart = std::min(std::max(wstart + roiStartW, 0UL), width_); std::max(hstart + roiStartH, static_cast<size_t>(0)), height_);
hend = std::min(std::max(hend + roiStartH, 0UL), height_); wstart = std::min(
wend = std::min(std::max(wend + roiStartW, 0UL), width_); std::max(wstart + roiStartW, static_cast<size_t>(0)), width_);
hend = std::min(std::max(hend + roiStartH, static_cast<size_t>(0)),
height_);
wend = std::min(std::max(wend + roiStartW, static_cast<size_t>(0)),
width_);
bool isEmpty = (hend <= hstart) || (wend <= wstart); bool isEmpty = (hend <= hstart) || (wend <= wstart);
size_t poolIndex = ph * pooledWidth_ + pw; size_t poolIndex = ph * pooledWidth_ + pw;
......
...@@ -29,7 +29,7 @@ gserver_test(test_KmaxSeqScore) ...@@ -29,7 +29,7 @@ gserver_test(test_KmaxSeqScore)
gserver_test(test_Expand) gserver_test(test_Expand)
gserver_test(test_MaxPoolingWithMaskOutput) gserver_test(test_MaxPoolingWithMaskOutput)
########## test_Mkldnn layers and activations ########## ########## test_MKLDNN layers and activations ##########
if(WITH_MKLDNN) if(WITH_MKLDNN)
add_unittest_without_exec(test_MKLDNN add_unittest_without_exec(test_MKLDNN
test_MKLDNN.cpp test_MKLDNN.cpp
...@@ -62,17 +62,6 @@ if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE) ...@@ -62,17 +62,6 @@ if(NOT WITH_DOUBLE AND NOT MOBILE_INFERENCE)
endif() endif()
if(NOT MOBILE_INFERENCE) if(NOT MOBILE_INFERENCE)
################### test_ProtoDataProvider ############
add_unittest_without_exec(test_ProtoDataProvider
test_ProtoDataProvider.cpp)
# test_ProtoDataProvider will mkdir as same name,
# so if WORKING_DIRECTORY is default directory, then
# mkdir will get error.
add_test(NAME test_ProtoDataProvider
COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoDataProvider
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
################## test_Evaluator ####################### ################## test_Evaluator #######################
add_unittest(test_Evaluator add_unittest(test_Evaluator
test_Evaluator.cpp) test_Evaluator.cpp)
...@@ -110,3 +99,24 @@ add_test(NAME test_PyDataProvider2 ...@@ -110,3 +99,24 @@ add_test(NAME test_PyDataProvider2
COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2 COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
) )
################# test_CompareSparse ##################
add_unittest_without_exec(test_CompareSparse
test_CompareSparse.cpp)
if(NOT ON_TRAVIS)
add_test(NAME test_CompareSparse
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
./.set_port.sh -p port -n 6
${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
endif()
################ test_CompareTwoNets ######################
add_unittest_without_exec(test_CompareTwoNets
test_CompareTwoNets.cpp)
add_test(NAME test_CompareTwoNets
COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
...@@ -23,7 +23,7 @@ limitations under the License. */ ...@@ -23,7 +23,7 @@ limitations under the License. */
namespace paddle { namespace paddle {
/** /**
* @brief test the functionality of Mkldnnlayers * @brief test the functionality of MKLDNNLayers and MKLDNNActivations
* refer to paddle original function * refer to paddle original function
*/ */
class MKLDNNTester { class MKLDNNTester {
......
./test_ProtoDataProvider/data1.bin
./test_ProtoDataProvider/data2.bin
./test_ProtoDataProvider/data1.bin.gz
./test_ProtoDataProvider/data2.bin.gz
#!/usr/bin/env python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
...@@ -14,27 +15,50 @@ ...@@ -14,27 +15,50 @@
from paddle.trainer_config_helpers import * from paddle.trainer_config_helpers import *
################################### Data Configuration ################################### ######################## data source ################################
TrainData(ProtoData(files = "trainer/tests/mnist.list")) dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
################################### Algorithm Configuration ################################### dict_file = dict()
settings(batch_size = 1000, for line_count, line in enumerate(open(dict_path, "r")):
learning_method = MomentumOptimizer(momentum=0.5, sparse=False)) dict_file[line.strip()] = line_count
################################### Network Configuration ###################################
data = data_layer(name ="input", size=784)
fc1 = fc_layer(input=data, size=800, define_py_data_sources2(
bias_attr=True, train_list='gserver/tests/Sequence/train.list',
act=SigmoidActivation()) test_list=None,
module='sequenceGen',
obj='process',
args={"dict_file": dict_file})
fc2 = fc_layer(input=fc1, size=800, settings(batch_size=5)
bias_attr=True, ######################## network configure ################################
act=SigmoidActivation()) dict_dim = len(open(dict_path, 'r').readlines())
word_dim = 128
hidden_dim = 256
label_dim = 3
sparse_update = get_config_arg("sparse_update", bool, False)
output = fc_layer(input=[fc1, fc2], size=10, data = data_layer(name="word", size=dict_dim)
bias_attr=True,
act=SoftmaxActivation())
lbl = data_layer(name ="label", size=1) emb = embedding_layer(
input=data,
size=word_dim,
param_attr=ParamAttr(sparse_update=sparse_update))
cost = classification_cost(input=output, label=lbl) with mixed_layer(size=hidden_dim * 4) as lstm_input:
outputs(cost) lstm_input += full_matrix_projection(input=emb)
lstm = lstmemory(
input=lstm_input,
act=TanhActivation(),
gate_act=SigmoidActivation(),
state_act=TanhActivation())
lstm_last = last_seq(input=lstm)
with mixed_layer(
size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
output += full_matrix_projection(input=lstm_last)
outputs(
classification_cost(
input=output, label=data_layer(
name="label", size=1)))
#!/usr/bin/env python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved # Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
...@@ -14,27 +15,42 @@ ...@@ -14,27 +15,42 @@
from paddle.trainer_config_helpers import * from paddle.trainer_config_helpers import *
################################### Data Configuration ################################### ######################## data source ################################
TrainData(ProtoData(files = "trainer/tests/mnist.list")) dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
################################### Algorithm Configuration ################################### dict_file = dict()
settings(batch_size = 1000, for line_count, line in enumerate(open(dict_path, "r")):
learning_method = MomentumOptimizer(momentum=0.5, sparse=False)) dict_file[line.strip()] = line_count
################################### Network Configuration ###################################
data = data_layer(name ="input", size=784)
fc1 = fc_layer(input=data, size=800, define_py_data_sources2(
bias_attr=True, train_list='gserver/tests/Sequence/train.list',
act=SigmoidActivation()) test_list=None,
module='sequenceGen',
obj='process',
args={"dict_file": dict_file})
fc2 = fc_layer(input=fc1, size=800, settings(batch_size=5)
bias_attr=True, ######################## network configure ################################
act=SigmoidActivation()) dict_dim = len(open(dict_path, 'r').readlines())
word_dim = 128
hidden_dim = 128
label_dim = 3
output = fc_layer(input=[fc1, fc2], size=10, # This config is designed to be equivalent with sequence_recurrent_group.py
bias_attr=True,
act=SoftmaxActivation())
lbl = data_layer(name ="label", size=1) data = data_layer(name="word", size=dict_dim)
cost = classification_cost(input=output, label=lbl) emb = embedding_layer(
outputs(cost) input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
recurrent = recurrent_layer(input=emb, bias_attr=False, act=SoftmaxActivation())
recurrent_last = last_seq(input=recurrent)
with mixed_layer(
size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
output += full_matrix_projection(input=recurrent_last)
outputs(
classification_cost(
input=output, label=data_layer(
name="label", size=1)))
#!/usr/bin/env python
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.trainer_config_helpers import *
######################## data source ################################
dict_path = 'gserver/tests/Sequence/tour_dict_phrase.dict'
dict_file = dict()
for line_count, line in enumerate(open(dict_path, "r")):
dict_file[line.strip()] = line_count
define_py_data_sources2(
train_list='gserver/tests/Sequence/train.list',
test_list=None,
module='sequenceGen',
obj='process',
args={"dict_file": dict_file})
settings(batch_size=5)
######################## network configure ################################
dict_dim = len(open(dict_path, 'r').readlines())
word_dim = 128
hidden_dim = 128
label_dim = 3
# This config is designed to be equivalent with sequence_recurrent.py
data = data_layer(name="word", size=dict_dim)
emb = embedding_layer(
input=data, size=word_dim, param_attr=ParamAttr(name="emb"))
def step(y):
mem = memory(name="rnn_state", size=hidden_dim)
with mixed_layer(
name="rnn_state",
size=hidden_dim,
bias_attr=False,
act=SoftmaxActivation()) as out:
out += identity_projection(input=y)
out += full_matrix_projection(
input=mem, param_attr=ParamAttr(name="___recurrent_layer_0__"))
return out
recurrent = recurrent_group(name="rnn", step=step, input=emb)
recurrent_last = last_seq(input=recurrent)
with mixed_layer(
size=label_dim, act=SoftmaxActivation(), bias_attr=True) as output:
output += full_matrix_projection(input=recurrent_last)
outputs(
classification_cost(
input=output, label=data_layer(
name="label", size=1)))
...@@ -22,8 +22,7 @@ limitations under the License. */ ...@@ -22,8 +22,7 @@ limitations under the License. */
using namespace paddle; // NOLINT using namespace paddle; // NOLINT
using namespace std; // NOLINT using namespace std; // NOLINT
static const string& configFile1 = static const string& configFile1 = "gserver/tests/sequence_lstm.conf";
"trainer/tests/sample_trainer_config_compare_sparse.conf";
DECLARE_bool(use_gpu); DECLARE_bool(use_gpu);
DECLARE_string(config); DECLARE_string(config);
......
...@@ -30,8 +30,6 @@ DECLARE_bool(use_gpu); ...@@ -30,8 +30,6 @@ DECLARE_bool(use_gpu);
DECLARE_string(config); DECLARE_string(config);
DECLARE_string(nics); DECLARE_string(nics);
DEFINE_string(config_file_a, "", "config of one network to compare");
DEFINE_string(config_file_b, "", "config of another network to compare");
DEFINE_bool(need_high_accuracy, DEFINE_bool(need_high_accuracy,
false, false,
"whether need to run in double accuracy"); "whether need to run in double accuracy");
...@@ -42,6 +40,10 @@ DEFINE_double( ...@@ -42,6 +40,10 @@ DEFINE_double(
DECLARE_bool(thread_local_rand_use_global_seed); DECLARE_bool(thread_local_rand_use_global_seed);
DECLARE_int32(seed); DECLARE_int32(seed);
static const string& config_file_a = "gserver/tests/sequence_recurrent.py";
static const string& config_file_b =
"gserver/tests/sequence_recurrent_group.py";
struct ComData { struct ComData {
vector<Argument> outArgs; vector<Argument> outArgs;
vector<ParameterPtr> parameters; vector<ParameterPtr> parameters;
...@@ -66,6 +68,7 @@ void calcGradient(ComData& data, const string configFile) { ...@@ -66,6 +68,7 @@ void calcGradient(ComData& data, const string configFile) {
DataBatch dataBatch; DataBatch dataBatch;
int32_t batchSize = trainer.getConfig().opt_config().batch_size(); int32_t batchSize = trainer.getConfig().opt_config().batch_size();
trainer.getDataProvider()->reset();
trainer.getDataProvider()->setSkipShuffle(); trainer.getDataProvider()->setSkipShuffle();
trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch); trainer.getDataProvider()->getNextBatch(batchSize, &dataBatch);
...@@ -167,11 +170,11 @@ void compareGradient(ComData& comDataA, ComData& comDataB) { ...@@ -167,11 +170,11 @@ void compareGradient(ComData& comDataA, ComData& comDataB) {
TEST(Trainer, create) { TEST(Trainer, create) {
ComData dataA; ComData dataA;
calcGradient(dataA, FLAGS_config_file_a); calcGradient(dataA, config_file_a);
LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n"; LOG(INFO) << "\n\nforwardBackward of Network A is finished\n\n";
ComData dataB; ComData dataB;
calcGradient(dataB, FLAGS_config_file_b); calcGradient(dataB, config_file_b);
LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n"; LOG(INFO) << "\n\nforwardBackward of the Network B is finished\n\n";
compareGradient(dataA, dataB); compareGradient(dataA, dataB);
......
...@@ -583,6 +583,7 @@ TEST(Layer, maxoutLayer) { ...@@ -583,6 +583,7 @@ TEST(Layer, maxoutLayer) {
testLayerGrad(config, "maxout", 10, false, useGpu); testLayerGrad(config, "maxout", 10, false, useGpu);
} }
} }
void testFcLayer(string format, size_t nnz) { void testFcLayer(string format, size_t nnz) {
TestConfig config; TestConfig config;
config.biasSize = 1024; config.biasSize = 1024;
...@@ -1081,6 +1082,21 @@ TEST(Layer, InterpolationLayer) { ...@@ -1081,6 +1082,21 @@ TEST(Layer, InterpolationLayer) {
} }
} }
TEST(Layer, DotProdLayer) {
TestConfig config;
config.layerConfig.set_type("dot_prod");
config.layerConfig.set_size(1);
config.inputDefs.push_back({INPUT_DATA, "layer_0", 10, 0});
config.layerConfig.add_inputs();
config.inputDefs.push_back({INPUT_DATA, "layer_1", 10, 0});
config.layerConfig.add_inputs();
for (auto useGpu : {false, true}) {
testLayerGrad(config, "dot_prod", 10, false, useGpu);
}
}
TEST(Layer, OuterProdLayer) { TEST(Layer, OuterProdLayer) {
TestConfig config; TestConfig config;
config.layerConfig.set_type("out_prod"); config.layerConfig.set_type("out_prod");
...@@ -2429,6 +2445,25 @@ TEST(Layer, ScaleSubRegionLayer) { ...@@ -2429,6 +2445,25 @@ TEST(Layer, ScaleSubRegionLayer) {
} }
} }
TEST(Layer, L2DistanceLayer) {
TestConfig config;
config.layerConfig.set_type("l2_distance");
config.layerConfig.set_size(1);
config.biasSize = 0;
const size_t input_dim = 27;
const size_t batch_size = 11;
config.inputDefs.push_back({INPUT_DATA, "layer_0", input_dim, 0});
config.inputDefs.push_back({INPUT_DATA, "layer_1", input_dim, 0});
config.layerConfig.add_inputs();
config.layerConfig.add_inputs();
for (auto useGpu : {false, true}) {
testLayerGrad(config, "l2_distance", batch_size, false, useGpu);
}
}
int main(int argc, char** argv) { int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv); testing::InitGoogleTest(&argc, argv);
initMain(argc, argv); initMain(argc, argv);
......
...@@ -313,6 +313,47 @@ TEST(MKLDNNLayer, AddtoLayer) { ...@@ -313,6 +313,47 @@ TEST(MKLDNNLayer, AddtoLayer) {
testAddtoLayer({4, 12, 1, 1}, 3); testAddtoLayer({4, 12, 1, 1}, 3);
} }
static void getMKLDNNConcatConfig(TestConfig& cfg,
const std::vector<testImageDesc>& inputs) {
CHECK_GE(inputs.size(), 2UL) << "at least two inputs";
int oc = inputs[0].ic;
for (size_t i = 1; i < inputs.size(); ++i) {
CHECK_EQ(inputs[i].bs, inputs[0].bs);
CHECK_EQ(inputs[i].ih, inputs[0].ih);
CHECK_EQ(inputs[i].iw, inputs[0].iw);
oc += inputs[i].ic;
}
cfg.biasSize = 0;
cfg.layerConfig.set_type("mkldnn_concat");
cfg.layerConfig.set_size(oc * inputs[0].ih * inputs[0].iw);
cfg.layerConfig.set_active_type("relu");
for (size_t i = 0; i < inputs.size(); ++i) {
std::stringstream ss;
ss << "layer_" << i;
cfg.inputDefs.push_back(
{INPUT_DATA,
ss.str(),
(size_t)(inputs[i].ic) * inputs[i].ih * inputs[i].iw,
0});
LayerInputConfig* input = cfg.layerConfig.add_inputs();
ImageConfig* img_conf = input->mutable_image_conf();
img_conf->set_channels(inputs[i].ic);
img_conf->set_img_size_y(inputs[i].ih);
img_conf->set_img_size(inputs[i].iw);
}
}
void testConcatLayer(const std::vector<testImageDesc>& inputs) {
TestConfig dnnConfig;
getMKLDNNConcatConfig(dnnConfig, inputs);
RUN_MKLDNN_TEST_LAYER(dnnConfig, "concat", inputs[0])
}
TEST(MKLDNNLayer, ConcatLayer) {
testConcatLayer({{64, 128, 1, 1}, {64, 32, 1, 1}, {64, 64, 1, 1}});
testConcatLayer({{32, 100, 8, 8}, {32, 10, 8, 8}});
}
void testActivation(std::string actType, const testImageDesc& pm) { void testActivation(std::string actType, const testImageDesc& pm) {
// TODO(TJ): remove me when paddle supports elu activation // TODO(TJ): remove me when paddle supports elu activation
if (actType == "mkldnn_elu") { if (actType == "mkldnn_elu") {
......
...@@ -17,9 +17,13 @@ limitations under the License. */ ...@@ -17,9 +17,13 @@ limitations under the License. */
#include "paddle/utils/StringUtil.h" #include "paddle/utils/StringUtil.h"
#include "paddle/utils/Util.h" #include "paddle/utils/Util.h"
#ifndef PADDLE_MOBILE_INFERENCE
DEFINE_int32(pool_limit_size, DEFINE_int32(pool_limit_size,
536870912, 536870912,
"maximum memory size managed by a memory pool, default is 512M"); "maximum memory size managed by a memory pool, default is 512M");
#else
DEFINE_int32(pool_limit_size, 0, "default is 0");
#endif
namespace paddle { namespace paddle {
......
# Region-based Heterogeneous Memory Management # Region-based Heterogeneous Memory Management
## Design
Please check out the [design documentation](http://gangliao.me) to find out more details about ### Usage
buddy memory allocator for both CPU and GPU.
To allocate 4KB CPU memory:
```cpp
p = memory::Alloc(platform::CPUPlace(), 4*1024);
```
To allocate 4KB memory on the 3rd GPU:
```cpp
p = memory::Alloc(platform::GPUPlace(2), 4*1024);
```
To free memory and check the so-far used amount of memory on a place:
```cpp
auto pl = platform::GPUPlace(0);
p = memory::Alloc(pl, 4*1024);
cout << memory::Used(pl);
memory::Free(pl, p);
```
### API
In `paddle/memory/memory.h` we have:
```cpp
namespace memory {
template <typename Place> void* Alloc(Place, size_t);
template <typename Place> void Free(Place, void*);
template <typename Place> size_t Used(Place);
} // namespace memory
```
These function templates have specializations on either `platform::CPUPlace` or `platform::GPUPlace`:
```cpp
template<>
void* Alloc<CPUPlace>(CPUPlace p, size_t size) {
return GetCPUBuddyAllocator()->Alloc(size);
}
```
and
```cpp
template<>
void* Alloc<GPUPlace>(GPUPlace p, size_t size) {
return GetGPUBuddyAllocator(p.id)->Alloc(size);
}
```
Similar specializations exist for `Free` and `Used`.
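For concreteness, the `CPUPlace` specializations of `Free` and `Used` could look like the following minimal sketch; the `Free(void*)` and `Used()` methods on the buddy allocator are assumed here from the API above, not quoted from the code:

```cpp
// A minimal sketch, assuming BuddyAllocator exposes Free(void*) and Used().
template<>
void Free<CPUPlace>(CPUPlace p, void* ptr) {
  GetCPUBuddyAllocator()->Free(ptr);
}

template<>
size_t Used<CPUPlace>(CPUPlace p) {
  return GetCPUBuddyAllocator()->Used();
}
```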
### Implementation
`GetCPUBuddyAllocator` and `GetGPUBuddyAllocator` are singletons.
```cpp
BuddyAllocator* GetCPUBuddyAllocator() {
static BuddyAllocator* a = NULL;
if (a == NULL) {
a = new BuddyAllocator(new CPUAllocator /*backup allocator*/, ...);
}
return a;
}
BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
static BuddyAllocator** as = NULL;
if (as == NULL) {
as = new BuddyAllocator*[platform::NumGPUs()];
for (int gpu = 0; gpu < platform::NumGPUs(); gpu++) {
as[gpu] = new BuddyAllocator(new GPUAllocator(gpu) /* backup allocator */, ...);
}
}
return as[gpu_id];
}
```
#### `BuddyAllocator`
`BuddyAllocator` implements the buddy allocation algorithm. Its constructor takes the fallback system allocator (as shown in the singletons above) and parameters related to the algorithm:
```cpp
BuddyAllocator::BuddyAllocator(SystemAllocator* backup_allocator, size_t initial_pool_size, size_t max_pool_size) {
...
}
```
Please be aware that **`BuddyAllocator` always allocates aligned memory**, aligned on 32 bytes, which is enough to hold a `BuddyAllocator::Block` object:
```cpp
class BuddyAllocator {
private:
struct Block {
size_t size;
Block *left, *right;
size_t index; // allocator id
};
...
};
```
Because `BuddyAllocator` keeps the meta-data of each block, it can trace the used memory -- recording the amount returned by `Alloc` and reclaimed by `Free`. In contrast, `CPUAllocator` and `GPUAllocator` do not know the size of a freed memory block and cannot do the tracing.
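To see why per-block meta-data enables this tracing, here is a self-contained toy (an illustration of the idea only, not `BuddyAllocator` code): a small header in front of each block records its size, so `Free` knows how much memory comes back without being told.

```cpp
#include <cstddef>
#include <cstdlib>
#include <iostream>

static size_t g_used = 0;  // running total of bytes handed out

void* TracedAlloc(size_t size) {
  // prepend a header that records the payload size
  auto* header = static_cast<size_t*>(std::malloc(sizeof(size_t) + size));
  *header = size;
  g_used += size;
  return header + 1;  // the caller sees only the payload
}

void TracedFree(void* p) {
  auto* header = static_cast<size_t*>(p) - 1;
  g_used -= *header;  // the header tells us how much is being freed
  std::free(header);
}

int main() {
  void* p = TracedAlloc(4 * 1024);
  std::cout << g_used << "\n";  // 4096
  TracedFree(p);
  std::cout << g_used << "\n";  // 0
}
```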
#### System Allocators
The `GPUAllocator` and `CPUAllocator` are called *system allocators*. They work as the fallback allocators of `BuddyAllocator`.
## Justification
I drew inspiration from Majel and Caffe2, though the design above looks different from both.
### Caffe2
In Caffe2, `Tensor<Context>::mutable_data()` allocates the memory. In particular, [`Tensor<Context>::mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L523) calls [`Tensor<Context>::raw_mutable_data`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L459), which in turn calls [`Context::New`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/tensor.h#L479).
There are two implementations of `Context`:
1. [`CPUContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L105), whose [`New` method](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.h#L131) calls [`g_cpu_allocator.get()->New(size_t)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context.cc#L15) to allocate the memory.
1. [`CUDAContext`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L99), which has a data member [`int gpu_id_`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.h#L202). This looks very similar to class `majel::GPUPlace`, which also has an `int id_` data member. `CUDAContext::New(size_t)` calls [`g_cub_allocator->DeviceAllocate(&ptr, nbytes)`](https://github.com/caffe2/caffe2/blob/v0.7.0/caffe2/core/context_gpu.cu#L355) to allocate the memory.
### Majel
In Majel, there are basically two allocator types:
1. `cpu::SystemAllocator`, which has similar functionality to `caffe2::CPUContext::New/Delete`.
1. `gpu::SystemAllocator`, which has similar functionality to `caffe2::CUDAContext::New/Delete`.
However, programs do not allocate memory through these two allocators directly. Instead, both are defined in hidden namespaces.
In Majel there are hidden global variables like:
1. `cpu::SystemAllocator g_cpu_allocator`, and
1. `vector<gpu::SystemAllocator*> g_gpu_allocators(NUM_GPUS)`.
Programs allocate memory via a `BuddyAllocator`, which takes `g_cpu_allocator` or one of the `g_gpu_allocators[gpu_id]` as its *fallback allocator*: if the `BuddyAllocator` cannot find a free block in its memory pool, it extends the pool by calling the fallback allocator's `New(size_t)`.
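The fallback interaction can be sketched as follows (hypothetical member names -- `FindFreeBlock`, `RefillPool`, `fallback_` -- the point is only the control flow):
```cpp
void* BuddyAllocator::Alloc(size_t size) {
  void* p = FindFreeBlock(size);       // try the existing pool first
  if (p == nullptr) {
    // Pool exhausted: extend it through the fallback (system) allocator,
    // e.g. g_cpu_allocator or g_gpu_allocators[gpu_id] in Majel's case.
    RefillPool(fallback_->New(size));
    p = FindFreeBlock(size);
  }
  return p;
}
```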
...@@ -9,6 +9,7 @@ function(op_library TARGET) ...@@ -9,6 +9,7 @@ function(op_library TARGET)
set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE) set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE)
set(cc_srcs) set(cc_srcs)
set(cu_srcs) set(cu_srcs)
set(cu_cc_srcs)
set(op_common_deps operator op_registry math_function) set(op_common_deps operator op_registry math_function)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
...@@ -22,6 +23,9 @@ function(op_library TARGET) ...@@ -22,6 +23,9 @@ function(op_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
list(APPEND cc_srcs ${TARGET}.cc) list(APPEND cc_srcs ${TARGET}.cc)
endif() endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
list(APPEND cu_cc_srcs ${TARGET}.cu.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${TARGET}.cu) list(APPEND cu_srcs ${TARGET}.cu)
endif() endif()
...@@ -29,6 +33,8 @@ function(op_library TARGET) ...@@ -29,6 +33,8 @@ function(op_library TARGET)
foreach(src ${op_library_SRCS}) foreach(src ${op_library_SRCS})
if (${src} MATCHES ".*\\.cu$") if (${src} MATCHES ".*\\.cu$")
list(APPEND cu_srcs ${src}) list(APPEND cu_srcs ${src})
elseif(${src} MATCHES ".*\\.cu.cc$")
list(APPEND cu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$") elseif(${src} MATCHES ".*\\.cc$")
list(APPEND cc_srcs ${src}) list(APPEND cc_srcs ${src})
else() else()
...@@ -43,7 +49,7 @@ function(op_library TARGET) ...@@ -43,7 +49,7 @@ function(op_library TARGET)
endif() endif()
if (WITH_GPU) if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
${op_common_deps}) ${op_common_deps})
else() else()
cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS} cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS}
...@@ -55,6 +61,18 @@ function(op_library TARGET) ...@@ -55,6 +61,18 @@ function(op_library TARGET)
set(pybind_flag 1) set(pybind_flag 1)
endif() endif()
if ("${TARGET}" STREQUAL "compare_op")
set(pybind_flag 1)
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n")
endif()
# conv_op contains several operators
if ("${TARGET}" STREQUAL "conv_op")
set(pybind_flag 1)
# It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
endif()
# pool_op contains several operators # pool_op contains several operators
if ("${TARGET}" STREQUAL "pool_op") if ("${TARGET}" STREQUAL "pool_op")
set(pybind_flag 1) set(pybind_flag 1)
...@@ -62,23 +80,23 @@ function(op_library TARGET) ...@@ -62,23 +80,23 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP(pool2d);\n") file(APPEND ${pybind_file} "USE_OP(pool2d);\n")
endif() endif()
if ("${TARGET}" STREQUAL "compare_op") # pool_cudnn_op contains several operators
if ("${TARGET}" STREQUAL "pool_cudnn_op")
set(pybind_flag 1) set(pybind_flag 1)
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n") # It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n")
endif() endif()
# pool_with_index_op contains several operators if ("${TARGET}" STREQUAL "logical_op")
if ("${TARGET}" STREQUAL "pool_with_index_op")
set(pybind_flag 1) set(pybind_flag 1)
# It's enough to just adding one operator to pybind file(APPEND ${pybind_file} "USE_OP(logical_and);\n")
file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
endif() endif()
# conv_op contains several operators # pool_with_index_op contains several operators
if ("${TARGET}" STREQUAL "conv_op") if ("${TARGET}" STREQUAL "pool_with_index_op")
set(pybind_flag 1) set(pybind_flag 1)
# It's enough to just adding one operator to pybind # It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(conv2d);\n") file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
endif() endif()
# conv_transpose_op contains several operators # conv_transpose_op contains several operators
...@@ -87,12 +105,12 @@ function(op_library TARGET) ...@@ -87,12 +105,12 @@ function(op_library TARGET)
# It's enough to just adding one operator to pybind # It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(conv2d_transpose);\n") file(APPEND ${pybind_file} "USE_OP(conv2d_transpose);\n")
endif() endif()
# pool_cudnn_op contains several operators # conv_transpose_cudnn_op contains two operators
if ("${TARGET}" STREQUAL "pool_cudnn_op") if ("${TARGET}" STREQUAL "conv_transpose_cudnn_op")
set(pybind_flag 1) set(pybind_flag 1)
# It's enough to just adding one operator to pybind # It's enough to just adding one operator to pybind
file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n") file(APPEND ${pybind_file} "USE_OP(conv2d_transpose_cudnn);\n")
endif() endif()
# save_restore_op contains several operators # save_restore_op contains several operators
...@@ -140,7 +158,9 @@ function(op_library TARGET) ...@@ -140,7 +158,9 @@ function(op_library TARGET)
# pybind USE_CPU_ONLY_OP # pybind USE_CPU_ONLY_OP
list(LENGTH cu_srcs cu_srcs_len) list(LENGTH cu_srcs cu_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0) list(LENGTH cu_cc_srcs cu_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1) set(pybind_flag 1)
endif() endif()
...@@ -160,11 +180,13 @@ set(DEPS_OPS ...@@ -160,11 +180,13 @@ set(DEPS_OPS
recurrent_op recurrent_op
dynamic_recurrent_op dynamic_recurrent_op
softmax_with_cross_entropy_op softmax_with_cross_entropy_op
softmax_op
sequence_softmax_op
sum_op sum_op
pool_op pool_op
maxout_op
pool_with_index_op pool_with_index_op
conv_op conv_op
lstm_op
conv_transpose_op conv_transpose_op
nccl_op nccl_op
sequence_conv_op sequence_conv_op
...@@ -174,14 +196,22 @@ set(DEPS_OPS ...@@ -174,14 +196,22 @@ set(DEPS_OPS
array_to_lod_tensor_op array_to_lod_tensor_op
lstm_op lstm_op
tensor_array_read_write_op tensor_array_read_write_op
gru_op) gru_op
adagrad_op
sgd_op)
op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op) op_library(cond_op SRCS cond_op.cc DEPS framework_proto tensor operator net_op)
op_library(cross_entropy_op DEPS cross_entropy) op_library(cross_entropy_op DEPS cross_entropy)
op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax) op_library(softmax_with_cross_entropy_op DEPS cross_entropy softmax)
op_library(softmax_op DEPS softmax)
op_library(sequence_softmax_op DEPS softmax)
op_library(sum_op DEPS selected_rows_functor)
op_library(sgd_op DEPS selected_rows_functor)
op_library(adagrad_op DEPS selected_rows_functor)
op_library(conv_op DEPS vol2col) op_library(conv_op DEPS vol2col)
op_library(sum_op DEPS net_op selected_rows_functor)
op_library(pool_op DEPS pooling) op_library(pool_op DEPS pooling)
op_library(maxout_op DEPS maxouting)
op_library(pool_with_index_op DEPS pooling) op_library(pool_with_index_op DEPS pooling)
op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table) op_library(lod_rank_table_op SRCS lod_rank_table_op.cc DEPS lod_rank_table)
op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op) op_library(lod_tensor_to_array_op SRCS lod_tensor_to_array_op.cc DEPS lod_rank_table_op)
...@@ -220,6 +250,6 @@ cc_test(dynamic_recurrent_op_test SRCS dynamic_recurrent_op_test.cc ...@@ -220,6 +250,6 @@ cc_test(dynamic_recurrent_op_test SRCS dynamic_recurrent_op_test.cc
rnn/recurrent_op_utils.cc rnn/recurrent_op_utils.cc
DEPS dynamic_recurrent_op) DEPS dynamic_recurrent_op)
if(WITH_GPU) if(WITH_GPU)
nv_test(nccl_op_test SRCS nccl_op_test.cu DEPS nccl_op gpu_info device_context) cc_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
endif() endif()
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
...@@ -30,6 +30,10 @@ class AccuracyOp : public framework::OperatorWithKernel { ...@@ -30,6 +30,10 @@ class AccuracyOp : public framework::OperatorWithKernel {
"Input (Label) of accuracy op should not be null."); "Input (Label) of accuracy op should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Accuracy"), PADDLE_ENFORCE(ctx->HasOutput("Accuracy"),
"Output (Accuracy) of AccuracyOp should not be null."); "Output (Accuracy) of AccuracyOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Correct"),
"Output (Correct) of AccuracyOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Total"),
"Output (Total) of AccuracyOp should not be null.");
auto inference_dim = ctx->GetInputDim("Out"); auto inference_dim = ctx->GetInputDim("Out");
auto label_dim = ctx->GetInputDim("Label"); auto label_dim = ctx->GetInputDim("Label");
...@@ -43,6 +47,8 @@ class AccuracyOp : public framework::OperatorWithKernel { ...@@ -43,6 +47,8 @@ class AccuracyOp : public framework::OperatorWithKernel {
" the same as label."); " the same as label.");
ctx->SetOutputDim("Accuracy", {1}); ctx->SetOutputDim("Accuracy", {1});
ctx->SetOutputDim("Correct", {1});
ctx->SetOutputDim("Total", {1});
ctx->ShareLoD("Out", /*->*/ "Accuracy"); ctx->ShareLoD("Out", /*->*/ "Accuracy");
} }
...@@ -66,6 +72,8 @@ class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -66,6 +72,8 @@ class AccuracyOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("Label", "Label of the training data"); AddInput("Label", "Label of the training data");
// TODO(typhoonzero): AddInput("Weight", ... // TODO(typhoonzero): AddInput("Weight", ...
AddOutput("Accuracy", "The accuracy of current batch"); AddOutput("Accuracy", "The accuracy of current batch");
AddOutput("Correct", "The correct samples count of current batch");
AddOutput("Total", "The samples count of current batch");
AddComment(R"DOC( AddComment(R"DOC(
Accuracy Operator. Accuracy Operator.
......
...@@ -109,4 +109,5 @@ paramOut = param + paramUpdate$$ ...@@ -109,4 +109,5 @@ paramOut = param + paramUpdate$$
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker); REGISTER_OP_WITHOUT_GRADIENT(adadelta, ops::AdadeltaOp, ops::AdadeltaOpMaker);
REGISTER_OP_CPU_KERNEL( REGISTER_OP_CPU_KERNEL(
adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUPlace, float>); adadelta, ops::AdadeltaOpKernel<paddle::platform::CPUPlace, float>,
ops::AdadeltaOpKernel<paddle::platform::CPUPlace, double>);